From 9c1c0c923e75d85d1de98a2a2324bb5d0702e59b Mon Sep 17 00:00:00 2001 From: Qi Xiao Date: Fri, 4 Nov 2022 12:51:54 +0000 Subject: [PATCH] pkg/md: Fix more bugs found by fuzzing. --- pkg/md/fmt.go | 135 ++++++++++++++---- pkg/md/fmt_test.go | 4 + pkg/md/html.go | 5 +- ...fe7a0be1704811421c5c28a78c9c3b74983157b1e4 | 2 + ...7ace3ece80a752882b1a06d06a101f375abc30fa32 | 2 + ...93aa1462f9a1b4c1a4709c0256ae7bc4244578601c | 2 + ...6641570cfe484af56c6a406c3d89948406e4f11d62 | 2 + ...ce1a30e32ea78945cd5946af20a92307506bf4fa53 | 2 + ...cfc58f28c496c9455b3c988b0964c02b45ac451b74 | 2 + 9 files changed, 124 insertions(+), 32 deletions(-) create mode 100644 pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/51e230c835df97b3aec428fe7a0be1704811421c5c28a78c9c3b74983157b1e4 create mode 100644 pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/6ed9c798d44c6e3a7c68257ace3ece80a752882b1a06d06a101f375abc30fa32 create mode 100644 pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/714ffe658ccfa9609ff79b93aa1462f9a1b4c1a4709c0256ae7bc4244578601c create mode 100644 pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/763096c0f188a6f5a2cad16641570cfe484af56c6a406c3d89948406e4f11d62 create mode 100644 pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/9e5f2825cc97261ce08d95ce1a30e32ea78945cd5946af20a92307506bf4fa53 create mode 100644 pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/d853024ac25e2be9f90784cfc58f28c496c9455b3c988b0964c02b45ac451b74 diff --git a/pkg/md/fmt.go b/pkg/md/fmt.go index ef979b46..80f2d8e2 100644 --- a/pkg/md/fmt.go +++ b/pkg/md/fmt.go @@ -97,17 +97,44 @@ func (c *FmtCodec) setUnsupported() *FmtUnsupported { } var ( - escapeText = strings.NewReplacer( - "[", `\[`, "]", `\]`, "*", `\*`, "`", "\\`", `\`, `\\`, - "&", "&", "<", "<", - // TODO: Don't escape _ when in-word - "_", "\\_", - ).Replace + // Unlike escapeHTML, double and single quotes in autolinks do not need + // escaping. escapeAutolink = strings.NewReplacer( "&", "&", "<", "<", ">", ">", ).Replace ) +func escapeText(s string) string { + if !strings.ContainsAny(s, "[]*_`\\&<>") { + return s + } + var sb strings.Builder + for i := 0; i < len(s); i++ { + switch s[i] { + case '[', ']', '*', '`', '\\': + sb.WriteByte('\\') + sb.WriteByte(s[i]) + case '_': + if isWord(utf8.DecodeLastRuneInString(s[:i])) && isWord(utf8.DecodeRuneInString(s[i+1:])) { + sb.WriteByte('_') + } else { + sb.WriteString("\\_") + } + case '&': + sb.WriteString("&") + case '<': + sb.WriteString("<") + case '>': + // ">" technically does not need escaping in text, but we escape it + // anyway for consistency with "<" + sb.WriteString(">") + default: + sb.WriteByte(s[i]) + } + } + return sb.String() +} + var ( backquoteRunRegexp = regexp.MustCompile("`+") tildeRunRegexp = regexp.MustCompile("~+") @@ -310,8 +337,30 @@ func (c *FmtCodec) doInlineContent(ops []InlineOp, atxHeading bool) { } } if thematicBreakRegexp.MatchString(line) { - c.write(`\`) - c.write(text) + switch text[len(text)-1] { + case ' ': + // If the line ends with a space, it has to be escaped + // to be preserved. This has the side effect of making + // the line no longer parse as thematic break. + c.write(escapeText(text[:len(text)-1])) + c.write(" ") + case '\t': + // Same for lines ending with a tab. + c.write(escapeText(text[:len(text)-1])) + c.write(" ") + default: + if escaped := escapeText(text); escaped != text { + // If the text needs escaping anyway, the escaping + // anything extra. + c.write(escaped) + } else { + // Otherwise, escape the first byte, which must be a + // punctuation at this point. + c.write(`\`) + c.write(text[:1]) + c.write(escapeText(text[1:])) + } + } continue } } @@ -371,16 +420,12 @@ func (c *FmtCodec) doInlineContent(ops []InlineOp, atxHeading bool) { c.write("&#" + strconv.Itoa(int(r)) + ";") text = text[l:] } - } else if i > 1 && isEmphasisEnd(ops[i-1]) && ops[i-2].Type == OpText { - if r, l := utf8.DecodeLastRuneInString(ops[i-2].Text); l > 0 && unicode.IsSpace(r) || isUnicodePunct(r) { - if r, l := utf8.DecodeRuneInString(text); l > 0 && !unicode.IsSpace(r) && !isUnicodePunct(r) { - // Escape "other" (word character) immediately after - // emphasis end if emphasis content ended with a - // punctuation or a space (the space has been escaped - // into a punctuation). - c.write("&#" + strconv.Itoa(int(r)) + ";") - text = text[l:] - } + } else if i > 1 && isEmphasisEnd(ops[i-1]) && emphasisOutputEndsWithPunct(ops[i-2]) { + if r, l := utf8.DecodeRuneInString(text); isWord(r, l) { + // Escape "other" (word character) immediately after + // emphasis end if emphasis content ends with a punctuation. + c.write("&#" + strconv.Itoa(int(r)) + ";") + text = text[l:] } } @@ -414,16 +459,13 @@ func (c *FmtCodec) doInlineContent(ops []InlineOp, atxHeading bool) { text = text[:len(text)-l] suffix = "&#" + strconv.Itoa(int(r)) + ";" } - } else if i < len(ops)-2 && isEmphasisStart(ops[i+1]) && ops[i+2].Type == OpText { - if r, l := utf8.DecodeRuneInString(ops[i+2].Text); l > 0 && unicode.IsSpace(r) || isUnicodePunct(r) { - if r, l := utf8.DecodeLastRuneInString(text); l > 0 && !unicode.IsSpace(r) && !isUnicodePunct(r) { - // Escape "other" (word character) immediately before - // emphasis start if emphasis content starts with a - // punctuation or a space (which will be escaped into a - // punctuation). - text = text[:len(text)-l] - suffix = "&#" + strconv.Itoa(int(r)) + ";" - } + } else if i < len(ops)-2 && isEmphasisStart(ops[i+1]) && emphasisOutputStartsWithPunct(ops[i+2]) { + if r, l := utf8.DecodeLastRuneInString(text); isWord(r, l) { + // Escape "other" (word character) immediately before + // emphasis start if the output of the emphasis content will + // start with a punctuation. + text = text[:len(text)-l] + suffix = "&#" + strconv.Itoa(int(r)) + ";" } } @@ -516,7 +558,7 @@ func (c *FmtCodec) doInlineContent(ops []InlineOp, atxHeading bool) { c.writeLinkTail(op.Dest, op.Text) case OpImage: c.write("![") - c.write(strings.ReplaceAll(escapeText(op.Alt), "\n", " ")) + c.write(escapeNewLines(escapeText(op.Alt))) c.write("]") c.writeLinkTail(op.Dest, op.Text) case OpAutolink: @@ -544,6 +586,28 @@ func endsWithSpaceOrTab(s string) bool { return s != "" && (s[len(s)-1] == ' ' || s[len(s)-1] == '\t') } +func emphasisOutputStartsWithPunct(op InlineOp) bool { + switch op.Type { + case OpText: + r, l := utf8.DecodeRuneInString(op.Text) + // If the content starts with a space, it will be escaped into " " + return l > 0 && unicode.IsSpace(r) || isUnicodePunct(r) + default: + return true + } +} + +func emphasisOutputEndsWithPunct(op InlineOp) bool { + switch op.Type { + case OpText: + r, l := utf8.DecodeLastRuneInString(op.Text) + // If the content starts with a space, it will be escaped into " " + return l > 0 && unicode.IsSpace(r) || isUnicodePunct(r) + default: + return true + } +} + func matchLens(pieces []string, pattern *regexp.Regexp) map[int]bool { hasRunWithLen := make(map[int]bool) for _, piece := range pieces { @@ -566,7 +630,8 @@ func (c *FmtCodec) writeLinkTail(dest, title string) { c.write(escapeText(dest)) } if title != "" { - c.write(" " + wrapAndEscapeLinkTitle(title)) + c.write(" ") + c.write(escapeNewLines(wrapAndEscapeLinkTitle(title))) } c.write(")") } @@ -598,7 +663,6 @@ func wrapAndEscapeLinkTitle(title string) string { func (c *FmtCodec) startLine() { for _, container := range c.containers { - // TODO: Remove trailing spaces on empty lines c.write(container.useMarker()) } } @@ -608,6 +672,7 @@ func (c *FmtCodec) finishLine() { } func (c *FmtCodec) writeLine(s string) { + // TODO: Trim markers of trailing spaces if s is empty c.startLine() c.write(s) c.finishLine() @@ -654,3 +719,11 @@ func isEmphasisStart(op InlineOp) bool { func isEmphasisEnd(op InlineOp) bool { return op.Type == OpEmphasisEnd || op.Type == OpStrongEmphasisEnd } + +func escapeNewLines(s string) string { return strings.ReplaceAll(s, "\n", " ") } + +// Takes the result of utf8.Decode*, and returns whether the character is +// non-empty and a "word" character for the purpose of emphasis parsing. +func isWord(r rune, l int) bool { + return l > 0 && !unicode.IsSpace(r) && !isUnicodePunct(r) +} diff --git a/pkg/md/fmt_test.go b/pkg/md/fmt_test.go index e185eb7f..100cae4a 100644 --- a/pkg/md/fmt_test.go +++ b/pkg/md/fmt_test.go @@ -4,6 +4,7 @@ import ( "html" "strings" "testing" + "unicode/utf8" "github.com/google/go-cmp/cmp" . "src.elv.sh/pkg/md" @@ -115,6 +116,9 @@ func testFmtPreservesHTMLRender(t *testing.T, original string) { func formatAndSkipIfUnsupported(t *testing.T, original string) string { t.Helper() + if !utf8.ValidString(original) { + t.Skipf("input is not valid UTF-8") + } if strings.Contains(original, "\t") { t.Skipf("input contains tab") } diff --git a/pkg/md/html.go b/pkg/md/html.go index 3b33bbce..df32b6ef 100644 --- a/pkg/md/html.go +++ b/pkg/md/html.go @@ -11,7 +11,10 @@ import ( // The schemes below are chosen to match the spec tests. var ( escapeHTML = strings.NewReplacer( - "&", "&", `"`, """, "<", "<", ">", ">").Replace + "&", "&", `"`, """, "<", "<", ">", ">", + // No need to escape single quotes since, attributes in the output + // always use double quotes. + ).Replace escapeURL = strings.NewReplacer( `"`, "%22", `\`, "%5C", " ", "%20", "`", "%60", "[", "%5B", "]", "%5D", "<", "%3C", ">", "%3E", diff --git a/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/51e230c835df97b3aec428fe7a0be1704811421c5c28a78c9c3b74983157b1e4 b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/51e230c835df97b3aec428fe7a0be1704811421c5c28a78c9c3b74983157b1e4 new file mode 100644 index 00000000..a0e6d394 --- /dev/null +++ b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/51e230c835df97b3aec428fe7a0be1704811421c5c28a78c9c3b74983157b1e4 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("�*`0`*") diff --git a/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/6ed9c798d44c6e3a7c68257ace3ece80a752882b1a06d06a101f375abc30fa32 b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/6ed9c798d44c6e3a7c68257ace3ece80a752882b1a06d06a101f375abc30fa32 new file mode 100644 index 00000000..c306c63e --- /dev/null +++ b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/6ed9c798d44c6e3a7c68257ace3ece80a752882b1a06d06a101f375abc30fa32 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("*0!*�") diff --git a/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/714ffe658ccfa9609ff79b93aa1462f9a1b4c1a4709c0256ae7bc4244578601c b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/714ffe658ccfa9609ff79b93aa1462f9a1b4c1a4709c0256ae7bc4244578601c new file mode 100644 index 00000000..97c2cf43 --- /dev/null +++ b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/714ffe658ccfa9609ff79b93aa1462f9a1b4c1a4709c0256ae7bc4244578601c @@ -0,0 +1,2 @@ +go test fuzz v1 +string("*!00*\xb1") diff --git a/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/763096c0f188a6f5a2cad16641570cfe484af56c6a406c3d89948406e4f11d62 b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/763096c0f188a6f5a2cad16641570cfe484af56c6a406c3d89948406e4f11d62 new file mode 100644 index 00000000..8b562207 --- /dev/null +++ b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/763096c0f188a6f5a2cad16641570cfe484af56c6a406c3d89948406e4f11d62 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("[](0 (\n>))") diff --git a/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/9e5f2825cc97261ce08d95ce1a30e32ea78945cd5946af20a92307506bf4fa53 b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/9e5f2825cc97261ce08d95ce1a30e32ea78945cd5946af20a92307506bf4fa53 new file mode 100644 index 00000000..3cd03a48 --- /dev/null +++ b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/9e5f2825cc97261ce08d95ce1a30e32ea78945cd5946af20a92307506bf4fa53 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("___ ___") diff --git a/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/d853024ac25e2be9f90784cfc58f28c496c9455b3c988b0964c02b45ac451b74 b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/d853024ac25e2be9f90784cfc58f28c496c9455b3c988b0964c02b45ac451b74 new file mode 100644 index 00000000..3977a351 --- /dev/null +++ b/pkg/md/testdata/fuzz/FuzzFmtPreservesHTMLRender/d853024ac25e2be9f90784cfc58f28c496c9455b3c988b0964c02b45ac451b74 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("___ 0")