pkg/md: Support HTML entities.

2024-12-14 02:57:52 +08:00 · 2022-09-26 01:07:51 +01:00 · 2022-09-26 01:07:51 +01:00 · 383a7f2696
commit 383a7f2696
parent 5bf0cf7d6c
2 changed files with 33 additions and 11 deletions
--- a/pkg/md/md.go
+++ b/pkg/md/md.go
@@ -15,6 +15,7 @@ package md
 import (
 	"fmt"
+	"html"
 	"regexp"
 	"strings"
 	"unicode"
@@ -130,6 +131,7 @@ var isASCIIPunct = map[byte]bool{
 }
 var (
+	entityRegexp  = regexp.MustCompile(`^&(?:[a-zA-Z0-9]+|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});`)
 	openTagRegexp = regexp.MustCompile(fmt.Sprintf(`^<`+
 		`[a-zA-Z][a-zA-Z0-9-]*`+ // tag name
 		(`(?:`+
@@ -348,6 +350,14 @@ func (p *inlineParser) render() {
 				}
 			}
 			parseText()
+		case '&':
+			entity := entityRegexp.FindString(p.text[begin:])
+			if entity != "" {
+				p.buf.push(piece{text: p.syntax.escape(html.UnescapeString(entity))})
+				p.pos = begin + len(entity)
+			} else {
+				parseText()
+			}
 		case '\\':
 			if p.pos < len(p.text) && isASCIIPunct[p.text[p.pos]] {
 				begin++
@@ -552,7 +562,7 @@ func (p *linkTailParser) parse() (n int, dest, title string) {
 	if p.pos == len(p.text) || p.text[p.pos] != ')' {
 		return -1, "", ""
 	}
-	return p.pos + 1, destBuilder.String(), titleBuilder.String()
+	return p.pos + 1, html.UnescapeString(destBuilder.String()), html.UnescapeString(titleBuilder.String())
 }
 func (p *linkTailParser) skipWhitespaces() {
@@ -590,7 +600,7 @@ func isASCIIControl(b byte) bool {
 func isMeta(b byte) bool {
 	switch b {
-	case '[', ']', '*', '_', '`', '\\', '!', '<', '\n':
+	case '!', '[', ']', '*', '_', '`', '\\', '&', '<', '\n':
 		return true
 	default:
 		return false
--- a/pkg/md/md_test.go
+++ b/pkg/md/md_test.go
@@ -26,11 +26,12 @@ func init() {
 var (
 	escapeHTML = strings.NewReplacer(
-		`"`, "&quot;", "<", "&lt;", ">", "&gt;").Replace
+		"&", "&amp;", `"`, "&quot;", "<", "&lt;", ">", "&gt;").Replace
 	escapeDest = strings.NewReplacer(
 		`"`, "%22", `\`, "%5C", " ", "%20", "`", "%60",
 		"[", "%5B", "]", "%5D",
-		"&auml;", "%C3%A4", " ", "%C2%A0").Replace
+		"ö", "%C3%B6",
 		"ä", "%C3%A4", " ", "%C2%A0").Replace
 )
 var htmlSyntax = outSyntax{
@@ -67,17 +68,18 @@ func TestConvertInline(t *testing.T) {
 			if strings.HasPrefix(tc.Markdown, "#") {
 				t.Skipf("Header not supported")
 			}
 			if strings.HasPrefix(tc.Markdown, "```") || strings.HasPrefix(tc.Markdown, "    ") {
 				t.Skipf("Code block not supported")
 			}
 			if strings.Contains(tc.Markdown, "\n\n") {
 				t.Skipf("Multiple blocks not supported")
 			}
 			if strings.Contains(tc.HTML, "&amp;") {
 				t.Skipf("Ampersand escape not implemented correctly yet")
 			}
-			want := strings.TrimSuffix(strings.TrimPrefix(tc.HTML, "<p>"), "</p>\n") + "\n"
+			want := strings.TrimSuffix(strings.TrimPrefix(
-			got := renderInline(tc.Markdown, htmlSyntax)
+				strings.TrimRight(tc.HTML, "\n"), "<p>"), "</p>")
+			got := strings.TrimRight(renderInline(tc.Markdown, htmlSyntax), "\n")
 			if want != got {
-				t.Errorf("input:\n%swant:\n%sgot:\n%s", tc.Markdown, want, got)
+				t.Errorf("input:\n%swant:\n%s\ngot:\n%s", tc.Markdown, want, got)
 			}
 		})
 	}
@@ -85,7 +87,17 @@ func TestConvertInline(t *testing.T) {
 func supportedSection(section string) bool {
 	switch section {
-	case "Inlines", "Code spans", "Emphasis and strong emphasis", "Links", "Autolinks", "Images", "Raw HTML", "Hard line breaks", "Soft line breaks", "Textual content":
+	case "Entity and numeric character references",
+		"Inlines",
+		"Code spans",
+		"Emphasis and strong emphasis",
+		"Links",
+		"Autolinks",
+		"Images",
+		"Raw HTML",
+		"Hard line breaks",
+		"Soft line breaks",
+		"Textual content":
 		return true
 	default:
 		return false