pkg/md: Support HTML entities.

2024-12-13 01:47:51 +08:00 · 2022-09-26 01:07:51 +01:00 · 2022-09-26 01:07:51 +01:00 · 383a7f2696
commit 383a7f2696
parent 5bf0cf7d6c
2 changed files with 33 additions and 11 deletions
--- a/pkg/md/md.go
+++ b/pkg/md/md.go
@@ -15,6 +15,7 @@ package md

 import (
 	"fmt"
+	"html"
 	"regexp"
 	"strings"
 	"unicode"
@@ -130,6 +131,7 @@ var isASCIIPunct = map[byte]bool{
 }

 var (
+	entityRegexp  = regexp.MustCompile(`^&(?:[a-zA-Z0-9]+|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});`)
 	openTagRegexp = regexp.MustCompile(fmt.Sprintf(`^<`+
 		`[a-zA-Z][a-zA-Z0-9-]*`+ // tag name
 		(`(?:`+
@@ -348,6 +350,14 @@ func (p *inlineParser) render() {
 				}
 			}
 			parseText()
+		case '&':
+			entity := entityRegexp.FindString(p.text[begin:])
+			if entity != "" {
+				p.buf.push(piece{text: p.syntax.escape(html.UnescapeString(entity))})
+				p.pos = begin + len(entity)
+			} else {
+				parseText()
+			}
 		case '\\':
 			if p.pos < len(p.text) && isASCIIPunct[p.text[p.pos]] {
 				begin++
@@ -552,7 +562,7 @@ func (p *linkTailParser) parse() (n int, dest, title string) {
 	if p.pos == len(p.text) || p.text[p.pos] != ')' {
 		return -1, "", ""
 	}
-	return p.pos + 1, destBuilder.String(), titleBuilder.String()
+	return p.pos + 1, html.UnescapeString(destBuilder.String()), html.UnescapeString(titleBuilder.String())
 }

 func (p *linkTailParser) skipWhitespaces() {
@@ -590,7 +600,7 @@ func isASCIIControl(b byte) bool {

 func isMeta(b byte) bool {
 	switch b {
-	case '[', ']', '*', '_', '`', '\\', '!', '<', '\n':
+	case '!', '[', ']', '*', '_', '`', '\\', '&', '<', '\n':
 		return true
 	default:
 		return false
--- a/pkg/md/md_test.go
+++ b/pkg/md/md_test.go
@@ -26,11 +26,12 @@ func init() {

 var (
 	escapeHTML = strings.NewReplacer(
-		`"`, "&quot;", "<", "&lt;", ">", "&gt;").Replace
+		"&", "&amp;", `"`, "&quot;", "<", "&lt;", ">", "&gt;").Replace
 	escapeDest = strings.NewReplacer(
 		`"`, "%22", `\`, "%5C", " ", "%20", "`", "%60",
 		"[", "%5B", "]", "%5D",
-		"&auml;", "%C3%A4", " ", "%C2%A0").Replace
+		"ö", "%C3%B6",
+		"ä", "%C3%A4", " ", "%C2%A0").Replace
 )

 var htmlSyntax = outSyntax{
@@ -67,17 +68,18 @@ func TestConvertInline(t *testing.T) {
 			if strings.HasPrefix(tc.Markdown, "#") {
 				t.Skipf("Header not supported")
 			}
+			if strings.HasPrefix(tc.Markdown, "```") || strings.HasPrefix(tc.Markdown, "    ") {
+				t.Skipf("Code block not supported")
+			}
 			if strings.Contains(tc.Markdown, "\n\n") {
 				t.Skipf("Multiple blocks not supported")
 			}
-			if strings.Contains(tc.HTML, "&amp;") {
-				t.Skipf("Ampersand escape not implemented correctly yet")
-			}

-			want := strings.TrimSuffix(strings.TrimPrefix(tc.HTML, "<p>"), "</p>\n") + "\n"
-			got := renderInline(tc.Markdown, htmlSyntax)
+			want := strings.TrimSuffix(strings.TrimPrefix(
+				strings.TrimRight(tc.HTML, "\n"), "<p>"), "</p>")
+			got := strings.TrimRight(renderInline(tc.Markdown, htmlSyntax), "\n")
 			if want != got {
-				t.Errorf("input:\n%swant:\n%sgot:\n%s", tc.Markdown, want, got)
+				t.Errorf("input:\n%swant:\n%s\ngot:\n%s", tc.Markdown, want, got)
 			}
 		})
 	}
@@ -85,7 +87,17 @@ func TestConvertInline(t *testing.T) {

 func supportedSection(section string) bool {
 	switch section {
-	case "Inlines", "Code spans", "Emphasis and strong emphasis", "Links", "Autolinks", "Images", "Raw HTML", "Hard line breaks", "Soft line breaks", "Textual content":
+	case "Entity and numeric character references",
+		"Inlines",
+		"Code spans",
+		"Emphasis and strong emphasis",
+		"Links",
+		"Autolinks",
+		"Images",
+		"Raw HTML",
+		"Hard line breaks",
+		"Soft line breaks",
+		"Textual content":
 		return true
 	default:
 		return false