pkg/md: Support HTML entities.

This commit is contained in:
Qi Xiao 2022-09-26 01:07:51 +01:00
parent 5bf0cf7d6c
commit 383a7f2696
2 changed files with 33 additions and 11 deletions

View File

@ -15,6 +15,7 @@ package md
import ( import (
"fmt" "fmt"
"html"
"regexp" "regexp"
"strings" "strings"
"unicode" "unicode"
@ -130,6 +131,7 @@ var isASCIIPunct = map[byte]bool{
} }
var ( var (
entityRegexp = regexp.MustCompile(`^&(?:[a-zA-Z0-9]+|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});`)
openTagRegexp = regexp.MustCompile(fmt.Sprintf(`^<`+ openTagRegexp = regexp.MustCompile(fmt.Sprintf(`^<`+
`[a-zA-Z][a-zA-Z0-9-]*`+ // tag name `[a-zA-Z][a-zA-Z0-9-]*`+ // tag name
(`(?:`+ (`(?:`+
@ -348,6 +350,14 @@ func (p *inlineParser) render() {
} }
} }
parseText() parseText()
case '&':
entity := entityRegexp.FindString(p.text[begin:])
if entity != "" {
p.buf.push(piece{text: p.syntax.escape(html.UnescapeString(entity))})
p.pos = begin + len(entity)
} else {
parseText()
}
case '\\': case '\\':
if p.pos < len(p.text) && isASCIIPunct[p.text[p.pos]] { if p.pos < len(p.text) && isASCIIPunct[p.text[p.pos]] {
begin++ begin++
@ -552,7 +562,7 @@ func (p *linkTailParser) parse() (n int, dest, title string) {
if p.pos == len(p.text) || p.text[p.pos] != ')' { if p.pos == len(p.text) || p.text[p.pos] != ')' {
return -1, "", "" return -1, "", ""
} }
return p.pos + 1, destBuilder.String(), titleBuilder.String() return p.pos + 1, html.UnescapeString(destBuilder.String()), html.UnescapeString(titleBuilder.String())
} }
func (p *linkTailParser) skipWhitespaces() { func (p *linkTailParser) skipWhitespaces() {
@ -590,7 +600,7 @@ func isASCIIControl(b byte) bool {
func isMeta(b byte) bool { func isMeta(b byte) bool {
switch b { switch b {
case '[', ']', '*', '_', '`', '\\', '!', '<', '\n': case '!', '[', ']', '*', '_', '`', '\\', '&', '<', '\n':
return true return true
default: default:
return false return false

View File

@ -26,11 +26,12 @@ func init() {
var ( var (
escapeHTML = strings.NewReplacer( escapeHTML = strings.NewReplacer(
`"`, "&quot;", "<", "&lt;", ">", "&gt;").Replace "&", "&amp;", `"`, "&quot;", "<", "&lt;", ">", "&gt;").Replace
escapeDest = strings.NewReplacer( escapeDest = strings.NewReplacer(
`"`, "%22", `\`, "%5C", " ", "%20", "`", "%60", `"`, "%22", `\`, "%5C", " ", "%20", "`", "%60",
"[", "%5B", "]", "%5D", "[", "%5B", "]", "%5D",
"&auml;", "%C3%A4", " ", "%C2%A0").Replace "ö", "%C3%B6",
"ä", "%C3%A4", " ", "%C2%A0").Replace
) )
var htmlSyntax = outSyntax{ var htmlSyntax = outSyntax{
@ -67,17 +68,18 @@ func TestConvertInline(t *testing.T) {
if strings.HasPrefix(tc.Markdown, "#") { if strings.HasPrefix(tc.Markdown, "#") {
t.Skipf("Header not supported") t.Skipf("Header not supported")
} }
if strings.HasPrefix(tc.Markdown, "```") || strings.HasPrefix(tc.Markdown, " ") {
t.Skipf("Code block not supported")
}
if strings.Contains(tc.Markdown, "\n\n") { if strings.Contains(tc.Markdown, "\n\n") {
t.Skipf("Multiple blocks not supported") t.Skipf("Multiple blocks not supported")
} }
if strings.Contains(tc.HTML, "&amp;") {
t.Skipf("Ampersand escape not implemented correctly yet")
}
want := strings.TrimSuffix(strings.TrimPrefix(tc.HTML, "<p>"), "</p>\n") + "\n" want := strings.TrimSuffix(strings.TrimPrefix(
got := renderInline(tc.Markdown, htmlSyntax) strings.TrimRight(tc.HTML, "\n"), "<p>"), "</p>")
got := strings.TrimRight(renderInline(tc.Markdown, htmlSyntax), "\n")
if want != got { if want != got {
t.Errorf("input:\n%swant:\n%sgot:\n%s", tc.Markdown, want, got) t.Errorf("input:\n%swant:\n%s\ngot:\n%s", tc.Markdown, want, got)
} }
}) })
} }
@ -85,7 +87,17 @@ func TestConvertInline(t *testing.T) {
func supportedSection(section string) bool { func supportedSection(section string) bool {
switch section { switch section {
case "Inlines", "Code spans", "Emphasis and strong emphasis", "Links", "Autolinks", "Images", "Raw HTML", "Hard line breaks", "Soft line breaks", "Textual content": case "Entity and numeric character references",
"Inlines",
"Code spans",
"Emphasis and strong emphasis",
"Links",
"Autolinks",
"Images",
"Raw HTML",
"Hard line breaks",
"Soft line breaks",
"Textual content":
return true return true
default: default:
return false return false