pkg/md: Support HTML entities.

This commit is contained in:
Qi Xiao 2022-09-26 01:07:51 +01:00
parent 5bf0cf7d6c
commit 383a7f2696
2 changed files with 33 additions and 11 deletions

View File

@ -15,6 +15,7 @@ package md
import (
"fmt"
"html"
"regexp"
"strings"
"unicode"
@ -130,6 +131,7 @@ var isASCIIPunct = map[byte]bool{
}
var (
entityRegexp = regexp.MustCompile(`^&(?:[a-zA-Z0-9]+|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});`)
openTagRegexp = regexp.MustCompile(fmt.Sprintf(`^<`+
`[a-zA-Z][a-zA-Z0-9-]*`+ // tag name
(`(?:`+
@ -348,6 +350,14 @@ func (p *inlineParser) render() {
}
}
parseText()
case '&':
entity := entityRegexp.FindString(p.text[begin:])
if entity != "" {
p.buf.push(piece{text: p.syntax.escape(html.UnescapeString(entity))})
p.pos = begin + len(entity)
} else {
parseText()
}
case '\\':
if p.pos < len(p.text) && isASCIIPunct[p.text[p.pos]] {
begin++
@ -552,7 +562,7 @@ func (p *linkTailParser) parse() (n int, dest, title string) {
if p.pos == len(p.text) || p.text[p.pos] != ')' {
return -1, "", ""
}
return p.pos + 1, destBuilder.String(), titleBuilder.String()
return p.pos + 1, html.UnescapeString(destBuilder.String()), html.UnescapeString(titleBuilder.String())
}
func (p *linkTailParser) skipWhitespaces() {
@ -590,7 +600,7 @@ func isASCIIControl(b byte) bool {
func isMeta(b byte) bool {
switch b {
case '[', ']', '*', '_', '`', '\\', '!', '<', '\n':
case '!', '[', ']', '*', '_', '`', '\\', '&', '<', '\n':
return true
default:
return false

View File

@ -26,11 +26,12 @@ func init() {
var (
escapeHTML = strings.NewReplacer(
`"`, "&quot;", "<", "&lt;", ">", "&gt;").Replace
"&", "&amp;", `"`, "&quot;", "<", "&lt;", ">", "&gt;").Replace
escapeDest = strings.NewReplacer(
`"`, "%22", `\`, "%5C", " ", "%20", "`", "%60",
"[", "%5B", "]", "%5D",
"&auml;", "%C3%A4", " ", "%C2%A0").Replace
"ö", "%C3%B6",
"ä", "%C3%A4", " ", "%C2%A0").Replace
)
var htmlSyntax = outSyntax{
@ -67,17 +68,18 @@ func TestConvertInline(t *testing.T) {
if strings.HasPrefix(tc.Markdown, "#") {
t.Skipf("Header not supported")
}
if strings.HasPrefix(tc.Markdown, "```") || strings.HasPrefix(tc.Markdown, " ") {
t.Skipf("Code block not supported")
}
if strings.Contains(tc.Markdown, "\n\n") {
t.Skipf("Multiple blocks not supported")
}
if strings.Contains(tc.HTML, "&amp;") {
t.Skipf("Ampersand escape not implemented correctly yet")
}
want := strings.TrimSuffix(strings.TrimPrefix(tc.HTML, "<p>"), "</p>\n") + "\n"
got := renderInline(tc.Markdown, htmlSyntax)
want := strings.TrimSuffix(strings.TrimPrefix(
strings.TrimRight(tc.HTML, "\n"), "<p>"), "</p>")
got := strings.TrimRight(renderInline(tc.Markdown, htmlSyntax), "\n")
if want != got {
t.Errorf("input:\n%swant:\n%sgot:\n%s", tc.Markdown, want, got)
t.Errorf("input:\n%swant:\n%s\ngot:\n%s", tc.Markdown, want, got)
}
})
}
@ -85,7 +87,17 @@ func TestConvertInline(t *testing.T) {
func supportedSection(section string) bool {
switch section {
case "Inlines", "Code spans", "Emphasis and strong emphasis", "Links", "Autolinks", "Images", "Raw HTML", "Hard line breaks", "Soft line breaks", "Textual content":
case "Entity and numeric character references",
"Inlines",
"Code spans",
"Emphasis and strong emphasis",
"Links",
"Autolinks",
"Images",
"Raw HTML",
"Hard line breaks",
"Soft line breaks",
"Textual content":
return true
default:
return false