pkg/md: Support HTML blocks.

2024-12-13 18:07:51 +08:00 · 2022-10-03 02:51:07 +01:00 · 2022-10-03 02:51:07 +01:00 · a2a7126064
commit a2a7126064
parent e9e7827592
2 changed files with 160 additions and 55 deletions
--- a/pkg/md/md.go
+++ b/pkg/md/md.go
@ -39,7 +39,9 @@ type OutputSyntax struct {
 	StrongEmphasis TagPair
 	Link           func(dest, title string) TagPair
 	Image          func(dest, alt, title string) string
-	Escape         func(string) string
+
+	Escape      func(string) string
+	ConvertHTML func(string) string
 }

 // TagPair specifies a pair of "tags" to enclose a construct in the output.
@ -68,9 +70,11 @@ type blockParser struct {
 var (
 	thematicBreakRegexp = regexp.MustCompile(
 		`^[ \t]*((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$`)
+
 	// Capture group 1: heading opener
 	atxHeadingRegexp       = regexp.MustCompile(`^ *(#{1,6})(?:[ \t]|$)`)
 	atxHeadingCloserRegexp = regexp.MustCompile(`[ \t]#+[ \t]*$`)
+
 	// Capture groups:
 	// 1. Indent
 	// 2. Fence punctuations (backquote fence)
@ -79,15 +83,43 @@ var (
 	// 5. Untrimmed info string (tilde fence)
 	codeFenceRegexp = regexp.MustCompile("(^ {0,3})(?:(`{3,})([^`]*)|(~{3,})(.*))$")
 	// Capture group 1: fence punctuations
-	closingCodeFenceRegexp = regexp.MustCompile("(?:^ {0,3})(`{3,}|~{3,})[ \t]*$")
+	codeFenceCloserRegexp = regexp.MustCompile("(?:^ {0,3})(`{3,}|~{3,})[ \t]*$")
+
+	html1Regexp       = regexp.MustCompile(`^ {0,3}<(?i:pre|script|style|textarea)`)
+	html1CloserRegexp = regexp.MustCompile(`</(?i:pre|script|style|textarea)`)
+	html2Regexp       = regexp.MustCompile(`^ {0,3}<!--`)
+	html2CloserRegexp = regexp.MustCompile(`-->`)
+	html3Regexp       = regexp.MustCompile(`^ {0,3}<\?`)
+	html3CloserRegexp = regexp.MustCompile(`\?>`)
+	html4Regexp       = regexp.MustCompile(`^ {0,3}<![a-zA-Z]`)
+	html4CloserRegexp = regexp.MustCompile(`>`)
+	html5Regexp       = regexp.MustCompile(`^ {0,3}<!\[CDATA\[`)
+	html5CloserRegexp = regexp.MustCompile(`\]\]>`)
+
+	html6Regexp = regexp.MustCompile(`^ {0,3}</?(?i:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:[ \t>]|$|/>)`)
+	html7Regexp = regexp.MustCompile(
+		fmt.Sprintf(`^ {0,3}(?:%s|%s)[ \t]*$`, openTag, closingTag))
+)
+
+const (
+	openTag = `<` +
+		`[a-zA-Z][a-zA-Z0-9-]*` + // tag name
+		(`(?:` +
+			`[ \t\n]+` + // whitespace
+			`[a-zA-Z_:][a-zA-Z0-9_\.:-]*` + // attribute name
+			`(?:[ \t\n]*=[ \t\n]*(?:[^ \t\n"'=<>` + "`" + `]+|'[^']*'|"[^"]*"))?` + // attribute value specification
+			`)*`) + // zero or more attributes
+		`[ \t\n]*` + // whitespace
+		`/?>`
+	closingTag = `</[a-zA-Z][a-zA-Z0-9-]*[ \t\n]*>`
 )

 func (p *blockParser) render() {
 	for p.lines.more() {
 		line := p.lines.next()
 		line, matchedContainers := matchContinuationMarkers(line, p.containers)
-		line, newContainers := parseStartingMarkers(line,
-			matchedContainers != len(p.containers) || len(p.paragraph) == 0)
+		newParagraph := matchedContainers != len(p.containers) || len(p.paragraph) == 0
+		line, newContainers := parseStartingMarkers(line, newParagraph)
 		if len(newContainers) > 0 {
 			p.popParagraph(matchedContainers)
 			for _, c := range newContainers {
@ -96,11 +128,11 @@ func (p *blockParser) render() {
 			matchedContainers = len(p.containers)
 		}

-	afterContainer:
 		if isBlankLine(line) {
 			for i := matchedContainers; i < len(p.containers); i++ {
 				if p.containers[i].typ == blockquote {
 					p.popParagraph(i)
+					p.popList()
 					continue
 				}
 			}
@ -127,39 +159,31 @@ func (p *blockParser) render() {
 			if opener == "" {
 				opener, info = m[4], m[5]
 			}
-			tags := p.syntax.CodeBlock(strings.Trim(info, " \t"))
-			p.sb.WriteString(tags.Start)
-			for p.lines.more() {
-				line = p.lines.next()
-				line, matchedContainers = matchContinuationMarkers(line, p.containers)
-				if isBlankLine(line) {
-					for i := matchedContainers; i < len(p.containers); i++ {
-						if p.containers[i].typ == blockquote {
-							p.sb.WriteString(tags.End)
-							p.sb.WriteByte('\n')
-							p.popParagraph(i)
-							goto afterContainer
-						}
-					}
-				} else if matchedContainers < len(p.containers) {
-					p.sb.WriteString(tags.End)
-					p.sb.WriteByte('\n')
-					goto afterContainer
-				}
-				if m := closingCodeFenceRegexp.FindStringSubmatch(line); m != nil {
-					closer := m[1]
-					if closer[0] == opener[0] && len(closer) >= len(opener) {
-						break
-					}
-				}
-				for i := indent; i > 0 && line != "" && line[0] == ' '; i-- {
-					line = line[1:]
-				}
-				p.sb.WriteString(p.syntax.Escape(line))
-				p.sb.WriteByte('\n')
-			}
-			p.sb.WriteString(tags.End)
-			p.sb.WriteByte('\n')
+			p.parseFencedCodeBlock(indent, opener, info)
+		} else if html1Regexp.MatchString(line) {
+			p.popParagraph(matchedContainers)
+			p.popList()
+			p.parseHTMLBlock(line, html1CloserRegexp.MatchString)
+		} else if html2Regexp.MatchString(line) {
+			p.popParagraph(matchedContainers)
+			p.popList()
+			p.parseHTMLBlock(line, html2CloserRegexp.MatchString)
+		} else if html3Regexp.MatchString(line) {
+			p.popParagraph(matchedContainers)
+			p.popList()
+			p.parseHTMLBlock(line, html3CloserRegexp.MatchString)
+		} else if html4Regexp.MatchString(line) {
+			p.popParagraph(matchedContainers)
+			p.popList()
+			p.parseHTMLBlock(line, html4CloserRegexp.MatchString)
+		} else if html5Regexp.MatchString(line) {
+			p.popParagraph(matchedContainers)
+			p.popList()
+			p.parseHTMLBlock(line, html5CloserRegexp.MatchString)
+		} else if html6Regexp.MatchString(line) || (newParagraph && html7Regexp.MatchString(line)) {
+			p.popParagraph(matchedContainers)
+			p.popList()
+			p.parseBlankLineTerminatedHTMLBlock(line)
 		} else {
 			if len(p.paragraph) == 0 {
 				p.popParagraph(matchedContainers)
@ -235,6 +259,94 @@ func isBlankLine(line string) bool {
 	return strings.Trim(line, " \t") == ""
 }

+func (p *blockParser) parseFencedCodeBlock(indent int, opener, info string) {
+	tags := p.syntax.CodeBlock(strings.Trim(info, " \t"))
+	p.sb.WriteString(tags.Start)
+	for p.lines.more() {
+		line := p.lines.next()
+		line, matchedContainers := matchContinuationMarkers(line, p.containers)
+		if isBlankLine(line) {
+			for i := matchedContainers; i < len(p.containers); i++ {
+				if p.containers[i].typ == blockquote {
+					p.sb.WriteString(tags.End)
+					p.sb.WriteByte('\n')
+					p.popParagraph(i)
+					return
+				}
+			}
+		} else if matchedContainers < len(p.containers) {
+			p.sb.WriteString(tags.End)
+			p.sb.WriteByte('\n')
+			p.lines.backup()
+			return
+		}
+		if m := codeFenceCloserRegexp.FindStringSubmatch(line); m != nil {
+			closer := m[1]
+			if closer[0] == opener[0] && len(closer) >= len(opener) {
+				break
+			}
+		}
+		for i := indent; i > 0 && line != "" && line[0] == ' '; i-- {
+			line = line[1:]
+		}
+		p.sb.WriteString(p.syntax.Escape(line))
+		p.sb.WriteByte('\n')
+	}
+	p.sb.WriteString(tags.End)
+	p.sb.WriteByte('\n')
+}
+
+func (p *blockParser) parseHTMLBlock(line string, closer func(string) bool) {
+	p.sb.WriteString(line)
+	p.sb.WriteByte('\n')
+	if closer(line) {
+		return
+	}
+	for p.lines.more() {
+		line := p.lines.next()
+		line, matchedContainers := matchContinuationMarkers(line, p.containers)
+		if isBlankLine(line) {
+			for i := matchedContainers; i < len(p.containers); i++ {
+				if p.containers[i].typ == blockquote {
+					p.popParagraph(i)
+					return
+				}
+			}
+		} else if matchedContainers < len(p.containers) {
+			p.lines.backup()
+			return
+		}
+		p.sb.WriteString(line)
+		p.sb.WriteByte('\n')
+		if closer(line) {
+			return
+		}
+	}
+}
+
+func (p *blockParser) parseBlankLineTerminatedHTMLBlock(line string) {
+	p.sb.WriteString(line)
+	p.sb.WriteByte('\n')
+	for p.lines.more() {
+		line := p.lines.next()
+		line, matchedContainers := matchContinuationMarkers(line, p.containers)
+		if isBlankLine(line) {
+			for i := matchedContainers; i < len(p.containers); i++ {
+				if p.containers[i].typ == blockquote {
+					p.popParagraph(i)
+					return
+				}
+			}
+			return
+		} else if matchedContainers < len(p.containers) {
+			p.lines.backup()
+			return
+		}
+		p.sb.WriteString(line)
+		p.sb.WriteByte('\n')
+	}
+}
+
 func (p *blockParser) appendContainer(c container) {
 	if len(p.containers) > 0 {
 		leaf := p.containers[len(p.containers)-1]
@ -316,6 +428,13 @@ func (s *lineSplitter) next() string {
 	return s.text[begin : s.pos-1]
 }

+func (s *lineSplitter) backup() {
+	if s.pos == 0 {
+		return
+	}
+	s.pos = 1 + strings.LastIndexByte(s.text[:s.pos-1], '\n')
+}
+
 type container struct {
 	typ    containerType
 	punct  byte
@ -464,18 +583,9 @@ var isASCIIPunct = map[byte]bool{
 }

 var (
-	entityRegexp  = regexp.MustCompile(`^&(?:[a-zA-Z0-9]+|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});`)
-	openTagRegexp = regexp.MustCompile(fmt.Sprintf(`^<`+
-		`[a-zA-Z][a-zA-Z0-9-]*`+ // tag name
-		(`(?:`+
-			`[ \t\n]+`+ // whitespace
-			`[a-zA-Z_:][a-zA-Z0-9_\.:-]*`+ // attribute name
-			`(?:[ \t\n]*=[ \t\n]*(?:[^ \t\n"'=<>%s]+|'[^']*'|"[^"]*"))?`+ // attribute value specification
-			`)*`)+ // zero or more attributes
-		`[ \t\n]*`+ // whitespace
-		`/?>`,
-		"`"))
-	closingTagRegexp = regexp.MustCompile(`^</[a-zA-Z][a-zA-Z0-9-]*[ \t\n]*>`)
+	entityRegexp     = regexp.MustCompile(`^&(?:[a-zA-Z0-9]+|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});`)
+	openTagRegexp    = regexp.MustCompile(`^` + openTag)
+	closingTagRegexp = regexp.MustCompile(`^` + closingTag)
 	autolinkRegexp   = regexp.MustCompile(`^<` +
 		`[a-zA-Z][a-zA-Z0-9+.-]{1,31}` + // scheme
 		`:[^\x00-\x19 <>]*` +
--- a/pkg/md/md_test.go
+++ b/pkg/md/md_test.go
@ -160,7 +160,6 @@ var (
 	linkRef           = regexp.MustCompile(`(^|\n) {0,3}\[([^\\\[\]]|\\[\\\[\]])+\]:`)
 	indentedCodeBlock = regexp.MustCompile("(^|\n)[ >]*(    )")
 	emptyListItem     = regexp.MustCompile(`(^|\n)([-+*]|[0-9]{1,9}[.)])(\n|$)`)
-	htmlBlock         = regexp.MustCompile(`(^|\n)(<a |<!--)`)
 )

 func TestRender(t *testing.T) {
@ -185,9 +184,6 @@ func TestRender(t *testing.T) {
 			if emptyListItem.MatchString(tc.Markdown) {
 				t.Skipf("Empty list item not supported")
 			}
-			if htmlBlock.MatchString(tc.Markdown) {
-				t.Skipf("HTML block not supported")
-			}

 			got := Render(tc.Markdown, htmlSyntax)
 			want := loosifyLists(tc.HTML)
@ -203,7 +199,6 @@ func unsupportedSection(section string) bool {
 	case "Tabs",
 		"Setext headings",
 		"Indented code blocks",
-		"HTML blocks",
 		"Link reference definitions":
 		return true
 	default: