pkg/md: Support thematic breaks, ATX headings and blockquotes.

Also rename and rearrange the fields of OutputSyntax to match the names and the
order in which they appear in the CommonMark spec.
This commit is contained in:
Qi Xiao 2022-10-01 22:43:22 +01:00
parent befab9b5f0
commit 8d85b89a05
2 changed files with 214 additions and 105 deletions

View File

@ -24,11 +24,14 @@ import (
// OutputSyntax specifies the output syntax.
type OutputSyntax struct {
// ThematicBreak receives the text of a thematic break line and returns the
// markup to emit for it.
ThematicBreak func(original string) string
// Heading returns the tag pair wrapped around an ATX heading of the given
// level, which is the number of "#" characters in the opening sequence.
Heading func(level int) TagPair
// Paragraph is the tag pair wrapped around a paragraph.
Paragraph TagPair
Code TagPair
Em TagPair
Strong TagPair
Link func(dest, title string) (string, string)
// Blockquote is the tag pair wrapped around the content of a blockquote.
Blockquote TagPair
// CodeSpan is the tag pair wrapped around the content of a code span.
CodeSpan TagPair
// Emphasis is the tag pair wrapped around emphasized text.
Emphasis TagPair
// StrongEmphasis is the tag pair wrapped around strongly emphasized text.
StrongEmphasis TagPair
// Link returns the tag pair wrapped around the text of a link with the
// given destination and title.
Link func(dest, title string) TagPair
// Image returns the complete markup for an image.
Image func(dest, alt, title string) string
// Escape is applied to text before it is written to the output, for
// example to the content of code spans.
Escape func(string) string
}
@ -43,7 +46,6 @@ func Render(text string, syntax OutputSyntax) string {
p := blockParser{
lines: lineSplitter{text, 0},
syntax: syntax,
blocks: []block{{typ: documentBlock}},
}
p.render()
return p.sb.String()
@ -52,70 +54,97 @@ func Render(text string, syntax OutputSyntax) string {
// blockParser holds the state used while parsing block structure and
// rendering the output.
type blockParser struct {
// lines yields the input one line at a time.
lines lineSplitter
// syntax supplies the markup written to the output.
syntax OutputSyntax
blocks []block
// containers is the stack of currently open containers, outermost first.
containers []container
// paragraph holds the lines of the paragraph being collected; it is empty
// when no paragraph is pending.
paragraph []string
// sb accumulates the rendered output.
sb strings.Builder
}
// Regular expressions used by blockParser.render to classify a line.
var (
// blockquoteMarkerRegexp matches a blockquote marker at the start of a
// line: up to three spaces, a ">", and an optional following space.
blockquoteMarkerRegexp = regexp.MustCompile(`^ {0,3}> ?`)
// thematicBreakRegexp matches a whole line consisting of optional leading
// spaces or tabs and three or more of the same character among "-", "_"
// and "*", each optionally followed by spaces or tabs.
thematicBreakRegexp = regexp.MustCompile(`^[ \t]*((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$`)
// atxHeadingRegexp matches the opening of an ATX heading: optional leading
// spaces and one to six "#" characters (captured), followed by a space, a
// tab or the end of the line.
atxHeadingRegexp = regexp.MustCompile(`^ *(#{1,6})(?:[ \t]|$)`)
// atxHeadingCloserRegexp matches an optional closing sequence at the end of
// an ATX heading: a space or tab, one or more "#" characters, and optional
// trailing spaces or tabs.
atxHeadingCloserRegexp = regexp.MustCompile(`[ \t]#+[ \t]*$`)
)
// render consumes the input line by line, tracking open containers and the
// pending paragraph, and writes the rendered output to p.sb.
func (p *blockParser) render() {
for p.lines.more() {
line := p.lines.next()
if line == "\n" {
switch p.leaf().typ {
case documentBlock:
// Nothing to do
case paragraphBlock:
p.pop()
// Match the markers of the currently open containers against the start of
// the line, stripping each matched marker. Afterwards i is the number of
// open containers that continue on this line.
i := 0
for i = 0; i < len(p.containers); i++ {
markerLen := p.containers[i].findMarker(line)
if markerLen == 0 {
break
}
line = line[markerLen:]
}
// Any further blockquote markers open new, more deeply nested containers.
// The pending paragraph and the containers that did not continue are
// closed first.
if m := blockquoteMarkerRegexp.FindString(line); m != "" {
p.popParagraph(i)
for m != "" {
p.appendContainer(container{typ: blockquoteContainer})
line = line[len(m):]
m = blockquoteMarkerRegexp.FindString(line)
}
i = len(p.containers)
}
// What remains of the line is classified as a blank line, a thematic break,
// an ATX heading, or paragraph text.
if strings.Trim(line, " \t") == "" {
p.popParagraph(i)
} else if thematicBreakRegexp.MatchString(line) {
p.popParagraph(i)
p.sb.WriteString(p.syntax.ThematicBreak(line))
p.sb.WriteByte('\n')
} else if m := atxHeadingRegexp.FindStringSubmatchIndex(line); m != nil {
p.popParagraph(i)
// ATX headings always span one line only, so render it right away
// without pushing a node.
openerStart, openerEnd := m[2], m[3]
opener := line[openerStart:openerEnd]
line = strings.TrimRight(line[openerEnd:], " \t")
if closer := atxHeadingCloserRegexp.FindString(line); closer != "" {
line = line[:len(line)-len(closer)]
}
p.renderLeaf(p.syntax.Heading(len(opener)), strings.Trim(line, " \t"))
} else {
switch p.leaf().typ {
case documentBlock:
p.push(paragraphBlock).text.WriteString(line)
case paragraphBlock:
p.leaf().text.WriteString(line)
// When a paragraph is already being collected, the line is appended to it
// even if some containers did not match; unmatched containers are only
// closed when no paragraph is pending.
if len(p.paragraph) == 0 {
p.popParagraph(i)
}
p.addParagraphLine(line)
}
}
}
for len(p.blocks) > 0 {
p.pop()
}
// At the end of input, flush the pending paragraph and close all containers.
p.popParagraph(0)
}
// push appends a new block of the given type to p.blocks and returns a pointer
// to it. For a paragraph block, the paragraph start markup is written to the
// output first.
func (p *blockParser) push(typ blockType) *block {
switch typ {
case paragraphBlock:
p.sb.WriteString(p.syntax.Paragraph.Start)
}
p.blocks = append(p.blocks, block{typ: typ})
return p.leaf()
}
// leaf returns a pointer to the last block in p.blocks.
func (p *blockParser) leaf() *block { return &p.blocks[len(p.blocks)-1] }
func (p *blockParser) pop() {
leaf := p.leaf()
switch leaf.typ {
case paragraphBlock:
text := strings.Trim(strings.TrimSuffix(leaf.text.String(), "\n"), " \t")
p.sb.WriteString(renderInline(text, p.syntax))
p.sb.WriteString(p.syntax.Paragraph.End)
// appendContainer opens the container c: its start markup, followed by a
// newline, is written to the output and c becomes the innermost open
// container.
func (p *blockParser) appendContainer(c container) {
	opening := c.tagPair(&p.syntax).Start
	p.sb.WriteString(opening + "\n")
	p.containers = append(p.containers, c)
}
p.blocks = p.blocks[:len(p.blocks)-1]
// addParagraphLine appends line to the lines of the paragraph being collected.
func (p *blockParser) addParagraphLine(line string) {
	pending := append(p.paragraph, line)
	p.paragraph = pending
}
type block struct {
typ blockType
text strings.Builder
// popParagraph flushes the pending paragraph, if any, and then closes every
// open container at index i or deeper, leaving containers[:i] open.
//
// The paragraph lines are joined with newlines, stripped of leading and
// trailing spaces and tabs, and rendered as a leaf block with the Paragraph
// tags. Passing i == len(p.containers) closes no container; passing 0 closes
// all of them.
func (p *blockParser) popParagraph(i int) {
	if len(p.paragraph) > 0 {
		text := strings.Trim(strings.Join(p.paragraph, "\n"), " \t")
		p.renderLeaf(p.syntax.Paragraph, text)
		// Keep the backing array for the next paragraph.
		p.paragraph = p.paragraph[:0]
	}
	// Close containers innermost first so that end markup nests correctly.
	// Each iteration must use the container at j, the one being closed.
	for j := len(p.containers) - 1; j >= i; j-- {
		p.sb.WriteString(p.containers[j].tagPair(&p.syntax).End)
		p.sb.WriteByte('\n')
	}
	p.containers = p.containers[:i]
}
type blockType uint
// renderLeaf writes one leaf block to the output: the start markup, the
// inline-rendered content, the end markup, and a trailing newline.
func (p *blockParser) renderLeaf(tags TagPair, content string) {
	inner := renderInline(content, p.syntax)
	p.sb.WriteString(tags.Start + inner + tags.End + "\n")
}
// Values of blockType.
const (
// documentBlock is the type of the block at the bottom of the block stack.
documentBlock blockType = iota
// paragraphBlock is the type of a block that accumulates paragraph text.
paragraphBlock
)
// Splits a string into lines, preserving the trailing newlines.
type lineSplitter struct {
text string
pos int
@ -133,7 +162,33 @@ func (s *lineSplitter) next() string {
return s.text[begin:]
}
s.pos += delta + 1
return s.text[begin:s.pos]
return s.text[begin : s.pos-1]
}
// container describes one open container on the parser's container stack.
type container struct {
// typ selects the marker pattern and output markup for this container.
typ containerType
}
// containerType identifies the kind of a container.
type containerType uint8
const (
// blockquoteContainer is a blockquote, introduced by a ">" marker.
blockquoteContainer containerType = iota
)
// findMarker returns the length of this container's marker at the start of
// line, or 0 if the line does not start with one. It panics on an unknown
// container type.
func (c container) findMarker(line string) int {
	if c.typ == blockquoteContainer {
		marker := blockquoteMarkerRegexp.FindString(line)
		return len(marker)
	}
	panic("unreachable")
}
// tagPair returns the markup from syntax that wraps this container's content.
// It panics on an unknown container type.
func (c container) tagPair(syntax *OutputSyntax) TagPair {
	if c.typ != blockquoteContainer {
		panic("unreachable")
	}
	return syntax.Blockquote
}
type buffer struct {
@ -313,9 +368,9 @@ func (p *inlineParser) render() {
}
unlink(opener)
if opener.typ == '[' {
start, end := p.syntax.Link(dest, title)
p.buf.pieces[opener.bufIdx] = piece{appendMarkup: []string{start}}
p.buf.push(piece{appendMarkup: []string{end}})
tags := p.syntax.Link(dest, title)
p.buf.pieces[opener.bufIdx] = piece{appendMarkup: []string{tags.Start}}
p.buf.push(piece{appendMarkup: []string{tags.End}})
} else {
var altBuilder strings.Builder
for _, piece := range p.buf.pieces[opener.bufIdx+1:] {
@ -363,9 +418,9 @@ func (p *inlineParser) render() {
continue
}
p.buf.push(piece{
prependMarkup: []string{p.syntax.Code.Start},
prependMarkup: []string{p.syntax.CodeSpan.Start},
text: p.syntax.Escape(normalizeCodeSpanContent(p.text[p.pos:closer])),
appendMarkup: []string{p.syntax.Code.End}})
appendMarkup: []string{p.syntax.CodeSpan.End}})
p.pos = closer + (p.pos - begin)
case '<':
if p.pos == len(p.text) {
@ -441,11 +496,11 @@ func (p *inlineParser) render() {
if email {
dest = "mailto:" + dest
}
start, end := p.syntax.Link(dest, "")
tags := p.syntax.Link(dest, "")
p.buf.push(piece{
prependMarkup: []string{start},
prependMarkup: []string{tags.Start},
text: text,
appendMarkup: []string{end},
appendMarkup: []string{tags.End},
})
continue
}
@ -467,6 +522,7 @@ func (p *inlineParser) render() {
}
parseText()
case '\n':
if len(p.buf.pieces) > 0 {
last := &p.buf.pieces[len(p.buf.pieces)-1]
if last.prependMarkup == nil && last.appendMarkup == nil {
if p.pos == len(p.text) {
@ -485,6 +541,7 @@ func (p *inlineParser) render() {
}
}
}
}
p.buf.push(piece{text: "\n"})
for p.pos < len(p.text) && p.text[p.pos] == ' ' {
p.pos++
@ -530,14 +587,14 @@ func (p *inlineParser) processEmphasis(bottom *delim) {
strong := len(openerPiece.text) >= 2 && len(closerPiece.text) >= 2
if strong {
openerPiece.text = openerPiece.text[2:]
openerPiece.appendMarkup = append(openerPiece.appendMarkup, p.syntax.Strong.Start)
openerPiece.appendMarkup = append(openerPiece.appendMarkup, p.syntax.StrongEmphasis.Start)
closerPiece.text = closerPiece.text[2:]
closerPiece.prependMarkup = append(closerPiece.prependMarkup, p.syntax.Strong.End)
closerPiece.prependMarkup = append(closerPiece.prependMarkup, p.syntax.StrongEmphasis.End)
} else {
openerPiece.text = openerPiece.text[1:]
openerPiece.appendMarkup = append(openerPiece.appendMarkup, p.syntax.Em.Start)
openerPiece.appendMarkup = append(openerPiece.appendMarkup, p.syntax.Emphasis.Start)
closerPiece.text = closerPiece.text[1:]
closerPiece.prependMarkup = append(closerPiece.prependMarkup, p.syntax.Em.End)
closerPiece.prependMarkup = append(closerPiece.prependMarkup, p.syntax.Emphasis.End)
}
opener.next = closer
closer.prev = opener

View File

@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"regexp"
"strconv"
"strings"
"testing"
@ -12,18 +13,52 @@ import (
"src.elv.sh/pkg/must"
)
//go:embed spec.json
var specJSON []byte
var spec []struct {
// testCase is one rendering test: a Markdown input and the HTML expected from
// rendering it.
type testCase struct {
// Markdown is the input text.
Markdown string `json:"markdown"`
// HTML is the expected output.
HTML string `json:"html"`
// Example and Section identify a case loaded from spec.json; together they
// form the subtest name when Name is empty.
Example int `json:"example"`
Section string `json:"section"`
// Name, when non-empty, is used as the subtest name.
Name string
}
//go:embed spec.json
var specJSON []byte
var testCases []testCase
// additionalCases are hand-written test cases appended to those loaded from
// spec.json. They cover blockquote nesting that gets deeper or shallower from
// one line to the next.
var additionalCases = []testCase{
{
Markdown: `> a
>> b
`,
HTML: `<blockquote>
<p>a</p>
<blockquote>
<p>b</p>
</blockquote>
</blockquote>
`,
Name: "Increasing blockquote level",
},
{
Markdown: `>> a
>
> b
`,
HTML: `<blockquote>
<blockquote>
<p>a</p>
</blockquote>
<p>b</p>
</blockquote>
`,
Name: "Reducing blockquote level",
},
}
// init decodes the embedded spec.json into the test case list and appends the
// hand-written additionalCases.
func init() {
must.OK(json.Unmarshal(specJSON, &spec))
must.OK(json.Unmarshal(specJSON, &testCases))
testCases = append(testCases, additionalCases...)
}
var (
@ -37,18 +72,24 @@ var (
)
var htmlSyntax = OutputSyntax{
ThematicBreak: func(_ string) string { return "<hr />" },
Heading: func(level int) TagPair {
tag := "h" + strconv.Itoa(level)
return TagPair{Start: "<" + tag + ">", End: "</" + tag + ">"}
},
Paragraph: TagPair{Start: "<p>", End: "</p>"},
Code: TagPair{Start: "<code>", End: "</code>"},
Em: TagPair{Start: "<em>", End: "</em>"},
Strong: TagPair{Start: "<strong>", End: "</strong>"},
Link: func(dest, title string) (string, string) {
Blockquote: TagPair{Start: "<blockquote>", End: "</blockquote>"},
CodeSpan: TagPair{Start: "<code>", End: "</code>"},
Emphasis: TagPair{Start: "<em>", End: "</em>"},
StrongEmphasis: TagPair{Start: "<strong>", End: "</strong>"},
Link: func(dest, title string) TagPair {
start := ""
if title == "" {
start = fmt.Sprintf(`<a href="%s">`, escapeDest(dest))
} else {
start = fmt.Sprintf(`<a href="%s" title="%s">`, escapeDest(dest), escapeHTML(title))
}
return start, "</a>"
return TagPair{Start: start, End: "</a>"}
},
Image: func(dest, alt, title string) string {
if title == "" {
@ -61,19 +102,24 @@ var htmlSyntax = OutputSyntax{
// Patterns for recognizing Markdown constructs in test inputs. TestRender uses
// codeBlock to skip examples that contain code blocks; the other patterns
// presumably serve the same purpose for their constructs (their use is not
// fully visible here).
var (
// linkRef matches a line that starts with a bracketed label followed by ":".
linkRef = regexp.MustCompile(`(^|\n)\[([^\\\[\]]|\\[\\\[\]])+\]:`)
listItem = regexp.MustCompile(`(^|\n)\* `)
// listItem matches a line that starts with "* " or "- ".
listItem = regexp.MustCompile(`(^|\n)[*-] `)
// codeBlock matches a line that, after any number of ">" characters, starts
// with a code fence ("```" or "~~~") or with whitespace.
codeBlock = regexp.MustCompile("(^|\n)>*(```|~~~| )")
)
func TestRender(t *testing.T) {
for _, tc := range spec {
t.Run(fmt.Sprintf("%s/%d", tc.Section, tc.Example), func(t *testing.T) {
if !supportedSection(tc.Section) {
for _, tc := range testCases {
name := tc.Name
if name == "" {
name = fmt.Sprintf("%s/%d", tc.Section, tc.Example)
}
t.Run(name, func(t *testing.T) {
if unsupportedSection(tc.Section) {
t.Skipf("Section %q not supported", tc.Section)
}
if strings.HasPrefix(tc.Markdown, "#") {
t.Skipf("Header not supported")
if reason := unsupportedExample(tc.Example); reason != "" {
t.Skipf("Example %d not supported: %s", tc.Example, reason)
}
if strings.HasPrefix(tc.Markdown, "```") || strings.HasPrefix(tc.Markdown, "~~~") || strings.HasPrefix(tc.Markdown, " ") {
if codeBlock.MatchString(tc.Markdown) {
t.Skipf("Code block not supported")
}
if linkRef.MatchString(tc.Markdown) {
@ -94,23 +140,29 @@ func TestRender(t *testing.T) {
}
}
func supportedSection(section string) bool {
// unsupportedSection reports whether examples from the named spec section
// should be skipped by TestRender because the section is not supported yet.
func unsupportedSection(section string) bool {
switch section {
case "Tabs",
"Precedence",
"Thematic breaks",
"ATX headings",
"Setext headings",
"Indented code blocks",
"Fenced code blocks",
"HTML blocks",
"Link reference definitions",
"Blank lines",
"Block quotes",
"List items",
"Lists":
return false
default:
return true
default:
return false
}
}
// unsupportedExample returns the reason why the spec example with the given
// number is not supported, or an empty string if there is no such reason.
func unsupportedExample(example int) string {
	if example == 59 {
		return "has setext heading"
	}
	return ""
}