pkg/md: Support thematic breaks, ATX headings and blockquotes.

Also rearrange fields of OutputSyntax to match the order they appear in the
CommonMark spec.
This commit is contained in:
Qi Xiao 2022-10-01 22:43:22 +01:00
parent befab9b5f0
commit 8d85b89a05
2 changed files with 214 additions and 105 deletions

View File

@ -24,13 +24,16 @@ import (
// OutputSyntax specifies the output syntax. // OutputSyntax specifies the output syntax.
type OutputSyntax struct { type OutputSyntax struct {
Paragraph TagPair ThematicBreak func(original string) string
Code TagPair Heading func(level int) TagPair
Em TagPair Paragraph TagPair
Strong TagPair Blockquote TagPair
Link func(dest, title string) (string, string) CodeSpan TagPair
Image func(dest, alt, title string) string Emphasis TagPair
Escape func(string) string StrongEmphasis TagPair
Link func(dest, title string) TagPair
Image func(dest, alt, title string) string
Escape func(string) string
} }
// TagPair specifies a pair of "tags" to enclose a construct in the output. // TagPair specifies a pair of "tags" to enclose a construct in the output.
@ -43,79 +46,105 @@ func Render(text string, syntax OutputSyntax) string {
p := blockParser{ p := blockParser{
lines: lineSplitter{text, 0}, lines: lineSplitter{text, 0},
syntax: syntax, syntax: syntax,
blocks: []block{{typ: documentBlock}},
} }
p.render() p.render()
return p.sb.String() return p.sb.String()
} }
type blockParser struct { type blockParser struct {
lines lineSplitter lines lineSplitter
syntax OutputSyntax syntax OutputSyntax
blocks []block containers []container
sb strings.Builder paragraph []string
sb strings.Builder
} }
// Patterns used by the block parser to recognize block-level constructs on a
// single line of input.
var (
// Matches a blockquote marker at the start of a line: up to three spaces,
// a '>' and one optional space after it.
blockquoteMarkerRegexp = regexp.MustCompile(`^ {0,3}> ?`)
// Matches an entire line made up of three or more '-', three or more '_'
// or three or more '*' (the three kinds cannot be mixed), where each
// character may be followed by spaces or tabs and the line may start with
// spaces or tabs.
thematicBreakRegexp = regexp.MustCompile(`^[ \t]*((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$`)
// Matches the opening of an ATX heading: optional leading spaces, then one
// to six '#' (capture group 1), followed by a space, a tab or the end of
// the line. The length of group 1 is used as the heading level.
atxHeadingRegexp = regexp.MustCompile(`^ *(#{1,6})(?:[ \t]|$)`)
// Matches an ATX heading's closing sequence at the end of a line: a space
// or tab, one or more '#', then optional trailing spaces or tabs.
atxHeadingCloserRegexp = regexp.MustCompile(`[ \t]#+[ \t]*$`)
)
func (p *blockParser) render() { func (p *blockParser) render() {
for p.lines.more() { for p.lines.more() {
line := p.lines.next() line := p.lines.next()
if line == "\n" { i := 0
switch p.leaf().typ { for i = 0; i < len(p.containers); i++ {
case documentBlock: markerLen := p.containers[i].findMarker(line)
// Nothing to do if markerLen == 0 {
case paragraphBlock: break
p.pop()
} }
line = line[markerLen:]
}
if m := blockquoteMarkerRegexp.FindString(line); m != "" {
p.popParagraph(i)
for m != "" {
p.appendContainer(container{typ: blockquoteContainer})
line = line[len(m):]
m = blockquoteMarkerRegexp.FindString(line)
}
i = len(p.containers)
}
if strings.Trim(line, " \t") == "" {
p.popParagraph(i)
} else if thematicBreakRegexp.MatchString(line) {
p.popParagraph(i)
p.sb.WriteString(p.syntax.ThematicBreak(line))
p.sb.WriteByte('\n')
} else if m := atxHeadingRegexp.FindStringSubmatchIndex(line); m != nil {
p.popParagraph(i)
// ATX headings always span one line only, so render it right away
// without pushing a node.
openerStart, openerEnd := m[2], m[3]
opener := line[openerStart:openerEnd]
line = strings.TrimRight(line[openerEnd:], " \t")
if closer := atxHeadingCloserRegexp.FindString(line); closer != "" {
line = line[:len(line)-len(closer)]
}
p.renderLeaf(p.syntax.Heading(len(opener)), strings.Trim(line, " \t"))
} else { } else {
switch p.leaf().typ { if len(p.paragraph) == 0 {
case documentBlock: p.popParagraph(i)
p.push(paragraphBlock).text.WriteString(line)
case paragraphBlock:
p.leaf().text.WriteString(line)
} }
p.addParagraphLine(line)
} }
} }
for len(p.blocks) > 0 { p.popParagraph(0)
p.pop()
}
} }
func (p *blockParser) push(typ blockType) *block { func (p *blockParser) appendContainer(c container) {
switch typ { p.containers = append(p.containers, c)
case paragraphBlock: p.sb.WriteString(c.tagPair(&p.syntax).Start)
p.sb.WriteString(p.syntax.Paragraph.Start) p.sb.WriteByte('\n')
}
p.blocks = append(p.blocks, block{typ: typ})
return p.leaf()
} }
func (p *blockParser) leaf() *block { return &p.blocks[len(p.blocks)-1] } func (p *blockParser) addParagraphLine(line string) {
p.paragraph = append(p.paragraph, line)
}
func (p *blockParser) pop() { func (p *blockParser) popParagraph(i int) {
leaf := p.leaf() if len(p.paragraph) > 0 {
switch leaf.typ { text := strings.Trim(strings.Join(p.paragraph, "\n"), " \t")
case paragraphBlock: p.renderLeaf(p.syntax.Paragraph, text)
text := strings.Trim(strings.TrimSuffix(leaf.text.String(), "\n"), " \t") p.paragraph = p.paragraph[:0]
p.sb.WriteString(renderInline(text, p.syntax)) }
p.sb.WriteString(p.syntax.Paragraph.End) for j := len(p.containers) - 1; j >= i; j-- {
p.sb.WriteString(p.containers[i].tagPair(&p.syntax).End)
p.sb.WriteByte('\n') p.sb.WriteByte('\n')
} }
p.blocks = p.blocks[:len(p.blocks)-1] p.containers = p.containers[:i]
} }
type block struct { func (p *blockParser) renderLeaf(tags TagPair, content string) {
typ blockType p.sb.WriteString(tags.Start)
text strings.Builder p.sb.WriteString(renderInline(content, p.syntax))
p.sb.WriteString(tags.End)
p.sb.WriteByte('\n')
} }
type blockType uint
const (
documentBlock blockType = iota
paragraphBlock
)
// Splits a string into lines, preserving the trailing newlines.
type lineSplitter struct { type lineSplitter struct {
text string text string
pos int pos int
@ -133,7 +162,33 @@ func (s *lineSplitter) next() string {
return s.text[begin:] return s.text[begin:]
} }
s.pos += delta + 1 s.pos += delta + 1
return s.text[begin:s.pos] return s.text[begin : s.pos-1]
}
// container is an element of blockParser.containers. It only records which
// kind of container it is; the marker syntax and the output tags are looked up
// from that kind.
type container struct {
typ containerType
}
// containerType identifies the kind of a container.
type containerType uint8
// Known container kinds. Blockquote is the only kind declared here.
const (
blockquoteContainer containerType = iota
)
// findMarker returns the length in bytes of this container's marker at the
// start of line, or 0 when line does not start with the marker. It panics if
// the container has a type it does not know about.
func (c container) findMarker(line string) int {
	if c.typ == blockquoteContainer {
		marker := blockquoteMarkerRegexp.FindString(line)
		return len(marker)
	}
	panic("unreachable")
}
func (c container) tagPair(syntax *OutputSyntax) TagPair {
switch c.typ {
case blockquoteContainer:
return syntax.Blockquote
}
panic("unreachable")
} }
type buffer struct { type buffer struct {
@ -313,9 +368,9 @@ func (p *inlineParser) render() {
} }
unlink(opener) unlink(opener)
if opener.typ == '[' { if opener.typ == '[' {
start, end := p.syntax.Link(dest, title) tags := p.syntax.Link(dest, title)
p.buf.pieces[opener.bufIdx] = piece{appendMarkup: []string{start}} p.buf.pieces[opener.bufIdx] = piece{appendMarkup: []string{tags.Start}}
p.buf.push(piece{appendMarkup: []string{end}}) p.buf.push(piece{appendMarkup: []string{tags.End}})
} else { } else {
var altBuilder strings.Builder var altBuilder strings.Builder
for _, piece := range p.buf.pieces[opener.bufIdx+1:] { for _, piece := range p.buf.pieces[opener.bufIdx+1:] {
@ -363,9 +418,9 @@ func (p *inlineParser) render() {
continue continue
} }
p.buf.push(piece{ p.buf.push(piece{
prependMarkup: []string{p.syntax.Code.Start}, prependMarkup: []string{p.syntax.CodeSpan.Start},
text: p.syntax.Escape(normalizeCodeSpanContent(p.text[p.pos:closer])), text: p.syntax.Escape(normalizeCodeSpanContent(p.text[p.pos:closer])),
appendMarkup: []string{p.syntax.Code.End}}) appendMarkup: []string{p.syntax.CodeSpan.End}})
p.pos = closer + (p.pos - begin) p.pos = closer + (p.pos - begin)
case '<': case '<':
if p.pos == len(p.text) { if p.pos == len(p.text) {
@ -441,11 +496,11 @@ func (p *inlineParser) render() {
if email { if email {
dest = "mailto:" + dest dest = "mailto:" + dest
} }
start, end := p.syntax.Link(dest, "") tags := p.syntax.Link(dest, "")
p.buf.push(piece{ p.buf.push(piece{
prependMarkup: []string{start}, prependMarkup: []string{tags.Start},
text: text, text: text,
appendMarkup: []string{end}, appendMarkup: []string{tags.End},
}) })
continue continue
} }
@ -467,21 +522,23 @@ func (p *inlineParser) render() {
} }
parseText() parseText()
case '\n': case '\n':
last := &p.buf.pieces[len(p.buf.pieces)-1] if len(p.buf.pieces) > 0 {
if last.prependMarkup == nil && last.appendMarkup == nil { last := &p.buf.pieces[len(p.buf.pieces)-1]
if p.pos == len(p.text) { if last.prependMarkup == nil && last.appendMarkup == nil {
last.text = strings.TrimRight(last.text, " ") if p.pos == len(p.text) {
} else {
hardLineBreak := false
if strings.HasSuffix(last.text, "\\") {
hardLineBreak = true
last.text = last.text[:len(last.text)-1]
} else {
hardLineBreak = strings.HasSuffix(last.text, " ")
last.text = strings.TrimRight(last.text, " ") last.text = strings.TrimRight(last.text, " ")
} } else {
if hardLineBreak { hardLineBreak := false
p.buf.push(piece{prependMarkup: []string{"<br />"}}) if strings.HasSuffix(last.text, "\\") {
hardLineBreak = true
last.text = last.text[:len(last.text)-1]
} else {
hardLineBreak = strings.HasSuffix(last.text, " ")
last.text = strings.TrimRight(last.text, " ")
}
if hardLineBreak {
p.buf.push(piece{prependMarkup: []string{"<br />"}})
}
} }
} }
} }
@ -530,14 +587,14 @@ func (p *inlineParser) processEmphasis(bottom *delim) {
strong := len(openerPiece.text) >= 2 && len(closerPiece.text) >= 2 strong := len(openerPiece.text) >= 2 && len(closerPiece.text) >= 2
if strong { if strong {
openerPiece.text = openerPiece.text[2:] openerPiece.text = openerPiece.text[2:]
openerPiece.appendMarkup = append(openerPiece.appendMarkup, p.syntax.Strong.Start) openerPiece.appendMarkup = append(openerPiece.appendMarkup, p.syntax.StrongEmphasis.Start)
closerPiece.text = closerPiece.text[2:] closerPiece.text = closerPiece.text[2:]
closerPiece.prependMarkup = append(closerPiece.prependMarkup, p.syntax.Strong.End) closerPiece.prependMarkup = append(closerPiece.prependMarkup, p.syntax.StrongEmphasis.End)
} else { } else {
openerPiece.text = openerPiece.text[1:] openerPiece.text = openerPiece.text[1:]
openerPiece.appendMarkup = append(openerPiece.appendMarkup, p.syntax.Em.Start) openerPiece.appendMarkup = append(openerPiece.appendMarkup, p.syntax.Emphasis.Start)
closerPiece.text = closerPiece.text[1:] closerPiece.text = closerPiece.text[1:]
closerPiece.prependMarkup = append(closerPiece.prependMarkup, p.syntax.Em.End) closerPiece.prependMarkup = append(closerPiece.prependMarkup, p.syntax.Emphasis.End)
} }
opener.next = closer opener.next = closer
closer.prev = opener closer.prev = opener

View File

@ -5,6 +5,7 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"regexp" "regexp"
"strconv"
"strings" "strings"
"testing" "testing"
@ -12,18 +13,52 @@ import (
"src.elv.sh/pkg/must" "src.elv.sh/pkg/must"
) )
//go:embed spec.json type testCase struct {
var specJSON []byte
var spec []struct {
Markdown string `json:"markdown"` Markdown string `json:"markdown"`
HTML string `json:"html"` HTML string `json:"html"`
Example int `json:"example"` Example int `json:"example"`
Section string `json:"section"` Section string `json:"section"`
Name string
}
//go:embed spec.json
var specJSON []byte
var testCases []testCase
var additionalCases = []testCase{
{
Markdown: `> a
>> b
`,
HTML: `<blockquote>
<p>a</p>
<blockquote>
<p>b</p>
</blockquote>
</blockquote>
`,
Name: "Increasing blockquote level",
},
{
Markdown: `>> a
>
> b
`,
HTML: `<blockquote>
<blockquote>
<p>a</p>
</blockquote>
<p>b</p>
</blockquote>
`,
Name: "Reducing blockquote level",
},
} }
func init() { func init() {
must.OK(json.Unmarshal(specJSON, &spec)) must.OK(json.Unmarshal(specJSON, &testCases))
testCases = append(testCases, additionalCases...)
} }
var ( var (
@ -37,18 +72,24 @@ var (
) )
var htmlSyntax = OutputSyntax{ var htmlSyntax = OutputSyntax{
Paragraph: TagPair{Start: "<p>", End: "</p>"}, ThematicBreak: func(_ string) string { return "<hr />" },
Code: TagPair{Start: "<code>", End: "</code>"}, Heading: func(level int) TagPair {
Em: TagPair{Start: "<em>", End: "</em>"}, tag := "h" + strconv.Itoa(level)
Strong: TagPair{Start: "<strong>", End: "</strong>"}, return TagPair{Start: "<" + tag + ">", End: "</" + tag + ">"}
Link: func(dest, title string) (string, string) { },
Paragraph: TagPair{Start: "<p>", End: "</p>"},
Blockquote: TagPair{Start: "<blockquote>", End: "</blockquote>"},
CodeSpan: TagPair{Start: "<code>", End: "</code>"},
Emphasis: TagPair{Start: "<em>", End: "</em>"},
StrongEmphasis: TagPair{Start: "<strong>", End: "</strong>"},
Link: func(dest, title string) TagPair {
start := "" start := ""
if title == "" { if title == "" {
start = fmt.Sprintf(`<a href="%s">`, escapeDest(dest)) start = fmt.Sprintf(`<a href="%s">`, escapeDest(dest))
} else { } else {
start = fmt.Sprintf(`<a href="%s" title="%s">`, escapeDest(dest), escapeHTML(title)) start = fmt.Sprintf(`<a href="%s" title="%s">`, escapeDest(dest), escapeHTML(title))
} }
return start, "</a>" return TagPair{Start: start, End: "</a>"}
}, },
Image: func(dest, alt, title string) string { Image: func(dest, alt, title string) string {
if title == "" { if title == "" {
@ -60,20 +101,25 @@ var htmlSyntax = OutputSyntax{
} }
var ( var (
linkRef = regexp.MustCompile(`(^|\n)\[([^\\\[\]]|\\[\\\[\]])+\]:`) linkRef = regexp.MustCompile(`(^|\n)\[([^\\\[\]]|\\[\\\[\]])+\]:`)
listItem = regexp.MustCompile(`(^|\n)\* `) listItem = regexp.MustCompile(`(^|\n)[*-] `)
codeBlock = regexp.MustCompile("(^|\n)>*(```|~~~| )")
) )
func TestRender(t *testing.T) { func TestRender(t *testing.T) {
for _, tc := range spec { for _, tc := range testCases {
t.Run(fmt.Sprintf("%s/%d", tc.Section, tc.Example), func(t *testing.T) { name := tc.Name
if !supportedSection(tc.Section) { if name == "" {
name = fmt.Sprintf("%s/%d", tc.Section, tc.Example)
}
t.Run(name, func(t *testing.T) {
if unsupportedSection(tc.Section) {
t.Skipf("Section %q not supported", tc.Section) t.Skipf("Section %q not supported", tc.Section)
} }
if strings.HasPrefix(tc.Markdown, "#") { if reason := unsupportedExample(tc.Example); reason != "" {
t.Skipf("Header not supported") t.Skipf("Example %d not supported: %s", tc.Example, reason)
} }
if strings.HasPrefix(tc.Markdown, "```") || strings.HasPrefix(tc.Markdown, "~~~") || strings.HasPrefix(tc.Markdown, " ") { if codeBlock.MatchString(tc.Markdown) {
t.Skipf("Code block not supported") t.Skipf("Code block not supported")
} }
if linkRef.MatchString(tc.Markdown) { if linkRef.MatchString(tc.Markdown) {
@ -94,23 +140,29 @@ func TestRender(t *testing.T) {
} }
} }
func supportedSection(section string) bool { func unsupportedSection(section string) bool {
switch section { switch section {
case "Tabs", case "Tabs",
"Precedence", "Precedence",
"Thematic breaks",
"ATX headings",
"Setext headings", "Setext headings",
"Indented code blocks", "Indented code blocks",
"Fenced code blocks", "Fenced code blocks",
"HTML blocks", "HTML blocks",
"Link reference definitions", "Link reference definitions",
"Blank lines", "Blank lines",
"Block quotes",
"List items", "List items",
"Lists": "Lists":
return false
default:
return true return true
default:
return false
}
}
func unsupportedExample(example int) string {
switch example {
case 59:
return "has setext heading"
default:
return ""
} }
} }