publisher: Make the HTML element collector more robust
authorBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Thu, 13 May 2021 11:10:32 +0000 (13:10 +0200)
committerBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Mon, 17 May 2021 15:15:32 +0000 (17:15 +0200)
Fixes #8530

common/text/transform.go
publisher/htmlElementsCollector.go
publisher/htmlElementsCollector_test.go

index f5957780363cebd96b2b3fc36ebe0f772b48d69a..2d51f6c33b680604c81109206b93642b9a7e3cbd 100644 (file)
@@ -45,3 +45,25 @@ func RemoveAccentsString(s string) string {
        accentTransformerPool.Put(t)
        return s
 }
+
+// Chunk splits s into strings of size.
+func Chunk(s string, size int) []string {
+       if size >= len(s) {
+               return []string{s}
+       }
+       var chunks []string
+       chunk := make([]rune, size)
+       l := 0
+       for _, r := range s {
+               chunk[l] = r
+               l++
+               if l == size {
+                       chunks = append(chunks, string(chunk))
+                       l = 0
+               }
+       }
+       if l > 0 {
+               chunks = append(chunks, string(chunk[:l]))
+       }
+       return chunks
+}
index 9dc28c4c2cf6626f1135bcdba3c25e1c17d031ca..1bc1a09bcd4b6606bf00593ab4c841fb0cd6fe4f 100644 (file)
@@ -19,12 +19,51 @@ import (
        "sort"
        "strings"
        "sync"
+       "unicode"
+       "unicode/utf8"
 
        "golang.org/x/net/html"
 
        "github.com/gohugoio/hugo/helpers"
 )
 
+const eof = -1
+
+var (
+       htmlJsonFixer = strings.NewReplacer(", ", "\n")
+       jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
+       classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
+
+       skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
+       skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
+       endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
+
+       exceptionList = map[string]bool{
+               "thead": true,
+               "tbody": true,
+               "tfoot": true,
+               "td":    true,
+               "tr":    true,
+       }
+)
+
+func newHTMLElementsCollector() *htmlElementsCollector {
+       return &htmlElementsCollector{
+               elementSet: make(map[string]bool),
+       }
+}
+
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+       w := &htmlElementsCollectorWriter{
+               collector: collector,
+               state:     htmlLexStart,
+       }
+
+       w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
+
+       return w
+}
+
 // HTMLElements holds lists of tags and attribute values for classes and id.
 type HTMLElements struct {
        Tags    []string `json:"tags"`
@@ -48,6 +87,12 @@ func (h *HTMLElements) Sort() {
        sort.Strings(h.IDs)
 }
 
+type htmlElement struct {
+       Tag     string
+       Classes []string
+       IDs     []string
+}
+
 type htmlElementsCollector struct {
        // Contains the raw HTML string. We will get the same element
        // several times, and want to avoid costly reparsing when this
@@ -59,12 +104,6 @@ type htmlElementsCollector struct {
        mu sync.RWMutex
 }
 
-func newHTMLElementsCollector() *htmlElementsCollector {
-       return &htmlElementsCollector{
-               elementSet: make(map[string]bool),
-       }
-}
-
 func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
        var (
                classes []string
@@ -93,114 +132,118 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
 
 type htmlElementsCollectorWriter struct {
        collector *htmlElementsCollector
-       buff      bytes.Buffer
 
-       isCollecting bool
-       inPreTag     string
+       r     rune   // Current rune
+       width int    // The width in bytes of r
+       input []byte // The current slice written to Write
+       pos   int    // The current position in input
 
-       inQuote    bool
-       quoteValue byte
-}
+       err error
 
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
-       return &htmlElementsCollectorWriter{
-               collector: collector,
-       }
+       inQuote rune
+
+       buff bytes.Buffer
+
+       // Current state
+       state htmlCollectorStateFunc
+
+       // Precompiled state funcs
+       defaultLexElementInside htmlCollectorStateFunc
 }
 
-// Write splits the incoming stream into single html element.
+// Write collects HTML elements from p.
 func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
        n = len(p)
-       i := 0
-
-       for i < len(p) {
-               // If we are not collecting, cycle through byte stream until start bracket "<" is found.
-               if !w.isCollecting {
-                       for ; i < len(p); i++ {
-                               b := p[i]
-                               if b == '<' {
-                                       w.startCollecting()
-                                       break
-                               }
-                       }
+       w.input = p
+       w.pos = 0
+
+       for {
+               w.r = w.next()
+               if w.r == eof {
+                       return
                }
+               w.state = w.state(w)
+       }
+}
 
-               if w.isCollecting {
-                       // If we are collecting, cycle through byte stream until end bracket ">" is found,
-                       // disregard any ">" if within a quote,
-                       // write bytes until found to buffer.
-                       for ; i < len(p); i++ {
-                               b := p[i]
-                               w.toggleIfQuote(b)
-                               w.buff.WriteByte(b)
-
-                               if !w.inQuote && b == '>' {
-                                       w.endCollecting()
-                                       break
-                               }
-                       }
+func (l *htmlElementsCollectorWriter) backup() {
+       l.pos -= l.width
+       l.r, _ = utf8.DecodeRune(l.input[l.pos:])
+}
+
+func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+       var s htmlCollectorStateFunc
+       s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
+               w.buff.WriteRune(w.r)
+               if condition() {
+                       w.buff.Reset()
+                       return resolve
                }
+               return s
+       }
+       return s
+}
 
-               // If no end bracket ">" is found while collecting, but the stream ended
-               // this could mean we received chunks of a stream from e.g. the minify functionality
-               // next if loop will be skipped.
+func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+       var s htmlCollectorStateFunc
+       s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
+               if condition(w.r) {
+                       return resolve
+               }
+               return s
+       }
+       return s
+}
 
-               // At this point we have collected an element line between angle brackets "<" and ">".
-               if !w.isCollecting {
-                       if w.buff.Len() == 0 {
-                               continue
+// Starts with e.g. "<body " or "<div"
+func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+       var s htmlCollectorStateFunc
+       s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+               w.buff.WriteRune(w.r)
+
+               // Skip any text inside a quote.
+               if w.r == '\'' || w.r == '"' {
+                       if w.inQuote == w.r {
+                               w.inQuote = 0
+                       } else if w.inQuote == 0 {
+                               w.inQuote = w.r
                        }
+               }
 
-                       if w.inPreTag != "" { // within preformatted code block
-                               s := w.buff.String()
-                               w.buff.Reset()
-                               if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
-                                       w.inPreTag = ""
-                               }
-                               continue
-                       }
+               if w.inQuote != 0 {
+                       return s
+               }
 
-                       // First check if we have processed this element before.
-                       w.collector.mu.RLock()
+               if w.r == '>' {
 
                        // Work with the bytes slice as long as it's practical,
                        // to save memory allocations.
                        b := w.buff.Bytes()
 
-                       // See https://github.com/dominikh/go-tools/issues/723
-                       //lint:ignore S1030 This construct avoids memory allocation for the string.
+                       defer func() {
+                               w.buff.Reset()
+                       }()
+
+                       // First check if we have processed this element before.
+                       w.collector.mu.RLock()
+
                        seen := w.collector.elementSet[string(b)]
                        w.collector.mu.RUnlock()
                        if seen {
-                               w.buff.Reset()
-                               continue
-                       }
-
-                       // Filter out unwanted tags
-                       // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
-                       // comments and doctype tags
-                       // end tags.
-                       switch {
-                       case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
-                               w.buff.Reset()
-                               continue
-                       case bytes.HasPrefix(b, []byte("</")): // end tag
-                               w.buff.Reset()
-                               continue
+                               return resolve
                        }
 
                        s := w.buff.String()
-                       w.buff.Reset()
 
-                       // Check if a preformatted code block started.
-                       if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
-                               w.inPreTag = tagName
+                       if s == "" {
+                               return resolve
                        }
 
                        // Parse each collected element.
                        el, err := parseHTMLElement(s)
                        if err != nil {
-                               return n, err
+                               w.err = err
+                               return resolve
                        }
 
                        // Write this tag to the element set.
@@ -208,109 +251,137 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
                        w.collector.elementSet[s] = true
                        w.collector.elements = append(w.collector.elements, el)
                        w.collector.mu.Unlock()
+
+                       return resolve
+
                }
+
+               return s
        }
 
-       return
+       return s
 }
 
-func (c *htmlElementsCollectorWriter) startCollecting() {
-       c.isCollecting = true
-}
+func (l *htmlElementsCollectorWriter) next() rune {
+       if l.pos >= len(l.input) {
+               l.width = 0
+               return eof
+       }
 
-func (c *htmlElementsCollectorWriter) endCollecting() {
-       c.isCollecting = false
-       c.inQuote = false
+       runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
+       l.width = runeWidth
+       l.pos += l.width
+       return runeValue
 }
 
-func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
-       if isQuote(b) {
-               if c.inQuote && b == c.quoteValue {
-                       c.inQuote = false
-               } else if !c.inQuote {
-                       c.inQuote = true
-                       c.quoteValue = b
+// returns the next state in HTML element scanner.
+type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
+
+// At "<", buffer empty.
+// Potentially starting a HTML element.
+func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+       if w.r == '>' || unicode.IsSpace(w.r) {
+               if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
+                       w.buff.Reset()
+                       return htmlLexStart
                }
-       }
-}
 
-func isQuote(b byte) bool {
-       return b == '"' || b == '\''
-}
+               tagName := w.buff.Bytes()[1:]
 
-func parseStartTag(s string) (string, bool) {
-       s = strings.TrimPrefix(s, "<")
-       s = strings.TrimSuffix(s, ">")
+               switch {
+               case skipInnerElementRe.Match(tagName):
+                       // pre, script etc. We collect classes etc. on the surrounding
+                       // element, but skip the inner content.
+                       w.backup()
 
-       spaceIndex := strings.Index(s, " ")
-       if spaceIndex != -1 {
-               s = s[:spaceIndex]
+                       // tagName will be overwritten, so make a copy.
+                       tagNameCopy := make([]byte, len(tagName))
+                       copy(tagNameCopy, tagName)
+
+                       return w.lexElementInside(
+                               w.consumeBuffUntil(
+                                       func() bool {
+                                               if w.r != '>' {
+                                                       return false
+                                               }
+                                               m := endTagRe.FindSubmatch(w.buff.Bytes())
+                                               if m == nil {
+                                                       return false
+                                               }
+                                               return bytes.EqualFold(m[1], tagNameCopy)
+                                       },
+                                       htmlLexStart,
+                               ))
+               case skipAllElementRe.Match(tagName):
+                       // E.g. "<!DOCTYPE ..."
+                       w.buff.Reset()
+                       return w.consumeRuneUntil(func(r rune) bool {
+                               return r == '>'
+                       }, htmlLexStart)
+               default:
+                       w.backup()
+                       return w.defaultLexElementInside
+               }
        }
 
-       return strings.ToLower(strings.TrimSpace(s)), true
-}
+       w.buff.WriteRune(w.r)
 
-func parseEndTag(s string) (string, bool) {
-       if !strings.HasPrefix(s, "</") {
-               return "", false
+       // If it's a comment, skip to its end.
+       if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
+               w.buff.Reset()
+               return htmlLexToEndOfComment
        }
 
-       s = strings.TrimPrefix(s, "</")
-       s = strings.TrimSuffix(s, ">")
-
-       return strings.ToLower(strings.TrimSpace(s)), true
+       return htmlLexElementStart
 }
 
-// No need to look inside these for HTML elements.
-func isPreFormatted(s string) bool {
-       return s == "pre" || s == "textarea" || s == "script" || s == "style"
-}
+// Entry state func.
+// Looks for a opening bracket, '<'.
+func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+       if w.r == '<' {
+               w.backup()
+               w.buff.Reset()
+               return htmlLexElementStart
+       }
 
-type htmlElement struct {
-       Tag     string
-       Classes []string
-       IDs     []string
+       return htmlLexStart
 }
 
-var (
-       htmlJsonFixer = strings.NewReplacer(", ", "\n")
-       jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
-       classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
+// After "<!--", buff empty.
+func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+       w.buff.WriteRune(w.r)
 
-       exceptionList = map[string]bool{
-               "thead": true,
-               "tbody": true,
-               "tfoot": true,
-               "td":    true,
-               "tr":    true,
+       if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
+               // Done, start looking for HTML elements again.
+               return htmlLexStart
        }
-)
+
+       return htmlLexToEndOfComment
+}
 
 func parseHTMLElement(elStr string) (el htmlElement, err error) {
-       var tagBuffer string = ""
 
-       tagName, ok := parseStartTag(elStr)
-       if !ok {
-               return
-       }
+       tagName := parseStartTag(elStr)
+
+       el.Tag = strings.ToLower(tagName)
+       tagNameToParse := el.Tag
 
        // The net/html parser does not handle single table elements as input, e.g. tbody.
        // We only care about the element/class/ids, so just store away the original tag name
        // and pretend it's a <div>.
-       if exceptionList[tagName] {
-               tagBuffer = tagName
+       if exceptionList[el.Tag] {
                elStr = strings.Replace(elStr, tagName, "div", 1)
+               tagNameToParse = "div"
        }
 
        n, err := html.Parse(strings.NewReader(elStr))
        if err != nil {
                return
        }
+
        var walk func(*html.Node)
        walk = func(n *html.Node) {
-               if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
-                       el.Tag = n.Data
-
+               if n.Type == html.ElementNode && n.Data == tagNameToParse {
                        for _, a := range n.Attr {
                                switch {
                                case strings.EqualFold(a.Key, "id"):
@@ -345,10 +416,20 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
 
        walk(n)
 
-       // did we replaced the start tag?
-       if tagBuffer != "" {
-               el.Tag = tagBuffer
+       return
+}
+
+// Variants of s
+//    <body class="b a">
+//    <div>
+func parseStartTag(s string) string {
+       spaceIndex := strings.IndexFunc(s, func(r rune) bool {
+               return unicode.IsSpace(r)
+       })
+
+       if spaceIndex == -1 {
+               return s[1 : len(s)-1]
        }
 
-       return
+       return s[1:spaceIndex]
 }
index 0c8b2b65b347fa862ea39b2719b7bc7cab5b09d0..2eac31f73b7894bf1a69451c2068152865a13bfe 100644 (file)
@@ -15,8 +15,12 @@ package publisher
 
 import (
        "fmt"
+       "math/rand"
        "strings"
        "testing"
+       "time"
+
+       "github.com/gohugoio/hugo/common/text"
 
        "github.com/gohugoio/hugo/media"
        "github.com/gohugoio/hugo/minifiers"
@@ -28,6 +32,7 @@ import (
 
 func TestClassCollector(t *testing.T) {
        c := qt.New((t))
+       rnd := rand.New(rand.NewSource(time.Now().Unix()))
 
        f := func(tags, classes, ids string) HTMLElements {
                var tagss, classess, idss []string
@@ -57,14 +62,20 @@ func TestClassCollector(t *testing.T) {
                expect HTMLElements
        }{
                {"basic", `<body class="b a"></body>`, f("body", "a b", "")},
-               {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
+               {"duplicates", `<div class="b a b"></div><div class="b a b"></div>x'`, f("div", "a b", "")},
                {"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
                {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
+               {"short", `<i>`, f("i", "", "")},
+               {"invalid", `< body class="b a"></body><div></div>`, f("div", "", "")},
                // https://github.com/gohugoio/hugo/issues/7318
                {"thead", `<table class="cl1">
     <thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
     <tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
 </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
+               {"thead uppercase", `<TABLE class="CL1">
+    <THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD>
+    <TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY>
+</TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")},
                // https://github.com/gohugoio/hugo/issues/7161
                {"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
                {"AlpineJS bind 1", `<body>
@@ -98,6 +109,11 @@ func TestClassCollector(t *testing.T) {
                {"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
                {"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
                {"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
+               {"Comments with elements before and after", `<div></div><!-- example comment --><span><span>`, f("div span", "", "")},
+               // Issue #8530
+               {"Comment with single quote", `<!-- Hero Area Image d'accueil --><i class="foo">`, f("i", "foo", "")},
+               {"Uppercase tags", `<DIV></DIV>`, f("div", "", "")},
+               {"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")},
                // Issue #8417
                {"Tabs inline", `<hr    id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
                {"Tabs on multiple rows", `<form
@@ -108,16 +124,29 @@ func TestClassCollector(t *testing.T) {
 <div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
        } {
 
-               for _, minify := range []bool{false, true} {
-                       c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
+               for _, variant := range []struct {
+                       minify bool
+                       stream bool
+               }{
+                       {minify: false, stream: false},
+                       {minify: true, stream: false},
+                       {minify: false, stream: true},
+               } {
+
+                       c.Run(fmt.Sprintf("%s--minify-%t--stream-%t", test.name, variant.minify, variant.stream), func(c *qt.C) {
                                w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-                               if minify {
+                               if variant.minify {
                                        if skipMinifyTest[test.name] {
                                                c.Skip("skip minify test")
                                        }
                                        v := viper.New()
                                        m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
                                        m.Minify(media.HTMLType, w, strings.NewReader(test.html))
+                               } else if variant.stream {
+                                       chunks := text.Chunk(test.html, rnd.Intn(41)+1)
+                                       for _, chunk := range chunks {
+                                               fmt.Fprint(w, chunk)
+                                       }
                                } else {
                                        fmt.Fprint(w, test.html)
                                }
@@ -126,6 +155,7 @@ func TestClassCollector(t *testing.T) {
                        })
                }
        }
+
 }
 
 func BenchmarkElementsCollectorWriter(b *testing.B) {