Revert "publisher: Make the HTML element collector more robust"
authorBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Wed, 19 May 2021 01:45:36 +0000 (03:45 +0200)
committerBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Wed, 19 May 2021 01:45:36 +0000 (03:45 +0200)
This reverts commit ef0f1a726901d6c614040cfc2d7e8f9a2ca97816.

common/text/transform.go
publisher/htmlElementsCollector.go
publisher/htmlElementsCollector_test.go

index 2d51f6c33b680604c81109206b93642b9a7e3cbd..f5957780363cebd96b2b3fc36ebe0f772b48d69a 100644 (file)
@@ -45,25 +45,3 @@ func RemoveAccentsString(s string) string {
        accentTransformerPool.Put(t)
        return s
 }
-
-// Chunk splits s into strings of size.
-func Chunk(s string, size int) []string {
-       if size >= len(s) {
-               return []string{s}
-       }
-       var chunks []string
-       chunk := make([]rune, size)
-       l := 0
-       for _, r := range s {
-               chunk[l] = r
-               l++
-               if l == size {
-                       chunks = append(chunks, string(chunk))
-                       l = 0
-               }
-       }
-       if l > 0 {
-               chunks = append(chunks, string(chunk[:l]))
-       }
-       return chunks
-}
index 1bc1a09bcd4b6606bf00593ab4c841fb0cd6fe4f..9dc28c4c2cf6626f1135bcdba3c25e1c17d031ca 100644 (file)
@@ -19,51 +19,12 @@ import (
        "sort"
        "strings"
        "sync"
-       "unicode"
-       "unicode/utf8"
 
        "golang.org/x/net/html"
 
        "github.com/gohugoio/hugo/helpers"
 )
 
-const eof = -1
-
-var (
-       htmlJsonFixer = strings.NewReplacer(", ", "\n")
-       jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
-       classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
-
-       skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
-       skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
-       endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
-
-       exceptionList = map[string]bool{
-               "thead": true,
-               "tbody": true,
-               "tfoot": true,
-               "td":    true,
-               "tr":    true,
-       }
-)
-
-func newHTMLElementsCollector() *htmlElementsCollector {
-       return &htmlElementsCollector{
-               elementSet: make(map[string]bool),
-       }
-}
-
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
-       w := &htmlElementsCollectorWriter{
-               collector: collector,
-               state:     htmlLexStart,
-       }
-
-       w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
-
-       return w
-}
-
 // HTMLElements holds lists of tags and attribute values for classes and id.
 type HTMLElements struct {
        Tags    []string `json:"tags"`
@@ -87,12 +48,6 @@ func (h *HTMLElements) Sort() {
        sort.Strings(h.IDs)
 }
 
-type htmlElement struct {
-       Tag     string
-       Classes []string
-       IDs     []string
-}
-
 type htmlElementsCollector struct {
        // Contains the raw HTML string. We will get the same element
        // several times, and want to avoid costly reparsing when this
@@ -104,6 +59,12 @@ type htmlElementsCollector struct {
        mu sync.RWMutex
 }
 
+func newHTMLElementsCollector() *htmlElementsCollector {
+       return &htmlElementsCollector{
+               elementSet: make(map[string]bool),
+       }
+}
+
 func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
        var (
                classes []string
@@ -132,118 +93,114 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
 
 type htmlElementsCollectorWriter struct {
        collector *htmlElementsCollector
+       buff      bytes.Buffer
 
-       r     rune   // Current rune
-       width int    // The width in bytes of r
-       input []byte // The current slice written to Write
-       pos   int    // The current position in input
-
-       err error
-
-       inQuote rune
-
-       buff bytes.Buffer
+       isCollecting bool
+       inPreTag     string
 
-       // Current state
-       state htmlCollectorStateFunc
+       inQuote    bool
+       quoteValue byte
+}
 
-       // Precompiled state funcs
-       defaultLexElementInside htmlCollectorStateFunc
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+       return &htmlElementsCollectorWriter{
+               collector: collector,
+       }
 }
 
-// Write collects HTML elements from p.
+// Write splits the incoming stream into single html element.
 func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
        n = len(p)
-       w.input = p
-       w.pos = 0
-
-       for {
-               w.r = w.next()
-               if w.r == eof {
-                       return
+       i := 0
+
+       for i < len(p) {
+               // If we are not collecting, cycle through byte stream until start bracket "<" is found.
+               if !w.isCollecting {
+                       for ; i < len(p); i++ {
+                               b := p[i]
+                               if b == '<' {
+                                       w.startCollecting()
+                                       break
+                               }
+                       }
                }
-               w.state = w.state(w)
-       }
-}
-
-func (l *htmlElementsCollectorWriter) backup() {
-       l.pos -= l.width
-       l.r, _ = utf8.DecodeRune(l.input[l.pos:])
-}
 
-func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-       var s htmlCollectorStateFunc
-       s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
-               w.buff.WriteRune(w.r)
-               if condition() {
-                       w.buff.Reset()
-                       return resolve
+               if w.isCollecting {
+                       // If we are collecting, cycle through byte stream until end bracket ">" is found,
+                       // disregard any ">" if within a quote,
+                       // write bytes until found to buffer.
+                       for ; i < len(p); i++ {
+                               b := p[i]
+                               w.toggleIfQuote(b)
+                               w.buff.WriteByte(b)
+
+                               if !w.inQuote && b == '>' {
+                                       w.endCollecting()
+                                       break
+                               }
+                       }
                }
-               return s
-       }
-       return s
-}
 
-func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-       var s htmlCollectorStateFunc
-       s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
-               if condition(w.r) {
-                       return resolve
-               }
-               return s
-       }
-       return s
-}
+               // If no end bracket ">" is found while collecting, but the stream ended
+               // this could mean we received chunks of a stream from e.g. the minify functionality
+               // next if loop will be skipped.
 
-// Starts with e.g. "<body " or "<div"
-func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-       var s htmlCollectorStateFunc
-       s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-               w.buff.WriteRune(w.r)
-
-               // Skip any text inside a quote.
-               if w.r == '\'' || w.r == '"' {
-                       if w.inQuote == w.r {
-                               w.inQuote = 0
-                       } else if w.inQuote == 0 {
-                               w.inQuote = w.r
+               // At this point we have collected an element line between angle brackets "<" and ">".
+               if !w.isCollecting {
+                       if w.buff.Len() == 0 {
+                               continue
                        }
-               }
 
-               if w.inQuote != 0 {
-                       return s
-               }
+                       if w.inPreTag != "" { // within preformatted code block
+                               s := w.buff.String()
+                               w.buff.Reset()
+                               if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
+                                       w.inPreTag = ""
+                               }
+                               continue
+                       }
 
-               if w.r == '>' {
+                       // First check if we have processed this element before.
+                       w.collector.mu.RLock()
 
                        // Work with the bytes slice as long as it's practical,
                        // to save memory allocations.
                        b := w.buff.Bytes()
 
-                       defer func() {
-                               w.buff.Reset()
-                       }()
-
-                       // First check if we have processed this element before.
-                       w.collector.mu.RLock()
-
+                       // See https://github.com/dominikh/go-tools/issues/723
+                       //lint:ignore S1030 This construct avoids memory allocation for the string.
                        seen := w.collector.elementSet[string(b)]
                        w.collector.mu.RUnlock()
                        if seen {
-                               return resolve
+                               w.buff.Reset()
+                               continue
+                       }
+
+                       // Filter out unwanted tags
+                       // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
+                       // comments and doctype tags
+                       // end tags.
+                       switch {
+                       case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
+                               w.buff.Reset()
+                               continue
+                       case bytes.HasPrefix(b, []byte("</")): // end tag
+                               w.buff.Reset()
+                               continue
                        }
 
                        s := w.buff.String()
+                       w.buff.Reset()
 
-                       if s == "" {
-                               return resolve
+                       // Check if a preformatted code block started.
+                       if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
+                               w.inPreTag = tagName
                        }
 
                        // Parse each collected element.
                        el, err := parseHTMLElement(s)
                        if err != nil {
-                               w.err = err
-                               return resolve
+                               return n, err
                        }
 
                        // Write this tag to the element set.
@@ -251,137 +208,109 @@ func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStat
                        w.collector.elementSet[s] = true
                        w.collector.elements = append(w.collector.elements, el)
                        w.collector.mu.Unlock()
-
-                       return resolve
-
                }
-
-               return s
        }
 
-       return s
+       return
 }
 
-func (l *htmlElementsCollectorWriter) next() rune {
-       if l.pos >= len(l.input) {
-               l.width = 0
-               return eof
-       }
-
-       runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
-       l.width = runeWidth
-       l.pos += l.width
-       return runeValue
+func (c *htmlElementsCollectorWriter) startCollecting() {
+       c.isCollecting = true
 }
 
-// returns the next state in HTML element scanner.
-type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
+func (c *htmlElementsCollectorWriter) endCollecting() {
+       c.isCollecting = false
+       c.inQuote = false
+}
 
-// At "<", buffer empty.
-// Potentially starting a HTML element.
-func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-       if w.r == '>' || unicode.IsSpace(w.r) {
-               if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
-                       w.buff.Reset()
-                       return htmlLexStart
+func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
+       if isQuote(b) {
+               if c.inQuote && b == c.quoteValue {
+                       c.inQuote = false
+               } else if !c.inQuote {
+                       c.inQuote = true
+                       c.quoteValue = b
                }
+       }
+}
 
-               tagName := w.buff.Bytes()[1:]
-
-               switch {
-               case skipInnerElementRe.Match(tagName):
-                       // pre, script etc. We collect classes etc. on the surrounding
-                       // element, but skip the inner content.
-                       w.backup()
+func isQuote(b byte) bool {
+       return b == '"' || b == '\''
+}
 
-                       // tagName will be overwritten, so make a copy.
-                       tagNameCopy := make([]byte, len(tagName))
-                       copy(tagNameCopy, tagName)
+func parseStartTag(s string) (string, bool) {
+       s = strings.TrimPrefix(s, "<")
+       s = strings.TrimSuffix(s, ">")
 
-                       return w.lexElementInside(
-                               w.consumeBuffUntil(
-                                       func() bool {
-                                               if w.r != '>' {
-                                                       return false
-                                               }
-                                               m := endTagRe.FindSubmatch(w.buff.Bytes())
-                                               if m == nil {
-                                                       return false
-                                               }
-                                               return bytes.EqualFold(m[1], tagNameCopy)
-                                       },
-                                       htmlLexStart,
-                               ))
-               case skipAllElementRe.Match(tagName):
-                       // E.g. "<!DOCTYPE ..."
-                       w.buff.Reset()
-                       return w.consumeRuneUntil(func(r rune) bool {
-                               return r == '>'
-                       }, htmlLexStart)
-               default:
-                       w.backup()
-                       return w.defaultLexElementInside
-               }
+       spaceIndex := strings.Index(s, " ")
+       if spaceIndex != -1 {
+               s = s[:spaceIndex]
        }
 
-       w.buff.WriteRune(w.r)
+       return strings.ToLower(strings.TrimSpace(s)), true
+}
 
-       // If it's a comment, skip to its end.
-       if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
-               w.buff.Reset()
-               return htmlLexToEndOfComment
+func parseEndTag(s string) (string, bool) {
+       if !strings.HasPrefix(s, "</") {
+               return "", false
        }
 
-       return htmlLexElementStart
+       s = strings.TrimPrefix(s, "</")
+       s = strings.TrimSuffix(s, ">")
+
+       return strings.ToLower(strings.TrimSpace(s)), true
 }
 
-// Entry state func.
-// Looks for a opening bracket, '<'.
-func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-       if w.r == '<' {
-               w.backup()
-               w.buff.Reset()
-               return htmlLexElementStart
-       }
+// No need to look inside these for HTML elements.
+func isPreFormatted(s string) bool {
+       return s == "pre" || s == "textarea" || s == "script" || s == "style"
+}
 
-       return htmlLexStart
+type htmlElement struct {
+       Tag     string
+       Classes []string
+       IDs     []string
 }
 
-// After "<!--", buff empty.
-func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-       w.buff.WriteRune(w.r)
+var (
+       htmlJsonFixer = strings.NewReplacer(", ", "\n")
+       jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
+       classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
 
-       if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
-               // Done, start looking for HTML elements again.
-               return htmlLexStart
+       exceptionList = map[string]bool{
+               "thead": true,
+               "tbody": true,
+               "tfoot": true,
+               "td":    true,
+               "tr":    true,
        }
-
-       return htmlLexToEndOfComment
-}
+)
 
 func parseHTMLElement(elStr string) (el htmlElement, err error) {
+       var tagBuffer string = ""
 
-       tagName := parseStartTag(elStr)
-
-       el.Tag = strings.ToLower(tagName)
-       tagNameToParse := el.Tag
+       tagName, ok := parseStartTag(elStr)
+       if !ok {
+               return
+       }
 
        // The net/html parser does not handle single table elements as input, e.g. tbody.
        // We only care about the element/class/ids, so just store away the original tag name
        // and pretend it's a <div>.
-       if exceptionList[el.Tag] {
+       if exceptionList[tagName] {
+               tagBuffer = tagName
                elStr = strings.Replace(elStr, tagName, "div", 1)
-               tagNameToParse = "div"
        }
 
        n, err := html.Parse(strings.NewReader(elStr))
        if err != nil {
                return
        }
-
        var walk func(*html.Node)
        walk = func(n *html.Node) {
-               if n.Type == html.ElementNode && n.Data == tagNameToParse {
+               if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
+                       el.Tag = n.Data
+
                        for _, a := range n.Attr {
                                switch {
                                case strings.EqualFold(a.Key, "id"):
@@ -416,20 +345,10 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
 
        walk(n)
 
-       return
-}
-
-// Variants of s
-//    <body class="b a">
-//    <div>
-func parseStartTag(s string) string {
-       spaceIndex := strings.IndexFunc(s, func(r rune) bool {
-               return unicode.IsSpace(r)
-       })
-
-       if spaceIndex == -1 {
-               return s[1 : len(s)-1]
+       // did we replaced the start tag?
+       if tagBuffer != "" {
+               el.Tag = tagBuffer
        }
 
-       return s[1:spaceIndex]
+       return
 }
index 2eac31f73b7894bf1a69451c2068152865a13bfe..0c8b2b65b347fa862ea39b2719b7bc7cab5b09d0 100644 (file)
@@ -15,12 +15,8 @@ package publisher
 
 import (
        "fmt"
-       "math/rand"
        "strings"
        "testing"
-       "time"
-
-       "github.com/gohugoio/hugo/common/text"
 
        "github.com/gohugoio/hugo/media"
        "github.com/gohugoio/hugo/minifiers"
@@ -32,7 +28,6 @@ import (
 
 func TestClassCollector(t *testing.T) {
        c := qt.New((t))
-       rnd := rand.New(rand.NewSource(time.Now().Unix()))
 
        f := func(tags, classes, ids string) HTMLElements {
                var tagss, classess, idss []string
@@ -62,20 +57,14 @@ func TestClassCollector(t *testing.T) {
                expect HTMLElements
        }{
                {"basic", `<body class="b a"></body>`, f("body", "a b", "")},
-               {"duplicates", `<div class="b a b"></div><div class="b a b"></div>x'`, f("div", "a b", "")},
+               {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
                {"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
                {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
-               {"short", `<i>`, f("i", "", "")},
-               {"invalid", `< body class="b a"></body><div></div>`, f("div", "", "")},
                // https://github.com/gohugoio/hugo/issues/7318
                {"thead", `<table class="cl1">
     <thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
     <tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
 </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
-               {"thead uppercase", `<TABLE class="CL1">
-    <THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD>
-    <TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY>
-</TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")},
                // https://github.com/gohugoio/hugo/issues/7161
                {"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
                {"AlpineJS bind 1", `<body>
@@ -109,11 +98,6 @@ func TestClassCollector(t *testing.T) {
                {"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
                {"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
                {"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
-               {"Comments with elements before and after", `<div></div><!-- example comment --><span><span>`, f("div span", "", "")},
-               // Issue #8530
-               {"Comment with single quote", `<!-- Hero Area Image d'accueil --><i class="foo">`, f("i", "foo", "")},
-               {"Uppercase tags", `<DIV></DIV>`, f("div", "", "")},
-               {"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")},
                // Issue #8417
                {"Tabs inline", `<hr    id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
                {"Tabs on multiple rows", `<form
@@ -124,29 +108,16 @@ func TestClassCollector(t *testing.T) {
 <div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
        } {
 
-               for _, variant := range []struct {
-                       minify bool
-                       stream bool
-               }{
-                       {minify: false, stream: false},
-                       {minify: true, stream: false},
-                       {minify: false, stream: true},
-               } {
-
-                       c.Run(fmt.Sprintf("%s--minify-%t--stream-%t", test.name, variant.minify, variant.stream), func(c *qt.C) {
+               for _, minify := range []bool{false, true} {
+                       c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
                                w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-                               if variant.minify {
+                               if minify {
                                        if skipMinifyTest[test.name] {
                                                c.Skip("skip minify test")
                                        }
                                        v := viper.New()
                                        m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
                                        m.Minify(media.HTMLType, w, strings.NewReader(test.html))
-                               } else if variant.stream {
-                                       chunks := text.Chunk(test.html, rnd.Intn(41)+1)
-                                       for _, chunk := range chunks {
-                                               fmt.Fprint(w, chunk)
-                                       }
                                } else {
                                        fmt.Fprint(w, test.html)
                                }
@@ -155,7 +126,6 @@ func TestClassCollector(t *testing.T) {
                        })
                }
        }
-
 }
 
 func BenchmarkElementsCollectorWriter(b *testing.B) {