publisher: Make the HTML element collector more robust

author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Thu, 13 May 2021 11:10:32 +0000 (13:10 +0200)

committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Mon, 17 May 2021 15:15:32 +0000 (17:15 +0200)
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Thu, 13 May 2021 11:10:32 +0000 (13:10 +0200)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Mon, 17 May 2021 15:15:32 +0000 (17:15 +0200)
diff --git a/common/text/transform.go b/common/text/transform.go

index f5957780363cebd96b2b3fc36ebe0f772b48d69a..2d51f6c33b680604c81109206b93642b9a7e3cbd 100644 (file)
--- a/common/text/transform.go
+++ b/common/text/transform.go
@@ -45,3 +45,25 @@ func RemoveAccentsString(s string) string {
         accentTransformerPool.Put(t)
         return s
  }
+
+// Chunk splits s into consecutive strings of at most size runes each; only
+// the last chunk may be shorter. If size is at least the byte length of s,
+// or size is not positive (so no valid chunk length exists), s is returned
+// unchanged as the single element.
+func Chunk(s string, size int) []string {
+       // A non-positive size used to panic (negative make length, or an
+       // index into a zero-length buffer), so treat it as "do not split".
+       if size <= 0 || size >= len(s) {
+               return []string{s}
+       }
+
+       // len(s) counts bytes, so this is an upper bound on the chunk count.
+       chunks := make([]string, 0, (len(s)+size-1)/size)
+       chunk := make([]rune, 0, size)
+       for _, r := range s {
+               chunk = append(chunk, r)
+               if len(chunk) == size {
+                       chunks = append(chunks, string(chunk))
+                       chunk = chunk[:0]
+               }
+       }
+       if len(chunk) > 0 {
+               chunks = append(chunks, string(chunk))
+       }
+       return chunks
+}
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go

index 9dc28c4c2cf6626f1135bcdba3c25e1c17d031ca..1bc1a09bcd4b6606bf00593ab4c841fb0cd6fe4f 100644 (file)
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -19,12 +19,51 @@ import (
         "sort"
         "strings"
         "sync"
+       "unicode"
+       "unicode/utf8"
  
         "golang.org/x/net/html"
  
         "github.com/gohugoio/hugo/helpers"
  )
  
+// eof is returned by next when the current input is exhausted.
+const eof = -1
+
+var (
+       // NOTE(review): the users of these three are outside this view;
+       // presumably they handle JSON-style and class-like attributes — confirm.
+       htmlJsonFixer = strings.NewReplacer(", ", "\n")
+       jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
+       classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
+
+       // Scanner patterns, all case-insensitive:
+       //   - skipInnerElementRe: tag names whose inner content is skipped.
+       //     It is anchored at both ends so only the exact names qualify. A
+       //     prefix match would also catch e.g. "preview-card", whose end
+       //     tag endTagRe cannot match, so scanning would never resume.
+       //   - skipAllElementRe: tag names that are skipped up to the next '>'.
+       //   - endTagRe: an end tag at the end of the buffer; captures its name.
+       skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)$`)
+       skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
+       endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
+
+       // Tags that parseHTMLElement rewrites to <div> before parsing, because
+       // net/html does not handle single table elements as input.
+       exceptionList = map[string]bool{
+               "thead": true,
+               "tbody": true,
+               "tfoot": true,
+               "td":    true,
+               "tr":    true,
+       }
+)
+
+// newHTMLElementsCollector returns a collector whose element set is empty
+// and ready for use.
+func newHTMLElementsCollector() *htmlElementsCollector {
+       seen := make(map[string]bool)
+       return &htmlElementsCollector{elementSet: seen}
+}
+
+// newHTMLElementsCollectorWriter returns a writer that records the elements
+// it scans in collector. Scanning begins in the htmlLexStart state.
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+       w := new(htmlElementsCollectorWriter)
+       w.collector = collector
+       w.state = htmlLexStart
+
+       // Built once here so htmlLexElementStart can reuse the same state func.
+       w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
+
+       return w
+}
+
  // HTMLElements holds lists of tags and attribute values for classes and id.
  type HTMLElements struct {
         Tags    []string `json:"tags"`
@@ -48,6 +87,12 @@ func (h *HTMLElements) Sort() {
         sort.Strings(h.IDs)
  }
  
+// htmlElement is the result of parsing one collected start tag
+// (see parseHTMLElement).
+type htmlElement struct {
+       // Tag is the lower-cased tag name.
+       Tag string
+       // NOTE(review): presumably the class and id attribute values found on
+       // the tag; the code that fills them is only partly visible — confirm.
+       Classes []string
+       IDs     []string
+}
+
  type htmlElementsCollector struct {
         // Contains the raw HTML string. We will get the same element
         // several times, and want to avoid costly reparsing when this
@@ -59,12 +104,6 @@ type htmlElementsCollector struct {
         mu sync.RWMutex
  }
  
-func newHTMLElementsCollector() *htmlElementsCollector {
-       return &htmlElementsCollector{
-               elementSet: make(map[string]bool),
-       }
-}
-
  func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
         var (
                 classes []string
@@ -93,114 +132,118 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
  
  type htmlElementsCollectorWriter struct {
         collector *htmlElementsCollector
-       buff      bytes.Buffer
  
-       isCollecting bool
-       inPreTag     string
+       r     rune   // Current rune
+       width int    // The width in bytes of r
+       input []byte // The current slice written to Write
+       pos   int    // The current position in input
  
-       inQuote    bool
-       quoteValue byte
-}
+       // Set when parseHTMLElement fails on a collected element
+       err error
  
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
-       return &htmlElementsCollectorWriter{
-               collector: collector,
-       }
+       // The quote rune (' or ") currently open inside an element, 0 if none
+       inQuote rune
+
+       // Holds the runes of the element currently being scanned
+       buff bytes.Buffer
+
+       // Current state
+       state htmlCollectorStateFunc
+
+       // Precompiled state funcs
+       defaultLexElementInside htmlCollectorStateFunc
  }
  
-// Write splits the incoming stream into single html element.
+// Write collects HTML elements from p.
+//
+// p is scanned rune by rune; each rune is handed to the current state func,
+// which returns the state to use for the next rune. The state is kept
+// between calls, so an element may span several calls to Write.
+//
+// n is always len(p). err is the parse error recorded by a state func
+// during this call, if any.
  func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
         n = len(p)
-       i := 0
-
-       for i < len(p) {
-               // If we are not collecting, cycle through byte stream until start bracket "<" is found.
-               if !w.isCollecting {
-                       for ; i < len(p); i++ {
-                               b := p[i]
-                               if b == '<' {
-                                       w.startCollecting()
-                                       break
-                               }
-                       }
+       w.input = p
+       w.pos = 0
+
+       for {
+               w.r = w.next()
+               if w.r == eof {
+                       // Surface a parse error stored in w.err instead of
+                       // dropping it, and clear it so it is reported once.
+                       err, w.err = w.err, nil
+                       return n, err
                 }
+               w.state = w.state(w)
+       }
+}
  
-               if w.isCollecting {
-                       // If we are collecting, cycle through byte stream until end bracket ">" is found,
-                       // disregard any ">" if within a quote,
-                       // write bytes until found to buffer.
-                       for ; i < len(p); i++ {
-                               b := p[i]
-                               w.toggleIfQuote(b)
-                               w.buff.WriteByte(b)
-
-                               if !w.inQuote && b == '>' {
-                                       w.endCollecting()
-                                       break
-                               }
-                       }
+// backup steps back over the rune most recently returned by next, so the
+// following call to next yields it again. w.r is re-decoded from the
+// restored position.
+func (w *htmlElementsCollectorWriter) backup() {
+       w.pos = w.pos - w.width
+       r, _ := utf8.DecodeRune(w.input[w.pos:])
+       w.r = r
+}
+
+// consumeBuffUntil returns a state func that appends every rune it sees to
+// the buffer and stays in that state until condition reports true. The
+// buffer is then reset and scanning continues in resolve.
+func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+       var s htmlCollectorStateFunc
+       // The state func closes over w; its own argument is ignored.
+       s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
+               w.buff.WriteRune(w.r)
+               if condition() {
+                       w.buff.Reset()
+                       return resolve
                 }
+               return s
+       }
+       return s
+}
  
-               // If no end bracket ">" is found while collecting, but the stream ended
-               // this could mean we received chunks of a stream from e.g. the minify functionality
-               // next if loop will be skipped.
+// consumeRuneUntil returns a state func that buffers nothing and stays in
+// place until condition reports true for the current rune, at which point
+// scanning continues in resolve.
+func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+       var self htmlCollectorStateFunc
+       self = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
+               if !condition(w.r) {
+                       return self
+               }
+               return resolve
+       }
+       return self
+}
  
-               // At this point we have collected an element line between angle brackets "<" and ">".
-               if !w.isCollecting {
-                       if w.buff.Len() == 0 {
-                               continue
+// Starts with e.g. "<body " or "<div"
+func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+       var s htmlCollectorStateFunc
+       s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+               w.buff.WriteRune(w.r)
+
+               // Skip any text inside a quote.
+               if w.r == '\'' || w.r == '"' {
+                       if w.inQuote == w.r {
+                               w.inQuote = 0
+                       } else if w.inQuote == 0 {
+                               w.inQuote = w.r
                         }
+               }
  
-                       if w.inPreTag != "" { // within preformatted code block
-                               s := w.buff.String()
-                               w.buff.Reset()
-                               if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
-                                       w.inPreTag = ""
-                               }
-                               continue
-                       }
+               if w.inQuote != 0 {
+                       return s
+               }
  
-                       // First check if we have processed this element before.
-                       w.collector.mu.RLock()
+               if w.r == '>' {
  
                         // Work with the bytes slice as long as it's practical,
                         // to save memory allocations.
                         b := w.buff.Bytes()
  
-                       // See https://github.com/dominikh/go-tools/issues/723
-                       //lint:ignore S1030 This construct avoids memory allocation for the string.
+                       defer func() {
+                               w.buff.Reset()
+                       }()
+
+                       // First check if we have processed this element before.
+                       w.collector.mu.RLock()
+
                         seen := w.collector.elementSet[string(b)]
                         w.collector.mu.RUnlock()
                         if seen {
-                               w.buff.Reset()
-                               continue
-                       }
-
-                       // Filter out unwanted tags
-                       // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
-                       // comments and doctype tags
-                       // end tags.
-                       switch {
-                       case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
-                               w.buff.Reset()
-                               continue
-                       case bytes.HasPrefix(b, []byte("</")): // end tag
-                               w.buff.Reset()
-                               continue
+                               return resolve
                         }
  
                         s := w.buff.String()
-                       w.buff.Reset()
  
-                       // Check if a preformatted code block started.
-                       if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
-                               w.inPreTag = tagName
+                       if s == "" {
+                               return resolve
                         }
  
                         // Parse each collected element.
                         el, err := parseHTMLElement(s)
                         if err != nil {
-                               return n, err
+                               w.err = err
+                               return resolve
                         }
  
                         // Write this tag to the element set.
@@ -208,109 +251,137 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
                         w.collector.elementSet[s] = true
                         w.collector.elements = append(w.collector.elements, el)
                         w.collector.mu.Unlock()
+
+                       return resolve
+
                 }
+
+               return s
         }
  
-       return
+       return s
  }
  
-func (c *htmlElementsCollectorWriter) startCollecting() {
-       c.isCollecting = true
-}
+// next decodes and returns the rune at the current position in input and
+// advances past it, remembering its width for backup. It returns eof, with
+// width set to 0, once the input is exhausted.
+//
+// NOTE(review): utf8.DecodeRune yields (utf8.RuneError, 1) for an incomplete
+// encoding, so a multi-byte rune split across two Write calls is not
+// reassembled here.
+func (l *htmlElementsCollectorWriter) next() rune {
+       if l.pos >= len(l.input) {
+               l.width = 0
+               return eof
+       }
  
-func (c *htmlElementsCollectorWriter) endCollecting() {
-       c.isCollecting = false
-       c.inQuote = false
+       runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
+       l.width = runeWidth
+       l.pos += l.width
+       return runeValue
  }
  
-func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
-       if isQuote(b) {
-               if c.inQuote && b == c.quoteValue {
-                       c.inQuote = false
-               } else if !c.inQuote {
-                       c.inQuote = true
-                       c.quoteValue = b
+// htmlCollectorStateFunc is one state of the HTML element scanner. It
+// handles the writer's current rune and returns the state to use for the
+// next rune.
+type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
+
+// At "<", buffer empty.
+// Potentially starting an HTML element.
+func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+       if w.r == '>' || unicode.IsSpace(w.r) {
+               if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
+                       w.buff.Reset()
+                       return htmlLexStart
                 }
-       }
-}
  
-func isQuote(b byte) bool {
-       return b == '"' || b == '\''
-}
+               tagName := w.buff.Bytes()[1:]
  
-func parseStartTag(s string) (string, bool) {
-       s = strings.TrimPrefix(s, "<")
-       s = strings.TrimSuffix(s, ">")
+               switch {
+               case skipInnerElementRe.Match(tagName):
+                       // pre, script etc. We collect classes etc. on the surrounding
+                       // element, but skip the inner content.
+                       w.backup()
  
-       spaceIndex := strings.Index(s, " ")
-       if spaceIndex != -1 {
-               s = s[:spaceIndex]
+                       // tagName will be overwritten, so make a copy.
+                       tagNameCopy := make([]byte, len(tagName))
+                       copy(tagNameCopy, tagName)
+
+                       return w.lexElementInside(
+                               w.consumeBuffUntil(
+                                       func() bool {
+                                               if w.r != '>' {
+                                                       return false
+                                               }
+                                               m := endTagRe.FindSubmatch(w.buff.Bytes())
+                                               if m == nil {
+                                                       return false
+                                               }
+                                               return bytes.EqualFold(m[1], tagNameCopy)
+                                       },
+                                       htmlLexStart,
+                               ))
+               case skipAllElementRe.Match(tagName):
+                       // E.g. "<!DOCTYPE ..."
+                       w.buff.Reset()
+                       return w.consumeRuneUntil(func(r rune) bool {
+                               return r == '>'
+                       }, htmlLexStart)
+               default:
+                       w.backup()
+                       return w.defaultLexElementInside
+               }
         }
  
-       return strings.ToLower(strings.TrimSpace(s)), true
-}
+       w.buff.WriteRune(w.r)
  
-func parseEndTag(s string) (string, bool) {
-       if !strings.HasPrefix(s, "</") {
-               return "", false
+       // If it's a comment, skip to its end.
+       if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
+               w.buff.Reset()
+               return htmlLexToEndOfComment
         }
  
-       s = strings.TrimPrefix(s, "</")
-       s = strings.TrimSuffix(s, ">")
-
-       return strings.ToLower(strings.TrimSpace(s)), true
+       return htmlLexElementStart
  }
  
-// No need to look inside these for HTML elements.
-func isPreFormatted(s string) bool {
-       return s == "pre" || s == "textarea" || s == "script" || s == "style"
-}
+// htmlLexStart is the entry state func.
+// It looks for an opening bracket, '<'. When one is found, the rune is
+// pushed back with backup and the buffer is reset, so that
+// htmlLexElementStart sees the '<' first, with an empty buffer.
+func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+       if w.r == '<' {
+               w.backup()
+               w.buff.Reset()
+               return htmlLexElementStart
+       }
  
-type htmlElement struct {
-       Tag     string
-       Classes []string
-       IDs     []string
+       return htmlLexStart
  }
  
-var (
-       htmlJsonFixer = strings.NewReplacer(", ", "\n")
-       jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
-       classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
+// htmlLexToEndOfComment consumes the body of an HTML comment.
+// After "<!--", buff empty.
+// Every rune is buffered so the closing "-->" can be detected. The buffer
+// is not reset here; htmlLexStart resets it at the next '<'.
+func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+       w.buff.WriteRune(w.r)
  
-       exceptionList = map[string]bool{
-               "thead": true,
-               "tbody": true,
-               "tfoot": true,
-               "td":    true,
-               "tr":    true,
+       if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
+               // Done, start looking for HTML elements again.
+               return htmlLexStart
         }
-)
+
+       return htmlLexToEndOfComment
+}
  
  func parseHTMLElement(elStr string) (el htmlElement, err error) {
-       var tagBuffer string = ""
  
-       tagName, ok := parseStartTag(elStr)
-       if !ok {
-               return
-       }
+       tagName := parseStartTag(elStr)
+
+       el.Tag = strings.ToLower(tagName)
+       tagNameToParse := el.Tag
  
         // The net/html parser does not handle single table elements as input, e.g. tbody.
         // We only care about the element/class/ids, so just store away the original tag name
         // and pretend it's a <div>.
-       if exceptionList[tagName] {
-               tagBuffer = tagName
+       if exceptionList[el.Tag] {
                 elStr = strings.Replace(elStr, tagName, "div", 1)
+               tagNameToParse = "div"
         }
  
         n, err := html.Parse(strings.NewReader(elStr))
         if err != nil {
                 return
         }
+
         var walk func(*html.Node)
         walk = func(n *html.Node) {
-               if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
-                       el.Tag = n.Data
-
+               if n.Type == html.ElementNode && n.Data == tagNameToParse {
                         for _, a := range n.Attr {
                                 switch {
                                 case strings.EqualFold(a.Key, "id"):
@@ -345,10 +416,20 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
  
         walk(n)
  
-       // did we replaced the start tag?
-       if tagBuffer != "" {
-               el.Tag = tagBuffer
+       return
+}
+
+// parseStartTag returns the tag name of the start tag s, without the angle
+// brackets, any attributes, or the trailing slash of a self-closing tag.
+// The case of the name is preserved. It returns "" if s is too short to
+// contain a name.
+//
+// Variants of s
+//    <body class="b a">
+//    <div>
+//    <br/>
+func parseStartTag(s string) string {
+       end := strings.IndexFunc(s, unicode.IsSpace)
+       if end == -1 {
+               // No attributes: the name runs up to the closing '>'.
+               end = len(s) - 1
         }
  
-       return
+       // Guard against input too short to slice, e.g. "" or "<".
+       if end < 1 {
+               return ""
+       }
+
+       // "<br/>" must yield "br", not "br/".
+       return strings.TrimSuffix(s[1:end], "/")
  }
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go

index 0c8b2b65b347fa862ea39b2719b7bc7cab5b09d0..2eac31f73b7894bf1a69451c2068152865a13bfe 100644 (file)
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -15,8 +15,12 @@ package publisher
  
  import (
         "fmt"
+       "math/rand"
         "strings"
         "testing"
+       "time"
+
+       "github.com/gohugoio/hugo/common/text"
  
         "github.com/gohugoio/hugo/media"
         "github.com/gohugoio/hugo/minifiers"
@@ -28,6 +32,7 @@ import (
  
  func TestClassCollector(t *testing.T) {
         c := qt.New((t))
+       rnd := rand.New(rand.NewSource(time.Now().Unix()))
  
         f := func(tags, classes, ids string) HTMLElements {
                 var tagss, classess, idss []string
@@ -57,14 +62,20 @@ func TestClassCollector(t *testing.T) {
                 expect HTMLElements
         }{
                 {"basic", `<body class="b a"></body>`, f("body", "a b", "")},
-               {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
+               {"duplicates", `<div class="b a b"></div><div class="b a b"></div>x'`, f("div", "a b", "")},
                 {"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
                 {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
+               {"short", `<i>`, f("i", "", "")},
+               {"invalid", `< body class="b a"></body><div></div>`, f("div", "", "")},
                 // https://github.com/gohugoio/hugo/issues/7318
                 {"thead", `<table class="cl1">
      <thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
      <tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
  </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
+               {"thead uppercase", `<TABLE class="CL1">
+    <THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD>
+    <TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY>
+</TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")},
                 // https://github.com/gohugoio/hugo/issues/7161
                 {"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
                 {"AlpineJS bind 1", `<body>
@@ -98,6 +109,11 @@ func TestClassCollector(t *testing.T) {
                 {"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
                 {"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
                 {"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
+               {"Comments with elements before and after", `<div></div><!-- example comment --><span><span>`, f("div span", "", "")},
+               // Issue #8530
+               {"Comment with single quote", `<!-- Hero Area Image d'accueil --><i class="foo">`, f("i", "foo", "")},
+               {"Uppercase tags", `<DIV></DIV>`, f("div", "", "")},
+               {"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")},
                 // Issue #8417
                 {"Tabs inline", `<hr    id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
                 {"Tabs on multiple rows", `<form
@@ -108,16 +124,29 @@ func TestClassCollector(t *testing.T) {
  <div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
         } {
  
-               for _, minify := range []bool{false, true} {
-                       c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
+               for _, variant := range []struct {
+                       minify bool
+                       stream bool
+               }{
+                       {minify: false, stream: false},
+                       {minify: true, stream: false},
+                       {minify: false, stream: true},
+               } {
+
+                       c.Run(fmt.Sprintf("%s--minify-%t--stream-%t", test.name, variant.minify, variant.stream), func(c *qt.C) {
                                 w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-                               if minify {
+                               if variant.minify {
                                         if skipMinifyTest[test.name] {
                                                 c.Skip("skip minify test")
                                         }
                                         v := viper.New()
                                         m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
                                         m.Minify(media.HTMLType, w, strings.NewReader(test.html))
+                               } else if variant.stream {
+                                       chunks := text.Chunk(test.html, rnd.Intn(41)+1)
+                                       for _, chunk := range chunks {
+                                               fmt.Fprint(w, chunk)
+                                       }
                                 } else {
                                         fmt.Fprint(w, test.html)
                                 }
@@ -126,6 +155,7 @@ func TestClassCollector(t *testing.T) {
                         })
                 }
         }
+
  }
  
  func BenchmarkElementsCollectorWriter(b *testing.B) {
author	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Thu, 13 May 2021 11:10:32 +0000 (13:10 +0200)
committer	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Mon, 17 May 2021 15:15:32 +0000 (17:15 +0200)
common/text/transform.go		patch \| blob \| history
publisher/htmlElementsCollector.go		patch \| blob \| history
publisher/htmlElementsCollector_test.go		patch \| blob \| history