Revert "publisher: Make the HTML element collector more robust"

author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Wed, 19 May 2021 01:45:36 +0000 (03:45 +0200)

committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Wed, 19 May 2021 01:45:36 +0000 (03:45 +0200)
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Wed, 19 May 2021 01:45:36 +0000 (03:45 +0200)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Wed, 19 May 2021 01:45:36 +0000 (03:45 +0200)
diff --git a/common/text/transform.go b/common/text/transform.go

index 2d51f6c33b680604c81109206b93642b9a7e3cbd..f5957780363cebd96b2b3fc36ebe0f772b48d69a 100644 (file)
--- a/common/text/transform.go
+++ b/common/text/transform.go
@@ -45,25 +45,3 @@ func RemoveAccentsString(s string) string {
         accentTransformerPool.Put(t)
         return s
  }
-
-// Chunk splits s into strings of size.
-func Chunk(s string, size int) []string {
-       if size >= len(s) {
-               return []string{s}
-       }
-       var chunks []string
-       chunk := make([]rune, size)
-       l := 0
-       for _, r := range s {
-               chunk[l] = r
-               l++
-               if l == size {
-                       chunks = append(chunks, string(chunk))
-                       l = 0
-               }
-       }
-       if l > 0 {
-               chunks = append(chunks, string(chunk[:l]))
-       }
-       return chunks
-}
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go

index 1bc1a09bcd4b6606bf00593ab4c841fb0cd6fe4f..9dc28c4c2cf6626f1135bcdba3c25e1c17d031ca 100644 (file)
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -19,51 +19,12 @@ import (
         "sort"
         "strings"
         "sync"
-       "unicode"
-       "unicode/utf8"
  
         "golang.org/x/net/html"
  
         "github.com/gohugoio/hugo/helpers"
  )
  
-const eof = -1
-
-var (
-       htmlJsonFixer = strings.NewReplacer(", ", "\n")
-       jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
-       classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
-
-       skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
-       skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
-       endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
-
-       exceptionList = map[string]bool{
-               "thead": true,
-               "tbody": true,
-               "tfoot": true,
-               "td":    true,
-               "tr":    true,
-       }
-)
-
-func newHTMLElementsCollector() *htmlElementsCollector {
-       return &htmlElementsCollector{
-               elementSet: make(map[string]bool),
-       }
-}
-
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
-       w := &htmlElementsCollectorWriter{
-               collector: collector,
-               state:     htmlLexStart,
-       }
-
-       w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
-
-       return w
-}
-
  // HTMLElements holds lists of tags and attribute values for classes and id.
  type HTMLElements struct {
         Tags    []string `json:"tags"`
@@ -87,12 +48,6 @@ func (h *HTMLElements) Sort() {
         sort.Strings(h.IDs)
  }
  
-type htmlElement struct {
-       Tag     string
-       Classes []string
-       IDs     []string
-}
-
  type htmlElementsCollector struct {
         // Contains the raw HTML string. We will get the same element
         // several times, and want to avoid costly reparsing when this
@@ -104,6 +59,12 @@ type htmlElementsCollector struct {
         mu sync.RWMutex
  }
  
+func newHTMLElementsCollector() *htmlElementsCollector {
+       return &htmlElementsCollector{
+               elementSet: make(map[string]bool),
+       }
+}
+
  func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
         var (
                 classes []string
@@ -132,118 +93,114 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
  
  type htmlElementsCollectorWriter struct {
         collector *htmlElementsCollector
+       buff      bytes.Buffer
  
-       r     rune   // Current rune
-       width int    // The width in bytes of r
-       input []byte // The current slice written to Write
-       pos   int    // The current position in input
-
-       err error
-
-       inQuote rune
-
-       buff bytes.Buffer
+       isCollecting bool
+       inPreTag     string
  
-       // Current state
-       state htmlCollectorStateFunc
+       inQuote    bool
+       quoteValue byte
+}
  
-       // Precompiled state funcs
-       defaultLexElementInside htmlCollectorStateFunc
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+       return &htmlElementsCollectorWriter{
+               collector: collector,
+       }
  }
  
-// Write collects HTML elements from p.
+// Write splits the incoming stream into single html element.
  func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
         n = len(p)
-       w.input = p
-       w.pos = 0
-
-       for {
-               w.r = w.next()
-               if w.r == eof {
-                       return
+       i := 0
+
+       for i < len(p) {
+               // If we are not collecting, cycle through byte stream until start bracket "<" is found.
+               if !w.isCollecting {
+                       for ; i < len(p); i++ {
+                               b := p[i]
+                               if b == '<' {
+                                       w.startCollecting()
+                                       break
+                               }
+                       }
                 }
-               w.state = w.state(w)
-       }
-}
-
-func (l *htmlElementsCollectorWriter) backup() {
-       l.pos -= l.width
-       l.r, _ = utf8.DecodeRune(l.input[l.pos:])
-}
  
-func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-       var s htmlCollectorStateFunc
-       s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
-               w.buff.WriteRune(w.r)
-               if condition() {
-                       w.buff.Reset()
-                       return resolve
+               if w.isCollecting {
+                       // If we are collecting, cycle through byte stream until end bracket ">" is found,
+                       // disregard any ">" if within a quote,
+                       // write bytes until found to buffer.
+                       for ; i < len(p); i++ {
+                               b := p[i]
+                               w.toggleIfQuote(b)
+                               w.buff.WriteByte(b)
+
+                               if !w.inQuote && b == '>' {
+                                       w.endCollecting()
+                                       break
+                               }
+                       }
                 }
-               return s
-       }
-       return s
-}
  
-func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-       var s htmlCollectorStateFunc
-       s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
-               if condition(w.r) {
-                       return resolve
-               }
-               return s
-       }
-       return s
-}
+               // If no end bracket ">" is found while collecting, but the stream ended
+               // this could mean we received chunks of a stream from e.g. the minify functionality
+               // next if loop will be skipped.
  
-// Starts with e.g. "<body " or "<div"
-func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-       var s htmlCollectorStateFunc
-       s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-               w.buff.WriteRune(w.r)
-
-               // Skip any text inside a quote.
-               if w.r == '\'' || w.r == '"' {
-                       if w.inQuote == w.r {
-                               w.inQuote = 0
-                       } else if w.inQuote == 0 {
-                               w.inQuote = w.r
+               // At this point we have collected an element line between angle brackets "<" and ">".
+               if !w.isCollecting {
+                       if w.buff.Len() == 0 {
+                               continue
                         }
-               }
  
-               if w.inQuote != 0 {
-                       return s
-               }
+                       if w.inPreTag != "" { // within preformatted code block
+                               s := w.buff.String()
+                               w.buff.Reset()
+                               if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
+                                       w.inPreTag = ""
+                               }
+                               continue
+                       }
  
-               if w.r == '>' {
+                       // First check if we have processed this element before.
+                       w.collector.mu.RLock()
  
                         // Work with the bytes slice as long as it's practical,
                         // to save memory allocations.
                         b := w.buff.Bytes()
  
-                       defer func() {
-                               w.buff.Reset()
-                       }()
-
-                       // First check if we have processed this element before.
-                       w.collector.mu.RLock()
-
+                       // See https://github.com/dominikh/go-tools/issues/723
+                       //lint:ignore S1030 This construct avoids memory allocation for the string.
                         seen := w.collector.elementSet[string(b)]
                         w.collector.mu.RUnlock()
                         if seen {
-                               return resolve
+                               w.buff.Reset()
+                               continue
+                       }
+
+                       // Filter out unwanted tags
+                       // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
+                       // comments and doctype tags
+                       // end tags.
+                       switch {
+                       case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
+                               w.buff.Reset()
+                               continue
+                       case bytes.HasPrefix(b, []byte("</")): // end tag
+                               w.buff.Reset()
+                               continue
                         }
  
                         s := w.buff.String()
+                       w.buff.Reset()
  
-                       if s == "" {
-                               return resolve
+                       // Check if a preformatted code block started.
+                       if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
+                               w.inPreTag = tagName
                         }
  
                         // Parse each collected element.
                         el, err := parseHTMLElement(s)
                         if err != nil {
-                               w.err = err
-                               return resolve
+                               return n, err
                         }
  
                         // Write this tag to the element set.
@@ -251,137 +208,109 @@ func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStat
                         w.collector.elementSet[s] = true
                         w.collector.elements = append(w.collector.elements, el)
                         w.collector.mu.Unlock()
-
-                       return resolve
-
                 }
-
-               return s
         }
  
-       return s
+       return
  }
  
-func (l *htmlElementsCollectorWriter) next() rune {
-       if l.pos >= len(l.input) {
-               l.width = 0
-               return eof
-       }
-
-       runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
-       l.width = runeWidth
-       l.pos += l.width
-       return runeValue
+func (c *htmlElementsCollectorWriter) startCollecting() {
+       c.isCollecting = true
  }
  
-// returns the next state in HTML element scanner.
-type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
+func (c *htmlElementsCollectorWriter) endCollecting() {
+       c.isCollecting = false
+       c.inQuote = false
+}
  
-// At "<", buffer empty.
-// Potentially starting a HTML element.
-func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-       if w.r == '>' || unicode.IsSpace(w.r) {
-               if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
-                       w.buff.Reset()
-                       return htmlLexStart
+func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
+       if isQuote(b) {
+               if c.inQuote && b == c.quoteValue {
+                       c.inQuote = false
+               } else if !c.inQuote {
+                       c.inQuote = true
+                       c.quoteValue = b
                 }
+       }
+}
  
-               tagName := w.buff.Bytes()[1:]
-
-               switch {
-               case skipInnerElementRe.Match(tagName):
-                       // pre, script etc. We collect classes etc. on the surrounding
-                       // element, but skip the inner content.
-                       w.backup()
+func isQuote(b byte) bool {
+       return b == '"' || b == '\''
+}
  
-                       // tagName will be overwritten, so make a copy.
-                       tagNameCopy := make([]byte, len(tagName))
-                       copy(tagNameCopy, tagName)
+func parseStartTag(s string) (string, bool) {
+       s = strings.TrimPrefix(s, "<")
+       s = strings.TrimSuffix(s, ">")
  
-                       return w.lexElementInside(
-                               w.consumeBuffUntil(
-                                       func() bool {
-                                               if w.r != '>' {
-                                                       return false
-                                               }
-                                               m := endTagRe.FindSubmatch(w.buff.Bytes())
-                                               if m == nil {
-                                                       return false
-                                               }
-                                               return bytes.EqualFold(m[1], tagNameCopy)
-                                       },
-                                       htmlLexStart,
-                               ))
-               case skipAllElementRe.Match(tagName):
-                       // E.g. "<!DOCTYPE ..."
-                       w.buff.Reset()
-                       return w.consumeRuneUntil(func(r rune) bool {
-                               return r == '>'
-                       }, htmlLexStart)
-               default:
-                       w.backup()
-                       return w.defaultLexElementInside
-               }
+       spaceIndex := strings.Index(s, " ")
+       if spaceIndex != -1 {
+               s = s[:spaceIndex]
         }
  
-       w.buff.WriteRune(w.r)
+       return strings.ToLower(strings.TrimSpace(s)), true
+}
  
-       // If it's a comment, skip to its end.
-       if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
-               w.buff.Reset()
-               return htmlLexToEndOfComment
+func parseEndTag(s string) (string, bool) {
+       if !strings.HasPrefix(s, "</") {
+               return "", false
         }
  
-       return htmlLexElementStart
+       s = strings.TrimPrefix(s, "</")
+       s = strings.TrimSuffix(s, ">")
+
+       return strings.ToLower(strings.TrimSpace(s)), true
  }
  
-// Entry state func.
-// Looks for a opening bracket, '<'.
-func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-       if w.r == '<' {
-               w.backup()
-               w.buff.Reset()
-               return htmlLexElementStart
-       }
+// No need to look inside these for HTML elements.
+func isPreFormatted(s string) bool {
+       return s == "pre" || s == "textarea" || s == "script" || s == "style"
+}
  
-       return htmlLexStart
+type htmlElement struct {
+       Tag     string
+       Classes []string
+       IDs     []string
  }
  
-// After "<!--", buff empty.
-func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-       w.buff.WriteRune(w.r)
+var (
+       htmlJsonFixer = strings.NewReplacer(", ", "\n")
+       jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
+       classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
  
-       if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
-               // Done, start looking for HTML elements again.
-               return htmlLexStart
+       exceptionList = map[string]bool{
+               "thead": true,
+               "tbody": true,
+               "tfoot": true,
+               "td":    true,
+               "tr":    true,
         }
-
-       return htmlLexToEndOfComment
-}
+)
  
  func parseHTMLElement(elStr string) (el htmlElement, err error) {
+       var tagBuffer string = ""
  
-       tagName := parseStartTag(elStr)
-
-       el.Tag = strings.ToLower(tagName)
-       tagNameToParse := el.Tag
+       tagName, ok := parseStartTag(elStr)
+       if !ok {
+               return
+       }
  
         // The net/html parser does not handle single table elements as input, e.g. tbody.
         // We only care about the element/class/ids, so just store away the original tag name
         // and pretend it's a <div>.
-       if exceptionList[el.Tag] {
+       if exceptionList[tagName] {
+               tagBuffer = tagName
                 elStr = strings.Replace(elStr, tagName, "div", 1)
-               tagNameToParse = "div"
         }
  
         n, err := html.Parse(strings.NewReader(elStr))
         if err != nil {
                 return
         }
-
         var walk func(*html.Node)
         walk = func(n *html.Node) {
-               if n.Type == html.ElementNode && n.Data == tagNameToParse {
+               if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
+                       el.Tag = n.Data
+
                         for _, a := range n.Attr {
                                 switch {
                                 case strings.EqualFold(a.Key, "id"):
@@ -416,20 +345,10 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
  
         walk(n)
  
-       return
-}
-
-// Variants of s
-//    <body class="b a">
-//    <div>
-func parseStartTag(s string) string {
-       spaceIndex := strings.IndexFunc(s, func(r rune) bool {
-               return unicode.IsSpace(r)
-       })
-
-       if spaceIndex == -1 {
-               return s[1 : len(s)-1]
+       // did we replaced the start tag?
+       if tagBuffer != "" {
+               el.Tag = tagBuffer
         }
  
-       return s[1:spaceIndex]
+       return
  }
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go

index 2eac31f73b7894bf1a69451c2068152865a13bfe..0c8b2b65b347fa862ea39b2719b7bc7cab5b09d0 100644 (file)
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -15,12 +15,8 @@ package publisher
  
  import (
         "fmt"
-       "math/rand"
         "strings"
         "testing"
-       "time"
-
-       "github.com/gohugoio/hugo/common/text"
  
         "github.com/gohugoio/hugo/media"
         "github.com/gohugoio/hugo/minifiers"
@@ -32,7 +28,6 @@ import (
  
  func TestClassCollector(t *testing.T) {
         c := qt.New((t))
-       rnd := rand.New(rand.NewSource(time.Now().Unix()))
  
         f := func(tags, classes, ids string) HTMLElements {
                 var tagss, classess, idss []string
@@ -62,20 +57,14 @@ func TestClassCollector(t *testing.T) {
                 expect HTMLElements
         }{
                 {"basic", `<body class="b a"></body>`, f("body", "a b", "")},
-               {"duplicates", `<div class="b a b"></div><div class="b a b"></div>x'`, f("div", "a b", "")},
+               {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
                 {"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
                 {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
-               {"short", `<i>`, f("i", "", "")},
-               {"invalid", `< body class="b a"></body><div></div>`, f("div", "", "")},
                 // https://github.com/gohugoio/hugo/issues/7318
                 {"thead", `<table class="cl1">
      <thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
      <tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
  </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
-               {"thead uppercase", `<TABLE class="CL1">
-    <THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD>
-    <TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY>
-</TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")},
                 // https://github.com/gohugoio/hugo/issues/7161
                 {"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
                 {"AlpineJS bind 1", `<body>
@@ -109,11 +98,6 @@ func TestClassCollector(t *testing.T) {
                 {"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
                 {"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
                 {"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
-               {"Comments with elements before and after", `<div></div><!-- example comment --><span><span>`, f("div span", "", "")},
-               // Issue #8530
-               {"Comment with single quote", `<!-- Hero Area Image d'accueil --><i class="foo">`, f("i", "foo", "")},
-               {"Uppercase tags", `<DIV></DIV>`, f("div", "", "")},
-               {"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")},
                 // Issue #8417
                 {"Tabs inline", `<hr    id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
                 {"Tabs on multiple rows", `<form
@@ -124,29 +108,16 @@ func TestClassCollector(t *testing.T) {
  <div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
         } {
  
-               for _, variant := range []struct {
-                       minify bool
-                       stream bool
-               }{
-                       {minify: false, stream: false},
-                       {minify: true, stream: false},
-                       {minify: false, stream: true},
-               } {
-
-                       c.Run(fmt.Sprintf("%s--minify-%t--stream-%t", test.name, variant.minify, variant.stream), func(c *qt.C) {
+               for _, minify := range []bool{false, true} {
+                       c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
                                 w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-                               if variant.minify {
+                               if minify {
                                         if skipMinifyTest[test.name] {
                                                 c.Skip("skip minify test")
                                         }
                                         v := viper.New()
                                         m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
                                         m.Minify(media.HTMLType, w, strings.NewReader(test.html))
-                               } else if variant.stream {
-                                       chunks := text.Chunk(test.html, rnd.Intn(41)+1)
-                                       for _, chunk := range chunks {
-                                               fmt.Fprint(w, chunk)
-                                       }
                                 } else {
                                         fmt.Fprint(w, test.html)
                                 }
@@ -155,7 +126,6 @@ func TestClassCollector(t *testing.T) {
                         })
                 }
         }
-
  }
  
  func BenchmarkElementsCollectorWriter(b *testing.B) {
author	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Wed, 19 May 2021 01:45:36 +0000 (03:45 +0200)
committer	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Wed, 19 May 2021 01:45:36 +0000 (03:45 +0200)
common/text/transform.go		patch \| blob \| history
publisher/htmlElementsCollector.go		patch \| blob \| history
publisher/htmlElementsCollector_test.go		patch \| blob \| history