publisher: Exclude comment and doctype elements from writeStats
authorDirk Olbrich <mail@dirkolbrich.de>
Mon, 12 Apr 2021 21:42:51 +0000 (23:42 +0200)
committerBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 20 Apr 2021 15:24:17 +0000 (17:24 +0200)
- Reorder code blocks
- Rename cssClassCollectorWriter to htmlElementCollectorWriter, as it just collect html element information
- Expand benchmark to test for minified and unminified content

Fixes #8396
Fixes #8417

hugolib/site_test.go
publisher/htmlElementsCollector.go
publisher/htmlElementsCollector_test.go

index cd7ce51f8fd054d9b9f3d4aedd4a60becf98ef98..365679a328b5c5b47481775ebd4c71b023ce372e 100644 (file)
@@ -1113,7 +1113,7 @@ ABC.
                els := stats.HTMLElements
 
                b.Assert(els.Classes, qt.HasLen, 3606) // (4 * 900) + 4 +2
-               b.Assert(els.Tags, qt.HasLen, 9)
+               b.Assert(els.Tags, qt.HasLen, 8)
                b.Assert(els.IDs, qt.HasLen, 1)
        }
 }
index d9479aafaa527905d5ea4a1a5cd430a9e9fe434e..9f4be1ff5b7adbb47f95d996de5aba08868c9a27 100644 (file)
@@ -20,21 +20,10 @@ import (
        "strings"
        "sync"
 
-       "github.com/gohugoio/hugo/helpers"
        "golang.org/x/net/html"
-)
-
-func newHTMLElementsCollector() *htmlElementsCollector {
-       return &htmlElementsCollector{
-               elementSet: make(map[string]bool),
-       }
-}
 
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter {
-       return &cssClassCollectorWriter{
-               collector: collector,
-       }
-}
+       "github.com/gohugoio/hugo/helpers"
+)
 
 // HTMLElements holds lists of tags and attribute values for classes and id.
 type HTMLElements struct {
@@ -59,7 +48,50 @@ func (h *HTMLElements) Sort() {
        sort.Strings(h.IDs)
 }
 
-type cssClassCollectorWriter struct {
+type htmlElementsCollector struct {
+       // Contains the raw HTML string. We will get the same element
+       // several times, and want to avoid costly reparsing when this
+       // is used for aggregated data only.
+       elementSet map[string]bool
+
+       elements []htmlElement
+
+       mu sync.RWMutex
+}
+
+func newHTMLElementsCollector() *htmlElementsCollector {
+       return &htmlElementsCollector{
+               elementSet: make(map[string]bool),
+       }
+}
+
+func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
+       var (
+               classes []string
+               ids     []string
+               tags    []string
+       )
+
+       for _, el := range c.elements {
+               classes = append(classes, el.Classes...)
+               ids = append(ids, el.IDs...)
+               tags = append(tags, el.Tag)
+       }
+
+       classes = helpers.UniqueStringsSorted(classes)
+       ids = helpers.UniqueStringsSorted(ids)
+       tags = helpers.UniqueStringsSorted(tags)
+
+       els := HTMLElements{
+               Classes: classes,
+               IDs:     ids,
+               Tags:    tags,
+       }
+
+       return els
+}
+
+type htmlElementsCollectorWriter struct {
        collector *htmlElementsCollector
        buff      bytes.Buffer
 
@@ -70,11 +102,19 @@ type cssClassCollectorWriter struct {
        quoteValue byte
 }
 
-func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+       return &htmlElementsCollectorWriter{
+               collector: collector,
+       }
+}
+
+// Write splits the incoming stream into single html element and writes these into elementSet
+func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
        n = len(p)
        i := 0
 
        for i < len(p) {
+               // if is not collecting, cycle through byte stream until start bracket "<" is found
                if !w.isCollecting {
                        for ; i < len(p); i++ {
                                b := p[i]
@@ -86,109 +126,89 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
                }
 
                if w.isCollecting {
+                       // if is collecting, cycle through byte stream until end bracket ">" is found
+                       // disregard any ">" if within a quote
+                       // write bytes until found to buffer
                        for ; i < len(p); i++ {
                                b := p[i]
                                w.toggleIfQuote(b)
+                               w.buff.WriteByte(b)
+
                                if !w.inQuote && b == '>' {
                                        w.endCollecting()
                                        break
                                }
-                               w.buff.WriteByte(b)
                        }
+               }
 
-                       if !w.isCollecting {
-                               if w.inPreTag != "" {
-                                       s := w.buff.String()
-                                       if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName {
-                                               w.inPreTag = ""
-                                       }
-                                       w.buff.Reset()
-                                       continue
-                               }
-
-                               // First check if we have processed this element before.
-                               w.collector.mu.RLock()
-
-                               // See https://github.com/dominikh/go-tools/issues/723
-                               //lint:ignore S1030 This construct avoids memory allocation for the string.
-                               seen := w.collector.elementSet[string(w.buff.Bytes())]
-                               w.collector.mu.RUnlock()
-                               if seen {
-                                       w.buff.Reset()
-                                       continue
-                               }
-
-                               s := w.buff.String()
-
-                               w.buff.Reset()
+               // if no end bracket ">" is found while collecting, but the stream ended
+               // this could mean we received chunks of a stream from e.g. the minify functionality
+               // next if loop will be skipped
 
-                               if strings.HasPrefix(s, "</") {
-                                       continue
+               // at this point we have collected an element line between angle brackets "<" and ">"
+               if !w.isCollecting {
+                       s := w.buff.String()
+                       w.buff.Reset()
+
+                       // filter out unwanted tags
+                       // empty string, just in case
+                       // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
+                       // comments and doctype tags
+                       // end tags
+                       switch {
+                       case s == "": // empty string
+                               continue
+                       case w.inPreTag != "": // within preformatted code block
+                               if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
+                                       w.inPreTag = ""
                                }
+                               continue
+                       case strings.HasPrefix(s, "<!"): // comment or doctype tag
+                               continue
+                       case strings.HasPrefix(s, "</"): // end tag
+                               continue
+                       }
 
-                               key := s
-
-                               s, tagName := w.insertStandinHTMLElement(s)
-                               el := parseHTMLElement(s)
-                               el.Tag = tagName
-                               if w.isPreFormatted(tagName) {
-                                       w.inPreTag = tagName
-                               }
+                       // check if we have processed this element before.
+                       w.collector.mu.RLock()
+                       seen := w.collector.elementSet[s]
+                       w.collector.mu.RUnlock()
+                       if seen {
+                               continue
+                       }
 
-                               w.collector.mu.Lock()
-                               w.collector.elementSet[key] = true
-                               if el.Tag != "" {
-                                       w.collector.elements = append(w.collector.elements, el)
-                               }
-                               w.collector.mu.Unlock()
+                       // check if a preformatted code block started
+                       if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
+                               w.inPreTag = tagName
+                       }
 
+                       // parse each collected element
+                       el, err := parseHTMLElement(s)
+                       if err != nil {
+                               return n, err
                        }
+
+                       // write this tag to the element set
+                       w.collector.mu.Lock()
+                       w.collector.elementSet[s] = true
+                       w.collector.elements = append(w.collector.elements, el)
+                       w.collector.mu.Unlock()
                }
        }
 
        return
 }
 
-// No need to look inside these for HTML elements.
-func (c *cssClassCollectorWriter) isPreFormatted(s string) bool {
-       return s == "pre" || s == "textarea" || s == "script"
-}
-
-// The net/html parser does not handle single table elements as input, e.g. tbody.
-// We only care about the element/class/ids, so just store away the original tag name
-// and pretend it's a <div>.
-func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, string) {
-       tag := el[1:]
-       spacei := strings.Index(tag, " ")
-       if spacei != -1 {
-               tag = tag[:spacei]
-       }
-       tag = strings.Trim(tag, "\n ")
-       newv := strings.Replace(el, tag, "div", 1)
-       return newv, strings.ToLower(tag)
-}
-
-func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) {
-       if !strings.HasPrefix(s, "</") {
-               return "", false
-       }
-       s = strings.TrimPrefix(s, "</")
-       s = strings.TrimSuffix(s, ">")
-       return strings.ToLower(strings.TrimSpace(s)), true
+func (c *htmlElementsCollectorWriter) startCollecting() {
+       c.isCollecting = true
 }
 
-func (c *cssClassCollectorWriter) endCollecting() {
+func (c *htmlElementsCollectorWriter) endCollecting() {
        c.isCollecting = false
        c.inQuote = false
-
-}
-
-func (c *cssClassCollectorWriter) startCollecting() {
-       c.isCollecting = true
-
 }
 
-func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
+func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
        if isQuote(b) {
                if c.inQuote && b == c.quoteValue {
                        c.inQuote = false
@@ -199,51 +219,46 @@ func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
        }
 }
 
-type htmlElement struct {
-       Tag     string
-       Classes []string
-       IDs     []string
+func isQuote(b byte) bool {
+       return b == '"' || b == '\''
 }
 
-type htmlElementsCollector struct {
-       // Contains the raw HTML string. We will get the same element
-       // several times, and want to avoid costly reparsing when this
-       // is used for aggregated data only.
-       elementSet map[string]bool
+func parseStartTag(s string) (string, bool) {
+       if strings.HasPrefix(s, "</") || strings.HasPrefix(s, "<!") {
+               return "", false
+       }
 
-       elements []htmlElement
+       s = strings.TrimPrefix(s, "<")
+       s = strings.TrimSuffix(s, ">")
 
-       mu sync.RWMutex
-}
+       spaceIndex := strings.Index(s, " ")
+       if spaceIndex != -1 {
+               s = s[:spaceIndex]
+       }
 
-func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
-       var (
-               classes []string
-               ids     []string
-               tags    []string
-       )
+       return strings.ToLower(strings.TrimSpace(s)), true
+}
 
-       for _, el := range c.elements {
-               classes = append(classes, el.Classes...)
-               ids = append(ids, el.IDs...)
-               tags = append(tags, el.Tag)
+func parseEndTag(s string) (string, bool) {
+       if !strings.HasPrefix(s, "</") {
+               return "", false
        }
 
-       classes = helpers.UniqueStringsSorted(classes)
-       ids = helpers.UniqueStringsSorted(ids)
-       tags = helpers.UniqueStringsSorted(tags)
+       s = strings.TrimPrefix(s, "</")
+       s = strings.TrimSuffix(s, ">")
 
-       els := HTMLElements{
-               Classes: classes,
-               IDs:     ids,
-               Tags:    tags,
-       }
+       return strings.ToLower(strings.TrimSpace(s)), true
+}
 
-       return els
+// No need to look inside these for HTML elements.
+func isPreFormatted(s string) bool {
+       return s == "pre" || s == "textarea" || s == "script" || s == "style"
 }
 
-func isQuote(b byte) bool {
-       return b == '"' || b == '\''
+type htmlElement struct {
+       Tag     string
+       Classes []string
+       IDs     []string
 }
 
 var (
@@ -252,11 +267,29 @@ var (
        classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
 )
 
-func parseHTMLElement(elStr string) (el htmlElement) {
-       elStr = strings.TrimSpace(elStr)
-       if !strings.HasSuffix(elStr, ">") {
-               elStr += ">"
+func parseHTMLElement(elStr string) (el htmlElement, err error) {
+       var tagBuffer string = ""
+       exceptionList := map[string]bool{
+               "thead": true,
+               "tbody": true,
+               "tfoot": true,
+               "td":    true,
+               "tr":    true,
        }
+
+       tagName, ok := parseStartTag(elStr)
+       if !ok {
+               return
+       }
+
+       // The net/html parser does not handle single table elements as input, e.g. tbody.
+       // We only care about the element/class/ids, so just store away the original tag name
+       // and pretend it's a <div>.
+       if exceptionList[tagName] {
+               tagBuffer = tagName
+               elStr = strings.Replace(elStr, tagName, "div", 1)
+       }
+
        n, err := html.Parse(strings.NewReader(elStr))
        if err != nil {
                return
@@ -287,7 +320,6 @@ func parseHTMLElement(elStr string) (el htmlElement) {
                                                        val = strings.Join(lines, "\n")
                                                        val = jsonAttrRe.ReplaceAllString(val, "$1")
                                                        el.Classes = append(el.Classes, strings.Fields(val)...)
-
                                                }
                                        }
                                }
@@ -301,5 +333,10 @@ func parseHTMLElement(elStr string) (el htmlElement) {
 
        walk(n)
 
+       // did we replaced the start tag?
+       if tagBuffer != "" {
+               el.Tag = tagBuffer
+       }
+
        return
 }
index 5b2d85d47fe1b925aa69057358a834cc9360e966..1ada27c18d54c552000f3e4a038ad1ae28e0854a 100644 (file)
 package publisher
 
 import (
+       "bytes"
        "fmt"
        "strings"
        "testing"
 
-       "github.com/gohugoio/hugo/minifiers"
-
        "github.com/gohugoio/hugo/media"
+       "github.com/gohugoio/hugo/minifiers"
        "github.com/gohugoio/hugo/output"
-       "github.com/spf13/viper"
 
        qt "github.com/frankban/quicktest"
+       "github.com/spf13/viper"
 )
 
 func TestClassCollector(t *testing.T) {
@@ -50,7 +50,6 @@ func TestClassCollector(t *testing.T) {
 
        skipMinifyTest := map[string]bool{
                "Script tags content should be skipped": true, // https://github.com/tdewolff/minify/issues/396
-
        }
 
        for _, test := range []struct {
@@ -62,56 +61,57 @@ func TestClassCollector(t *testing.T) {
                {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
                {"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
                {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
-               {"thead", `
-               https://github.com/gohugoio/hugo/issues/7318
-<table class="cl1">
+               // https://github.com/gohugoio/hugo/issues/7318
+               {"thead", `<table class="cl1">
     <thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
     <tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
 </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
                // https://github.com/gohugoio/hugo/issues/7161
                {"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
-
                {"AlpineJS bind 1", `<body>
-                       <div x-bind:class="{
+    <div x-bind:class="{
         'class1': data.open,
         'class2 class3': data.foo == 'bar'
          }">
-                       </div>
-               </body>`, f("body div", "class1 class2 class3", "")},
-
-               {
-                       "Alpine bind 2", `<div x-bind:class="{ 'bg-black':  filter.checked }"
-                        class="inline-block mr-1 mb-2 rounded  bg-gray-300 px-2 py-2">FOO</div>`,
+    </div>
+</body>`, f("body div", "class1 class2 class3", "")},
+               {"AlpineJS bind 2", `<div x-bind:class="{ 'bg-black':  filter.checked }" class="inline-block mr-1 mb-2 rounded  bg-gray-300 px-2 py-2">FOO</div>`,
                        f("div", "bg-black bg-gray-300 inline-block mb-2 mr-1 px-2 py-2 rounded", ""),
                },
-
-               {"Alpine bind 3", `<div x-bind:class="{ 'text-gray-800':  !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
-               {"Alpine bind 4", `<div x-bind:class="{ 'text-gray-800':  !checked, 
+               {"AlpineJS bind 3", `<div x-bind:class="{ 'text-gray-800':  !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
+               {"AlpineJS bind 4", `<div x-bind:class="{ 'text-gray-800':  !checked, 
                                         'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
-
-               {"Alpine bind 5", `<a x-bind:class="{
+               {"AlpineJS bind 5", `<a x-bind:class="{
                 'text-a': a && b,
                 'text-b': !a && b || c,
                 'pl-3': a === 1,
                  pl-2: b == 3,
                 'text-gray-600': (a > 1)
-      
                 }" class="block w-36 cursor-pointer pr-3 no-underline capitalize"></a>`, f("a", "block capitalize cursor-pointer no-underline pl-2 pl-3 pr-3 text-a text-b text-gray-600 w-36", "")},
-
-               {"Alpine transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
+               {"AlpineJS transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
                {"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
                // Issue #7746
                {"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")},
                // Issue #7567
                {"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")},
+               {"Style tags content should be skipped", `<style>p{color: red;font-size: 20px;}</style><div class="foo"></div>`, f("div style", "foo", "")},
                {"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")},
-               {"Textare tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
+               {"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
+               {"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
+               {"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
+               // Issue #8417
+               {"Tabs inline", `<hr    id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
+               {"Tabs on multiple rows", `<form
+                       id="a"
+                       action="www.example.com"
+                       method="post"
+></form>
+<div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
        } {
 
                for _, minify := range []bool{false, true} {
                        c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
                                w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-
                                if minify {
                                        if skipMinifyTest[test.name] {
                                                c.Skip("skip minify test")
@@ -152,6 +152,106 @@ func BenchmarkClassCollectorWriter(b *testing.B) {
        for i := 0; i < b.N; i++ {
                w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
                fmt.Fprint(w, benchHTML)
+       }
+}
 
+const benchHTML = `
+<!DOCTYPE html>
+<html>
+<head>
+<title>title</title>
+<style>
+       a {color: red;}
+       .c {color: blue;}
+</style>
+</head>
+<body id="i1" class="a b c d">
+<a class="c d e"></a>
+<hr>
+<a class="c d e"></a>
+<a class="c d e"></a>
+<hr>
+<a id="i2" class="c d e f"></a>
+<a id="i3" class="c d e"></a>
+<a class="c d e"></a>
+<p>To force<br> line breaks<br> in a text,<br> use the br<br> element.</p>
+<hr>
+<a class="c d e"></a>
+<a class="c d e"></a>
+<a class="c d e"></a>
+<a class="c d e"></a>
+<table>
+  <thead class="ch">
+  <tr>
+    <th>Month</th>
+    <th>Savings</th>
+  </tr>
+  </thead>
+  <tbody class="cb">
+  <tr>
+    <td>January</td>
+    <td>$100</td>
+  </tr>
+  <tr>
+    <td>February</td>
+    <td>$200</td>
+  </tr>
+  </tbody>
+  <tfoot class="cf">
+  <tr>
+    <td></td>
+    <td>$300</td>
+  </tr>
+  </tfoot>
+</table>
+</body>
+</html>
+`
+
+func BenchmarkElementsCollectorWriter(b *testing.B) {
+       b.ReportAllocs()
+       for i := 0; i < b.N; i++ {
+               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
+               fmt.Fprint(w, benchHTML)
+       }
+}
+
+func BenchmarkElementsCollectorWriterMinified(b *testing.B) {
+       b.ReportAllocs()
+       v := viper.New()
+       m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
+       var buf bytes.Buffer
+       m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
+       b.ResetTimer()
+
+       for i := 0; i < b.N; i++ {
+               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
+               fmt.Fprint(w, buf.String())
+       }
+}
+
+func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) {
+       b.ReportAllocs()
+       v := viper.New()
+       m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
+       b.ResetTimer()
+
+       for i := 0; i < b.N; i++ {
+               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
+               m.Minify(media.HTMLType, w, strings.NewReader(benchHTML))
+       }
+}
+
+func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) {
+       b.ReportAllocs()
+       v := viper.New()
+       m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
+       b.ResetTimer()
+
+       for i := 0; i < b.N; i++ {
+               var buf bytes.Buffer
+               m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
+               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
+               fmt.Fprint(w, buf.String())
        }
 }