publisher: Some performance tweaks for the HTML elements collector
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 20 Apr 2021 14:50:03 +0000 (16:50 +0200)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 20 Apr 2021 15:24:17 +0000 (17:24 +0200)
publisher/htmlElementsCollector.go
publisher/htmlElementsCollector_test.go

index 9f4be1ff5b7adbb47f95d996de5aba08868c9a27..13387a7ee7efbb584984494ff52ce87f5f8a1d38 100644 (file)
@@ -108,13 +108,13 @@ func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlEleme
        }
 }
 
-// Write splits the incoming stream into single html element and writes these into elementSet
+// Write splits the incoming stream into single html element.
 func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
        n = len(p)
        i := 0
 
        for i < len(p) {
-               // if is not collecting, cycle through byte stream until start bracket "<" is found
+               // If we are not collecting, cycle through byte stream until start bracket "<" is found.
                if !w.isCollecting {
                        for ; i < len(p); i++ {
                                b := p[i]
@@ -126,9 +126,9 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
                }
 
                if w.isCollecting {
-                       // if is collecting, cycle through byte stream until end bracket ">" is found
-                       // disregard any ">" if within a quote
-                       // write bytes until found to buffer
+                       // If we are collecting, cycle through byte stream until end bracket ">" is found,
+                       // disregard any ">" if within a quote,
+                       // write bytes until found to buffer.
                        for ; i < len(p); i++ {
                                b := p[i]
                                w.toggleIfQuote(b)
@@ -141,54 +141,69 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
                        }
                }
 
-               // if no end bracket ">" is found while collecting, but the stream ended
+               // If no end bracket ">" is found while collecting, but the stream ended
                // this could mean we received chunks of a stream from e.g. the minify functionality
-               // next if loop will be skipped
+               // next if loop will be skipped.
 
-               // at this point we have collected an element line between angle brackets "<" and ">"
+               // At this point we have collected an element line between angle brackets "<" and ">".
                if !w.isCollecting {
-                       s := w.buff.String()
-                       w.buff.Reset()
-
-                       // filter out unwanted tags
-                       // empty string, just in case
-                       // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
-                       // comments and doctype tags
-                       // end tags
-                       switch {
-                       case s == "": // empty string
+                       if w.buff.Len() == 0 {
                                continue
-                       case w.inPreTag != "": // within preformatted code block
+                       }
+
+                       if w.inPreTag != "" { // within preformatted code block
+                               s := w.buff.String()
+                               w.buff.Reset()
                                if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
                                        w.inPreTag = ""
                                }
                                continue
-                       case strings.HasPrefix(s, "<!"): // comment or doctype tag
-                               continue
-                       case strings.HasPrefix(s, "</"): // end tag
-                               continue
                        }
 
-                       // check if we have processed this element before.
+                       // First check if we have processed this element before.
                        w.collector.mu.RLock()
-                       seen := w.collector.elementSet[s]
+
+                       // Work with the bytes slice as long as it's practical,
+                       // to save memory allocations.
+                       b := w.buff.Bytes()
+
+                       // See https://github.com/dominikh/go-tools/issues/723
+                       //lint:ignore S1030 This construct avoids memory allocation for the string.
+                       seen := w.collector.elementSet[string(b)]
                        w.collector.mu.RUnlock()
                        if seen {
+                               w.buff.Reset()
                                continue
                        }
 
-                       // check if a preformatted code block started
+                       // Filter out unwanted tags
+                       // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
+                       // comments and doctype tags
+                       // end tags.
+                       switch {
+                       case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
+                               w.buff.Reset()
+                               continue
+                       case bytes.HasPrefix(b, []byte("</")): // end tag
+                               w.buff.Reset()
+                               continue
+                       }
+
+                       s := w.buff.String()
+                       w.buff.Reset()
+
+                       // Check if a preformatted code block started.
                        if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
                                w.inPreTag = tagName
                        }
 
-                       // parse each collected element
+                       // Parse each collected element.
                        el, err := parseHTMLElement(s)
                        if err != nil {
                                return n, err
                        }
 
-                       // write this tag to the element set
+                       // Write this tag to the element set.
                        w.collector.mu.Lock()
                        w.collector.elementSet[s] = true
                        w.collector.elements = append(w.collector.elements, el)
@@ -265,17 +280,18 @@ var (
        htmlJsonFixer = strings.NewReplacer(", ", "\n")
        jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
        classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
-)
 
-func parseHTMLElement(elStr string) (el htmlElement, err error) {
-       var tagBuffer string = ""
-       exceptionList := map[string]bool{
+       exceptionList = map[string]bool{
                "thead": true,
                "tbody": true,
                "tfoot": true,
                "td":    true,
                "tr":    true,
        }
+)
+
+func parseHTMLElement(elStr string) (el htmlElement, err error) {
+       var tagBuffer string = ""
 
        tagName, ok := parseStartTag(elStr)
        if !ok {
index 1ada27c18d54c552000f3e4a038ad1ae28e0854a..0c8b2b65b347fa862ea39b2719b7bc7cab5b09d0 100644 (file)
@@ -14,7 +14,6 @@
 package publisher
 
 import (
-       "bytes"
        "fmt"
        "strings"
        "testing"
@@ -129,33 +128,8 @@ func TestClassCollector(t *testing.T) {
        }
 }
 
-func BenchmarkClassCollectorWriter(b *testing.B) {
+func BenchmarkElementsCollectorWriter(b *testing.B) {
        const benchHTML = `
-<html>
-<body id="i1" class="a b c d">
-<a class="c d e"></a>
-<br>
-<a class="c d e"></a>
-<a class="c d e"></a>
-<br>
-<a id="i2" class="c d e f"></a>
-<a id="i3" class="c d e"></a>
-<a class="c d e"></a>
-<br>
-<a class="c d e"></a>
-<a class="c d e"></a>
-<a class="c d e"></a>
-<a class="c d e"></a>
-</body>
-</html>
-`
-       for i := 0; i < b.N; i++ {
-               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-               fmt.Fprint(w, benchHTML)
-       }
-}
-
-const benchHTML = `
 <!DOCTYPE html>
 <html>
 <head>
@@ -207,51 +181,9 @@ const benchHTML = `
 </body>
 </html>
 `
-
-func BenchmarkElementsCollectorWriter(b *testing.B) {
-       b.ReportAllocs()
        for i := 0; i < b.N; i++ {
                w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
                fmt.Fprint(w, benchHTML)
-       }
-}
-
-func BenchmarkElementsCollectorWriterMinified(b *testing.B) {
-       b.ReportAllocs()
-       v := viper.New()
-       m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
-       var buf bytes.Buffer
-       m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
-       b.ResetTimer()
-
-       for i := 0; i < b.N; i++ {
-               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-               fmt.Fprint(w, buf.String())
-       }
-}
-
-func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) {
-       b.ReportAllocs()
-       v := viper.New()
-       m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
-       b.ResetTimer()
-
-       for i := 0; i < b.N; i++ {
-               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-               m.Minify(media.HTMLType, w, strings.NewReader(benchHTML))
-       }
-}
-
-func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) {
-       b.ReportAllocs()
-       v := viper.New()
-       m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
-       b.ResetTimer()
 
-       for i := 0; i < b.N; i++ {
-               var buf bytes.Buffer
-               m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
-               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-               fmt.Fprint(w, buf.String())
        }
 }