publisher: Some performance tweaks for the HTML elements collector

author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Tue, 20 Apr 2021 14:50:03 +0000 (16:50 +0200)

committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Tue, 20 Apr 2021 15:24:17 +0000 (17:24 +0200)
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 20 Apr 2021 14:50:03 +0000 (16:50 +0200)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 20 Apr 2021 15:24:17 +0000 (17:24 +0200)
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go

index 9f4be1ff5b7adbb47f95d996de5aba08868c9a27..13387a7ee7efbb584984494ff52ce87f5f8a1d38 100644 (file)
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -108,13 +108,13 @@ func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlEleme
         }
  }
  
-// Write splits the incoming stream into single html element and writes these into elementSet
+// Write splits the incoming stream into single html element.
  func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
         n = len(p)
         i := 0
  
         for i < len(p) {
-               // if is not collecting, cycle through byte stream until start bracket "<" is found
+               // If we are not collecting, cycle through byte stream until start bracket "<" is found.
                 if !w.isCollecting {
                         for ; i < len(p); i++ {
                                 b := p[i]
@@ -126,9 +126,9 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
                 }
  
                 if w.isCollecting {
-                       // if is collecting, cycle through byte stream until end bracket ">" is found
-                       // disregard any ">" if within a quote
-                       // write bytes until found to buffer
+                       // If we are collecting, cycle through byte stream until end bracket ">" is found,
+                       // disregard any ">" if within a quote,
+                       // write bytes until found to buffer.
                         for ; i < len(p); i++ {
                                 b := p[i]
                                 w.toggleIfQuote(b)
@@ -141,54 +141,69 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
                         }
                 }
  
-               // if no end bracket ">" is found while collecting, but the stream ended
+               // If no end bracket ">" is found while collecting, but the stream ended
                 // this could mean we received chunks of a stream from e.g. the minify functionality
-               // next if loop will be skipped
+               // next if loop will be skipped.
  
-               // at this point we have collected an element line between angle brackets "<" and ">"
+               // At this point we have collected an element line between angle brackets "<" and ">".
                 if !w.isCollecting {
-                       s := w.buff.String()
-                       w.buff.Reset()
-
-                       // filter out unwanted tags
-                       // empty string, just in case
-                       // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
-                       // comments and doctype tags
-                       // end tags
-                       switch {
-                       case s == "": // empty string
+                       if w.buff.Len() == 0 {
                                 continue
-                       case w.inPreTag != "": // within preformatted code block
+                       }
+
+                       if w.inPreTag != "" { // within preformatted code block
+                               s := w.buff.String()
+                               w.buff.Reset()
                                 if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
                                         w.inPreTag = ""
                                 }
                                 continue
-                       case strings.HasPrefix(s, "<!"): // comment or doctype tag
-                               continue
-                       case strings.HasPrefix(s, "</"): // end tag
-                               continue
                         }
  
-                       // check if we have processed this element before.
+                       // First check if we have processed this element before.
                         w.collector.mu.RLock()
-                       seen := w.collector.elementSet[s]
+
+                       // Work with the bytes slice as long as it's practical,
+                       // to save memory allocations.
+                       b := w.buff.Bytes()
+
+                       // See https://github.com/dominikh/go-tools/issues/723
+                       //lint:ignore S1030 This construct avoids memory allocation for the string.
+                       seen := w.collector.elementSet[string(b)]
                         w.collector.mu.RUnlock()
                         if seen {
+                               w.buff.Reset()
                                 continue
                         }
  
-                       // check if a preformatted code block started
+                       // Filter out unwanted tags
+                       // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
+                       // comments and doctype tags
+                       // end tags.
+                       switch {
+                       case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
+                               w.buff.Reset()
+                               continue
+                       case bytes.HasPrefix(b, []byte("</")): // end tag
+                               w.buff.Reset()
+                               continue
+                       }
+
+                       s := w.buff.String()
+                       w.buff.Reset()
+
+                       // Check if a preformatted code block started.
                         if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
                                 w.inPreTag = tagName
                         }
  
-                       // parse each collected element
+                       // Parse each collected element.
                         el, err := parseHTMLElement(s)
                         if err != nil {
                                 return n, err
                         }
  
-                       // write this tag to the element set
+                       // Write this tag to the element set.
                         w.collector.mu.Lock()
                         w.collector.elementSet[s] = true
                         w.collector.elements = append(w.collector.elements, el)
@@ -265,17 +280,18 @@ var (
         htmlJsonFixer = strings.NewReplacer(", ", "\n")
         jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
         classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
-)
  
-func parseHTMLElement(elStr string) (el htmlElement, err error) {
-       var tagBuffer string = ""
-       exceptionList := map[string]bool{
+       exceptionList = map[string]bool{
                 "thead": true,
                 "tbody": true,
                 "tfoot": true,
                 "td":    true,
                 "tr":    true,
         }
+)
+
+func parseHTMLElement(elStr string) (el htmlElement, err error) {
+       var tagBuffer string = ""
  
         tagName, ok := parseStartTag(elStr)
         if !ok {
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go

index 1ada27c18d54c552000f3e4a038ad1ae28e0854a..0c8b2b65b347fa862ea39b2719b7bc7cab5b09d0 100644 (file)
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -14,7 +14,6 @@
  package publisher
  
  import (
-       "bytes"
         "fmt"
         "strings"
         "testing"
@@ -129,33 +128,8 @@ func TestClassCollector(t *testing.T) {
         }
  }
  
-func BenchmarkClassCollectorWriter(b *testing.B) {
+func BenchmarkElementsCollectorWriter(b *testing.B) {
         const benchHTML = `
-<html>
-<body id="i1" class="a b c d">
-<a class="c d e"></a>
-<br>
-<a class="c d e"></a>
-<a class="c d e"></a>
-<br>
-<a id="i2" class="c d e f"></a>
-<a id="i3" class="c d e"></a>
-<a class="c d e"></a>
-<br>
-<a class="c d e"></a>
-<a class="c d e"></a>
-<a class="c d e"></a>
-<a class="c d e"></a>
-</body>
-</html>
-`
-       for i := 0; i < b.N; i++ {
-               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-               fmt.Fprint(w, benchHTML)
-       }
-}
-
-const benchHTML = `
  <!DOCTYPE html>
  <html>
  <head>
@@ -207,51 +181,9 @@ const benchHTML = `
  </body>
  </html>
  `
-
-func BenchmarkElementsCollectorWriter(b *testing.B) {
-       b.ReportAllocs()
         for i := 0; i < b.N; i++ {
                 w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
                 fmt.Fprint(w, benchHTML)
-       }
-}
-
-func BenchmarkElementsCollectorWriterMinified(b *testing.B) {
-       b.ReportAllocs()
-       v := viper.New()
-       m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
-       var buf bytes.Buffer
-       m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
-       b.ResetTimer()
-
-       for i := 0; i < b.N; i++ {
-               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-               fmt.Fprint(w, buf.String())
-       }
-}
-
-func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) {
-       b.ReportAllocs()
-       v := viper.New()
-       m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
-       b.ResetTimer()
-
-       for i := 0; i < b.N; i++ {
-               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-               m.Minify(media.HTMLType, w, strings.NewReader(benchHTML))
-       }
-}
-
-func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) {
-       b.ReportAllocs()
-       v := viper.New()
-       m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
-       b.ResetTimer()
  
-       for i := 0; i < b.N; i++ {
-               var buf bytes.Buffer
-               m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
-               w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-               fmt.Fprint(w, buf.String())
         }
  }
author	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Tue, 20 Apr 2021 14:50:03 +0000 (16:50 +0200)
committer	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Tue, 20 Apr 2021 15:24:17 +0000 (17:24 +0200)
publisher/htmlElementsCollector.go		patch | blob | history
publisher/htmlElementsCollector_test.go		patch | blob | history