publisher: Skip script, pre and textarea content when looking for HTML elements

author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Tue, 6 Apr 2021 16:19:25 +0000 (18:19 +0200)

committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Tue, 6 Apr 2021 22:26:02 +0000 (00:26 +0200)
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 6 Apr 2021 16:19:25 +0000 (18:19 +0200)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 6 Apr 2021 22:26:02 +0000 (00:26 +0200)
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go

index 1823a832776a234f3eee574e867147cefeb71ab1..d9479aafaa527905d5ea4a1a5cd430a9e9fe434e 100644 (file)
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -64,7 +64,7 @@ type cssClassCollectorWriter struct {
         buff      bytes.Buffer
  
         isCollecting bool
-       dropValue    bool
+       inPreTag     string
  
         inQuote    bool
         quoteValue byte
@@ -90,49 +90,58 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
                                 b := p[i]
                                 w.toggleIfQuote(b)
                                 if !w.inQuote && b == '>' {
-                                       w.endCollecting(false)
+                                       w.endCollecting()
                                         break
                                 }
                                 w.buff.WriteByte(b)
                         }
  
                         if !w.isCollecting {
-                               if w.dropValue {
-                                       w.buff.Reset()
-                               } else {
-                                       // First check if we have processed this element before.
-                                       w.collector.mu.RLock()
-
-                                       // See https://github.com/dominikh/go-tools/issues/723
-                                       //lint:ignore S1030 This construct avoids memory allocation for the string.
-                                       seen := w.collector.elementSet[string(w.buff.Bytes())]
-                                       w.collector.mu.RUnlock()
-                                       if seen {
-                                               w.buff.Reset()
-                                               continue
+                               if w.inPreTag != "" {
+                                       s := w.buff.String()
+                                       if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName {
+                                               w.inPreTag = ""
                                         }
+                                       w.buff.Reset()
+                                       continue
+                               }
  
-                                       s := w.buff.String()
+                               // First check if we have processed this element before.
+                               w.collector.mu.RLock()
  
+                               // See https://github.com/dominikh/go-tools/issues/723
+                               //lint:ignore S1030 This construct avoids memory allocation for the string.
+                               seen := w.collector.elementSet[string(w.buff.Bytes())]
+                               w.collector.mu.RUnlock()
+                               if seen {
                                         w.buff.Reset()
+                                       continue
+                               }
  
-                                       if strings.HasPrefix(s, "</") {
-                                               continue
-                                       }
+                               s := w.buff.String()
  
-                                       key := s
+                               w.buff.Reset()
  
-                                       s, tagName := w.insertStandinHTMLElement(s)
-                                       el := parseHTMLElement(s)
-                                       el.Tag = tagName
+                               if strings.HasPrefix(s, "</") {
+                                       continue
+                               }
  
-                                       w.collector.mu.Lock()
-                                       w.collector.elementSet[key] = true
-                                       if el.Tag != "" {
-                                               w.collector.elements = append(w.collector.elements, el)
-                                       }
-                                       w.collector.mu.Unlock()
+                               key := s
+
+                               s, tagName := w.insertStandinHTMLElement(s)
+                               el := parseHTMLElement(s)
+                               el.Tag = tagName
+                               if w.isPreFormatted(tagName) {
+                                       w.inPreTag = tagName
                                 }
+
+                               w.collector.mu.Lock()
+                               w.collector.elementSet[key] = true
+                               if el.Tag != "" {
+                                       w.collector.elements = append(w.collector.elements, el)
+                               }
+                               w.collector.mu.Unlock()
+
                         }
                 }
         }
@@ -140,6 +149,11 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
         return
  }
  
+// isPreFormatted reports whether s is exactly "pre", "textarea" or "script"; no need to look inside these for HTML elements.
+func (c *cssClassCollectorWriter) isPreFormatted(s string) bool {
+       return s == "pre" || s == "textarea" || s == "script"
+}
+
  // The net/html parser does not handle single table elements as input, e.g. tbody.
  // We only care about the element/class/ids, so just store away the original tag name
  // and pretend it's a <div>.
@@ -154,15 +168,24 @@ func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, s
         return newv, strings.ToLower(tag)
  }
  
-func (c *cssClassCollectorWriter) endCollecting(drop bool) {
+func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) { // Returns the lower-cased, space-trimmed tag name and true when s starts with "</"; otherwise "" and false.
+       if !strings.HasPrefix(s, "</") {
+               return "", false // Not an end tag.
+       }
+       s = strings.TrimPrefix(s, "</")
+       s = strings.TrimSuffix(s, ">") // Drops a trailing '>' if one is present.
+       return strings.ToLower(strings.TrimSpace(s)), true
+}
+
+func (c *cssClassCollectorWriter) endCollecting() {
         c.isCollecting = false
         c.inQuote = false
-       c.dropValue = drop
+
  }
  
  func (c *cssClassCollectorWriter) startCollecting() {
         c.isCollecting = true
-       c.dropValue = false
+
  }
  
  func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go

index 2c2fd37330651c690eee3bf0c36918c8e7ed1183..5a1802234b46938c60f234f3c84b5b8396928c0e 100644 (file)
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -89,8 +89,12 @@ func TestClassCollector(t *testing.T) {
  
                 {"Alpine transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
                 {"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
-               // https://github.com/gohugoio/hugo/issues/7746
+               // Issue #7746
                 {"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")},
+               // Issue #7567
+               {"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")},
+               {"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")},
+               {"Textare tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
         } {
                 c.Run(test.name, func(c *qt.C) {
                         w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
author	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Tue, 6 Apr 2021 16:19:25 +0000 (18:19 +0200)
committer	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Tue, 6 Apr 2021 22:26:02 +0000 (00:26 +0200)
publisher/htmlElementsCollector.go		patch \| blob \| history
publisher/htmlElementsCollector_test.go		patch \| blob \| history