publisher: Skip script, pre and textarea content when looking for HTML elements
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 6 Apr 2021 16:19:25 +0000 (18:19 +0200)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 6 Apr 2021 22:26:02 +0000 (00:26 +0200)
Updates #7567

publisher/htmlElementsCollector.go
publisher/htmlElementsCollector_test.go

index 1823a832776a234f3eee574e867147cefeb71ab1..d9479aafaa527905d5ea4a1a5cd430a9e9fe434e 100644 (file)
@@ -64,7 +64,7 @@ type cssClassCollectorWriter struct {
        buff      bytes.Buffer
 
        isCollecting bool
-       dropValue    bool
+       inPreTag     string
 
        inQuote    bool
        quoteValue byte
@@ -90,49 +90,58 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
                                b := p[i]
                                w.toggleIfQuote(b)
                                if !w.inQuote && b == '>' {
-                                       w.endCollecting(false)
+                                       w.endCollecting()
                                        break
                                }
                                w.buff.WriteByte(b)
                        }
 
                        if !w.isCollecting {
-                               if w.dropValue {
-                                       w.buff.Reset()
-                               } else {
-                                       // First check if we have processed this element before.
-                                       w.collector.mu.RLock()
-
-                                       // See https://github.com/dominikh/go-tools/issues/723
-                                       //lint:ignore S1030 This construct avoids memory allocation for the string.
-                                       seen := w.collector.elementSet[string(w.buff.Bytes())]
-                                       w.collector.mu.RUnlock()
-                                       if seen {
-                                               w.buff.Reset()
-                                               continue
+                               if w.inPreTag != "" {
+                                       s := w.buff.String()
+                                       if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName {
+                                               w.inPreTag = ""
                                        }
+                                       w.buff.Reset()
+                                       continue
+                               }
 
-                                       s := w.buff.String()
+                               // First check if we have processed this element before.
+                               w.collector.mu.RLock()
 
+                               // See https://github.com/dominikh/go-tools/issues/723
+                               //lint:ignore S1030 This construct avoids memory allocation for the string.
+                               seen := w.collector.elementSet[string(w.buff.Bytes())]
+                               w.collector.mu.RUnlock()
+                               if seen {
                                        w.buff.Reset()
+                                       continue
+                               }
 
-                                       if strings.HasPrefix(s, "</") {
-                                               continue
-                                       }
+                               s := w.buff.String()
 
-                                       key := s
+                               w.buff.Reset()
 
-                                       s, tagName := w.insertStandinHTMLElement(s)
-                                       el := parseHTMLElement(s)
-                                       el.Tag = tagName
+                               if strings.HasPrefix(s, "</") {
+                                       continue
+                               }
 
-                                       w.collector.mu.Lock()
-                                       w.collector.elementSet[key] = true
-                                       if el.Tag != "" {
-                                               w.collector.elements = append(w.collector.elements, el)
-                                       }
-                                       w.collector.mu.Unlock()
+                               key := s
+
+                               s, tagName := w.insertStandinHTMLElement(s)
+                               el := parseHTMLElement(s)
+                               el.Tag = tagName
+                               if w.isPreFormatted(tagName) {
+                                       w.inPreTag = tagName
                                }
+
+                               w.collector.mu.Lock()
+                               w.collector.elementSet[key] = true
+                               if el.Tag != "" {
+                                       w.collector.elements = append(w.collector.elements, el)
+                               }
+                               w.collector.mu.Unlock()
+
                        }
                }
        }
@@ -140,6 +149,11 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
        return
 }
 
+// No need to look inside these for HTML elements.
+func (c *cssClassCollectorWriter) isPreFormatted(s string) bool {
+       return s == "pre" || s == "textarea" || s == "script"
+}
+
 // The net/html parser does not handle single table elements as input, e.g. tbody.
 // We only care about the element/class/ids, so just store away the original tag name
 // and pretend it's a <div>.
@@ -154,15 +168,24 @@ func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, s
        return newv, strings.ToLower(tag)
 }
 
-func (c *cssClassCollectorWriter) endCollecting(drop bool) {
+func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) {
+       if !strings.HasPrefix(s, "</") {
+               return "", false
+       }
+       s = strings.TrimPrefix(s, "</")
+       s = strings.TrimSuffix(s, ">")
+       return strings.ToLower(strings.TrimSpace(s)), true
+}
+
+func (c *cssClassCollectorWriter) endCollecting() {
        c.isCollecting = false
        c.inQuote = false
-       c.dropValue = drop
+
 }
 
 func (c *cssClassCollectorWriter) startCollecting() {
        c.isCollecting = true
-       c.dropValue = false
+
 }
 
 func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
index 2c2fd37330651c690eee3bf0c36918c8e7ed1183..5a1802234b46938c60f234f3c84b5b8396928c0e 100644 (file)
@@ -89,8 +89,12 @@ func TestClassCollector(t *testing.T) {
 
                {"Alpine transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
                {"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
-               // https://github.com/gohugoio/hugo/issues/7746
+               // Issue #7746
                {"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")},
+               // Issue #7567
+               {"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")},
+               {"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")},
+               {"Textare tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
        } {
                c.Run(test.name, func(c *qt.C) {
                        w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())