Collect HTML elements during the build to use in PurgeCSS etc.
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 3 Mar 2020 11:25:03 +0000 (12:25 +0100)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Thu, 9 Apr 2020 20:57:26 +0000 (22:57 +0200)
The main use case for this is in combination with resources.PostProcess and resources.PostCSS, running PurgeCSS.

You would normally set PurgeCSS up to extract keywords from your templates, because doing it from the full /public takes forever for bigger sites.

Doing the template thing misses dynamically created class names etc., and it's hard/impossible to set up when using themes.

You can enable this in your site config:

```toml
[build]
  writeStats = true
```

It will then write a `hugo_stats.json` file to the project root as part of the build.

If you're only using this for the production build, you should consider putting it below `config/production`.

You can then set it up with PostCSS like this:

```js
const purgecss = require('@fullhuman/postcss-purgecss')({
    content: [ './hugo_stats.json' ],
    defaultExtractor: (content) => {
        let els = JSON.parse(content).htmlElements;
        return els.tags.concat(els.classes, els.ids);
    }
});

module.exports = {
    plugins: [
        require('tailwindcss'),
        require('autoprefixer'),
        ...(process.env.HUGO_ENVIRONMENT === 'production' ? [ purgecss ] : [])
    ]
};
```

Fixes #6999

.gitignore
config/commonConfig.go
go.mod
hugolib/hugo_sites.go
hugolib/hugo_sites_build.go
hugolib/site_test.go
publisher/htmlElementsCollector.go [new file with mode: 0644]
publisher/htmlElementsCollector_test.go [new file with mode: 0644]
publisher/publisher.go
publisher/publisher_test.go [deleted file]

index 75d85e8d0d967d7f62b0f93a54d2064ef999efb1..d3ef0199195e2a44886306649fe90f0e2196974a 100644 (file)
@@ -20,6 +20,7 @@ dock.sh
 GoBuilds
 dist
 
+hugolib/hugo_stats.json
 resources/sunset.jpg
 
 vendor
index 17d5619bb127a7922b4b9715def669130601c70d..ba99260a5e866593fa17c2e26e30deb2ec3f9070 100644 (file)
@@ -29,11 +29,16 @@ import (
 
 var DefaultBuild = Build{
        UseResourceCacheWhen: "fallback",
+       WriteStats:           false,
 }
 
 // Build holds some build related configuration.
 type Build struct {
        UseResourceCacheWhen string // never, fallback, always. Default is fallback
+
+       // When enabled, will collect and write a hugo_stats.json with some build
+       // related aggregated data (e.g. CSS class names).
+       WriteStats bool
 }
 
 func (b Build) UseResourceCache(err error) bool {
diff --git a/go.mod b/go.mod
index 4b75840b053f6fee1c62f6f3a8cd34b033899a22..c12caa8f0cb556673a5e0845e418c6a656860420 100644 (file)
--- a/go.mod
+++ b/go.mod
@@ -55,7 +55,7 @@ require (
        go.opencensus.io v0.22.0 // indirect
        gocloud.dev v0.15.0
        golang.org/x/image v0.0.0-20191214001246-9130b4cfad52
-       golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553 // indirect
+       golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553
        golang.org/x/oauth2 v0.0.0-20190523182746-aaccbc9213b0 // indirect
        golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e
        golang.org/x/sys v0.0.0-20200107144601-ef85f5a75ddf // indirect
index dca9e49680d83c479ed3242ef231ce07434958c7..9ff4d36cd236536d1e9441db6bfee65676a80a4a 100644 (file)
@@ -408,7 +408,11 @@ func applyDeps(cfg deps.DepsCfg, sites ...*Site) error {
                        s.Deps = d
 
                        // Set up the main publishing chain.
-                       pub, err := publisher.NewDestinationPublisher(d.PathSpec.BaseFs.PublishFs, s.outputFormatsConfig, s.mediaTypesConfig, cfg.Cfg)
+                       pub, err := publisher.NewDestinationPublisher(
+                               d.ResourceSpec,
+                               s.outputFormatsConfig,
+                               s.mediaTypesConfig,
+                       )
 
                        if err != nil {
                                return err
index 6a65605fce593261faa83549dbb80ec7c9c22dbb..fac20e8833fb0ce59711abcd7456e2138109db5f 100644 (file)
@@ -16,11 +16,17 @@ package hugolib
 import (
        "bytes"
        "context"
+       "encoding/json"
        "fmt"
        "os"
+       "path/filepath"
        "runtime/trace"
        "strings"
 
+       "github.com/gohugoio/hugo/publisher"
+
+       "github.com/gohugoio/hugo/hugofs"
+
        "github.com/gohugoio/hugo/common/para"
        "github.com/gohugoio/hugo/config"
        "github.com/gohugoio/hugo/resources/postpub"
@@ -146,10 +152,10 @@ func (h *HugoSites) Build(config BuildCfg, events ...fsnotify.Event) error {
                if err != nil {
                        h.SendError(err)
                }
-       }
 
-       if err := h.postProcess(); err != nil {
-               h.SendError(err)
+               if err = h.postProcess(); err != nil {
+                       h.SendError(err)
+               }
        }
 
        if h.Metrics != nil {
@@ -337,6 +343,12 @@ func (h *HugoSites) render(config *BuildCfg) error {
 }
 
 func (h *HugoSites) postProcess() error {
+       // Make sure to write any build stats to disk first so they're available
+       // to the post processors.
+       if err := h.writeBuildStats(); err != nil {
+               return err
+       }
+
        var toPostProcess []resource.OriginProvider
        for _, s := range h.Sites {
                for _, v := range s.ResourceSpec.PostProcessResources {
@@ -422,3 +434,47 @@ func (h *HugoSites) postProcess() error {
        return g.Wait()
 
 }
+
+type publishStats struct {
+       CSSClasses string `json:"cssClasses"`
+}
+
+func (h *HugoSites) writeBuildStats() error {
+       if !h.ResourceSpec.BuildConfig.WriteStats {
+               return nil
+       }
+
+       htmlElements := &publisher.HTMLElements{}
+       for _, s := range h.Sites {
+               stats := s.publisher.PublishStats()
+               htmlElements.Merge(stats.HTMLElements)
+       }
+
+       htmlElements.Sort()
+
+       stats := publisher.PublishStats{
+               HTMLElements: *htmlElements,
+       }
+
+       js, err := json.MarshalIndent(stats, "", "  ")
+       if err != nil {
+               return err
+       }
+
+       filename := filepath.Join(h.WorkingDir, "hugo_stats.json")
+
+       // Make sure it's always written to the OS fs.
+       if err := afero.WriteFile(hugofs.Os, filename, js, 0666); err != nil {
+               return err
+       }
+
+       // Write to the destination, too, if a mem fs is in play.
+       if h.Fs.Source != hugofs.Os {
+               if err := afero.WriteFile(h.Fs.Destination, filename, js, 0666); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+
+}
index 0b05aac12d671bfdf311aa03b39d9d1083659b15..e404d80a4f5e0679ad22671c4cefb741949a897a 100644 (file)
@@ -980,3 +980,47 @@ func TestRefIssues(t *testing.T) {
        b.AssertFileContent("public/post/nested-a/content-a/index.html", `Content: http://example.com/post/nested-b/content-b/`)
 
 }
+
+func TestClassCollector(t *testing.T) {
+       b := newTestSitesBuilder(t)
+       b.WithConfigFile("toml", `
+
+[build]
+  writeStats = true
+
+`)
+
+       b.WithTemplates("index.html", `
+       
+<div id="el1" class="a b c">Foo</div>
+
+Some text.
+
+<div class="c d e" id="el2">Foo</div>
+`)
+
+       b.WithContent("p1.md", "")
+
+       b.Build(BuildCfg{})
+
+       b.AssertFileContent("hugo_stats.json", `
+{
+          "htmlElements": {
+            "tags": [
+              "div"
+            ],
+            "classes": [
+              "a",
+              "b",
+              "c",
+              "d",
+              "e"
+            ],
+            "ids": [
+              "el1",
+              "el2"
+            ]
+          }
+        }
+`)
+}
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
new file mode 100644 (file)
index 0000000..c6e0d3f
--- /dev/null
@@ -0,0 +1,268 @@
+// Copyright 2020 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package publisher
+
+import (
+       "github.com/gohugoio/hugo/helpers"
+       "golang.org/x/net/html"
+       yaml "gopkg.in/yaml.v2"
+
+       "bytes"
+       "sort"
+       "strings"
+       "sync"
+)
+
+func newHTMLElementsCollector() *htmlElementsCollector {
+       return &htmlElementsCollector{
+               elementSet: make(map[string]bool),
+       }
+}
+
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter {
+       return &cssClassCollectorWriter{
+               collector: collector,
+       }
+}
+
+// HTMLElements holds lists of tags and attribute values for classes and ids.
+type HTMLElements struct {
+       Tags    []string `json:"tags"`
+       Classes []string `json:"classes"`
+       IDs     []string `json:"ids"`
+}
+
+func (h *HTMLElements) Merge(other HTMLElements) {
+       h.Tags = append(h.Tags, other.Tags...)
+       h.Classes = append(h.Classes, other.Classes...)
+       h.IDs = append(h.IDs, other.IDs...)
+
+       h.Tags = helpers.UniqueStringsReuse(h.Tags)
+       h.Classes = helpers.UniqueStringsReuse(h.Classes)
+       h.IDs = helpers.UniqueStringsReuse(h.IDs)
+
+}
+
+func (h *HTMLElements) Sort() {
+       sort.Strings(h.Tags)
+       sort.Strings(h.Classes)
+       sort.Strings(h.IDs)
+}
+
+type cssClassCollectorWriter struct {
+       collector *htmlElementsCollector
+       buff      bytes.Buffer
+
+       isCollecting bool
+       dropValue    bool
+       inQuote      bool
+}
+
+func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
+       n = len(p)
+       i := 0
+
+       for i < len(p) {
+               if !w.isCollecting {
+                       for ; i < len(p); i++ {
+                               b := p[i]
+                               if b == '<' {
+                                       w.startCollecting()
+                                       break
+                               }
+                       }
+               }
+
+               if w.isCollecting {
+                       for ; i < len(p); i++ {
+                               b := p[i]
+                               if !w.inQuote && b == '/' {
+                                       // End element, we don't care about those.
+                                       w.endCollecting(true)
+                                       break
+                               }
+                               w.toggleIfQuote(b)
+                               if !w.inQuote && b == '>' {
+                                       w.endCollecting(false)
+                                       break
+                               }
+                               w.buff.WriteByte(b)
+                       }
+
+                       if !w.isCollecting {
+                               if w.dropValue {
+                                       w.buff.Reset()
+                               } else {
+                                       // First check if we have processed this element before.
+                                       w.collector.mu.RLock()
+
+                                       // See https://github.com/dominikh/go-tools/issues/723
+                                       //lint:ignore S1030 This construct avoids memory allocation for the string.
+                                       seen := w.collector.elementSet[string(w.buff.Bytes())]
+                                       w.collector.mu.RUnlock()
+                                       if seen {
+                                               w.buff.Reset()
+                                               continue
+                                       }
+
+                                       s := w.buff.String()
+
+                                       w.buff.Reset()
+
+                                       el := parseHTMLElement(s)
+
+                                       w.collector.mu.Lock()
+                                       w.collector.elementSet[s] = true
+                                       if el.Tag != "" {
+                                               w.collector.elements = append(w.collector.elements, el)
+                                       }
+                                       w.collector.mu.Unlock()
+                               }
+                       }
+               }
+       }
+
+       return
+}
+
+func (c *cssClassCollectorWriter) endCollecting(drop bool) {
+       c.isCollecting = false
+       c.inQuote = false
+       c.dropValue = drop
+}
+
+func (c *cssClassCollectorWriter) startCollecting() {
+       c.isCollecting = true
+       c.dropValue = false
+}
+
+func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
+       if isQuote(b) {
+               c.inQuote = !c.inQuote
+       }
+}
+
+type htmlElement struct {
+       Tag     string
+       Classes []string
+       IDs     []string
+}
+
+type htmlElementsCollector struct {
+       // Contains the raw HTML string. We will get the same element
+       // several times, and want to avoid costly reparsing when this
+       // is used for aggregated data only.
+       elementSet map[string]bool
+
+       elements []htmlElement
+
+       mu sync.RWMutex
+}
+
+func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
+
+       var (
+               classes []string
+               ids     []string
+               tags    []string
+       )
+
+       for _, el := range c.elements {
+               classes = append(classes, el.Classes...)
+               ids = append(ids, el.IDs...)
+               tags = append(tags, el.Tag)
+       }
+
+       classes = helpers.UniqueStringsSorted(classes)
+       ids = helpers.UniqueStringsSorted(ids)
+       tags = helpers.UniqueStringsSorted(tags)
+
+       els := HTMLElements{
+               Classes: classes,
+               IDs:     ids,
+               Tags:    tags,
+       }
+
+       return els
+}
+
+func isQuote(b byte) bool {
+       return b == '"' || b == '\''
+}
+
+var htmlJsonFixer = strings.NewReplacer(", ", "\n")
+
+func parseHTMLElement(elStr string) (el htmlElement) {
+       elStr = strings.TrimSpace(elStr)
+       if !strings.HasSuffix(elStr, ">") {
+               elStr += ">"
+       }
+       n, err := html.Parse(strings.NewReader(elStr))
+       if err != nil {
+               return
+       }
+       var walk func(*html.Node)
+       walk = func(n *html.Node) {
+               if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
+                       el.Tag = n.Data
+
+                       for _, a := range n.Attr {
+                               switch {
+                               case strings.EqualFold(a.Key, "id"):
+                                       // There should be only one, but one never knows...
+                                       el.IDs = append(el.IDs, a.Val)
+                               default:
+                                       if strings.EqualFold(a.Key, "class") {
+                                               el.Classes = append(el.Classes, strings.Fields(a.Val)...)
+                                       } else {
+                                               key := strings.ToLower(a.Key)
+                                               val := strings.TrimSpace(a.Val)
+                                               if strings.Contains(key, "class") && strings.HasPrefix(val, "{") {
+                                                       // This looks like a Vue or AlpineJS class binding.
+                                                       // Try to unmarshal it as YAML and pull the keys.
+                                                       // This may look odd, as the source is (probably) JS (JSON), but the YAML
+                                                       // parser is much more lenient with simple JS input, it seems.
+                                                       m := make(map[string]interface{})
+                                                       val = htmlJsonFixer.Replace(strings.Trim(val, "{}"))
+                                                       // Remove leading space to make it look like YAML.
+                                                       lines := strings.Split(val, "\n")
+                                                       for i, l := range lines {
+                                                               lines[i] = strings.TrimSpace(l)
+                                                       }
+                                                       val = strings.Join(lines, "\n")
+                                                       err := yaml.Unmarshal([]byte(val), &m)
+                                                       if err == nil {
+                                                               for k := range m {
+                                                                       el.Classes = append(el.Classes, strings.Fields(k)...)
+                                                               }
+                                                       } else {
+                                                               // Just insert the raw values. This is used for CSS class pruning
+                                                               // so, it's important not to leave out values that may be a CSS class.
+                                                               el.Classes = append(el.Classes, strings.Fields(val)...)
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+
+               for c := n.FirstChild; c != nil; c = c.NextSibling {
+                       walk(c)
+               }
+       }
+
+       walk(n)
+
+       return
+}
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
new file mode 100644 (file)
index 0000000..3ef159d
--- /dev/null
@@ -0,0 +1,81 @@
+// Copyright 2020 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package publisher
+
+import (
+       "fmt"
+       "strings"
+       "testing"
+
+       qt "github.com/frankban/quicktest"
+)
+
+func TestClassCollector(t *testing.T) {
+       c := qt.New((t))
+
+       f := func(tags, classes, ids string) HTMLElements {
+               var tagss, classess, idss []string
+               if tags != "" {
+                       tagss = strings.Split(tags, " ")
+               }
+               if classes != "" {
+                       classess = strings.Split(classes, " ")
+               }
+               if ids != "" {
+                       idss = strings.Split(ids, " ")
+               }
+               return HTMLElements{
+                       Tags:    tagss,
+                       Classes: classess,
+                       IDs:     idss,
+               }
+       }
+
+       for _, test := range []struct {
+               name   string
+               html   string
+               expect HTMLElements
+       }{
+               {"basic", `<body class="b a"></body>`, f("body", "a b", "")},
+               {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
+               {"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
+               {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
+
+               {"AlpineJS bind 1", `<body>
+                       <div x-bind:class="{
+        'class1': data.open,
+        'class2 class3': data.foo == 'bar'
+         }">
+                       </div>
+               </body>`, f("body div", "class1 class2 class3", "")},
+
+               {"Alpine bind 2", `<div x-bind:class="{ 'bg-black':  filter.checked }"
+                        class="inline-block mr-1 mb-2 rounded  bg-gray-300 px-2 py-2">FOO</div>`,
+                       f("div", "bg-black bg-gray-300 inline-block mb-2 mr-1 px-2 py-2 rounded", "")},
+
+               {"Alpine bind 3", `<div x-bind:class="{ 'text-gray-800':  !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
+               {"Alpine bind 4", `<div x-bind:class="{ 'text-gray-800':  !checked, 
+                                        'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
+
+               {"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
+       } {
+               c.Run(test.name, func(c *qt.C) {
+                       w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
+                       fmt.Fprint(w, test.html)
+                       got := w.collector.getHTMLElements()
+                       c.Assert(got, qt.DeepEquals, test.expect)
+               })
+       }
+
+}
index f30073c08f8b37ba8628f8e47d5b149c8d08f637..8b8d2fa631fff57f04922aa72306bcc945846629 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright 2019 The Hugo Authors. All rights reserved.
+// Copyright 2020 The Hugo Authors. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -18,7 +18,8 @@ import (
        "io"
        "sync/atomic"
 
-       "github.com/gohugoio/hugo/config"
+       "github.com/gohugoio/hugo/resources"
+
        "github.com/gohugoio/hugo/media"
 
        "github.com/gohugoio/hugo/minifiers"
@@ -68,17 +69,21 @@ type Descriptor struct {
 // DestinationPublisher is the default and currently only publisher in Hugo. This
 // publisher prepares and publishes an item to the defined destination, e.g. /public.
 type DestinationPublisher struct {
-       fs  afero.Fs
-       min minifiers.Client
+       fs                    afero.Fs
+       min                   minifiers.Client
+       htmlElementsCollector *htmlElementsCollector
 }
 
 // NewDestinationPublisher creates a new DestinationPublisher.
-func NewDestinationPublisher(fs afero.Fs, outputFormats output.Formats, mediaTypes media.Types, cfg config.Provider) (pub DestinationPublisher, err error) {
-       pub = DestinationPublisher{fs: fs}
-       pub.min, err = minifiers.New(mediaTypes, outputFormats, cfg)
-       if err != nil {
-               return
+func NewDestinationPublisher(rs *resources.Spec, outputFormats output.Formats, mediaTypes media.Types) (pub DestinationPublisher, err error) {
+       fs := rs.BaseFs.PublishFs
+       cfg := rs.Cfg
+       var classCollector *htmlElementsCollector
+       if rs.BuildConfig.WriteStats {
+               classCollector = newHTMLElementsCollector()
        }
+       pub = DestinationPublisher{fs: fs, htmlElementsCollector: classCollector}
+       pub.min, err = minifiers.New(mediaTypes, outputFormats, cfg)
        return
 }
 
@@ -111,16 +116,38 @@ func (p DestinationPublisher) Publish(d Descriptor) error {
        }
        defer f.Close()
 
-       _, err = io.Copy(f, src)
+       var w io.Writer = f
+
+       if p.htmlElementsCollector != nil && d.OutputFormat.IsHTML {
+               w = io.MultiWriter(w, newHTMLElementsCollectorWriter(p.htmlElementsCollector))
+       }
+
+       _, err = io.Copy(w, src)
        if err == nil && d.StatCounter != nil {
                atomic.AddUint64(d.StatCounter, uint64(1))
        }
+
        return err
 }
 
+func (p DestinationPublisher) PublishStats() PublishStats {
+       if p.htmlElementsCollector == nil {
+               return PublishStats{}
+       }
+
+       return PublishStats{
+               HTMLElements: p.htmlElementsCollector.getHTMLElements(),
+       }
+}
+
+type PublishStats struct {
+       HTMLElements HTMLElements `json:"htmlElements"`
+}
+
 // Publisher publishes a result file.
 type Publisher interface {
        Publish(d Descriptor) error
+       PublishStats() PublishStats
 }
 
 // XML transformer := transform.New(urlreplacers.NewAbsURLInXMLTransformer(path))
diff --git a/publisher/publisher_test.go b/publisher/publisher_test.go
deleted file mode 100644 (file)
index 200accc..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2018 The Hugo Authors. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package publisher