markup/goldmark: Make auto IDs GitHub compatible
authorBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Sat, 4 Jan 2020 10:28:19 +0000 (11:28 +0100)
committerBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Sat, 4 Jan 2020 18:46:01 +0000 (19:46 +0100)
You can turn off this behaviour and generate ASCII-only IDs instead:

```toml
[markup]
  [markup.goldmark]
    [markup.goldmark.parser]
      autoHeadingIDAsciiOnly = true
```
Note that the `anchorize` template function now adapts its behaviour to the default Markdown handler.

Fixes #6616

12 files changed:
common/text/transform.go [new file with mode: 0644]
common/text/transform_test.go [new file with mode: 0644]
helpers/content.go
helpers/path.go
markup/blackfriday/convert.go
markup/converter/converter.go
markup/goldmark/autoid.go [new file with mode: 0644]
markup/goldmark/autoid_test.go [new file with mode: 0644]
markup/goldmark/convert.go
markup/goldmark/convert_test.go
markup/goldmark/goldmark_config/config.go
tpl/urls/urls.go

diff --git a/common/text/transform.go b/common/text/transform.go
new file mode 100644 (file)
index 0000000..f595778
--- /dev/null
@@ -0,0 +1,47 @@
+// Copyright 2019 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package text
+
+import (
+       "sync"
+       "unicode"
+
+       "golang.org/x/text/runes"
+       "golang.org/x/text/transform"
+       "golang.org/x/text/unicode/norm"
+)
+
+var accentTransformerPool = &sync.Pool{
+       New: func() interface{} {
+               return transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+       },
+}
+
+// RemoveAccents removes all accents from b.
+func RemoveAccents(b []byte) []byte {
+       t := accentTransformerPool.Get().(transform.Transformer)
+       b, _, _ = transform.Bytes(t, b)
+       t.Reset()
+       accentTransformerPool.Put(t)
+       return b
+}
+
+// RemoveAccentsString removes all accents from s.
+func RemoveAccentsString(s string) string {
+       t := accentTransformerPool.Get().(transform.Transformer)
+       s, _, _ = transform.String(t, s)
+       t.Reset()
+       accentTransformerPool.Put(t)
+       return s
+}
diff --git a/common/text/transform_test.go b/common/text/transform_test.go
new file mode 100644 (file)
index 0000000..70b10d1
--- /dev/null
@@ -0,0 +1,29 @@
+// Copyright 2019 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package text
+
+import (
+       "testing"
+
+       qt "github.com/frankban/quicktest"
+)
+
+func TestRemoveAccents(t *testing.T) {
+       c := qt.New(t)
+
+       c.Assert(string(RemoveAccents([]byte("Resumé"))), qt.Equals, "Resume")
+       c.Assert(string(RemoveAccents([]byte("Hugo Rocks!"))), qt.Equals, "Hugo Rocks!")
+       c.Assert(string(RemoveAccentsString("Resumé")), qt.Equals, "Resume")
+
+}
index 1c780fefe1b8b6e6efed938b260face1df250a29..e61888357efc2b17f35ebe2a2dec31cc321c2f22 100644 (file)
@@ -48,8 +48,9 @@ var (
 
 // ContentSpec provides functionality to render markdown content.
 type ContentSpec struct {
-       Converters       markup.ConverterProvider
-       MardownConverter converter.Converter // Markdown converter with no document context
+       Converters          markup.ConverterProvider
+       MardownConverter    converter.Converter // Markdown converter with no document context
+       anchorNameSanitizer converter.AnchorNameSanitizer
 
        // SummaryLength is the length of the summary that Hugo extracts from a content.
        summaryLength int
@@ -91,6 +92,17 @@ func NewContentSpec(cfg config.Provider, logger *loggers.Logger, contentFs afero
                return nil, err
        }
        spec.MardownConverter = conv
+       if as, ok := conv.(converter.AnchorNameSanitizer); ok {
+               spec.anchorNameSanitizer = as
+       } else {
+               // Use Goldmark's sanitizer
+               p := converterProvider.Get("goldmark")
+               conv, err := p.New(converter.DocumentContext{})
+               if err != nil {
+                       return nil, err
+               }
+               spec.anchorNameSanitizer = conv.(converter.AnchorNameSanitizer)
+       }
 
        return spec, nil
 }
@@ -192,6 +204,10 @@ func (c *ContentSpec) RenderMarkdown(src []byte) ([]byte, error) {
        return b.Bytes(), nil
 }
 
+func (c *ContentSpec) SanitizeAnchorName(s string) string {
+       return c.anchorNameSanitizer.SanitizeAnchorName(s)
+}
+
 func (c *ContentSpec) ResolveMarkup(in string) string {
        in = strings.ToLower(in)
        switch in {
index 12ddfeb56f5588295e18f66ad8e9377f879a6586..d97789e1580df7d67d4d180918f50b8ba57d8f7a 100644 (file)
@@ -24,6 +24,8 @@ import (
        "strings"
        "unicode"
 
+       "github.com/gohugoio/hugo/common/text"
+
        "github.com/gohugoio/hugo/config"
 
        "github.com/gohugoio/hugo/hugofs"
@@ -31,9 +33,6 @@ import (
        "github.com/gohugoio/hugo/common/hugio"
        _errors "github.com/pkg/errors"
        "github.com/spf13/afero"
-       "golang.org/x/text/runes"
-       "golang.org/x/text/transform"
-       "golang.org/x/text/unicode/norm"
 )
 
 var (
@@ -134,6 +133,10 @@ func ishex(c rune) bool {
 // are also removed.
 // Spaces will be replaced with a single hyphen, and sequential hyphens will be reduced to one.
 func (p *PathSpec) UnicodeSanitize(s string) string {
+       if p.RemovePathAccents {
+               s = text.RemoveAccentsString(s)
+       }
+
        source := []rune(s)
        target := make([]rune, 0, len(source))
        var prependHyphen bool
@@ -154,17 +157,7 @@ func (p *PathSpec) UnicodeSanitize(s string) string {
                }
        }
 
-       var result string
-
-       if p.RemovePathAccents {
-               // remove accents - see https://blog.golang.org/normalization
-               t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
-               result, _, _ = transform.String(t, string(target))
-       } else {
-               result = string(target)
-       }
-
-       return result
+       return string(target)
 }
 
 // ReplaceExtension takes a path and an extension, strips the old extension
index 3df23c7ae7412603a8a0c6428fe8a9b3997e8df9..bbbc2b377d328d191f7b05e1aa1cdfd3f935c251 100644 (file)
@@ -60,6 +60,10 @@ type blackfridayConverter struct {
        cfg        converter.ProviderConfig
 }
 
+func (c *blackfridayConverter) SanitizeAnchorName(s string) string {
+       return blackfriday.SanitizedAnchorName(s)
+}
+
 func (c *blackfridayConverter) AnchorSuffix() string {
        if c.bf.PlainIDAnchors {
                return ""
@@ -204,5 +208,6 @@ var blackfridayExtensionMap = map[string]int{
 }
 
 var (
-       _ converter.DocumentInfo = (*blackfridayConverter)(nil)
+       _ converter.DocumentInfo        = (*blackfridayConverter)(nil)
+       _ converter.AnchorNameSanitizer = (*blackfridayConverter)(nil)
 )
index a4585bd03805ea3daedef67922fc3db7cafc99d6..b8a5c92c13bba69bcb45df995362b1e8f5497f3b 100644 (file)
@@ -87,6 +87,11 @@ type TableOfContentsProvider interface {
        TableOfContents() tableofcontents.Root
 }
 
+// AnchorNameSanitizer tells how a converter sanitizes anchor names.
+type AnchorNameSanitizer interface {
+       SanitizeAnchorName(s string) string
+}
+
 // Bytes holds a byte slice and implements the Result interface.
 type Bytes []byte
 
diff --git a/markup/goldmark/autoid.go b/markup/goldmark/autoid.go
new file mode 100644 (file)
index 0000000..6599f08
--- /dev/null
@@ -0,0 +1,125 @@
+// Copyright 2019 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package goldmark
+
+import (
+       "bytes"
+       "strconv"
+       "unicode"
+       "unicode/utf8"
+
+       "github.com/gohugoio/hugo/common/text"
+
+       "github.com/yuin/goldmark/ast"
+       "github.com/yuin/goldmark/parser"
+       "github.com/yuin/goldmark/util"
+
+       bp "github.com/gohugoio/hugo/bufferpool"
+)
+
+func sanitizeAnchorNameString(s string, asciiOnly bool) string {
+       return string(sanitizeAnchorName([]byte(s), asciiOnly))
+}
+
+func sanitizeAnchorName(b []byte, asciiOnly bool) []byte {
+       return sanitizeAnchorNameWithHook(b, asciiOnly, nil)
+}
+
+func sanitizeAnchorNameWithHook(b []byte, asciiOnly bool, hook func(buf *bytes.Buffer)) []byte {
+       buf := bp.GetBuffer()
+
+       if asciiOnly {
+               // Strip accents first so that accented letters keep their ASCII base letter (é becomes e) instead of being dropped.
+               b = text.RemoveAccents(b)
+       }
+
+       for len(b) > 0 {
+               r, size := utf8.DecodeRune(b)
+               switch {
+               case asciiOnly && size != 1:
+               case isSpace(r):
+                       buf.WriteString("-")
+               case r == '-' || isAlphaNumeric(r):
+                       buf.WriteRune(unicode.ToLower(r))
+               default:
+               }
+
+               b = b[size:]
+       }
+
+       if hook != nil {
+               hook(buf)
+       }
+
+       result := make([]byte, buf.Len())
+       copy(result, buf.Bytes())
+
+       bp.PutBuffer(buf)
+
+       return result
+}
+
+func isAlphaNumeric(r rune) bool {
+       return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
+}
+
+func isSpace(r rune) bool {
+       return r == ' ' || r == '\t'
+}
+
+var _ parser.IDs = (*idFactory)(nil)
+
+type idFactory struct {
+       asciiOnly bool
+       vals      map[string]struct{}
+}
+
+func newIDFactory(asciiOnly bool) *idFactory {
+       return &idFactory{
+               vals:      make(map[string]struct{}),
+               asciiOnly: asciiOnly,
+       }
+}
+
+func (ids *idFactory) Generate(value []byte, kind ast.NodeKind) []byte {
+       return sanitizeAnchorNameWithHook(value, ids.asciiOnly, func(buf *bytes.Buffer) {
+               if buf.Len() == 0 {
+                       if kind == ast.KindHeading {
+                               buf.WriteString("heading")
+                       } else {
+                               buf.WriteString("id")
+                       }
+               }
+
+               if _, found := ids.vals[util.BytesToReadOnlyString(buf.Bytes())]; found {
+                       // Append a hyphen and a number, starting with 1.
+                       buf.WriteRune('-')
+                       pos := buf.Len()
+                       for i := 1; ; i++ {
+                               buf.WriteString(strconv.Itoa(i))
+                               if _, found := ids.vals[util.BytesToReadOnlyString(buf.Bytes())]; !found {
+                                       break
+                               }
+                               buf.Truncate(pos)
+                       }
+               }
+
+               ids.vals[buf.String()] = struct{}{}
+
+       })
+}
+
+func (ids *idFactory) Put(value []byte) {
+       ids.vals[util.BytesToReadOnlyString(value)] = struct{}{}
+}
diff --git a/markup/goldmark/autoid_test.go b/markup/goldmark/autoid_test.go
new file mode 100644 (file)
index 0000000..915c6a0
--- /dev/null
@@ -0,0 +1,121 @@
+// Copyright 2019 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package goldmark
+
+import (
+       "strings"
+       "testing"
+
+       qt "github.com/frankban/quicktest"
+)
+
+func TestSanitizeAnchorName(t *testing.T) {
+       c := qt.New(t)
+
+       // Tests generated manually on github.com
+       tests := `
+God is good: 神真美好
+Number 32
+Question?
+1+2=3
+Special !"#$%&(parens)=?´* chars
+Resumé
+One-Hyphen
+Multiple--Hyphens
+Trailing hyphen-
+Many   spaces  here
+Forward/slash
+Backward\slash
+Under_score
+`
+
+       expect := `
+god-is-good-神真美好
+number-32
+question
+123
+special-parens-chars
+resumé
+one-hyphen
+multiple--hyphens
+trailing-hyphen-
+many---spaces--here
+forwardslash
+backwardslash
+under_score
+`
+
+       tests, expect = strings.TrimSpace(tests), strings.TrimSpace(expect)
+
+       testlines, expectlines := strings.Split(tests, "\n"), strings.Split(expect, "\n")
+
+       if len(testlines) != len(expectlines) {
+               panic("test setup failed")
+       }
+
+       for i, input := range testlines {
+               input := input
+               expect := expectlines[i]
+               c.Run(input, func(c *qt.C) {
+                       b := []byte(input)
+                       got := string(sanitizeAnchorName(b, false))
+                       c.Assert(got, qt.Equals, expect)
+                       c.Assert(sanitizeAnchorNameString(input, false), qt.Equals, expect)
+                       c.Assert(string(b), qt.Equals, input)
+               })
+       }
+}
+
+func TestSanitizeAnchorNameAsciiOnly(t *testing.T) {
+       c := qt.New(t)
+
+       c.Assert(sanitizeAnchorNameString("god is神真美好 good", true), qt.Equals, "god-is-good")
+       c.Assert(sanitizeAnchorNameString("Resumé", true), qt.Equals, "resume")
+
+}
+
+func BenchmarkSanitizeAnchorName(b *testing.B) {
+       input := []byte("God is good: 神真美好")
+       b.ResetTimer()
+       for i := 0; i < b.N; i++ {
+               result := sanitizeAnchorName(input, false)
+               if len(result) != 24 {
+                       b.Fatalf("got %d", len(result))
+
+               }
+       }
+}
+
+func BenchmarkSanitizeAnchorNameAsciiOnly(b *testing.B) {
+       input := []byte("God is good: 神真美好")
+       b.ResetTimer()
+       for i := 0; i < b.N; i++ {
+               result := sanitizeAnchorName(input, true)
+               if len(result) != 12 {
+                       b.Fatalf("got %d", len(result))
+
+               }
+       }
+}
+
+func BenchmarkSanitizeAnchorNameString(b *testing.B) {
+       input := "God is good: 神真美好"
+       b.ResetTimer()
+       for i := 0; i < b.N; i++ {
+               result := sanitizeAnchorNameString(input, false)
+               if len(result) != 24 {
+                       b.Fatalf("got %d", len(result))
+               }
+       }
+}
index af204125f03358aaeea0dc569e4cfcbe090fa376..7d50839e23390259a8a8f218dfad28cf24773f9b 100644 (file)
@@ -50,19 +50,33 @@ type provide struct {
 
 func (p provide) New(cfg converter.ProviderConfig) (converter.Provider, error) {
        md := newMarkdown(cfg)
+
        return converter.NewProvider("goldmark", func(ctx converter.DocumentContext) (converter.Converter, error) {
                return &goldmarkConverter{
                        ctx: ctx,
                        cfg: cfg,
                        md:  md,
+                       sanitizeAnchorName: func(s string) string {
+                               return sanitizeAnchorNameString(s, cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly)
+                       },
                }, nil
        }), nil
 }
 
+var (
+       _ converter.AnchorNameSanitizer = (*goldmarkConverter)(nil)
+)
+
 type goldmarkConverter struct {
        md  goldmark.Markdown
        ctx converter.DocumentContext
        cfg converter.ProviderConfig
+
+       sanitizeAnchorName func(s string) string
+}
+
+func (c *goldmarkConverter) SanitizeAnchorName(s string) string {
+       return c.sanitizeAnchorName(s)
 }
 
 func newMarkdown(pcfg converter.ProviderConfig) goldmark.Markdown {
@@ -226,7 +240,7 @@ func (c *goldmarkConverter) Convert(ctx converter.RenderContext) (result convert
 
        buf := &bufWriter{Buffer: &bytes.Buffer{}}
        result = buf
-       pctx := newParserContext(ctx)
+       pctx := c.newParserContext(ctx)
        reader := text.NewReader(ctx.Src)
 
        doc := c.md.Parser().Parse(
@@ -265,8 +279,8 @@ func (c *goldmarkConverter) Supports(feature identity.Identity) bool {
        return featureSet[feature.GetIdentity()]
 }
 
-func newParserContext(rctx converter.RenderContext) *parserContext {
-       ctx := parser.NewContext()
+func (c *goldmarkConverter) newParserContext(rctx converter.RenderContext) *parserContext {
+       ctx := parser.NewContext(parser.WithIDs(newIDFactory(c.cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly)))
        ctx.Set(tocEnableKey, rctx.RenderTOC)
        return &parserContext{
                Context: ctx,
index 2a97276064be63719d73909b23b658a0dee80b0a..b9bf01ef5e18337c02b06a2bb0892eaf40947a98 100644 (file)
@@ -28,6 +28,23 @@ import (
        qt "github.com/frankban/quicktest"
 )
 
+func convert(c *qt.C, mconf markup_config.Config, content string) converter.Result {
+
+       p, err := Provider.New(
+               converter.ProviderConfig{
+                       MarkupConfig: mconf,
+                       Logger:       loggers.NewErrorLogger(),
+               },
+       )
+       c.Assert(err, qt.IsNil)
+       conv, err := p.New(converter.DocumentContext{DocumentID: "thedoc"})
+       c.Assert(err, qt.IsNil)
+       b, err := conv.Convert(converter.RenderContext{RenderTOC: true, Src: []byte(content)})
+       c.Assert(err, qt.IsNil)
+
+       return b
+}
+
 func TestConvert(t *testing.T) {
        c := qt.New(t)
 
@@ -92,29 +109,23 @@ description
 : the description for the content.
 
 
+## 神真美好
+
+## 神真美好
+
+## 神真美好
+
 [^1]: And that's the footnote.
 
 `
 
        // Code fences
        content = strings.Replace(content, "§§§", "```", -1)
-
        mconf := markup_config.Default
        mconf.Highlight.NoClasses = false
        mconf.Goldmark.Renderer.Unsafe = true
 
-       p, err := Provider.New(
-               converter.ProviderConfig{
-                       MarkupConfig: mconf,
-                       Logger:       loggers.NewErrorLogger(),
-               },
-       )
-       c.Assert(err, qt.IsNil)
-       conv, err := p.New(converter.DocumentContext{DocumentID: "thedoc"})
-       c.Assert(err, qt.IsNil)
-       b, err := conv.Convert(converter.RenderContext{RenderTOC: true, Src: []byte(content)})
-       c.Assert(err, qt.IsNil)
-
+       b := convert(c, mconf, content)
        got := string(b.Bytes())
 
        // Links
@@ -123,6 +134,9 @@ description
        // Header IDs
        c.Assert(got, qt.Contains, `<h2 id="custom">Custom ID</h2>`, qt.Commentf(got))
        c.Assert(got, qt.Contains, `<h2 id="auto-id">Auto ID</h2>`, qt.Commentf(got))
+       c.Assert(got, qt.Contains, `<h2 id="神真美好">神真美好</h2>`, qt.Commentf(got))
+       c.Assert(got, qt.Contains, `<h2 id="神真美好-1">神真美好</h2>`, qt.Commentf(got))
+       c.Assert(got, qt.Contains, `<h2 id="神真美好-2">神真美好</h2>`, qt.Commentf(got))
 
        // Code fences
        c.Assert(got, qt.Contains, "<div class=\"highlight\"><pre class=\"chroma\"><code class=\"language-bash\" data-lang=\"bash\">LINE1\n</code></pre></div>")
@@ -148,6 +162,20 @@ description
 
 }
 
+func TestConvertAutoIDAsciiOnly(t *testing.T) {
+       c := qt.New(t)
+
+       content := `
+## God is Good: 神真美好
+`
+       mconf := markup_config.Default
+       mconf.Goldmark.Parser.AutoHeadingIDAsciiOnly = true
+       b := convert(c, mconf, content)
+       got := string(b.Bytes())
+
+       c.Assert(got, qt.Contains, "<h2 id=\"god-is-good-\">")
+}
+
 func TestCodeFence(t *testing.T) {
        c := qt.New(t)
 
index bf18a384dda9eca09340a4b2fecf04ed7059c89c..2454eb46f24c9b33218744aff99485ead01457ae 100644 (file)
@@ -69,6 +69,10 @@ type Parser struct {
        // auto generated heading ids.
        AutoHeadingID bool
 
+       // When AutoHeadingID is enabled this will generate IDs with ASCII
+       // characters only.
+       AutoHeadingIDAsciiOnly bool
+
        // Enables custom attributes.
        Attribute bool
 }
index 5bae411b3ed554c5ebd68a77426e51d484bb91b1..ee0e55501663cdcb3bd5541627295c025be0ac8f 100644 (file)
@@ -25,7 +25,6 @@ import (
        "github.com/gohugoio/hugo/common/urls"
        "github.com/gohugoio/hugo/deps"
        _errors "github.com/pkg/errors"
-       "github.com/russross/blackfriday"
        "github.com/spf13/cast"
 )
 
@@ -90,7 +89,7 @@ func (ns *Namespace) Anchorize(a interface{}) (string, error) {
        if err != nil {
                return "", nil
        }
-       return blackfriday.SanitizedAnchorName(s), nil
+       return ns.deps.ContentSpec.SanitizeAnchorName(s), nil
 }
 
 // Ref returns the absolute URL path to a given content item.