transform/urlreplacers: Support unquoted URLs in canonifyURLs replacer
author    Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Mon, 17 Dec 2018 13:25:00 +0000 (14:25 +0100)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Mon, 17 Dec 2018 18:17:56 +0000 (19:17 +0100)
Fixes #5529

transform/urlreplacers/absurlreplacer.go
transform/urlreplacers/absurlreplacer_test.go

index 1de6b0ca7a3abab56519e683632a67bebc7ccf8e..45b98f8216cb2edad33f369c54c877f26f8f1731 100644 (file)
@@ -16,6 +16,7 @@ package urlreplacers
 import (
        "bytes"
        "io"
+       "unicode"
        "unicode/utf8"
 
        "github.com/gohugoio/hugo/transform"
@@ -43,7 +44,7 @@ type absurllexer struct {
        start int // item start position
        width int // width of last element
 
-       matchers []absURLMatcher
+       quotes [][]byte
 
        ms      matchState
        matches [3]bool // track matches of the 3 prefixes
@@ -140,84 +141,115 @@ func (l *absurllexer) emit() {
        l.start = l.pos
 }
 
-// handle URLs in src and href.
-func checkCandidateBase(l *absurllexer) {
-       for _, m := range l.matchers {
-               if !bytes.HasPrefix(l.content[l.pos:], m.match) {
-                       continue
-               }
-               // check for schemaless URLs
-               posAfter := l.pos + len(m.match)
-               if posAfter >= len(l.content) {
-                       return
-               }
-               r, _ := utf8.DecodeRune(l.content[posAfter:])
-               if r == '/' {
-                       // schemaless: skip
-                       return
-               }
-               if l.pos > l.start {
+var (
+       relURLPrefix    = []byte("/")
+       relURLPrefixLen = len(relURLPrefix)
+)
+
+func (l *absurllexer) consumeQuote() []byte {
+       for _, q := range l.quotes {
+               if bytes.HasPrefix(l.content[l.pos:], q) {
+                       l.pos += len(q)
                        l.emit()
+                       return q
                }
-               l.pos += len(m.match)
-               l.w.Write(m.quote)
-               l.w.Write(l.path)
-               l.start = l.pos
        }
+       return nil
+}
+
+// handle URLs in src and href.
+func checkCandidateBase(l *absurllexer) {
+       l.consumeQuote()
+
+       if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
+               return
+       }
+
+       // check for schemaless URLs
+       posAfter := l.pos + relURLPrefixLen
+       if posAfter >= len(l.content) {
+               return
+       }
+       r, _ := utf8.DecodeRune(l.content[posAfter:])
+       if r == '/' {
+               // schemaless: skip
+               return
+       }
+       if l.pos > l.start {
+               l.emit()
+       }
+       l.pos += relURLPrefixLen
+       l.w.Write(l.path)
+       l.start = l.pos
+}
+
+func (l *absurllexer) posAfterURL(q []byte) int {
+       if len(q) > 0 {
+               // look for end quote
+               return bytes.Index(l.content[l.pos:], q)
+       }
+
+       return bytes.IndexFunc(l.content[l.pos:], func(r rune) bool {
+               return r == '>' || unicode.IsSpace(r)
+       })
+
 }
 
 // handle URLs in srcset.
 func checkCandidateSrcset(l *absurllexer) {
-       // special case, not frequent (me think)
-       for _, m := range l.matchers {
-               if !bytes.HasPrefix(l.content[l.pos:], m.match) {
-                       continue
-               }
+       q := l.consumeQuote()
+       if q == nil {
+               // srcset needs to be quoted.
+               return
+       }
 
-               // check for schemaless URLs
-               posAfter := l.pos + len(m.match)
-               if posAfter >= len(l.content) {
-                       return
-               }
-               r, _ := utf8.DecodeRune(l.content[posAfter:])
-               if r == '/' {
-                       // schemaless: skip
-                       continue
-               }
+       // special case, not frequent (me think)
+       if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
+               return
+       }
 
-               posLastQuote := bytes.Index(l.content[l.pos+1:], m.quote)
+       // check for schemaless URLs
+       posAfter := l.pos + relURLPrefixLen
+       if posAfter >= len(l.content) {
+               return
+       }
+       r, _ := utf8.DecodeRune(l.content[posAfter:])
+       if r == '/' {
+               // schemaless: skip
+               return
+       }
 
-               // safe guard
-               if posLastQuote < 0 || posLastQuote > 2000 {
-                       return
-               }
+       posEnd := l.posAfterURL(q)
 
-               if l.pos > l.start {
-                       l.emit()
-               }
+       // safe guard
+       if posEnd < 0 || posEnd > 2000 {
+               return
+       }
 
-               section := l.content[l.pos+len(m.quote) : l.pos+posLastQuote+1]
+       if l.pos > l.start {
+               l.emit()
+       }
 
-               fields := bytes.Fields(section)
-               l.w.Write(m.quote)
-               for i, f := range fields {
-                       if f[0] == '/' {
-                               l.w.Write(l.path)
-                               l.w.Write(f[1:])
+       section := l.content[l.pos : l.pos+posEnd+1]
 
-                       } else {
-                               l.w.Write(f)
-                       }
+       fields := bytes.Fields(section)
+       for i, f := range fields {
+               if f[0] == '/' {
+                       l.w.Write(l.path)
+                       l.w.Write(f[1:])
 
-                       if i < len(fields)-1 {
-                               l.w.Write([]byte(" "))
-                       }
+               } else {
+                       l.w.Write(f)
                }
 
-               l.w.Write(m.quote)
-               l.pos += len(section) + (len(m.quote) * 2)
-               l.start = l.pos
+               if i < len(fields)-1 {
+                       l.w.Write([]byte(" "))
+               }
        }
+
+       l.pos += len(section)
+       l.start = l.pos
+
 }
 
 // main loop
@@ -262,53 +294,32 @@ func (l *absurllexer) replace() {
        }
 }
 
-func doReplace(path string, ct transform.FromTo, matchers []absURLMatcher) {
+func doReplace(path string, ct transform.FromTo, quotes [][]byte) {
 
        lexer := &absurllexer{
-               content:  ct.From().Bytes(),
-               w:        ct.To(),
-               path:     []byte(path),
-               matchers: matchers}
+               content: ct.From().Bytes(),
+               w:       ct.To(),
+               path:    []byte(path),
+               quotes:  quotes}
 
        lexer.replace()
 }
 
 type absURLReplacer struct {
-       htmlMatchers []absURLMatcher
-       xmlMatchers  []absURLMatcher
+       htmlQuotes [][]byte
+       xmlQuotes  [][]byte
 }
 
 func newAbsURLReplacer() *absURLReplacer {
-
-       // HTML
-       dqHTMLMatch := []byte("\"/")
-       sqHTMLMatch := []byte("'/")
-
-       // XML
-       dqXMLMatch := []byte("&#34;/")
-       sqXMLMatch := []byte("&#39;/")
-
-       dqHTML := []byte("\"")
-       sqHTML := []byte("'")
-
-       dqXML := []byte("&#34;")
-       sqXML := []byte("&#39;")
-
        return &absURLReplacer{
-               htmlMatchers: []absURLMatcher{
-                       {dqHTMLMatch, dqHTML},
-                       {sqHTMLMatch, sqHTML},
-               },
-               xmlMatchers: []absURLMatcher{
-                       {dqXMLMatch, dqXML},
-                       {sqXMLMatch, sqXML},
-               }}
+               htmlQuotes: [][]byte{[]byte("\""), []byte("'")},
+               xmlQuotes:  [][]byte{[]byte("&#34;"), []byte("&#39;")}}
 }
 
 func (au *absURLReplacer) replaceInHTML(path string, ct transform.FromTo) {
-       doReplace(path, ct, au.htmlMatchers)
+       doReplace(path, ct, au.htmlQuotes)
 }
 
 func (au *absURLReplacer) replaceInXML(path string, ct transform.FromTo) {
-       doReplace(path, ct, au.xmlMatchers)
+       doReplace(path, ct, au.xmlQuotes)
 }
index 7a530862bbf4041d1a1e09f6512c21a1754eff38..be6b91929494a3a0b8de5804d76416b67f3bc276 100644 (file)
@@ -156,6 +156,21 @@ func TestAbsURL(t *testing.T) {
 
 }
 
+func TestAbsURLUnquoted(t *testing.T) {
+       tr := transform.New(NewAbsURLTransformer(testBaseURL))
+
+       apply(t.Errorf, tr, []test{
+               test{
+                       content:  `Link: <a href=/asdf>ASDF</a>`,
+                       expected: `Link: <a href=http://base/asdf>ASDF</a>`,
+               },
+               test{
+                       content:  `Link: <a href=/asdf   >ASDF</a>`,
+                       expected: `Link: <a href=http://base/asdf   >ASDF</a>`,
+               },
+       })
+}
+
 func TestRelativeURL(t *testing.T) {
        tr := transform.New(NewAbsURLTransformer(helpers.GetDottedRelativePath(filepath.FromSlash("/post/sub/"))))
 
@@ -176,7 +191,7 @@ func TestAbsXMLURLSrcSet(t *testing.T) {
 }
 
 func BenchmarkXMLAbsURL(b *testing.B) {
-       tr := transform.New(NewAbsURLInXMLTransformer(""))
+       tr := transform.New(NewAbsURLInXMLTransformer(testBaseURL))
 
        b.ResetTimer()
        for i := 0; i < b.N; i++ {