canonifyurls in srcset
author bep <bjorn.erik.pedersen@gmail.com>
Sun, 3 May 2015 17:54:17 +0000 (19:54 +0200)
committer bep <bjorn.erik.pedersen@gmail.com>
Sun, 3 May 2015 17:54:23 +0000 (19:54 +0200)
Speed is about the same as before, uses slightly less memory:

```
benchmark              old ns/op     new ns/op     delta
BenchmarkAbsURL        17302         17713         +2.38%
BenchmarkXMLAbsURL     9463          9470          +0.07%

benchmark              old allocs     new allocs     delta
BenchmarkAbsURL        28             24             -14.29%
BenchmarkXMLAbsURL     14             12             -14.29%

benchmark              old bytes     new bytes     delta
BenchmarkAbsURL        3422          3144          -8.12%
BenchmarkXMLAbsURL     1985          1864          -6.10%
```

Fixes #1059

transform/absurlreplacer.go
transform/chain_test.go

index 9acdfc8defc3ef8ed4c378979e446398bcaef88f..9d3aa89017333b8526cd323ca3350df5f50b1b39 100644 (file)
@@ -29,69 +29,98 @@ type contentlexer struct {
        start int // item start position
        width int // width of last element
 
-       matchers     []absURLMatcher
-       state        stateFunc
-       prefixLookup *prefixes
+       matchers []absURLMatcher
+       state    stateFunc
+
+       ms      matchState
+       matches [3]bool // track matches of the 3 prefixes
+       i       int     // last index in matches checked
 
        w io.Writer
 }
 
 type stateFunc func(*contentlexer) stateFunc
 
-type prefixRunes []rune
-
-type prefixes struct {
-       pr   []prefixRunes
-       curr prefixRunes // current prefix lookup table
-       i    int         // current index
+type prefix struct {
+       r []rune
+       f func(l *contentlexer)
+}
 
-       // first rune in potential match
-       first rune
+var prefixes = []*prefix{
+       &prefix{r: []rune{'s', 'r', 'c', '='}, f: checkCandidateSrc},
+       &prefix{r: []rune{'s', 'r', 'c', 's', 'e', 't', '='}, f: checkCandidateSrcset},
+       &prefix{r: []rune{'h', 'r', 'e', 'f', '='}, f: checkCandidateHref}}
 
-       // match-state:
-       // none, whitespace, partial, full
-       ms matchState
+type absURLMatcher struct {
+       prefix         int
+       match          []byte
+       quote          []byte
+       replacementURL []byte
 }
 
-// match returns partial and full match for the prefix in play
-// - it's a full match if all prefix runes has checked out in row
-// - it's a partial match if it's on its way towards a full match
 func (l *contentlexer) match(r rune) {
-       p := l.prefixLookup
-       if p.curr == nil {
-               // assumes prefixes all start off on a different rune
-               // works in this special case: href, src
-               p.i = 0
-               for _, pr := range p.pr {
-                       if pr[p.i] == r {
-                               fullMatch := len(p.pr) == 1
-                               p.first = r
-                               if !fullMatch {
-                                       p.curr = pr
-                                       l.prefixLookup.ms = matchStatePartial
-                               } else {
-                                       l.prefixLookup.ms = matchStateFull
+
+       var found bool
+
+       // note, the prefixes can start off on the same foot, i.e.
+       // src and srcset.
+       if l.ms == matchStateWhitespace {
+               l.i = 0
+               for j, p := range prefixes {
+                       if r == p.r[l.i] {
+                               l.matches[j] = true
+                               found = true
+                               if l.checkMatchState(r, j) {
+                                       return
                                }
-                               return
+                       } else {
+                               l.matches[j] = false
                        }
                }
-       } else {
-               p.i++
-               if p.curr[p.i] == r {
-                       fullMatch := len(p.curr) == p.i+1
-                       if fullMatch {
-                               p.curr = nil
-                               l.prefixLookup.ms = matchStateFull
+
+               if !found {
+                       l.ms = matchStateNone
+               }
+
+               return
+       }
+
+       l.i++
+       for j, m := range l.matches {
+               // still a match?
+               if m {
+                       if prefixes[j].r[l.i] == r {
+                               found = true
+                               if l.checkMatchState(r, j) {
+                                       return
+                               }
                        } else {
-                               l.prefixLookup.ms = matchStatePartial
+                               l.matches[j] = false
                        }
-                       return
                }
+       }
 
-               p.curr = nil
+       if found {
+               return
        }
 
-       l.prefixLookup.ms = matchStateNone
+       l.ms = matchStateNone
+}
+
+func (l *contentlexer) checkMatchState(r rune, idx int) bool {
+       if r == '=' {
+               l.ms = matchStateFull
+               for k := range l.matches {
+                       if k != idx {
+                               l.matches[k] = false
+                       }
+               }
+               return true
+       }
+
+       l.ms = matchStatePartial
+
+       return false
 }
 
 func (l *contentlexer) emit() {
@@ -99,49 +128,108 @@ func (l *contentlexer) emit() {
        l.start = l.pos
 }
 
-var mainPrefixRunes = []prefixRunes{{'s', 'r', 'c', '='}, {'h', 'r', 'e', 'f', '='}}
+func (a absURLMatcher) isSourceType() bool {
+       return a.prefix == matchPrefixSrc
+}
 
-type absURLMatcher struct {
-       prefix      int
-       match       []byte
-       replacement []byte
+func checkCandidateSrc(l *contentlexer) {
+       for _, m := range l.matchers {
+               if !m.isSourceType() {
+                       continue
+               }
+               l.replaceSimple(m)
+       }
 }
 
-func (a absURLMatcher) isSourceType() bool {
-       return a.prefix == matchPrefixSrc
+func checkCandidateHref(l *contentlexer) {
+       for _, m := range l.matchers {
+               if m.isSourceType() {
+                       continue
+               }
+               l.replaceSimple(m)
+       }
 }
 
-func checkCandidate(l *contentlexer) {
-       isSource := l.prefixLookup.first == 's'
+func checkCandidateSrcset(l *contentlexer) {
+       // special case, not frequent (I think)
        for _, m := range l.matchers {
+               if m.isSourceType() {
+                       continue
+               }
 
-               if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
+               if !bytes.HasPrefix(l.content[l.pos:], m.match) {
                        continue
                }
 
-               if bytes.HasPrefix(l.content[l.pos:], m.match) {
-                       // check for schemaless URLs
-                       posAfter := l.pos + len(m.match)
-                       if posAfter >= len(l.content) {
-                               return
-                       }
-                       r, _ := utf8.DecodeRune(l.content[posAfter:])
-                       if r == '/' {
-                               // schemaless: skip
-                               return
-                       }
-                       if l.pos > l.start {
-                               l.emit()
-                       }
-                       l.pos += len(m.match)
-                       l.w.Write(m.replacement)
-                       l.start = l.pos
+               // check for schemaless URLs
+               posAfter := l.pos + len(m.match)
+               if posAfter >= len(l.content) {
+                       return
+               }
+               r, _ := utf8.DecodeRune(l.content[posAfter:])
+               if r == '/' {
+                       // schemaless: skip
+                       continue
+               }
+
+               posLastQuote := bytes.Index(l.content[l.pos+1:], m.quote)
+
+               // safe guard
+               if posLastQuote < 0 || posLastQuote > 2000 {
                        return
+               }
 
+               if l.pos > l.start {
+                       l.emit()
                }
+
+               section := l.content[l.pos+len(m.quote) : l.pos+posLastQuote+1]
+
+               fields := bytes.Fields(section)
+               l.w.Write([]byte(m.quote))
+               for i, f := range fields {
+                       if f[0] == '/' {
+                               l.w.Write(m.replacementURL)
+                               l.w.Write(f[1:])
+
+                       } else {
+                               l.w.Write(f)
+                       }
+
+                       if i < len(fields)-1 {
+                               l.w.Write([]byte(" "))
+                       }
+               }
+
+               l.w.Write(m.quote)
+               l.pos += len(section) + (len(m.quote) * 2)
+               l.start = l.pos
        }
 }
 
+func (l *contentlexer) replaceSimple(m absURLMatcher) {
+       if !bytes.HasPrefix(l.content[l.pos:], m.match) {
+               return
+       }
+       // check for schemaless URLs
+       posAfter := l.pos + len(m.match)
+       if posAfter >= len(l.content) {
+               return
+       }
+       r, _ := utf8.DecodeRune(l.content[posAfter:])
+       if r == '/' {
+               // schemaless: skip
+               return
+       }
+       if l.pos > l.start {
+               l.emit()
+       }
+       l.pos += len(m.match)
+       l.w.Write(m.quote)
+       l.w.Write(m.replacementURL)
+       l.start = l.pos
+}
+
 func (l *contentlexer) replace() {
        contentLength := len(l.content)
        var r rune
@@ -152,7 +240,7 @@ func (l *contentlexer) replace() {
                        break
                }
 
-               var width int = 1
+               var width = 1
                r = rune(l.content[l.pos])
                if r >= utf8.RuneSelf {
                        r, width = utf8.DecodeRune(l.content[l.pos:])
@@ -160,14 +248,24 @@ func (l *contentlexer) replace() {
                l.width = width
                l.pos += l.width
                if r == ' ' {
-                       l.prefixLookup.ms = matchStateWhitespace
-               } else if l.prefixLookup.ms != matchStateNone {
+                       l.ms = matchStateWhitespace
+               } else if l.ms != matchStateNone {
                        l.match(r)
-                       if l.prefixLookup.ms == matchStateFull {
-                               checkCandidate(l)
+                       if l.ms == matchStateFull {
+                               var p *prefix
+                               for i, m := range l.matches {
+                                       if m {
+                                               p = prefixes[i]
+                                       }
+                                       l.matches[i] = false
+                               }
+                               if p == nil {
+                                       panic("illegal state: curr is nil when state is full")
+                               }
+                               l.ms = matchStateNone
+                               p.f(l)
                        }
                }
-
        }
 
        // Done!
@@ -177,15 +275,12 @@ func (l *contentlexer) replace() {
 }
 
 func doReplace(ct contentTransformer, matchers []absURLMatcher) {
-
        lexer := &contentlexer{
-               content:      ct.Content(),
-               w:            ct,
-               prefixLookup: &prefixes{pr: mainPrefixRunes},
-               matchers:     matchers}
+               content:  ct.Content(),
+               w:        ct,
+               matchers: matchers}
 
        lexer.replace()
-
 }
 
 type absURLReplacer struct {
@@ -195,7 +290,7 @@ type absURLReplacer struct {
 
 func newAbsURLReplacer(baseURL string) *absURLReplacer {
        u, _ := url.Parse(baseURL)
-       base := strings.TrimRight(u.String(), "/")
+       base := []byte(strings.TrimRight(u.String(), "/") + "/")
 
        // HTML
        dqHTMLMatch := []byte("\"/")
@@ -205,23 +300,23 @@ func newAbsURLReplacer(baseURL string) *absURLReplacer {
        dqXMLMatch := []byte("&#34;/")
        sqXMLMatch := []byte("&#39;/")
 
-       dqHTML := []byte("\"" + base + "/")
-       sqHTML := []byte("'" + base + "/")
+       dqHTML := []byte("\"")
+       sqHTML := []byte("'")
 
-       dqXML := []byte("&#34;" + base + "/")
-       sqXML := []byte("&#39;" + base + "/")
+       dqXML := []byte("&#34;")
+       sqXML := []byte("&#39;")
 
        return &absURLReplacer{
                htmlMatchers: []absURLMatcher{
-                       {matchPrefixSrc, dqHTMLMatch, dqHTML},
-                       {matchPrefixSrc, sqHTMLMatch, sqHTML},
-                       {matchPrefixHref, dqHTMLMatch, dqHTML},
-                       {matchPrefixHref, sqHTMLMatch, sqHTML}},
+                       {matchPrefixSrc, dqHTMLMatch, dqHTML, base},
+                       {matchPrefixSrc, sqHTMLMatch, sqHTML, base},
+                       {matchPrefixHref, dqHTMLMatch, dqHTML, base},
+                       {matchPrefixHref, sqHTMLMatch, sqHTML, base}},
                xmlMatchers: []absURLMatcher{
-                       {matchPrefixSrc, dqXMLMatch, dqXML},
-                       {matchPrefixSrc, sqXMLMatch, sqXML},
-                       {matchPrefixHref, dqXMLMatch, dqXML},
-                       {matchPrefixHref, sqXMLMatch, sqXML},
+                       {matchPrefixSrc, dqXMLMatch, dqXML, base},
+                       {matchPrefixSrc, sqXMLMatch, sqXML, base},
+                       {matchPrefixHref, dqXMLMatch, dqXML, base},
+                       {matchPrefixHref, sqXMLMatch, sqXML, base},
                }}
 
 }
index 683ac77c1e5a07ce47ed80d45bff3e79df7daf32..242d571e7c2315ab6c47a167274c5a773246925a 100644 (file)
@@ -25,8 +25,42 @@ const REPLACE_2 = "ᚠᛇᚻ ᛒᛦᚦ ᚠᚱᚩᚠᚢᚱ\nᚠᛁᚱᚪ ᚷᛖ
 // Issue: 816, schemaless links combined with others
 const REPLACE_SCHEMALESS_HTML = `Pre. src='//schemaless' src='/normal'  <a href="//schemaless">Schemaless</a>. <a href="/normal">normal</a>. Post.`
 const REPLACE_SCHEMALESS_HTML_CORRECT = `Pre. src='//schemaless' src='http://base/normal'  <a href="//schemaless">Schemaless</a>. <a href="http://base/normal">normal</a>. Post.`
-const REPLACE_SCHEMALESS_XML = `Pre. src=&#34;//schemaless&#34; src=&#34;/normal&#34;  <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;/normal&#39;>normal</a>. Post.`
-const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src=&#34;//schemaless&#34; src=&#34;http://base/normal&#34;  <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;http://base/normal&#39;>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML = `Pre. src=&#39;//schemaless&#39; src=&#39;/normal&#39;  <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;/normal&#39;>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src=&#39;//schemaless&#39; src=&#39;http://base/normal&#39;  <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;http://base/normal&#39;>normal</a>. Post.`
+
+// srcset=
+const SRCSET_BASIC = `Pre. <img srcset="/img/small.jpg 200w /img/big.jpg 700w" alt="text" src="/img/foo.jpg">`
+const SRCSET_BASIC_CORRECT = `Pre. <img srcset="http://base/img/small.jpg 200w http://base/img/big.jpg 700w" alt="text" src="http://base/img/foo.jpg">`
+const SRCSET_SINGLE_QUOTE = `Pre. <img srcset='/img/small.jpg 200w /img/big.jpg 700w' alt="text" src="/img/foo.jpg"> POST.`
+const SRCSET_SINGLE_QUOTE_CORRECT = `Pre. <img srcset='http://base/img/small.jpg 200w http://base/img/big.jpg 700w' alt="text" src="http://base/img/foo.jpg"> POST.`
+const SRCSET_XML_BASIC = `Pre. <img srcset=&#34;/img/small.jpg 200w /img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;/img/foo.jpg&#34;>`
+const SRCSET_XML_BASIC_CORRECT = `Pre. <img srcset=&#34;http://base/img/small.jpg 200w http://base/img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;http://base/img/foo.jpg&#34;>`
+const SRCSET_XML_SINGLE_QUOTE = `Pre. <img srcset=&#34;/img/small.jpg 200w /img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;/img/foo.jpg&#34;>`
+const SRCSET_XML_SINGLE_QUOTE_CORRECT = `Pre. <img srcset=&#34;http://base/img/small.jpg 200w http://base/img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;http://base/img/foo.jpg&#34;>`
+const SRCSET_VARIATIONS = `Pre. 
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='/img/foo.jpg'> FOO. 
+<img srcset='/img.jpg'> 
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_VARIATIONS_CORRECT = `Pre. 
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='http://base/img/foo.jpg'> FOO. 
+<img srcset='http://base/img.jpg'> 
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_XML_VARIATIONS = `Pre. 
+Missing start quote: &lt;img srcset=/img/small.jpg 200w /img/big.jpg 700w&quot; alt=&quot;text&quot;&gt; src=&#39;/img/foo.jpg&#39;&gt; FOO. 
+&lt;img srcset=&#39;/img.jpg&#39;&gt; 
+schemaless: &lt;img srcset=&#39;//img.jpg&#39; src=&#39;//basic.jpg&#39;&gt;
+schemaless2: &lt;img srcset=&quot;//img.jpg&quot; src=&quot;//basic.jpg2&gt; POST
+`
+const SRCSET_XML_VARIATIONS_CORRECT = `Pre. 
+Missing start quote: &lt;img srcset=/img/small.jpg 200w /img/big.jpg 700w&quot; alt=&quot;text&quot;&gt; src=&#39;http://base/img/foo.jpg&#39;&gt; FOO. 
+&lt;img srcset=&#39;http://base/img.jpg&#39;&gt; 
+schemaless: &lt;img srcset=&#39;//img.jpg&#39; src=&#39;//basic.jpg&#39;&gt;
+schemaless2: &lt;img srcset=&quot;//img.jpg&quot; src=&quot;//basic.jpg2&gt; POST
+`
 
 var abs_url_bench_tests = []test{
        {H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
@@ -46,6 +80,12 @@ var abs_url_tests = append(abs_url_bench_tests, append(sanity_tests, extra_tests
 var extra_tests_xml = []test{{REPLACE_SCHEMALESS_XML, REPLACE_SCHEMALESS_XML_CORRECT}}
 var xml_abs_url_tests = append(xml_abs_url_bench_tests, append(sanity_tests, extra_tests_xml...)...)
 
+var srcset_tests = []test{{SRCSET_BASIC, SRCSET_BASIC_CORRECT}, {SRCSET_SINGLE_QUOTE, SRCSET_SINGLE_QUOTE_CORRECT}, {SRCSET_VARIATIONS, SRCSET_VARIATIONS_CORRECT}}
+var srcset_xml_tests = []test{
+       {SRCSET_XML_BASIC, SRCSET_XML_BASIC_CORRECT},
+       {SRCSET_XML_SINGLE_QUOTE, SRCSET_XML_SINGLE_QUOTE_CORRECT},
+       {SRCSET_XML_VARIATIONS, SRCSET_XML_VARIATIONS_CORRECT}}
+
 func TestChainZeroTransformers(t *testing.T) {
        tr := NewChain()
        in := new(bytes.Buffer)
@@ -102,6 +142,21 @@ func TestAbsURL(t *testing.T) {
 
 }
 
+func TestAbsURLSrcSet(t *testing.T) {
+       absURL, _ := absURLFromURL("http://base")
+       tr := NewChain(absURL...)
+
+       apply(t.Errorf, tr, srcset_tests)
+}
+
+func TestAbsXMLURLSrcSet(t *testing.T) {
+       absURLInXML, _ := absURLInXMLFromURL("http://base")
+       tr := NewChain(absURLInXML...)
+
+       apply(t.Errorf, tr, srcset_xml_tests)
+
+}
+
 func BenchmarkXMLAbsURL(b *testing.B) {
        absURLInXML, _ := absURLInXMLFromURL("http://base")
        tr := NewChain(absURLInXML...)