Improve abs url replacement speed
authorbep <bjorn.erik.pedersen@gmail.com>
Mon, 16 Feb 2015 09:48:15 +0000 (10:48 +0100)
committerspf13 <steve.francia@gmail.com>
Mon, 16 Feb 2015 13:24:42 +0000 (08:24 -0500)
This commit replaces the multiple `bytes.Contains` and `bytes.Replace` calls with a custom replacer that does one pass through the document and exploits the fact that there are two common prefixes we search for, `src=` and `href=`.

This is both faster and consumes less memory. There may be even better algorithms to use here, but we must leave some room for improvement in future versions.

This should also make it possible to solve #816.

```
benchmark              old ns/op     new ns/op     delta
BenchmarkAbsUrl        25795         22597         -12.40%
BenchmarkXmlAbsUrl     17187         11166         -35.03%

benchmark              old allocs     new allocs     delta
BenchmarkAbsUrl        60             33             -45.00%
BenchmarkXmlAbsUrl     30             16             -46.67%

benchmark              old bytes     new bytes     delta
BenchmarkAbsUrl        5844          4167          -28.70%
BenchmarkXmlAbsUrl     3754          2069          -44.89%
```

Fixes #894

.gitignore
transform/absurl.go
transform/absurlreplacer.go [new file with mode: 0644]
transform/chain_test.go

index 3ea8aedd63b28347d72842506dc4d72c9ca96238..41162a757c80aec305b788c1439ad7b3c6bda3d6 100644 (file)
@@ -2,6 +2,8 @@ hugo
 docs/public*\r
 hugo.exe\r
 *.test\r
+*.prof\r
+nohup.out\r
 cover.out\r
 *.swp\r
 *.swo\r
index 0a0cd7239b463727d449bea2346485c947f3c988..0efe624ac5a2fc6bf5db67ea6c3a679d68ad359b 100644 (file)
@@ -1,64 +1,33 @@
 package transform
 
 import (
-       "bytes"
-       "net/url"
-       "strings"
+       "sync"
 )
 
-func AbsURL(absURL string) (trs []link, err error) {
-       var baseURL *url.URL
+var absUrlInit sync.Once
+var ar *absurlReplacer
 
-       if baseURL, err = url.Parse(absURL); err != nil {
-               return
-       }
+// for performance reasons, we reuse the first baseUrl given
+func initAbsurlReplacer(baseURL string) {
+       absUrlInit.Do(func() {
+               ar = newAbsurlReplacer(baseURL)
+       })
+}
 
-       base := strings.TrimRight(baseURL.String(), "/")
+func AbsURL(absURL string) (trs []link, err error) {
+       initAbsurlReplacer(absURL)
 
-       var (
-               srcdq  = []byte(" src=\"" + base + "/")
-               hrefdq = []byte(" href=\"" + base + "/")
-               srcsq  = []byte(" src='" + base + "/")
-               hrefsq = []byte(" href='" + base + "/")
-       )
        trs = append(trs, func(content []byte) []byte {
-               content = guardReplace(content, []byte(" src=\"//"), []byte(" src=\"/"), srcdq)
-               content = guardReplace(content, []byte(" src='//"), []byte(" src='/"), srcsq)
-               content = guardReplace(content, []byte(" href=\"//"), []byte(" href=\"/"), hrefdq)
-               content = guardReplace(content, []byte(" href='//"), []byte(" href='/"), hrefsq)
-               return content
+               return ar.replaceInHtml(content)
        })
        return
 }
 
 func AbsURLInXML(absURL string) (trs []link, err error) {
-       var baseURL *url.URL
+       initAbsurlReplacer(absURL)
 
-       if baseURL, err = url.Parse(absURL); err != nil {
-               return
-       }
-
-       base := strings.TrimRight(baseURL.String(), "/")
-
-       var (
-               srcedq  = []byte(" src=&#34;" + base + "/")
-               hrefedq = []byte(" href=&#34;" + base + "/")
-               srcesq  = []byte(" src=&#39;" + base + "/")
-               hrefesq = []byte(" href=&#39;" + base + "/")
-       )
        trs = append(trs, func(content []byte) []byte {
-               content = guardReplace(content, []byte(" src=&#34;//"), []byte(" src=&#34;/"), srcedq)
-               content = guardReplace(content, []byte(" src=&#39;//"), []byte(" src=&#39;/"), srcesq)
-               content = guardReplace(content, []byte(" href=&#34;//"), []byte(" href=&#34;/"), hrefedq)
-               content = guardReplace(content, []byte(" href=&#39;//"), []byte(" href=&#39;/"), hrefesq)
-               return content
+               return ar.replaceInXml(content)
        })
        return
 }
-
-func guardReplace(content, guard, match, replace []byte) []byte {
-       if !bytes.Contains(content, guard) {
-               content = bytes.Replace(content, match, replace, -1)
-       }
-       return content
-}
diff --git a/transform/absurlreplacer.go b/transform/absurlreplacer.go
new file mode 100644 (file)
index 0000000..7b6f723
--- /dev/null
@@ -0,0 +1,325 @@
+package transform
+
+import (
+       "bytes"
+       bp "github.com/spf13/hugo/bufferpool"
+       "net/url"
+       "strings"
+       "sync"
+       "unicode/utf8"
+)
+
+// position (in bytes)
+type pos int
+
+type matchState int
+
+const (
+       matchStateNone matchState = iota
+       matchStateWhitespace
+       matchStatePartial
+       matchStateFull
+)
+
+type item struct {
+       typ itemType
+       pos pos
+       val []byte
+}
+
+type itemType int
+
+const (
+       tText itemType = iota
+
+       // matches
+       tSrcdq
+       tHrefdq
+       tSrcsq
+       tHrefsq
+       // guards
+       tGrcdq
+       tGhrefdq
+       tGsrcsq
+       tGhrefsq
+)
+
+type contentlexer struct {
+       content []byte
+
+       pos   pos // input position
+       start pos // item start position
+       width pos // width of last element
+
+       matchers     []absurlMatcher
+       state        stateFunc
+       prefixLookup *prefixes
+
+       // items delivered to client
+       items []item
+}
+
+type stateFunc func(*contentlexer) stateFunc
+
+type prefixRunes []rune
+
+type prefixes struct {
+       pr   []prefixRunes
+       curr prefixRunes // current prefix lookup table
+       i    int         // current index
+
+       // first rune in potential match
+       first rune
+
+       // match-state:
+       // none, whitespace, partial, full
+       ms matchState
+}
+
+// match returns partial and full match for the prefix in play
+// - it's a full match if all prefix runes has checked out in row
+// - it's a partial match if it's on its way towards a full match
+func (l *contentlexer) match(r rune) {
+       p := l.prefixLookup
+       if p.curr == nil {
+               // assumes prefixes all start off on a different rune
+               // works in this special case: href, src
+               p.i = 0
+               for _, pr := range p.pr {
+                       if pr[p.i] == r {
+                               fullMatch := len(p.pr) == 1
+                               p.first = r
+                               if !fullMatch {
+                                       p.curr = pr
+                                       l.prefixLookup.ms = matchStatePartial
+                               } else {
+                                       l.prefixLookup.ms = matchStateFull
+                               }
+                               return
+                       }
+               }
+       } else {
+               p.i++
+               if p.curr[p.i] == r {
+                       fullMatch := len(p.curr) == p.i+1
+                       if fullMatch {
+                               p.curr = nil
+                               l.prefixLookup.ms = matchStateFull
+                       } else {
+                               l.prefixLookup.ms = matchStatePartial
+                       }
+                       return
+               }
+
+               p.curr = nil
+       }
+
+       l.prefixLookup.ms = matchStateNone
+}
+
+func (l *contentlexer) emit(t itemType) {
+       l.items = append(l.items, item{t, l.start, l.content[l.start:l.pos]})
+       l.start = l.pos
+}
+
+var mainPrefixRunes = []prefixRunes{{'s', 'r', 'c', '='}, {'h', 'r', 'e', 'f', '='}}
+
+var itemSlicePool = &sync.Pool{
+       New: func() interface{} {
+               return make([]item, 0, 8)
+       },
+}
+
+func replace(content []byte, matchers []absurlMatcher) *contentlexer {
+       var items []item
+       if x := itemSlicePool.Get(); x != nil {
+               items = x.([]item)[:0]
+               defer itemSlicePool.Put(items)
+       } else {
+               items = make([]item, 0, 8)
+       }
+
+       lexer := &contentlexer{content: content,
+               items:        items,
+               prefixLookup: &prefixes{pr: mainPrefixRunes},
+               matchers:     matchers}
+
+       lexer.runReplacer()
+       return lexer
+}
+
+func (l *contentlexer) runReplacer() {
+       for l.state = lexReplacements; l.state != nil; {
+               l.state = l.state(l)
+       }
+}
+
+type absurlMatcher struct {
+       replaceType itemType
+       guardType   itemType
+       match       []byte
+       guard       []byte
+       replacement []byte
+       guarded     bool
+}
+
+func (a absurlMatcher) isSourceType() bool {
+       return a.replaceType == tSrcdq || a.replaceType == tSrcsq
+}
+
+func lexReplacements(l *contentlexer) stateFunc {
+       contentLength := len(l.content)
+       var r rune
+
+       for {
+               if int(l.pos) >= contentLength {
+                       l.width = 0
+                       break
+               }
+
+               var width int = 1
+               r = rune(l.content[l.pos])
+               if r >= utf8.RuneSelf {
+                       r, width = utf8.DecodeRune(l.content[l.pos:])
+               }
+               l.width = pos(width)
+               l.pos += l.width
+
+               if r == ' ' {
+                       l.prefixLookup.ms = matchStateWhitespace
+               } else if l.prefixLookup.ms != matchStateNone {
+                       l.match(r)
+                       if l.prefixLookup.ms == matchStateFull {
+                               checkCandidate(l)
+                       }
+               }
+
+       }
+
+       // Done!
+       if l.pos > l.start {
+               l.emit(tText)
+       }
+       return nil
+}
+
+func checkCandidate(l *contentlexer) {
+       isSource := l.prefixLookup.first == 's'
+       for _, m := range l.matchers {
+
+               if m.guarded {
+                       continue
+               }
+
+               if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
+                       continue
+               }
+
+               s := l.content[l.pos:]
+               if bytes.HasPrefix(s, m.guard) {
+                       if l.pos > l.start {
+                               l.emit(tText)
+                       }
+                       l.pos += pos(len(m.guard))
+                       l.emit(m.guardType)
+                       m.guarded = true
+                       return
+               } else if bytes.HasPrefix(s, m.match) {
+                       if l.pos > l.start {
+                               l.emit(tText)
+                       }
+                       l.pos += pos(len(m.match))
+                       l.emit(m.replaceType)
+                       return
+
+               }
+       }
+}
+
+func doReplace(content []byte, matchers []absurlMatcher) []byte {
+       b := bp.GetBuffer()
+       defer bp.PutBuffer(b)
+
+       guards := make([]bool, len(matchers))
+       replaced := replace(content, matchers)
+
+       // first pass: check guards
+       for _, item := range replaced.items {
+               if item.typ != tText {
+                       for i, e := range matchers {
+                               if item.typ == e.guardType {
+                                       guards[i] = true
+                                       break
+                               }
+                       }
+               }
+       }
+       // second pass: do replacements for non-guarded tokens
+       for _, token := range replaced.items {
+               switch token.typ {
+               case tText:
+                       b.Write(token.val)
+               default:
+                       for i, e := range matchers {
+                               if token.typ == e.replaceType && !guards[i] {
+                                       b.Write(e.replacement)
+                               } else if token.typ == e.replaceType || token.typ == e.guardType {
+                                       b.Write(token.val)
+                               }
+                       }
+               }
+       }
+
+       return b.Bytes()
+}
+
+type absurlReplacer struct {
+       htmlMatchers []absurlMatcher
+       xmlMatchers  []absurlMatcher
+}
+
+func newAbsurlReplacer(baseUrl string) *absurlReplacer {
+       u, _ := url.Parse(baseUrl)
+       base := strings.TrimRight(u.String(), "/")
+
+       // HTML
+       dqHtmlMatch := []byte("\"/")
+       sqHtmlMatch := []byte("'/")
+
+       dqGuard := []byte("\"//")
+       sqGuard := []byte("'//")
+
+       // XML
+       dqXmlMatch := []byte("&#34;/")
+       sqXmlMatch := []byte("&#39;/")
+
+       dqXmlGuard := []byte("&#34;//")
+       sqXmlGuard := []byte("&#39;//")
+
+       dqHtml := []byte("\"" + base + "/")
+       sqHtml := []byte("'" + base + "/")
+
+       dqXml := []byte("&#34;" + base + "/")
+       sqXml := []byte("&#39;" + base + "/")
+
+       return &absurlReplacer{htmlMatchers: []absurlMatcher{
+               {tSrcdq, tGrcdq, dqHtmlMatch, dqGuard, dqHtml, false},
+               {tSrcsq, tGsrcsq, sqHtmlMatch, sqGuard, sqHtml, false},
+               {tHrefdq, tGhrefdq, dqHtmlMatch, dqGuard, dqHtml, false},
+               {tHrefsq, tGhrefsq, sqHtmlMatch, sqGuard, sqHtml, false}},
+               xmlMatchers: []absurlMatcher{
+                       {tSrcdq, tGrcdq, dqXmlMatch, dqXmlGuard, dqXml, false},
+                       {tSrcsq, tGsrcsq, sqXmlMatch, sqXmlGuard, sqXml, false},
+                       {tHrefdq, tGhrefdq, dqXmlMatch, dqXmlGuard, dqXml, false},
+                       {tHrefsq, tGhrefsq, sqXmlMatch, sqXmlGuard, sqXml, false},
+               }}
+
+}
+
+func (au *absurlReplacer) replaceInHtml(content []byte) []byte {
+       return doReplace(content, au.htmlMatchers)
+}
+
+func (au *absurlReplacer) replaceInXml(content []byte) []byte {
+       return doReplace(content, au.xmlMatchers)
+}
index 71037d4557689483544cba8f1f660a397bc12928..a88d8453308807d328d7c29927411caaf36bfd43 100644 (file)
@@ -14,21 +14,29 @@ const CORRECT_OUTPUT_SRC_HREF_DQ = "<!DOCTYPE html><html><head><script src=\"foo
 const CORRECT_OUTPUT_SRC_HREF_SQ = "<!DOCTYPE html><html><head><script src='foobar.js'></script><script src='http://base/barfoo.js'></script></head><body><nav><h1>title</h1></nav><article>content <a href='foobar'>foobar</a>. <a href='http://base/foobar'>Follow up</a></article></body></html>"
 
 const H5_XML_CONTENT_ABS_URL = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;/foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;/foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
-const CORRECT_OUTPUT_SRC_HREF_IN_XML = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;http://xml/foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;http://xml/foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
+const CORRECT_OUTPUT_SRC_HREF_IN_XML = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;http://base/foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;http://base/foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
 const H5_XML_CONTENT_GUARDED = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;//foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;//foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
 
-var abs_url_tests = []test{
+// additional sanity tests for replacements testing
+const REPLACE_1 = "No replacements."
+const REPLACE_2 = "ᚠᛇᚻ ᛒᛦᚦ ᚠᚱᚩᚠᚢᚱ\nᚠᛁᚱᚪ ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
+
+var abs_url_bench_tests = []test{
        {H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
        {H5_JS_CONTENT_SINGLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_SQ},
        {H5_JS_CONTENT_ABS_URL, H5_JS_CONTENT_ABS_URL},
        {H5_JS_CONTENT_ABS_URL_SCHEMALESS, H5_JS_CONTENT_ABS_URL_SCHEMALESS},
 }
 
-var xml_abs_url_tests = []test{
+var xml_abs_url_bench_tests = []test{
        {H5_XML_CONTENT_ABS_URL, CORRECT_OUTPUT_SRC_HREF_IN_XML},
        {H5_XML_CONTENT_GUARDED, H5_XML_CONTENT_GUARDED},
 }
 
+var sanity_tests = []test{{REPLACE_1, REPLACE_1}, {REPLACE_2, REPLACE_2}}
+var abs_url_tests = append(abs_url_bench_tests, sanity_tests...)
+var xml_abs_url_tests = append(xml_abs_url_bench_tests, sanity_tests...)
+
 func TestChainZeroTransformers(t *testing.T) {
        tr := NewChain()
        in := new(bytes.Buffer)
@@ -44,7 +52,7 @@ func BenchmarkAbsUrl(b *testing.B) {
 
        b.ResetTimer()
        for i := 0; i < b.N; i++ {
-               apply(b.Errorf, tr, abs_url_tests)
+               apply(b.Errorf, tr, abs_url_bench_tests)
        }
 }
 
@@ -57,17 +65,17 @@ func TestAbsUrl(t *testing.T) {
 }
 
 func BenchmarkXmlAbsUrl(b *testing.B) {
-       absURLInXML, _ := AbsURLInXML("http://xml")
+       absURLInXML, _ := AbsURLInXML("http://base")
        tr := NewChain(absURLInXML...)
 
        b.ResetTimer()
        for i := 0; i < b.N; i++ {
-               apply(b.Errorf, tr, xml_abs_url_tests)
+               apply(b.Errorf, tr, xml_abs_url_bench_tests)
        }
 }
 
 func TestXMLAbsUrl(t *testing.T) {
-       absURLInXML, _ := AbsURLInXML("http://xml")
+       absURLInXML, _ := AbsURLInXML("http://base")
        tr := NewChain(absURLInXML...)
        apply(t.Errorf, tr, xml_abs_url_tests)
 }