hugolib: Implement "related content"
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Sat, 19 Aug 2017 11:16:00 +0000 (13:16 +0200)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Tue, 5 Sep 2017 22:20:02 +0000 (00:20 +0200)
This closes #98, even if this commit does not do full content text search.

We may revisit that problem in the future, but that deserves its own issue.

Fixes #98

14 files changed:
common/types/types.go [new file with mode: 0644]
common/types/types_test.go [new file with mode: 0644]
hugolib/page.go
hugolib/pageCache.go
hugolib/pageCache_test.go
hugolib/pageGroup.go
hugolib/pageSort_test.go
hugolib/pages_related.go [new file with mode: 0644]
hugolib/pages_related_test.go [new file with mode: 0644]
hugolib/site.go
related/inverted_index.go [new file with mode: 0644]
related/inverted_index_test.go [new file with mode: 0644]
tpl/collections/collections.go
tpl/collections/init.go

diff --git a/common/types/types.go b/common/types/types.go
new file mode 100644 (file)
index 0000000..291bf6c
--- /dev/null
@@ -0,0 +1,44 @@
+// Copyright 2017-present The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package types contains types shared between packages in Hugo.
+package types
+
+import (
+       "fmt"
+
+       "github.com/spf13/cast"
+)
+
+// KeyValues holds a key and a slice of values.
+// Both the key and the values are untyped; see KeyString for a string view
+// of the key.
+type KeyValues struct {
+       Key    interface{}
+       Values []interface{}
+}
+
+// KeyString converts the key to a string using cast.ToString.
+// An empty string is returned if the conversion fails.
+func (k KeyValues) KeyString() string {
+       key := cast.ToString(k.Key)
+       return key
+}
+
+// String formats the key and its values as "key: values" using the default
+// %v formatting for both.
+func (k KeyValues) String() string {
+       key, values := k.Key, k.Values
+       return fmt.Sprintf("%v: %v", key, values)
+}
+
+// NewKeyValuesStrings creates a KeyValues from a string key and any number
+// of string values. Each value is stored as an interface{}.
+func NewKeyValuesStrings(key string, values ...string) KeyValues {
+       boxed := make([]interface{}, 0, len(values))
+       for _, v := range values {
+               boxed = append(boxed, v)
+       }
+       return KeyValues{Key: key, Values: boxed}
+}
diff --git a/common/types/types_test.go b/common/types/types_test.go
new file mode 100644 (file)
index 0000000..7cec8c0
--- /dev/null
@@ -0,0 +1,29 @@
+// Copyright 2017-present The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package types
+
+import (
+       "testing"
+
+       "github.com/stretchr/testify/require"
+)
+
+// TestKeyValues checks that NewKeyValuesStrings stores the key and boxes
+// the string values in order.
+func TestKeyValues(t *testing.T) {
+       assert := require.New(t)
+
+       got := NewKeyValuesStrings("key", "a1", "a2")
+       wantValues := []interface{}{"a1", "a2"}
+
+       assert.Equal("key", got.KeyString())
+       assert.Equal(wantValues, got.Values)
+}
index c2959080262a0a930d6dc79e2a742d4af33c9c6d..a723cabb213cd3461d576eb39bbc542f44b68a51 100644 (file)
@@ -20,6 +20,8 @@ import (
        "reflect"
        "unicode"
 
+       "github.com/gohugoio/hugo/related"
+
        "github.com/bep/gitmap"
 
        "github.com/gohugoio/hugo/helpers"
@@ -54,6 +56,9 @@ var (
        // Assert that it implements the Eqer interface.
        _ compare.Eqer = (*Page)(nil)
        _ compare.Eqer = (*PageOutput)(nil)
+
+       // Assert that it implements the interface needed for related searches.
+       _ related.Document = (*Page)(nil)
 )
 
 const (
@@ -231,6 +236,28 @@ type Page struct {
        targetPathDescriptorPrototype *targetPathDescriptor
 }
 
+// SearchKeywords implements the related.Document interface needed for fast page searches.
+// It reads the page Param named by cfg.Name and converts the value to
+// keywords according to the given index configuration.
+func (p *Page) SearchKeywords(cfg related.IndexConfig) ([]related.Keyword, error) {
+       param, err := p.Param(cfg.Name)
+       if err != nil {
+               return nil, err
+       }
+       return cfg.ToKeywords(param)
+}
+
+// PubDate is when this page was or will be published: PublishDate if it is
+// set, otherwise Date.
+// NOTE: This is currently used for search only and is not meant to be used
+// directly in templates. We need to consolidate the dates in this struct.
+// TODO(bep) see https://github.com/gohugoio/hugo/issues/3854
+func (p *Page) PubDate() time.Time {
+       if p.PublishDate.IsZero() {
+               return p.Date
+       }
+       return p.PublishDate
+}
+
 func (p *Page) RSSLink() template.URL {
        f, found := p.outputFormats.GetByName(output.RSSFormat.Name)
        if !found {
@@ -329,6 +356,21 @@ func (ps Pages) findPagePosByFilePath(inPath string) int {
        return -1
 }
 
+// removeFirstIfFound returns ps without the first element that is the same
+// pointer as p. If p is not in ps, ps is returned as is.
+// The removal is done in place with append, so the backing array of ps is
+// modified when a match is found.
+func (ps Pages) removeFirstIfFound(p *Page) Pages {
+       for i := range ps {
+               if ps[i] == p {
+                       return append(ps[:i], ps[i+1:]...)
+               }
+       }
+       return ps
+}
+
 func (ps Pages) findFirstPagePosByFilePathPrefix(prefix string) int {
        if prefix == "" {
                return -1
index e0a3a160b44bd58ea3dc98214249bfd374707ac4..df381c679ffae84fc9ceb835de5b7f8d7a1b8c0e 100644 (file)
@@ -36,7 +36,7 @@ func (c *pageCache) get(key string, p Pages, apply func(p Pages)) (Pages, bool)
        c.RLock()
        if cached, ok := c.m[key]; ok {
                for _, ps := range cached {
-                       if probablyEqualPages(p, ps[0]) {
+                       if fastEqualPages(p, ps[0]) {
                                c.RUnlock()
                                return ps[1], true
                        }
@@ -51,7 +51,7 @@ func (c *pageCache) get(key string, p Pages, apply func(p Pages)) (Pages, bool)
        // double-check
        if cached, ok := c.m[key]; ok {
                for _, ps := range cached {
-                       if probablyEqualPages(p, ps[0]) {
+                       if fastEqualPages(p, ps[0]) {
                                return ps[1], true
                        }
                }
@@ -73,10 +73,10 @@ func (c *pageCache) get(key string, p Pages, apply func(p Pages)) (Pages, bool)
 
 }
 
-// "probably" as in: we do not compare every element for big slices, but that is
-// good enough for our use case.
+// "fast" as in: we do not compare every element for big slices, but that is
+// good enough for our use cases.
 // TODO(bep) there is a similar method in pagination.go. DRY.
-func probablyEqualPages(p1, p2 Pages) bool {
+func fastEqualPages(p1, p2 Pages) bool {
        if p1 == nil && p2 == nil {
                return true
        }
index 62837394f9384fbca09a5bc5f931346adc296b6a..aa2adf6e83aab7523eba6af27d30bebae8f6ad23 100644 (file)
@@ -56,8 +56,8 @@ func TestPageCache(t *testing.T) {
                                l1.Unlock()
                                p2, c2 := c1.get("k1", p, nil)
                                assert.True(t, c2)
-                               assert.True(t, probablyEqualPages(p, p2))
-                               assert.True(t, probablyEqualPages(p, pages))
+                               assert.True(t, fastEqualPages(p, p2))
+                               assert.True(t, fastEqualPages(p, pages))
                                assert.NotNil(t, p)
 
                                l2.Lock()
index 343ecf52e3184d8711a462584bf7c14c19029a30..3ccd35a060111fcd13483c8886249323b2ff0fab 100644 (file)
@@ -24,8 +24,8 @@ import (
 // PageGroup represents a group of pages, grouped by the key.
 // The key is typically a year or similar.
 type PageGroup struct {
-       Key   interface{}
-       Pages Pages
+       Key interface{}
+       Pages
 }
 
 type mapKeyValues []reflect.Value
index a17f53dc629c19f6be76d1cebda7bf19f2ab47d0..6379dccbe599d360c5966419aea9419f2c3af348 100644 (file)
@@ -115,7 +115,7 @@ func TestPageSortReverse(t *testing.T) {
        assert.Equal(t, 9, p2[0].fuzzyWordCount)
        assert.Equal(t, 0, p2[9].fuzzyWordCount)
        // cached
-       assert.True(t, probablyEqualPages(p2, p1.Reverse()))
+       assert.True(t, fastEqualPages(p2, p1.Reverse()))
 }
 
 func TestPageSortByParam(t *testing.T) {
diff --git a/hugolib/pages_related.go b/hugolib/pages_related.go
new file mode 100644 (file)
index 0000000..858ad0d
--- /dev/null
@@ -0,0 +1,191 @@
+// Copyright 2017-present The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hugolib
+
+import (
+       "sync"
+
+       "github.com/gohugoio/hugo/common/types"
+       "github.com/gohugoio/hugo/related"
+       "github.com/spf13/cast"
+)
+
+var (
+       // Assert that Pages and PageGroup implement the PageGenealogist interface.
+       _ PageGenealogist = (Pages)(nil)
+       _ PageGenealogist = PageGroup{}
+)
+
+// A PageGenealogist finds related pages in a page collection. This interface is implemented
+// by Pages and PageGroup, which makes it available as `{{ .RegularPages.Related . }}` etc.
+type PageGenealogist interface {
+
+       // Related searches for pages related to the given document.
+       // Template example:
+       // {{ $related := .RegularPages.Related . }}
+       Related(doc related.Document) (Pages, error)
+
+       // RelatedIndices searches only the named indices for pages related to
+       // the given document.
+       // Template example:
+       // {{ $related := .RegularPages.RelatedIndices . "tags" "date" }}
+       RelatedIndices(doc related.Document, indices ...interface{}) (Pages, error)
+
+       // RelatedTo searches for pages matching the given index name/values pairs.
+       // Template example:
+       // {{ $related := .RegularPages.RelatedTo ( keyVals "tags" "hugo" "rocks" ) ( keyVals "date" .Date ) }}
+       RelatedTo(args ...types.KeyValues) (Pages, error)
+}
+
+// Related searches all the configured indices with the search keywords from the
+// supplied document. The document itself is removed from the result if present.
+func (p Pages) Related(doc related.Document) (Pages, error) {
+       page, err := unwrapPage(doc)
+       if err != nil {
+               return nil, err
+       }
+
+       // No index names given: every configured index is searched.
+       matches, err := p.withInvertedIndex(func(idx *related.InvertedIndex) ([]related.Document, error) {
+               return idx.SearchDoc(page)
+       })
+       if err != nil {
+               return nil, err
+       }
+
+       return matches.removeFirstIfFound(page), nil
+}
+
+// RelatedIndices searches the given indices with the search keywords from the
+// supplied document. The index names must be convertible to strings.
+// The document itself is removed from the result if present.
+func (p Pages) RelatedIndices(doc related.Document, indices ...interface{}) (Pages, error) {
+       page, err := unwrapPage(doc)
+       if err != nil {
+               return nil, err
+       }
+
+       names, err := cast.ToStringSliceE(indices)
+       if err != nil {
+               return nil, err
+       }
+
+       matches, err := p.searchDoc(page, names...)
+       if err != nil {
+               return nil, err
+       }
+       return matches.removeFirstIfFound(page), nil
+}
+
+// RelatedTo searches the given indices with the corresponding values.
+// An empty page collection yields a nil result and no error.
+func (p Pages) RelatedTo(args ...types.KeyValues) (Pages, error) {
+       if len(p) > 0 {
+               return p.search(args...)
+       }
+       return nil, nil
+}
+
+// search runs a key/values query against the inverted index for p.
+func (p Pages) search(args ...types.KeyValues) (Pages, error) {
+       byKeyValues := func(idx *related.InvertedIndex) ([]related.Document, error) {
+               return idx.SearchKeyValues(args...)
+       }
+       return p.withInvertedIndex(byKeyValues)
+}
+
+// searchDoc queries the inverted index for p with the keywords of doc,
+// optionally limited to the named indices.
+func (p Pages) searchDoc(doc related.Document, indices ...string) (Pages, error) {
+       byDocument := func(idx *related.InvertedIndex) ([]related.Document, error) {
+               return idx.SearchDoc(doc, indices...)
+       }
+       return p.withInvertedIndex(byDocument)
+}
+
+// withInvertedIndex gets (or creates) the inverted index for p from the
+// relatedDocsHandler of the first page's site, runs the given search against
+// it and converts the matching documents back to Pages.
+// It returns nil, nil when p is empty or nothing matched.
+func (p Pages) withInvertedIndex(search func(idx *related.InvertedIndex) ([]related.Document, error)) (Pages, error) {
+       if len(p) == 0 {
+               return nil, nil
+       }
+
+       handler := p[0].s.relatedDocsHandler
+
+       index, err := handler.getOrCreateIndex(p)
+       if err != nil {
+               return nil, err
+       }
+
+       docs, err := search(index)
+       if err != nil {
+               return nil, err
+       }
+
+       if len(docs) == 0 {
+               return nil, nil
+       }
+
+       pages := make(Pages, 0, len(docs))
+       for _, doc := range docs {
+               // Every document in the index was added as a *Page.
+               pages = append(pages, doc.(*Page))
+       }
+       return pages, nil
+}
+
+// cachedPostingList pairs a page collection with the inverted index built
+// from it.
+type cachedPostingList struct {
+       // The pages the index was built from; compared against on lookup.
+       p Pages
+
+       postingList *related.InvertedIndex
+}
+
+// relatedDocsHandler holds the related content config and the inverted
+// indices created so far, one per page collection.
+type relatedDocsHandler struct {
+       // This is configured in site or language config.
+       cfg related.Config
+
+       // postingLists is read and appended to in getOrCreateIndex with mu held.
+       postingLists []*cachedPostingList
+       mu           sync.RWMutex
+}
+
+// newSearchIndexHandler creates a relatedDocsHandler with the given config
+// and no cached indices.
+func newSearchIndexHandler(cfg related.Config) *relatedDocsHandler {
+       handler := new(relatedDocsHandler)
+       handler.cfg = cfg
+       return handler
+}
+
+// getIndex returns the cached index whose pages compare equal to p according
+// to fastEqualPages, or nil if there is none.
+// This assumes that a lock has been acquired.
+func (s *relatedDocsHandler) getIndex(p Pages) *related.InvertedIndex {
+       for i := range s.postingLists {
+               if cached := s.postingLists[i]; fastEqualPages(p, cached.p) {
+                       return cached.postingList
+               }
+       }
+       return nil
+}
+
+// getOrCreateIndex returns the inverted index cached for p, or builds one
+// from the pages in p and caches it if none exists.
+// The cache is first consulted under the read lock. On a miss the write lock
+// is taken and the cache is checked once more before the index is built.
+func (s *relatedDocsHandler) getOrCreateIndex(p Pages) (*related.InvertedIndex, error) {
+       s.mu.RLock()
+       existing := s.getIndex(p)
+       s.mu.RUnlock()
+       if existing != nil {
+               return existing, nil
+       }
+
+       s.mu.Lock()
+       defer s.mu.Unlock()
+
+       // Check again now that the write lock is held.
+       if existing = s.getIndex(p); existing != nil {
+               return existing, nil
+       }
+
+       created := related.NewInvertedIndex(s.cfg)
+       for i := range p {
+               if err := created.Add(p[i]); err != nil {
+                       return nil, err
+               }
+       }
+
+       entry := &cachedPostingList{p: p, postingList: created}
+       s.postingLists = append(s.postingLists, entry)
+
+       return created, nil
+}
diff --git a/hugolib/pages_related_test.go b/hugolib/pages_related_test.go
new file mode 100644 (file)
index 0000000..cf5da09
--- /dev/null
@@ -0,0 +1,75 @@
+// Copyright 2017-present The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hugolib
+
+import (
+       "fmt"
+       "path/filepath"
+       "testing"
+
+       "github.com/gohugoio/hugo/common/types"
+       "github.com/gohugoio/hugo/deps"
+
+       "github.com/stretchr/testify/require"
+)
+
+// TestRelated builds a small site with three pages and checks the result
+// and ordering of RelatedTo, Related and RelatedIndices. The error returned
+// by every search is checked before the result is inspected.
+func TestRelated(t *testing.T) {
+       assert := require.New(t)
+
+       t.Parallel()
+
+       cfg, fs := newTestCfg()
+
+       pageTmpl := `---
+title: Page %d
+keywords: [%s]
+date: %s
+---
+
+Content
+`
+
+       writeSource(t, fs, filepath.Join("content", "page1.md"), fmt.Sprintf(pageTmpl, 1, "hugo, says", "2017-01-03"))
+       writeSource(t, fs, filepath.Join("content", "page2.md"), fmt.Sprintf(pageTmpl, 2, "hugo, rocks", "2017-01-02"))
+       writeSource(t, fs, filepath.Join("content", "page3.md"), fmt.Sprintf(pageTmpl, 3, "bep, says", "2017-01-01"))
+
+       s := buildSingleSite(t, deps.DepsCfg{Fs: fs, Cfg: cfg}, BuildCfg{SkipRender: true})
+       assert.Len(s.RegularPages, 3)
+
+       result, err := s.RegularPages.RelatedTo(types.NewKeyValuesStrings("keywords", "hugo", "rocks"))
+
+       assert.NoError(err)
+       assert.Len(result, 2)
+       assert.Equal("Page 2", result[0].Title)
+       assert.Equal("Page 1", result[1].Title)
+
+       result, err = s.RegularPages.Related(s.RegularPages[0])
+       assert.NoError(err)
+       assert.Len(result, 2)
+       assert.Equal("Page 2", result[0].Title)
+       assert.Equal("Page 3", result[1].Title)
+
+       result, err = s.RegularPages.RelatedIndices(s.RegularPages[0], "keywords")
+       assert.NoError(err)
+       assert.Len(result, 2)
+       assert.Equal("Page 2", result[0].Title)
+       assert.Equal("Page 3", result[1].Title)
+
+       result, err = s.RegularPages.RelatedTo(types.NewKeyValuesStrings("keywords", "bep", "rocks"))
+       assert.NoError(err)
+       assert.Len(result, 2)
+       assert.Equal("Page 2", result[0].Title)
+       assert.Equal("Page 3", result[1].Title)
+
+}
index 13ca7f144d64d47d1e5c6af0948d775ba35fe289..b8898264a94b6d3d8819aea5367602d06ed6efb5 100644 (file)
@@ -42,6 +42,7 @@ import (
        "github.com/gohugoio/hugo/helpers"
        "github.com/gohugoio/hugo/output"
        "github.com/gohugoio/hugo/parser"
+       "github.com/gohugoio/hugo/related"
        "github.com/gohugoio/hugo/source"
        "github.com/gohugoio/hugo/tpl"
        "github.com/gohugoio/hugo/transform"
@@ -135,6 +136,8 @@ type Site struct {
        // The func used to title case titles.
        titleFunc func(s string) string
 
+       relatedDocsHandler *relatedDocsHandler
+
        siteStats *siteStats
 }
 
@@ -176,6 +179,7 @@ func (s *Site) reset() *Site {
                layoutHandler:       output.NewLayoutHandler(s.PathSpec.ThemeSet()),
                disabledKinds:       s.disabledKinds,
                titleFunc:           s.titleFunc,
+               relatedDocsHandler:  newSearchIndexHandler(s.relatedDocsHandler.cfg),
                outputFormats:       s.outputFormats,
                outputFormatsConfig: s.outputFormatsConfig,
                mediaTypesConfig:    s.mediaTypesConfig,
@@ -231,6 +235,21 @@ func newSite(cfg deps.DepsCfg) (*Site, error) {
                return nil, err
        }
 
+       var relatedContentConfig related.Config
+
+       if cfg.Language.IsSet("related") {
+               relatedContentConfig, err = related.DecodeConfig(cfg.Language.Get("related"))
+               if err != nil {
+                       return nil, err
+               }
+       } else {
+               relatedContentConfig = related.DefaultConfig
+               taxonomies := cfg.Language.GetStringMapString("taxonomies")
+               if _, found := taxonomies["tag"]; found {
+                       relatedContentConfig.Add(related.IndexConfig{Name: "tags", Weight: 80})
+               }
+       }
+
        titleFunc := helpers.GetTitleFunc(cfg.Language.GetString("titleCaseStyle"))
 
        s := &Site{
@@ -239,6 +258,7 @@ func newSite(cfg deps.DepsCfg) (*Site, error) {
                Language:            cfg.Language,
                disabledKinds:       disabledKinds,
                titleFunc:           titleFunc,
+               relatedDocsHandler:  newSearchIndexHandler(relatedContentConfig),
                outputFormats:       outputFormats,
                outputFormatsConfig: siteOutputFormatsConfig,
                mediaTypesConfig:    siteMediaTypesConfig,
@@ -1607,6 +1627,7 @@ func (s *Site) assembleTaxonomies() {
 // Prepare site for a new full build.
 func (s *Site) resetBuildState() {
 
+       s.relatedDocsHandler = newSearchIndexHandler(s.relatedDocsHandler.cfg)
        s.PageCollections = newPageCollectionsFromPages(s.rawAllPages)
        // TODO(bep) get rid of this double
        s.Info.PageCollections = s.PageCollections
diff --git a/related/inverted_index.go b/related/inverted_index.go
new file mode 100644 (file)
index 0000000..f0d598d
--- /dev/null
@@ -0,0 +1,450 @@
+// Copyright 2017-present The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package related holds code to help finding related content.
+package related
+
+import (
+       "errors"
+       "fmt"
+       "math"
+       "sort"
+       "strings"
+       "time"
+
+       "github.com/gohugoio/hugo/common/types"
+       "github.com/mitchellh/mapstructure"
+)
+
+var (
+       // Assert that StringKeyword satisfies the Keyword interface.
+       // zeroDate is the zero time.Time; search passes it to searchDate, which
+       // skips the date filter for a zero upper date.
+       _        Keyword = (*StringKeyword)(nil)
+       zeroDate         = time.Time{}
+
+       // DefaultConfig is the default related config.
+       DefaultConfig = Config{
+               Threshold: 80,
+               Indices: IndexConfigs{
+                       IndexConfig{Name: "keywords", Weight: 100},
+                       IndexConfig{Name: "date", Weight: 10},
+               },
+       }
+)
+
+/*
+Config is the top level configuration element used to configure how to retrieve
+related content in Hugo.
+
+An example site config.toml:
+
+       [related]
+       threshold = 1
+       [[related.indices]]
+       name = "keywords"
+       weight = 200
+       [[related.indices]]
+       name  = "tags"
+       weight = 100
+       [[related.indices]]
+       name  = "date"
+       weight = 1
+       pattern = "2006"
+*/
+type Config struct {
+       // Only include matches >= threshold, a normalized rank between 0 and 100.
+       Threshold int
+
+       // To get stable "See also" sections we, by default, exclude newer related pages.
+       IncludeNewer bool
+
+       // Will lower case all string values and queries to the indices.
+       // May get better results, but at a slight performance cost.
+       ToLower bool
+
+       // The configuration of each index that can be searched.
+       Indices IndexConfigs
+}
+
+// Add appends the given index configuration to c.Indices. If ToLower is set
+// on c, it is also enabled on the added index.
+func (c *Config) Add(index IndexConfig) {
+       index.ToLower = index.ToLower || c.ToLower
+       c.Indices = append(c.Indices, index)
+}
+
+// IndexConfigs holds a set of index configurations.
+type IndexConfigs []IndexConfig
+
+// IndexConfig configures an index.
+type IndexConfig struct {
+       // The index name. This directly maps to a field or Param name.
+       Name string
+
+       // Contextual pattern used to convert the Param value into a string.
+       // Currently only used for dates. Can be used to, say, bump posts in the same
+       // time frame when searching for related documents.
+       // For dates it follows Go's time.Format patterns, i.e.
+       // "2006" for YYYY and "200601" for YYYYMM.
+       Pattern string
+
+       // This field's weight when doing multi-index searches. Higher is "better".
+       Weight int
+
+       // Will lower case all string values in and queries to this index.
+       // May get more accurate results, but at a slight performance cost.
+       ToLower bool
+}
+
+// Document is the interface an indexable document in Hugo must fulfill.
+type Document interface {
+       // SearchKeywords returns a list of keywords for the given index config.
+       SearchKeywords(cfg IndexConfig) ([]Keyword, error)
+
+       // PubDate returns when this document was or will be published.
+       PubDate() time.Time
+}
+
+// InvertedIndex holds an inverted index, also sometimes named posting list, which
+// lists, for every possible search term, the documents that contain that term.
+type InvertedIndex struct {
+       cfg   Config
+       // index maps an index name to its keywords, and each keyword to the
+       // documents that were added with it.
+       index map[string]map[Keyword][]Document
+
+       // The lowest and highest configured index weight, set in NewInvertedIndex
+       // and used as the scale when normalizing match weights.
+       minWeight int
+       maxWeight int
+}
+
+// getIndexCfg returns the first configured index with the given name.
+// The bool is false if no index with that name is configured.
+func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
+       indices := idx.cfg.Indices
+       for i := range indices {
+               if indices[i].Name == name {
+                       return indices[i], true
+               }
+       }
+       return IndexConfig{}, false
+}
+
+// NewInvertedIndex creates a new InvertedIndex with an empty keyword map for
+// every configured index, and records the lowest and highest index weight.
+// Documents to index must be added in Add.
+func NewInvertedIndex(cfg Config) *InvertedIndex {
+       idx := &InvertedIndex{
+               cfg:   cfg,
+               index: make(map[string]map[Keyword][]Document),
+       }
+
+       for _, conf := range cfg.Indices {
+               idx.index[conf.Name] = make(map[Keyword][]Document)
+
+               w := conf.Weight
+               // By default, the weight scale starts at 0, but we allow
+               // negative weights.
+               if w < idx.minWeight {
+                       idx.minWeight = w
+               }
+               if w > idx.maxWeight {
+                       idx.maxWeight = w
+               }
+       }
+
+       return idx
+}
+
+// Add adds the given documents to every enabled index. An index with weight
+// 0 is treated as disabled and skipped.
+// Keywords are used as map keys, so their values must support == and !=.
+//
+// A document whose SearchKeywords call fails is skipped for that index and
+// indexing continues with the remaining documents. The first such error is
+// returned once all documents have been processed, so a failure is reported
+// even when later documents are indexed successfully.
+func (idx *InvertedIndex) Add(docs ...Document) error {
+       var firstErr error
+
+       for _, config := range idx.cfg.Indices {
+               if config.Weight == 0 {
+                       // Disabled
+                       continue
+               }
+               setm := idx.index[config.Name]
+
+               for _, doc := range docs {
+                       words, err := doc.SearchKeywords(config)
+                       if err != nil {
+                               // Remember the first failure; a later success must
+                               // not hide it.
+                               if firstErr == nil {
+                                       firstErr = err
+                               }
+                               continue
+                       }
+
+                       for _, keyword := range words {
+                               setm[keyword] = append(setm[keyword], doc)
+                       }
+               }
+       }
+
+       return firstErr
+}
+
+// queryElement holds the index name and keywords that can be used to compose a
+// search for related content.
+type queryElement struct {
+       Index    string
+       Keywords []Keyword
+}
+
+// newQueryElement creates a queryElement for the named index and keywords.
+func newQueryElement(index string, keywords ...Keyword) queryElement {
+       var el queryElement
+       el.Index = index
+       el.Keywords = keywords
+       return el
+}
+
+// ranks is a list of rank that implements sort.Interface; see Less for the
+// ordering.
+type ranks []*rank
+
+// rank holds the accumulated weight and the number of matches for a document.
+type rank struct {
+       Doc     Document
+       Weight  int
+       Matches int
+}
+
+// addWeight registers one more match with the given weight.
+func (r *rank) addWeight(w int) {
+       r.Matches++
+       r.Weight += w
+}
+
+// newRank creates the rank for the first match of doc.
+func newRank(doc Document, weight int) *rank {
+       return &rank{Matches: 1, Weight: weight, Doc: doc}
+}
+
+func (r ranks) Len() int { return len(r) }
+
+func (r ranks) Swap(i, j int) { r[j], r[i] = r[i], r[j] }
+
+// Less sorts by weight, highest first. For equal weights, the document with
+// the most recent PubDate comes first.
+func (r ranks) Less(i, j int) bool {
+       a, b := r[i], r[j]
+       if a.Weight != b.Weight {
+               return a.Weight > b.Weight
+       }
+       return a.Doc.PubDate().After(b.Doc.PubDate())
+}
+
+// SearchDoc finds the documents matching any of the keywords in the given indices
+// against the given document.
+// The resulting document set will be sorted according to number of matches
+// and the index weights, and any matches with a rank below the configured
+// threshold (normalize to 0..100) will be removed.
+// If one or more index names are provided, only those indices will be queried;
+// an unknown index name is an error.
+func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) {
+       // Default to all configured indices.
+       configs := idx.cfg.Indices
+       if len(indices) > 0 {
+               configs = make(IndexConfigs, 0, len(indices))
+               for _, name := range indices {
+                       cfg, found := idx.getIndexCfg(name)
+                       if !found {
+                               return nil, fmt.Errorf("index %q not found", name)
+                       }
+                       configs = append(configs, cfg)
+               }
+       }
+
+       var query []queryElement
+       for _, cfg := range configs {
+               keywords, err := doc.SearchKeywords(cfg)
+               if err != nil {
+                       return nil, err
+               }
+               query = append(query, newQueryElement(cfg.Name, keywords...))
+       }
+
+       // The document's own publish date is the upper limit for the date filter.
+       return idx.searchDate(doc.PubDate(), query...)
+}
+
+// ToKeywords converts v to a slice of Keyword according to this index
+// configuration.
+//
+// Supported types are string, []string and time.Time. Strings are lower
+// cased if ToLower is set. A time.Time is formatted with Pattern, or with
+// "2006" if no pattern is set. A nil v yields no keywords and no error; any
+// other type is reported as an error.
+//
+// The input is never modified: a []string is copied before it is lower
+// cased, because the slice shares its backing array with the caller's value.
+func (cfg IndexConfig) ToKeywords(v interface{}) ([]Keyword, error) {
+       var keywords []Keyword
+
+       switch vv := v.(type) {
+       case string:
+               if cfg.ToLower {
+                       vv = strings.ToLower(vv)
+               }
+               keywords = append(keywords, StringKeyword(vv))
+       case []string:
+               if cfg.ToLower {
+                       lowered := make([]string, len(vv))
+                       for i, s := range vv {
+                               lowered[i] = strings.ToLower(s)
+                       }
+                       vv = lowered
+               }
+               keywords = append(keywords, StringsToKeywords(vv...)...)
+       case time.Time:
+               layout := "2006"
+               if cfg.Pattern != "" {
+                       layout = cfg.Pattern
+               }
+               keywords = append(keywords, StringKeyword(vv.Format(layout)))
+       case nil:
+               return keywords, nil
+       default:
+               return keywords, fmt.Errorf("indexing currently not supported for index %q and type %T", cfg.Name, vv)
+       }
+
+       return keywords, nil
+}
+
+// SearchKeyValues finds the documents matching any of the keywords in the given indices.
+// The resulting document set will be sorted according to number of matches
+// and the index weights, and any matches with a rank below the configured
+// threshold (normalize to 0..100) will be removed.
+// Each key must convert to a non-empty string naming a configured index.
+func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) {
+       query := make([]queryElement, 0, len(args))
+
+       for _, arg := range args {
+               name := arg.KeyString()
+               if name == "" {
+                       return nil, fmt.Errorf("index %q not valid", arg.Key)
+               }
+
+               conf, found := idx.getIndexCfg(name)
+               if !found {
+                       return nil, fmt.Errorf("index %q not found", name)
+               }
+
+               var keywords []Keyword
+               for _, val := range arg.Values {
+                       converted, err := conf.ToKeywords(val)
+                       if err != nil {
+                               return nil, err
+                       }
+                       keywords = append(keywords, converted...)
+               }
+
+               query = append(query, newQueryElement(conf.Name, keywords...))
+       }
+
+       return idx.search(query...)
+}
+
+// search runs the query without an upper date limit.
+func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) {
+       var noUpperDate time.Time
+       return idx.searchDate(noUpperDate, query...)
+}
+
+// searchDate finds the documents matching any keyword in the query, ranks
+// them and returns those that pass the configured threshold.
+// Unless IncludeNewer is set, documents published after a non-zero upperDate
+// are excluded. An error is returned if a query element names an index that
+// does not exist or is not configured.
+func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) {
+       // matchm accumulates the rank for every matching document.
+       matchm := make(map[Document]*rank, 200)
+       applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero()
+
+       for _, el := range query {
+               setm, found := idx.index[el.Index]
+               if !found {
+                       return []Document{}, fmt.Errorf("index for %q not found", el.Index)
+               }
+
+               config, found := idx.getIndexCfg(el.Index)
+               if !found {
+                       return []Document{}, fmt.Errorf("index config for %q not found", el.Index)
+               }
+
+               for _, kw := range el.Keywords {
+                       if docs, found := setm[kw]; found {
+                               for _, doc := range docs {
+                                       if applyDateFilter {
+                                               // Exclude newer than the limit given
+                                               if doc.PubDate().After(upperDate) {
+                                                       continue
+                                               }
+                                       }
+                                       // Every keyword match adds the index weight to the
+                                       // document's rank.
+                                       r, found := matchm[doc]
+                                       if !found {
+                                               matchm[doc] = newRank(doc, config.Weight)
+                                       } else {
+                                               r.addWeight(config.Weight)
+                                       }
+                               }
+                       }
+               }
+       }
+
+       if len(matchm) == 0 {
+               return []Document{}, nil
+       }
+
+       matches := make(ranks, 0, 100)
+
+       for _, v := range matchm {
+               // The average weight per match is normalized against the configured
+               // weight range. The threshold is divided by the number of matches,
+               // so it gets lower the more matches a document has.
+               avgWeight := v.Weight / v.Matches
+               weight := norm(avgWeight, idx.minWeight, idx.maxWeight)
+               threshold := idx.cfg.Threshold / v.Matches
+
+               if weight >= threshold {
+                       matches = append(matches, v)
+               }
+       }
+
+       // Order the matches as defined by the sort.Interface implementation of ranks.
+       sort.Stable(matches)
+
+       result := make([]Document, len(matches))
+
+       for i, m := range matches {
+               result[i] = m.Doc
+       }
+
+       return result, nil
+}
+
+// norm normalizes num to a number between 0 and 100, where min maps to 0 and
+// max maps to 100. The result is rounded to the nearest integer.
+// It panics if min > max.
+// NOTE(review): min == max divides by zero; confirm that callers never pass
+// an empty weight range.
+func norm(num, min, max int) int {
+       if min > max {
+               panic("min > max")
+       }
+       offset := float64(num - min)
+       span := float64(max - min)
+       return int(math.Floor(offset/span*100 + 0.5))
+}
+
+// DecodeConfig decodes in, which must be a non-empty map[string]interface{},
+// into a Config.
+// The threshold must be between 0 and 100. If ToLower is set on the config,
+// it is also enabled on every index.
+func DecodeConfig(in interface{}) (Config, error) {
+       if in == nil {
+               return Config{}, errors.New("no related config provided")
+       }
+
+       m, isMap := in.(map[string]interface{})
+       switch {
+       case !isMap:
+               return Config{}, fmt.Errorf("expected map[string]interface {} got %T", in)
+       case len(m) == 0:
+               return Config{}, errors.New("empty related config provided")
+       }
+
+       var c Config
+       if err := mapstructure.WeakDecode(m, &c); err != nil {
+               return c, err
+       }
+
+       if c.Threshold < 0 || c.Threshold > 100 {
+               return Config{}, errors.New("related threshold must be between 0 and 100")
+       }
+
+       if c.ToLower {
+               for i := range c.Indices {
+                       c.Indices[i].ToLower = true
+               }
+       }
+
+       return c, nil
+}
+
+// StringKeyword is a string search keyword.
+type StringKeyword string
+
+// String returns the keyword as a plain string.
+func (s StringKeyword) String() string {
+       return string(s)
+}
+
+// Keyword is the interface a keyword in the search index must implement.
+// Keywords are used as map keys in the index.
+type Keyword interface {
+       String() string
+}
+
+// StringsToKeywords converts the given slice of strings to a slice of Keyword,
+// with every string wrapped in a StringKeyword.
+func StringsToKeywords(s ...string) []Keyword {
+       keywords := make([]Keyword, 0, len(s))
+       for _, v := range s {
+               keywords = append(keywords, StringKeyword(v))
+       }
+       return keywords
+}
diff --git a/related/inverted_index_test.go b/related/inverted_index_test.go
new file mode 100644 (file)
index 0000000..781a969
--- /dev/null
@@ -0,0 +1,276 @@
+// Copyright 2017-present The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package related
+
+import (
+       "fmt"
+       "math/rand"
+       "testing"
+       "time"
+
+       "github.com/stretchr/testify/require"
+)
+
+// testDoc is a minimal document used by the tests and benchmarks in this file.
+type testDoc struct {
+       // keywords holds the document's keywords, keyed by index name.
+       keywords map[string][]Keyword
+       // date is the value returned by PubDate.
+       date     time.Time
+}
+
+// String renders the document's keywords as text, one line per index name.
+// The lines follow map iteration order, so their order is not fixed.
+func (d *testDoc) String() string {
+       out := "\n"
+       for name, kws := range d.keywords {
+               line := name + ":\t\t"
+               for _, kw := range kws {
+                       line += "  " + kw.String()
+               }
+               out += line + "\n"
+       }
+       return out
+}
+
+// newTestDoc creates a test document holding the given keywords under the
+// index called name.
+//
+// It sleeps for a millisecond before stamping the document with the current
+// time, presumably so that documents created one after another get distinct,
+// increasing dates.
+func newTestDoc(name string, keywords ...string) *testDoc {
+       time.Sleep(1 * time.Millisecond)
+
+       doc := &testDoc{
+               keywords: make(map[string][]Keyword),
+               date:     time.Now(),
+       }
+       doc.addKeywords(name, keywords...)
+
+       return doc
+}
+
+// addKeywords stores the given keywords under the index called name,
+// replacing anything already stored under that name, and returns the
+// receiver so that calls can be chained.
+func (d *testDoc) addKeywords(name string, keywords ...string) *testDoc {
+       for index, values := range createTestKeywords(name, keywords...) {
+               converted := make([]Keyword, 0, len(values))
+               for _, value := range values {
+                       converted = append(converted, StringKeyword(value))
+               }
+               d.keywords[index] = converted
+       }
+       return d
+}
+
+// createTestKeywords returns a map with a single entry that maps name to the
+// given keywords.
+func createTestKeywords(name string, keywords ...string) map[string][]string {
+       m := make(map[string][]string, 1)
+       m[name] = keywords
+       return m
+}
+
+// SearchKeywords returns the keywords stored under the name of the given
+// index configuration. The returned error is always nil.
+func (d *testDoc) SearchKeywords(cfg IndexConfig) ([]Keyword, error) {
+       kws := d.keywords[cfg.Name]
+       return kws, nil
+}
+
+// PubDate returns the date assigned to the document.
+func (d *testDoc) PubDate() time.Time {
+       published := d.date
+       return published
+}
+
+// TestSearch builds an inverted index over four test documents and runs a
+// series of subtests against it. The subtests share idx, and the last one
+// adds a document to it.
+func TestSearch(t *testing.T) {
+
+       config := Config{
+               Threshold:    90,
+               IncludeNewer: false,
+               Indices: IndexConfigs{
+                       IndexConfig{Name: "tags", Weight: 50},
+                       IndexConfig{Name: "keywords", Weight: 65},
+               },
+       }
+
+       idx := NewInvertedIndex(config)
+       //idx.debug = true
+
+       // The documents are created in order, so each one is stamped with a
+       // later time than the one before it (see newTestDoc).
+       docs := []Document{
+               newTestDoc("tags", "a", "b", "c", "d"),
+               newTestDoc("tags", "b", "d", "g"),
+               newTestDoc("tags", "b", "h").addKeywords("keywords", "a"),
+               newTestDoc("tags", "g", "h").addKeywords("keywords", "a", "b"),
+       }
+
+       idx.Add(docs...)
+
+       // Checks the number of indices and the number of distinct keywords in each.
+       t.Run("count", func(t *testing.T) {
+               assert := require.New(t)
+               assert.Len(idx.index, 2)
+               set1, found := idx.index["tags"]
+               assert.True(found)
+               // 6 tags
+               assert.Len(set1, 6)
+
+               set2, found := idx.index["keywords"]
+               assert.True(found)
+               // 2 keywords
+               assert.Len(set2, 2)
+
+       })
+
+       // Queries the "tags" index only; "z" is not a tag of any document above.
+       t.Run("search-tags", func(t *testing.T) {
+               assert := require.New(t)
+               m, err := idx.search(newQueryElement("tags", StringsToKeywords("a", "b", "d", "z")...))
+               assert.NoError(err)
+               assert.Len(m, 2)
+               assert.Equal(docs[0], m[0])
+               assert.Equal(docs[1], m[1])
+       })
+
+       // Queries both indices in a single search.
+       t.Run("search-tags-and-keywords", func(t *testing.T) {
+               assert := require.New(t)
+               m, err := idx.search(
+                       newQueryElement("tags", StringsToKeywords("a", "b", "z")...),
+                       newQueryElement("keywords", StringsToKeywords("a", "b")...))
+               assert.NoError(err)
+               assert.Len(m, 3)
+               assert.Equal(docs[3], m[0])
+               assert.Equal(docs[2], m[1])
+               assert.Equal(docs[0], m[2])
+       })
+
+       // Calls SearchDoc without naming any index.
+       t.Run("searchdoc-all", func(t *testing.T) {
+               assert := require.New(t)
+               doc := newTestDoc("tags", "a").addKeywords("keywords", "a")
+               m, err := idx.SearchDoc(doc)
+               assert.NoError(err)
+               assert.Len(m, 2)
+               assert.Equal(docs[3], m[0])
+               assert.Equal(docs[2], m[1])
+       })
+
+       // Calls SearchDoc naming only the "tags" index.
+       t.Run("searchdoc-tags", func(t *testing.T) {
+               assert := require.New(t)
+               doc := newTestDoc("tags", "a", "b", "d", "z").addKeywords("keywords", "a", "b")
+               m, err := idx.SearchDoc(doc, "tags")
+               assert.NoError(err)
+               assert.Len(m, 2)
+               assert.Equal(docs[0], m[0])
+               assert.Equal(docs[1], m[1])
+       })
+
+       // Calls SearchDoc naming only the "keywords" index, after adding a
+       // document that was created later than the query document.
+       t.Run("searchdoc-keywords-date", func(t *testing.T) {
+               assert := require.New(t)
+               doc := newTestDoc("tags", "a", "b", "d", "z").addKeywords("keywords", "a", "b")
+               // This will get a date newer than the others.
+               newDoc := newTestDoc("keywords", "a", "b")
+               idx.Add(newDoc)
+
+               m, err := idx.SearchDoc(doc, "keywords")
+               assert.NoError(err)
+               assert.Len(m, 2)
+               assert.Equal(docs[3], m[0])
+       })
+
+}
+
+// BenchmarkRelatedNewIndex measures building a new inverted index from 100
+// test documents, adding them either one at a time ("singles") or in a single
+// call ("all").
+func BenchmarkRelatedNewIndex(b *testing.B) {
+       const (
+               numPages    = 100
+               numKeywords = 30
+       )
+
+       allKeywords := make([]string, 0, numKeywords)
+       for n := 1; n <= numKeywords; n++ {
+               allKeywords = append(allKeywords, fmt.Sprintf("keyword%d", n))
+       }
+
+       // randomKeywords picks three consecutive keywords starting at a random
+       // position, or a single keyword when that run would reach the end of
+       // the list.
+       randomKeywords := func() []string {
+               start := rand.Intn(len(allKeywords))
+               end := start + 3
+               if end >= len(allKeywords) {
+                       end = start + 1
+               }
+               return allKeywords[start:end]
+       }
+
+       pages := make([]*testDoc, numPages)
+       for i := range pages {
+               page := newTestDoc("tags", randomKeywords()...)
+               // Every fifth page also gets entries in the "keywords" index.
+               if i%5 == 0 {
+                       page.addKeywords("keywords", randomKeywords()...)
+               }
+               pages[i] = page
+       }
+
+       cfg := Config{
+               Threshold: 50,
+               Indices: IndexConfigs{
+                       IndexConfig{Name: "tags", Weight: 100},
+                       IndexConfig{Name: "keywords", Weight: 200},
+               },
+       }
+
+       b.Run("singles", func(b *testing.B) {
+               for n := 0; n < b.N; n++ {
+                       index := NewInvertedIndex(cfg)
+                       for _, page := range pages {
+                               index.Add(page)
+                       }
+               }
+       })
+
+       b.Run("all", func(b *testing.B) {
+               for n := 0; n < b.N; n++ {
+                       index := NewInvertedIndex(cfg)
+                       docs := make([]Document, 0, len(pages))
+                       for _, page := range pages {
+                               docs = append(docs, page)
+                       }
+                       index.Add(docs...)
+               }
+       })
+}
+
+// BenchmarkRelatedMatchesIn measures searching an index holding 1000 test
+// documents. Every tenth iteration queries the "keywords" index; all other
+// iterations query the "tags" index.
+func BenchmarkRelatedMatchesIn(b *testing.B) {
+       // Only the number of documents matters; they are added straight to the
+       // index, so no slice is needed to hold them.
+       const numDocs = 1000
+
+       // "keyword32" and "asdf" are not among the keywords generated below.
+       q1 := newQueryElement("tags", StringsToKeywords("keyword2", "keyword5", "keyword32", "asdf")...)
+       q2 := newQueryElement("keywords", StringsToKeywords("keyword3", "keyword4")...)
+
+       numkeywords := 20
+       allKeywords := make([]string, numkeywords)
+       for i := 0; i < numkeywords; i++ {
+               allKeywords[i] = fmt.Sprintf("keyword%d", i+1)
+       }
+
+       cfg := Config{
+               Threshold: 20,
+               Indices: IndexConfigs{
+                       IndexConfig{Name: "tags", Weight: 100},
+                       IndexConfig{Name: "keywords", Weight: 200},
+               },
+       }
+
+       idx := NewInvertedIndex(cfg)
+
+       for i := 0; i < numDocs; i++ {
+               start := rand.Intn(len(allKeywords))
+               end := start + 3
+               if end >= len(allKeywords) {
+                       end = start + 1
+               }
+
+               // Every fifth document goes into the "keywords" index instead of "tags".
+               index := "tags"
+               if i%5 == 0 {
+                       index = "keywords"
+               }
+
+               idx.Add(newTestDoc(index, allKeywords[start:end]...))
+       }
+
+       // Exclude the index setup above from the measurement.
+       b.ResetTimer()
+       for i := 0; i < b.N; i++ {
+               q := q1
+               if i%10 == 0 {
+                       q = q2
+               }
+               // A failing search would make the measurement meaningless.
+               if _, err := idx.search(q); err != nil {
+                       b.Fatal(err)
+               }
+       }
+}
index bc80acbbec6361bc415f9c2eac9d8257a167b7f5..8f09097a75a12b90c695aed86df1d8aaa0290b4e 100644 (file)
@@ -23,6 +23,7 @@ import (
        "strings"
        "time"
 
+       "github.com/gohugoio/hugo/common/types"
        "github.com/gohugoio/hugo/deps"
        "github.com/gohugoio/hugo/helpers"
        "github.com/spf13/cast"
@@ -641,3 +642,8 @@ func (ns *Namespace) Uniq(l interface{}) (interface{}, error) {
        }
        return ret.Interface(), nil
 }
+
+// KeyVals wraps the given key and values in a types.KeyValues.
+// The returned error is always nil.
+func (ns *Namespace) KeyVals(key interface{}, vals ...interface{}) (types.KeyValues, error) {
+       kv := types.KeyValues{
+               Key:    key,
+               Values: vals,
+       }
+       return kv, nil
+}
index 4a7c2d875dc7ecddaafa94a1a6d2d8965a1a2719..91b0dea01375056b4c426e77f2e6b3bb1bc4e7d1 100644 (file)
@@ -63,6 +63,13 @@ func init() {
                        [][2]string{},
                )
 
+               ns.AddMethodMapping(ctx.KeyVals,
+                       []string{"keyVals"},
+                       [][2]string{
+                               {`{{ keyVals "key" "a" "b" }}`, `key: [a b]`},
+                       },
+               )
+
                ns.AddMethodMapping(ctx.In,
                        []string{"in"},
                        [][2]string{