WordCount and Summary support CJK Language

author coderzh <pythonzh@gmail.com>

Thu, 3 Sep 2015 10:22:20 +0000 (18:22 +0800)

committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Wed, 7 Oct 2015 13:14:57 +0000 (15:14 +0200)
author coderzh <pythonzh@gmail.com>
Thu, 3 Sep 2015 10:22:20 +0000 (18:22 +0800)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Wed, 7 Oct 2015 13:14:57 +0000 (15:14 +0200)
diff --git a/commands/hugo.go b/commands/hugo.go

index b0e4964c4de56366416e2c5087296aff369fc786..718a3e6f66efb6da499ebee164fff75ac645ec37 100644 (file)
--- a/commands/hugo.go
+++ b/commands/hugo.go
@@ -168,6 +168,7 @@ func LoadDefaultSettings() {
         viper.SetDefault("RSSUri", "index.xml")
         viper.SetDefault("SectionPagesMenu", "")
         viper.SetDefault("DisablePathToLower", false)
+       viper.SetDefault("HasCJKLanguage", false)
  }
  
  // InitializeConfig initializes a config file with sensible default configuration flags.
diff --git a/helpers/content.go b/helpers/content.go

index 8c5c9cc7b2ebb0cbd7bb267231ed7d33775b16a6..847d4dcbc28003175255fd37dd80eabe09bb0a12 100644 (file)
--- a/helpers/content.go
+++ b/helpers/content.go
@@ -19,9 +19,9 @@ package helpers
  
  import (
         "bytes"
-       "unicode/utf8"
         "html/template"
         "os/exec"
+       "unicode/utf8"
  
         "github.com/miekg/mmark"
         "github.com/russross/blackfriday"
@@ -178,7 +178,6 @@ func GetHTMLRenderer(defaultFlags int, ctx *RenderingContext) blackfriday.Render
         }
  }
  
-
  func getMarkdownExtensions(ctx *RenderingContext) int {
         flags := 0 | blackfriday.EXTENSION_NO_INTRA_EMPHASIS |
                 blackfriday.EXTENSION_TABLES | blackfriday.EXTENSION_FENCED_CODE |
@@ -385,61 +384,51 @@ func TruncateWords(s string, max int) string {
         return strings.Join(words[:max], " ")
  }
  
-// TruncateWordsToWholeSentence takes content and an int
-// and returns entire sentences from content, delimited by the int
-// and whether it's truncated or not.
-func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
+func TruncateWordsByRune(words []string, max int) (string, bool) {
         count := 0
-       index, word := 0, ""
-       truncated := false
-       
-       for index, word = range words {
+       for index, word := range words {
+               if count >= max {
+                       return strings.Join(words[:index], " "), true
+               }
                 runeCount := utf8.RuneCountInString(word)
                 if len(word) == runeCount {
-                       count++;
+                       count++
+               } else if count+runeCount < max {
+                       count += runeCount
                 } else {
-                       if count + runeCount <= max {
-                               count += runeCount
-                       } else {
-                               offset := 0
-                               for count < max {
-                                       _, width := utf8.DecodeRuneInString(word[offset:])
-                               offset += width
+                       for ri, _ := range word {
+                               if count >= max {
+                                       truncatedWords := append(words[:index], word[:ri])
+                                       return strings.Join(truncatedWords, " "), true
+                               } else {
                                         count++
                                 }
-                               words[index] = word[:offset]
-                               truncated = true
-                       }
-               }
-               
-               if count >= max {
-                       if index < len(words) - 1 {
-                               truncated = true        
                         }
-                       break
                 }
         }
-       
-       index += 1
-       
-       if index < len(words) {
-               for counter, word := range words[index:] {
-                       if len(word) != utf8.RuneCountInString(word) {
-                               break
-                       }
-                       if strings.HasSuffix(word, ".") ||
-                               strings.HasSuffix(word, "?") ||
-                               strings.HasSuffix(word, ".\"") ||
-                               strings.HasSuffix(word, "!") {
-                               upper := index + counter + 1
-                               return strings.Join(words[:upper], " "), (upper < len(words))
-                       }
+
+       return strings.Join(words, " "), false
+}
+
+// TruncateWordsToWholeSentence takes content and an int
+// and returns entire sentences from content, delimited by the int
+// and whether it's truncated or not.
+func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
+       if max >= len(words) {
+               return strings.Join(words, " "), false
+       }
+
+       for counter, word := range words[max:] {
+               if strings.HasSuffix(word, ".") ||
+                       strings.HasSuffix(word, "?") ||
+                       strings.HasSuffix(word, ".\"") ||
+                       strings.HasSuffix(word, "!") {
+                       upper := max + counter + 1
+                       return strings.Join(words[:upper], " "), (upper < len(words))
                 }
-       } else if index > len(words) {
-               return strings.Join(words, " "), truncated
         }
-       
-       return strings.Join(words[:index], " "), truncated
+
+       return strings.Join(words[:max], " "), true
  }
  
  // GetAsciidocContent calls asciidoctor or asciidoc as an external helper
diff --git a/helpers/content_test.go b/helpers/content_test.go

index f614011c0e6f15c00b0c881dd3e249331cedc201..f0d76b6cea6120eeb3faac15a3a4cda7e92bcca6 100644 (file)
--- a/helpers/content_test.go
+++ b/helpers/content_test.go
@@ -1,10 +1,11 @@
  package helpers
  
  import (
-       "github.com/stretchr/testify/assert"
         "html/template"
         "strings"
         "testing"
+
+       "github.com/stretchr/testify/assert"
  )
  
  const tstHTMLContent = "<!DOCTYPE html><html><head><script src=\"http://two/foobar.js\"></script></head><body><nav><ul><li hugo-nav=\"section_0\"></li><li hugo-nav=\"section_1\"></li></ul></nav><article>content <a href=\"http://two/foobar\">foobar</a>. Follow up</article><p>This is some text.<br>And some more.</p></body></html>"
@@ -54,8 +55,6 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
                 {"a b c", "a b c", 12, false},
                 {"a b c", "a b c", 3, false},
                 {"a", "a", 1, false},
-               {"Hello 中国", "Hello 中", 2, true},
-               {"Hello 中国", "Hello 中国", 3, false},
                 {"This is a sentence.", "This is a sentence.", 5, false},
                 {"This is also a sentence!", "This is also a sentence!", 1, false},
                 {"To be. Or not to be. That's the question.", "To be.", 1, true},
@@ -72,3 +71,36 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
                 }
         }
  }
+
+func TestTruncateWordsByRune(t *testing.T) {
+       type test struct {
+               input, expected string
+               max             int
+               truncated       bool
+       }
+       data := []test{
+               {"", "", 1, false},
+               {"a b c", "a b c", 12, false},
+               {"a b c", "a b c", 3, false},
+               {"a", "a", 1, false},
+               {"Hello 中国", "", 0, true},
+               {"这是中文，全中文。", "这是中文，", 5, true},
+               {"Hello 中国", "Hello 中", 2, true},
+               {"Hello 中国", "Hello 中国", 3, false},
+               {"Hello中国 Good 好的", "Hello中国 Good 好", 9, true},
+               {"This is a sentence.", "This is", 2, true},
+               {"This is also a sentence!", "This", 1, true},
+               {"To be. Or not to be. That's the question.", "To be. Or not", 4, true},
+               {" \nThis is    not a sentence\n ", "This is not", 3, true},
+       }
+       for i, d := range data {
+               output, truncated := TruncateWordsByRune(strings.Fields(d.input), d.max)
+               if d.expected != output {
+                       t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
+               }
+
+               if d.truncated != truncated {
+                       t.Errorf("Test %d failed. Expected truncated=%t got %t", i, d.truncated, truncated)
+               }
+       }
+}
diff --git a/hugolib/page.go b/hugolib/page.go

index c50e2da18b23348773bd5965a003b87acd1fa226..e08e764af955baeee5ea2a69c5b789114088eeac 100644 (file)
--- a/hugolib/page.go
+++ b/hugolib/page.go
@@ -28,6 +28,7 @@ import (
         "net/url"
         "path"
         "path/filepath"
+       "regexp"
         "strings"
         "sync"
         "time"
@@ -42,6 +43,10 @@ import (
         "github.com/spf13/viper"
  )
  
+var (
+       cjk = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
+)
+
  type Page struct {
         Params          map[string]interface{}
         Content         template.HTML
@@ -67,7 +72,6 @@ type Page struct {
         contentShortCodes   map[string]string
         plain               string // TODO should be []byte
         plainWords          []string
-       plainRuneCount      int
         plainInit           sync.Once
         plainSecondaryInit  sync.Once
         renderingConfig     *helpers.Blackfriday
@@ -78,6 +82,7 @@ type Page struct {
         Node
         pageMenus     PageMenus
         pageMenusInit sync.Once
+       isCJKLanguage bool
  }
  
  type Source struct {
@@ -111,12 +116,6 @@ func (p *Page) PlainWords() []string {
         return p.plainWords
  }
  
-// RuneCount returns the rune count, excluding any whitespace, of the plain content.
-func (p *Page) RuneCount() int {
-       p.initPlainSecondary()
-       return p.plainRuneCount
-}
-
  func (p *Page) initPlain() {
         p.plainInit.Do(func() {
                 p.plain = helpers.StripHTML(string(p.Content))
@@ -125,20 +124,6 @@ func (p *Page) initPlain() {
         })
  }
  
-func (p *Page) initPlainSecondary() {
-       p.plainSecondaryInit.Do(func() {
-               p.initPlain()
-               runeCount := 0
-               for _, r := range p.plain {
-                       if !helpers.IsWhitespace(r) {
-                               runeCount++
-                       }
-               }
-               p.plainRuneCount = runeCount
-               return
-       })
-}
-
  func (p *Page) IsNode() bool {
         return false
  }
@@ -218,7 +203,13 @@ func (p *Page) setSummary() {
         } else {
                 // If hugo defines split:
                 // render, strip html, then split
-               summary, truncated := helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
+               var summary string
+               var truncated bool
+               if p.isCJKLanguage {
+                       summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength)
+               } else {
+                       summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
+               }
                 p.Summary = template.HTML(summary)
                 p.Truncated = truncated
  
@@ -363,18 +354,27 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
  }
  
  func (p *Page) analyzePage() {
-       p.WordCount = 0
-       for _, word := range p.PlainWords() {
-               runeCount := utf8.RuneCountInString(word)
-               if len(word) == runeCount {
-                       p.WordCount++   
-               } else {
-                       p.WordCount += runeCount
+       if p.isCJKLanguage {
+               p.WordCount = 0
+               for _, word := range p.PlainWords() {
+                       runeCount := utf8.RuneCountInString(word)
+                       if len(word) == runeCount {
+                               p.WordCount++
+                       } else {
+                               p.WordCount += runeCount
+                       }
                 }
+       } else {
+               p.WordCount = len(p.PlainWords())
         }
-       
+
         p.FuzzyWordCount = int((p.WordCount+100)/100) * 100
-       p.ReadingTime = int((p.WordCount + 212) / 213)
+
+       if p.isCJKLanguage {
+               p.ReadingTime = int((p.WordCount + 500) / 501)
+       } else {
+               p.ReadingTime = int((p.WordCount + 212) / 213)
+       }
  }
  
  func (p *Page) permalink() (*url.URL, error) {
@@ -481,7 +481,7 @@ func (p *Page) update(f interface{}) error {
         }
         m := f.(map[string]interface{})
         var err error
-       var draft, published *bool
+       var draft, published, isCJKLanguage *bool
         for k, v := range m {
                 loki := strings.ToLower(k)
                 switch loki {
@@ -542,6 +542,9 @@ func (p *Page) update(f interface{}) error {
                         p.Status = cast.ToString(v)
                 case "sitemap":
                         p.Sitemap = parseSitemap(cast.ToStringMap(v))
+               case "iscjklanguage":
+                       isCJKLanguage = new(bool)
+                       *isCJKLanguage = cast.ToBool(v)
                 default:
                         // If not one of the explicit values, store in Params
                         switch vv := v.(type) {
@@ -596,6 +599,16 @@ func (p *Page) update(f interface{}) error {
                 p.Lastmod = p.Date
         }
  
+       if isCJKLanguage != nil {
+               p.isCJKLanguage = *isCJKLanguage
+       } else if viper.GetBool("HasCJKLanguage") {
+               if cjk.Match(p.rawContent) {
+                       p.isCJKLanguage = true
+               } else {
+                       p.isCJKLanguage = false
+               }
+       }
+
         return nil
  
  }
@@ -766,6 +779,8 @@ func (p *Page) parse(reader io.Reader) error {
  
         p.renderable = psr.IsRenderable()
         p.frontmatter = psr.FrontMatter()
+       p.rawContent = psr.Content()
+
         meta, err := psr.Metadata()
         if meta != nil {
                 if err != nil {
@@ -778,8 +793,6 @@ func (p *Page) parse(reader io.Reader) error {
                 }
         }
  
-       p.rawContent = psr.Content()
-
         return nil
  }
  
diff --git a/hugolib/page_test.go b/hugolib/page_test.go

index c3506d48d4b7ae769ffe77c2ade11d2833289869..9134ba6c6445ecc6207fc53c05ef099f51d4fb4a 100644 (file)
--- a/hugolib/page_test.go
+++ b/hugolib/page_test.go
@@ -146,16 +146,67 @@ Summary Same Line<!--more-->
  Some more text
  `
  
-       SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES = `---
+       SIMPLE_PAGE_WITH_ALL_CJK_RUNES = `---
  title: Simple
  ---
  
  
  € € € € €
+你好
+도형이
+カテゴリー
  
  
  `
  
+       SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES = `---
+title: Simple
+---
+
+
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+More then 70 words.
+
+
+`
+       SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY = "In Chinese, 好 means good. In Chinese, 好 means good. " +
+               "In Chinese, 好 means good. In Chinese, 好 means good. " +
+               "In Chinese, 好 means good. In Chinese, 好 means good. " +
+               "In Chinese, 好 means good. In Chinese, 好 means good. " +
+               "In Chinese, 好 means good. In Chinese, 好 means good. " +
+               "In Chinese, 好 means good. In Chinese, 好 means good. " +
+               "In Chinese, 好 means good. In Chinese, 好 means good."
+
+       SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE = `---
+title: Simple
+isCJKLanguage: false
+---
+
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀呀 means good enough.
+More then 70 words.
+
+
+`
+       SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY = "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+               "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+               "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+               "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+               "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+               "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+               "In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough."
+
         SIMPLE_PAGE_WITH_LONG_CONTENT = `---
  title: Simple
  ---
@@ -584,18 +635,86 @@ func TestPageWithDate(t *testing.T) {
         checkPageDate(t, p, d)
  }
  
-func TestRuneCount(t *testing.T) {
+func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) {
+       viper.Reset()
+
         p, _ := NewPage("simple.md")
-       _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES))
+       _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES))
         p.Convert()
         p.analyzePage()
         if err != nil {
                 t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
         }
  
-       if p.RuneCount() != 5 {
-               t.Fatalf("incorrect rune count for content '%s'. expected %v, got %v", p.plain, 5, p.RuneCount())
+       if p.WordCount != 8 {
+               t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 8, p.WordCount)
+       }
+}
+
+func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) {
+       viper.Reset()
+       defer viper.Reset()
+
+       viper.Set("HasCJKLanguage", true)
+
+       p, _ := NewPage("simple.md")
+       _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES))
+       p.Convert()
+       p.analyzePage()
+       if err != nil {
+               t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
+       }
+
+       if p.WordCount != 15 {
+               t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 15, p.WordCount)
+       }
+}
+
+func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) {
+       viper.Reset()
+       defer viper.Reset()
+
+       viper.Set("HasCJKLanguage", true)
+
+       p, _ := NewPage("simple.md")
+       _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES))
+       p.Convert()
+       p.analyzePage()
+       if err != nil {
+               t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
+       }
+
+       if p.WordCount != 74 {
+               t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 74, p.WordCount)
+       }
+
+       if p.Summary != SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY {
+               t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
+                       SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY, p.Summary)
+       }
+}
+
+func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
+       viper.Reset()
+       defer viper.Reset()
+
+       viper.Set("HasCJKLanguage", true)
+
+       p, _ := NewPage("simple.md")
+       _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE))
+       p.Convert()
+       p.analyzePage()
+       if err != nil {
+               t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
+       }
+
+       if p.WordCount != 75 {
+               t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 75, p.WordCount)
+       }
  
+       if p.Summary != SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY {
+               t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
+                       SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY, p.Summary)
         }
  }
author	coderzh <pythonzh@gmail.com>
	Thu, 3 Sep 2015 10:22:20 +0000 (18:22 +0800)
committer	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Wed, 7 Oct 2015 13:14:57 +0000 (15:14 +0200)
commands/hugo.go		patch \| blob \| history
helpers/content.go		patch \| blob \| history
helpers/content_test.go		patch \| blob \| history
hugolib/page.go		patch \| blob \| history
hugolib/page_test.go		patch \| blob \| history