WordCount Summary support UTF-8 string
author coderzh <pythonzh@gmail.com>
Thu, 3 Sep 2015 10:22:20 +0000 (18:22 +0800)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Sat, 12 Sep 2015 13:41:17 +0000 (15:41 +0200)
helpers/content.go
helpers/content_test.go
hugolib/page.go

diff --git a/helpers/content.go b/helpers/content.go
index 8e3fda5053d7e8d2bac05ece3ac8506838678b1c..6bb7ed4d36ccfafd08b389f0703712f6f972bc9d 100644
--- a/helpers/content.go
+++ b/helpers/content.go
@@ -19,6 +19,7 @@ package helpers
 
 import (
        "bytes"
+       "unicode/utf8"
        "html/template"
        "os/exec"
 
@@ -386,21 +387,57 @@ func TruncateWords(s string, max int) string {
 // and returns entire sentences from content, delimited by the int
 // and whether it's truncated or not.
 func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
-       if max >= len(words) {
-               return strings.Join(words, " "), false
+       count := 0
+       index, word := 0, ""
+       truncated := false
+       
+       for index, word = range words {
+               runeCount := utf8.RuneCountInString(word)
+               if len(word) == runeCount {
+                       count++;
+               } else {
+                       if count + runeCount <= max {
+                               count += runeCount
+                       } else {
+                               offset := 0
+                               for count < max {
+                                       _, width := utf8.DecodeRuneInString(word[offset:])
+                               offset += width
+                                       count++
+                               }
+                               words[index] = word[:offset]
+                               truncated = true
+                       }
+               }
+               
+               if count >= max {
+                       if index < len(words) - 1 {
+                               truncated = true        
+                       }
+                       break
+               }
        }
-
-       for counter, word := range words[max:] {
-               if strings.HasSuffix(word, ".") ||
-                       strings.HasSuffix(word, "?") ||
-                       strings.HasSuffix(word, ".\"") ||
-                       strings.HasSuffix(word, "!") {
-                       upper := max + counter + 1
-                       return strings.Join(words[:upper], " "), (upper < len(words))
+       
+       index += 1
+       
+       if index < len(words) {
+               for counter, word := range words[index:] {
+                       if len(word) != utf8.RuneCountInString(word) {
+                               break
+                       }
+                       if strings.HasSuffix(word, ".") ||
+                               strings.HasSuffix(word, "?") ||
+                               strings.HasSuffix(word, ".\"") ||
+                               strings.HasSuffix(word, "!") {
+                               upper := index + counter + 1
+                               return strings.Join(words[:upper], " "), (upper < len(words))
+                       }
                }
+       } else if index > len(words) {
+               return strings.Join(words, " "), truncated
        }
-
-       return strings.Join(words[:max], " "), true
+       
+       return strings.Join(words[:index], " "), truncated
 }
 
 // GetAsciidocContent calls asciidoctor or asciidoc as an external helper
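diff --git a/helpers/content_test.go b/helpers/content_test.go
index 602ca3785e80636f7c48dd2c954901c986ed645a..f614011c0e6f15c00b0c881dd3e249331cedc201 100644
--- a/helpers/content_test.go
+++ b/helpers/content_test.go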
@@ -54,6 +54,8 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
                {"a b c", "a b c", 12, false},
                {"a b c", "a b c", 3, false},
                {"a", "a", 1, false},
+               {"Hello 中国", "Hello 中", 2, true},
+               {"Hello 中国", "Hello 中国", 3, false},
                {"This is a sentence.", "This is a sentence.", 5, false},
                {"This is also a sentence!", "This is also a sentence!", 1, false},
                {"To be. Or not to be. That's the question.", "To be.", 1, true},
diff --git a/hugolib/page.go b/hugolib/page.go
index b80e92257eedfdfd84f393593439df85e8dd423f..c50e2da18b23348773bd5965a003b87acd1fa226 100644
--- a/hugolib/page.go
+++ b/hugolib/page.go
@@ -31,6 +31,7 @@ import (
        "strings"
        "sync"
        "time"
+       "unicode/utf8"
 
        "github.com/spf13/cast"
        bp "github.com/spf13/hugo/bufferpool"
@@ -362,7 +363,16 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
 }
 
 func (p *Page) analyzePage() {
-       p.WordCount = len(p.PlainWords())
+       p.WordCount = 0
+       for _, word := range p.PlainWords() {
+               runeCount := utf8.RuneCountInString(word)
+               if len(word) == runeCount {
+                       p.WordCount++   
+               } else {
+                       p.WordCount += runeCount
+               }
+       }
+       
        p.FuzzyWordCount = int((p.WordCount+100)/100) * 100
        p.ReadingTime = int((p.WordCount + 212) / 213)
 }