resource: Use MD5 to identify image files
authorBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Wed, 27 Dec 2017 18:31:42 +0000 (19:31 +0100)
committerBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Thu, 28 Dec 2017 16:41:51 +0000 (17:41 +0100)
But only a set of byte chunks spread around in the image file to calculate the fingerprint, which is much faster than reading the whole file:

```bash
BenchmarkMD5FromFileFast/full=false-4            300000       4356 ns/op      240 B/op        5 allocs/op
BenchmarkMD5FromFileFast/full=true-4              30000      42899 ns/op    32944 B/op        5 allocs/op
```

Fixes #4186

Gopkg.lock
Gopkg.toml
helpers/general.go
helpers/general_test.go
resource/image.go
resource/image_test.go
resource/resource.go

index 44f957d186d17ffb4a04182c380bbfb073d9ebc5..4e79fd5695826ca0e6d354776c7a2346d412001b 100644 (file)
     ".",
     "mem"
   ]
-  revision = "8d919cbe7e2627e417f3e45c3c0e489a5b7e2536"
-  version = "v1.0.0"
+  revision = "ec3a3111d1e1bdff38a61e09d5a5f5e974905611"
+  version = "v1.0.1"
 
 [[projects]]
   name = "github.com/spf13/cast"
 [solve-meta]
   analyzer-name = "dep"
   analyzer-version = 1
-  inputs-digest = "2d9c34c260bc26814a0635c93009daeb9d8ffa56c29c0cff6827ae2d3e9ef96d"
+  inputs-digest = "7259b4caf8e75db0b809f06d4897dc870261252e3aecd68ea1348c87a5da9d50"
   solver-name = "gps-cdcl"
   solver-version = 1
index cef7427cecea0aa7386d75bed2aff1df633929fd..cc2f6686738363971b31b6e0bfa1bff5f0a80f16 100644 (file)
@@ -82,7 +82,7 @@
 
 [[constraint]]
   name = "github.com/spf13/afero"
-  version = "1.0.0"
+  version = "1.0.1"
 
 [[constraint]]
   name = "github.com/spf13/cast"
index 4bb4eb5844eba88d1858bb7d13ed7ddb25823980..dcbac697e7a967ee2d992d7d1bc5d26342ea7599 100644 (file)
@@ -26,6 +26,8 @@ import (
        "unicode"
        "unicode/utf8"
 
+       "github.com/spf13/afero"
+
        "github.com/jdkato/prose/transform"
 
        bp "github.com/gohugoio/hugo/bufferpool"
@@ -372,6 +374,57 @@ func MD5String(f string) string {
        return hex.EncodeToString(h.Sum([]byte{}))
 }
 
+// MD5FromFileFast creates a MD5 hash from the given file. It only reads parts of
+// the file for speed, so don't use it if the files are very subtly different.
+// It will not close the file.
+func MD5FromFileFast(f afero.File) (string, error) {
+       const (
+               // Do not change once set in stone!
+               maxChunks = 8
+               peekSize  = 64
+               seek      = 2048
+       )
+
+       h := md5.New()
+       buff := make([]byte, peekSize)
+
+       for i := 0; i < maxChunks; i++ {
+               if i > 0 {
+                       _, err := f.Seek(seek, 0)
+                       if err != nil {
+                               if err == io.EOF {
+                                       break
+                               }
+                               return "", err
+                       }
+               }
+
+               _, err := io.ReadAtLeast(f, buff, peekSize)
+               if err != nil {
+                       if err == io.EOF || err == io.ErrUnexpectedEOF {
+                               h.Write(buff)
+                               break
+                       }
+                       return "", err
+               }
+               h.Write(buff)
+       }
+
+       h.Write(buff)
+
+       return hex.EncodeToString(h.Sum(nil)), nil
+}
+
+// MD5FromFile creates a MD5 hash from the given file.
+// It will not close the file.
+func MD5FromFile(f afero.File) (string, error) {
+       h := md5.New()
+       if _, err := io.Copy(h, f); err != nil {
+               return "", nil
+       }
+       return hex.EncodeToString(h.Sum(nil)), nil
+}
+
 // IsWhitespace determines if the given rune is whitespace.
 func IsWhitespace(r rune) bool {
        return r == ' ' || r == '\t' || r == '\n' || r == '\r'
index 2bca632e0d4a6087a5165a39c04514ee390879fd..6a7dd48835b759ccec27058d96add5b035516ad4 100644 (file)
 package helpers
 
 import (
+       "fmt"
        "reflect"
        "strings"
        "testing"
 
+       "github.com/spf13/afero"
        "github.com/stretchr/testify/assert"
        "github.com/stretchr/testify/require"
 )
@@ -270,3 +272,91 @@ func TestToLowerMap(t *testing.T) {
                }
        }
 }
+
+func TestFastMD5FromFile(t *testing.T) {
+       fs := afero.NewMemMapFs()
+
+       if err := afero.WriteFile(fs, "small.txt", []byte("abc"), 0777); err != nil {
+               t.Fatal(err)
+       }
+
+       if err := afero.WriteFile(fs, "small2.txt", []byte("abd"), 0777); err != nil {
+               t.Fatal(err)
+       }
+
+       if err := afero.WriteFile(fs, "bigger.txt", []byte(strings.Repeat("a bc d e", 100)), 0777); err != nil {
+               t.Fatal(err)
+       }
+
+       if err := afero.WriteFile(fs, "bigger2.txt", []byte(strings.Repeat("c d e f g", 100)), 0777); err != nil {
+               t.Fatal(err)
+       }
+
+       req := require.New(t)
+
+       sf1, err := fs.Open("small.txt")
+       req.NoError(err)
+       sf2, err := fs.Open("small2.txt")
+       req.NoError(err)
+
+       bf1, err := fs.Open("bigger.txt")
+       req.NoError(err)
+       bf2, err := fs.Open("bigger2.txt")
+       req.NoError(err)
+
+       defer sf1.Close()
+       defer sf2.Close()
+       defer bf1.Close()
+       defer bf2.Close()
+
+       m1, err := MD5FromFileFast(sf1)
+       req.NoError(err)
+       req.Equal("308d8a1127b46524b51507424071c22c", m1)
+
+       m2, err := MD5FromFileFast(sf2)
+       req.NoError(err)
+       req.NotEqual(m1, m2)
+
+       m3, err := MD5FromFileFast(bf1)
+       req.NoError(err)
+       req.NotEqual(m2, m3)
+
+       m4, err := MD5FromFileFast(bf2)
+       req.NoError(err)
+       req.NotEqual(m3, m4)
+
+       m5, err := MD5FromFile(bf2)
+       req.NoError(err)
+       req.NotEqual(m4, m5)
+}
+
+func BenchmarkMD5FromFileFast(b *testing.B) {
+       fs := afero.NewMemMapFs()
+
+       for _, full := range []bool{false, true} {
+               b.Run(fmt.Sprintf("full=%t", full), func(b *testing.B) {
+                       for i := 0; i < b.N; i++ {
+                               b.StopTimer()
+                               if err := afero.WriteFile(fs, "file.txt", []byte(strings.Repeat("1234567890", 2000)), 0777); err != nil {
+                                       b.Fatal(err)
+                               }
+                               f, err := fs.Open("file.txt")
+                               if err != nil {
+                                       b.Fatal(err)
+                               }
+                               b.StartTimer()
+                               if full {
+                                       if _, err := MD5FromFile(f); err != nil {
+                                               b.Fatal(err)
+                                       }
+                               } else {
+                                       if _, err := MD5FromFileFast(f); err != nil {
+                                               b.Fatal(err)
+                                       }
+                               }
+                               f.Close()
+                       }
+               })
+       }
+
+}
index c039f68b6d63d38bf4bce44da7e98f1dd7e05731..916f89a4869f487d18098aeada8bc7f0f863f5cd 100644 (file)
@@ -112,6 +112,8 @@ type Image struct {
 
        imaging *Imaging
 
+       hash string
+
        *genericResource
 }
 
@@ -129,6 +131,7 @@ func (i *Image) Height() int {
 func (i *Image) WithNewBase(base string) Resource {
        return &Image{
                imaging:         i.imaging,
+               hash:            i.hash,
                genericResource: i.genericResource.WithNewBase(base).(*genericResource)}
 }
 
@@ -490,6 +493,7 @@ func (i *Image) clone() *Image {
 
        return &Image{
                imaging:         i.imaging,
+               hash:            i.hash,
                genericResource: &g}
 }
 
@@ -497,20 +501,11 @@ func (i *Image) setBasePath(conf imageConfig) {
        i.rel = i.filenameFromConfig(conf)
 }
 
-// We need to set this to something static during tests.
-var fiModTimeFunc = func(fi os.FileInfo) int64 {
-       return fi.ModTime().Unix()
-}
-
 func (i *Image) filenameFromConfig(conf imageConfig) string {
        p1, p2 := helpers.FileAndExt(i.rel)
-       sizeModeStr := fmt.Sprintf("_S%d_T%d", i.osFileInfo.Size(), fiModTimeFunc(i.osFileInfo))
-       // On scaling an already scaled image, we get the file info from the original.
-       // Repeating the same info in the filename makes it stuttery for no good reason.
-       if strings.Contains(p1, sizeModeStr) {
-               sizeModeStr = ""
-       }
+       idStr := fmt.Sprintf("_H%s_%d", i.hash, i.osFileInfo.Size())
 
+       // Do not change for no good reason.
        const md5Threshold = 100
 
        key := conf.key()
@@ -518,12 +513,16 @@ func (i *Image) filenameFromConfig(conf imageConfig) string {
        // It is useful to have the key in clear text, but when nesting transforms, it
        // can easily be too long to read, and maybe even too long
        // for the different OSes to handle.
-       if len(p1)+len(sizeModeStr)+len(p2) > md5Threshold {
+       if len(p1)+len(idStr)+len(p2) > md5Threshold {
                key = helpers.MD5String(p1 + key + p2)
-               p1 = p1[:strings.Index(p1, "_S")]
+               p1 = p1[:strings.Index(p1, "_H")]
+       } else if strings.Contains(p1, idStr) {
+               // On scaling an already scaled image, we get the file info from the original.
+               // Repeating the same info in the filename makes it stuttery for no good reason.
+               idStr = ""
        }
 
-       return fmt.Sprintf("%s%s_%s%s", p1, sizeModeStr, key, p2)
+       return fmt.Sprintf("%s%s_%s%s", p1, idStr, key, p2)
 }
 
 func decodeImaging(m map[string]interface{}) (Imaging, error) {
index 3543abb37dd64b9dec260167fe461d81b2325ab5..61a9ef8440021770b6ac1e778622442b3b017136 100644 (file)
@@ -15,7 +15,6 @@ package resource
 
 import (
        "fmt"
-       "os"
        "testing"
 
        "github.com/stretchr/testify/require"
@@ -52,9 +51,6 @@ func TestParseImageConfig(t *testing.T) {
 }
 
 func TestImageTransform(t *testing.T) {
-       fiModTimeFunc = func(fi os.FileInfo) int64 {
-               return int64(10111213)
-       }
 
        assert := require.New(t)
 
@@ -86,13 +82,13 @@ func TestImageTransform(t *testing.T) {
        assert.Equal(200, resizedAndRotated.Height())
        assertFileCache(assert, image.spec.Fs, resizedAndRotated.RelPermalink(), 125, 200)
 
-       assert.Equal("/a/sunset_S90587_T10111213_300x200_resize_q75_box_center.jpg", resized.RelPermalink())
+       assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_300x200_resize_q75_box_center.jpg", resized.RelPermalink())
        assert.Equal(300, resized.Width())
        assert.Equal(200, resized.Height())
 
        fitted, err := resized.Fit("50x50")
        assert.NoError(err)
-       assert.Equal("/a/sunset_S90587_T10111213_300x200_resize_q75_box_center_50x50_fit_q75_box_center.jpg", fitted.RelPermalink())
+       assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_9b37eba4e4e6ea0cc56a59bb5aa98143.jpg", fitted.RelPermalink())
        assert.Equal(50, fitted.Width())
        assert.Equal(31, fitted.Height())
 
@@ -100,13 +96,13 @@ func TestImageTransform(t *testing.T) {
        fittedAgain, _ := fitted.Fit("10x20")
        fittedAgain, err = fittedAgain.Fit("10x20")
        assert.NoError(err)
-       assert.Equal("/a/sunset_f1fb715a17c42d5d4602a1870424d590.jpg", fittedAgain.RelPermalink())
+       assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_9a8be1402216c385e0dfd73e267c6827.jpg", fittedAgain.RelPermalink())
        assert.Equal(10, fittedAgain.Width())
        assert.Equal(6, fittedAgain.Height())
 
        filled, err := image.Fill("200x100 bottomLeft")
        assert.NoError(err)
-       assert.Equal("/a/sunset_S90587_T10111213_200x100_fill_q75_box_bottomleft.jpg", filled.RelPermalink())
+       assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_200x100_fill_q75_box_bottomleft.jpg", filled.RelPermalink())
        assert.Equal(200, filled.Width())
        assert.Equal(100, filled.Height())
        assertFileCache(assert, image.spec.Fs, filled.RelPermalink(), 200, 100)
index 2c934d0312e7595e58dff00004de1598777cd80b..19392f3d34d1015d65f1b0f08650712589e883a5 100644 (file)
@@ -153,7 +153,19 @@ func (r *Spec) newResource(
        gr := r.newGenericResource(linker, fi, absPublishDir, absSourceFilename, filepath.ToSlash(relTargetFilename), mimeType)
 
        if mimeType == "image" {
+               f, err := r.Fs.Source.Open(absSourceFilename)
+               if err != nil {
+                       return nil, err
+               }
+               defer f.Close()
+
+               hash, err := helpers.MD5FromFileFast(f)
+               if err != nil {
+                       return nil, err
+               }
+
                return &Image{
+                       hash:            hash,
                        imaging:         r.imaging,
                        genericResource: gr}, nil
        }