crypto: x86/sha256-ni - optimize code size
author: Eric Biggers <ebiggers@google.com>
Thu, 11 Apr 2024 16:23:58 +0000 (09:23 -0700)
committer: Herbert Xu <herbert@gondor.apana.org.au>
Fri, 19 Apr 2024 10:54:18 +0000 (18:54 +0800)
- Load the SHA-256 round constants relative to a pointer that points
  into the middle of the constants rather than to the beginning.  Since
  x86 instructions use signed offsets, this decreases the instruction
  length required to access some of the later round constants.

- Use punpcklqdq or punpckhqdq instead of longer instructions such as
  pshufd, pblendw, and palignr.  This doesn't harm performance.

The end result is that sha256_ni_transform shrinks from 839 bytes to 791
bytes, with no loss in performance.

Suggested-by: Stefan Kanthak <stefan.kanthak@nexgo.de>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/x86/crypto/sha256_ni_asm.S

index b7e7001dafdfd75b4e54c22b1e365c641c74ec3d..ffc9f1c75c15a74c5e4a760cd4a8cc8476ab437d 100644 (file)
@@ -84,7 +84,7 @@
 .else
        movdqa          \m0, MSG
 .endif
-       paddd           \i*4(SHA256CONSTANTS), MSG
+       paddd           (\i-32)*4(SHA256CONSTANTS), MSG
        sha256rnds2     STATE0, STATE1
 .if \i >= 12 && \i < 60
        movdqa          \m0, TMP
@@ -92,7 +92,7 @@
        paddd           TMP, \m1
        sha256msg2      \m0, \m1
 .endif
-       pshufd          $0x0E, MSG, MSG
+       punpckhqdq      MSG, MSG
        sha256rnds2     STATE1, STATE0
 .if \i >= 4 && \i < 52
        sha256msg1      \m0, \m3
@@ -128,17 +128,17 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
         * Need to reorder these appropriately
         * DCBA, HGFE -> ABEF, CDGH
         */
-       movdqu          0*16(DIGEST_PTR), STATE0
-       movdqu          1*16(DIGEST_PTR), STATE1
+       movdqu          0*16(DIGEST_PTR), STATE0        /* DCBA */
+       movdqu          1*16(DIGEST_PTR), STATE1        /* HGFE */
 
-       pshufd          $0xB1, STATE0,  STATE0          /* CDAB */
-       pshufd          $0x1B, STATE1,  STATE1          /* EFGH */
        movdqa          STATE0, TMP
-       palignr         $8, STATE1,  STATE0             /* ABEF */
-       pblendw         $0xF0, TMP, STATE1              /* CDGH */
+       punpcklqdq      STATE1, STATE0                  /* FEBA */
+       punpckhqdq      TMP, STATE1                     /* DCHG */
+       pshufd          $0x1B, STATE0, STATE0           /* ABEF */
+       pshufd          $0xB1, STATE1, STATE1           /* CDGH */
 
        movdqa          PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-       lea             K256(%rip), SHA256CONSTANTS
+       lea             K256+32*4(%rip), SHA256CONSTANTS
 
 .Lloop0:
        /* Save hash values for addition after rounds */
@@ -162,14 +162,14 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
        jne             .Lloop0
 
        /* Write hash values back in the correct order */
-       pshufd          $0x1B, STATE0,  STATE0          /* FEBA */
-       pshufd          $0xB1, STATE1,  STATE1          /* DCHG */
        movdqa          STATE0, TMP
-       pblendw         $0xF0, STATE1,  STATE0          /* DCBA */
-       palignr         $8, TMP, STATE1                 /* HGFE */
+       punpcklqdq      STATE1, STATE0                  /* GHEF */
+       punpckhqdq      TMP, STATE1                     /* ABCD */
+       pshufd          $0xB1, STATE0, STATE0           /* HGFE */
+       pshufd          $0x1B, STATE1, STATE1           /* DCBA */
 
-       movdqu          STATE0, 0*16(DIGEST_PTR)
-       movdqu          STATE1, 1*16(DIGEST_PTR)
+       movdqu          STATE1, 0*16(DIGEST_PTR)
+       movdqu          STATE0, 1*16(DIGEST_PTR)
 
 .Ldone_hash: