kasan: speed up mte_set_mem_tag_range
author     Evgenii Stepanov <eugenis@google.com>
           Fri, 21 May 2021 01:00:23 +0000 (18:00 -0700)
committer  Will Deacon <will@kernel.org>
           Tue, 25 May 2021 18:21:58 +0000 (19:21 +0100)
Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.

The first cacheline is always tagged using STG/STZG even if the address is
cacheline-aligned, as benchmarks show this is faster than an alignment
check plus a conditional branch.
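
For illustration only (not part of the patch): a standalone, hosted-C
sketch of the three-phase range split the new code performs. The names
dczid_bs, end1, end2 and end3 mirror the patch; the block size and the
start address/size are made-up example values.

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  #define MTE_GRANULE_SIZE 16                /* one MTE tag granule */

  int main(void)
  {
          uint64_t dczid_bs = 64;            /* assumed DC (G)ZVA block size */
          uint64_t mask = dczid_bs - 1;
          uint64_t curr = 0x1010;            /* MTE_GRANULE_SIZE-aligned start */
          uint64_t size = 0x1f0;             /* MTE_GRANULE_SIZE-aligned size */

          uint64_t end1 = curr | mask;       /* last byte of the first block */
          uint64_t end3 = curr + size;       /* end of the whole range */
          uint64_t end2 = end3 & ~mask;      /* last block boundary in range */
          uint64_t p = curr;

          if (size >= 2 * dczid_bs) {
                  /* Head: STG/STZG granules, no alignment check needed. */
                  do {
                          p += MTE_GRANULE_SIZE;
                  } while (p < end1);
                  printf("head STG/STZG  [%#" PRIx64 ", %#" PRIx64 ")\n", curr, p);

                  /* Body: p is now block-aligned; one DC op per block. */
                  uint64_t body = p;
                  do {
                          p += dczid_bs;
                  } while (p < end2);
                  printf("body DC G(Z)VA [%#" PRIx64 ", %#" PRIx64 ")\n", body, p);
          }

          /* Tail: remaining granules, possibly none. */
          printf("tail STG/STZG  [%#" PRIx64 ", %#" PRIx64 ")\n", p, end3);
          return 0;
  }

With the example values this prints head [0x1010, 0x1040), body
[0x1040, 0x1200) and an empty tail, matching the loop bounds in the
patch below.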

Signed-off-by: Evgenii Stepanov <eugenis@google.com>
Co-developed-by: Peter Collingbourne <pcc@google.com>
Signed-off-by: Peter Collingbourne <pcc@google.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20210521010023.3244784-1-eugenis@google.com
Signed-off-by: Will Deacon <will@kernel.org>
diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index ddd4d17cf9a07b2a057ab4734e383c17309ff0b1..d952352bd0088d5ce8f602a3b39724d308f4b4ad 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -48,43 +48,84 @@ static inline u8 mte_get_random_tag(void)
        return mte_get_ptr_tag(addr);
 }
 
+static inline u64 __stg_post(u64 p)
+{
+       asm volatile(__MTE_PREAMBLE "stg %0, [%0], #16"
+                    : "+r"(p)
+                    :
+                    : "memory");
+       return p;
+}
+
+static inline u64 __stzg_post(u64 p)
+{
+       asm volatile(__MTE_PREAMBLE "stzg %0, [%0], #16"
+                    : "+r"(p)
+                    :
+                    : "memory");
+       return p;
+}
+
+static inline void __dc_gva(u64 p)
+{
+       asm volatile(__MTE_PREAMBLE "dc gva, %0" : : "r"(p) : "memory");
+}
+
+static inline void __dc_gzva(u64 p)
+{
+       asm volatile(__MTE_PREAMBLE "dc gzva, %0" : : "r"(p) : "memory");
+}
+
 /*
  * Assign allocation tags for a region of memory based on the pointer tag.
  * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
- * size must be non-zero and MTE_GRANULE_SIZE aligned.
+ * size must be MTE_GRANULE_SIZE aligned.
  */
-static inline void mte_set_mem_tag_range(void *addr, size_t size,
-                                               u8 tag, bool init)
+static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag,
+                                        bool init)
 {
-       u64 curr, end;
+       u64 curr, mask, dczid_bs, end1, end2, end3;
 
-       if (!size)
-               return;
+       /* Read DC G(Z)VA block size from the system register. */
+       dczid_bs = 4ul << (read_cpuid(DCZID_EL0) & 0xf);
 
        curr = (u64)__tag_set(addr, tag);
-       end = curr + size;
+       mask = dczid_bs - 1;
+       /* STG/STZG up to the end of the first block. */
+       end1 = curr | mask;
+       end3 = curr + size;
+       /* DC GVA / GZVA in [end1, end2) */
+       end2 = end3 & ~mask;
 
        /*
-        * 'asm volatile' is required to prevent the compiler to move
-        * the statement outside of the loop.
+        * The following code uses STG on the first DC GVA block even if the
+        * start address is aligned - it appears to be faster than an alignment
+        * check + conditional branch. Also, if the range size is at least 2 DC
+        * GVA blocks, the first two loops can use post-condition to save one
+        * branch each.
         */
-       if (init) {
-               do {
-                       asm volatile(__MTE_PREAMBLE "stzg %0, [%0]"
-                                    :
-                                    : "r" (curr)
-                                    : "memory");
-                       curr += MTE_GRANULE_SIZE;
-               } while (curr != end);
-       } else {
-               do {
-                       asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
-                                    :
-                                    : "r" (curr)
-                                    : "memory");
-                       curr += MTE_GRANULE_SIZE;
-               } while (curr != end);
-       }
+#define SET_MEMTAG_RANGE(stg_post, dc_gva)             \
+       do {                                            \
+               if (size >= 2 * dczid_bs) {             \
+                       do {                            \
+                               curr = stg_post(curr);  \
+                       } while (curr < end1);          \
+                                                       \
+                       do {                            \
+                               dc_gva(curr);           \
+                               curr += dczid_bs;       \
+                       } while (curr < end2);          \
+               }                                       \
+                                                       \
+               while (curr < end3)                     \
+                       curr = stg_post(curr);          \
+       } while (0)
+
+       if (init)
+               SET_MEMTAG_RANGE(__stzg_post, __dc_gzva);
+       else
+               SET_MEMTAG_RANGE(__stg_post, __dc_gva);
+#undef SET_MEMTAG_RANGE
 }
 
 void mte_enable_kernel_sync(void);
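
As an aside, the DC (G)ZVA block size used above comes from DCZID_EL0,
which is readable at EL0. A minimal userspace sketch (AArch64 only;
the printing is illustrative) of the same "4ul << BS" computation the
patch performs with read_cpuid(DCZID_EL0):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t dczid;

          /*
           * DCZID_EL0.BS (bits [3:0]) is log2 of the block size in
           * 4-byte words; bit 4 (DZP) set means DC ZVA is prohibited.
           */
          asm volatile("mrs %0, dczid_el0" : "=r"(dczid));

          if (dczid & (1 << 4))
                  puts("DC ZVA prohibited (DZP set)");
          else
                  printf("DC (G)ZVA block size: %lu bytes\n",
                         (unsigned long)(4ul << (dczid & 0xf)));
          return 0;
  }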