arm64: fpsimd: run kernel mode NEON with softirqs disabled
author: Ard Biesheuvel <ardb@kernel.org>
Tue, 2 Mar 2021 09:01:12 +0000 (10:01 +0100)
committer: Catalin Marinas <catalin.marinas@arm.com>
Mon, 12 Apr 2021 10:55:34 +0000 (11:55 +0100)
Kernel mode NEON can be used in task or softirq context, but only in
a non-nesting manner, i.e., softirq context is only permitted if the
interrupt was not taken at a point where the kernel was using the NEON
in task context.

This means all users of kernel mode NEON have to be aware of this
limitation, and either need to provide scalar fallbacks that may be much
slower (up to 20x for AES instructions) and potentially less safe, or
use an asynchronous interface that defers processing to a later time
when the NEON is guaranteed to be available.

Given that grabbing and releasing the NEON is cheap, we can relax this
restriction, by increasing the granularity of kernel mode NEON code, and
always disabling softirq processing while the NEON is being used in task
context.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210302090118.30666-4-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
arch/arm64/crypto/aes-modes.S
arch/arm64/crypto/sha1-ce-core.S
arch/arm64/crypto/sha2-ce-core.S
arch/arm64/crypto/sha3-ce-core.S
arch/arm64/crypto/sha512-ce-core.S
arch/arm64/include/asm/assembler.h
arch/arm64/kernel/asm-offsets.c
arch/arm64/kernel/fpsimd.c

diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index bbdb54702aa7a45d40fd26b7c2db5e2c98ffbca9..ab6c14ef9f4ed91b3abd6c21cedb527bcbd90c0c 100644 (file)
@@ -700,7 +700,7 @@ AES_FUNC_START(aes_mac_update)
        cbz             w5, .Lmacout
        encrypt_block   v0, w2, x1, x7, w8
        st1             {v0.16b}, [x4]                  /* return dg */
-       cond_yield      .Lmacout, x7
+       cond_yield      .Lmacout, x7, x8
        b               .Lmacloop4x
 .Lmac1x:
        add             w3, w3, #4
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
index 8c02bbc2684ed7f88428bf3f4b96482a64c202a5..889ca0f8972b3736c044a1f80bdccf5ddc41e4df 100644 (file)
@@ -121,7 +121,7 @@ CPU_LE(     rev32           v11.16b, v11.16b        )
        add             dgav.4s, dgav.4s, dg0v.4s
 
        cbz             w2, 2f
-       cond_yield      3f, x5
+       cond_yield      3f, x5, x6
        b               0b
 
        /*
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 6cdea7d560593ba90adc0047958823fd8057da7b..491179922f49808f1144a7a313b3eb647067d17e 100644 (file)
@@ -129,7 +129,7 @@ CPU_LE(     rev32           v19.16b, v19.16b        )
 
        /* handled all input blocks? */
        cbz             w2, 2f
-       cond_yield      3f, x5
+       cond_yield      3f, x5, x6
        b               0b
 
        /*
diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
index 6f5208414fe3fdb03c33482a26eda8d0cddc09e4..9c77313f5a60885231e9374b8853a0a3252d3504 100644 (file)
@@ -184,11 +184,11 @@ SYM_FUNC_START(sha3_ce_transform)
        eor      v0.16b,  v0.16b, v31.16b
 
        cbnz    w8, 3b
-       cond_yield 3f, x8
+       cond_yield 4f, x8, x9
        cbnz    w2, 0b
 
        /* save state */
-3:     st1     { v0.1d- v3.1d}, [x0], #32
+4:     st1     { v0.1d- v3.1d}, [x0], #32
        st1     { v4.1d- v7.1d}, [x0], #32
        st1     { v8.1d-v11.1d}, [x0], #32
        st1     {v12.1d-v15.1d}, [x0], #32
diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
index d6e7f6c95fa6f26f20471426ca04bc7498178c5b..b6a3a36e15f58cf98c7829bc2ad746349d23a74e 100644 (file)
@@ -195,7 +195,7 @@ CPU_LE(     rev64           v19.16b, v19.16b        )
        add             v10.2d, v10.2d, v2.2d
        add             v11.2d, v11.2d, v3.2d
 
-       cond_yield      3f, x4
+       cond_yield      3f, x4, x5
        /* handled all input blocks? */
        cbnz            w2, 0b
 
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 7b076ccd1a54d9a6c3a5da45b5d8795cd1f41f5b..6ac38f7cf824da0b89fae861ff79769b49d7f2e7 100644 (file)
@@ -15,6 +15,7 @@
 #include <asm-generic/export.h>
 
 #include <asm/asm-offsets.h>
+#include <asm/alternative.h>
 #include <asm/cpufeature.h>
 #include <asm/cputype.h>
 #include <asm/debug-monitors.h>
@@ -701,19 +702,32 @@ USER(\label, ic   ivau, \tmp2)                    // invalidate I line PoU
 .endm
 
        /*
-        * Check whether preempt-disabled code should yield as soon as it
-        * is able. This is the case if re-enabling preemption a single
-        * time results in a preempt count of zero, and the TIF_NEED_RESCHED
-        * flag is set. (Note that the latter is stored negated in the
-        * top word of the thread_info::preempt_count field)
+        * Check whether preempt/bh-disabled asm code should yield as soon as
+        * it is able. This is the case if we are currently running in task
+        * context, and either a softirq is pending, or the TIF_NEED_RESCHED
+        * flag is set and re-enabling preemption a single time would result in
+        * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
+        * stored negated in the top word of the thread_info::preempt_count
+        * field)
         */
-       .macro          cond_yield, lbl:req, tmp:req
-#ifdef CONFIG_PREEMPTION
+       .macro          cond_yield, lbl:req, tmp:req, tmp2:req
        get_current_task \tmp
        ldr             \tmp, [\tmp, #TSK_TI_PREEMPT]
+       /*
+        * If we are serving a softirq, there is no point in yielding: the
+        * softirq will not be preempted no matter what we do, so we should
+        * run to completion as quickly as we can.
+        */
+       tbnz            \tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@
+#ifdef CONFIG_PREEMPTION
        sub             \tmp, \tmp, #PREEMPT_DISABLE_OFFSET
        cbz             \tmp, \lbl
 #endif
+       adr_l           \tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
+       this_cpu_offset \tmp2
+       ldr             w\tmp, [\tmp, \tmp2]
+       cbnz            w\tmp, \lbl     // yield on pending softirq in task context
+.Lnoyield_\@:
        .endm
 
 /*
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index a36e2fc330d430a714f901595ee9efc02bedab99..cc7267a24bf72c7abba69b447990105b5db867e0 100644 (file)
@@ -95,6 +95,8 @@ int main(void)
   DEFINE(DMA_FROM_DEVICE,      DMA_FROM_DEVICE);
   BLANK();
   DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
+  DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT);
+  DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
   BLANK();
   DEFINE(CPU_BOOT_STACK,       offsetof(struct secondary_data, stack));
   DEFINE(CPU_BOOT_TASK,                offsetof(struct secondary_data, task));
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 062b21f30f9422aa03d6cf0eb8cc56e2ba8032d9..823e3a8a8871bc14007571c5b4a84bb1e8dfd7ed 100644 (file)
@@ -180,7 +180,7 @@ static void __get_cpu_fpsimd_context(void)
  */
 static void get_cpu_fpsimd_context(void)
 {
-       preempt_disable();
+       local_bh_disable();
        __get_cpu_fpsimd_context();
 }
 
@@ -201,7 +201,7 @@ static void __put_cpu_fpsimd_context(void)
 static void put_cpu_fpsimd_context(void)
 {
        __put_cpu_fpsimd_context();
-       preempt_enable();
+       local_bh_enable();
 }
 
 static bool have_cpu_fpsimd_context(void)