powerpc/64s: Implement queued spinlocks and rwlocks
authorNicholas Piggin <npiggin@gmail.com>
Fri, 24 Jul 2020 13:14:20 +0000 (23:14 +1000)
committerMichael Ellerman <mpe@ellerman.id.au>
Sun, 26 Jul 2020 14:01:23 +0000 (00:01 +1000)
These have shown significantly improved performance and fairness when
spinlock contention is moderate to high on very large systems.

With this series including subsequent patches, on a 16 socket 1536
thread POWER9, a stress test such as same-file open/close from all
CPUs gets big speedups, 11620 op/s aggregate with simple spinlocks vs
384158 op/s (33x faster), where the difference in throughput between
the fastest and slowest thread goes from 7x to 1.4x.

Thanks to the fast path being identical in terms of atomics and
barriers (after a subsequent optimisation patch), single threaded
performance is not changed (no measurable difference).

On smaller systems, performance and fairness seems to be generally
improved. Using dbench on tmpfs as a test (that starts to run into
kernel spinlock contention), a 2-socket OpenPOWER POWER9 system was
tested with bare metal and KVM guest configurations. Results can be
found here:

https://github.com/linuxppc/issues/issues/305#issuecomment-663487453

Observations are:

- Queued spinlocks are equal when contention is insignificant, as
  expected and as measured with microbenchmarks.

- When there is contention, on bare metal queued spinlocks have better
  throughput and max latency at all points.

- When virtualised, queued spinlocks are slightly worse approaching
  peak throughput, but significantly better throughput and max latency
  at all points beyond peak, until queued spinlock maximum latency
  rises when clients are 2x vCPUs.

The regressions haven't been analysed very well yet, there are a lot
of things that can be tuned, particularly the paravirtualised locking,
but the numbers already look like a good net win even on relatively
small systems.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200724131423.1362108-4-npiggin@gmail.com
arch/powerpc/Kconfig
arch/powerpc/include/asm/Kbuild
arch/powerpc/include/asm/qspinlock.h [new file with mode: 0644]
arch/powerpc/include/asm/spinlock.h
arch/powerpc/include/asm/spinlock_types.h
arch/powerpc/lib/Makefile
include/asm-generic/qspinlock.h

index 81c0dee1cbffe683d9e9766687fa17d21e92fc45..a751edacf4bc0f63756eebd0f44fe81b643df99e 100644 (file)
@@ -146,6 +146,8 @@ config PPC
        select ARCH_SUPPORTS_ATOMIC_RMW
        select ARCH_USE_BUILTIN_BSWAP
        select ARCH_USE_CMPXCHG_LOCKREF         if PPC64
+       select ARCH_USE_QUEUED_RWLOCKS          if PPC_QUEUED_SPINLOCKS
+       select ARCH_USE_QUEUED_SPINLOCKS        if PPC_QUEUED_SPINLOCKS
        select ARCH_WANT_IPC_PARSE_VERSION
        select ARCH_WEAK_RELEASE_ACQUIRE
        select BINFMT_ELF
@@ -491,6 +493,19 @@ config HOTPLUG_CPU
 
          Say N if you are unsure.
 
+config PPC_QUEUED_SPINLOCKS
+       bool "Queued spinlocks"
+       depends on SMP
+       help
+         Say Y here to use queued spinlocks which give better scalability and
+         fairness on large SMP and NUMA systems without harming single threaded
+         performance.
+
+         This option is currently experimental, the code is more complex and
+         less tested so it defaults to "N" for the moment.
+
+         If unsure, say "N".
+
 config ARCH_CPU_PROBE_RELEASE
        def_bool y
        depends on HOTPLUG_CPU
index dadbcf3a0b1e77a2e62d2586c95302af24e5421e..27c2268dfd6c0c709b1526533806fea97add5968 100644 (file)
@@ -6,5 +6,6 @@ generated-y += syscall_table_spu.h
 generic-y += export.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
+generic-y += qrwlock.h
 generic-y += vtime.h
 generic-y += early_ioremap.h
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644 (file)
index 0000000..c49e33e
--- /dev/null
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+#define _Q_PENDING_LOOPS       (1 << 9) /* not tuned */
+
+#define smp_mb__after_spinlock()   smp_mb()
+
+static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+       /*
+        * This barrier was added to simple spinlocks by commit 51d7d5205d338,
+        * but it should now be possible to remove it, as arm64 has done with
+        * commit c6f5d02b6a0f.
+        */
+       smp_mb();
+       return atomic_read(&lock->val);
+}
+#define queued_spin_is_locked queued_spin_is_locked
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
index 21357fe05fe0013239fe303eb2975ecf92fa9f3d..434615f1d76109e6656a326e322ff9f9ba4503a1 100644 (file)
@@ -3,7 +3,12 @@
 #define __ASM_SPINLOCK_H
 #ifdef __KERNEL__
 
+#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
+#else
 #include <asm/simple_spinlock.h>
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */
index 3906f52dae65799f774550a3099caedc5f6d200a..c5d742f18021dd36c3ca8d20c02b5ec801c822ce 100644 (file)
@@ -6,6 +6,11 @@
 # error "please don't include this file directly"
 #endif
 
+#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
+#else
 #include <asm/simple_spinlock_types.h>
+#endif
 
 #endif
index 5e994cda8e401c2f802128202055bdb989cef385..d66a645503ebdc662c57961b74cdced30d30411e 100644 (file)
@@ -41,7 +41,10 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
 obj64-y        += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
           memcpy_64.o memcpy_mcsafe_64.o
 
+ifndef CONFIG_PPC_QUEUED_SPINLOCKS
 obj64-$(CONFIG_SMP)    += locks.o
+endif
+
 obj64-$(CONFIG_ALTIVEC)        += vmx-helper.o
 obj64-$(CONFIG_KPROBES_SANITY_TEST)    += test_emulate_step.o \
                                           test_emulate_step_exec_instr.o
index fde943d180e03fe725989b4ffbe8af167f3535f8..fb0a814d4395a231d5b8fdf69edef6db26f3ec10 100644 (file)
@@ -12,6 +12,7 @@
 
 #include <asm-generic/qspinlock_types.h>
 
+#ifndef queued_spin_is_locked
 /**
  * queued_spin_is_locked - is the spinlock locked?
  * @lock: Pointer to queued spinlock structure
@@ -25,6 +26,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
         */
        return atomic_read(&lock->val);
 }
+#endif
 
 /**
  * queued_spin_value_unlocked - is the spinlock structure unlocked?