49a7d46a06c3 (powerpc: Implement smp_cond_load_relaxed()) added busy-waiting pausing with a preferred SMT priority pattern, lowering the priority (reducing decode cycles) during the whole loop slowpath. However, data shows that while this pattern works well with simple spinlocks, queued spinlocks benefit more being kept in medium priority, with a cpu_relax() instead, being a low+medium combo on powerpc. Data is from three benchmarks on a Power9: 9008-22L 64 CPUs with 2 sockets and 8 threads per core. 1. locktorture. This is data for the lowest and most artificial/pathological level, with increasing thread counts pounding on the lock. Metrics are total ops/minute. Despite some small hits in the 4-8 range, scenarios are either neutral or favorable to this patch. +=========+==========+==========+=======+ | # tasks | vanilla | dirty | %diff | +=========+==========+==========+=======+ | 2 | 46718565 | 48751350 | 4.35 | +---------+----------+----------+-------+ | 4 | 51740198 | 50369082 | -2.65 | +---------+----------+----------+-------+ | 8 | 63756510 | 62568821 | -1.86 | +---------+----------+----------+-------+ | 16 | 67824531 | 70966546 | 4.63 | +---------+----------+----------+-------+ | 32 | 53843519 | 61155508 | 13.58 | +---------+----------+----------+-------+ | 64 | 53005778 | 53104412 | 0.18 | +---------+----------+----------+-------+ | 128 | 53331980 | 54606910 | 2.39 | +=========+==========+==========+=======+ 2. sockperf (tcp throughput) Here a client will do one-way throughput tests to a localhost server, with increasing message sizes, dealing with the sk_lock. This patch shows to put the performance of the qspinlock back to par with that of the simple lock: simple-spinlock vanilla dirty Hmean 14 73.50 ( 0.00%) 54.44 * -25.93%* 73.45 * -0.07%* Hmean 100 654.47 ( 0.00%) 385.61 * -41.08%* 771.43 * 17.87%* Hmean 300 2719.39 ( 0.00%) 2181.67 * -19.77%* 2666.50 * -1.94%* Hmean 500 4400.59 ( 0.00%) 3390.77 * -22.95%* 4322.14 * -1.78%* Hmean 850 6726.21 ( 0.00%) 5264.03 * -21.74%* 6863.12 * 2.04%* 3. dbench (tmpfs) Configured to run with up to ncpusx8 clients, it shows both latency and throughput metrics. For the latency, with the exception of the 64 case, there is really nothing to go by: vanilla dirty Amean latency-1 1.67 ( 0.00%) 1.67 * 0.09%* Amean latency-2 2.15 ( 0.00%) 2.08 * 3.36%* Amean latency-4 2.50 ( 0.00%) 2.56 * -2.27%* Amean latency-8 2.49 ( 0.00%) 2.48 * 0.31%* Amean latency-16 2.69 ( 0.00%) 2.72 * -1.37%* Amean latency-32 2.96 ( 0.00%) 3.04 * -2.60%* Amean latency-64 7.78 ( 0.00%) 8.17 * -5.07%* Amean latency-512 186.91 ( 0.00%) 186.41 * 0.27%* For the dbench4 Throughput (misleading but traditional) there's a small but rather constant improvement: vanilla dirty Hmean 1 849.13 ( 0.00%) 851.51 * 0.28%* Hmean 2 1664.03 ( 0.00%) 1663.94 * -0.01%* Hmean 4 3073.70 ( 0.00%) 3104.29 * 1.00%* Hmean 8 5624.02 ( 0.00%) 5694.16 * 1.25%* Hmean 16 9169.49 ( 0.00%) 9324.43 * 1.69%* Hmean 32 11969.37 ( 0.00%) 12127.09 * 1.32%* Hmean 64 15021.12 ( 0.00%) 15243.14 * 1.48%* Hmean 512 14891.27 ( 0.00%) 15162.11 * 1.82%* Measuring the dbench4 Per-VFS Operation latency, shows some very minor differences within the noise level, around the 0-1% ranges. Fixes: 49a7d46a06c3 ("powerpc: Implement smp_cond_load_relaxed()") Acked-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Davidlohr Bueso <dbueso@suse.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20210318204702.71417-1-dave@stgolabs.net
116 lines
3.8 KiB
C
116 lines
3.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 1999 Cort Dougan <cort@cs.nmt.edu>
|
|
*/
|
|
#ifndef _ASM_POWERPC_BARRIER_H
|
|
#define _ASM_POWERPC_BARRIER_H
|
|
|
|
#include <asm/asm-const.h>
|
|
|
|
#ifndef __ASSEMBLY__
|
|
#include <asm/ppc-opcode.h>
|
|
#endif
|
|
|
|
/*
|
|
* Memory barrier.
|
|
* The sync instruction guarantees that all memory accesses initiated
|
|
* by this processor have been performed (with respect to all other
|
|
* mechanisms that access memory). The eieio instruction is a barrier
|
|
* providing an ordering (separately) for (a) cacheable stores and (b)
|
|
* loads and stores to non-cacheable memory (e.g. I/O devices).
|
|
*
|
|
* mb() prevents loads and stores being reordered across this point.
|
|
* rmb() prevents loads being reordered across this point.
|
|
* wmb() prevents stores being reordered across this point.
|
|
*
|
|
* *mb() variants without smp_ prefix must order all types of memory
|
|
* operations with one another. sync is the only instruction sufficient
|
|
* to do this.
|
|
*
|
|
* For the smp_ barriers, ordering is for cacheable memory operations
|
|
* only. We have to use the sync instruction for smp_mb(), since lwsync
|
|
* doesn't order loads with respect to previous stores. Lwsync can be
|
|
* used for smp_rmb() and smp_wmb().
|
|
*
|
|
* However, on CPUs that don't support lwsync, lwsync actually maps to a
|
|
* heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
|
|
*/
|
|
#define mb() __asm__ __volatile__ ("sync" : : : "memory")
|
|
#define rmb() __asm__ __volatile__ ("sync" : : : "memory")
|
|
#define wmb() __asm__ __volatile__ ("sync" : : : "memory")
|
|
|
|
/* The sub-arch has lwsync */
|
|
#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
|
|
# define SMPWMB LWSYNC
|
|
#else
|
|
# define SMPWMB eieio
|
|
#endif
|
|
|
|
#define __lwsync() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory")
|
|
#define dma_rmb() __lwsync()
|
|
#define dma_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
|
|
|
|
#define __smp_lwsync() __lwsync()
|
|
|
|
#define __smp_mb() mb()
|
|
#define __smp_rmb() __lwsync()
|
|
#define __smp_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
|
|
|
|
/*
|
|
* This is a barrier which prevents following instructions from being
|
|
* started until the value of the argument x is known. For example, if
|
|
* x is a variable loaded from memory, this prevents following
|
|
* instructions from being executed until the load has been performed.
|
|
*/
|
|
#define data_barrier(x) \
|
|
asm volatile("twi 0,%0,0; isync" : : "r" (x) : "memory");
|
|
|
|
#define __smp_store_release(p, v) \
|
|
do { \
|
|
compiletime_assert_atomic_type(*p); \
|
|
__smp_lwsync(); \
|
|
WRITE_ONCE(*p, v); \
|
|
} while (0)
|
|
|
|
#define __smp_load_acquire(p) \
|
|
({ \
|
|
typeof(*p) ___p1 = READ_ONCE(*p); \
|
|
compiletime_assert_atomic_type(*p); \
|
|
__smp_lwsync(); \
|
|
___p1; \
|
|
})
|
|
|
|
#ifdef CONFIG_PPC_BOOK3S_64
|
|
#define NOSPEC_BARRIER_SLOT nop
|
|
#elif defined(CONFIG_PPC_FSL_BOOK3E)
|
|
#define NOSPEC_BARRIER_SLOT nop; nop
|
|
#endif
|
|
|
|
#ifdef CONFIG_PPC_BARRIER_NOSPEC
|
|
/*
|
|
* Prevent execution of subsequent instructions until preceding branches have
|
|
* been fully resolved and are no longer executing speculatively.
|
|
*/
|
|
#define barrier_nospec_asm NOSPEC_BARRIER_FIXUP_SECTION; NOSPEC_BARRIER_SLOT
|
|
|
|
// This also acts as a compiler barrier due to the memory clobber.
|
|
#define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: "memory")
|
|
|
|
#else /* !CONFIG_PPC_BARRIER_NOSPEC */
|
|
#define barrier_nospec_asm
|
|
#define barrier_nospec()
|
|
#endif /* CONFIG_PPC_BARRIER_NOSPEC */
|
|
|
|
/*
|
|
* pmem_wmb() ensures that all stores for which the modification
|
|
* are written to persistent storage by preceding dcbfps/dcbstps
|
|
* instructions have updated persistent storage before any data
|
|
* access or data transfer caused by subsequent instructions is
|
|
* initiated.
|
|
*/
|
|
#define pmem_wmb() __asm__ __volatile__(PPC_PHWSYNC ::: "memory")
|
|
|
|
#include <asm-generic/barrier.h>
|
|
|
|
#endif /* _ASM_POWERPC_BARRIER_H */
|