031af50045
The inline assembly for arm64's cmpxchg_double*() implementations uses a
+Q constraint to hazard against other accesses to the memory location
being exchanged. However, the pointer passed to the constraint is a
pointer to unsigned long, and thus the hazard only applies to the first
8 bytes of the location.

GCC can take advantage of this, assuming that other portions of the
location are unchanged, leading to a number of potential problems.

This is similar to what we fixed back in commit:

  fee960bed5e857eb ("arm64: xchg: hazard against entire exchange variable")

... but we forgot to adjust cmpxchg_double*() similarly at the same
time.

The same problem applies, as demonstrated with the following test:

| struct big {
|         u64 lo, hi;
| } __aligned(128);
|
| unsigned long foo(struct big *b)
| {
|         u64 hi_old, hi_new;
|
|         hi_old = b->hi;
|         cmpxchg_double_local(&b->lo, &b->hi, 0x12, 0x34, 0x56, 0x78);
|         hi_new = b->hi;
|
|         return hi_old ^ hi_new;
| }

... which GCC 12.1.0 compiles as:

| 0000000000000000 <foo>:
|    0:   d503233f        paciasp
|    4:   aa0003e4        mov     x4, x0
|    8:   1400000e        b       40 <foo+0x40>
|    c:   d2800240        mov     x0, #0x12       // #18
|   10:   d2800681        mov     x1, #0x34       // #52
|   14:   aa0003e5        mov     x5, x0
|   18:   aa0103e6        mov     x6, x1
|   1c:   d2800ac2        mov     x2, #0x56       // #86
|   20:   d2800f03        mov     x3, #0x78       // #120
|   24:   48207c82        casp    x0, x1, x2, x3, [x4]
|   28:   ca050000        eor     x0, x0, x5
|   2c:   ca060021        eor     x1, x1, x6
|   30:   aa010000        orr     x0, x0, x1
|   34:   d2800000        mov     x0, #0x0        // #0   <--- BANG
|   38:   d50323bf        autiasp
|   3c:   d65f03c0        ret
|   40:   d2800240        mov     x0, #0x12       // #18
|   44:   d2800681        mov     x1, #0x34       // #52
|   48:   d2800ac2        mov     x2, #0x56       // #86
|   4c:   d2800f03        mov     x3, #0x78       // #120
|   50:   f9800091        prfm    pstl1strm, [x4]
|   54:   c87f1885        ldxp    x5, x6, [x4]
|   58:   ca0000a5        eor     x5, x5, x0
|   5c:   ca0100c6        eor     x6, x6, x1
|   60:   aa0600a6        orr     x6, x5, x6
|   64:   b5000066        cbnz    x6, 70 <foo+0x70>
|   68:   c8250c82        stxp    w5, x2, x3, [x4]
|   6c:   35ffff45        cbnz    w5, 54 <foo+0x54>
|   70:   d2800000        mov     x0, #0x0        // #0   <--- BANG
|   74:   d50323bf        autiasp
|   78:   d65f03c0        ret

Notice that at the lines with "BANG" comments, GCC has assumed that the
higher 8 bytes are unchanged by the cmpxchg_double() call, and that
`hi_old ^ hi_new` can be reduced to a constant zero, for both LSE and
LL/SC versions of cmpxchg_double().

This patch fixes the issue by passing a pointer to __uint128_t into the
+Q constraint, ensuring that the compiler hazards against the entire 16
bytes being modified.
With this change, GCC 12.1.0 compiles the above test as:

| 0000000000000000 <foo>:
|    0:   f9400407        ldr     x7, [x0, #8]
|    4:   d503233f        paciasp
|    8:   aa0003e4        mov     x4, x0
|    c:   1400000f        b       48 <foo+0x48>
|   10:   d2800240        mov     x0, #0x12       // #18
|   14:   d2800681        mov     x1, #0x34       // #52
|   18:   aa0003e5        mov     x5, x0
|   1c:   aa0103e6        mov     x6, x1
|   20:   d2800ac2        mov     x2, #0x56       // #86
|   24:   d2800f03        mov     x3, #0x78       // #120
|   28:   48207c82        casp    x0, x1, x2, x3, [x4]
|   2c:   ca050000        eor     x0, x0, x5
|   30:   ca060021        eor     x1, x1, x6
|   34:   aa010000        orr     x0, x0, x1
|   38:   f9400480        ldr     x0, [x4, #8]
|   3c:   d50323bf        autiasp
|   40:   ca0000e0        eor     x0, x7, x0
|   44:   d65f03c0        ret
|   48:   d2800240        mov     x0, #0x12       // #18
|   4c:   d2800681        mov     x1, #0x34       // #52
|   50:   d2800ac2        mov     x2, #0x56       // #86
|   54:   d2800f03        mov     x3, #0x78       // #120
|   58:   f9800091        prfm    pstl1strm, [x4]
|   5c:   c87f1885        ldxp    x5, x6, [x4]
|   60:   ca0000a5        eor     x5, x5, x0
|   64:   ca0100c6        eor     x6, x6, x1
|   68:   aa0600a6        orr     x6, x5, x6
|   6c:   b5000066        cbnz    x6, 78 <foo+0x78>
|   70:   c8250c82        stxp    w5, x2, x3, [x4]
|   74:   35ffff45        cbnz    w5, 5c <foo+0x5c>
|   78:   f9400480        ldr     x0, [x4, #8]
|   7c:   d50323bf        autiasp
|   80:   ca0000e0        eor     x0, x7, x0
|   84:   d65f03c0        ret

... sampling the high 8 bytes before and after the cmpxchg, and
performing an EOR, as we'd expect.

For backporting, I've tested this atop linux-4.9.y with GCC 5.5.0. Note
that linux-4.9.y is the oldest currently supported stable release, and
mandates GCC 5.1+. Unfortunately I couldn't get a GCC 5.1 binary to run
on my machines due to library incompatibilities.

I've also used a standalone test to check that we can use a __uint128_t
pointer in a +Q constraint at least as far back as GCC 4.8.5 and LLVM
3.9.1.

Fixes: 5284e1b4bc8a ("arm64: xchg: Implement cmpxchg_double")
Fixes: e9a4b795652f ("arm64: cmpxchg_dbl: patch in lse instructions when supported by the CPU")
Reported-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://lore.kernel.org/lkml/Y6DEfQXymYVgL3oJ@boqun-archlinux/
Reported-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/lkml/Y6GXoO4qmH9OIZ5Q@hirez.programming.kicks-ass.net/
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: stable@vger.kernel.org
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20230104151626.3262137-1-mark.rutland@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
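The standalone check mentioned above (a __uint128_t pointer in a +Q constraint)
can be sketched roughly as follows. This is a hypothetical reconstruction for
illustration only, not the author's actual test; the struct and helper names
(struct big, hazard_16_bytes, hi_delta) are invented:

#include <stdint.h>

struct big {
        uint64_t lo, hi;
} __attribute__((aligned(16)));

/*
 * Hypothetical helper: an empty asm block whose only effect is the "+Q"
 * hazard over all 16 bytes of *b ("Q" used here as the arm64 memory
 * constraint, as in the kernel code below).
 */
static inline void hazard_16_bytes(struct big *b)
{
        asm volatile("" : "+Q" (*(__uint128_t *)b));
}

uint64_t hi_delta(struct big *b)
{
        uint64_t hi_old = b->hi;

        hazard_16_bytes(b);

        /*
         * Because the hazard covers the full 16 bytes, the compiler must
         * reload b->hi here rather than folding the result to zero.
         */
        return hi_old ^ b->hi;
}

Compiling such a file with an aarch64 compiler at -O2 and inspecting the
generated assembly should show b->hi being reloaded after the asm block
rather than the XOR being constant-folded away.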
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/atomic.h
 *
 * Copyright (C) 1996 Russell King.
 * Copyright (C) 2002 Deep Blue Solutions Ltd.
 * Copyright (C) 2012 ARM Ltd.
 */

#ifndef __ASM_ATOMIC_LL_SC_H
#define __ASM_ATOMIC_LL_SC_H

#include <linux/stringify.h>

#ifndef CONFIG_CC_HAS_K_CONSTRAINT
#define K
#endif

/*
 * AArch64 UP and SMP safe atomic ops. We use load exclusive and
 * store exclusive to ensure that these are atomic. We may loop
 * to ensure that the update happens.
 */

#define ATOMIC_OP(op, asm_op, constraint) \
static __always_inline void \
__ll_sc_atomic_##op(int i, atomic_t *v) \
{ \
        unsigned long tmp; \
        int result; \
        \
        asm volatile("// atomic_" #op "\n" \
        " prfm pstl1strm, %2\n" \
        "1: ldxr %w0, %2\n" \
        " " #asm_op " %w0, %w0, %w3\n" \
        " stxr %w1, %w0, %2\n" \
        " cbnz %w1, 1b\n" \
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
        : __stringify(constraint) "r" (i)); \
}

#define ATOMIC_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
static __always_inline int \
__ll_sc_atomic_##op##_return##name(int i, atomic_t *v) \
{ \
        unsigned long tmp; \
        int result; \
        \
        asm volatile("// atomic_" #op "_return" #name "\n" \
        " prfm pstl1strm, %2\n" \
        "1: ld" #acq "xr %w0, %2\n" \
        " " #asm_op " %w0, %w0, %w3\n" \
        " st" #rel "xr %w1, %w0, %2\n" \
        " cbnz %w1, 1b\n" \
        " " #mb \
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
        : __stringify(constraint) "r" (i) \
        : cl); \
        \
        return result; \
}

#define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint) \
static __always_inline int \
__ll_sc_atomic_fetch_##op##name(int i, atomic_t *v) \
{ \
        unsigned long tmp; \
        int val, result; \
        \
        asm volatile("// atomic_fetch_" #op #name "\n" \
        " prfm pstl1strm, %3\n" \
        "1: ld" #acq "xr %w0, %3\n" \
        " " #asm_op " %w1, %w0, %w4\n" \
        " st" #rel "xr %w2, %w1, %3\n" \
        " cbnz %w2, 1b\n" \
        " " #mb \
        : "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter) \
        : __stringify(constraint) "r" (i) \
        : cl); \
        \
        return result; \
}

#define ATOMIC_OPS(...) \
        ATOMIC_OP(__VA_ARGS__) \
        ATOMIC_OP_RETURN( , dmb ish, , l, "memory", __VA_ARGS__)\
        ATOMIC_OP_RETURN(_relaxed, , , , , __VA_ARGS__)\
        ATOMIC_OP_RETURN(_acquire, , a, , "memory", __VA_ARGS__)\
        ATOMIC_OP_RETURN(_release, , , l, "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP ( , dmb ish, , l, "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP (_relaxed, , , , , __VA_ARGS__)\
        ATOMIC_FETCH_OP (_acquire, , a, , "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP (_release, , , l, "memory", __VA_ARGS__)

ATOMIC_OPS(add, add, I)
ATOMIC_OPS(sub, sub, J)

#undef ATOMIC_OPS
#define ATOMIC_OPS(...) \
        ATOMIC_OP(__VA_ARGS__) \
        ATOMIC_FETCH_OP ( , dmb ish, , l, "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP (_relaxed, , , , , __VA_ARGS__)\
        ATOMIC_FETCH_OP (_acquire, , a, , "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP (_release, , , l, "memory", __VA_ARGS__)

ATOMIC_OPS(and, and, K)
ATOMIC_OPS(or, orr, K)
ATOMIC_OPS(xor, eor, K)
/*
 * GAS converts the mysterious and undocumented BIC (immediate) alias to
 * an AND (immediate) instruction with the immediate inverted. We don't
 * have a constraint for this, so fall back to register.
 */
ATOMIC_OPS(andnot, bic, )

#undef ATOMIC_OPS
#undef ATOMIC_FETCH_OP
#undef ATOMIC_OP_RETURN
#undef ATOMIC_OP

#define ATOMIC64_OP(op, asm_op, constraint) \
static __always_inline void \
__ll_sc_atomic64_##op(s64 i, atomic64_t *v) \
{ \
        s64 result; \
        unsigned long tmp; \
        \
        asm volatile("// atomic64_" #op "\n" \
        " prfm pstl1strm, %2\n" \
        "1: ldxr %0, %2\n" \
        " " #asm_op " %0, %0, %3\n" \
        " stxr %w1, %0, %2\n" \
        " cbnz %w1, 1b" \
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
        : __stringify(constraint) "r" (i)); \
}

#define ATOMIC64_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
static __always_inline long \
__ll_sc_atomic64_##op##_return##name(s64 i, atomic64_t *v) \
{ \
        s64 result; \
        unsigned long tmp; \
        \
        asm volatile("// atomic64_" #op "_return" #name "\n" \
        " prfm pstl1strm, %2\n" \
        "1: ld" #acq "xr %0, %2\n" \
        " " #asm_op " %0, %0, %3\n" \
        " st" #rel "xr %w1, %0, %2\n" \
        " cbnz %w1, 1b\n" \
        " " #mb \
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
        : __stringify(constraint) "r" (i) \
        : cl); \
        \
        return result; \
}

#define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint)\
static __always_inline long \
__ll_sc_atomic64_fetch_##op##name(s64 i, atomic64_t *v) \
{ \
        s64 result, val; \
        unsigned long tmp; \
        \
        asm volatile("// atomic64_fetch_" #op #name "\n" \
        " prfm pstl1strm, %3\n" \
        "1: ld" #acq "xr %0, %3\n" \
        " " #asm_op " %1, %0, %4\n" \
        " st" #rel "xr %w2, %1, %3\n" \
        " cbnz %w2, 1b\n" \
        " " #mb \
        : "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter) \
        : __stringify(constraint) "r" (i) \
        : cl); \
        \
        return result; \
}

#define ATOMIC64_OPS(...) \
        ATOMIC64_OP(__VA_ARGS__) \
        ATOMIC64_OP_RETURN(, dmb ish, , l, "memory", __VA_ARGS__) \
        ATOMIC64_OP_RETURN(_relaxed,, , , , __VA_ARGS__) \
        ATOMIC64_OP_RETURN(_acquire,, a, , "memory", __VA_ARGS__) \
        ATOMIC64_OP_RETURN(_release,, , l, "memory", __VA_ARGS__) \
        ATOMIC64_FETCH_OP (, dmb ish, , l, "memory", __VA_ARGS__) \
        ATOMIC64_FETCH_OP (_relaxed,, , , , __VA_ARGS__) \
        ATOMIC64_FETCH_OP (_acquire,, a, , "memory", __VA_ARGS__) \
        ATOMIC64_FETCH_OP (_release,, , l, "memory", __VA_ARGS__)

ATOMIC64_OPS(add, add, I)
ATOMIC64_OPS(sub, sub, J)

#undef ATOMIC64_OPS
#define ATOMIC64_OPS(...) \
        ATOMIC64_OP(__VA_ARGS__) \
        ATOMIC64_FETCH_OP (, dmb ish, , l, "memory", __VA_ARGS__) \
        ATOMIC64_FETCH_OP (_relaxed,, , , , __VA_ARGS__) \
        ATOMIC64_FETCH_OP (_acquire,, a, , "memory", __VA_ARGS__) \
        ATOMIC64_FETCH_OP (_release,, , l, "memory", __VA_ARGS__)

ATOMIC64_OPS(and, and, L)
ATOMIC64_OPS(or, orr, L)
ATOMIC64_OPS(xor, eor, L)
/*
 * GAS converts the mysterious and undocumented BIC (immediate) alias to
 * an AND (immediate) instruction with the immediate inverted. We don't
 * have a constraint for this, so fall back to register.
 */
ATOMIC64_OPS(andnot, bic, )

#undef ATOMIC64_OPS
#undef ATOMIC64_FETCH_OP
#undef ATOMIC64_OP_RETURN
#undef ATOMIC64_OP

static __always_inline s64
__ll_sc_atomic64_dec_if_positive(atomic64_t *v)
{
        s64 result;
        unsigned long tmp;

        asm volatile("// atomic64_dec_if_positive\n"
        " prfm pstl1strm, %2\n"
        "1: ldxr %0, %2\n"
        " subs %0, %0, #1\n"
        " b.lt 2f\n"
        " stlxr %w1, %0, %2\n"
        " cbnz %w1, 1b\n"
        " dmb ish\n"
        "2:"
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
        :
        : "cc", "memory");

        return result;
}

#define __CMPXCHG_CASE(w, sfx, name, sz, mb, acq, rel, cl, constraint) \
static __always_inline u##sz \
__ll_sc__cmpxchg_case_##name##sz(volatile void *ptr, \
                                 unsigned long old, \
                                 u##sz new) \
{ \
        unsigned long tmp; \
        u##sz oldval; \
        \
        /* \
         * Sub-word sizes require explicit casting so that the compare \
         * part of the cmpxchg doesn't end up interpreting non-zero \
         * upper bits of the register containing "old". \
         */ \
        if (sz < 32) \
                old = (u##sz)old; \
        \
        asm volatile( \
        " prfm pstl1strm, %[v]\n" \
        "1: ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n" \
        " eor %" #w "[tmp], %" #w "[oldval], %" #w "[old]\n" \
        " cbnz %" #w "[tmp], 2f\n" \
        " st" #rel "xr" #sfx "\t%w[tmp], %" #w "[new], %[v]\n" \
        " cbnz %w[tmp], 1b\n" \
        " " #mb "\n" \
        "2:" \
        : [tmp] "=&r" (tmp), [oldval] "=&r" (oldval), \
          [v] "+Q" (*(u##sz *)ptr) \
        : [old] __stringify(constraint) "r" (old), [new] "r" (new) \
        : cl); \
        \
        return oldval; \
}

/*
 * Earlier versions of GCC (no later than 8.1.0) appear to incorrectly
 * handle the 'K' constraint for the value 4294967295 - thus we use no
 * constraint for 32 bit operations.
 */
__CMPXCHG_CASE(w, b, , 8, , , , , K)
__CMPXCHG_CASE(w, h, , 16, , , , , K)
__CMPXCHG_CASE(w, , , 32, , , , , K)
__CMPXCHG_CASE( , , , 64, , , , , L)
__CMPXCHG_CASE(w, b, acq_, 8, , a, , "memory", K)
__CMPXCHG_CASE(w, h, acq_, 16, , a, , "memory", K)
__CMPXCHG_CASE(w, , acq_, 32, , a, , "memory", K)
__CMPXCHG_CASE( , , acq_, 64, , a, , "memory", L)
__CMPXCHG_CASE(w, b, rel_, 8, , , l, "memory", K)
__CMPXCHG_CASE(w, h, rel_, 16, , , l, "memory", K)
__CMPXCHG_CASE(w, , rel_, 32, , , l, "memory", K)
__CMPXCHG_CASE( , , rel_, 64, , , l, "memory", L)
__CMPXCHG_CASE(w, b, mb_, 8, dmb ish, , l, "memory", K)
__CMPXCHG_CASE(w, h, mb_, 16, dmb ish, , l, "memory", K)
__CMPXCHG_CASE(w, , mb_, 32, dmb ish, , l, "memory", K)
__CMPXCHG_CASE( , , mb_, 64, dmb ish, , l, "memory", L)

#undef __CMPXCHG_CASE

#define __CMPXCHG_DBL(name, mb, rel, cl) \
static __always_inline long \
__ll_sc__cmpxchg_double##name(unsigned long old1, \
                              unsigned long old2, \
                              unsigned long new1, \
                              unsigned long new2, \
                              volatile void *ptr) \
{ \
        unsigned long tmp, ret; \
        \
        asm volatile("// __cmpxchg_double" #name "\n" \
        " prfm pstl1strm, %2\n" \
        "1: ldxp %0, %1, %2\n" \
        " eor %0, %0, %3\n" \
        " eor %1, %1, %4\n" \
        " orr %1, %0, %1\n" \
        " cbnz %1, 2f\n" \
        " st" #rel "xp %w0, %5, %6, %2\n" \
        " cbnz %w0, 1b\n" \
        " " #mb "\n" \
        "2:" \
        : "=&r" (tmp), "=&r" (ret), "+Q" (*(__uint128_t *)ptr) \
        : "r" (old1), "r" (old2), "r" (new1), "r" (new2) \
        : cl); \
        \
        return ret; \
}

__CMPXCHG_DBL( , , , )
__CMPXCHG_DBL(_mb, dmb ish, l, "memory")

#undef __CMPXCHG_DBL
#undef K

#endif /* __ASM_ATOMIC_LL_SC_H */