/* SPDX-License-Identifier: GPL-2.0-only */
/*
* BPF JIT compiler for ARM64
*
* Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com>
*/
#ifndef _BPF_JIT_H
#define _BPF_JIT_H

#include <asm/insn.h>
/* 5-bit Register Operand */
#define A64_R(x) AARCH64_INSN_REG_##x
#define A64_FP AARCH64_INSN_REG_FP
#define A64_LR AARCH64_INSN_REG_LR
#define A64_ZR AARCH64_INSN_REG_ZR
#define A64_SP AARCH64_INSN_REG_SP
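/*
 * Illustrative note: A64_R(10) simply expands to AARCH64_INSN_REG_10.  The
 * JIT in bpf_jit_comp.c is assumed to map BPF registers onto these operands
 * (e.g. through a bpf2a64[] table) and to pass the result into the macros
 * below, rather than working with raw register encodings.
 */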
#define A64_VARIANT(sf) \
        ((sf) ? AARCH64_INSN_VARIANT_64BIT : AARCH64_INSN_VARIANT_32BIT)
/* Compare & branch (immediate) */
#define A64_COMP_BRANCH(sf, Rt, offset, type) \
        aarch64_insn_gen_comp_branch_imm(0, offset, Rt, A64_VARIANT(sf), \
                AARCH64_INSN_BRANCH_COMP_##type)
#define A64_CBZ(sf, Rt, imm19) A64_COMP_BRANCH(sf, Rt, (imm19) << 2, ZERO)
#define A64_CBNZ(sf, Rt, imm19) A64_COMP_BRANCH(sf, Rt, (imm19) << 2, NONZERO)
/* Conditional branch (immediate) */
#define A64_COND_BRANCH(cond, offset) \
        aarch64_insn_gen_cond_branch_imm(0, offset, cond)
#define A64_COND_EQ AARCH64_INSN_COND_EQ /* == */
#define A64_COND_NE AARCH64_INSN_COND_NE /* != */
#define A64_COND_CS AARCH64_INSN_COND_CS /* unsigned >= */
#define A64_COND_HI AARCH64_INSN_COND_HI /* unsigned > */
#define A64_COND_LS AARCH64_INSN_COND_LS /* unsigned <= */
#define A64_COND_CC AARCH64_INSN_COND_CC /* unsigned < */
#define A64_COND_GE AARCH64_INSN_COND_GE /* signed >= */
#define A64_COND_GT AARCH64_INSN_COND_GT /* signed > */
#define A64_COND_LE AARCH64_INSN_COND_LE /* signed <= */
#define A64_COND_LT AARCH64_INSN_COND_LT /* signed < */
#define A64_B_(cond, imm19) A64_COND_BRANCH(cond, (imm19) << 2)
/* Unconditional branch (immediate) */
#define A64_BRANCH(offset, type) aarch64_insn_gen_branch_imm(0, offset, \
        AARCH64_INSN_BRANCH_##type)
#define A64_B(imm26) A64_BRANCH((imm26) << 2, NOLINK)
#define A64_BL(imm26) A64_BRANCH((imm26) << 2, LINK)
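/*
 * The imm19/imm26 branch offsets above count instructions; the "<< 2" turns
 * them into the byte offset expected by the aarch64_insn_gen_*() helpers.
 * Illustrative use in the JIT (emit() and ctx as in bpf_jit_comp.c,
 * jmp_offset in instructions):
 *
 *      emit(A64_CMP(1, dst, src), ctx);
 *      emit(A64_B_(A64_COND_GE, jmp_offset), ctx);  // if (dst s>= src) goto +off
 */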
/* Unconditional branch (register) */
#define A64_BR(Rn) aarch64_insn_gen_branch_reg(Rn, AARCH64_INSN_BRANCH_NOLINK)
#define A64_BLR(Rn) aarch64_insn_gen_branch_reg(Rn, AARCH64_INSN_BRANCH_LINK)
#define A64_RET(Rn) aarch64_insn_gen_branch_reg(Rn, AARCH64_INSN_BRANCH_RETURN)
/* Load/store register (register offset) */
#define A64_LS_REG(Rt, Rn, Rm, size, type) \
        aarch64_insn_gen_load_store_reg(Rt, Rn, Rm, \
                AARCH64_INSN_SIZE_##size, \
                AARCH64_INSN_LDST_##type##_REG_OFFSET)
#define A64_STRB(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 8, STORE)
#define A64_LDRB(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 8, LOAD)
#define A64_STRH(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 16, STORE)
#define A64_LDRH(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 16, LOAD)
#define A64_STR32(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 32, STORE)
#define A64_LDR32(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 32, LOAD)
#define A64_STR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, STORE)
#define A64_LDR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, LOAD)
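/*
 * Example (illustrative): A64_LDR64(dst, base, idx) generates
 * "ldr Xdst, [Xbase, Xidx]", a load from base + index with both held in
 * registers.  The tail-call path, for instance, fetches
 * prog = array->ptrs[index] with this register-offset form.
 */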
/* Load/store register (immediate offset) */
#define A64_LS_IMM(Rt, Rn, imm, size, type) \
        aarch64_insn_gen_load_store_imm(Rt, Rn, imm, \
                AARCH64_INSN_SIZE_##size, \
                AARCH64_INSN_LDST_##type##_IMM_OFFSET)
#define A64_STRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, STORE)
#define A64_LDRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, LOAD)
#define A64_STRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, STORE)
#define A64_LDRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, LOAD)
#define A64_STR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, STORE)
#define A64_LDR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, LOAD)
#define A64_STR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, STORE)
#define A64_LDR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, LOAD)
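/*
 * Example: when a BPF load/store offset fits the immediate form, a single
 * instruction replaces the mov+ldr pair, e.g. for r2 = *(u64 *)(r1 + 0):
 *
 *      mov x10, 0              // register-offset fallback
 *      ldr x1, [x0, x10]
 * vs.
 *      ldr x1, [x0, 0]         // A64_LDR64I form
 *
 * Negative, misaligned or out-of-range offsets fall back to the temporary
 * register path above.
 */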
/* LDR (literal) */
#define A64_LDR32LIT(Wt, offset) \
        aarch64_insn_gen_load_literal(0, offset, Wt, false)
#define A64_LDR64LIT(Xt, offset) \
        aarch64_insn_gen_load_literal(0, offset, Xt, true)
/* Load/store register pair */
#define A64_LS_PAIR(Rt, Rt2, Rn, offset, ls, type) \
        aarch64_insn_gen_load_store_pair(Rt, Rt2, Rn, offset, \
                AARCH64_INSN_VARIANT_64BIT, \
                AARCH64_INSN_LDST_##ls##_PAIR_##type)
/* Rn -= 16; Rn[0] = Rt; Rn[8] = Rt2; */
#define A64_PUSH(Rt, Rt2, Rn) A64_LS_PAIR(Rt, Rt2, Rn, -16, STORE, PRE_INDEX)
/* Rt = Rn[0]; Rt2 = Rn[8]; Rn += 16; */
#define A64_POP(Rt, Rt2, Rn) A64_LS_PAIR(Rt, Rt2, Rn, 16, LOAD, POST_INDEX)
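/*
 * Illustrative sketch: the prologue/epilogue are assumed to use these as
 *
 *      emit(A64_PUSH(A64_FP, A64_LR, A64_SP), ctx);  // stp x29, x30, [sp, #-16]!
 *      ...
 *      emit(A64_POP(A64_FP, A64_LR, A64_SP), ctx);   // ldp x29, x30, [sp], #16
 *
 * (emit() and ctx as in bpf_jit_comp.c.)
 */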
/* Load/store exclusive */
#define A64_SIZE(sf) \
        ((sf) ? AARCH64_INSN_SIZE_64 : AARCH64_INSN_SIZE_32)
#define A64_LSX(sf, Rt, Rn, Rs, type) \
        aarch64_insn_gen_load_store_ex(Rt, Rn, Rs, A64_SIZE(sf), \
                AARCH64_INSN_LDST_##type)
/* Rt = [Rn]; (atomic) */
#define A64_LDXR(sf, Rt, Rn) \
        A64_LSX(sf, Rt, Rn, A64_ZR, LOAD_EX)
/* [Rn] = Rt; (atomic) Rs = [state] */
#define A64_STXR(sf, Rt, Rn, Rs) \
        A64_LSX(sf, Rt, Rn, Rs, STORE_EX)
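/*
 * Example: without LSE atomics, a BPF atomic add is built as a classic
 * load-exclusive/store-exclusive retry loop, roughly:
 *
 *      ldxr    w11, [x10]              // A64_LDXR
 *      add     w11, w11, w7
 *      stxr    w11, w11, [x10]         // A64_STXR, status lands in w11
 *      cbnz    w11, <retry>            // A64_CBNZ back to the ldxr
 */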
/* [Rn] = Rt (store release); (atomic) Rs = [state] */
#define A64_STLXR(sf, Rt, Rn, Rs) \
        aarch64_insn_gen_load_store_ex(Rt, Rn, Rs, A64_SIZE(sf), \
                AARCH64_INSN_LDST_STORE_REL_EX)
/*
* LSE atomics
*
* ST{ADD,CLR,SET,EOR} is simply encoded as an alias for
* LD{ADD,CLR,SET,EOR} with XZR as the destination register.
*/
#define A64_ST_OP(sf, Rn, Rs, op) \
        aarch64_insn_gen_atomic_ld_op(A64_ZR, Rn, Rs, \
                A64_SIZE(sf), AARCH64_INSN_MEM_ATOMIC_##op, \
                AARCH64_INSN_MEM_ORDER_NONE)
/* [Rn] <op>= Rs */
#define A64_STADD(sf, Rn, Rs) A64_ST_OP(sf, Rn, Rs, ADD)
#define A64_STCLR(sf, Rn, Rs) A64_ST_OP(sf, Rn, Rs, CLR)
#define A64_STEOR(sf, Rn, Rs) A64_ST_OP(sf, Rn, Rs, EOR)
#define A64_STSET(sf, Rn, Rs) A64_ST_OP(sf, Rn, Rs, SET)
#define A64_LD_OP_AL(sf, Rt, Rn, Rs, op) \
        aarch64_insn_gen_atomic_ld_op(Rt, Rn, Rs, \
                A64_SIZE(sf), AARCH64_INSN_MEM_ATOMIC_##op, \
                AARCH64_INSN_MEM_ORDER_ACQREL)
/* Rt = [Rn] (load acquire); [Rn] <op>= Rs (store release) */
#define A64_LDADDAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, ADD)
#define A64_LDCLRAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, CLR)
#define A64_LDEORAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, EOR)
#define A64_LDSETAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, SET)
/* Rt = [Rn] (load acquire); [Rn] = Rs (store release) */
#define A64_SWPAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, SWP)
/* Rs = CAS(Rn, Rs, Rt) (load acquire & store release) */
#define A64_CASAL(sf, Rt, Rn, Rs) \
        aarch64_insn_gen_cas(Rt, Rn, Rs, A64_SIZE(sf), \
                AARCH64_INSN_MEM_ORDER_ACQREL)
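/*
 * Sketch of the assumed mapping from BPF atomic ops on the LSE path:
 *      BPF_ADD                 -> A64_STADD   ([Rn] += Rs)
 *      BPF_ADD | BPF_FETCH     -> A64_LDADDAL (Rt = old value, [Rn] += Rs)
 *      BPF_XCHG                -> A64_SWPAL
 *      BPF_CMPXCHG             -> A64_CASAL
 * AND/OR/XOR use the CLR/SET/EOR forms; since LD/STCLR clears the bits set
 * in Rs, an AND is assumed to invert the operand first (see A64_MVN below).
 */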
/* Add/subtract (immediate) */
#define A64_ADDSUB_IMM(sf, Rd, Rn, imm12, type) \
        aarch64_insn_gen_add_sub_imm(Rd, Rn, imm12, \
                A64_VARIANT(sf), AARCH64_INSN_ADSB_##type)
/* Rd = Rn OP imm12 */
#define A64_ADD_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD)
#define A64_SUB_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB)
#define A64_ADDS_I(sf, Rd, Rn, imm12) \
        A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD_SETFLAGS)
#define A64_SUBS_I(sf, Rd, Rn, imm12) \
        A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB_SETFLAGS)
/* Rn + imm12; set condition flags */
#define A64_CMN_I(sf, Rn, imm12) A64_ADDS_I(sf, A64_ZR, Rn, imm12)
/* Rn - imm12; set condition flags */
#define A64_CMP_I(sf, Rn, imm12) A64_SUBS_I(sf, A64_ZR, Rn, imm12)
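/*
 * Example: when the BPF immediate fits in 12 bits,
 * BPF_ALU64_IMM(BPF_ADD, R0, 2) can be emitted as a single instruction:
 *
 *      add x7, x7, #0x2        // A64_ADD_I
 * instead of
 *      mov x10, #0x2
 *      add x7, x7, x10
 *
 * Otherwise the JIT falls back to loading the immediate into a temporary.
 */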
/* Rd = Rn */
#define A64_MOV(sf, Rd, Rn) A64_ADD_I(sf, Rd, Rn, 0)
/* Bitfield move */
#define A64_BITFIELD(sf, Rd, Rn, immr, imms, type) \
        aarch64_insn_gen_bitfield(Rd, Rn, immr, imms, \
                A64_VARIANT(sf), AARCH64_INSN_BITFIELD_MOVE_##type)
/* Signed, with sign replication to left and zeros to right */
#define A64_SBFM(sf, Rd, Rn, ir, is) A64_BITFIELD(sf, Rd, Rn, ir, is, SIGNED)
/* Unsigned, with zeros to left and right */
#define A64_UBFM(sf, Rd, Rn, ir, is) A64_BITFIELD(sf, Rd, Rn, ir, is, UNSIGNED)
/* Rd = Rn << shift */
#define A64_LSL(sf, Rd, Rn, shift) ({ \
        int sz = (sf) ? 64 : 32; \
        A64_UBFM(sf, Rd, Rn, (unsigned)-(shift) % sz, sz - 1 - (shift)); \
})
/* Rd = Rn >> shift */
#define A64_LSR(sf, Rd, Rn, shift) A64_UBFM(sf, Rd, Rn, shift, (sf) ? 63 : 31)
/* Rd = Rn >> shift; signed */
#define A64_ASR(sf, Rd, Rn, shift) A64_SBFM(sf, Rd, Rn, shift, (sf) ? 63 : 31)
/* Zero extend */
#define A64_UXTH(sf, Rd, Rn) A64_UBFM(sf, Rd, Rn, 0, 15)
#define A64_UXTW(sf, Rd, Rn) A64_UBFM(sf, Rd, Rn, 0, 31)
/* Move wide (immediate) */
#define A64_MOVEW(sf, Rd, imm16, shift, type) \
        aarch64_insn_gen_movewide(Rd, imm16, shift, \
                A64_VARIANT(sf), AARCH64_INSN_MOVEWIDE_##type)
/* Rd = Zeros (for MOVZ);
 * Rd |= imm16 << shift (where shift is {0, 16, 32, 48});
 * Rd = ~Rd; (for MOVN); */
#define A64_MOVN(sf, Rd, imm16, shift) A64_MOVEW(sf, Rd, imm16, shift, INVERSE)
#define A64_MOVZ(sf, Rd, imm16, shift) A64_MOVEW(sf, Rd, imm16, shift, ZERO)
#define A64_MOVK(sf, Rd, imm16, shift) A64_MOVEW(sf, Rd, imm16, shift, KEEP)
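/*
 * Illustrative: an arbitrary 64-bit constant is assumed to be materialized
 * with one A64_MOVZ (or A64_MOVN when the value is mostly ones) followed by
 * up to three A64_MOVK instructions, each patching a 16-bit slice at shift
 * 0/16/32/48.
 */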
/* Add/subtract (shifted register) */
#define A64_ADDSUB_SREG(sf, Rd, Rn, Rm, type) \
        aarch64_insn_gen_add_sub_shifted_reg(Rd, Rn, Rm, 0, \
                A64_VARIANT(sf), AARCH64_INSN_ADSB_##type)
/* Rd = Rn OP Rm */
#define A64_ADD(sf, Rd, Rn, Rm) A64_ADDSUB_SREG(sf, Rd, Rn, Rm, ADD)
#define A64_SUB(sf, Rd, Rn, Rm) A64_ADDSUB_SREG(sf, Rd, Rn, Rm, SUB)
#define A64_SUBS(sf, Rd, Rn, Rm) A64_ADDSUB_SREG(sf, Rd, Rn, Rm, SUB_SETFLAGS)
/* Rd = -Rm */
#define A64_NEG(sf, Rd, Rm) A64_SUB(sf, Rd, A64_ZR, Rm)
/* Rn - Rm; set condition flags */
#define A64_CMP(sf, Rn, Rm) A64_SUBS(sf, A64_ZR, Rn, Rm)
/* Data-processing (1 source) */
#define A64_DATA1(sf, Rd, Rn, type) aarch64_insn_gen_data1(Rd, Rn, \
        A64_VARIANT(sf), AARCH64_INSN_DATA1_##type)
/* Rd = BSWAPx(Rn) */
#define A64_REV16(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_16)
#define A64_REV32(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_32)
#define A64_REV64(Rd, Rn) A64_DATA1(1, Rd, Rn, REVERSE_64)
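/*
 * Assumed use for BPF_END byte swaps: a 16-bit swap is A64_REV16 followed by
 * A64_UXTH to clear the upper bits, a 32-bit swap uses A64_REV32, and a
 * 64-bit swap uses A64_REV64.
 */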
/* Data-processing (2 source) */
/* Rd = Rn OP Rm */
#define A64_DATA2(sf, Rd, Rn, Rm, type) aarch64_insn_gen_data2(Rd, Rn, Rm, \
        A64_VARIANT(sf), AARCH64_INSN_DATA2_##type)
#define A64_UDIV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, UDIV)
#define A64_LSLV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, LSLV)
#define A64_LSRV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, LSRV)
#define A64_ASRV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, ASRV)
/* Data-processing (3 source) */
/* Rd = Ra + Rn * Rm */
#define A64_MADD(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
        A64_VARIANT(sf), AARCH64_INSN_DATA3_MADD)
/* Rd = Ra - Rn * Rm */
#define A64_MSUB(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
        A64_VARIANT(sf), AARCH64_INSN_DATA3_MSUB)
/* Rd = Rn * Rm */
#define A64_MUL(sf, Rd, Rn, Rm) A64_MADD(sf, Rd, A64_ZR, Rn, Rm)
/* Logical (shifted register) */
#define A64_LOGIC_SREG(sf, Rd, Rn, Rm, type) \
        aarch64_insn_gen_logical_shifted_reg(Rd, Rn, Rm, 0, \
                A64_VARIANT(sf), AARCH64_INSN_LOGIC_##type)
/* Rd = Rn OP Rm */
#define A64_AND(sf, Rd, Rn, Rm) A64_LOGIC_SREG(sf, Rd, Rn, Rm, AND)
#define A64_ORR(sf, Rd, Rn, Rm) A64_LOGIC_SREG(sf, Rd, Rn, Rm, ORR)
#define A64_EOR(sf, Rd, Rn, Rm) A64_LOGIC_SREG(sf, Rd, Rn, Rm, EOR)
#define A64_ANDS(sf, Rd, Rn, Rm) A64_LOGIC_SREG(sf, Rd, Rn, Rm, AND_SETFLAGS)
/* Rn & Rm; set condition flags */
#define A64_TST(sf, Rn, Rm) A64_ANDS(sf, A64_ZR, Rn, Rm)
/* Rd = ~Rm (alias of ORN with A64_ZR as Rn) */
#define A64_MVN(sf, Rd, Rm) \
        A64_LOGIC_SREG(sf, Rd, A64_ZR, Rm, ORN)
/* Logical (immediate) */
#define A64_LOGIC_IMM(sf, Rd, Rn, imm, type) ({ \
        u64 imm64 = (sf) ? (u64)imm : (u64)(u32)imm; \
        aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_##type, \
                A64_VARIANT(sf), Rn, Rd, imm64); \
})
/* Rd = Rn OP imm */
#define A64_AND_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND)
#define A64_ORR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, ORR)
#define A64_EOR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, EOR)
#define A64_ANDS_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND_SETFLAGS)
/* Rn & imm; set condition flags */
#define A64_TST_I(sf, Rn, imm) A64_ANDS_I(sf, A64_ZR, Rn, imm)
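/*
 * Example: BPF_ALU32_IMM(BPF_AND, R0, 0x80000001) can be encoded directly as
 *
 *      and w7, w7, #0x80000001         // A64_AND_I
 * instead of
 *      mov  w10, #0x8000ffff
 *      movk w10, #0x1
 *      and  w7, w7, w10
 *
 * If the value is not encodable as a logical immediate, the JIT falls back
 * to a temporary register.
 */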
/* HINTs */
#define A64_HINT(x) aarch64_insn_gen_hint(x)
#define A64_PACIASP A64_HINT(AARCH64_INSN_HINT_PACIASP)
#define A64_AUTIASP A64_HINT(AARCH64_INSN_HINT_AUTIASP)
/* BTI */
#define A64_BTI_C A64_HINT(AARCH64_INSN_HINT_BTIC)
#define A64_BTI_J A64_HINT(AARCH64_INSN_HINT_BTIJ)
#define A64_BTI_JC A64_HINT(AARCH64_INSN_HINT_BTIJC)
#define A64_NOP A64_HINT(AARCH64_INSN_HINT_NOP)
/* DMB */
#define A64_DMB_ISH aarch64_insn_gen_dmb(AARCH64_INSN_MB_ISH)
#endif /* _BPF_JIT_H */