From 71330842ff93ae67a066c1fa68d75672527312fa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 9 Aug 2021 21:45:32 +0200
Subject: [PATCH 1/5] bpf: Add _kernel suffix to internal lockdown_bpf_read

Rename LOCKDOWN_BPF_READ into LOCKDOWN_BPF_READ_KERNEL so we have naming
more consistent with a LOCKDOWN_BPF_WRITE_USER option that we are adding.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/security.h | 2 +-
 kernel/bpf/helpers.c     | 4 ++--
 kernel/trace/bpf_trace.c | 8 ++++----
 security/security.c      | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/security.h b/include/linux/security.h
index 24eda04221e9..724d7a4a0c91 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -123,7 +123,7 @@ enum lockdown_reason {
 	LOCKDOWN_INTEGRITY_MAX,
 	LOCKDOWN_KCORE,
 	LOCKDOWN_KPROBES,
-	LOCKDOWN_BPF_READ,
+	LOCKDOWN_BPF_READ_KERNEL,
 	LOCKDOWN_PERF,
 	LOCKDOWN_TRACEFS,
 	LOCKDOWN_XMON_RW,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 62cf00383910..0b04553e8c44 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1070,12 +1070,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_probe_read_user:
 		return &bpf_probe_read_user_proto;
 	case BPF_FUNC_probe_read_kernel:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_kernel_proto;
 	case BPF_FUNC_probe_read_user_str:
 		return &bpf_probe_read_user_str_proto;
 	case BPF_FUNC_probe_read_kernel_str:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_kernel_str_proto;
 	case BPF_FUNC_snprintf_btf:
 		return &bpf_snprintf_btf_proto;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b4916ef388ad..1836591197a5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -999,19 +999,19 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_probe_read_user:
 		return &bpf_probe_read_user_proto;
 	case BPF_FUNC_probe_read_kernel:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_kernel_proto;
 	case BPF_FUNC_probe_read_user_str:
 		return &bpf_probe_read_user_str_proto;
 	case BPF_FUNC_probe_read_kernel_str:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_kernel_str_proto;
 #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	case BPF_FUNC_probe_read:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_compat_proto;
 	case BPF_FUNC_probe_read_str:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_compat_str_proto;
 #endif
 #ifdef CONFIG_CGROUPS
diff --git a/security/security.c b/security/security.c
index 09533cbb7221..6b83ab4e9d66 100644
--- a/security/security.c
+++ b/security/security.c
@@ -61,7 +61,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
 	[LOCKDOWN_INTEGRITY_MAX] = "integrity",
 	[LOCKDOWN_KCORE] = "/proc/kcore access",
 	[LOCKDOWN_KPROBES] = "use of kprobes",
-	[LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM",
+	[LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM",
 	[LOCKDOWN_PERF] = "unsafe use of perf",
 	[LOCKDOWN_TRACEFS] = "use of tracefs",
 	[LOCKDOWN_XMON_RW] = "xmon read and write access",

From 51e1bb9eeaf7868db56e58f47848e364ab4c4129 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 9 Aug 2021 12:43:17 +0200
Subject: [PATCH 2/5] bpf: Add lockdown check for probe_write_user helper

Back then, commit 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper
to be called in tracers") added the bpf_probe_write_user() helper in order
to allow to override user space memory. Its original goal was to have a
facility to "debug, divert, and manipulate execution of semi-cooperative
processes" under CAP_SYS_ADMIN. Write to kernel was explicitly disallowed
since it would otherwise tamper with its integrity.

One use case was shown in cf9b1199de27 ("samples/bpf: Add test/example of
using bpf_probe_write_user bpf helper") where the program DNATs traffic
at the time of connect(2) syscall, meaning, it rewrites the arguments to
a syscall while they're still in userspace, and before the syscall has a
chance to copy the argument into kernel space. These days we have better
mechanisms in BPF for achieving the same (e.g. for load-balancers), but
without having to write to userspace memory.

Of course the bpf_probe_write_user() helper can also be used to abuse
many other things for both good or bad purpose. Outside of BPF, there is
a similar mechanism for ptrace(2) such as PTRACE_PEEK{TEXT,DATA} and
PTRACE_POKE{TEXT,DATA}, but would likely require some more effort.
Commit 96ae52279594 explicitly dedicated the helper for experimentation
purpose only. Thus, move the helper's availability behind a newly added
LOCKDOWN_BPF_WRITE_USER lockdown knob so that the helper is disabled under
the "integrity" mode. More fine-grained control can be implemented also
from LSM side with this change.

Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/security.h | 1 +
 kernel/trace/bpf_trace.c | 5 +++--
 security/security.c      | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/linux/security.h b/include/linux/security.h
index 724d7a4a0c91..5b7288521300 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -120,6 +120,7 @@ enum lockdown_reason {
 	LOCKDOWN_MMIOTRACE,
 	LOCKDOWN_DEBUGFS,
 	LOCKDOWN_XMON_WR,
+	LOCKDOWN_BPF_WRITE_USER,
 	LOCKDOWN_INTEGRITY_MAX,
 	LOCKDOWN_KCORE,
 	LOCKDOWN_KPROBES,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1836591197a5..fdd14072fc3b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -990,12 +990,13 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_numa_node_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
-	case BPF_FUNC_probe_write_user:
-		return bpf_get_probe_write_proto();
 	case BPF_FUNC_current_task_under_cgroup:
 		return &bpf_current_task_under_cgroup_proto;
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
+	case BPF_FUNC_probe_write_user:
+		return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+		       NULL : bpf_get_probe_write_proto();
 	case BPF_FUNC_probe_read_user:
 		return &bpf_probe_read_user_proto;
 	case BPF_FUNC_probe_read_kernel:
diff --git a/security/security.c b/security/security.c
index 6b83ab4e9d66..9ffa9e9c5c55 100644
--- a/security/security.c
+++ b/security/security.c
@@ -58,6 +58,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
 	[LOCKDOWN_MMIOTRACE] = "unsafe mmio",
 	[LOCKDOWN_DEBUGFS] = "debugfs access",
 	[LOCKDOWN_XMON_WR] = "xmon write access",
+	[LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM",
 	[LOCKDOWN_INTEGRITY_MAX] = "integrity",
 	[LOCKDOWN_KCORE] = "/proc/kcore access",
 	[LOCKDOWN_KPROBES] = "use of kprobes",

From 87b7b5335e6995a6d64fca98fc67b92b29caac9c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 9 Aug 2021 16:51:51 -0700
Subject: [PATCH 3/5] bpf: Add missing bpf_read_[un]lock_trace() for syscall
 program

Commit 79a7f8bdb159d ("bpf: Introduce bpf_sys_bpf() helper and program type.")
added support for syscall program, which is a sleepable program.

But the program run missed bpf_read_lock_trace()/bpf_read_unlock_trace(),
which is needed to ensure proper rcu callback invocations. This patch adds
bpf_read_[un]lock_trace() properly.

Fixes: 79a7f8bdb159d ("bpf: Introduce bpf_sys_bpf() helper and program type.")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210809235151.1663680-1-yhs@fb.com
---
 net/bpf/test_run.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 1cc75c811e24..caa16bf30fb5 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -7,6 +7,7 @@
 #include <linux/vmalloc.h>
 #include <linux/etherdevice.h>
 #include <linux/filter.h>
+#include <linux/rcupdate_trace.h>
 #include <linux/sched/signal.h>
 #include <net/bpf_sk_storage.h>
 #include <net/sock.h>
@@ -951,7 +952,10 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 			goto out;
 		}
 	}
+
+	rcu_read_lock_trace();
 	retval = bpf_prog_run_pin_on_cpu(prog, ctx);
+	rcu_read_unlock_trace();
 
 	if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) {
 		err = -EFAULT;

From a2baf4e8bb0f306fbed7b5e6197c02896a638ab5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 9 Aug 2021 18:04:13 -0700
Subject: [PATCH 4/5] bpf: Fix potentially incorrect results with
 bpf_get_local_storage()

Commit b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage()
helper") fixed a bug for bpf_get_local_storage() helper so different tasks
won't mess up with each other's percpu local storage.

The percpu data contains 8 slots so it can hold up to 8 contexts (same or
different tasks), for 8 different program runs, at the same time. This in
general is sufficient. But our internal testing showed the following warning
multiple times:

  [...]
  warning: WARNING: CPU: 13 PID: 41661 at include/linux/bpf-cgroup.h:193
     __cgroup_bpf_run_filter_sock_ops+0x13e/0x180
  RIP: 0010:__cgroup_bpf_run_filter_sock_ops+0x13e/0x180
  <IRQ>
   tcp_call_bpf.constprop.99+0x93/0xc0
   tcp_conn_request+0x41e/0xa50
   ? tcp_rcv_state_process+0x203/0xe00
   tcp_rcv_state_process+0x203/0xe00
   ? sk_filter_trim_cap+0xbc/0x210
   ? tcp_v6_inbound_md5_hash.constprop.41+0x44/0x160
   tcp_v6_do_rcv+0x181/0x3e0
   tcp_v6_rcv+0xc65/0xcb0
   ip6_protocol_deliver_rcu+0xbd/0x450
   ip6_input_finish+0x11/0x20
   ip6_input+0xb5/0xc0
   ip6_sublist_rcv_finish+0x37/0x50
   ip6_sublist_rcv+0x1dc/0x270
   ipv6_list_rcv+0x113/0x140
   __netif_receive_skb_list_core+0x1a0/0x210
   netif_receive_skb_list_internal+0x186/0x2a0
   gro_normal_list.part.170+0x19/0x40
   napi_complete_done+0x65/0x150
   mlx5e_napi_poll+0x1ae/0x680
   __napi_poll+0x25/0x120
   net_rx_action+0x11e/0x280
   __do_softirq+0xbb/0x271
   irq_exit_rcu+0x97/0xa0
   common_interrupt+0x7f/0xa0
   </IRQ>
   asm_common_interrupt+0x1e/0x40
  RIP: 0010:bpf_prog_1835a9241238291a_tw_egress+0x5/0xbac
   ? __cgroup_bpf_run_filter_skb+0x378/0x4e0
   ? do_softirq+0x34/0x70
   ? ip6_finish_output2+0x266/0x590
   ? ip6_finish_output+0x66/0xa0
   ? ip6_output+0x6c/0x130
   ? ip6_xmit+0x279/0x550
   ? ip6_dst_check+0x61/0xd0
  [...]

Using drgn [0] to dump the percpu buffer contents showed that on this CPU
slot 0 is still available, but slots 1-7 are occupied and those tasks in
slots 1-7 mostly don't exist any more. So we might have issues in
bpf_cgroup_storage_unset().

Further debugging confirmed that there is a bug in bpf_cgroup_storage_unset().
Currently, it tries to unset "current" slot with searching from the start.
So the following sequence is possible:

  1. A task is running and claims slot 0
  2. Running BPF program is done, and it checked slot 0 has the "task"
     and ready to reset it to NULL (not yet).
  3. An interrupt happens, another BPF program runs and it claims slot 1
     with the *same* task.
  4. The unset() in interrupt context releases slot 0 since it matches "task".
  5. Interrupt is done, the task in process context reset slot 0.

At the end, slot 1 is not reset and the same process can continue to occupy
slots 2-7 and finally, when the above step 1-5 is repeated again, step 3 BPF
program won't be able to claim an empty slot and a warning will be issued.

To fix the issue, for unset() function, we should traverse from the last slot
to the first. This way, the above issue can be avoided.

The same reverse traversal should also be done in bpf_get_local_storage() helper
itself. Otherwise, incorrect local storage may be returned to BPF program.

  [0] https://github.com/osandov/drgn

Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210810010413.1976277-1-yhs@fb.com
---
 include/linux/bpf-cgroup.h | 4 ++--
 kernel/bpf/helpers.c       | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8b77d08d4b47..6c9b10d82c80 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -201,8 +201,8 @@ static inline void bpf_cgroup_storage_unset(void)
 {
 	int i;
 
-	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
-		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+	for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
+		if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
 			continue;
 
 		this_cpu_write(bpf_cgroup_storage_info[i].task, NULL);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 0b04553e8c44..7a97b2f4747d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -397,8 +397,8 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	void *ptr;
 	int i;
 
-	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
-		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+	for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
+		if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
 			continue;
 
 		storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);

From 019d0454c61707879cf9853c894e0a191f6b9774 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 9 Aug 2021 14:52:29 -0700
Subject: [PATCH 5/5] bpf, core: Fix kernel-doc notation

Fix kernel-doc warnings in kernel/bpf/core.c (found by scripts/kernel-doc
and W=1 builds). That is, correct a function name in a comment and add
return descriptions for 2 functions.

Fixes these kernel-doc warnings:

  kernel/bpf/core.c:1372: warning: expecting prototype for __bpf_prog_run(). Prototype was for ___bpf_prog_run() instead
  kernel/bpf/core.c:1372: warning: No description found for return value of '___bpf_prog_run'
  kernel/bpf/core.c:1883: warning: No description found for return value of 'bpf_prog_select_runtime'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210809215229.7556-1-rdunlap@infradead.org
---
 kernel/bpf/core.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b1a5fc04492b..0a28a8095d3e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1362,11 +1362,13 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
 }
 
 /**
- *	__bpf_prog_run - run eBPF program on a given context
+ *	___bpf_prog_run - run eBPF program on a given context
  *	@regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
  *	@insn: is the array of eBPF instructions
  *
  * Decode and execute eBPF instructions.
+ *
+ * Return: whatever value is in %BPF_R0 at program exit
  */
 static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
 {
@@ -1878,6 +1880,9 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
  *
  * Try to JIT eBPF program, if JIT is not available, use interpreter.
  * The BPF program will be executed via BPF_PROG_RUN() macro.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {