2020-05-09 20:59:11 +03:00
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */
# include <linux/init.h>
# include <linux/namei.h>
# include <linux/pid_namespace.h>
# include <linux/fs.h>
# include <linux/fdtable.h>
# include <linux/filter.h>
2020-07-20 19:34:03 +03:00
# include <linux/btf_ids.h>
2021-11-06 02:23:29 +03:00
# include "mmap_unlock_work.h"
2020-05-09 20:59:11 +03:00
/* State shared by all task-based iterators in this file. */
struct bpf_iter_seq_task_common {
	/* Taken in init_seq_pidns() and dropped in fini_seq_pidns(). */
	struct pid_namespace *ns;
};
/* Private seq_file state of the "task" iterator. */
struct bpf_iter_seq_task_info {
	/* Has to remain the first member: init_seq_pidns() and
	 * fini_seq_pidns() access the private data as a
	 * struct bpf_iter_seq_task_common.
	 */
	struct bpf_iter_seq_task_common common;
	/* tid from which the next task lookup starts */
	u32 tid;
};
static struct task_struct * task_seq_get_next ( struct pid_namespace * ns ,
2020-09-02 05:31:12 +03:00
u32 * tid ,
bool skip_if_dup_files )
2020-05-09 20:59:11 +03:00
{
struct task_struct * task = NULL ;
struct pid * pid ;
rcu_read_lock ( ) ;
2020-05-14 08:51:37 +03:00
retry :
2020-08-19 01:23:10 +03:00
pid = find_ge_pid ( * tid , ns ) ;
2020-05-14 08:51:37 +03:00
if ( pid ) {
2020-08-19 01:23:10 +03:00
* tid = pid_nr_ns ( pid , ns ) ;
2020-05-09 20:59:11 +03:00
task = get_pid_task ( pid , PIDTYPE_PID ) ;
2020-05-14 08:51:37 +03:00
if ( ! task ) {
+ + * tid ;
goto retry ;
2020-12-18 21:50:31 +03:00
} else if ( skip_if_dup_files & & ! thread_group_leader ( task ) & &
2020-09-02 05:31:12 +03:00
task - > files = = task - > group_leader - > files ) {
put_task_struct ( task ) ;
task = NULL ;
+ + * tid ;
goto retry ;
2020-05-14 08:51:37 +03:00
}
}
2020-05-09 20:59:11 +03:00
rcu_read_unlock ( ) ;
return task ;
}
/* seq_file ->start for the "task" iterator.
 *
 * Returns the task found at or after the saved tid (holding a reference),
 * or NULL when there is none. *@pos is moved from 0 to 1 only when a task
 * was found.
 */
static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_info *priv = seq->private;
	struct task_struct *first;

	first = task_seq_get_next(priv->common.ns, &priv->tid, false);
	if (first && *pos == 0)
		++*pos;

	return first;
}
/* seq_file ->next for the "task" iterator.
 *
 * Drops the reference on the task just shown (@v), advances *@pos and the
 * saved tid, and returns the following task or NULL.
 */
static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_info *priv = seq->private;
	struct task_struct *shown = v;

	++*pos;
	++priv->tid;
	put_task_struct(shown);

	return task_seq_get_next(priv->common.ns, &priv->tid, false);
}
/* Context passed to programs attached to the "task" iterator; both slots
 * are filled in by __task_seq_show().
 */
struct bpf_iter__task {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
};

/* NOTE(review): DEFINE_BPF_ITER_FUNC is defined outside this file. */
DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
/* Run the attached program, if there is one, for @task.
 *
 * @in_stop is handed on to bpf_iter_get_info(). Returns 0 when that call
 * yields no program, otherwise whatever bpf_iter_run_prog() returns.
 */
static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
			   bool in_stop)
{
	struct bpf_iter__task ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int ret = 0;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (prog) {
		ctx.meta = &meta;
		ctx.task = task;
		ret = bpf_iter_run_prog(prog, &ctx);
	}

	return ret;
}
/* seq_file ->show: run the program on the task returned by start/next. */
static int task_seq_show(struct seq_file *seq, void *v)
{
	struct task_struct *task = v;

	return __task_seq_show(seq, task, false);
}
static void task_seq_stop ( struct seq_file * seq , void * v )
{
if ( ! v )
( void ) __task_seq_show ( seq , v , true ) ;
else
put_task_struct ( ( struct task_struct * ) v ) ;
}
/* seq_file callbacks of the "task" iterator. */
static const struct seq_operations task_seq_ops = {
	.start	= task_seq_start,
	.stop	= task_seq_stop,
	.next	= task_seq_next,
	.show	= task_seq_show,
};
/* Private seq_file state of the "task_file" iterator. */
struct bpf_iter_seq_task_file_info {
	/* Has to remain the first member: init_seq_pidns() and
	 * fini_seq_pidns() access the private data as a
	 * struct bpf_iter_seq_task_common.
	 */
	struct bpf_iter_seq_task_common common;
	/* task currently being walked, NULL when none is held */
	struct task_struct *task;
	/* tid of that task / tid to resume the task lookup from */
	u32 tid;
	/* fd to resume the file lookup from */
	u32 fd;
};
static struct file *
2020-11-20 03:28:33 +03:00
task_file_seq_get_next ( struct bpf_iter_seq_task_file_info * info )
2020-05-09 20:59:11 +03:00
{
struct pid_namespace * ns = info - > common . ns ;
2020-11-21 02:14:33 +03:00
u32 curr_tid = info - > tid ;
2020-05-09 20:59:11 +03:00
struct task_struct * curr_task ;
2020-11-21 02:14:33 +03:00
unsigned int curr_fd = info - > fd ;
2020-05-09 20:59:11 +03:00
/* If this function returns a non-NULL file object,
2020-11-21 02:14:33 +03:00
* it held a reference to the task / file .
2020-05-09 20:59:11 +03:00
* Otherwise , it does not hold any reference .
*/
again :
2020-11-20 03:28:33 +03:00
if ( info - > task ) {
curr_task = info - > task ;
2020-05-09 20:59:11 +03:00
curr_fd = info - > fd ;
} else {
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf
Daniel Borkmann says:
====================
pull-request: bpf 2020-12-28
The following pull-request contains BPF updates for your *net* tree.
There is a small merge conflict between bpf tree commit 69ca310f3416
("bpf: Save correct stopping point in file seq iteration") and net tree
commit 66ed594409a1 ("bpf/task_iter: In task_file_seq_get_next use
task_lookup_next_fd_rcu"). The get_files_struct() does not exist anymore
in net, so take the hunk in HEAD and add the `info->tid = curr_tid` to
the error path:
[...]
curr_task = task_seq_get_next(ns, &curr_tid, true);
if (!curr_task) {
info->task = NULL;
info->tid = curr_tid;
return NULL;
}
/* set info->task and info->tid */
[...]
We've added 10 non-merge commits during the last 9 day(s) which contain
a total of 11 files changed, 75 insertions(+), 20 deletions(-).
The main changes are:
1) Various AF_XDP fixes such as fill/completion ring leak on failed bind and
fixing a race in skb mode's backpressure mechanism, from Magnus Karlsson.
2) Fix latency spikes on lockdep enabled kernels by adding a rescheduling
point to BPF hashtab initialization, from Eric Dumazet.
3) Fix a splat in task iterator by saving the correct stopping point in the
seq file iteration, from Jonathan Lemon.
4) Fix BPF maps selftest by adding retries in case hashtab returns EBUSY
errors on update/deletes, from Andrii Nakryiko.
5) Fix BPF selftest error reporting to something more user friendly if the
vmlinux BTF cannot be found, from Kamal Mostafa.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-12-29 02:20:48 +03:00
curr_task = task_seq_get_next ( ns , & curr_tid , true ) ;
if ( ! curr_task ) {
info - > task = NULL ;
info - > tid = curr_tid ;
return NULL ;
}
/* set info->task and info->tid */
2020-12-31 08:24:18 +03:00
info - > task = curr_task ;
2020-05-09 20:59:11 +03:00
if ( curr_tid = = info - > tid ) {
curr_fd = info - > fd ;
} else {
info - > tid = curr_tid ;
curr_fd = 0 ;
}
}
rcu_read_lock ( ) ;
2020-11-21 02:14:33 +03:00
for ( ; ; curr_fd + + ) {
2020-05-09 20:59:11 +03:00
struct file * f ;
2020-11-21 02:14:33 +03:00
f = task_lookup_next_fd_rcu ( curr_task , & curr_fd ) ;
2020-05-09 20:59:11 +03:00
if ( ! f )
2020-11-21 02:14:33 +03:00
break ;
2020-08-17 20:42:14 +03:00
if ( ! get_file_rcu ( f ) )
continue ;
2020-05-09 20:59:11 +03:00
/* set info->fd */
info - > fd = curr_fd ;
rcu_read_unlock ( ) ;
return f ;
}
/* the current task is done, go to the next task */
rcu_read_unlock ( ) ;
put_task_struct ( curr_task ) ;
2020-11-20 03:28:33 +03:00
info - > task = NULL ;
2020-05-09 20:59:11 +03:00
info - > fd = 0 ;
curr_tid = + + ( info - > tid ) ;
goto again ;
}
/* seq_file ->start for the "task_file" iterator.
 *
 * Clears the cached task, then fetches the file to show first. *@pos is
 * moved from 0 to 1 only when a file was found.
 */
static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *priv = seq->private;
	struct file *first;

	priv->task = NULL;
	first = task_file_seq_get_next(priv);
	if (!first)
		return NULL;

	if (*pos == 0)
		++*pos;

	return first;
}
/* seq_file ->next for the "task_file" iterator.
 *
 * Releases the file just shown (@v), advances *@pos and the saved fd, and
 * returns the following file or NULL.
 */
static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *priv = seq->private;
	struct file *shown = v;

	++*pos;
	++priv->fd;
	fput(shown);

	return task_file_seq_get_next(priv);
}
/* Context passed to programs attached to the "task_file" iterator; all
 * slots are filled in by __task_file_seq_show().
 */
struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __aligned(8);
	__bpf_md_ptr(struct file *, file);
};

/* NOTE(review): DEFINE_BPF_ITER_FUNC is defined outside this file. */
DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
		     struct task_struct *task, u32 fd,
		     struct file *file)
/* Run the attached program, if there is one, for @file.
 *
 * The task and fd passed to the program come from the iterator's private
 * state. @in_stop is handed on to bpf_iter_get_info(). Returns 0 when that
 * call yields no program, otherwise whatever bpf_iter_run_prog() returns.
 */
static int __task_file_seq_show(struct seq_file *seq, struct file *file,
				bool in_stop)
{
	struct bpf_iter_seq_task_file_info *priv = seq->private;
	struct bpf_iter__task_file ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int ret = 0;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (prog) {
		ctx.meta = &meta;
		ctx.task = priv->task;
		ctx.fd = priv->fd;
		ctx.file = file;
		ret = bpf_iter_run_prog(prog, &ctx);
	}

	return ret;
}
/* seq_file ->show: run the program on the file returned by start/next. */
static int task_file_seq_show(struct seq_file *seq, void *v)
{
	struct file *file = v;

	return __task_file_seq_show(seq, file, false);
}
static void task_file_seq_stop ( struct seq_file * seq , void * v )
{
struct bpf_iter_seq_task_file_info * info = seq - > private ;
if ( ! v ) {
( void ) __task_file_seq_show ( seq , v , true ) ;
} else {
fput ( ( struct file * ) v ) ;
put_task_struct ( info - > task ) ;
info - > task = NULL ;
}
}
2020-07-23 21:41:10 +03:00
static int init_seq_pidns ( void * priv_data , struct bpf_iter_aux_info * aux )
2020-05-09 20:59:11 +03:00
{
struct bpf_iter_seq_task_common * common = priv_data ;
common - > ns = get_pid_ns ( task_active_pid_ns ( current ) ) ;
return 0 ;
}
static void fini_seq_pidns ( void * priv_data )
{
struct bpf_iter_seq_task_common * common = priv_data ;
put_pid_ns ( common - > ns ) ;
}
/* seq_file callbacks of the "task_file" iterator. */
static const struct seq_operations task_file_seq_ops = {
	.start	= task_file_seq_start,
	.stop	= task_file_seq_stop,
	.next	= task_file_seq_next,
	.show	= task_file_seq_show,
};
2021-02-12 21:31:05 +03:00
/* Private seq_file state of the "task_vma" iterator. */
struct bpf_iter_seq_task_vma_info {
	/* Has to remain the first member: init_seq_pidns() and
	 * fini_seq_pidns() access the private data as a
	 * struct bpf_iter_seq_task_common.
	 */
	struct bpf_iter_seq_task_common common;
	/* task currently being walked, NULL when none is held */
	struct task_struct *task;
	/* vma returned last for that task */
	struct vm_area_struct *vma;
	/* tid of that task / tid to resume the task lookup from */
	u32 tid;
	/* range of the vma recorded before mmap_lock was dropped; used to
	 * pick the resume point with find_vma()
	 */
	unsigned long prev_vm_start;
	unsigned long prev_vm_end;
};
/* How task_vma_seq_get_next() picks the next vma of a task. */
enum bpf_task_vma_iter_find_op {
	/* use mm->mmap */
	task_vma_iter_first_vma,
	/* use curr_vma->vm_next */
	task_vma_iter_next_vma,
	/* use find_vma() to find next vma */
	task_vma_iter_find_vma,
};
/* Advance the "task_vma" iterator and return the next vma to show.
 *
 * Continues with the task cached in info->task when there is one; otherwise
 * looks up the next task with task_seq_get_next(), starting at info->tid.
 * Tasks whose ->mm is NULL are skipped.
 *
 * Returns the vma with info->task and info->vma set, or NULL with both
 * cleared. NULL is returned when no task is left or when
 * mmap_read_lock_killable() fails.
 */
static struct vm_area_struct *
task_vma_seq_get_next ( struct bpf_iter_seq_task_vma_info * info )
{
struct pid_namespace * ns = info - > common . ns ;
enum bpf_task_vma_iter_find_op op ;
struct vm_area_struct * curr_vma ;
struct task_struct * curr_task ;
u32 curr_tid = info - > tid ;
/* If this function returns a non-NULL vma, it holds a reference to
 * the task_struct, and holds read lock on vma->mm->mmap_lock.
 * If this function returns NULL, it does not hold any reference or
 * lock.
 *
 * NOTE(review): curr_task->mm is read below without an mm reference
 * being taken in this function - confirm what keeps the mm alive.
 */
if ( info - > task ) {
curr_task = info - > task ;
curr_vma = info - > vma ;
/* In case of lock contention, drop mmap_lock to unblock
 * the writer.
 *
 * After relock, call find_vma(mm, prev_vm_end - 1) to find
 * new vma to process.
 *
 *   +------+------+-----------+
 *   | VMA1 | VMA2 | VMA3      |
 *   +------+------+-----------+
 *   |      |      |           |
 *  4k     8k     16k         400k
 *
 * For example, curr_vma == VMA2. Before unlock, we set
 *
 *    prev_vm_start = 8k
 *    prev_vm_end   = 16k
 *
 * There are a few cases:
 *
 * 1) VMA2 is freed, but VMA3 exists.
 *
 *    find_vma() will return VMA3, just process VMA3.
 *
 * 2) VMA2 still exists.
 *
 *    find_vma() will return VMA2, process VMA2->next.
 *
 * 3) no more vma in this mm.
 *
 *    Process the next task.
 *
 * 4) find_vma() returns a different vma, VMA2'.
 *
 *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
 *         because we already covered the range;
 *    4.2) VMA2 and VMA2' covers different ranges, process
 *         VMA2'.
 */
if ( mmap_lock_is_contended ( curr_task - > mm ) ) {
info - > prev_vm_start = curr_vma - > vm_start ;
info - > prev_vm_end = curr_vma - > vm_end ;
op = task_vma_iter_find_vma ;
mmap_read_unlock ( curr_task - > mm ) ;
/* the lock is not held if this fails; finish only drops the task */
if ( mmap_read_lock_killable ( curr_task - > mm ) )
goto finish ;
} else {
op = task_vma_iter_next_vma ;
}
} else {
/* also entered from next_task below, with curr_tid already advanced */
again :
curr_task = task_seq_get_next ( ns , & curr_tid , true ) ;
if ( ! curr_task ) {
/* no task left: record the resume point and stop */
info - > tid = curr_tid + 1 ;
goto finish ;
}
if ( curr_tid ! = info - > tid ) {
info - > tid = curr_tid ;
/* new task, process the first vma */
op = task_vma_iter_first_vma ;
} else {
/* Found the same tid, which means the user space
 * finished data in previous buffer and read more.
 * We dropped mmap_lock before returning to user
 * space, so it is necessary to use find_vma() to
 * find the next vma to process.
 */
op = task_vma_iter_find_vma ;
}
/* nothing to walk for a task without an mm */
if ( ! curr_task - > mm )
goto next_task ;
if ( mmap_read_lock_killable ( curr_task - > mm ) )
goto finish ;
}
/* mmap_lock of curr_task->mm is held for reading from here on */
switch ( op ) {
case task_vma_iter_first_vma :
curr_vma = curr_task - > mm - > mmap ;
break ;
case task_vma_iter_next_vma :
curr_vma = curr_vma - > vm_next ;
break ;
case task_vma_iter_find_vma :
/* We dropped mmap_lock so it is necessary to use find_vma
 * to find the next vma. This is similar to the mechanism
 * in show_smaps_rollup().
 */
curr_vma = find_vma ( curr_task - > mm , info - > prev_vm_end - 1 ) ;
/* case 1) and 4.2) above just use curr_vma */
/* check for case 2) or case 4.1) above */
if ( curr_vma & &
curr_vma - > vm_start = = info - > prev_vm_start & &
curr_vma - > vm_end = = info - > prev_vm_end )
curr_vma = curr_vma - > vm_next ;
break ;
}
if ( ! curr_vma ) {
/* case 3) above, or case 2) 4.1) with vma->next == NULL */
mmap_read_unlock ( curr_task - > mm ) ;
goto next_task ;
}
/* success: the task reference and the read lock stay held */
info - > task = curr_task ;
info - > vma = curr_vma ;
return curr_vma ;
/* done with this task: drop it and look for one with a higher tid */
next_task :
put_task_struct ( curr_task ) ;
info - > task = NULL ;
curr_tid + + ;
goto again ;
/* terminal path: release the task, if any, and clear the cached state */
finish :
if ( curr_task )
put_task_struct ( curr_task ) ;
info - > task = NULL ;
info - > vma = NULL ;
return NULL ;
}
/* seq_file ->start for the "task_vma" iterator.
 *
 * Fetches the vma to show first. *@pos is moved from 0 to 1 only when a vma
 * was found.
 */
static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *priv = seq->private;
	struct vm_area_struct *first = task_vma_seq_get_next(priv);

	if (!first)
		return NULL;

	if (*pos == 0)
		++*pos;

	return first;
}
/* seq_file ->next for the "task_vma" iterator: advance *@pos and return
 * the following vma, or NULL. @v is not used.
 */
static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *priv = seq->private;

	++*pos;

	return task_vma_seq_get_next(priv);
}
/* Context passed to programs attached to the "task_vma" iterator; all
 * slots are filled in by __task_vma_seq_show().
 */
struct bpf_iter__task_vma {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	__bpf_md_ptr(struct vm_area_struct *, vma);
};

/* NOTE(review): DEFINE_BPF_ITER_FUNC is defined outside this file. */
DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
		     struct task_struct *task, struct vm_area_struct *vma)
/* Run the attached program, if there is one, on the task and vma cached in
 * the iterator's private state.
 *
 * @in_stop is handed on to bpf_iter_get_info(). Returns 0 when that call
 * yields no program, otherwise whatever bpf_iter_run_prog() returns.
 */
static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
	struct bpf_iter_seq_task_vma_info *priv = seq->private;
	struct bpf_iter__task_vma ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int ret = 0;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (prog) {
		ctx.meta = &meta;
		ctx.task = priv->task;
		ctx.vma = priv->vma;
		ret = bpf_iter_run_prog(prog, &ctx);
	}

	return ret;
}
static int task_vma_seq_show ( struct seq_file * seq , void * v )
{
return __task_vma_seq_show ( seq , false ) ;
}
static void task_vma_seq_stop ( struct seq_file * seq , void * v )
{
struct bpf_iter_seq_task_vma_info * info = seq - > private ;
if ( ! v ) {
( void ) __task_vma_seq_show ( seq , true ) ;
} else {
/* info->vma has not been seen by the BPF program. If the
* user space reads more , task_vma_seq_get_next should
* return this vma again . Set prev_vm_start to ~ 0UL ,
* so that we don ' t skip the vma returned by the next
* find_vma ( ) ( case task_vma_iter_find_vma in
* task_vma_seq_get_next ( ) ) .
*/
info - > prev_vm_start = ~ 0UL ;
info - > prev_vm_end = info - > vma - > vm_end ;
mmap_read_unlock ( info - > task - > mm ) ;
put_task_struct ( info - > task ) ;
info - > task = NULL ;
}
}
/* seq_file callbacks of the "task_vma" iterator. */
static const struct seq_operations task_vma_seq_ops = {
	.start	= task_vma_seq_start,
	.stop	= task_vma_seq_stop,
	.next	= task_vma_seq_next,
	.show	= task_vma_seq_show,
};
2020-07-23 21:41:09 +03:00
static const struct bpf_iter_seq_info task_seq_info = {
2020-05-13 21:02:19 +03:00
. seq_ops = & task_seq_ops ,
. init_seq_private = init_seq_pidns ,
. fini_seq_private = fini_seq_pidns ,
. seq_priv_size = sizeof ( struct bpf_iter_seq_task_info ) ,
2020-07-23 21:41:09 +03:00
} ;
static struct bpf_iter_reg task_reg_info = {
. target = " task " ,
bpf: Permit cond_resched for some iterators
Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with
bpf task/task_file iterator") tries to fix rcu stalls warning
which is caused by bpf task_file iterator when running
"bpftool prog".
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
...
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
The fix is to limit the number of bpf program runs to be
one million. This fixed the program in most cases. But
we also found under heavy load, which can increase the wallclock
time for bpf_seq_read(), the warning may still be possible.
For example, calling bpf_delay() in the "while" loop of
bpf_seq_read(), which will introduce artificial delay,
the warning will show up in my qemu run.
static unsigned q;
volatile unsigned *p = &q;
volatile unsigned long long ll;
static void bpf_delay(void)
{
int i, j;
for (i = 0; i < 10000; i++)
for (j = 0; j < 10000; j++)
ll += *p;
}
There are two ways to fix this issue. One is to reduce the above
one million threshold to say 100,000 and hopefully rcu warning will
not show up any more. Another is to introduce a target feature
which enables bpf_seq_read() calling cond_resched().
This patch took second approach as the first approach may cause
more -EAGAIN failures for read() syscalls. Note that not all bpf_iter
targets can permit cond_resched() in bpf_seq_read() as some, e.g.,
netlink seq iterator, rcu read lock critical section spans through
seq_ops->next() -> seq_ops->show() -> seq_ops->next().
For the kernel code with the above hack, "bpftool p" roughly takes
38 seconds to finish on my VM with 184 bpf program runs.
Using the following command, I am able to collect the number of
context switches:
perf stat -e context-switches -- ./bpftool p >& log
Without this patch,
69 context-switches
With this patch,
75 context-switches
This patch added additional 6 context switches, roughly every 6 seconds
to reschedule, to avoid lengthy no-rescheduling which may cause the
above RCU warnings.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com
2020-10-28 09:10:54 +03:00
. feature = BPF_ITER_RESCHED ,
2020-05-13 21:02:21 +03:00
. ctx_arg_info_size = 1 ,
. ctx_arg_info = {
{ offsetof ( struct bpf_iter__task , task ) ,
PTR_TO_BTF_ID_OR_NULL } ,
} ,
2020-07-23 21:41:09 +03:00
. seq_info = & task_seq_info ,
2020-05-13 21:02:19 +03:00
} ;
2020-07-23 21:41:09 +03:00
static const struct bpf_iter_seq_info task_file_seq_info = {
2020-05-13 21:02:19 +03:00
. seq_ops = & task_file_seq_ops ,
. init_seq_private = init_seq_pidns ,
. fini_seq_private = fini_seq_pidns ,
. seq_priv_size = sizeof ( struct bpf_iter_seq_task_file_info ) ,
2020-07-23 21:41:09 +03:00
} ;
static struct bpf_iter_reg task_file_reg_info = {
. target = " task_file " ,
bpf: Permit cond_resched for some iterators
Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with
bpf task/task_file iterator") tries to fix rcu stalls warning
which is caused by bpf task_file iterator when running
"bpftool prog".
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
...
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
The fix is to limit the number of bpf program runs to be
one million. This fixed the program in most cases. But
we also found under heavy load, which can increase the wallclock
time for bpf_seq_read(), the warning may still be possible.
For example, calling bpf_delay() in the "while" loop of
bpf_seq_read(), which will introduce artificial delay,
the warning will show up in my qemu run.
static unsigned q;
volatile unsigned *p = &q;
volatile unsigned long long ll;
static void bpf_delay(void)
{
int i, j;
for (i = 0; i < 10000; i++)
for (j = 0; j < 10000; j++)
ll += *p;
}
There are two ways to fix this issue. One is to reduce the above
one million threshold to say 100,000 and hopefully rcu warning will
not show up any more. Another is to introduce a target feature
which enables bpf_seq_read() calling cond_resched().
This patch took second approach as the first approach may cause
more -EAGAIN failures for read() syscalls. Note that not all bpf_iter
targets can permit cond_resched() in bpf_seq_read() as some, e.g.,
netlink seq iterator, rcu read lock critical section spans through
seq_ops->next() -> seq_ops->show() -> seq_ops->next().
For the kernel code with the above hack, "bpftool p" roughly takes
38 seconds to finish on my VM with 184 bpf program runs.
Using the following command, I am able to collect the number of
context switches:
perf stat -e context-switches -- ./bpftool p >& log
Without this patch,
69 context-switches
With this patch,
75 context-switches
This patch added additional 6 context switches, roughly every 6 seconds
to reschedule, to avoid lengthy no-rescheduling which may cause the
above RCU warnings.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com
2020-10-28 09:10:54 +03:00
. feature = BPF_ITER_RESCHED ,
2020-05-13 21:02:21 +03:00
. ctx_arg_info_size = 2 ,
. ctx_arg_info = {
{ offsetof ( struct bpf_iter__task_file , task ) ,
PTR_TO_BTF_ID_OR_NULL } ,
{ offsetof ( struct bpf_iter__task_file , file ) ,
PTR_TO_BTF_ID_OR_NULL } ,
} ,
2020-07-23 21:41:09 +03:00
. seq_info = & task_file_seq_info ,
2020-05-13 21:02:19 +03:00
} ;
2021-02-12 21:31:05 +03:00
/* seq_file description of the "task_vma" iterator: callbacks,
 * private-state size and the hooks that set up / tear down that state.
 */
static const struct bpf_iter_seq_info task_vma_seq_info = {
	.seq_ops		= &task_vma_seq_ops,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
};
/* Registration record of the "task_vma" iterator target, registered from
 * task_iter_init().
 */
static struct bpf_iter_reg task_vma_reg_info = {
	.target			= "task_vma",
	/* Permit bpf_seq_read() to call cond_resched() for this target. */
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		/* the btf_id of each entry is filled in by task_iter_init() */
		{ offsetof(struct bpf_iter__task_vma, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_vma, vma),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_vma_seq_info,
};
2021-11-06 02:23:29 +03:00
/* BPF helper: find the vma of @task that contains @start and invoke
 * @callback_fn on it.
 *
 * The callback receives the task, the vma and @callback_ctx; its return
 * value is ignored.
 *
 * Returns:
 *   -EINVAL  @flags is non-zero
 *   -ENOENT  @task is NULL, it has no mm, or no vma contains @start
 *   -EBUSY   bpf_mmap_unlock_get_irq_work() reports busy, or
 *            mmap_read_trylock() fails
 *   0        the callback was invoked
 */
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
	struct mmap_unlock_irq_work *work = NULL;
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	int ret;

	if (flags)
		return -EINVAL;

	mm = task ? task->mm : NULL;
	if (!mm)
		return -ENOENT;

	/* the trylock is only attempted when the irq_work is not busy */
	if (bpf_mmap_unlock_get_irq_work(&work) || !mmap_read_trylock(mm))
		return -EBUSY;

	vma = find_vma(mm, start);
	if (vma && vma->vm_start <= start && start < vma->vm_end) {
		callback_fn((u64)(long)task, (u64)(long)vma,
			    (u64)(long)callback_ctx, 0, 0);
		ret = 0;
	} else {
		ret = -ENOENT;
	}

	bpf_mmap_unlock_mm(work, mm);

	return ret;
}
const struct bpf_func_proto bpf_find_vma_proto = {
. func = bpf_find_vma ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_BTF_ID ,
2021-11-12 18:02:43 +03:00
. arg1_btf_id = & btf_tracing_ids [ BTF_TRACING_TYPE_TASK ] ,
2021-11-06 02:23:29 +03:00
. arg2_type = ARG_ANYTHING ,
. arg3_type = ARG_PTR_TO_FUNC ,
. arg4_type = ARG_PTR_TO_STACK_OR_NULL ,
. arg5_type = ARG_ANYTHING ,
} ;
/* Per-CPU mmap-unlock work item; the irq_work of every instance is
 * initialised in task_iter_init() to run do_mmap_read_unlock().
 */
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
static void do_mmap_read_unlock ( struct irq_work * entry )
{
struct mmap_unlock_irq_work * work ;
if ( WARN_ON_ONCE ( IS_ENABLED ( CONFIG_PREEMPT_RT ) ) )
return ;
work = container_of ( entry , struct mmap_unlock_irq_work , irq_work ) ;
mmap_read_unlock_non_owner ( work - > mm ) ;
}
2020-05-09 20:59:11 +03:00
/* Late initcall: prepare the per-CPU mmap-unlock irq_work items, then
 * register the "task", "task_file" and "task_vma" iterator targets in that
 * order.
 *
 * The BTF ids of each target's context arguments are filled in right before
 * that target is registered. Registration stops at the first failure and
 * its error code is returned; targets registered earlier are left in place.
 */
static int __init task_iter_init(void)
{
	int cpu, err;

	for_each_possible_cpu(cpu) {
		struct mmap_unlock_irq_work *slot;

		slot = per_cpu_ptr(&mmap_unlock_work, cpu);
		init_irq_work(&slot->irq_work, do_mmap_read_unlock);
	}

	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	err = bpf_iter_reg_target(&task_reg_info);

	if (!err) {
		task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
		task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
		err = bpf_iter_reg_target(&task_file_reg_info);
	}

	if (!err) {
		task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
		task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
		err = bpf_iter_reg_target(&task_vma_reg_info);
	}

	return err;
}
late_initcall(task_iter_init);