2018-08-16 11:20:54 -04:00
// SPDX-License-Identifier: GPL-2.0
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
2016-09-01 18:37:22 -07:00
* Copyright ( c ) 2016 Facebook
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
*/
# include <linux/kernel.h>
# include <linux/types.h>
# include <linux/slab.h>
# include <linux/bpf.h>
2016-09-01 18:37:22 -07:00
# include <linux/bpf_perf_event.h>
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
# include <linux/filter.h>
# include <linux/uaccess.h>
2015-03-25 12:49:22 -07:00
# include <linux/ctype.h>
2017-12-11 11:36:48 -05:00
# include <linux/kprobes.h>
bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.
There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
to this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.
This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-05-24 11:21:09 -07:00
# include <linux/syscalls.h>
2018-01-13 02:55:03 +09:00
# include <linux/error-injection.h>
2017-12-11 11:36:48 -05:00
2019-04-25 17:11:43 -07:00
# include <asm/tlb.h>
2017-12-11 11:36:48 -05:00
# include "trace_probe.h"
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
# include "trace.h"
2019-05-28 14:14:44 -07:00
/*
 * Read the RCU-protected pointer @p through rcu_dereference_protected(),
 * passing lockdep_is_held(&bpf_event_mutex) as the protection condition.
 * NOTE(review): callers are presumably expected to hold bpf_event_mutex;
 * that mutex is defined outside this part of the file - confirm.
 */
# define bpf_event_rcu_dereference(p) \
rcu_dereference_protected ( p , lockdep_is_held ( & bpf_event_mutex ) )
2018-12-12 16:42:37 -08:00
# ifdef CONFIG_MODULES
/* One entry of the bpf_trace_modules list; each entry refers to one module. */
struct bpf_trace_module {
/* Module whose bpf_raw_events[] array is scanned by name lookups. */
struct module * module ;
/* Linkage into the bpf_trace_modules list. */
struct list_head list ;
} ;
/* List of bpf_trace_module entries, linked through their 'list' member. */
static LIST_HEAD ( bpf_trace_modules ) ;
/* Taken by bpf_get_raw_tracepoint_module() while it walks bpf_trace_modules. */
static DEFINE_MUTEX ( bpf_module_mutex ) ;
/*
 * Look up a raw tracepoint by name among the modules recorded on
 * bpf_trace_modules.
 *
 * The list is walked with bpf_module_mutex held. The search stops at the
 * first entry whose tp->name equals @name: if try_module_get() succeeds for
 * the owning module, that entry is returned (this function does not drop the
 * reference again); if it fails, NULL is returned without looking further.
 *
 * Returns the matching bpf_raw_event_map, or NULL when nothing matched or
 * the module reference could not be taken.
 */
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
	struct bpf_raw_event_map *found = NULL;
	struct bpf_trace_module *entry;

	mutex_lock(&bpf_module_mutex);

	list_for_each_entry(entry, &bpf_trace_modules, list) {
		struct module *mod = entry->module;
		unsigned int idx;

		for (idx = 0; idx < mod->num_bpf_raw_events; idx++) {
			struct bpf_raw_event_map *candidate = &mod->bpf_raw_events[idx];

			if (strcmp(candidate->tp->name, name) != 0)
				continue;

			/* First name match ends the search, even if the ref fails. */
			if (try_module_get(mod))
				found = candidate;
			goto unlock;
		}
	}

unlock:
	mutex_unlock(&bpf_module_mutex);
	return found;
}
# else
/*
 * Variant built when CONFIG_MODULES is not defined: @name is ignored and
 * the lookup always yields NULL.
 */
static struct bpf_raw_event_map * bpf_get_raw_tracepoint_module ( const char * name )
{
return NULL ;
}
# endif /* CONFIG_MODULES */
bpf: remove tail_call and get_stackid helper declarations from bpf.h
commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related
syscall") included linux/bpf.h in linux/security.h. As a result, bpf
programs including bpf_helpers.h and some other header that ends up
pulling in also security.h, such as several examples under samples/bpf,
fail to compile because bpf_tail_call and bpf_get_stackid are now
"redefined as different kind of symbol".
From bpf.h:
u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
Whereas in bpf_helpers.h they are:
static void (*bpf_tail_call)(void *ctx, void *map, int index);
static int (*bpf_get_stackid)(void *ctx, void *map, int flags);
Fix this by removing the unused declaration of bpf_tail_call and moving
the declaration of bpf_get_stackid in bpf_trace.c, which is the only
place where it's needed.
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-26 01:47:42 +00:00
/* Prototype only; the definition is not in the visible part of this file. */
u64 bpf_get_stackid ( u64 r1 , u64 r2 , u64 r3 , u64 r4 , u64 r5 ) ;
2018-04-28 22:28:08 -07:00
/* Prototype only; the definition is not in the visible part of this file. */
u64 bpf_get_stack ( u64 r1 , u64 r2 , u64 r3 , u64 r4 , u64 r5 ) ;
bpf: remove tail_call and get_stackid helper declarations from bpf.h
commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related
syscall") included linux/bpf.h in linux/security.h. As a result, bpf
programs including bpf_helpers.h and some other header that ends up
pulling in also security.h, such as several examples under samples/bpf,
fail to compile because bpf_tail_call and bpf_get_stackid are now
"redefined as different kind of symbol".
From bpf.h:
u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
Whereas in bpf_helpers.h they are:
static void (*bpf_tail_call)(void *ctx, void *map, int index);
static int (*bpf_get_stackid)(void *ctx, void *map, int flags);
Fix this by removing the unused declaration of bpf_tail_call and moving
the declaration of bpf_get_stackid in bpf_trace.c, which is the only
place where it's needed.
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-26 01:47:42 +00:00
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
/**
* trace_call_bpf - invoke BPF program
2017-10-23 23:53:08 -07:00
* @ call : tracepoint event
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
* @ ctx : opaque context pointer
*
* kprobe handlers execute BPF programs via this helper .
* Can be used from static tracepoints in the future .
*
* Return : BPF programs always return an integer which is interpreted by
* kprobe handler as :
* 0 - return from kprobe ( event is filtered out )
* 1 - store kprobe event into ring buffer
* Other values are reserved and currently alias to 1
*/
2017-10-23 23:53:08 -07:00
unsigned int trace_call_bpf ( struct trace_event_call * call , void * ctx )
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
{
unsigned int ret ;
if ( in_nmi ( ) ) /* not supported yet */
return 1 ;
2020-02-24 15:01:37 +01:00
cant_sleep ( ) ;
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
if ( unlikely ( __this_cpu_inc_return ( bpf_prog_active ) ! = 1 ) ) {
/*
* since some bpf program is already running on this cpu ,
* don ' t call into another bpf program ( same or different )
* and don ' t send kprobe event into ring - buffer ,
* so return zero here
*/
ret = 0 ;
goto out ;
}
2017-10-23 23:53:08 -07:00
/*
* Instead of moving rcu_read_lock / rcu_dereference / rcu_read_unlock
* to all call sites , we did a bpf_prog_array_valid ( ) there to check
* whether call - > prog_array is empty or not , which is
* a heurisitc to speed up execution .
*
* If bpf_prog_array_valid ( ) fetched prog_array was
* non - NULL , we go into trace_call_bpf ( ) and do the actual
* proper rcu_dereference ( ) under RCU lock .
* If it turns out that prog_array is NULL then , we bail out .
* For the opposite , if the bpf_prog_array_valid ( ) fetched pointer
* was NULL , you ' ll skip the prog_array with the risk of missing
* out of events when it was updated in between this and the
* rcu_dereference ( ) which is accepted risk .
*/
ret = BPF_PROG_RUN_ARRAY_CHECK ( call - > prog_array , ctx , BPF_PROG_RUN ) ;
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
out :
__this_cpu_dec ( bpf_prog_active ) ;
return ret ;
}
2017-12-11 11:36:48 -05:00
# ifdef CONFIG_BPF_KPROBE_OVERRIDE
/*
 * BPF helper: store @rc as the return value in @regs via
 * regs_set_return_value(), then call override_function_with_return() on
 * the same register set. Always returns 0 to the calling program.
 */
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
	regs_set_return_value(regs, rc);
	override_function_with_return(regs);

	return 0;
}
/*
 * Descriptor for the bpf_override_return helper: flagged gpl_only, returns
 * an integer, takes a context pointer as the first argument and an
 * unconstrained value (ARG_ANYTHING) as the second.
 */
static const struct bpf_func_proto bpf_override_return_proto = {
. func = bpf_override_return ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_CTX ,
. arg2_type = ARG_ANYTHING ,
} ;
# endif
2020-06-08 21:34:40 -07:00
/*
 * Shared body of the user-memory read helper.
 *
 * Reads @size bytes from the user-space address @unsafe_ptr into @dst using
 * probe_user_read(). When that call reports a negative value, the whole
 * destination buffer is cleared before returning.
 * NOTE(review): the clearing presumably keeps BPF programs from observing
 * stale or partially written buffer contents - confirm.
 *
 * Returns the value from probe_user_read() unchanged.
 */
static __always_inline int
bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr)
{
	int err = probe_user_read(dst, unsafe_ptr, size);

	if (unlikely(err < 0)) {
		/* Failed read: hand back an all-zero buffer. */
		memset(dst, 0, size);
	}

	return err;
}
2020-06-08 21:34:40 -07:00
/*
 * BPF helper entry point for reading user memory: forwards its three
 * arguments to bpf_probe_read_user_common() and returns that result.
 */
BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
	   const void __user *, unsafe_ptr)
{
	int rc = bpf_probe_read_user_common(dst, size, unsafe_ptr);

	return rc;
}
2020-05-24 09:50:55 -07:00
const struct bpf_func_proto bpf_probe_read_user_proto = {
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
. func = bpf_probe_read_user ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_UNINIT_MEM ,
. arg2_type = ARG_CONST_SIZE_OR_ZERO ,
. arg3_type = ARG_ANYTHING ,
} ;
2020-06-08 21:34:40 -07:00
/*
 * Common implementation behind the user-space string read helper.
 *
 * @dst:        destination buffer, @size bytes long
 * @size:       capacity of @dst in bytes
 * @unsafe_ptr: user-space address to copy the string from
 *
 * Copies via strncpy_from_user_nofault().  If that call reports an error
 * (negative return), the whole destination buffer is zeroed before the
 * error is propagated, so @dst never holds stale contents after a failure.
 *
 * Returns the value returned by strncpy_from_user_nofault().
 */
static __always_inline int
bpf_probe_read_user_str_common(void *dst, u32 size,
			       const void __user *unsafe_ptr)
{
	int ret;

	ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
	if (unlikely(ret < 0))
		memset(dst, 0, size);
	return ret;
}
2020-06-08 21:34:40 -07:00
/*
 * BPF helper bpf_probe_read_user_str(dst, size, unsafe_ptr).
 *
 * Thin entry point: passes its three arguments straight through to
 * bpf_probe_read_user_str_common() and returns that call's result.
 */
BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
	   const void __user *, unsafe_ptr)
{
	return bpf_probe_read_user_str_common(dst, size, unsafe_ptr);
}
2020-05-24 09:50:55 -07:00
const struct bpf_func_proto bpf_probe_read_user_str_proto = {
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
. func = bpf_probe_read_user_str ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_UNINIT_MEM ,
. arg2_type = ARG_CONST_SIZE_OR_ZERO ,
. arg3_type = ARG_ANYTHING ,
} ;
static __always_inline int
2020-06-08 21:34:40 -07:00
bpf_probe_read_kernel_common ( void * dst , u32 size , const void * unsafe_ptr )
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
{
int ret = security_locked_down ( LOCKDOWN_BPF_READ ) ;
2019-08-19 17:17:59 -07:00
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
if ( unlikely ( ret < 0 ) )
2020-06-08 21:34:40 -07:00
goto fail ;
2020-06-08 21:34:50 -07:00
ret = probe_kernel_read ( dst , unsafe_ptr , size ) ;
2016-04-13 00:10:52 +02:00
if ( unlikely ( ret < 0 ) )
2020-06-08 21:34:40 -07:00
goto fail ;
return ret ;
fail :
memset ( dst , 0 , size ) ;
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
return ret ;
}
2016-04-13 00:10:52 +02:00
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Let's fix the BPF side by making use of the recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
/*
 * BPF helper bpf_probe_read_kernel(dst, size, unsafe_ptr).
 *
 * Thin entry point: passes its three arguments straight through to
 * bpf_probe_read_kernel_common() and returns that call's result.
 */
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
	   const void *, unsafe_ptr)
{
	return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
}
2020-05-24 09:50:55 -07:00
const struct bpf_func_proto bpf_probe_read_kernel_proto = {
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
. func = bpf_probe_read_kernel ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_UNINIT_MEM ,
. arg2_type = ARG_CONST_SIZE_OR_ZERO ,
. arg3_type = ARG_ANYTHING ,
} ;
static __always_inline int
2020-06-08 21:34:40 -07:00
bpf_probe_read_kernel_str_common ( void * dst , u32 size , const void * unsafe_ptr )
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
{
int ret = security_locked_down ( LOCKDOWN_BPF_READ ) ;
if ( unlikely ( ret < 0 ) )
2020-06-08 21:34:40 -07:00
goto fail ;
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
/*
2020-06-08 21:34:40 -07:00
* The strncpy_from_kernel_nofault ( ) call will likely not fill the
* entire buffer , but that ' s okay in this circumstance as we ' re probing
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
* arbitrary memory anyway similar to bpf_probe_read_ * ( ) and might
* as well probe the stack . Thus , memory is explicitly cleared
* only in error case , so that improper users ignoring return
* code altogether don ' t copy garbage ; otherwise length of string
* is returned that can be used for bpf_perf_event_output ( ) et al .
*/
2020-06-08 21:34:40 -07:00
ret = strncpy_from_kernel_nofault ( dst , unsafe_ptr , size ) ;
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
if ( unlikely ( ret < 0 ) )
2020-06-08 21:34:40 -07:00
goto fail ;
2020-06-15 22:04:30 -07:00
return ret ;
2020-06-08 21:34:40 -07:00
fail :
memset ( dst , 0 , size ) ;
2016-04-13 00:10:52 +02:00
return ret ;
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
}
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Let's fix the BPF side by making use of the recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
/*
 * BPF helper bpf_probe_read_kernel_str(dst, size, unsafe_ptr).
 *
 * Thin entry point: passes its three arguments straight through to
 * bpf_probe_read_kernel_str_common() and returns that call's result.
 */
BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
	   const void *, unsafe_ptr)
{
	return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
2020-05-24 09:50:55 -07:00
/*
 * Prototype record for the bpf_probe_read_kernel_str helper: marked
 * GPL-only, integer return, with argument types declared as
 * ARG_PTR_TO_UNINIT_MEM (dst), ARG_CONST_SIZE_OR_ZERO (size) and
 * ARG_ANYTHING (unsafe_ptr). Unlike the compat protos below it is not
 * static, so it is visible outside this file.
 */
const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
. func = bpf_probe_read_kernel_str ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_UNINIT_MEM ,
. arg2_type = ARG_CONST_SIZE_OR_ZERO ,
. arg3_type = ARG_ANYTHING ,
} ;
2020-06-08 21:34:40 -07:00
# ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
/*
 * Compat read helper, built only when
 * CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE is set.
 *
 * The numeric value of unsafe_ptr selects the read path: addresses at or
 * above TASK_SIZE go through bpf_probe_read_kernel_common(), everything
 * below is read as a user pointer via bpf_probe_read_user_common().
 * The result of whichever routine ran is returned unchanged.
 */
BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
	   const void *, unsafe_ptr)
{
	if ((unsigned long)unsafe_ptr >= TASK_SIZE)
		return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);

	return bpf_probe_read_user_common(dst, size,
					  (__force void __user *)unsafe_ptr);
}
/*
 * Prototype record for bpf_probe_read_compat: GPL-only, integer return,
 * arguments declared as (uninitialized memory, constant size or zero,
 * anything).
 */
static const struct bpf_func_proto bpf_probe_read_compat_proto = {
	.func		= bpf_probe_read_compat,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,	/* dst */
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,	/* size */
	.arg3_type	= ARG_ANYTHING,			/* unsafe_ptr */
};
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Let's fix the BPF side by making use of the recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
/*
 * Compat string-read helper. The numeric value of unsafe_ptr selects the
 * read path: addresses below TASK_SIZE are read as user pointers through
 * bpf_probe_read_user_str_common(), all other addresses go through
 * bpf_probe_read_kernel_str_common(). The result of whichever routine ran
 * is returned unchanged.
 */
BPF_CALL_3 ( bpf_probe_read_compat_str , void * , dst , u32 , size ,
const void * , unsafe_ptr )
{
2020-06-08 21:34:40 -07:00
if ( ( unsigned long ) unsafe_ptr < TASK_SIZE ) {
return bpf_probe_read_user_str_common ( dst , size ,
( __force void __user * ) unsafe_ptr ) ;
}
return bpf_probe_read_kernel_str_common ( dst , size , unsafe_ptr ) ;
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
}
/*
 * Prototype record for bpf_probe_read_compat_str: GPL-only, integer
 * return, arguments declared as ARG_PTR_TO_UNINIT_MEM (dst),
 * ARG_CONST_SIZE_OR_ZERO (size) and ARG_ANYTHING (unsafe_ptr).
 */
static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
. func = bpf_probe_read_compat_str ,
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
. gpl_only = true ,
. ret_type = RET_INTEGER ,
2017-01-09 10:19:50 -08:00
. arg1_type = ARG_PTR_TO_UNINIT_MEM ,
2017-11-12 14:49:10 -08:00
. arg2_type = ARG_CONST_SIZE_OR_ZERO ,
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
. arg3_type = ARG_ANYTHING ,
} ;
2020-06-08 21:34:40 -07:00
# endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
2019-11-02 00:17:58 +01:00
/*
 * BPF helper: write size bytes from src to the user-space address
 * unsafe_ptr by way of probe_user_write().
 *
 * Returns -EPERM without attempting the write when any of these holds:
 *  - in_interrupt() is true,
 *  - the current task has PF_KTHREAD or PF_EXITING set,
 *  - uaccess_kernel() is true,
 *  - nmi_uaccess_okay() is false.
 * Otherwise the return value of probe_user_write() is passed through.
 */
BPF_CALL_3 ( bpf_probe_write_user , void __user * , unsafe_ptr , const void * , src ,
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
u32 , size )
2016-07-25 05:54:46 -07:00
{
/*
* Ensure we ' re in user context which is safe for the helper to
* run . This helper has no business in a kthread .
*
* access_ok ( ) should prevent writing to non - user memory , but in
* some situations ( nommu , temporary switch , etc ) access_ok ( ) does
* not provide enough validation , hence the check on KERNEL_DS .
2019-04-25 17:11:43 -07:00
*
* nmi_uaccess_okay ( ) ensures the probe is not run in an interim
* state , when the task or mm are switched . This is specifically
* required to prevent the use of temporary mm .
2016-07-25 05:54:46 -07:00
*/
if ( unlikely ( in_interrupt ( ) | |
current - > flags & ( PF_KTHREAD | PF_EXITING ) ) )
return - EPERM ;
2017-03-20 21:08:07 -04:00
if ( unlikely ( uaccess_kernel ( ) ) )
2016-07-25 05:54:46 -07:00
return - EPERM ;
2019-04-25 17:11:43 -07:00
if ( unlikely ( ! nmi_uaccess_okay ( ) ) )
return - EPERM ;
2016-07-25 05:54:46 -07:00
2019-11-02 00:17:58 +01:00
return probe_user_write ( unsafe_ptr , src , size ) ;
2016-07-25 05:54:46 -07:00
}
/*
 * Prototype record for bpf_probe_write_user: GPL-only, integer return,
 * arguments declared as ARG_ANYTHING (unsafe_ptr), ARG_PTR_TO_MEM (src)
 * and ARG_CONST_SIZE (size). Handed out by bpf_get_probe_write_proto().
 */
static const struct bpf_func_proto bpf_probe_write_user_proto = {
. func = bpf_probe_write_user ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_ANYTHING ,
2017-01-09 10:19:50 -08:00
. arg2_type = ARG_PTR_TO_MEM ,
. arg3_type = ARG_CONST_SIZE ,
2016-07-25 05:54:46 -07:00
} ;
/*
 * Gatekeeper for the bpf_probe_write_user prototype.
 *
 * Returns NULL when the caller lacks CAP_SYS_ADMIN. Otherwise logs a
 * rate-limited warning carrying the comm and pid of the current task and
 * returns a pointer to bpf_probe_write_user_proto.
 */
static const struct bpf_func_proto * bpf_get_probe_write_proto ( void )
{
2020-05-13 16:03:54 -07:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return NULL ;
2016-07-25 05:54:46 -07:00
pr_warn_ratelimited ( " %s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory! " ,
current - > comm , task_pid_nr ( current ) ) ;
return & bpf_probe_write_user_proto ;
}
2020-06-08 21:34:30 -07:00
/*
 * Copy the string at unsafe_ptr into buf for bpf_trace_printk, passing
 * bufsz as the limit to the nofault copy routine.
 *
 * buf[0] is cleared up front, so buf holds an empty string if fmt_ptype
 * matches no case. The return value of the copy routines is not checked.
 *
 * fmt_ptype selects the copy routine:
 *  - k: strncpy_from_kernel_nofault()
 *  - u: strncpy_from_user_nofault()
 *  - s: with CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE, addresses
 *       below TASK_SIZE use the user routine and all others fall through
 *       to the kernel routine; without that option s is handled by the
 *       same statements as k.
 */
static void bpf_trace_copy_string ( char * buf , void * unsafe_ptr , char fmt_ptype ,
size_t bufsz )
{
void __user * user_ptr = ( __force void __user * ) unsafe_ptr ;
buf [ 0 ] = 0 ;
switch ( fmt_ptype ) {
case ' s ' :
# ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
2020-06-08 21:34:33 -07:00
if ( ( unsigned long ) unsafe_ptr < TASK_SIZE ) {
strncpy_from_user_nofault ( buf , user_ptr , bufsz ) ;
break ;
}
fallthrough ;
2020-06-08 21:34:30 -07:00
# endif
case ' k ' :
strncpy_from_kernel_nofault ( buf , unsafe_ptr , bufsz ) ;
break ;
case ' u ' :
strncpy_from_user_nofault ( buf , user_ptr , bufsz ) ;
break ;
}
}
2015-03-25 12:49:22 -07:00
/*
2017-07-02 02:13:29 +02:00
* Only limited trace_printk ( ) conversion specifiers allowed :
2020-05-15 12:11:18 +02:00
* % d % i % u % x % ld % li % lu % lx % lld % lli % llu % llx % p % pks % pus % s
2015-03-25 12:49:22 -07:00
*/
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
/*
 * BPF helper: validate a printk-style format string and emit it through
 * __trace_printk() together with the three u64 arguments.
 *
 * fmt must end in a NUL at fmt[fmt_size - 1] and may contain only printable
 * or whitespace ASCII characters. At most three conversions are accepted,
 * and only one of them may be a string conversion (%s, %pks or %pus); the
 * string it points to is copied into the local 64-byte buf by
 * bpf_trace_copy_string() and the matching argument is redirected to buf.
 *
 * mod[n] counts the width modifiers seen for conversion n (one per l, and
 * one for p or s); the __BPF_ARGn_TP() macros use it together with
 * __BITS_PER_LONG to pass each argument as u64, long or u32.
 *
 * Returns -EINVAL for a rejected format string, otherwise the value of
 * __trace_printk().
 */
BPF_CALL_5 ( bpf_trace_printk , char * , fmt , u32 , fmt_size , u64 , arg1 ,
u64 , arg2 , u64 , arg3 )
2015-03-25 12:49:22 -07:00
{
2020-05-15 12:11:18 +02:00
int i , mod [ 3 ] = { } , fmt_cnt = 0 ;
char buf [ 64 ] , fmt_ptype ;
void * unsafe_ptr = NULL ;
2015-08-28 15:56:23 -07:00
bool str_seen = false ;
2015-03-25 12:49:22 -07:00
/*
* bpf_check ( ) - > check_func_arg ( ) - > check_stack_boundary ( )
* guarantees that fmt points to bpf program stack ,
* fmt_size bytes of it were initialized and fmt_size > 0
*/
if ( fmt [ - - fmt_size ] ! = 0 )
return - EINVAL ;
/* check format string for allowed specifiers */
for ( i = 0 ; i < fmt_size ; i + + ) {
if ( ( ! isprint ( fmt [ i ] ) & & ! isspace ( fmt [ i ] ) ) | | ! isascii ( fmt [ i ] ) )
return - EINVAL ;
if ( fmt [ i ] ! = ' % ' )
continue ;
if ( fmt_cnt > = 3 )
return - EINVAL ;
/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
i + + ;
if ( fmt [ i ] = = ' l ' ) {
mod [ fmt_cnt ] + + ;
i + + ;
2020-05-15 12:11:18 +02:00
} else if ( fmt [ i ] = = ' p ' ) {
2015-03-25 12:49:22 -07:00
mod [ fmt_cnt ] + + ;
2020-05-15 12:11:18 +02:00
if ( ( fmt [ i + 1 ] = = ' k ' | |
fmt [ i + 1 ] = = ' u ' ) & &
fmt [ i + 2 ] = = ' s ' ) {
fmt_ptype = fmt [ i + 1 ] ;
i + = 2 ;
goto fmt_str ;
}
2018-11-23 17:43:26 +01:00
/* disallow any further format extensions */
if ( fmt [ i + 1 ] ! = 0 & &
! isspace ( fmt [ i + 1 ] ) & &
! ispunct ( fmt [ i + 1 ] ) )
2015-03-25 12:49:22 -07:00
return - EINVAL ;
2020-05-15 12:11:18 +02:00
goto fmt_next ;
} else if ( fmt [ i ] = = ' s ' ) {
mod [ fmt_cnt ] + + ;
fmt_ptype = fmt [ i ] ;
fmt_str :
if ( str_seen )
/* allow only one '%s' per fmt string */
return - EINVAL ;
str_seen = true ;
if ( fmt [ i + 1 ] ! = 0 & &
! isspace ( fmt [ i + 1 ] ) & &
! ispunct ( fmt [ i + 1 ] ) )
return - EINVAL ;
/* point the argument for this conversion at buf and keep the original pointer */
switch ( fmt_cnt ) {
case 0 :
unsafe_ptr = ( void * ) ( long ) arg1 ;
arg1 = ( long ) buf ;
break ;
case 1 :
unsafe_ptr = ( void * ) ( long ) arg2 ;
arg2 = ( long ) buf ;
break ;
case 2 :
unsafe_ptr = ( void * ) ( long ) arg3 ;
arg3 = ( long ) buf ;
break ;
}
2020-06-08 21:34:30 -07:00
bpf_trace_copy_string ( buf , unsafe_ptr , fmt_ptype ,
sizeof ( buf ) ) ;
2020-05-15 12:11:18 +02:00
goto fmt_next ;
2015-03-25 12:49:22 -07:00
}
/* a second l (as in %lld) bumps mod[] to 2 */
if ( fmt [ i ] = = ' l ' ) {
mod [ fmt_cnt ] + + ;
i + + ;
}
2017-07-02 02:13:29 +02:00
if ( fmt [ i ] ! = ' i ' & & fmt [ i ] ! = ' d ' & &
fmt [ i ] ! = ' u ' & & fmt [ i ] ! = ' x ' )
2015-03-25 12:49:22 -07:00
return - EINVAL ;
2020-05-15 12:11:18 +02:00
fmt_next :
2015-03-25 12:49:22 -07:00
fmt_cnt + + ;
}
2017-08-16 01:45:33 +02:00
/* Horrid workaround for getting va_list handling working with different
* argument type combinations generically for 32 and 64 bit archs .
*/
# define __BPF_TP_EMIT() __BPF_ARG3_TP()
# define __BPF_TP(...) \
2018-01-17 09:19:32 -08:00
__trace_printk ( 0 /* Fake ip */ , \
2017-08-16 01:45:33 +02:00
fmt , # # __VA_ARGS__ )
# define __BPF_ARG1_TP(...) \
( ( mod [ 0 ] = = 2 | | ( mod [ 0 ] = = 1 & & __BITS_PER_LONG = = 64 ) ) \
? __BPF_TP ( arg1 , # # __VA_ARGS__ ) \
: ( ( mod [ 0 ] = = 1 | | ( mod [ 0 ] = = 0 & & __BITS_PER_LONG = = 32 ) ) \
? __BPF_TP ( ( long ) arg1 , # # __VA_ARGS__ ) \
: __BPF_TP ( ( u32 ) arg1 , # # __VA_ARGS__ ) ) )
# define __BPF_ARG2_TP(...) \
( ( mod [ 1 ] = = 2 | | ( mod [ 1 ] = = 1 & & __BITS_PER_LONG = = 64 ) ) \
? __BPF_ARG1_TP ( arg2 , # # __VA_ARGS__ ) \
: ( ( mod [ 1 ] = = 1 | | ( mod [ 1 ] = = 0 & & __BITS_PER_LONG = = 32 ) ) \
? __BPF_ARG1_TP ( ( long ) arg2 , # # __VA_ARGS__ ) \
: __BPF_ARG1_TP ( ( u32 ) arg2 , # # __VA_ARGS__ ) ) )
# define __BPF_ARG3_TP(...) \
( ( mod [ 2 ] = = 2 | | ( mod [ 2 ] = = 1 & & __BITS_PER_LONG = = 64 ) ) \
? __BPF_ARG2_TP ( arg3 , # # __VA_ARGS__ ) \
: ( ( mod [ 2 ] = = 1 | | ( mod [ 2 ] = = 0 & & __BITS_PER_LONG = = 32 ) ) \
? __BPF_ARG2_TP ( ( long ) arg3 , # # __VA_ARGS__ ) \
: __BPF_ARG2_TP ( ( u32 ) arg3 , # # __VA_ARGS__ ) ) )
return __BPF_TP_EMIT ( ) ;
2015-03-25 12:49:22 -07:00
}
/*
 * Prototype record for bpf_trace_printk: GPL-only, integer return. Only
 * the first two argument types are declared here: ARG_PTR_TO_MEM (fmt)
 * and ARG_CONST_SIZE (fmt_size). Handed out by
 * bpf_get_trace_printk_proto().
 */
static const struct bpf_func_proto bpf_trace_printk_proto = {
. func = bpf_trace_printk ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
2017-01-09 10:19:50 -08:00
. arg1_type = ARG_PTR_TO_MEM ,
. arg2_type = ARG_CONST_SIZE ,
2015-03-25 12:49:22 -07:00
} ;
2015-06-12 19:39:13 -07:00
/*
 * Hand out the bpf_trace_printk helper prototype.
 *
 * A program asking for this prototype might go on to call
 * bpf_trace_printk, so the per-cpu printk buffers are allocated first.
 */
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
	const struct bpf_func_proto *proto = &bpf_trace_printk_proto;

	trace_printk_init_buffers();

	return proto;
}
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers, bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object is still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on non-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 10:59:14 -07:00
/*
 * Most conversion arguments bpf_seq_printf accepts in one call; it sizes
 * that function's params[] array and exceeding it yields -E2BIG.
 */
# define MAX_SEQ_PRINTF_VARARGS 12
/*
 * Number of copy buffers in struct bpf_seq_printf_buf; bpf_seq_printf
 * returns -E2BIG once memcpy_cnt reaches this value.
 */
# define MAX_SEQ_PRINTF_MAX_MEMCPY 6
/* Size in bytes of each copy buffer. */
# define MAX_SEQ_PRINTF_STR_LEN 128
/*
 * Scratch space used by bpf_seq_printf: MAX_SEQ_PRINTF_MAX_MEMCPY buffers
 * of MAX_SEQ_PRINTF_STR_LEN bytes each.
 */
struct bpf_seq_printf_buf {
char buf [ MAX_SEQ_PRINTF_MAX_MEMCPY ] [ MAX_SEQ_PRINTF_STR_LEN ] ;
} ;
/* One set of scratch buffers per CPU. */
static DEFINE_PER_CPU ( struct bpf_seq_printf_buf , bpf_seq_printf_buf ) ;
/*
 * Per-cpu counter incremented on entry to bpf_seq_printf; a value above 1
 * there makes the call bail out with -EBUSY.
 */
static DEFINE_PER_CPU ( int , bpf_seq_printf_buf_used ) ;
/*
 * bpf_seq_printf - validate a BPF-supplied format string and print it,
 * together with its arguments, into a seq_file.
 *
 * @m:        seq_file to print into
 * @fmt:      format string; its last byte (fmt[fmt_size - 1]) must be NUL
 * @fmt_size: size of @fmt in bytes, including the terminating NUL
 * @data:     array of u64 arguments, one per conversion (may be NULL)
 * @data_len: size of @data in bytes; must be a multiple of 8
 *
 * Accepted conversions: %%, %s, %p, %pK, %px, %p{i,I}{4,6} and
 * %[l[l]]{i,d,u,x,X}, each with an optional "[0 +-][width]" prefix.
 * For %s and %p{i,I}{4,6} the pointee is first copied into a per-CPU
 * scratch buffer; if that copy fails the string becomes empty / the
 * address bytes become zero instead of failing the whole call.
 *
 * Return: 0 on success,
 *         -EBUSY if the per-CPU scratch buffer is already in use,
 *         -EINVAL for a malformed/unsupported format or too few arguments,
 *         -E2BIG for too many conversions or too many buffered copies,
 *         -EOVERFLOW if the seq_file overflowed.
 */
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
	   const void *, data, u32, data_len)
{
	int err = -EINVAL, nr_conv = 0, nr_copied = 0;
	int pos, nesting, addr_len, nr_args;
	u64 params[MAX_SEQ_PRINTF_VARARGS];
	struct bpf_seq_printf_buf *scratch;
	const u64 *args = data;

	/* Claim the per-CPU scratch buffers; they are not re-entrant. */
	nesting = this_cpu_inc_return(bpf_seq_printf_buf_used);
	if (WARN_ON_ONCE(nesting > 1)) {
		err = -EBUSY;
		goto out;
	}

	scratch = this_cpu_ptr(&bpf_seq_printf_buf);

	/*
	 * bpf_check()->check_func_arg()->check_stack_boundary()
	 * guarantees that fmt points to bpf program stack,
	 * fmt_size bytes of it were initialized and fmt_size > 0,
	 * so stepping back to the last byte is safe.
	 */
	fmt_size--;
	if (fmt[fmt_size] != 0)
		goto out;

	/* Arguments are passed as an array of 8-byte slots. */
	if (data_len & 7)
		goto out;

	/* Any real conversion (not "%%") requires an argument array. */
	for (pos = 0; pos < fmt_size; pos++) {
		if (fmt[pos] != '%')
			continue;

		if (fmt[pos + 1] == '%') {
			pos++;
			continue;
		}

		if (!data || !data_len)
			goto out;
	}

	nr_args = data_len / 8;

	/* Walk the format string and accept only the supported specifiers. */
	for (pos = 0; pos < fmt_size; pos++) {
		/* only printable ascii (plus whitespace) for now */
		if (!(isascii(fmt[pos]) &&
		      (isprint(fmt[pos]) || isspace(fmt[pos])))) {
			err = -EINVAL;
			goto out;
		}

		if (fmt[pos] != '%')
			continue;

		if (fmt[pos + 1] == '%') {
			pos++;
			continue;
		}

		if (nr_conv >= MAX_SEQ_PRINTF_VARARGS) {
			err = -E2BIG;
			goto out;
		}

		if (nr_conv >= nr_args) {
			err = -EINVAL;
			goto out;
		}

		/* fmt[pos] != 0 && fmt[last] == 0, so fmt[pos + 1] is readable */
		pos++;

		/* skip the optional "[0 +-][num]" flags/width field */
		while (fmt[pos] == '0' || fmt[pos] == '+' ||
		       fmt[pos] == '-' || fmt[pos] == ' ')
			pos++;

		if (fmt[pos] >= '1' && fmt[pos] <= '9') {
			do {
				pos++;
			} while (fmt[pos] >= '0' && fmt[pos] <= '9');
		}

		switch (fmt[pos]) {
		case 's': {
			void *str_addr;

			/* try our best to copy */
			if (nr_copied >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
				err = -E2BIG;
				goto out;
			}

			str_addr = (void *)(long)args[nr_conv];
			err = strncpy_from_kernel_nofault(scratch->buf[nr_copied],
							  str_addr,
							  MAX_SEQ_PRINTF_STR_LEN);
			/* unreadable source: print an empty string instead */
			if (err < 0)
				scratch->buf[nr_copied][0] = '\0';

			params[nr_conv] = (u64)(long)scratch->buf[nr_copied];
			nr_conv++;
			nr_copied++;
			continue;
		}
		case 'p': {
			char ext = fmt[pos + 1];

			if (ext == 0 || ext == 'K' || ext == 'x') {
				/* just kernel pointers, passed through as-is */
				params[nr_conv] = args[nr_conv];
				nr_conv++;
				continue;
			}

			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
			if (ext != 'i' && ext != 'I') {
				err = -EINVAL;
				goto out;
			}

			/* ext is non-NUL here, so fmt[pos + 2] is readable */
			if (fmt[pos + 2] != '4' && fmt[pos + 2] != '6') {
				err = -EINVAL;
				goto out;
			}

			if (nr_copied >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
				err = -E2BIG;
				goto out;
			}

			if (fmt[pos + 2] == '4')
				addr_len = 4;
			else
				addr_len = 16;

			err = probe_kernel_read(scratch->buf[nr_copied],
						(void *)(long)args[nr_conv],
						addr_len);
			/* unreadable source: print an all-zero address */
			if (err < 0)
				memset(scratch->buf[nr_copied], 0, addr_len);

			params[nr_conv] = (u64)(long)scratch->buf[nr_copied];
			pos += 2;
			nr_conv++;
			nr_copied++;
			continue;
		}
		case 'l':
			/* optional "l" or "ll" length modifier */
			pos++;
			if (fmt[pos] == 'l')
				pos++;
			break;
		default:
			break;
		}

		/* whatever remains must be a plain integer conversion */
		switch (fmt[pos]) {
		case 'i':
		case 'd':
		case 'u':
		case 'x':
		case 'X':
			break;
		default:
			err = -EINVAL;
			goto out;
		}

		params[nr_conv] = args[nr_conv];
		nr_conv++;
	}

	/*
	 * At most MAX_SEQ_PRINTF_VARARGS parameters can be in use, so simply
	 * hand all of them to seq_printf(); the validated format string
	 * determines how many are actually consumed.
	 */
	seq_printf(m, fmt, params[0], params[1], params[2], params[3],
		   params[4], params[5], params[6], params[7], params[8],
		   params[9], params[10], params[11]);

	if (seq_has_overflowed(m))
		err = -EOVERFLOW;
	else
		err = 0;
out:
	/* release the per-CPU scratch buffers on every exit path */
	this_cpu_dec(bpf_seq_printf_buf_used);
	return err;
}
/*
 * BTF id table referenced by bpf_seq_printf_proto.btf_id below.
 * NOTE(review): presumably one slot per helper argument (5), filled in
 * elsewhere -- confirm against the verifier code.
 */
static int bpf_seq_printf_btf_ids [ 5 ] ;
/*
 * Helper prototype for bpf_seq_printf(). The argN_type entries follow the
 * helper's parameter order: m, fmt, fmt_size, data, data_len.
 */
static const struct bpf_func_proto bpf_seq_printf_proto = {
. func = bpf_seq_printf ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
/* m */
. arg1_type = ARG_PTR_TO_BTF_ID ,
/* fmt / fmt_size */
. arg2_type = ARG_PTR_TO_MEM ,
. arg3_type = ARG_CONST_SIZE ,
/* data / data_len: the helper itself also handles data == NULL and data_len == 0 */
. arg4_type = ARG_PTR_TO_MEM_OR_NULL ,
. arg5_type = ARG_CONST_SIZE_OR_ZERO ,
. btf_id = bpf_seq_printf_btf_ids ,
} ;
/*
 * bpf_seq_write - BPF helper that appends @len raw bytes from @data to
 * the seq_file @m.
 *
 * Return: 0 on success, -EOVERFLOW if seq_write() reports failure.
 */
BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
{
	if (seq_write(m, data, len))
		return -EOVERFLOW;

	return 0;
}
/*
 * BTF id table referenced by bpf_seq_write_proto.btf_id below.
 * NOTE(review): sized 5 like the bpf_seq_printf table although this helper
 * takes 3 arguments -- presumably one slot per possible argument; confirm.
 */
static int bpf_seq_write_btf_ids [ 5 ] ;
/*
 * Helper prototype for bpf_seq_write(). The argN_type entries follow the
 * helper's parameter order: m, data, len.
 */
static const struct bpf_func_proto bpf_seq_write_proto = {
. func = bpf_seq_write ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
/* m */
. arg1_type = ARG_PTR_TO_BTF_ID ,
/* data / len */
. arg2_type = ARG_PTR_TO_MEM ,
. arg3_type = ARG_CONST_SIZE_OR_ZERO ,
. btf_id = bpf_seq_write_btf_ids ,
} ;
2017-10-05 09:19:20 -07:00
/*
 * get_map_perf_counter - read the perf event stored in one slot of a perf
 * event array map.
 *
 * @map:     the map; it is treated as embedded in a struct bpf_array
 * @flags:   slot selector; only bits in BPF_F_INDEX_MASK may be set, and
 *           the value BPF_F_CURRENT_CPU selects the slot of the running CPU
 * @value:   passed through to perf_event_read_local()
 * @enabled: passed through to perf_event_read_local()
 * @running: passed through to perf_event_read_local()
 *
 * Return: -EINVAL for unknown flag bits, -E2BIG if the slot is beyond
 * max_entries, -ENOENT if the slot is empty, otherwise whatever
 * perf_event_read_local() returns.
 */
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
		     u64 *value, u64 *enabled, u64 *running)
{
	struct bpf_array *perf_array = container_of(map, struct bpf_array, map);
	unsigned int this_cpu = smp_processor_id();
	struct bpf_event_entry *entry;
	u64 slot;

	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;

	slot = flags & BPF_F_INDEX_MASK;
	if (slot == BPF_F_CURRENT_CPU)
		slot = this_cpu;

	if (unlikely(slot >= perf_array->map.max_entries))
		return -E2BIG;

	/* Single read of the slot pointer, then test that snapshot. */
	entry = READ_ONCE(perf_array->ptrs[slot]);
	if (!entry)
		return -ENOENT;

	return perf_event_read_local(entry->event, value, enabled, running);
}
/*
 * bpf_perf_event_read - BPF helper returning the counter value of the perf
 * event selected by @flags in the perf event array @map.
 *
 * Return: the counter value on success, or the negative error from
 * get_map_perf_counter().
 */
BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
	u64 counter = 0;
	int ret;

	ret = get_map_perf_counter(map, flags, &counter, NULL, NULL);

	/*
	 * Errors and counter values share the single return value, so this
	 * api is ugly: we miss the [-22..-2] range of valid counter values,
	 * but that's uapi.
	 */
	if (!ret)
		return counter;

	return ret;
}
2015-10-22 17:10:14 -07:00
static const struct bpf_func_proto bpf_perf_event_read_proto = {
2015-08-06 07:02:35 +00:00
. func = bpf_perf_event_read ,
2015-10-23 14:58:19 -07:00
. gpl_only = true ,
2015-08-06 07:02:35 +00:00
. ret_type = RET_INTEGER ,
. arg1_type = ARG_CONST_MAP_PTR ,
. arg2_type = ARG_ANYTHING ,
} ;
2017-10-05 09:19:20 -07:00
/*
 * bpf_perf_event_read_value - BPF helper that fills @buf with the counter,
 * enabled and running values of the perf event selected by @flags in @map.
 *
 * @size must equal sizeof(struct bpf_perf_event_value).
 *
 * Return: 0 on success. On any failure @buf is zeroed over @size bytes and
 * a negative error is returned: -EINVAL for a wrong @size, otherwise the
 * error from get_map_perf_counter().
 */
BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
	   struct bpf_perf_event_value *, buf, u32, size)
{
	int err;

	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
		err = -EINVAL;
	else
		err = get_map_perf_counter(map, flags, &buf->counter,
					   &buf->enabled, &buf->running);

	/* Never hand a partially filled buffer back to the program. */
	if (unlikely(err))
		memset(buf, 0, size);

	return err;
}
/*
 * Helper prototype for bpf_perf_event_read_value(). The argN_type entries
 * follow the helper's parameter order: map, flags, buf, size.
 */
static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
. func = bpf_perf_event_read_value ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
/* map */
. arg1_type = ARG_CONST_MAP_PTR ,
/* flags */
. arg2_type = ARG_ANYTHING ,
/* buf / size: the helper writes buf on every path (result or zero fill) */
. arg3_type = ARG_PTR_TO_UNINIT_MEM ,
. arg4_type = ARG_CONST_SIZE ,
} ;
2016-07-14 18:08:04 +02:00
/*
 * __bpf_perf_event_output - emit a prepared sample on the perf event stored
 * in one slot of a perf event array map.
 *
 * @regs:  register state handed to perf_event_output()
 * @map:   the map; it is treated as embedded in a struct bpf_array
 * @flags: slot selector (masked with BPF_F_INDEX_MASK); the value
 *         BPF_F_CURRENT_CPU selects the slot of the running CPU
 * @sd:    sample data handed to perf_event_output()
 *
 * Return: -E2BIG if the slot is beyond max_entries, -ENOENT if the slot is
 * empty, -EINVAL if the event is not a PERF_COUNT_SW_BPF_OUTPUT software
 * event, -EOPNOTSUPP if the event's oncpu differs from the running CPU,
 * otherwise the result of perf_event_output().
 */
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
			u64 flags, struct perf_sample_data *sd)
{
	struct bpf_array *perf_array = container_of(map, struct bpf_array, map);
	unsigned int this_cpu = smp_processor_id();
	u64 slot = flags & BPF_F_INDEX_MASK;
	struct bpf_event_entry *entry;
	struct perf_event *target;

	if (slot == BPF_F_CURRENT_CPU)
		slot = this_cpu;

	if (unlikely(slot >= perf_array->map.max_entries))
		return -E2BIG;

	/* Single read of the slot pointer, then test that snapshot. */
	entry = READ_ONCE(perf_array->ptrs[slot]);
	if (!entry)
		return -ENOENT;

	target = entry->event;

	/* Only software events of the BPF output kind are acceptable. */
	if (unlikely(!(target->attr.type == PERF_TYPE_SOFTWARE &&
		       target->attr.config == PERF_COUNT_SW_BPF_OUTPUT)))
		return -EINVAL;

	/* The event must belong to the CPU we are running on. */
	if (unlikely(target->oncpu != this_cpu))
		return -EOPNOTSUPP;

	return perf_event_output(target, sd, regs);
}
2019-06-11 14:53:04 -07:00
/*
 * Support executing tracepoints in normal, irq, and nmi context that each call
 * bpf_perf_event_output: one perf_sample_data slot per nesting level.
 */
struct bpf_trace_sample_data {
struct perf_sample_data sds [ 3 ] ;
} ;
/* Per-CPU sample slots used by bpf_perf_event_output(). */
static DEFINE_PER_CPU ( struct bpf_trace_sample_data , bpf_trace_sds ) ;
/*
 * Per-CPU nesting depth of bpf_perf_event_output(); the helper increments it
 * on entry, uses (depth - 1) as the index into sds[], and decrements on exit.
 */
static DEFINE_PER_CPU ( int , bpf_trace_nest_level ) ;
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, makes future helpers easier to write,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
/*
 * bpf_perf_event_output - BPF helper that pushes @size bytes at @data as a
 * raw sample to the perf event selected by @flags in the perf event array
 * @map.
 *
 * A per-CPU nesting counter picks one of the per-CPU perf_sample_data slots
 * so that nested invocations on the same CPU do not share sample data.
 *
 * Return: -EBUSY if the nesting depth exceeds the number of slots, -EINVAL
 * if @flags has bits outside BPF_F_INDEX_MASK, otherwise the result of
 * __bpf_perf_event_output().
 */
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
{
	struct bpf_trace_sample_data *slots = this_cpu_ptr(&bpf_trace_sds);
	int depth = this_cpu_inc_return(bpf_trace_nest_level);
	struct perf_raw_record raw = {
		.frag = {
			.size = size,
			.data = data,
		},
	};
	struct perf_sample_data *sd;
	int err;

	if (WARN_ON_ONCE(depth > ARRAY_SIZE(slots->sds))) {
		err = -EBUSY;
	} else if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
		err = -EINVAL;
	} else {
		/* depth is 1-based, the slot array is 0-based */
		sd = &slots->sds[depth - 1];

		perf_sample_data_init(sd, 0, 0);
		sd->raw = &raw;

		err = __bpf_perf_event_output(regs, map, flags, sd);
	}

	/* undo the nesting increment on every path */
	this_cpu_dec(bpf_trace_nest_level);
	return err;
}
2015-10-20 20:02:34 -07:00
static const struct bpf_func_proto bpf_perf_event_output_proto = {
. func = bpf_perf_event_output ,
2015-10-23 14:58:19 -07:00
. gpl_only = true ,
2015-10-20 20:02:34 -07:00
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_CTX ,
. arg2_type = ARG_CONST_MAP_PTR ,
. arg3_type = ARG_ANYTHING ,
2017-01-09 10:19:50 -08:00
. arg4_type = ARG_PTR_TO_MEM ,
bpf: change bpf_perf_event_output arg5 type to ARG_CONST_SIZE_OR_ZERO
Commit 9fd29c08e520 ("bpf: improve verifier ARG_CONST_SIZE_OR_ZERO
semantics") relaxed the treatment of ARG_CONST_SIZE_OR_ZERO due to the way
the compiler generates optimized BPF code when checking boundaries of an
argument from C code. A typical example of this optimized code can be
generated using the bpf_perf_event_output helper when operating on variable
memory:
/* len is a generic scalar */
if (len > 0 && len <= 0x7fff)
bpf_perf_event_output(ctx, &perf_map, 0, buf, len);
110: (79) r5 = *(u64 *)(r10 -40)
111: (bf) r1 = r5
112: (07) r1 += -1
113: (25) if r1 > 0x7ffe goto pc+6
114: (bf) r1 = r6
115: (18) r2 = 0xffff94e5f166c200
117: (b7) r3 = 0
118: (bf) r4 = r7
119: (85) call bpf_perf_event_output#25
R5 min value is negative, either use unsigned or 'var &= const'
With this code, the verifier loses track of the variable.
Replacing arg5 with ARG_CONST_SIZE_OR_ZERO is thus desirable since it
avoids this quite common case which leads to usability issues, and the
compiler generates code that the verifier can more easily test:
if (len <= 0x7fff)
bpf_perf_event_output(ctx, &perf_map, 0, buf, len);
or
bpf_perf_event_output(ctx, &perf_map, 0, buf, len & 0x7fff);
No changes to the bpf_perf_event_output helper are necessary since it can
handle a case where size is 0, and an empty frame is pushed.
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2017-11-22 18:32:56 +00:00
. arg5_type = ARG_CONST_SIZE_OR_ZERO ,
2015-10-20 20:02:34 -07:00
} ;
2019-09-25 16:43:12 -07:00
/*
 * Per-CPU nesting depth of bpf_event_output(). That function increments
 * it on entry and decrements it again at its out: label.
 */
static DEFINE_PER_CPU ( int , bpf_event_output_nest_level ) ;
/*
 * Register snapshots for bpf_event_output(), one slot per nesting level.
 * The slot used is regs[nest_level - 1].
 */
struct bpf_nested_pt_regs {
struct pt_regs regs [ 3 ] ;
} ;
static DEFINE_PER_CPU ( struct bpf_nested_pt_regs , bpf_pt_regs ) ;
/*
 * Per-CPU perf sample data for bpf_event_output(), indexed the same way
 * through its sds[] member. The nesting guard in bpf_event_output() is
 * ARRAY_SIZE(bpf_misc_sds.sds).
 * NOTE(review): that guard also bounds the index into regs[3] above, so
 * sds[] presumably must not have more than 3 entries - confirm.
 */
static DEFINE_PER_CPU ( struct bpf_trace_sample_data , bpf_misc_sds ) ;
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 21:01:24 +02:00
2016-07-14 18:08:05 +02:00
/*
 * bpf_event_output - emit a raw perf sample built from meta and ctx
 * @map:       passed through unchanged to __bpf_perf_event_output()
 * @flags:     passed through unchanged to __bpf_perf_event_output()
 * @meta:      data pointer of the first raw fragment
 * @meta_size: size of the first raw fragment
 * @ctx:       data pointer of the optional second fragment
 * @ctx_size:  size of the second fragment; when 0 no second fragment
 *             is chained
 * @ctx_copy:  copy callback stored in the second fragment
 *
 * Uses per-CPU pt_regs and perf_sample_data slots selected by the current
 * nesting level on this CPU.
 *
 * Return: the value of __bpf_perf_event_output(), or -EBUSY when the
 * nesting level exceeds the number of per-CPU slots.
 */
u64 bpf_event_output ( struct bpf_map * map , u64 flags , void * meta , u64 meta_size ,
void * ctx , u64 ctx_size , bpf_ctx_copy_t ctx_copy )
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 21:01:24 +02:00
{
2019-09-25 16:43:12 -07:00
/* Take a nesting slot on this CPU; released again at the out: label. */
int nest_level = this_cpu_inc_return ( bpf_event_output_nest_level ) ;
2016-07-14 18:08:05 +02:00
/* Optional second fragment describing the ctx payload. */
struct perf_raw_frag frag = {
. copy = ctx_copy ,
. size = ctx_size ,
. data = ctx ,
} ;
/*
 * First fragment carries meta. It is chained to frag only when
 * ctx_size is non-zero; otherwise the chain ends here.
 */
struct perf_raw_record raw = {
. frag = {
2016-07-18 15:50:58 -07:00
{
. next = ctx_size ? & frag : NULL ,
} ,
2016-07-14 18:08:05 +02:00
. size = meta_size ,
. data = meta ,
} ,
} ;
2019-09-25 16:43:12 -07:00
struct perf_sample_data * sd ;
struct pt_regs * regs ;
u64 ret ;
/*
 * More nested calls than per-CPU slots: warn once and fail with
 * -EBUSY. The error path still goes through out: so the increment
 * above is undone.
 */
if ( WARN_ON_ONCE ( nest_level > ARRAY_SIZE ( bpf_misc_sds . sds ) ) ) {
ret = - EBUSY ;
goto out ;
}
/*
 * nest_level is the already incremented counter, so the outermost
 * call uses slot 0.
 */
sd = this_cpu_ptr ( & bpf_misc_sds . sds [ nest_level - 1 ] ) ;
regs = this_cpu_ptr ( & bpf_pt_regs . regs [ nest_level - 1 ] ) ;
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 21:01:24 +02:00
/* Fill the per-CPU regs slot chosen for this nesting level. */
perf_fetch_caller_regs ( regs ) ;
2017-12-12 02:25:30 +01:00
/* Reset the per-CPU sample slot, then attach the on-stack raw record. */
perf_sample_data_init ( sd , 0 , 0 ) ;
sd - > raw = & raw ;
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 21:01:24 +02:00
2019-09-25 16:43:12 -07:00
ret = __bpf_perf_event_output ( regs , map , flags , sd ) ;
/*
 * Common exit for the success and -EBUSY paths: drop the nesting
 * level taken at function entry.
 */
out :
this_cpu_dec ( bpf_event_output_nest_level ) ;
return ret ;
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 21:01:24 +02:00
}
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
/*
 * BPF helper taking no arguments.
 *
 * Return: the `current` task pointer cast to long.
 */
BPF_CALL_0 ( bpf_get_current_task )
2016-07-06 22:38:36 -07:00
{
return ( long ) current ;
}
2020-05-24 09:50:55 -07:00
/*
 * Descriptor for bpf_get_current_task(): marked gpl_only, integer return
 * type, no argument types declared.
 * NOTE(review): not static, so presumably referenced from other files -
 * confirm against the header that declares it.
 */
const struct bpf_func_proto bpf_get_current_task_proto = {
2016-07-06 22:38:36 -07:00
. func = bpf_get_current_task ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
} ;
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
/*
 * BPF helper: test `current` against the cgroup stored at slot @idx of
 * the array map @map.
 *
 * @map is converted to its enclosing struct bpf_array with container_of().
 *
 * Return: -E2BIG when @idx is not below the map max_entries, -EAGAIN when
 * the slot holds a NULL pointer, otherwise the result of
 * task_under_cgroup_hierarchy(current, cgrp).
 */
BPF_CALL_2 ( bpf_current_task_under_cgroup , struct bpf_map * , map , u32 , idx )
2016-08-12 08:56:52 -07:00
{
struct bpf_array * array = container_of ( map , struct bpf_array , map ) ;
struct cgroup * cgrp ;
/* Bounds check before indexing ptrs[]. */
if ( unlikely ( idx > = array - > map . max_entries ) )
return - E2BIG ;
/* Read the slot exactly once so the NULL check and the use agree. */
cgrp = READ_ONCE ( array - > ptrs [ idx ] ) ;
if ( unlikely ( ! cgrp ) )
return - EAGAIN ;
return task_under_cgroup_hierarchy ( current , cgrp ) ;
}
/*
 * Descriptor for bpf_current_task_under_cgroup(): not gpl_only, integer
 * return type. arg1 is the map pointer (ARG_CONST_MAP_PTR) and arg2 is
 * the index, declared as ARG_ANYTHING; the helper itself range-checks it.
 */
static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
. func = bpf_current_task_under_cgroup ,
. gpl_only = false ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_CONST_MAP_PTR ,
. arg2_type = ARG_ANYTHING ,
} ;
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method can incur large delays, skewing the profiling
results.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 14:47:45 -07:00
/*
 * Deferred signal request. bpf_send_signal_common() fills in task, sig
 * and type; do_bpf_send_signal() later reads them back when the embedded
 * irq_work runs.
 */
struct send_signal_irq_work {
/* Embedded work item; do_bpf_send_signal() recovers the container from it. */
struct irq_work irq_work ;
/* Target task, recorded as `current` at the time of the request. */
struct task_struct * task ;
/* Signal number to deliver. */
u32 sig ;
2020-01-14 19:50:02 -08:00
/* pid_type handed to group_send_sig_info(). */
enum pid_type type ;
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 14:47:45 -07:00
} ;
/*
 * One deferred-signal request per CPU. bpf_send_signal_common() uses the
 * instance of the running CPU and returns -EBUSY while its irq_work is
 * still flagged IRQ_WORK_BUSY.
 */
static DEFINE_PER_CPU ( struct send_signal_irq_work , send_signal_work ) ;
/*
 * irq_work callback: deliver the signal described by the enclosing
 * struct send_signal_irq_work. The return value of group_send_sig_info()
 * is not checked here.
 */
static void do_bpf_send_signal ( struct irq_work * entry )
{
struct send_signal_irq_work * work ;
/* entry is the irq_work member embedded in the request. */
work = container_of ( entry , struct send_signal_irq_work , irq_work ) ;
2020-01-14 19:50:02 -08:00
/* Send to the task recorded at request time, not to `current`. */
group_send_sig_info ( work - > sig , SEND_SIG_PRIV , work - > task , work - > type ) ;
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 14:47:45 -07:00
}
2020-01-14 19:50:02 -08:00
static int bpf_send_signal_common ( u32 sig , enum pid_type type )
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 14:47:45 -07:00
{
struct send_signal_irq_work * work = NULL ;
/* Similar to bpf_probe_write_user, task needs to be
* in a sound condition and kernel memory access be
* permitted in order to send signal to the current
* task .
*/
if ( unlikely ( current - > flags & ( PF_KTHREAD | PF_EXITING ) ) )
return - EPERM ;
if ( unlikely ( uaccess_kernel ( ) ) )
return - EPERM ;
if ( unlikely ( ! nmi_uaccess_okay ( ) ) )
return - EPERM ;
bpf: Fix deadlock with rq_lock in bpf_send_signal()
When experimenting with bpf_send_signal() helper in our production
environment (5.2 based), we experienced a deadlock in NMI mode:
#5 [ffffc9002219f770] queued_spin_lock_slowpath at ffffffff8110be24
#6 [ffffc9002219f770] _raw_spin_lock_irqsave at ffffffff81a43012
#7 [ffffc9002219f780] try_to_wake_up at ffffffff810e7ecd
#8 [ffffc9002219f7e0] signal_wake_up_state at ffffffff810c7b55
#9 [ffffc9002219f7f0] __send_signal at ffffffff810c8602
#10 [ffffc9002219f830] do_send_sig_info at ffffffff810ca31a
#11 [ffffc9002219f868] bpf_send_signal at ffffffff8119d227
#12 [ffffc9002219f988] bpf_overflow_handler at ffffffff811d4140
#13 [ffffc9002219f9e0] __perf_event_overflow at ffffffff811d68cf
#14 [ffffc9002219fa10] perf_swevent_overflow at ffffffff811d6a09
#15 [ffffc9002219fa38] ___perf_sw_event at ffffffff811e0f47
#16 [ffffc9002219fc30] __schedule at ffffffff81a3e04d
#17 [ffffc9002219fc90] schedule at ffffffff81a3e219
#18 [ffffc9002219fca0] futex_wait_queue_me at ffffffff8113d1b9
#19 [ffffc9002219fcd8] futex_wait at ffffffff8113e529
#20 [ffffc9002219fdf0] do_futex at ffffffff8113ffbc
#21 [ffffc9002219fec0] __x64_sys_futex at ffffffff81140d1c
#22 [ffffc9002219ff38] do_syscall_64 at ffffffff81002602
#23 [ffffc9002219ff50] entry_SYSCALL_64_after_hwframe at ffffffff81c00068
The above call stack is actually very similar to an issue
reported by Commit eac9153f2b58 ("bpf/stackmap: Fix deadlock with
rq_lock in bpf_get_stack()") by Song Liu. The only difference is
bpf_send_signal() helper instead of bpf_get_stack() helper.
The above deadlock is triggered with a perf_sw_event.
Similar to Commit eac9153f2b58, the almost identical reproducer below
uses the tracepoint sched/sched_switch so the issue can be easily caught.
/* stress_test.c */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#define THREAD_COUNT 1000
char *filename;
void *worker(void *p)
{
void *ptr;
int fd;
char *pptr;
fd = open(filename, O_RDONLY);
if (fd < 0)
return NULL;
while (1) {
struct timespec ts = {0, 1000 + rand() % 2000};
ptr = mmap(NULL, 4096 * 64, PROT_READ, MAP_PRIVATE, fd, 0);
usleep(1);
if (ptr == MAP_FAILED) {
printf("failed to mmap\n");
break;
}
munmap(ptr, 4096 * 64);
usleep(1);
pptr = malloc(1);
usleep(1);
pptr[0] = 1;
usleep(1);
free(pptr);
usleep(1);
nanosleep(&ts, NULL);
}
close(fd);
return NULL;
}
int main(int argc, char *argv[])
{
void *ptr;
int i;
pthread_t threads[THREAD_COUNT];
if (argc < 2)
return 0;
filename = argv[1];
for (i = 0; i < THREAD_COUNT; i++) {
if (pthread_create(threads + i, NULL, worker, NULL)) {
fprintf(stderr, "Error creating thread\n");
return 0;
}
}
for (i = 0; i < THREAD_COUNT; i++)
pthread_join(threads[i], NULL);
return 0;
}
and the following command:
1. run `stress_test /bin/ls` in one window
2. hack bcc trace.py with the following change:
--- a/tools/trace.py
+++ b/tools/trace.py
@@ -513,6 +513,7 @@ BPF_PERF_OUTPUT(%s);
__data.tgid = __tgid;
__data.pid = __pid;
bpf_get_current_comm(&__data.comm, sizeof(__data.comm));
+ bpf_send_signal(10);
%s
%s
%s.perf_submit(%s, &__data, sizeof(__data));
3. in a different window run
./trace.py -p $(pidof stress_test) t:sched:sched_switch
The deadlock can be reproduced in our production system.
Similar to Song's fix, the fix is to delay sending the signal if
irqs are disabled, to avoid deadlocks involving rq_lock.
With this change, my above stress-test in our production system
won't cause deadlock any more.
I also implemented a scale-down version of reproducer in the
selftest (a subsequent commit). With latest bpf-next,
it complains for the following potential deadlock.
[ 32.832450] -> #1 (&p->pi_lock){-.-.}:
[ 32.833100] _raw_spin_lock_irqsave+0x44/0x80
[ 32.833696] task_rq_lock+0x2c/0xa0
[ 32.834182] task_sched_runtime+0x59/0xd0
[ 32.834721] thread_group_cputime+0x250/0x270
[ 32.835304] thread_group_cputime_adjusted+0x2e/0x70
[ 32.835959] do_task_stat+0x8a7/0xb80
[ 32.836461] proc_single_show+0x51/0xb0
...
[ 32.839512] -> #0 (&(&sighand->siglock)->rlock){....}:
[ 32.840275] __lock_acquire+0x1358/0x1a20
[ 32.840826] lock_acquire+0xc7/0x1d0
[ 32.841309] _raw_spin_lock_irqsave+0x44/0x80
[ 32.841916] __lock_task_sighand+0x79/0x160
[ 32.842465] do_send_sig_info+0x35/0x90
[ 32.842977] bpf_send_signal+0xa/0x10
[ 32.843464] bpf_prog_bc13ed9e4d3163e3_send_signal_tp_sched+0x465/0x1000
[ 32.844301] trace_call_bpf+0x115/0x270
[ 32.844809] perf_trace_run_bpf_submit+0x4a/0xc0
[ 32.845411] perf_trace_sched_switch+0x10f/0x180
[ 32.846014] __schedule+0x45d/0x880
[ 32.846483] schedule+0x5f/0xd0
...
[ 32.853148] Chain exists of:
[ 32.853148] &(&sighand->siglock)->rlock --> &p->pi_lock --> &rq->lock
[ 32.853148]
[ 32.854451] Possible unsafe locking scenario:
[ 32.854451]
[ 32.855173] CPU0 CPU1
[ 32.855745] ---- ----
[ 32.856278] lock(&rq->lock);
[ 32.856671] lock(&p->pi_lock);
[ 32.857332] lock(&rq->lock);
[ 32.857999] lock(&(&sighand->siglock)->rlock);
Deadlock happens on CPU0 when it tries to acquire &sighand->siglock
but it has been held by CPU1 and CPU1 tries to grab &rq->lock
and cannot get it.
This is not exactly the callstack in our production environment,
but the symptom is similar: both locks are acquired with
spin_lock_irqsave(), and both cases involve rq_lock. The fix to delay
sending the signal when irqs are disabled also fixed this issue.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20200304191104.2796501-1-yhs@fb.com
2020-03-04 11:11:04 -08:00
if ( irqs_disabled ( ) ) {
2019-05-25 11:57:53 -07:00
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work .
*/
if ( unlikely ( ! valid_signal ( sig ) ) )
return - EINVAL ;
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 14:47:45 -07:00
work = this_cpu_ptr ( & send_signal_work ) ;
2019-11-08 17:08:55 +01:00
if ( atomic_read ( & work - > irq_work . flags ) & IRQ_WORK_BUSY )
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a SIGALRM signal to hhvm.
But this method could incur large delays, skewing the
profiling results.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 14:47:45 -07:00
return - EBUSY ;
/* Add the current task, which is the target of sending signal,
* to the irq_work . The current task may change when queued
* irq works get executed .
*/
work - > task = current ;
work - > sig = sig ;
2020-01-14 19:50:02 -08:00
work - > type = type ;
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happen (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALRM, and a set of program locations at which
it can dump stack traces. When it receives a signal, it will
dump the stack at the next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a SIGALRM signal to hhvm.
But this method could incur large delays, skewing the
profiling results.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 14:47:45 -07:00
irq_work_queue ( & work - > irq_work ) ;
return 0 ;
}
2020-01-14 19:50:02 -08:00
return group_send_sig_info ( sig , SEND_SIG_PRIV , current , type ) ;
}
/* bpf_send_signal ( ) BPF helper entry point .
 * Takes one u32 argument , sig , and forwards it to
 * bpf_send_signal_common ( ) together with PIDTYPE_TGID ; the value
 * returned by that call is returned unchanged .
 * NOTE(review): PIDTYPE_TGID presumably selects delivery to the whole
 * thread group of the current task - confirm against
 * group_send_sig_info ( ) .
 */
BPF_CALL_1 ( bpf_send_signal , u32 , sig )
{
return bpf_send_signal_common ( sig , PIDTYPE_TGID ) ;
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 14:47:45 -07:00
}
/*
 * Helper descriptor for bpf_send_signal().
 *
 * bpf_tracing_func_proto() hands this out for BPF_FUNC_send_signal.
 * The helper is not flagged GPL-only, returns an integer and takes a
 * single argument (the signal number) typed as ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_send_signal_proto = {
	.func		= bpf_send_signal,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,	/* sig */
};
2020-01-14 19:50:02 -08:00
/* bpf_send_signal_thread ( ) BPF helper entry point .
 * Takes one u32 argument , sig , and forwards it to
 * bpf_send_signal_common ( ) together with PIDTYPE_PID ; the value
 * returned by that call is returned unchanged .
 * NOTE(review): PIDTYPE_PID presumably restricts delivery to the
 * current thread rather than its whole thread group - confirm against
 * group_send_sig_info ( ) .
 */
BPF_CALL_1 ( bpf_send_signal_thread , u32 , sig )
{
return bpf_send_signal_common ( sig , PIDTYPE_PID ) ;
}
/*
 * Helper descriptor for bpf_send_signal_thread().
 *
 * bpf_tracing_func_proto() hands this out for
 * BPF_FUNC_send_signal_thread. The helper is not flagged GPL-only,
 * returns an integer and takes a single argument (the signal number)
 * typed as ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_send_signal_thread_proto = {
	.func		= bpf_send_signal_thread,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,	/* sig */
};
2020-03-29 01:43:49 +01:00
const struct bpf_func_proto *
bpf_tracing_func_proto ( enum bpf_func_id func_id , const struct bpf_prog * prog )
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
{
switch ( func_id ) {
case BPF_FUNC_map_lookup_elem :
return & bpf_map_lookup_elem_proto ;
case BPF_FUNC_map_update_elem :
return & bpf_map_update_elem_proto ;
case BPF_FUNC_map_delete_elem :
return & bpf_map_delete_elem_proto ;
2019-04-14 18:58:46 +02:00
case BPF_FUNC_map_push_elem :
return & bpf_map_push_elem_proto ;
case BPF_FUNC_map_pop_elem :
return & bpf_map_pop_elem_proto ;
case BPF_FUNC_map_peek_elem :
return & bpf_map_peek_elem_proto ;
2015-03-25 12:49:21 -07:00
case BPF_FUNC_ktime_get_ns :
return & bpf_ktime_get_ns_proto ;
2020-04-26 09:15:25 -07:00
case BPF_FUNC_ktime_get_boot_ns :
return & bpf_ktime_get_boot_ns_proto ;
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are identified by file descriptor, user space
needs to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-19 16:59:03 -07:00
case BPF_FUNC_tail_call :
return & bpf_tail_call_proto ;
2015-06-12 19:39:12 -07:00
case BPF_FUNC_get_current_pid_tgid :
return & bpf_get_current_pid_tgid_proto ;
2016-07-06 22:38:36 -07:00
case BPF_FUNC_get_current_task :
return & bpf_get_current_task_proto ;
2015-06-12 19:39:12 -07:00
case BPF_FUNC_get_current_uid_gid :
return & bpf_get_current_uid_gid_proto ;
case BPF_FUNC_get_current_comm :
return & bpf_get_current_comm_proto ;
2015-03-25 12:49:22 -07:00
case BPF_FUNC_trace_printk :
2015-06-12 19:39:13 -07:00
return bpf_get_trace_printk_proto ( ) ;
2015-06-12 19:39:14 -07:00
case BPF_FUNC_get_smp_processor_id :
return & bpf_get_smp_processor_id_proto ;
2016-10-21 12:46:33 +02:00
case BPF_FUNC_get_numa_node_id :
return & bpf_get_numa_node_id_proto ;
2015-08-06 07:02:35 +00:00
case BPF_FUNC_perf_event_read :
return & bpf_perf_event_read_proto ;
2016-07-25 05:54:46 -07:00
case BPF_FUNC_probe_write_user :
return bpf_get_probe_write_proto ( ) ;
2016-08-12 08:56:52 -07:00
case BPF_FUNC_current_task_under_cgroup :
return & bpf_current_task_under_cgroup_proto ;
2016-08-11 18:17:18 -07:00
case BPF_FUNC_get_prandom_u32 :
return & bpf_get_prandom_u32_proto ;
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Let's fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
case BPF_FUNC_probe_read_user :
return & bpf_probe_read_user_proto ;
case BPF_FUNC_probe_read_kernel :
return & bpf_probe_read_kernel_proto ;
case BPF_FUNC_probe_read_user_str :
return & bpf_probe_read_user_str_proto ;
case BPF_FUNC_probe_read_kernel_str :
return & bpf_probe_read_kernel_str_proto ;
bpf: Restrict bpf_probe_read{, str}() only to archs where they work
Given the legacy bpf_probe_read{,str}() BPF helpers are broken on archs
with overlapping address ranges, we should really take the next step to
disable them from BPF use there.
To generally fix the situation, we've recently added new helper variants
bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str().
For details on them, see 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel}
and probe_read_{user,kernel}_str helpers").
Given bpf_probe_read{,str}() have been around for ~5 years by now, there
are plenty of users at least on x86 still relying on them today, so we
cannot remove them entirely w/o breaking the BPF tracing ecosystem.
However, their use should be restricted to archs with non-overlapping
address ranges where they are working in their current form. Therefore,
move this behind a CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE and
have x86, arm64, arm select it (other archs supporting it can follow-up
on it as well).
For the remaining archs, they can workaround easily by relying on the
feature probe from bpftool which spills out defines that can be used out
of BPF C code to implement the drop-in replacement for old/new kernels
via: bpftool feature probe macro
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/bpf/20200515101118.6508-2-daniel@iogearbox.net
2020-05-15 12:11:16 +02:00
# ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read :
return & bpf_probe_read_compat_proto ;
bpf: add bpf_probe_read_str helper
Provide a simple helper with the same semantics of strncpy_from_unsafe():
int bpf_probe_read_str(void *dst, int size, const void *unsafe_addr)
This gives more flexibility to a bpf program. A typical use case is
intercepting a file name during sys_open(). The current approach is:
SEC("kprobe/sys_open")
void bpf_sys_open(struct pt_regs *ctx)
{
char buf[PATHLEN]; // PATHLEN is defined to 256
bpf_probe_read(buf, sizeof(buf), ctx->di);
/* consume buf */
}
This is suboptimal because the size of the string needs to be estimated
at compile time, causing more memory to be copied than often necessary,
and can become more problematic if further processing on buf is done,
for example by pushing it to userspace via bpf_perf_event_output(),
since the real length of the string is unknown and the entire buffer
must be copied (and defining an unrolled strnlen() inside the bpf
program is a very inefficient and unfeasible approach).
With the new helper, the code can easily operate on the actual string
length rather than the buffer size:
SEC("kprobe/sys_open")
void bpf_sys_open(struct pt_regs *ctx)
{
char buf[PATHLEN]; // PATHLEN is defined to 256
int res = bpf_probe_read_str(buf, sizeof(buf), ctx->di);
/* consume buf, for example push it to userspace via
* bpf_perf_event_output(), but this time we can use
* res (the string length) as event size, after checking
* its boundaries.
*/
}
Another useful use case is when parsing individual process arguments or
individual environment variables navigating current->mm->arg_start and
current->mm->env_start: using this helper and the return value, one can
quickly iterate at the right offset of the memory area.
The code changes simply leverage the already existent
strncpy_from_unsafe() kernel function, which is safe to be called from a
bpf program as it is used in bpf_trace_printk().
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-18 17:55:49 +00:00
case BPF_FUNC_probe_read_str :
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Let's fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-02 00:17:59 +01:00
return & bpf_probe_read_compat_str_proto ;
bpf: Restrict bpf_probe_read{, str}() only to archs where they work
Given the legacy bpf_probe_read{,str}() BPF helpers are broken on archs
with overlapping address ranges, we should really take the next step to
disable them from BPF use there.
To generally fix the situation, we've recently added new helper variants
bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str().
For details on them, see 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel}
and probe_read_{user,kernel}_str helpers").
Given bpf_probe_read{,str}() have been around for ~5 years by now, there
are plenty of users at least on x86 still relying on them today, so we
cannot remove them entirely w/o breaking the BPF tracing ecosystem.
However, their use should be restricted to archs with non-overlapping
address ranges where they are working in their current form. Therefore,
move this behind a CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE and
have x86, arm64, arm select it (other archs supporting it can follow-up
on it as well).
For the remaining archs, they can workaround easily by relying on the
feature probe from bpftool which spills out defines that can be used out
of BPF C code to implement the drop-in replacement for old/new kernels
via: bpftool feature probe macro
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/bpf/20200515101118.6508-2-daniel@iogearbox.net
2020-05-15 12:11:16 +02:00
# endif
2018-06-04 08:53:41 -07:00
# ifdef CONFIG_CGROUPS
2018-06-03 15:59:41 -07:00
case BPF_FUNC_get_current_cgroup_id :
return & bpf_get_current_cgroup_id_proto ;
2018-06-04 08:53:41 -07:00
# endif
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happen (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALRM, and a set of program locations at which
it can dump stack traces. When it receives a signal, it will
dump the stack at the next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a SIGALRM signal to hhvm.
But this method could incur large delays, skewing the
profiling results.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 14:47:45 -07:00
case BPF_FUNC_send_signal :
return & bpf_send_signal_proto ;
2020-01-14 19:50:02 -08:00
case BPF_FUNC_send_signal_thread :
return & bpf_send_signal_thread_proto ;
2020-02-14 15:41:46 -08:00
case BPF_FUNC_perf_event_read_value :
return & bpf_perf_event_read_value_proto ;
2020-03-04 17:41:56 -03:00
case BPF_FUNC_get_ns_current_pid_tgid :
return & bpf_get_ns_current_pid_tgid_proto ;
bpf: Implement BPF ring buffer and verifier support for it
This commit adds a new MPSC ring buffer implementation into BPF ecosystem,
which allows multiple CPUs to submit data to a single shared ring buffer. On
the consumption side, only single consumer is assumed.
Motivation
----------
There are two distinctive motivators for this work, which are not satisfied by
existing perf buffer, which prompted creation of a new ring buffer
implementation.
- more efficient memory utilization by sharing ring buffer across CPUs;
- preserving ordering of events that happen sequentially in time, even
across multiple CPUs (e.g., fork/exec/exit events for a task).
These two problems are independent, but perf buffer fails to satisfy both.
Both are a result of a choice to have per-CPU perf ring buffer. Both can be
also solved by having an MPSC implementation of ring buffer. The ordering
problem could technically be solved for perf buffer with some in-kernel
counting, but given the first one requires an MPSC buffer, the same solution
would solve the second problem automatically.
Semantics and APIs
------------------
Single ring buffer is presented to BPF programs as an instance of BPF map of
type BPF_MAP_TYPE_RINGBUF. Two other alternatives were considered, but ultimately
rejected.
One way would be to, similar to BPF_MAP_TYPE_PERF_EVENT_ARRAY, make
BPF_MAP_TYPE_RINGBUF represent an array of ring buffers, but not enforce
"same CPU only" rule. This would be more familiar interface compatible with
existing perf buffer use in BPF, but would fail if application needed more
advanced logic to lookup ring buffer by arbitrary key. HASH_OF_MAPS addresses
this with current approach. Additionally, given the performance of BPF
ringbuf, many use cases would just opt into a simple single ring buffer shared
among all CPUs, for which current approach would be an overkill.
Another approach could introduce a new concept, alongside BPF map, to
represent generic "container" object, which doesn't necessarily have key/value
interface with lookup/update/delete operations. This approach would add a lot
of extra infrastructure that has to be built for observability and verifier
support. It would also add another concept that BPF developers would have to
familiarize themselves with, new syntax in libbpf, etc. But then would really
provide no additional benefits over the approach of using a map.
BPF_MAP_TYPE_RINGBUF doesn't support lookup/update/delete operations, but
neither do a few other map types (e.g., queue and stack; array doesn't support
delete, etc).
The approach chosen has an advantage of re-using existing BPF map
infrastructure (introspection APIs in kernel, libbpf support, etc), being
familiar concept (no need to teach users a new type of object in BPF program),
and utilizing existing tooling (bpftool). For common scenario of using
a single ring buffer for all CPUs, it's as simple and straightforward, as
would be with a dedicated "container" object. On the other hand, by being
a map, it can be combined with ARRAY_OF_MAPS and HASH_OF_MAPS map-in-maps to
implement a wide variety of topologies, from one ring buffer for each CPU
(e.g., as a replacement for perf buffer use cases), to a complicated
application hashing/sharding of ring buffers (e.g., having a small pool of
ring buffers with hashed task's tgid being a look up key to preserve order,
but reduce contention).
Key and value sizes are enforced to be zero. max_entries is used to specify
the size of ring buffer and has to be a power of 2 value.
There are a bunch of similarities between perf buffer
(BPF_MAP_TYPE_PERF_EVENT_ARRAY) and new BPF ring buffer semantics:
- variable-length records;
- if there is no more space left in ring buffer, reservation fails, no
blocking;
- memory-mappable data area for user-space applications for ease of
consumption and high performance;
- epoll notifications for new incoming data;
- but still the ability to do busy polling for new data to achieve the
lowest latency, if necessary.
BPF ringbuf provides two sets of APIs to BPF programs:
- bpf_ringbuf_output() allows to *copy* data from one place to a ring
buffer, similarly to bpf_perf_event_output();
- bpf_ringbuf_reserve()/bpf_ringbuf_commit()/bpf_ringbuf_discard() APIs
split the whole process into two steps. First, a fixed amount of space is
reserved. If successful, a pointer to a data inside ring buffer data area
is returned, which BPF programs can use similarly to a data inside
array/hash maps. Once ready, this piece of memory is either committed or
discarded. Discard is similar to commit, but makes consumer ignore the
record.
bpf_ringbuf_output() has disadvantage of incurring extra memory copy, because
record has to be prepared in some other place first. But it allows to submit
records of the length that's not known to verifier beforehand. It also closely
matches bpf_perf_event_output(), so will simplify migration significantly.
bpf_ringbuf_reserve() avoids the extra copy of memory by providing a memory
pointer directly to ring buffer memory. In a lot of cases records are larger
than BPF stack space allows, so many programs have to use an extra per-CPU array
as a temporary heap for preparing a sample. bpf_ringbuf_reserve() avoids this need
completely. But in exchange, it only allows a known constant size of memory to
be reserved, such that verifier can verify that BPF program can't access
memory outside its reserved record space. bpf_ringbuf_output(), while slightly
slower due to extra memory copy, covers some use cases that are not suitable
for bpf_ringbuf_reserve().
The difference between commit and discard is very small. Discard just marks
a record as discarded, and such records are supposed to be ignored by consumer
code. Discard is useful for some advanced use-cases, such as ensuring
all-or-nothing multi-record submission, or emulating temporary malloc()/free()
within single BPF program invocation.
Each reserved record is tracked by verifier through existing
reference-tracking logic, similar to socket ref-tracking. It is thus
impossible to reserve a record, but forget to submit (or discard) it.
bpf_ringbuf_query() helper allows to query various properties of ring buffer.
Currently 4 are supported:
- BPF_RB_AVAIL_DATA returns amount of unconsumed data in ring buffer;
- BPF_RB_RING_SIZE returns the size of ring buffer;
- BPF_RB_CONS_POS/BPF_RB_PROD_POS returns current logical position of
consumer/producer, respectively.
Returned values are momentary snapshots of ring buffer state and could be
off by the time helper returns, so this should be used only for
debugging/reporting reasons or for implementing various heuristics, that take
into account highly-changeable nature of some of those characteristics.
One such heuristic might involve more fine-grained control over poll/epoll
notifications about new data availability in ring buffer. Together with
BPF_RB_NO_WAKEUP/BPF_RB_FORCE_WAKEUP flags for output/commit/discard helpers,
it allows BPF program a high degree of control and, e.g., more efficient
batched notifications. Default self-balancing strategy, though, should be
adequate for most applications and will work reliably and efficiently already.
Design and implementation
-------------------------
This reserve/commit schema allows a natural way for multiple producers, either
on different CPUs or even on the same CPU/in the same BPF program, to reserve
independent records and work with them without blocking other producers. This
means that if BPF program was interrupted by another BPF program sharing the
same ring buffer, they will both get a record reserved (provided there is
enough space left) and can work with it and submit it independently. This
applies to NMI context as well, except that due to using a spinlock during
reservation, in NMI context, bpf_ringbuf_reserve() might fail to get a lock,
in which case reservation will fail even if ring buffer is not full.
The ring buffer itself internally is implemented as a power-of-2 sized
circular buffer, with two logical and ever-increasing counters (which might
wrap around on 32-bit architectures, that's not a problem):
- consumer counter shows up to which logical position consumer consumed the
data;
- producer counter denotes amount of data reserved by all producers.
Each time a record is reserved, producer that "owns" the record will
successfully advance producer counter. At that point, data is still not yet
ready to be consumed, though. Each record has 8 byte header, which contains
the length of reserved record, as well as two extra bits: busy bit to denote
that record is still being worked on, and discard bit, which might be set at
commit time if record is discarded. In the latter case, consumer is supposed
to skip the record and move on to the next one. Record header also encodes
record's relative offset from the beginning of ring buffer data area (in
pages). This allows bpf_ringbuf_commit()/bpf_ringbuf_discard() to accept only
the pointer to the record itself, without requiring also the pointer to ring
buffer itself. Ring buffer memory location will be restored from record
metadata header. This significantly simplifies verifier, as well as improving
API usability.
Producer counter increments are serialized under spinlock, so there is
a strict ordering between reservations. Commits, on the other hand, are
completely lockless and independent. All records become available to consumer
in the order of reservations, but only after all previous records were
already committed. It is thus possible for slow producers to temporarily hold
off submitted records, that were reserved later.
Reservation/commit/consumer protocol is verified by litmus tests in
Documentation/litmus-test/bpf-rb.
One interesting implementation bit, that significantly simplifies (and thus
speeds up as well) implementation of both producers and consumers is how data
area is mapped twice contiguously back-to-back in the virtual memory. This
allows to not take any special measures for samples that have to wrap around
at the end of the circular buffer data area, because the next page after the
last data page would be first data page again, and thus the sample will still
appear completely contiguous in virtual memory. See comment and a simple ASCII
diagram showing this visually in bpf_ringbuf_area_alloc().
Another feature that distinguishes BPF ringbuf from perf ring buffer is
self-pacing notifications of new data availability.
bpf_ringbuf_commit() implementation will send a notification of new record
being available after commit only if consumer has already caught up right up
to the record being committed. If not, consumer still has to catch up and thus
will see new data anyways without needing an extra poll notification.
Benchmarks (see tools/testing/selftests/bpf/benchs/bench_ringbuf.c) show that
this allows to achieve a very high throughput without having to resort to
tricks like "notify only every Nth sample", which are necessary with perf
buffer. For extreme cases, when BPF program wants more manual control of
notifications, commit/discard/output helpers accept BPF_RB_NO_WAKEUP and
BPF_RB_FORCE_WAKEUP flags, which give full control over notifications of data
availability, but require extra caution and diligence in using this API.
Comparison to alternatives
--------------------------
Before considering implementing BPF ring buffer from scratch existing
alternatives in kernel were evaluated, but didn't seem to meet the needs. They
largely fell into few categories:
- per-CPU buffers (perf, ftrace, etc), which don't satisfy two motivations
outlined above (ordering and memory consumption);
- linked list-based implementations; while some were multi-producer designs,
consuming these from user-space would be very complicated and most
probably not performant; memory-mapping contiguous piece of memory is
simpler and more performant for user-space consumers;
- io_uring is SPSC, but also requires fixed-sized elements. Naively turning
SPSC queue into MPSC w/ lock would have subpar performance compared to
locked reserve + lockless commit, as with BPF ring buffer. Fixed sized
elements would be too limiting for BPF programs, given existing BPF
programs heavily rely on variable-sized perf buffer already;
- specialized implementations (like a new printk ring buffer, [0]) with lots
of printk-specific limitations and implications, that didn't seem to fit
well for intended use with BPF programs.
[0] https://lwn.net/Articles/779550/
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20200529075424.3139988-2-andriin@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2020-05-29 00:54:20 -07:00
case BPF_FUNC_ringbuf_output :
return & bpf_ringbuf_output_proto ;
case BPF_FUNC_ringbuf_reserve :
return & bpf_ringbuf_reserve_proto ;
case BPF_FUNC_ringbuf_submit :
return & bpf_ringbuf_submit_proto ;
case BPF_FUNC_ringbuf_discard :
return & bpf_ringbuf_discard_proto ;
case BPF_FUNC_ringbuf_query :
return & bpf_ringbuf_query_proto ;
2020-06-23 16:08:08 -07:00
case BPF_FUNC_jiffies64 :
return & bpf_jiffies64_proto ;
2016-04-06 18:43:26 -07:00
default :
return NULL ;
}
}
2018-03-30 15:08:00 -07:00
/*
 * Helper lookup for kprobe programs: maps a helper id to the prototype
 * that describes it.
 *
 * @func_id: id of the helper being requested.
 * @prog:    program requesting the helper; only forwarded to the fallback.
 *
 * Returns the address of the matching prototype for the ids handled
 * here, otherwise whatever bpf_tracing_func_proto() returns for the
 * same arguments.
 */
static const struct bpf_func_proto *
kprobe_prog_func_proto ( enum bpf_func_id func_id , const struct bpf_prog * prog )
2016-04-06 18:43:26 -07:00
{
switch ( func_id ) {
2015-10-20 20:02:34 -07:00
case BPF_FUNC_perf_event_output :
return & bpf_perf_event_output_proto ;
2016-02-17 19:58:58 -08:00
case BPF_FUNC_get_stackid :
return & bpf_get_stackid_proto ;
2018-04-28 22:28:08 -07:00
case BPF_FUNC_get_stack :
return & bpf_get_stack_proto ;
2017-12-11 11:36:48 -05:00
/* override_return is only offered when CONFIG_BPF_KPROBE_OVERRIDE is defined. */
# ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return :
return & bpf_override_return_proto ;
# endif
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
/* Any id not handled above is delegated to the shared tracing lookup. */
default :
2020-03-29 01:43:49 +01:00
return bpf_tracing_func_proto ( func_id , prog ) ;
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
}
}
/*
 * bpf+kprobe programs can access fields of 'struct pt_regs'.
 *
 * Decides whether an access of 'size' bytes at byte offset 'off' is
 * permitted. Returns true only when the access is a read, 'off' is a
 * multiple of 'size', and the range [off, off + size) lies entirely
 * within struct pt_regs; returns false otherwise.
 *
 * 'prog' and 'info' are accepted but not used by this function.
 */
2016-06-15 18:25:38 -07:00
static bool kprobe_prog_is_valid_access ( int off , int size , enum bpf_access_type type ,
2018-03-30 15:08:00 -07:00
const struct bpf_prog * prog ,
2017-06-22 15:07:39 -07:00
struct bpf_insn_access_aux * info )
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
{
/* The starting offset must fall inside struct pt_regs. */
if ( off < 0 | | off > = sizeof ( struct pt_regs ) )
return false ;
/* Only reads are allowed; every other access type is rejected. */
if ( type ! = BPF_READ )
return false ;
/* The offset must be aligned to the access size. */
if ( off % size ! = 0 )
return false ;
2017-01-15 01:34:25 +01:00
/*
* Assertion for 32 bit to make sure last 8 byte access
* ( BPF_DW ) to the last 4 byte member is disallowed .
*/
if ( off + size > sizeof ( struct pt_regs ) )
return false ;
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
return true ;
}
2017-10-16 16:40:53 -07:00
/*
 * Verifier callback table for kprobe programs: helper lookup goes
 * through kprobe_prog_func_proto() and access validation through
 * kprobe_prog_is_valid_access(). No other members are set here.
 */
const struct bpf_verifier_ops kprobe_verifier_ops = {
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 12:49:20 -07:00
. get_func_proto = kprobe_prog_func_proto ,
. is_valid_access = kprobe_prog_is_valid_access ,
} ;
2017-10-16 16:40:53 -07:00
/*
 * Program-ops table for kprobe programs. The initializer is empty, so
 * no callbacks are set here.
 */
const struct bpf_prog_ops kprobe_prog_ops = {
} ;
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
/*
 * bpf_perf_event_output_tp - variant of the perf-event-output helper that
 * takes a tracepoint buffer as its first argument.
 *
 * @tp_buff: buffer whose first pointer-sized slot is read as a
 *           'struct pt_regs *'.
 * @map, @flags, @data, @size: passed through unchanged.
 *
 * Loads the pt_regs pointer out of tp_buff and returns the result of
 * ____bpf_perf_event_output() called with it and the remaining arguments.
 */
BPF_CALL_5 ( bpf_perf_event_output_tp , void * , tp_buff , struct bpf_map * , map ,
u64 , flags , void * , data , u64 , size )
2016-04-06 18:43:27 -07:00
{
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
struct pt_regs * regs = * ( struct pt_regs * * ) tp_buff ;
2016-04-06 18:43:27 -07:00
/*
* r1 points to perf tracepoint buffer where first 8 bytes are hidden
* from bpf program and contain a pointer to ' struct pt_regs ' . Fetch it
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
* from there and call the same bpf_perf_event_output ( ) helper inline .
2016-04-06 18:43:27 -07:00
*/
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
return ____bpf_perf_event_output ( regs , map , flags , data , size ) ;
2016-04-06 18:43:27 -07:00
}
static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
. func = bpf_perf_event_output_tp ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_CTX ,
. arg2_type = ARG_CONST_MAP_PTR ,
. arg3_type = ARG_ANYTHING ,
2017-01-09 10:19:50 -08:00
. arg4_type = ARG_PTR_TO_MEM ,
bpf: change bpf_perf_event_output arg5 type to ARG_CONST_SIZE_OR_ZERO
Commit 9fd29c08e520 ("bpf: improve verifier ARG_CONST_SIZE_OR_ZERO
semantics") relaxed the treatment of ARG_CONST_SIZE_OR_ZERO due to the way
the compiler generates optimized BPF code when checking boundaries of an
argument from C code. A typical example of this optimized code can be
generated using the bpf_perf_event_output helper when operating on variable
memory:
/* len is a generic scalar */
if (len > 0 && len <= 0x7fff)
bpf_perf_event_output(ctx, &perf_map, 0, buf, len);
110: (79) r5 = *(u64 *)(r10 -40)
111: (bf) r1 = r5
112: (07) r1 += -1
113: (25) if r1 > 0x7ffe goto pc+6
114: (bf) r1 = r6
115: (18) r2 = 0xffff94e5f166c200
117: (b7) r3 = 0
118: (bf) r4 = r7
119: (85) call bpf_perf_event_output#25
R5 min value is negative, either use unsigned or 'var &= const'
With this code, the verifier loses track of the variable.
Replacing arg5 with ARG_CONST_SIZE_OR_ZERO is thus desirable since it
avoids this quite common case which leads to usability issues, and the
compiler generates code that the verifier can more easily test:
if (len <= 0x7fff)
bpf_perf_event_output(ctx, &perf_map, 0, buf, len);
or
bpf_perf_event_output(ctx, &perf_map, 0, buf, len & 0x7fff);
No changes to the bpf_perf_event_output helper are necessary since it can
handle a case where size is 0, and an empty frame is pushed.
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2017-11-22 18:32:56 +00:00
. arg5_type = ARG_CONST_SIZE_OR_ZERO ,
2016-04-06 18:43:27 -07:00
} ;
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
BPF_CALL_3 ( bpf_get_stackid_tp , void * , tp_buff , struct bpf_map * , map ,
u64 , flags )
2016-04-06 18:43:27 -07:00
{
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
struct pt_regs * regs = * ( struct pt_regs * * ) tp_buff ;
2016-04-06 18:43:27 -07:00
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 02:45:31 +02:00
/*
* Same comment as in bpf_perf_event_output_tp ( ) , only that this time
* the other helper ' s function body cannot be inlined due to being
* external , thus we need to call raw helper function .
*/
return bpf_get_stackid ( ( unsigned long ) regs , ( unsigned long ) map ,
flags , 0 , 0 ) ;
2016-04-06 18:43:27 -07:00
}
/*
 * Helper prototype for bpf_get_stackid_tp(): GPL-only, integer return,
 * three arguments in the order (ctx, map, flags).
 */
static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
	.func		= bpf_get_stackid_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,	/* tp_buff */
	.arg2_type	= ARG_CONST_MAP_PTR,	/* map */
	.arg3_type	= ARG_ANYTHING,		/* flags */
};
2018-04-28 22:28:08 -07:00
/*
 * bpf_get_stack() entry point for tracepoint programs.
 *
 * The tracepoint buffer starts with a 'struct pt_regs *'; that pointer is
 * loaded and forwarded, together with the caller's buffer, size and
 * flags, to the raw bpf_get_stack() helper. The unused fifth raw
 * argument is passed as 0.
 */
BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
	   u64, flags)
{
	struct pt_regs **regs_slot = tp_buff;
	unsigned long regs_addr = (unsigned long)*regs_slot;

	return bpf_get_stack(regs_addr, (unsigned long)buf,
			     (unsigned long)size, flags, 0);
}
/*
 * Helper prototype for bpf_get_stack_tp(): GPL-only, integer return,
 * four arguments in the order (ctx, buf, size, flags).
 */
static const struct bpf_func_proto bpf_get_stack_proto_tp = {
	.func		= bpf_get_stack_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,		/* tp_buff */
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,	/* buf */
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,	/* size */
	.arg4_type	= ARG_ANYTHING,			/* flags */
};
2018-03-30 15:08:00 -07:00
/*
 * Map a helper id to its prototype for tracepoint programs.
 *
 * perf_event_output, get_stackid and get_stack resolve to the
 * tracepoint-specific (*_tp) prototypes; every other id is delegated to
 * bpf_tracing_func_proto().
 */
static const struct bpf_func_proto *
tp_prog_func_proto ( enum bpf_func_id func_id , const struct bpf_prog * prog )
2018-03-20 11:19:17 -07:00
{
switch ( func_id ) {
case BPF_FUNC_perf_event_output :
return & bpf_perf_event_output_proto_tp ;
case BPF_FUNC_get_stackid :
return & bpf_get_stackid_proto_tp ;
2018-04-28 22:28:08 -07:00
case BPF_FUNC_get_stack :
return & bpf_get_stack_proto_tp ;
2018-03-20 11:19:17 -07:00
default :
2020-03-29 01:43:49 +01:00
return bpf_tracing_func_proto ( func_id , prog ) ;
2018-03-20 11:19:17 -07:00
}
}
/*
 * Context access check for tracepoint programs.
 *
 * Returns true only for reads whose offset lies in
 * [sizeof(void *), PERF_MAX_TRACE_SIZE) and is a multiple of the access
 * size. 'prog' and 'info' are not used.
 */
static bool tp_prog_is_valid_access ( int off , int size , enum bpf_access_type type ,
2018-03-30 15:08:00 -07:00
const struct bpf_prog * prog ,
2018-03-20 11:19:17 -07:00
struct bpf_insn_access_aux * info )
{
/*
 * The first pointer-sized slot of the buffer is off limits to the program.
 * NOTE(review): 'off' is a signed int compared against a size_t here, so
 * a negative 'off' is converted to a large unsigned value and does not
 * satisfy the first test - confirm negative offsets are rejected by the
 * second test or by the caller.
 */
if ( off < sizeof ( void * ) | | off > = PERF_MAX_TRACE_SIZE )
return false ;
/* Only reads are permitted. */
if ( type ! = BPF_READ )
return false ;
/* The offset must be aligned to the access size. */
if ( off % size ! = 0 )
return false ;
/* Compile-time check that the upper bound is a multiple of 8 bytes. */
BUILD_BUG_ON ( PERF_MAX_TRACE_SIZE % sizeof ( __u64 ) ) ;
return true ;
}
/* Verifier callbacks for tracepoint programs. */
const struct bpf_verifier_ops tracepoint_verifier_ops = {
	.get_func_proto		= tp_prog_func_proto,
	.is_valid_access	= tp_prog_is_valid_access,
};

/* Tracepoint programs install no program-level callbacks. */
const struct bpf_prog_ops tracepoint_prog_ops = {
};
/*
 * Read the counter, enabled and running values of the perf event attached
 * to the program context into 'buf'.
 *
 * Returns 0 on success. Returns -EINVAL when 'size' is not exactly
 * sizeof(struct bpf_perf_event_value), or the error reported by
 * perf_event_read_local(). On every failure path 'buf' is zero-filled
 * for 'size' bytes before returning.
 */
BPF_CALL_3 ( bpf_perf_prog_read_value , struct bpf_perf_event_data_kern * , ctx ,
2017-10-05 09:19:22 -07:00
struct bpf_perf_event_value * , buf , u32 , size )
{
int err = - EINVAL ;
/* The caller must pass a buffer of exactly the expected size. */
if ( unlikely ( size ! = sizeof ( struct bpf_perf_event_value ) ) )
goto clear ;
err = perf_event_read_local ( ctx - > event , & buf - > counter , & buf - > enabled ,
& buf - > running ) ;
if ( unlikely ( err ) )
goto clear ;
return 0 ;
clear :
/* Zero the output on failure instead of leaving it partly written. */
memset ( buf , 0 , size ) ;
return err ;
}
2018-03-20 11:19:17 -07:00
/*
 * Helper prototype for bpf_perf_prog_read_value(): GPL-only, integer
 * return, three arguments in the order (ctx, buf, size).
 */
static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
. func = bpf_perf_prog_read_value ,
2017-10-05 09:19:22 -07:00
. gpl_only = true ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_CTX ,
/* buf is declared as uninitialized memory; the helper fills or zeroes it */
. arg2_type = ARG_PTR_TO_UNINIT_MEM ,
. arg3_type = ARG_CONST_SIZE ,
} ;
2020-02-17 19:04:31 -08:00
/*
 * Copy the branch records of the current perf sample into 'buf', or
 * report how many bytes they occupy.
 *
 * Without CONFIG_X86 the helper always returns -ENOENT. Otherwise:
 *  - any flag bit other than BPF_F_GET_BRANCH_RECORDS_SIZE, or a sample
 *    without a branch stack, yields -EINVAL;
 *  - with BPF_F_GET_BRANCH_RECORDS_SIZE set, the total size of the
 *    records in bytes is returned and 'buf'/'size' are not consulted;
 *  - otherwise 'buf' must be non-NULL and 'size' a multiple of
 *    sizeof(struct perf_branch_entry), else -EINVAL; at most 'size'
 *    bytes are copied and the number of bytes copied is returned.
 */
BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
	   void *, buf, u32, size, u64, flags)
{
#ifdef CONFIG_X86
	const u32 entry_bytes = sizeof(struct perf_branch_entry);
	struct perf_branch_stack *stack = ctx->data->br_stack;
	u32 copied;

	if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE) || unlikely(!stack))
		return -EINVAL;

	/* Size query only. */
	if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE)
		return stack->nr * entry_bytes;

	/* Need a destination that holds a whole number of entries. */
	if (buf == NULL || size % entry_bytes)
		return -EINVAL;

	copied = min_t(u32, stack->nr * entry_bytes, size);
	memcpy(buf, stack->entries, copied);

	return copied;
#else
	return -ENOENT;
#endif
}
/*
 * Helper prototype for bpf_read_branch_records(): GPL-only, integer
 * return, four arguments in the order (ctx, buf, size, flags). 'buf' may
 * be NULL and 'size' may be zero, matching the size-query mode of the
 * helper.
 */
static const struct bpf_func_proto bpf_read_branch_records_proto = {
	.func		= bpf_read_branch_records,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,		/* ctx */
	.arg2_type	= ARG_PTR_TO_MEM_OR_NULL,	/* buf */
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,	/* size */
	.arg4_type	= ARG_ANYTHING,			/* flags */
};
2018-03-30 15:08:00 -07:00
/*
 * Map a helper id to its prototype for perf-event programs.
 *
 * perf_event_output, get_stackid and get_stack reuse the tracepoint
 * (*_tp) prototypes; perf_prog_read_value and read_branch_records have
 * their own prototypes; every other id is delegated to
 * bpf_tracing_func_proto().
 */
static const struct bpf_func_proto *
pe_prog_func_proto ( enum bpf_func_id func_id , const struct bpf_prog * prog )
2016-04-06 18:43:26 -07:00
{
switch ( func_id ) {
case BPF_FUNC_perf_event_output :
2016-04-06 18:43:27 -07:00
return & bpf_perf_event_output_proto_tp ;
2016-04-06 18:43:26 -07:00
case BPF_FUNC_get_stackid :
2016-04-06 18:43:27 -07:00
return & bpf_get_stackid_proto_tp ;
2018-04-28 22:28:08 -07:00
case BPF_FUNC_get_stack :
return & bpf_get_stack_proto_tp ;
2017-10-05 09:19:22 -07:00
case BPF_FUNC_perf_prog_read_value :
2018-03-20 11:19:17 -07:00
return & bpf_perf_prog_read_value_proto ;
2020-02-17 19:04:31 -08:00
case BPF_FUNC_read_branch_records :
return & bpf_read_branch_records_proto ;
2016-04-06 18:43:26 -07:00
default :
2020-03-29 01:43:49 +01:00
return bpf_tracing_func_proto ( func_id , prog ) ;
2016-04-06 18:43:26 -07:00
}
}
2018-03-28 12:05:37 -07:00
/*
* bpf_raw_tp_regs are separate from bpf_pt_regs used from skb / xdp
* to avoid potential recursive reuse issue when / if tracepoints are added
2019-06-11 14:53:04 -07:00
* inside bpf_ * _event_output , bpf_get_stackid and / or bpf_get_stack .
*
* Since raw tracepoints run despite bpf_prog_active , support concurrent usage
* in normal , irq , and nmi context .
2018-03-28 12:05:37 -07:00
*/
2019-06-11 14:53:04 -07:00
/*
 * Scratch pt_regs storage for the raw tracepoint helpers: three slots,
 * handed out by get_bpf_raw_tp_regs() according to the current nesting
 * level.
 */
struct bpf_raw_tp_regs {
	struct pt_regs regs[3];
};

/* Per-CPU scratch slots and the per-CPU count of slots currently in use. */
static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
static struct pt_regs * get_bpf_raw_tp_regs ( void )
{
struct bpf_raw_tp_regs * tp_regs = this_cpu_ptr ( & bpf_raw_tp_regs ) ;
int nest_level = this_cpu_inc_return ( bpf_raw_tp_nest_level ) ;
if ( WARN_ON_ONCE ( nest_level > ARRAY_SIZE ( tp_regs - > regs ) ) ) {
this_cpu_dec ( bpf_raw_tp_nest_level ) ;
return ERR_PTR ( - EBUSY ) ;
}
return & tp_regs - > regs [ nest_level - 1 ] ;
}
/*
 * Release the scratch slot taken by a successful get_bpf_raw_tp_regs()
 * by decrementing this CPU's nesting counter.
 */
static void put_bpf_raw_tp_regs(void)
{
	this_cpu_dec(bpf_raw_tp_nest_level);
}
2018-03-28 12:05:37 -07:00
/*
 * perf_event_output helper for raw tracepoint programs.
 *
 * Takes a per-CPU scratch pt_regs slot, fills it with the caller's
 * registers, forwards to ____bpf_perf_event_output() and releases the
 * slot again. Returns the error from get_bpf_raw_tp_regs() if no slot is
 * available, otherwise the result of ____bpf_perf_event_output().
 * 'args' is not used.
 */
BPF_CALL_5 ( bpf_perf_event_output_raw_tp , struct bpf_raw_tracepoint_args * , args ,
struct bpf_map * , map , u64 , flags , void * , data , u64 , size )
{
2019-06-11 14:53:04 -07:00
struct pt_regs * regs = get_bpf_raw_tp_regs ( ) ;
int ret ;
/* No free scratch slot: propagate the error, nothing to release. */
if ( IS_ERR ( regs ) )
return PTR_ERR ( regs ) ;
2018-03-28 12:05:37 -07:00
perf_fetch_caller_regs ( regs ) ;
2019-06-11 14:53:04 -07:00
ret = ____bpf_perf_event_output ( regs , map , flags , data , size ) ;
/* Release the slot before returning the helper's result. */
put_bpf_raw_tp_regs ( ) ;
return ret ;
2018-03-28 12:05:37 -07:00
}
/*
 * Helper prototype for bpf_perf_event_output_raw_tp(): GPL-only, integer
 * return, five arguments in the order (ctx, map, flags, data, size).
 */
static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
	.func		= bpf_perf_event_output_raw_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,		/* args */
	.arg2_type	= ARG_CONST_MAP_PTR,		/* map */
	.arg3_type	= ARG_ANYTHING,			/* flags */
	.arg4_type	= ARG_PTR_TO_MEM,		/* data */
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,	/* size */
};
2019-10-15 20:25:04 -07:00
extern const struct bpf_func_proto bpf_skb_output_proto ;
2020-03-06 08:59:23 +00:00
extern const struct bpf_func_proto bpf_xdp_output_proto ;
2019-10-15 20:25:04 -07:00
2018-03-28 12:05:37 -07:00
/*
 * get_stackid helper for raw tracepoint programs.
 *
 * Takes a per-CPU scratch pt_regs slot, fills it with the caller's
 * registers, forwards to the raw bpf_get_stackid() helper (unused fourth
 * and fifth arguments passed as 0) and releases the slot. Returns the
 * error from get_bpf_raw_tp_regs() if no slot is available, otherwise
 * the result of bpf_get_stackid(). 'args' is not used.
 */
BPF_CALL_3 ( bpf_get_stackid_raw_tp , struct bpf_raw_tracepoint_args * , args ,
struct bpf_map * , map , u64 , flags )
{
2019-06-11 14:53:04 -07:00
struct pt_regs * regs = get_bpf_raw_tp_regs ( ) ;
int ret ;
/* No free scratch slot: propagate the error, nothing to release. */
if ( IS_ERR ( regs ) )
return PTR_ERR ( regs ) ;
2018-03-28 12:05:37 -07:00
perf_fetch_caller_regs ( regs ) ;
/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
2019-06-11 14:53:04 -07:00
ret = bpf_get_stackid ( ( unsigned long ) regs , ( unsigned long ) map ,
flags , 0 , 0 ) ;
put_bpf_raw_tp_regs ( ) ;
return ret ;
2018-03-28 12:05:37 -07:00
}
/*
 * Helper prototype for bpf_get_stackid_raw_tp(): GPL-only, integer
 * return, three arguments in the order (ctx, map, flags).
 */
static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
	.func		= bpf_get_stackid_raw_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,	/* args */
	.arg2_type	= ARG_CONST_MAP_PTR,	/* map */
	.arg3_type	= ARG_ANYTHING,		/* flags */
};
2018-04-28 22:28:08 -07:00
/*
 * get_stack helper for raw tracepoint programs.
 *
 * Takes a per-CPU scratch pt_regs slot, fills it with the caller's
 * registers, forwards to the raw bpf_get_stack() helper with the
 * caller's buffer, size and flags (unused fifth argument passed as 0)
 * and releases the slot. Returns the error from get_bpf_raw_tp_regs() if
 * no slot is available, otherwise the result of bpf_get_stack().
 * 'args' is not used.
 */
BPF_CALL_4 ( bpf_get_stack_raw_tp , struct bpf_raw_tracepoint_args * , args ,
void * , buf , u32 , size , u64 , flags )
{
2019-06-11 14:53:04 -07:00
struct pt_regs * regs = get_bpf_raw_tp_regs ( ) ;
int ret ;
/* No free scratch slot: propagate the error, nothing to release. */
if ( IS_ERR ( regs ) )
return PTR_ERR ( regs ) ;
2018-04-28 22:28:08 -07:00
perf_fetch_caller_regs ( regs ) ;
2019-06-11 14:53:04 -07:00
ret = bpf_get_stack ( ( unsigned long ) regs , ( unsigned long ) buf ,
( unsigned long ) size , flags , 0 ) ;
put_bpf_raw_tp_regs ( ) ;
return ret ;
2018-04-28 22:28:08 -07:00
}
static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
. func = bpf_get_stack_raw_tp ,
. gpl_only = true ,
. ret_type = RET_INTEGER ,
. arg1_type = ARG_PTR_TO_CTX ,
. arg2_type = ARG_PTR_TO_MEM ,
. arg3_type = ARG_CONST_SIZE_OR_ZERO ,
. arg4_type = ARG_ANYTHING ,
} ;
2018-03-30 15:08:00 -07:00
/*
 * Map a helper id to its prototype for raw tracepoint programs.
 *
 * perf_event_output, get_stackid and get_stack resolve to the raw
 * tracepoint (*_raw_tp) prototypes; every other id is delegated to
 * bpf_tracing_func_proto().
 */
static const struct bpf_func_proto *
raw_tp_prog_func_proto ( enum bpf_func_id func_id , const struct bpf_prog * prog )
2018-03-28 12:05:37 -07:00
{
switch ( func_id ) {
case BPF_FUNC_perf_event_output :
return & bpf_perf_event_output_proto_raw_tp ;
case BPF_FUNC_get_stackid :
return & bpf_get_stackid_proto_raw_tp ;
2018-04-28 22:28:08 -07:00
case BPF_FUNC_get_stack :
return & bpf_get_stack_proto_raw_tp ;
2018-03-28 12:05:37 -07:00
default :
2020-03-29 01:43:49 +01:00
return bpf_tracing_func_proto ( func_id , prog ) ;
2018-03-28 12:05:37 -07:00
}
}
2020-05-31 17:42:55 +02:00
const struct bpf_func_proto *
2019-10-30 15:32:11 -07:00
tracing_prog_func_proto ( enum bpf_func_id func_id , const struct bpf_prog * prog )
{
switch ( func_id ) {
# ifdef CONFIG_NET
case BPF_FUNC_skb_output :
return & bpf_skb_output_proto ;
2020-03-06 08:59:23 +00:00
case BPF_FUNC_xdp_output :
return & bpf_xdp_output_proto ;
2020-06-23 16:08:09 -07:00
case BPF_FUNC_skc_to_tcp6_sock :
return & bpf_skc_to_tcp6_sock_proto ;
2019-10-30 15:32:11 -07:00
# endif
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 10:59:14 -07:00
case BPF_FUNC_seq_printf :
return prog - > expected_attach_type = = BPF_TRACE_ITER ?
& bpf_seq_printf_proto :
NULL ;
case BPF_FUNC_seq_write :
return prog - > expected_attach_type = = BPF_TRACE_ITER ?
& bpf_seq_write_proto :
NULL ;
2019-10-30 15:32:11 -07:00
default :
return raw_tp_prog_func_proto ( func_id , prog ) ;
}
}
2018-03-28 12:05:37 -07:00
/*
 * Context access check for raw tracepoint programs.
 *
 * Returns true only for reads whose offset lies in
 * [0, sizeof(__u64) * MAX_BPF_FUNC_ARGS) and is a multiple of the access
 * size. 'prog' and 'info' are not used.
 */
static bool raw_tp_prog_is_valid_access ( int off , int size ,
enum bpf_access_type type ,
2018-03-30 15:08:00 -07:00
const struct bpf_prog * prog ,
2018-03-28 12:05:37 -07:00
struct bpf_insn_access_aux * info )
{
2019-10-30 15:32:11 -07:00
/* The context is treated as an array of MAX_BPF_FUNC_ARGS u64 slots. */
if ( off < 0 | | off > = sizeof ( __u64 ) * MAX_BPF_FUNC_ARGS )
return false ;
/* Only reads are permitted. */
if ( type ! = BPF_READ )
return false ;
/* The offset must be aligned to the access size. */
if ( off % size ! = 0 )
return false ;
return true ;
}
/*
 * Context access check for tracing programs.
 *
 * Applies the same range, read-only and alignment checks as
 * raw_tp_prog_is_valid_access() and then lets btf_ctx_access() make the
 * final decision.
 */
static bool tracing_prog_is_valid_access ( int off , int size ,
enum bpf_access_type type ,
const struct bpf_prog * prog ,
struct bpf_insn_access_aux * info )
{
/* The context is treated as an array of MAX_BPF_FUNC_ARGS u64 slots. */
if ( off < 0 | | off > = sizeof ( __u64 ) * MAX_BPF_FUNC_ARGS )
2018-03-28 12:05:37 -07:00
return false ;
/* Only reads are permitted. */
if ( type ! = BPF_READ )
return false ;
/* The offset must be aligned to the access size. */
if ( off % size ! = 0 )
return false ;
2019-10-15 20:25:00 -07:00
return btf_ctx_access ( off , size , type , prog , info ) ;
2018-03-28 12:05:37 -07:00
}
2020-03-05 23:01:27 +01:00
/*
 * Weak default test-run hook for tracing programs: always reports
 * -ENOTSUPP. Being __weak, it is replaced by any non-weak definition of
 * the same symbol.
 */
int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog,
				     const union bpf_attr *kattr,
				     union bpf_attr __user *uattr)
{
	return -ENOTSUPP;
}
2018-03-28 12:05:37 -07:00
/* Verifier callbacks for raw tracepoint programs. */
const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
	.get_func_proto		= raw_tp_prog_func_proto,
	.is_valid_access	= raw_tp_prog_is_valid_access,
};

/* Raw tracepoint programs install no program-level callbacks. */
const struct bpf_prog_ops raw_tracepoint_prog_ops = {
};
2019-10-30 15:32:11 -07:00
/* Verifier callbacks for tracing programs. */
const struct bpf_verifier_ops tracing_verifier_ops = {
	.get_func_proto		= tracing_prog_func_proto,
	.is_valid_access	= tracing_prog_is_valid_access,
};
/* Program-level callbacks for tracing programs: only test_run is provided. */
const struct bpf_prog_ops tracing_prog_ops = {
2020-03-04 20:18:52 +01:00
. test_run = bpf_prog_test_run_tracing ,
2019-10-30 15:32:11 -07:00
} ;
2019-04-26 11:49:47 -07:00
static bool raw_tp_writable_prog_is_valid_access ( int off , int size ,
enum bpf_access_type type ,
const struct bpf_prog * prog ,
struct bpf_insn_access_aux * info )
{
if ( off = = 0 ) {
if ( size ! = sizeof ( u64 ) | | type ! = BPF_READ )
return false ;
info - > reg_type = PTR_TO_TP_BUFFER ;
}
return raw_tp_prog_is_valid_access ( off , size , type , prog , info ) ;
}
/*
 * Verifier callbacks for writable raw tracepoint programs: same helper
 * set as plain raw tracepoints, with the writable access check.
 */
const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = {
	.get_func_proto		= raw_tp_prog_func_proto,
	.is_valid_access	= raw_tp_writable_prog_is_valid_access,
};

/* Writable raw tracepoint programs install no program-level callbacks. */
const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = {
};
2016-09-01 18:37:22 -07:00
static bool pe_prog_is_valid_access ( int off , int size , enum bpf_access_type type ,
2018-03-30 15:08:00 -07:00
const struct bpf_prog * prog ,
2017-06-22 15:07:39 -07:00
struct bpf_insn_access_aux * info )
2016-09-01 18:37:22 -07:00
{
2018-03-06 10:55:01 -08:00
const int size_u64 = sizeof ( u64 ) ;
2017-06-13 15:52:13 -07:00
2016-09-01 18:37:22 -07:00
if ( off < 0 | | off > = sizeof ( struct bpf_perf_event_data ) )
return false ;
if ( type ! = BPF_READ )
return false ;
bpf: fix context access in tracing progs on 32 bit archs
Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT
program type in test_verifier report the following errors on x86_32:
172/p unpriv: spill/fill of different pointers ldx FAIL
Unexpected error message!
0: (bf) r6 = r10
1: (07) r6 += -8
2: (15) if r1 == 0x0 goto pc+3
R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1
3: (bf) r2 = r10
4: (07) r2 += -76
5: (7b) *(u64 *)(r6 +0) = r2
6: (55) if r1 != 0x0 goto pc+1
R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 fp-8=fp
7: (7b) *(u64 *)(r6 +0) = r1
8: (79) r1 = *(u64 *)(r6 +0)
9: (79) r1 = *(u64 *)(r1 +68)
invalid bpf_context access off=68 size=8
378/p check bpf_perf_event_data->sample_period byte load permitted FAIL
Failed to load prog 'Permission denied'!
0: (b7) r0 = 0
1: (71) r0 = *(u8 *)(r1 +68)
invalid bpf_context access off=68 size=1
379/p check bpf_perf_event_data->sample_period half load permitted FAIL
Failed to load prog 'Permission denied'!
0: (b7) r0 = 0
1: (69) r0 = *(u16 *)(r1 +68)
invalid bpf_context access off=68 size=2
380/p check bpf_perf_event_data->sample_period word load permitted FAIL
Failed to load prog 'Permission denied'!
0: (b7) r0 = 0
1: (61) r0 = *(u32 *)(r1 +68)
invalid bpf_context access off=68 size=4
381/p check bpf_perf_event_data->sample_period dword load permitted FAIL
Failed to load prog 'Permission denied'!
0: (b7) r0 = 0
1: (79) r0 = *(u64 *)(r1 +68)
invalid bpf_context access off=68 size=8
Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte
boundary due to its size of 68 bytes. Therefore, bpf_ctx_narrow_access_ok()
will then bail out saying that off & (size_default - 1) which is 68 & 7
doesn't cleanly align in the case of sample_period access from struct
bpf_perf_event_data, hence verifier wrongly thinks we might be doing an
unaligned access here though underlying arch can handle it just fine.
Therefore adjust this down to machine size and check and rewrite the
offset for narrow access on that basis. We also need to fix corresponding
pe_prog_is_valid_access(), since we hit the check for off % size != 0
(e.g. 68 % 8 -> 4) in the first and last test. With that in place, progs
for tracing work on x86_32.
Reported-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-06-02 23:06:39 +02:00
if ( off % size ! = 0 ) {
if ( sizeof ( unsigned long ) ! = 4 )
return false ;
if ( size ! = 8 )
return false ;
if ( off % size ! = 4 )
return false ;
}
2017-06-13 15:52:13 -07:00
bpf: simplify narrower ctx access
This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) + offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.
We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-02 02:13:27 +02:00
switch ( off ) {
case bpf_ctx_range ( struct bpf_perf_event_data , sample_period ) :
2018-03-06 10:55:01 -08:00
bpf_ctx_record_field_size ( info , size_u64 ) ;
if ( ! bpf_ctx_narrow_access_ok ( off , size , size_u64 ) )
return false ;
break ;
case bpf_ctx_range ( struct bpf_perf_event_data , addr ) :
bpf_ctx_record_field_size ( info , size_u64 ) ;
if ( ! bpf_ctx_narrow_access_ok ( off , size , size_u64 ) )
2017-06-22 15:07:39 -07:00
return false ;
bpf: simplify narrower ctx access
This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) + offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.
We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-02 02:13:27 +02:00
break ;
default :
2016-09-01 18:37:22 -07:00
if ( size ! = sizeof ( long ) )
return false ;
}
bpf: simplify narrower ctx access
This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) + offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.
We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-02 02:13:27 +02:00
2016-09-01 18:37:22 -07:00
return true ;
}
2017-01-12 11:51:32 +01:00
/*
 * Rewrite a program's load from the user-visible struct bpf_perf_event_data
 * into loads against the kernel-side struct bpf_perf_event_data_kern.
 *
 * @type:        requested access type (not consulted here)
 * @si:          the original context-access instruction
 * @insn_buf:    output buffer that receives the replacement instructions
 * @prog:        program being rewritten (not consulted here)
 * @target_size: handed to bpf_target_off() for the sample_period and addr
 *               fields; not touched for register accesses
 *
 * Every case emits two instructions: first load the 'data' (or 'regs')
 * pointer out of the kernel context into dst_reg, then load the requested
 * value through that pointer.
 *
 * Returns the number of instructions written to @insn_buf.
 */
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
				      const struct bpf_insn *si,
				      struct bpf_insn *insn_buf,
				      struct bpf_prog *prog, u32 *target_size)
{
	u32 cnt = 0;

	switch (si->off) {
	case offsetof(struct bpf_perf_event_data, sample_period):
		/* dst = kern_ctx->data */
		insn_buf[cnt++] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
							       data), si->dst_reg, si->src_reg,
					      offsetof(struct bpf_perf_event_data_kern, data));
		/* dst = dst->period */
		insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
					      bpf_target_off(struct perf_sample_data, period, 8,
							     target_size));
		break;
	case offsetof(struct bpf_perf_event_data, addr):
		/* dst = kern_ctx->data */
		insn_buf[cnt++] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
							       data), si->dst_reg, si->src_reg,
					      offsetof(struct bpf_perf_event_data_kern, data));
		/* dst = dst->addr */
		insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
					      bpf_target_off(struct perf_sample_data, addr, 8,
							     target_size));
		break;
	default:
		/* dst = kern_ctx->regs */
		insn_buf[cnt++] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
							       regs), si->dst_reg, si->src_reg,
					      offsetof(struct bpf_perf_event_data_kern, regs));
		/* dst = *(long *)(dst + original offset) */
		insn_buf[cnt++] = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
					      si->off);
		break;
	}

	return cnt;
}
2017-10-16 16:40:53 -07:00
const struct bpf_verifier_ops perf_event_verifier_ops = {
2018-03-20 11:19:17 -07:00
. get_func_proto = pe_prog_func_proto ,
2016-09-01 18:37:22 -07:00
. is_valid_access = pe_prog_is_valid_access ,
. convert_ctx_access = pe_prog_convert_ctx_access ,
} ;
2017-10-16 16:40:53 -07:00
/* Program ops for perf-event BPF programs; no callbacks are set here. */
const struct bpf_prog_ops perf_event_prog_ops = {
} ;
2017-10-23 23:53:08 -07:00
/*
 * Held by the attach, detach and query paths below while they read or
 * replace event->tp_event->prog_array and event->prog.
 */
static DEFINE_MUTEX ( bpf_event_mutex ) ;
2017-11-30 13:47:54 -08:00
/*
 * Upper bound on the number of programs in one trace event's prog_array;
 * also caps ids_len accepted by perf_event_query_prog_array().
 */
# define BPF_TRACE_MAX_PROGS 64
2017-10-23 23:53:08 -07:00
int perf_event_attach_bpf_prog ( struct perf_event * event ,
struct bpf_prog * prog )
{
2019-05-28 14:14:44 -07:00
struct bpf_prog_array * old_array ;
2017-10-23 23:53:08 -07:00
struct bpf_prog_array * new_array ;
int ret = - EEXIST ;
2017-12-11 11:36:48 -05:00
/*
2018-01-13 02:54:04 +09:00
* Kprobe override only works if they are on the function entry ,
* and only if they are on the opt - in list .
2017-12-11 11:36:48 -05:00
*/
if ( prog - > kprobe_override & &
2018-01-13 02:54:04 +09:00
( ! trace_kprobe_on_func_entry ( event - > tp_event ) | |
2017-12-11 11:36:48 -05:00
! trace_kprobe_error_injectable ( event - > tp_event ) ) )
return - EINVAL ;
2017-10-23 23:53:08 -07:00
mutex_lock ( & bpf_event_mutex ) ;
if ( event - > prog )
2017-10-30 13:50:22 -07:00
goto unlock ;
2017-10-23 23:53:08 -07:00
2019-05-28 14:14:44 -07:00
old_array = bpf_event_rcu_dereference ( event - > tp_event - > prog_array ) ;
2017-11-30 13:47:54 -08:00
if ( old_array & &
bpf_prog_array_length ( old_array ) > = BPF_TRACE_MAX_PROGS ) {
ret = - E2BIG ;
goto unlock ;
}
2017-10-23 23:53:08 -07:00
ret = bpf_prog_array_copy ( old_array , NULL , prog , & new_array ) ;
if ( ret < 0 )
2017-10-30 13:50:22 -07:00
goto unlock ;
2017-10-23 23:53:08 -07:00
/* set the new array to event->tp_event and set event->prog */
event - > prog = prog ;
rcu_assign_pointer ( event - > tp_event - > prog_array , new_array ) ;
bpf_prog_array_free ( old_array ) ;
2017-10-30 13:50:22 -07:00
unlock :
2017-10-23 23:53:08 -07:00
mutex_unlock ( & bpf_event_mutex ) ;
return ret ;
}
void perf_event_detach_bpf_prog ( struct perf_event * event )
{
2019-05-28 14:14:44 -07:00
struct bpf_prog_array * old_array ;
2017-10-23 23:53:08 -07:00
struct bpf_prog_array * new_array ;
int ret ;
mutex_lock ( & bpf_event_mutex ) ;
if ( ! event - > prog )
2017-10-30 13:50:22 -07:00
goto unlock ;
2017-10-23 23:53:08 -07:00
2019-05-28 14:14:44 -07:00
old_array = bpf_event_rcu_dereference ( event - > tp_event - > prog_array ) ;
2017-10-23 23:53:08 -07:00
ret = bpf_prog_array_copy ( old_array , event - > prog , NULL , & new_array ) ;
2018-05-27 12:24:08 +01:00
if ( ret = = - ENOENT )
goto unlock ;
2017-10-23 23:53:08 -07:00
if ( ret < 0 ) {
bpf_prog_array_delete_safe ( old_array , event - > prog ) ;
} else {
rcu_assign_pointer ( event - > tp_event - > prog_array , new_array ) ;
bpf_prog_array_free ( old_array ) ;
}
bpf_prog_put ( event - > prog ) ;
event - > prog = NULL ;
2017-10-30 13:50:22 -07:00
unlock :
2017-10-23 23:53:08 -07:00
mutex_unlock ( & bpf_event_mutex ) ;
}
bpf/tracing: allow user space to query prog array on the same tp
Commit e87c6bc3852b ("bpf: permit multiple bpf attachments
for a single perf event") added support to attach multiple
bpf programs to a single perf event.
Although this provides flexibility, users may want to know
which other bpf programs are attached to the same tp interface.
Besides getting visibility for the underlying bpf system,
such information may also help consolidate multiple bpf programs,
understand potential performance issues due to a large array,
and debug (e.g., one bpf program which overwrites return code
may impact subsequent program results).
Commit 2541517c32be ("tracing, perf: Implement BPF programs
attached to kprobes") utilized the existing perf ioctl
interface and added the command PERF_EVENT_IOC_SET_BPF
to attach a bpf program to a tracepoint. This patch adds a new
ioctl command, given a perf event fd, to query the bpf program
array attached to the same perf tracepoint event.
The new uapi ioctl command:
PERF_EVENT_IOC_QUERY_BPF
The new uapi/linux/perf_event.h structure:
struct perf_event_query_bpf {
__u32 ids_len;
__u32 prog_cnt;
__u32 ids[0];
};
User space provides buffer "ids" for kernel to copy to.
When returning from the kernel, the number of available
programs in the array is set in "prog_cnt".
The usage:
struct perf_event_query_bpf *query =
malloc(sizeof(*query) + sizeof(u32) * ids_len);
query->ids_len = ids_len;
err = ioctl(pmu_efd, PERF_EVENT_IOC_QUERY_BPF, query);
if (err == 0) {
/* query->prog_cnt is the number of available progs,
* number of progs in ids: (ids_len == 0) ? 0 : query->prog_cnt
*/
} else if (errno == ENOSPC) {
/* query->ids_len number of progs copied,
* query->prog_cnt is the number of available progs
*/
} else {
/* other errors */
}
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2017-12-11 11:39:02 -08:00
2017-12-13 10:35:37 -08:00
/*
 * Report to user space the ids of the BPF programs attached to the trace
 * event behind @event.
 *
 * @event: perf event; must be of type PERF_TYPE_TRACEPOINT
 * @info:  user pointer to a struct perf_event_query_bpf; ids_len is read
 *         from it, prog_cnt and ids[] are written back
 *
 * Returns the result of bpf_prog_array_copy_info(), or:
 *   -EPERM   caller is not perfmon_capable()
 *   -EINVAL  @event is not a tracepoint event
 *   -EFAULT  a user copy failed
 *   -E2BIG   ids_len exceeds BPF_TRACE_MAX_PROGS
 *   -ENOMEM  the temporary id buffer could not be allocated
 */
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
	struct perf_event_query_bpf __user *uquery = info;
	struct perf_event_query_bpf req = {};
	struct bpf_prog_array *attached;
	u32 *id_buf, nr_progs, nr_wanted;
	int ret;

	if (!perfmon_capable())
		return -EPERM;
	if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -EINVAL;
	if (copy_from_user(&req, uquery, sizeof(req)))
		return -EFAULT;

	nr_wanted = req.ids_len;
	if (nr_wanted > BPF_TRACE_MAX_PROGS)
		return -E2BIG;

	/*
	 * kcalloc() returns ZERO_SIZE_PTR when nr_wanted is 0, which is
	 * required when the user only wants to check uquery->prog_cnt.
	 * There is no need to special-case it since it is handled
	 * gracefully in bpf_prog_array_copy_info().
	 */
	id_buf = kcalloc(nr_wanted, sizeof(*id_buf), GFP_USER | __GFP_NOWARN);
	if (!id_buf)
		return -ENOMEM;

	mutex_lock(&bpf_event_mutex);
	attached = bpf_event_rcu_dereference(event->tp_event->prog_array);
	ret = bpf_prog_array_copy_info(attached, id_buf, nr_wanted, &nr_progs);
	mutex_unlock(&bpf_event_mutex);

	/* A failed copy-out overrides whatever copy_info reported. */
	if (copy_to_user(&uquery->prog_cnt, &nr_progs, sizeof(nr_progs)) ||
	    copy_to_user(uquery->ids, id_buf, nr_wanted * sizeof(*id_buf)))
		ret = -EFAULT;

	kfree(id_buf);
	return ret;
}
2018-03-28 12:05:37 -07:00
/*
 * Bounds of the array of built-in raw tracepoint maps scanned by
 * bpf_get_raw_tracepoint(): [__start__bpf_raw_tp, __stop__bpf_raw_tp).
 * NOTE(review): presumably emitted by the linker script - confirm.
 */
extern struct bpf_raw_event_map __start__bpf_raw_tp [ ] ;
extern struct bpf_raw_event_map __stop__bpf_raw_tp [ ] ;
2018-12-12 16:42:37 -08:00
struct bpf_raw_event_map * bpf_get_raw_tracepoint ( const char * name )
2018-03-28 12:05:37 -07:00
{
struct bpf_raw_event_map * btp = __start__bpf_raw_tp ;
for ( ; btp < __stop__bpf_raw_tp ; btp + + ) {
if ( ! strcmp ( btp - > tp - > name , name ) )
return btp ;
}
2018-12-12 16:42:37 -08:00
return bpf_get_raw_tracepoint_module ( name ) ;
}
/*
 * Release a raw tracepoint map obtained from bpf_get_raw_tracepoint().
 *
 * If @btp lives inside a loadable module, the reference on that module is
 * dropped; for built-in maps (no owning module) this is a no-op.
 */
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
	struct module *mod;

	/*
	 * __module_address() must be called with preemption disabled or
	 * with module_mutex held, otherwise the module it returns may be
	 * freed underneath us. Disabling preemption is sufficient for this
	 * read-only lookup.
	 */
	preempt_disable();
	mod = __module_address((unsigned long)btp);
	if (mod)
		module_put(mod);
	preempt_enable();
}
/*
 * Run @prog with @args as its context, under rcu_read_lock().
 *
 * cant_sleep() checks that the caller is in a context that must not sleep.
 * The program's return value is deliberately discarded.
 */
static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
	cant_sleep();

	rcu_read_lock();
	(void) BPF_PROG_RUN(prog, args);
	rcu_read_unlock();
}
/*
 * Preprocessor helpers used to stamp out bpf_trace_run1() ... bpf_trace_run12().
 *
 * UNPACK strips the parentheses from a delimiter such as __DL_COM.
 * REPEAT_n(FN, DL, ...) applies FN to the first n items of the list,
 * separating the results with the unpacked delimiter DL; REPEAT(X, ...)
 * dispatches to REPEAT_X.
 * SARG(X) declares a "u64 argX" parameter, COPY(X) stores argX in args[X].
 * __SEQ_0_11 is the index list 0..11 fed to REPEAT.
 */
# define UNPACK(...) __VA_ARGS__
# define REPEAT_1(FN, DL, X, ...) FN(X)
# define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
# define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__)
# define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__)
# define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__)
# define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__)
# define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__)
# define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__)
# define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__)
# define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__)
# define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__)
# define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__)
# define REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__)
# define SARG(X) u64 arg##X
# define COPY(X) args[X] = arg##X
# define __DL_COM (,)
# define __DL_SEM (;)
# define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
/*
 * BPF_TRACE_DEFN_x(x) defines and exports bpf_trace_run<x>(prog, arg0, ...,
 * arg<x-1>): it packs its x u64 arguments into a local args[x] array and
 * passes that array to __bpf_trace_run() as the program context.
 */
# define BPF_TRACE_DEFN_x(x) \
void bpf_trace_run # # x ( struct bpf_prog * prog , \
REPEAT ( x , SARG , __DL_COM , __SEQ_0_11 ) ) \
{ \
u64 args [ x ] ; \
REPEAT ( x , COPY , __DL_SEM , __SEQ_0_11 ) ; \
__bpf_trace_run ( prog , args ) ; \
} \
EXPORT_SYMBOL_GPL ( bpf_trace_run # # x )
/* One entry point per supported argument count (1 through 12). */
BPF_TRACE_DEFN_x ( 1 ) ;
BPF_TRACE_DEFN_x ( 2 ) ;
BPF_TRACE_DEFN_x ( 3 ) ;
BPF_TRACE_DEFN_x ( 4 ) ;
BPF_TRACE_DEFN_x ( 5 ) ;
BPF_TRACE_DEFN_x ( 6 ) ;
BPF_TRACE_DEFN_x ( 7 ) ;
BPF_TRACE_DEFN_x ( 8 ) ;
BPF_TRACE_DEFN_x ( 9 ) ;
BPF_TRACE_DEFN_x ( 10 ) ;
BPF_TRACE_DEFN_x ( 11 ) ;
BPF_TRACE_DEFN_x ( 12 ) ;
static int __bpf_probe_register ( struct bpf_raw_event_map * btp , struct bpf_prog * prog )
{
struct tracepoint * tp = btp - > tp ;
/*
* check that program doesn ' t access arguments beyond what ' s
* available in this tracepoint
*/
if ( prog - > aux - > max_ctx_offset > btp - > num_args * sizeof ( u64 ) )
return - EINVAL ;
2019-04-26 11:49:47 -07:00
if ( prog - > aux - > max_tp_access > btp - > writable_size )
return - EINVAL ;
2018-03-28 12:05:37 -07:00
return tracepoint_probe_register ( tp , ( void * ) btp - > bpf_func , prog ) ;
}
/*
 * Public entry point for attaching @prog to the raw tracepoint @btp.
 * Returns whatever __bpf_probe_register() returns.
 */
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
	int err = __bpf_probe_register(btp, prog);

	return err;
}
int bpf_probe_unregister ( struct bpf_raw_event_map * btp , struct bpf_prog * prog )
{
2019-01-30 18:12:44 -08:00
return tracepoint_probe_unregister ( btp - > tp , ( void * ) btp - > bpf_func , prog ) ;
2018-03-28 12:05:37 -07:00
}
bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.
There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
to this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.
This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-05-24 11:21:09 -07:00
/*
 * Describe the BPF program attached to @event and where it is attached.
 *
 * @event:        perf event to inspect
 * @prog_id:      receives the id of the attached program
 * @fd_type:      receives the attachment type (BPF_FD_TYPE_*)
 * @buf:          receives a pointer to the tracepoint/probe name
 * @probe_offset: receives the probe offset (0 for tracepoints)
 * @probe_addr:   receives the probe address (0 for tracepoints; not
 *                written on the uprobe path)
 *
 * Returns 0 for tracepoints, the result of bpf_get_kprobe_info() /
 * bpf_get_uprobe_info() for probes, -ENOENT when no program is attached,
 * and -EOPNOTSUPP for BPF_PROG_TYPE_PERF_EVENT programs or when the event
 * is neither a tracepoint nor a supported probe.
 */
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
			    u32 *fd_type, const char **buf,
			    u64 *probe_offset, u64 *probe_addr)
{
	struct bpf_prog *attached = event->prog;
	bool from_tracepoint, from_syscall_tp;
	int tp_flags, ret;

	if (!attached)
		return -ENOENT;

	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
	if (attached->type == BPF_PROG_TYPE_PERF_EVENT)
		return -EOPNOTSUPP;

	*prog_id = attached->aux->id;
	tp_flags = event->tp_event->flags;
	from_tracepoint = tp_flags & TRACE_EVENT_FL_TRACEPOINT;
	from_syscall_tp = is_syscall_trace_event(event->tp_event);

	if (from_tracepoint || from_syscall_tp) {
		if (from_tracepoint)
			*buf = event->tp_event->tp->name;
		else
			*buf = event->tp_event->name;
		*fd_type = BPF_FD_TYPE_TRACEPOINT;
		*probe_offset = 0x0;
		*probe_addr = 0x0;
		return 0;
	}

	/* kprobe/uprobe */
	ret = -EOPNOTSUPP;
#ifdef CONFIG_KPROBE_EVENTS
	if (tp_flags & TRACE_EVENT_FL_KPROBE)
		ret = bpf_get_kprobe_info(event, fd_type, buf,
					  probe_offset, probe_addr,
					  event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
	if (tp_flags & TRACE_EVENT_FL_UPROBE)
		ret = bpf_get_uprobe_info(event, fd_type, buf,
					  probe_offset,
					  event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
	return ret;
}
2018-12-12 16:42:37 -08:00
2019-06-25 17:35:03 -07:00
/*
 * Boot-time setup: bind do_bpf_send_signal() to the irq_work embedded in
 * every possible CPU's send_signal_work instance. Always returns 0.
 */
static int __init send_signal_irq_work_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		init_irq_work(&per_cpu_ptr(&send_signal_work, cpu)->irq_work,
			      do_bpf_send_signal);

	return 0;
}
subsys_initcall(send_signal_irq_work_init);
2018-12-12 16:42:37 -08:00
# ifdef CONFIG_MODULES
2019-05-13 12:04:36 -07:00
static int bpf_event_notify ( struct notifier_block * nb , unsigned long op ,
void * module )
2018-12-12 16:42:37 -08:00
{
struct bpf_trace_module * btm , * tmp ;
struct module * mod = module ;
if ( mod - > num_bpf_raw_events = = 0 | |
( op ! = MODULE_STATE_COMING & & op ! = MODULE_STATE_GOING ) )
return 0 ;
mutex_lock ( & bpf_module_mutex ) ;
switch ( op ) {
case MODULE_STATE_COMING :
btm = kzalloc ( sizeof ( * btm ) , GFP_KERNEL ) ;
if ( btm ) {
btm - > module = module ;
list_add ( & btm - > list , & bpf_trace_modules ) ;
}
break ;
case MODULE_STATE_GOING :
list_for_each_entry_safe ( btm , tmp , & bpf_trace_modules , list ) {
if ( btm - > module = = module ) {
list_del ( & btm - > list ) ;
kfree ( btm ) ;
break ;
}
}
break ;
}
mutex_unlock ( & bpf_module_mutex ) ;
return 0 ;
}
/* Notifier block that routes module state changes to bpf_event_notify(). */
static struct notifier_block bpf_module_nb = {
. notifier_call = bpf_event_notify ,
} ;
2019-05-13 12:04:36 -07:00
/*
 * Boot-time setup: subscribe bpf_module_nb to module state notifications.
 * The result of register_module_notifier() is not checked; always returns 0.
 */
static int __init bpf_event_init(void)
{
	register_module_notifier(&bpf_module_nb);

	return 0;
}
fs_initcall(bpf_event_init);
# endif /* CONFIG_MODULES */