#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <libelf.h>
#include <gelf.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <poll.h>
#include "libbpf.h"
#include "bpf_helpers.h"
#include "bpf_load.h"

/* Mount point of the tracing debugfs; used to create and read kprobe events. */
#define DEBUGFS "/sys/kernel/debug/tracing/"
2014-12-02 02:06:37 +03:00
static char license [ 128 ] ;
2015-03-25 22:49:23 +03:00
static int kern_version ;
2014-12-02 02:06:37 +03:00
static bool processed_sec [ 128 ] ;
int map_fd [ MAX_MAPS ] ;
int prog_fd [ MAX_PROGS ] ;
2015-03-25 22:49:23 +03:00
int event_fd [ MAX_PROGS ] ;
2014-12-02 02:06:37 +03:00
int prog_cnt ;
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
/* fd of the first BPF_MAP_TYPE_PROG_ARRAY map seen, the bpf_tail_call()
 * jump table; -1 until load_maps() finds one.
 */
int prog_array_fd = -1;

/* Store @prog_fd into the prog array at the slot given by the leading
 * decimal number in @event (e.g. "kprobe/5" -> slot 5), making the
 * program reachable via bpf_tail_call().
 * Returns 0 on success, -1 on failure.
 */
static int populate_prog_array(const char *event, int prog_fd)
{
	int ind = atoi(event), err;

	err = bpf_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
	if (err < 0) {
		printf("failed to store prog_fd in prog_array\n");
		return -1;
	}
	return 0;
}
2014-12-02 02:06:37 +03:00
static int load_and_attach ( const char * event , struct bpf_insn * prog , int size )
{
bool is_socket = strncmp ( event , " socket " , 6 ) = = 0 ;
2015-03-25 22:49:23 +03:00
bool is_kprobe = strncmp ( event , " kprobe/ " , 7 ) = = 0 ;
bool is_kretprobe = strncmp ( event , " kretprobe/ " , 10 ) = = 0 ;
enum bpf_prog_type prog_type ;
char buf [ 256 ] ;
int fd , efd , err , id ;
struct perf_event_attr attr = { } ;
attr . type = PERF_TYPE_TRACEPOINT ;
attr . sample_type = PERF_SAMPLE_RAW ;
attr . sample_period = 1 ;
attr . wakeup_events = 1 ;
if ( is_socket ) {
prog_type = BPF_PROG_TYPE_SOCKET_FILTER ;
} else if ( is_kprobe | | is_kretprobe ) {
prog_type = BPF_PROG_TYPE_KPROBE ;
} else {
printf ( " Unknown event '%s' \n " , event ) ;
2014-12-02 02:06:37 +03:00
return - 1 ;
2015-03-25 22:49:23 +03:00
}
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
fd = bpf_prog_load ( prog_type , prog , size , license , kern_version ) ;
if ( fd < 0 ) {
printf ( " bpf_prog_load() err=%d \n %s " , errno , bpf_log_buf ) ;
return - 1 ;
}
prog_fd [ prog_cnt + + ] = fd ;
if ( is_socket ) {
event + = 6 ;
if ( * event ! = ' / ' )
return 0 ;
event + + ;
if ( ! isdigit ( * event ) ) {
printf ( " invalid prog number \n " ) ;
return - 1 ;
}
return populate_prog_array ( event , fd ) ;
}
2015-03-25 22:49:23 +03:00
if ( is_kprobe | | is_kretprobe ) {
if ( is_kprobe )
event + = 7 ;
else
event + = 10 ;
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
if ( * event = = 0 ) {
printf ( " event name cannot be empty \n " ) ;
return - 1 ;
}
if ( isdigit ( * event ) )
return populate_prog_array ( event , fd ) ;
2015-03-25 22:49:23 +03:00
snprintf ( buf , sizeof ( buf ) ,
" echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events " ,
is_kprobe ? ' p ' : ' r ' , event , event ) ;
err = system ( buf ) ;
if ( err < 0 ) {
printf ( " failed to create kprobe '%s' error '%s' \n " ,
event , strerror ( errno ) ) ;
return - 1 ;
}
}
2014-12-02 02:06:37 +03:00
2015-03-25 22:49:23 +03:00
strcpy ( buf , DEBUGFS ) ;
strcat ( buf , " events/kprobes/ " ) ;
strcat ( buf , event ) ;
strcat ( buf , " /id " ) ;
efd = open ( buf , O_RDONLY , 0 ) ;
if ( efd < 0 ) {
printf ( " failed to open event %s \n " , event ) ;
return - 1 ;
}
err = read ( efd , buf , sizeof ( buf ) ) ;
if ( err < 0 | | err > = sizeof ( buf ) ) {
printf ( " read from '%s' failed '%s' \n " , event , strerror ( errno ) ) ;
return - 1 ;
}
close ( efd ) ;
buf [ err ] = 0 ;
id = atoi ( buf ) ;
attr . config = id ;
efd = perf_event_open ( & attr , - 1 /*pid*/ , 0 /*cpu*/ , - 1 /*group_fd*/ , 0 ) ;
if ( efd < 0 ) {
printf ( " event %d fd %d err %s \n " , id , efd , strerror ( errno ) ) ;
return - 1 ;
}
event_fd [ prog_cnt - 1 ] = efd ;
ioctl ( efd , PERF_EVENT_IOC_ENABLE , 0 ) ;
ioctl ( efd , PERF_EVENT_IOC_SET_BPF , fd ) ;
2014-12-02 02:06:37 +03:00
return 0 ;
}
static int load_maps ( struct bpf_map_def * maps , int len )
{
int i ;
for ( i = 0 ; i < len / sizeof ( struct bpf_map_def ) ; i + + ) {
map_fd [ i ] = bpf_create_map ( maps [ i ] . type ,
maps [ i ] . key_size ,
maps [ i ] . value_size ,
maps [ i ] . max_entries ) ;
if ( map_fd [ i ] < 0 )
return 1 ;
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
if ( maps [ i ] . type = = BPF_MAP_TYPE_PROG_ARRAY )
prog_array_fd = map_fd [ i ] ;
2014-12-02 02:06:37 +03:00
}
return 0 ;
}
/* Fetch section @i of @elf: its name, header and (single) data blob.
 *
 * Returns 0 on success; a non-zero step number identifies which lookup
 * failed (nameless or empty sections are also rejected).  The final
 * elf_getdata() call double-checks the section has exactly one data
 * descriptor, since callers treat *data as the whole section.
 */
static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
		   GElf_Shdr *shdr, Elf_Data **data)
{
	Elf_Scn *scn;

	scn = elf_getscn(elf, i);
	if (!scn)
		return 1;

	if (gelf_getshdr(scn, shdr) != shdr)
		return 2;

	*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
	if (!*shname || !shdr->sh_size)
		return 3;

	*data = elf_getdata(scn, 0);
	if (!*data || elf_getdata(scn, *data) != NULL)
		return 4;

	return 0;
}
/* Apply map-fd relocations to a program section.
 *
 * Each relocation entry points at a BPF_LD|BPF_IMM|BPF_DW instruction that
 * loads a map address; patch it to carry the real map fd (marked with
 * BPF_PSEUDO_MAP_FD so the kernel resolves it).  The symbol value is the
 * byte offset of the map definition, hence the division by its size.
 * Returns 0 on success, 1 on a malformed relocation.
 */
static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
				GElf_Shdr *shdr, struct bpf_insn *insn)
{
	int i, nrels;

	nrels = shdr->sh_size / shdr->sh_entsize;

	for (i = 0; i < nrels; i++) {
		GElf_Sym sym;
		GElf_Rel rel;
		unsigned int insn_idx;

		gelf_getrel(data, i, &rel);

		insn_idx = rel.r_offset / sizeof(struct bpf_insn);

		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym);

		if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
			/* %u: insn_idx is unsigned (was printed with %d) */
			printf("invalid relo for insn[%u].code 0x%x\n",
			       insn_idx, insn[insn_idx].code);
			return 1;
		}
		insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
		insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)];
	}

	return 0;
}
int load_bpf_file ( char * path )
{
int fd , i ;
Elf * elf ;
GElf_Ehdr ehdr ;
GElf_Shdr shdr , shdr_prog ;
Elf_Data * data , * data_prog , * symbols = NULL ;
char * shname , * shname_prog ;
if ( elf_version ( EV_CURRENT ) = = EV_NONE )
return 1 ;
fd = open ( path , O_RDONLY , 0 ) ;
if ( fd < 0 )
return 1 ;
elf = elf_begin ( fd , ELF_C_READ , NULL ) ;
if ( ! elf )
return 1 ;
if ( gelf_getehdr ( elf , & ehdr ) ! = & ehdr )
return 1 ;
2015-03-25 22:49:23 +03:00
/* clear all kprobes */
i = system ( " echo \" \" > /sys/kernel/debug/tracing/kprobe_events " ) ;
2014-12-02 02:06:37 +03:00
/* scan over all elf sections to get license and map info */
for ( i = 1 ; i < ehdr . e_shnum ; i + + ) {
if ( get_sec ( elf , i , & ehdr , & shname , & shdr , & data ) )
continue ;
if ( 0 ) /* helpful for llvm debugging */
printf ( " section %d:%s data %p size %zd link %d flags %d \n " ,
i , shname , data - > d_buf , data - > d_size ,
shdr . sh_link , ( int ) shdr . sh_flags ) ;
if ( strcmp ( shname , " license " ) = = 0 ) {
processed_sec [ i ] = true ;
memcpy ( license , data - > d_buf , data - > d_size ) ;
2015-03-25 22:49:23 +03:00
} else if ( strcmp ( shname , " version " ) = = 0 ) {
processed_sec [ i ] = true ;
if ( data - > d_size ! = sizeof ( int ) ) {
printf ( " invalid size of version section %zd \n " ,
data - > d_size ) ;
return 1 ;
}
memcpy ( & kern_version , data - > d_buf , sizeof ( int ) ) ;
2014-12-02 02:06:37 +03:00
} else if ( strcmp ( shname , " maps " ) = = 0 ) {
processed_sec [ i ] = true ;
if ( load_maps ( data - > d_buf , data - > d_size ) )
return 1 ;
} else if ( shdr . sh_type = = SHT_SYMTAB ) {
symbols = data ;
}
}
/* load programs that need map fixup (relocations) */
for ( i = 1 ; i < ehdr . e_shnum ; i + + ) {
if ( get_sec ( elf , i , & ehdr , & shname , & shdr , & data ) )
continue ;
if ( shdr . sh_type = = SHT_REL ) {
struct bpf_insn * insns ;
if ( get_sec ( elf , shdr . sh_info , & ehdr , & shname_prog ,
& shdr_prog , & data_prog ) )
continue ;
insns = ( struct bpf_insn * ) data_prog - > d_buf ;
processed_sec [ shdr . sh_info ] = true ;
processed_sec [ i ] = true ;
if ( parse_relo_and_apply ( data , symbols , & shdr , insns ) )
continue ;
2015-03-25 22:49:23 +03:00
if ( memcmp ( shname_prog , " kprobe/ " , 7 ) = = 0 | |
memcmp ( shname_prog , " kretprobe/ " , 10 ) = = 0 | |
2014-12-02 02:06:37 +03:00
memcmp ( shname_prog , " socket " , 6 ) = = 0 )
load_and_attach ( shname_prog , insns , data_prog - > d_size ) ;
}
}
/* load programs that don't use maps */
for ( i = 1 ; i < ehdr . e_shnum ; i + + ) {
if ( processed_sec [ i ] )
continue ;
if ( get_sec ( elf , i , & ehdr , & shname , & shdr , & data ) )
continue ;
2015-03-25 22:49:23 +03:00
if ( memcmp ( shname , " kprobe/ " , 7 ) = = 0 | |
memcmp ( shname , " kretprobe/ " , 10 ) = = 0 | |
2014-12-02 02:06:37 +03:00
memcmp ( shname , " socket " , 6 ) = = 0 )
load_and_attach ( shname , data - > d_buf , data - > d_size ) ;
}
close ( fd ) ;
return 0 ;
}
2015-03-25 22:49:23 +03:00
void read_trace_pipe ( void )
{
int trace_fd ;
trace_fd = open ( DEBUGFS " trace_pipe " , O_RDONLY , 0 ) ;
if ( trace_fd < 0 )
return ;
while ( 1 ) {
static char buf [ 4096 ] ;
ssize_t sz ;
sz = read ( trace_fd , buf , sizeof ( buf ) ) ;
if ( sz > 0 ) {
buf [ sz ] = 0 ;
puts ( buf ) ;
}
}
}