2014-12-02 02:06:37 +03:00
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <libelf.h>
#include <gelf.h>
#include <errno.h>
#include <limits.h>
#include <unistd.h>
#include <string.h>
#include <stdbool.h>
2015-03-25 22:49:23 +03:00
# include <stdlib.h>
2014-12-02 02:06:37 +03:00
# include <linux/bpf.h>
# include <linux/filter.h>
2015-03-25 22:49:23 +03:00
# include <linux/perf_event.h>
# include <sys/syscall.h>
# include <sys/ioctl.h>
# include <sys/mman.h>
# include <poll.h>
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates what future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
# include <ctype.h>
2014-12-02 02:06:37 +03:00
# include "libbpf.h"
# include "bpf_helpers.h"
# include "bpf_load.h"
2015-03-25 22:49:23 +03:00
/* Root of the tracing debugfs tree; callers append a relative file name. */
#define DEBUGFS "/sys/kernel/debug/tracing/"
2014-12-02 02:06:37 +03:00
/*
 * License text copied out of the object file's license section by
 * load_bpf_file() and handed to bpf_prog_load() for every program.
 */
static char license [ 128 ] ;
2015-03-25 22:49:23 +03:00
/*
 * Value copied out of the object file's version section by load_bpf_file()
 * and handed to bpf_prog_load() for every program.
 */
static int kern_version ;
2014-12-02 02:06:37 +03:00
/*
 * processed_sec[i] is set once ELF section i has been consumed, so the final
 * pass of load_bpf_file() skips it.
 */
static bool processed_sec [ 128 ] ;
/* fds returned by bpf_create_map(), in the order the maps were created. */
int map_fd [ MAX_MAPS ] ;
/* fds returned by bpf_prog_load(), appended by load_and_attach(). */
int prog_fd [ MAX_PROGS ] ;
2015-03-25 22:49:23 +03:00
/*
 * perf event fds opened by load_and_attach(); the slot index matches the
 * prog_fd[] slot of the program attached to that event.
 */
int event_fd [ MAX_PROGS ] ;
2014-12-02 02:06:37 +03:00
/* Number of programs loaded so far, i.e. the next free prog_fd[] slot. */
int prog_cnt ;
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates what future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
/*
 * fd of the BPF_MAP_TYPE_PROG_ARRAY map recorded by load_maps() (the last one
 * created, if several); stays -1 until such a map has been created.
 */
int prog_array_fd = - 1 ;
/*
 * Store a loaded program's fd in the program-array map.
 *
 * @event:   text that starts with the decimal slot index (what follows the
 *           section-name prefix, e.g. the "3" of "kprobe/3")
 * @prog_fd: fd of the loaded BPF program to store in that slot
 *
 * The element is written to the map referenced by the file-level
 * prog_array_fd.  Returns 0 on success and -1 on failure.
 */
static int populate_prog_array(const char *event, int prog_fd)
{
	char *end;
	long slot;
	int ind, err;

	/* strtol() instead of atoi(): reports "no digits" and overflow. */
	errno = 0;
	slot = strtol(event, &end, 10);
	if (end == event || errno == ERANGE || slot < 0 || slot > INT_MAX) {
		printf("invalid prog number\n");
		return -1;
	}
	ind = (int)slot;

	err = bpf_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
	if (err < 0) {
		printf("failed to store prog_fd in prog_array\n");
		return -1;
	}
	return 0;
}
2014-12-02 02:06:37 +03:00
static int load_and_attach ( const char * event , struct bpf_insn * prog , int size )
{
bool is_socket = strncmp ( event , " socket " , 6 ) = = 0 ;
2015-03-25 22:49:23 +03:00
bool is_kprobe = strncmp ( event , " kprobe/ " , 7 ) = = 0 ;
bool is_kretprobe = strncmp ( event , " kretprobe/ " , 10 ) = = 0 ;
2016-04-07 04:43:29 +03:00
bool is_tracepoint = strncmp ( event , " tracepoint/ " , 11 ) = = 0 ;
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-19 22:16:51 +03:00
bool is_xdp = strncmp ( event , " xdp " , 3 ) = = 0 ;
2016-09-02 04:37:25 +03:00
bool is_perf_event = strncmp ( event , " perf_event " , 10 ) = = 0 ;
2015-03-25 22:49:23 +03:00
enum bpf_prog_type prog_type ;
char buf [ 256 ] ;
int fd , efd , err , id ;
struct perf_event_attr attr = { } ;
attr . type = PERF_TYPE_TRACEPOINT ;
attr . sample_type = PERF_SAMPLE_RAW ;
attr . sample_period = 1 ;
attr . wakeup_events = 1 ;
if ( is_socket ) {
prog_type = BPF_PROG_TYPE_SOCKET_FILTER ;
} else if ( is_kprobe | | is_kretprobe ) {
prog_type = BPF_PROG_TYPE_KPROBE ;
2016-04-07 04:43:29 +03:00
} else if ( is_tracepoint ) {
prog_type = BPF_PROG_TYPE_TRACEPOINT ;
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-19 22:16:51 +03:00
} else if ( is_xdp ) {
prog_type = BPF_PROG_TYPE_XDP ;
2016-09-02 04:37:25 +03:00
} else if ( is_perf_event ) {
prog_type = BPF_PROG_TYPE_PERF_EVENT ;
2015-03-25 22:49:23 +03:00
} else {
printf ( " Unknown event '%s' \n " , event ) ;
2014-12-02 02:06:37 +03:00
return - 1 ;
2015-03-25 22:49:23 +03:00
}
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
fd = bpf_prog_load ( prog_type , prog , size , license , kern_version ) ;
if ( fd < 0 ) {
printf ( " bpf_prog_load() err=%d \n %s " , errno , bpf_log_buf ) ;
return - 1 ;
}
prog_fd [ prog_cnt + + ] = fd ;
2016-09-02 04:37:25 +03:00
if ( is_xdp | | is_perf_event )
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-19 22:16:51 +03:00
return 0 ;
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
if ( is_socket ) {
event + = 6 ;
if ( * event ! = ' / ' )
return 0 ;
event + + ;
if ( ! isdigit ( * event ) ) {
printf ( " invalid prog number \n " ) ;
return - 1 ;
}
return populate_prog_array ( event , fd ) ;
}
2015-03-25 22:49:23 +03:00
if ( is_kprobe | | is_kretprobe ) {
if ( is_kprobe )
event + = 7 ;
else
event + = 10 ;
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
if ( * event = = 0 ) {
printf ( " event name cannot be empty \n " ) ;
return - 1 ;
}
if ( isdigit ( * event ) )
return populate_prog_array ( event , fd ) ;
2015-03-25 22:49:23 +03:00
snprintf ( buf , sizeof ( buf ) ,
" echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events " ,
is_kprobe ? ' p ' : ' r ' , event , event ) ;
err = system ( buf ) ;
if ( err < 0 ) {
printf ( " failed to create kprobe '%s' error '%s' \n " ,
event , strerror ( errno ) ) ;
return - 1 ;
}
2014-12-02 02:06:37 +03:00
2016-04-07 04:43:29 +03:00
strcpy ( buf , DEBUGFS ) ;
strcat ( buf , " events/kprobes/ " ) ;
strcat ( buf , event ) ;
strcat ( buf , " /id " ) ;
} else if ( is_tracepoint ) {
event + = 11 ;
if ( * event = = 0 ) {
printf ( " event name cannot be empty \n " ) ;
return - 1 ;
}
strcpy ( buf , DEBUGFS ) ;
strcat ( buf , " events/ " ) ;
strcat ( buf , event ) ;
strcat ( buf , " /id " ) ;
}
2015-03-25 22:49:23 +03:00
efd = open ( buf , O_RDONLY , 0 ) ;
if ( efd < 0 ) {
printf ( " failed to open event %s \n " , event ) ;
return - 1 ;
}
err = read ( efd , buf , sizeof ( buf ) ) ;
if ( err < 0 | | err > = sizeof ( buf ) ) {
printf ( " read from '%s' failed '%s' \n " , event , strerror ( errno ) ) ;
return - 1 ;
}
close ( efd ) ;
buf [ err ] = 0 ;
id = atoi ( buf ) ;
attr . config = id ;
efd = perf_event_open ( & attr , - 1 /*pid*/ , 0 /*cpu*/ , - 1 /*group_fd*/ , 0 ) ;
if ( efd < 0 ) {
printf ( " event %d fd %d err %s \n " , id , efd , strerror ( errno ) ) ;
return - 1 ;
}
event_fd [ prog_cnt - 1 ] = efd ;
ioctl ( efd , PERF_EVENT_IOC_ENABLE , 0 ) ;
ioctl ( efd , PERF_EVENT_IOC_SET_BPF , fd ) ;
2014-12-02 02:06:37 +03:00
return 0 ;
}
static int load_maps ( struct bpf_map_def * maps , int len )
{
int i ;
for ( i = 0 ; i < len / sizeof ( struct bpf_map_def ) ; i + + ) {
map_fd [ i ] = bpf_create_map ( maps [ i ] . type ,
maps [ i ] . key_size ,
maps [ i ] . value_size ,
2016-03-08 08:57:20 +03:00
maps [ i ] . max_entries ,
maps [ i ] . map_flags ) ;
2016-03-08 08:57:18 +03:00
if ( map_fd [ i ] < 0 ) {
printf ( " failed to create a map: %d %s \n " ,
errno , strerror ( errno ) ) ;
2014-12-02 02:06:37 +03:00
return 1 ;
2016-03-08 08:57:18 +03:00
}
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 02:59:05 +03:00
if ( maps [ i ] . type = = BPF_MAP_TYPE_PROG_ARRAY )
prog_array_fd = map_fd [ i ] ;
2014-12-02 02:06:37 +03:00
}
return 0 ;
}
/*
 * Fetch header, name and data of ELF section number @i.
 *
 * On success *shname points at the section name, *shdr holds the section
 * header and *data the section's single data descriptor.
 *
 * Returns 0 on success, or a non-zero code telling which step failed:
 *   1 - no such section
 *   2 - section header could not be read
 *   3 - section has no name or is empty
 *   4 - section has no data, or more than one data descriptor
 */
static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
		   GElf_Shdr *shdr, Elf_Data **data)
{
	Elf_Scn *section = elf_getscn(elf, i);

	if (section == NULL)
		return 1;

	if (gelf_getshdr(section, shdr) != shdr)
		return 2;

	*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
	if (*shname == NULL || shdr->sh_size == 0)
		return 3;

	*data = elf_getdata(section, NULL);
	if (*data == NULL)
		return 4;
	/* A second descriptor means the section is split; not supported. */
	if (elf_getdata(section, *data) != NULL)
		return 4;

	return 0;
}
/*
 * Apply the map relocations of one relocation section to a program.
 *
 * @data:    contents of the relocation section
 * @symbols: contents of the symbol table
 * @shdr:    header of the relocation section
 * @insn:    instructions of the program being relocated
 *
 * Every relocated instruction must be a 64-bit immediate load; it is
 * patched to carry the fd of the map whose definition sits at the symbol's
 * offset in the maps section.
 *
 * Returns 0 on success, 1 on failure (a message is printed).
 * NOTE(review): the instruction count is not passed in, so insn_idx cannot
 * be range-checked here.
 */
static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
				GElf_Shdr *shdr, struct bpf_insn *insn)
{
	const size_t max_maps = sizeof(map_fd) / sizeof(map_fd[0]);
	int i, nrels;

	if (!symbols) {
		printf("relocations present but no symbol table found\n");
		return 1;
	}
	if (!shdr->sh_entsize) {
		printf("relocation section has zero entry size\n");
		return 1;
	}

	nrels = shdr->sh_size / shdr->sh_entsize;

	for (i = 0; i < nrels; i++) {
		GElf_Sym sym;
		GElf_Rel rel;
		unsigned int insn_idx;
		size_t map_idx;

		if (!gelf_getrel(data, i, &rel) ||
		    !gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) {
			printf("failed to read relocation %d\n", i);
			return 1;
		}

		insn_idx = rel.r_offset / sizeof(struct bpf_insn);

		if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
			printf("invalid relo for insn[%u].code 0x%x\n",
			       insn_idx, insn[insn_idx].code);
			return 1;
		}

		/* The symbol value is the byte offset of the map definition. */
		map_idx = sym.st_value / sizeof(struct bpf_map_def);
		if (map_idx >= max_maps) {
			printf("relocation %d refers to map %zu out of range\n",
			       i, map_idx);
			return 1;
		}

		insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
		insn[insn_idx].imm = map_fd[map_idx];
	}

	return 0;
}
int load_bpf_file ( char * path )
{
int fd , i ;
Elf * elf ;
GElf_Ehdr ehdr ;
GElf_Shdr shdr , shdr_prog ;
Elf_Data * data , * data_prog , * symbols = NULL ;
char * shname , * shname_prog ;
if ( elf_version ( EV_CURRENT ) = = EV_NONE )
return 1 ;
fd = open ( path , O_RDONLY , 0 ) ;
if ( fd < 0 )
return 1 ;
elf = elf_begin ( fd , ELF_C_READ , NULL ) ;
if ( ! elf )
return 1 ;
if ( gelf_getehdr ( elf , & ehdr ) ! = & ehdr )
return 1 ;
2015-03-25 22:49:23 +03:00
/* clear all kprobes */
i = system ( " echo \" \" > /sys/kernel/debug/tracing/kprobe_events " ) ;
2014-12-02 02:06:37 +03:00
/* scan over all elf sections to get license and map info */
for ( i = 1 ; i < ehdr . e_shnum ; i + + ) {
if ( get_sec ( elf , i , & ehdr , & shname , & shdr , & data ) )
continue ;
if ( 0 ) /* helpful for llvm debugging */
printf ( " section %d:%s data %p size %zd link %d flags %d \n " ,
i , shname , data - > d_buf , data - > d_size ,
shdr . sh_link , ( int ) shdr . sh_flags ) ;
if ( strcmp ( shname , " license " ) = = 0 ) {
processed_sec [ i ] = true ;
memcpy ( license , data - > d_buf , data - > d_size ) ;
2015-03-25 22:49:23 +03:00
} else if ( strcmp ( shname , " version " ) = = 0 ) {
processed_sec [ i ] = true ;
if ( data - > d_size ! = sizeof ( int ) ) {
printf ( " invalid size of version section %zd \n " ,
data - > d_size ) ;
return 1 ;
}
memcpy ( & kern_version , data - > d_buf , sizeof ( int ) ) ;
2014-12-02 02:06:37 +03:00
} else if ( strcmp ( shname , " maps " ) = = 0 ) {
processed_sec [ i ] = true ;
if ( load_maps ( data - > d_buf , data - > d_size ) )
return 1 ;
} else if ( shdr . sh_type = = SHT_SYMTAB ) {
symbols = data ;
}
}
/* load programs that need map fixup (relocations) */
for ( i = 1 ; i < ehdr . e_shnum ; i + + ) {
if ( get_sec ( elf , i , & ehdr , & shname , & shdr , & data ) )
continue ;
if ( shdr . sh_type = = SHT_REL ) {
struct bpf_insn * insns ;
if ( get_sec ( elf , shdr . sh_info , & ehdr , & shname_prog ,
& shdr_prog , & data_prog ) )
continue ;
insns = ( struct bpf_insn * ) data_prog - > d_buf ;
processed_sec [ shdr . sh_info ] = true ;
processed_sec [ i ] = true ;
if ( parse_relo_and_apply ( data , symbols , & shdr , insns ) )
continue ;
2015-03-25 22:49:23 +03:00
if ( memcmp ( shname_prog , " kprobe/ " , 7 ) = = 0 | |
memcmp ( shname_prog , " kretprobe/ " , 10 ) = = 0 | |
2016-04-07 04:43:29 +03:00
memcmp ( shname_prog , " tracepoint/ " , 11 ) = = 0 | |
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-19 22:16:51 +03:00
memcmp ( shname_prog , " xdp " , 3 ) = = 0 | |
2016-09-02 04:37:25 +03:00
memcmp ( shname_prog , " perf_event " , 10 ) = = 0 | |
2014-12-02 02:06:37 +03:00
memcmp ( shname_prog , " socket " , 6 ) = = 0 )
load_and_attach ( shname_prog , insns , data_prog - > d_size ) ;
}
}
/* load programs that don't use maps */
for ( i = 1 ; i < ehdr . e_shnum ; i + + ) {
if ( processed_sec [ i ] )
continue ;
if ( get_sec ( elf , i , & ehdr , & shname , & shdr , & data ) )
continue ;
2015-03-25 22:49:23 +03:00
if ( memcmp ( shname , " kprobe/ " , 7 ) = = 0 | |
memcmp ( shname , " kretprobe/ " , 10 ) = = 0 | |
2016-04-07 04:43:29 +03:00
memcmp ( shname , " tracepoint/ " , 11 ) = = 0 | |
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-19 22:16:51 +03:00
memcmp ( shname , " xdp " , 3 ) = = 0 | |
2016-09-02 04:37:25 +03:00
memcmp ( shname , " perf_event " , 10 ) = = 0 | |
2014-12-02 02:06:37 +03:00
memcmp ( shname , " socket " , 6 ) = = 0 )
load_and_attach ( shname , data - > d_buf , data - > d_size ) ;
}
close ( fd ) ;
return 0 ;
}
2015-03-25 22:49:23 +03:00
/*
 * Copy the kernel trace pipe to stdout.
 *
 * Returns immediately when the pipe cannot be opened; otherwise loops
 * forever, printing each chunk that read() delivers.
 */
void read_trace_pipe(void)
{
	int trace_fd;

	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
	if (trace_fd < 0)
		return;

	while (1) {
		static char buf[4096];
		ssize_t sz;

		/* Leave one byte for the terminator written below. */
		sz = read(trace_fd, buf, sizeof(buf) - 1);
		if (sz > 0) {
			buf[sz] = 0;
			puts(buf);
		}
	}
}
2016-03-08 08:57:19 +03:00
/* Capacity of the symbol table below. */
# define MAX_SYMS 300000
/* Symbol table filled and sorted by load_kallsyms(), searched by ksym_search(). */
static struct ksym syms [ MAX_SYMS ] ;
/* Number of entries of syms[] filled in by load_kallsyms(). */
static int sym_cnt ;
static int ksym_cmp ( const void * p1 , const void * p2 )
{
return ( ( struct ksym * ) p1 ) - > addr - ( ( struct ksym * ) p2 ) - > addr ;
}
/*
 * Read /proc/kallsyms into syms[] and sort the table by address.
 *
 * Lines with a zero address are skipped.  Parsing stops at the first line
 * that does not match "<address> <type> <name>" or once syms[] is full.
 * sym_cnt is set to the number of entries stored.
 *
 * Returns 0 on success, -ENOENT when the file cannot be opened, and
 * -ENOMEM when a symbol name cannot be duplicated (the entries read up to
 * that point are kept and sorted).
 */
int load_kallsyms(void)
{
	const size_t max_syms = sizeof(syms) / sizeof(syms[0]);
	FILE *f = fopen("/proc/kallsyms", "r");
	char func[256], buf[256];
	char symbol;
	void *addr;
	size_t i = 0;
	int ret = 0;

	if (!f)
		return -ENOENT;

	/* Stop at capacity instead of writing past the end of syms[]. */
	while (i < max_syms && fgets(buf, sizeof(buf), f)) {
		if (sscanf(buf, "%p %c %255s", &addr, &symbol, func) != 3)
			break;
		if (!addr)
			continue;

		syms[i].name = strdup(func);
		if (!syms[i].name) {
			ret = -ENOMEM;
			break;
		}
		syms[i].addr = (long) addr;
		i++;
	}
	fclose(f);

	sym_cnt = i;
	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
	return ret;
}
/*
 * Find the symbol that contains address @key.
 *
 * syms[] must already be sorted by address (see load_kallsyms()).  An exact
 * match returns that entry.  Otherwise the entry just below @key is
 * returned, provided another entry lies above it.  A key outside the table
 * yields the first entry.
 *
 * Addresses are compared directly; storing their difference in an int
 * would truncate it and could send the search the wrong way.
 */
struct ksym *ksym_search(long key)
{
	int start = 0, end = sym_cnt;

	while (start < end) {
		int mid = start + (end - start) / 2;

		if (key < syms[mid].addr)
			end = mid;
		else if (key > syms[mid].addr)
			start = mid + 1;
		else
			return &syms[mid];
	}

	/* start < sym_cnt keeps syms[start] inside the filled part. */
	if (start >= 1 && start < sym_cnt &&
	    syms[start - 1].addr < key && key < syms[start].addr)
		/* valid ksym */
		return &syms[start - 1];

	/* out of range. return _stext */
	return &syms[0];
}