2019-06-15 22:12:24 +03:00
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Facebook
# include <stdint.h>
# include <stddef.h>
# include <stdbool.h>
# include <linux/bpf.h>
# include <linux/ptrace.h>
# include <linux/sched.h>
# include <linux/types.h>
2020-01-20 16:06:45 +03:00
# include <bpf/bpf_helpers.h>
2019-06-15 22:12:24 +03:00
/* Local 32-bit stand-in for pid_t. */
typedef uint32_t pid_t;

/*
 * Placeholder for the kernel's task_struct: the real layout is not
 * available here, so the type is only passed around by pointer.
 */
struct task_struct {};
/* Size of the comm[] buffer in struct strobelight_bpf_sample. */
#define TASK_COMM_LEN 16
/* Number of 64-bit slots per entry in the stack trace maps. */
#define PERF_MAX_STACK_DEPTH 127

/* Value type tags; not referenced in the visible part of this file. */
#define STROBE_TYPE_INVALID 0
#define STROBE_TYPE_INT 1
#define STROBE_TYPE_STR 2
#define STROBE_TYPE_MAP 3

/* Bit of the ktime timestamp that selects between stacks_0 and stacks_1. */
#define STACK_TABLE_EPOCH_SHIFT 20
/* Byte limit passed to every bpf_probe_read_user_str() call in this file. */
#define STROBE_MAX_STR_LEN 1
/* Number of slots in the strobemeta_cfgs map. */
#define STROBE_MAX_CFGS 32

/*
 * Size of the packed string area in struct strobemeta_payload: one string
 * per str metavar, plus (1 + 2 * entries) strings per map metavar.
 * STROBE_MAX_STRS, STROBE_MAX_MAPS and STROBE_MAX_MAP_ENTRIES are not defined
 * in the visible part of this file -- presumably the includer supplies them.
 */
#define STROBE_MAX_PAYLOAD						\
	(STROBE_MAX_STRS * STROBE_MAX_STR_LEN +				\
	 STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
/* Fixed 8-byte prefix shared by every metavariable value. */
struct strobe_value_header {
	/*
	 * Interpretation depends on the value type:
	 * 1. int: 0 if the value is not set, 1 otherwise;
	 * 2. str: always 1; whether the value is set is determined by ptr;
	 * 3. map: always 1; the pointer refers to an additional struct
	 *    holding the number of entries (up to STROBE_MAX_MAP_ENTRIES).
	 */
	uint16_t len;
	/*
	 * Room for future fields/flags. It also keeps the header at exactly
	 * 8 bytes, so BPF can fetch header and value with one 16-byte read.
	 */
	uint8_t _reserved[6];
};
/*
 * Type-agnostic view of a metavariable value. Only the BPF probe uses it,
 * but its layout has to stay a union of strobe_value_int, strobe_value_str
 * and strobe_value_map: header first, then one 64-bit slot that is either
 * an integer or a pointer.
 */
struct strobe_value_generic {
	struct strobe_value_header header;
	union {
		int64_t val;
		void *ptr;
	};
};
/* Integer metavariable: common header followed by the 64-bit value. */
struct strobe_value_int {
	struct strobe_value_header header;
	int64_t value;
};
/* String metavariable: common header followed by a C-string pointer. */
struct strobe_value_str {
	struct strobe_value_header header;
	const char *value;
};
/*
 * Map metavariable: common header followed by a pointer to the raw map
 * (struct strobe_map_raw).
 */
struct strobe_value_map {
	struct strobe_value_header header;
	const struct strobe_map_raw *value;
};
/* One key/value pair of a strobe map; both sides are C-string pointers. */
struct strobe_map_entry {
	const char *key;
	const char *val;
};
/*
 * Map of C-string key/value pairs with a fixed maximum capacity. Every map
 * carries an int64 ID that the application may use (or ignore) however it
 * sees fit. The map is "write-only": there is no way to get data back out
 * of it. It exists to provide metadata for profilers and is not meant for
 * in-app communication. All methods are thread-safe.
 */
struct strobe_map_raw {
	/*
	 * General-purpose unique ID; the application decides whether and how
	 * to use it. For the request metadata use case this is the unique
	 * request ID used to match metadata with stack traces on the
	 * Strobelight backend side.
	 */
	int64_t id;
	/* number of used entries in map */
	int64_t cnt;
	/*
	 * Having volatile here wouldn't change anything on the BPF side, but
	 * clang emits warnings for passing `volatile const char *` into
	 * bpf_probe_read_user_str, which expects just `const char *`.
	 */
	const char *tag;
	/*
	 * Key/value entries, each consisting of 2 pointers to key and value
	 * C strings.
	 */
	struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
};
/*
 * Supported values of strobe_value_loc.tls_mode. The numbering is relied on
 * by calc_location(): everything <= TLS_LOCAL_EXEC takes the simple path.
 */
#define TLS_NOT_SET -1
#define TLS_LOCAL_EXEC 0
#define TLS_IMM_EXEC 1
#define TLS_GENERAL_DYN 2
/*
 * Universal description of where a metavariable lives in TLS; works both
 * for static executables and for shared libraries.
 */
struct strobe_value_loc {
	/*
	 * TLS mode used for this particular metavariable:
	 * - -1 (TLS_NOT_SET) - no metavariable;
	 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
	 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
	 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
	 * Local Dynamic mode is not yet supported, because it was never seen
	 * in practice. The mode defines how the offset field is interpreted;
	 * see calc_location() below for details.
	 */
	int64_t tls_mode;
	/*
	 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64,
	 *                 tpidr_el0 for aarch64);
	 * TLS_IMM_EXEC: absolute address of GOT entry containing offset
	 *               from thread pointer;
	 * TLS_GENERAL_DYN: absolute address of double GOT entry
	 *                  containing tls_index_t struct;
	 */
	int64_t offset;
};
/*
 * Per-process configuration stored in the strobemeta_cfgs map: where each
 * int, str and map metavariable is located.
 */
struct strobemeta_cfg {
	/*
	 * Index of the map metavar whose id is reported as the request ID
	 * (compared against the map index in read_map_var()).
	 */
	int64_t req_meta_idx;
	struct strobe_value_loc int_locs[STROBE_MAX_INTS];
	struct strobe_value_loc str_locs[STROBE_MAX_STRS];
	struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
};
/* Describes one map metavar whose strings were packed into the payload. */
struct strobe_map_descr {
	uint64_t id;
	/* length of the tag string in the payload; 0 if no tag was read */
	int16_t tag_len;
	/*
	 * cnt <0 - map value isn't set;
	 * 0 - map has id set, but no key/value entries
	 */
	int16_t cnt;
	/*
	 * both key_lens[i] and val_lens[i] should be >0 for present key/value
	 * entry
	 */
	uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
	uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
};
/* Everything read_strobe_meta() collects for a single sample. */
struct strobemeta_payload {
	/* req_id has valid request ID, if req_meta_valid == 1 */
	int64_t req_id;
	uint8_t req_meta_valid;
	/*
	 * Bit N of the mask is 1 if the Nth int metavar was present and
	 * successfully read.
	 */
	uint64_t int_vals_set_mask;
	int64_t int_vals[STROBE_MAX_INTS];
	/* len is >0 for present values */
	uint16_t str_lens[STROBE_MAX_STRS];
	/* if map_descrs[i].cnt == -1, metavar is not present/set */
	struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
	/*
	 * Compactly packed values of str and map variables, in the form:
	 *   strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
	 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determine
	 * the length of each value.
	 */
	char payload[STROBE_MAX_PAYLOAD];
};
/* One sample as assembled by on_event() and pushed to the samples map. */
struct strobelight_bpf_sample {
	uint64_t ktime;
	char comm[TASK_COMM_LEN];
	pid_t pid;
	int user_stack_id;
	int kernel_stack_id;
	/* non-zero if read_strobe_meta() returned a payload end pointer */
	int has_meta;
	struct strobemeta_payload metadata;
	/*
	 * Makes it possible to pass (<real payload size> + 1) as data size
	 * to perf_submit(), which avoids perf_submit's paranoia about a zero
	 * size: it deduces that <real payload size> might **theoretically**
	 * be zero.
	 */
	char dummy_safeguard;
};
2019-07-05 18:50:12 +03:00
/*
 * BPF maps. They must be placed in the ELF section named exactly ".maps"
 * (no padding inside the string) to be picked up as BTF-defined maps.
 */

/* Perf event array that on_event() writes samples to. */
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(max_entries, 32);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} samples SEC(".maps");

/*
 * Two stack trace tables; on_event() picks one per sample based on the
 * STACK_TABLE_EPOCH_SHIFT bit of the timestamp.
 */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 16);
	__uint(key_size, sizeof(uint32_t));
	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
} stacks_0 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 16);
	__uint(key_size, sizeof(uint32_t));
	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
} stacks_1 SEC(".maps");

/* Single-slot per-CPU scratch area in which on_event() builds the sample. */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, uint32_t);
	__type(value, struct strobelight_bpf_sample);
} sample_heap SEC(".maps");

/* Metavariable location configs; read_strobe_meta() looks them up by pid. */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, STROBE_MAX_CFGS);
	__type(key, pid_t);
	__type(value, struct strobemeta_cfg);
} strobemeta_cfgs SEC(".maps");
2019-06-15 22:12:24 +03:00
/*
 * Element type of the dynamic thread vector (dtv).
 * https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34
 */
typedef union dtv {
	size_t counter;
	struct {
		/* read by calc_location() as the start of a module's TLS block */
		void *val;
		bool is_static;
	} pointer;
} dtv_t;
/*
 * Partial definition of tcbhead_t: only the leading members up to and
 * including the dtv pointer, which is all calc_location() reads.
 * https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42
 */
struct tcbhead {
	void *tcb;
	dtv_t *dtv;
};
/*
 * TLS module/offset pair used in the shared library case.
 * On x86-64 it is mapped onto two entries in the GOT.
 * On aarch64 it is pointed to by the second GOT entry.
 */
struct tls_index {
	/* index into the dtv; calc_location() treats only values > 0 as valid */
	uint64_t module;
	/* offset of the variable within that module's TLS block */
	uint64_t offset;
};
2020-09-03 23:35:40 +03:00
# ifdef SUBPROGS
__noinline
# else
__always_inline
# endif
static void * calc_location ( struct strobe_value_loc * loc , void * tls_base )
2019-06-15 22:12:24 +03:00
{
/*
* tls_mode value is :
* - - 1 ( TLS_NOT_SET ) , if no metavar is present ;
* - 0 ( TLS_LOCAL_EXEC ) , if metavar uses Local Executable mode of TLS
* ( offset from fs : 0 for x86 - 64 or tpidr_el0 for aarch64 ) ;
* - 1 ( TLS_IMM_EXEC ) , if metavar uses Immediate Executable mode of TLS ;
* - 2 ( TLS_GENERAL_DYN ) , if metavar uses General Dynamic mode of TLS ;
* This schema allows to use something like :
* ( tls_mode + 1 ) * ( tls_base + offset )
* to get NULL for " no metavar " location , or correct pointer for local
* executable mode without doing extra ifs .
*/
if ( loc - > tls_mode < = TLS_LOCAL_EXEC ) {
/* static executable is simple, we just have offset from
* tls_base */
void * addr = tls_base + loc - > offset ;
/* multiply by (tls_mode + 1) to get NULL, if we have no
* metavar in this slot */
return ( void * ) ( ( loc - > tls_mode + 1 ) * ( int64_t ) addr ) ;
}
/*
* Other modes are more complicated , we need to jump through few hoops .
*
* For immediate executable mode ( currently supported only for aarch64 ) :
* - loc - > offset is pointing to a GOT entry containing fixed offset
* relative to tls_base ;
*
* For general dynamic mode :
* - loc - > offset is pointing to a beginning of double GOT entries ;
* - ( for aarch64 only ) second entry points to tls_index_t struct ;
* - ( for x86 - 64 only ) two GOT entries are already tls_index_t ;
* - tls_index_t - > module is used to find start of TLS section in
* which variable resides ;
* - tls_index_t - > offset provides offset within that TLS section ,
* pointing to value of variable .
*/
struct tls_index tls_index ;
dtv_t * dtv ;
void * tls_ptr ;
2019-11-02 02:18:02 +03:00
bpf_probe_read_user ( & tls_index , sizeof ( struct tls_index ) ,
( void * ) loc - > offset ) ;
2019-06-15 22:12:24 +03:00
/* valid module index is always positive */
if ( tls_index . module > 0 ) {
/* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
2019-11-02 02:18:02 +03:00
bpf_probe_read_user ( & dtv , sizeof ( dtv ) ,
& ( ( struct tcbhead * ) tls_base ) - > dtv ) ;
2019-06-15 22:12:24 +03:00
dtv + = tls_index . module ;
} else {
dtv = NULL ;
}
2019-11-02 02:18:02 +03:00
bpf_probe_read_user ( & tls_ptr , sizeof ( void * ) , dtv ) ;
2019-06-15 22:12:24 +03:00
/* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
return tls_ptr & & tls_ptr ! = ( void * ) - 1
? tls_ptr + tls_index . offset
: NULL ;
}
2020-09-03 23:35:40 +03:00
# ifdef SUBPROGS
__noinline
# else
__always_inline
# endif
static void read_int_var ( struct strobemeta_cfg * cfg ,
size_t idx , void * tls_base ,
struct strobe_value_generic * value ,
struct strobemeta_payload * data )
2019-06-15 22:12:24 +03:00
{
void * location = calc_location ( & cfg - > int_locs [ idx ] , tls_base ) ;
if ( ! location )
return ;
2019-11-02 02:18:02 +03:00
bpf_probe_read_user ( value , sizeof ( struct strobe_value_generic ) , location ) ;
2019-06-15 22:12:24 +03:00
data - > int_vals [ idx ] = value - > val ;
if ( value - > header . len )
data - > int_vals_set_mask | = ( 1 < < idx ) ;
}
2019-07-02 21:26:51 +03:00
static __always_inline uint64_t read_str_var ( struct strobemeta_cfg * cfg ,
size_t idx , void * tls_base ,
struct strobe_value_generic * value ,
struct strobemeta_payload * data ,
void * payload )
2019-06-15 22:12:24 +03:00
{
void * location ;
uint32_t len ;
data - > str_lens [ idx ] = 0 ;
location = calc_location ( & cfg - > str_locs [ idx ] , tls_base ) ;
if ( ! location )
return 0 ;
2019-11-02 02:18:02 +03:00
bpf_probe_read_user ( value , sizeof ( struct strobe_value_generic ) , location ) ;
len = bpf_probe_read_user_str ( payload , STROBE_MAX_STR_LEN , value - > ptr ) ;
2019-06-15 22:12:24 +03:00
/*
2019-11-02 02:18:02 +03:00
* if bpf_probe_read_user_str returns error ( < 0 ) , due to casting to
2019-06-15 22:12:24 +03:00
* unsinged int , it will become big number , so next check is
* sufficient to check for errors AND prove to BPF verifier , that
2019-11-02 02:18:02 +03:00
* bpf_probe_read_user_str won ' t return anything bigger than
2019-06-15 22:12:24 +03:00
* STROBE_MAX_STR_LEN
*/
if ( len > STROBE_MAX_STR_LEN )
return 0 ;
data - > str_lens [ idx ] = len ;
return len ;
}
2019-07-02 21:26:51 +03:00
static __always_inline void * read_map_var ( struct strobemeta_cfg * cfg ,
size_t idx , void * tls_base ,
struct strobe_value_generic * value ,
struct strobemeta_payload * data ,
void * payload )
2019-06-15 22:12:24 +03:00
{
struct strobe_map_descr * descr = & data - > map_descrs [ idx ] ;
struct strobe_map_raw map ;
void * location ;
uint32_t len ;
int i ;
descr - > tag_len = 0 ; /* presume no tag is set */
descr - > cnt = - 1 ; /* presume no value is set */
location = calc_location ( & cfg - > map_locs [ idx ] , tls_base ) ;
if ( ! location )
return payload ;
2019-11-02 02:18:02 +03:00
bpf_probe_read_user ( value , sizeof ( struct strobe_value_generic ) , location ) ;
if ( bpf_probe_read_user ( & map , sizeof ( struct strobe_map_raw ) , value - > ptr ) )
2019-06-15 22:12:24 +03:00
return payload ;
descr - > id = map . id ;
descr - > cnt = map . cnt ;
if ( cfg - > req_meta_idx = = idx ) {
data - > req_id = map . id ;
data - > req_meta_valid = 1 ;
}
2019-11-02 02:18:02 +03:00
len = bpf_probe_read_user_str ( payload , STROBE_MAX_STR_LEN , map . tag ) ;
2019-06-15 22:12:24 +03:00
if ( len < = STROBE_MAX_STR_LEN ) {
descr - > tag_len = len ;
payload + = len ;
}
# ifdef NO_UNROLL
# pragma clang loop unroll(disable)
# else
# pragma unroll
# endif
2019-09-25 21:52:05 +03:00
for ( int i = 0 ; i < STROBE_MAX_MAP_ENTRIES ; + + i ) {
if ( i > = map . cnt )
break ;
2019-06-15 22:12:24 +03:00
descr - > key_lens [ i ] = 0 ;
2019-11-02 02:18:02 +03:00
len = bpf_probe_read_user_str ( payload , STROBE_MAX_STR_LEN ,
map . entries [ i ] . key ) ;
2019-06-15 22:12:24 +03:00
if ( len < = STROBE_MAX_STR_LEN ) {
descr - > key_lens [ i ] = len ;
payload + = len ;
}
descr - > val_lens [ i ] = 0 ;
2019-11-02 02:18:02 +03:00
len = bpf_probe_read_user_str ( payload , STROBE_MAX_STR_LEN ,
map . entries [ i ] . val ) ;
2019-06-15 22:12:24 +03:00
if ( len < = STROBE_MAX_STR_LEN ) {
descr - > val_lens [ i ] = len ;
payload + = len ;
}
}
return payload ;
}
/*
* read_strobe_meta returns NULL , if no metadata was read ; otherwise returns
* pointer to * right after * payload ends
*/
2020-09-03 23:35:40 +03:00
# ifdef SUBPROGS
__noinline
# else
__always_inline
# endif
static void * read_strobe_meta ( struct task_struct * task ,
struct strobemeta_payload * data )
2019-07-02 21:26:51 +03:00
{
2019-06-15 22:12:24 +03:00
pid_t pid = bpf_get_current_pid_tgid ( ) > > 32 ;
struct strobe_value_generic value = { 0 } ;
struct strobemeta_cfg * cfg ;
void * tls_base , * payload ;
cfg = bpf_map_lookup_elem ( & strobemeta_cfgs , & pid ) ;
if ( ! cfg )
return NULL ;
data - > int_vals_set_mask = 0 ;
data - > req_meta_valid = 0 ;
payload = data - > payload ;
/*
* we don ' t have struct task_struct definition , it should be :
* tls_base = ( void * ) task - > thread . fsbase ;
*/
tls_base = ( void * ) task ;
# ifdef NO_UNROLL
# pragma clang loop unroll(disable)
# else
# pragma unroll
# endif
for ( int i = 0 ; i < STROBE_MAX_INTS ; + + i ) {
read_int_var ( cfg , i , tls_base , & value , data ) ;
}
# ifdef NO_UNROLL
# pragma clang loop unroll(disable)
# else
# pragma unroll
# endif
for ( int i = 0 ; i < STROBE_MAX_STRS ; + + i ) {
payload + = read_str_var ( cfg , i , tls_base , & value , data , payload ) ;
}
# ifdef NO_UNROLL
# pragma clang loop unroll(disable)
# else
# pragma unroll
# endif
for ( int i = 0 ; i < STROBE_MAX_MAPS ; + + i ) {
payload = read_map_var ( cfg , i , tls_base , & value , data , payload ) ;
}
/*
* return pointer right after end of payload , so it ' s possible to
* calculate exact amount of useful data that needs to be sent
*/
return payload ;
}
/*
 * on_event - build one sample for the current task and emit it.
 *
 * The sample is assembled in the per-CPU sample_heap slot: pid, comm,
 * timestamp, metavariables (via read_strobe_meta()) and kernel/user stack
 * ids. Only the used prefix of the sample is written to the samples map.
 * Always returns 0.
 *
 * The section name must not contain padding spaces, otherwise the program
 * type cannot be derived from it.
 */
SEC("raw_tracepoint/kfree_skb")
int on_event(struct pt_regs *ctx)
{
	pid_t pid = bpf_get_current_pid_tgid() >> 32;
	struct strobelight_bpf_sample *sample;
	struct task_struct *task;
	uint32_t zero = 0;
	uint64_t ktime_ns;
	uint64_t sample_size;
	void *sample_end;

	sample = bpf_map_lookup_elem(&sample_heap, &zero);
	if (!sample)
		return 0; /* this will never happen */

	sample->pid = pid;
	bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
	ktime_ns = bpf_ktime_get_ns();
	sample->ktime = ktime_ns;

	task = (struct task_struct *)bpf_get_current_task();
	sample_end = read_strobe_meta(task, &sample->metadata);
	sample->has_meta = sample_end != NULL;
	/* without metadata the sample ends where the metadata would begin */
	if (!sample_end)
		sample_end = &sample->metadata;

	/* alternate between the two stack tables based on the timestamp bit */
	if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
	} else {
		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
	}

	sample_size = sample_end - (void *)sample;
	/*
	 * Should always be true. The extra byte in "1 + sample_size" is
	 * covered by dummy_safeguard and keeps the size provably non-zero.
	 */
	if (sample_size < sizeof(struct strobelight_bpf_sample))
		bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
	return 0;
}
/*
 * License declaration read by the loader/kernel. Both the section name and
 * the license text are matched exactly, so neither may carry padding spaces.
 */
char _license[] SEC("license") = "GPL";