diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4175cf1f4665..8357be349133 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1675,6 +1675,8 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
+extern const struct bpf_func_proto bpf_get_stack_proto_pe;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d07a6e973a7d..0a355b005bf4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -533,7 +533,8 @@ struct bpf_prog {
 			is_func:1, /* program is a bpf function */
 			kprobe_override:1, /* Do we override a kprobe? */
 			has_callchain_buf:1, /* callchain buffer allocated? */
-			enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
+			enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
+			call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
 	enum bpf_prog_type	type; /* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len; /* Number of filter blocks */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 48d8e739975f..5beb2f8c23da 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -387,11 +388,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 #endif
 }
 
-BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
-	   u64, flags)
+static long __bpf_get_stackid(struct bpf_map *map,
+			      struct perf_callchain_entry *trace, u64 flags)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / stack_map_data_size(map);
 	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
@@ -399,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
-	bool kernel = !user;
 	u64 *ips;
 	bool hash_matches;
 
-	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
-			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
-		return -EINVAL;
-
-	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
-
-	if (unlikely(!trace))
-		/* couldn't fetch the stack trace */
-		return -EFAULT;
-
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
 	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
@@ -478,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	return id;
 }
 
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+	   u64, flags)
+{
+	u32 max_depth = map->value_size / stack_map_data_size(map);
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+
+	if (unlikely(!trace))
+		/* couldn't fetch the stack trace */
+		return -EFAULT;
+
+	return __bpf_get_stackid(map, trace, flags);
+}
+
 const struct bpf_func_proto bpf_get_stackid_proto = {
 	.func = bpf_get_stackid,
 	.gpl_only = true,
@@ -487,7 +499,77 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type = ARG_ANYTHING,
 };
 
+static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
+{
+	__u64 nr_kernel = 0;
+
+	while (nr_kernel < trace->nr) {
+		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
+			break;
+		nr_kernel++;
+	}
+	return nr_kernel;
+}
+
+BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
+	   struct bpf_map *, map, u64, flags)
+{
+	struct perf_event *event = ctx->event;
+	struct perf_callchain_entry *trace;
+	bool kernel, user;
+	__u64 nr_kernel;
+	int ret;
+
+	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
+	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+		return bpf_get_stackid((unsigned long)(ctx->regs),
+				       (unsigned long) map, flags, 0, 0);
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	user = flags & BPF_F_USER_STACK;
+	kernel = !user;
+
+	trace = ctx->data->callchain;
+	if (unlikely(!trace))
+		return -EFAULT;
+
+	nr_kernel = count_kernel_ip(trace);
+
+	if (kernel) {
+		__u64 nr = trace->nr;
+
+		trace->nr = nr_kernel;
+		ret = __bpf_get_stackid(map, trace, flags);
+
+		/* restore nr */
+		trace->nr = nr;
+	} else { /* user */
+		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+		skip += nr_kernel;
+		if (skip > BPF_F_SKIP_FIELD_MASK)
+			return -EFAULT;
+
+		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+		ret = __bpf_get_stackid(map, trace, flags);
+	}
+	return ret;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto_pe = {
+	.func = bpf_get_stackid_pe,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_CONST_MAP_PTR,
+	.arg3_type = ARG_ANYTHING,
+};
+
 static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+			    struct perf_callchain_entry *trace_in,
 			    void *buf, u32 size, u64 flags)
 {
 	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
@@ -520,7 +602,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	else
 		init_nr = sysctl_perf_event_max_stack - num_elem;
 
-	if (kernel && task)
+	if (trace_in)
+		trace = trace_in;
+	else if (kernel && task)
 		trace = get_callchain_entry_for_task(task, init_nr);
 	else
 		trace = get_perf_callchain(regs, init_nr, kernel, user,
@@ -556,7 +640,7 @@ clear:
 BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
 	   u64, flags)
 {
-	return __bpf_get_stack(regs, NULL, buf, size, flags);
+	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
 }
 
 const struct bpf_func_proto bpf_get_stack_proto = {
@@ -574,7 +658,7 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
 {
 	struct pt_regs *regs = task_pt_regs(task);
 
-	return __bpf_get_stack(regs, task, buf, size, flags);
+	return __bpf_get_stack(regs, task, NULL, buf, size, flags);
 }
 
 BTF_ID_LIST(bpf_get_task_stack_btf_ids)
@@ -591,6 +675,70 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
 	.btf_id = bpf_get_task_stack_btf_ids,
 };
 
+BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct perf_event *event = ctx->event;
+	struct perf_callchain_entry *trace;
+	bool kernel, user;
+	int err = -EINVAL;
+	__u64 nr_kernel;
+
+	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+		return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags);
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		goto clear;
+
+	user = flags & BPF_F_USER_STACK;
+	kernel = !user;
+
+	err = -EFAULT;
+	trace = ctx->data->callchain;
+	if (unlikely(!trace))
+		goto clear;
+
+	nr_kernel = count_kernel_ip(trace);
+
+	if (kernel) {
+		__u64 nr = trace->nr;
+
+		trace->nr = nr_kernel;
+		err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
+				      size, flags);
+
+		/* restore nr */
+		trace->nr = nr;
+	} else { /* user */
+		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+		skip += nr_kernel;
+		if (skip > BPF_F_SKIP_FIELD_MASK)
+			goto clear;
+
+		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+		err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
+				      size, flags);
+	}
+	return err;
+
+clear:
+	memset(buf, 0, size);
+	return err;
+
+}
+
+const struct bpf_func_proto bpf_get_stack_proto_pe = {
+	.func = bpf_get_stack_pe,
+	.gpl_only = true,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type = ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d6979db48d8..cd14e70f2d07 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4962,6 +4962,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		env->prog->has_callchain_buf = true;
 	}
 
+	if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
+		env->prog->call_get_stack = true;
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..ddcfd2fb5cc5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9544,6 +9544,24 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
+	if (event->attr.precise_ip &&
+	    prog->call_get_stack &&
+	    (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
+	     event->attr.exclude_callchain_kernel ||
+	     event->attr.exclude_callchain_user)) {
+		/*
+		 * On perf_event with precise_ip, calling bpf_get_stack()
+		 * may trigger unwinder warnings and occasional crashes.
+		 * bpf_get_[stack|stackid] works around this issue by using
+		 * the callchain attached to perf_sample_data. If the
+		 * perf_event does not have a full (kernel and user) callchain
+		 * attached to perf_sample_data, do not allow attaching a BPF
+		 * program that calls bpf_get_[stack|stackid].
+		 */
+		bpf_prog_put(prog);
+		return -EPROTO;
+	}
+
 	event->prog = prog;
 	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
 	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3cc0dcb60ca2..cb91ef902cc4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1411,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
-		return &bpf_get_stackid_proto_tp;
+		return &bpf_get_stackid_proto_pe;
 	case BPF_FUNC_get_stack:
-		return &bpf_get_stack_proto_tp;
+		return &bpf_get_stack_proto_pe;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	case BPF_FUNC_read_branch_records:
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a05aa7e2bab6..e51479d60285 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7833,6 +7833,9 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog,
 		pr_warn("program '%s': failed to attach to pfd %d: %s\n",
 			bpf_program__title(prog, false), pfd,
 			libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
+		if (err == -EPROTO)
+			pr_warn("program '%s': try adding PERF_SAMPLE_CALLCHAIN to, or removing exclude_callchain_[kernel|user] from, pfd %d\n",
+				bpf_program__title(prog, false), pfd);
 		return ERR_PTR(err);
 	}
 	if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c
new file mode 100644
index 000000000000..d884b2ed5bc5
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#include
+#include "test_stacktrace_build_id.skel.h"
+
+void test_get_stackid_cannot_attach(void)
+{
+	struct perf_event_attr attr = {
+		/* .type = PERF_TYPE_SOFTWARE, */
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.precise_ip = 1,
+		.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK,
+		.branch_sample_type = PERF_SAMPLE_BRANCH_USER |
+			PERF_SAMPLE_BRANCH_NO_FLAGS |
+			PERF_SAMPLE_BRANCH_NO_CYCLES |
+			PERF_SAMPLE_BRANCH_CALL_STACK,
+		.sample_period = 5000,
+		.size = sizeof(struct perf_event_attr),
+	};
+	struct test_stacktrace_build_id *skel;
+	__u32 duration = 0;
+	int pmu_fd, err;
+
+	skel = test_stacktrace_build_id__open();
+	if (CHECK(!skel, "skel_open", "skeleton open failed\n"))
+		return;
+
+	/* override program type */
+	bpf_program__set_perf_event(skel->progs.oncpu);
+
+	err = test_stacktrace_build_id__load(skel);
+	if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err))
+		goto cleanup;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (pmu_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) {
+		printf("%s:SKIP:cannot open PERF_COUNT_HW_CPU_CYCLES with precise_ip > 0\n",
+		       __func__);
+		test__skip();
+		goto cleanup;
+	}
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+		  pmu_fd, errno))
+		goto cleanup;
+
+	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
+							   pmu_fd);
+	CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_no_callchain",
+	      "should have failed\n");
+	close(pmu_fd);
+
+	/* add PERF_SAMPLE_CALLCHAIN, attach should succeed */
+	attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+		  pmu_fd, errno))
+		goto cleanup;
+
+	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
+							   pmu_fd);
+	CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event_callchain",
+	      "err: %ld\n", PTR_ERR(skel->links.oncpu));
+	close(pmu_fd);
+
+	/* add exclude_callchain_kernel, attach should fail */
+	attr.exclude_callchain_kernel = 1;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+		  pmu_fd, errno))
+		goto cleanup;
+
+	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
+							   pmu_fd);
+	CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_exclude_callchain_kernel",
+	      "should have failed\n");
+	close(pmu_fd);
+
+cleanup:
+	test_stacktrace_build_id__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c
new file mode 100644
index 000000000000..72c3690844fb
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include "perf_event_stackmap.skel.h"
+
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+
+noinline int func_1(void)
+{
+	static int val = 1;
+
+	val += 1;
+
+	usleep(100);
+	return val;
+}
+
+noinline int func_2(void)
+{
+	return func_1();
+}
+
+noinline int func_3(void)
+{
+	return func_2();
+}
+
+noinline int func_4(void)
+{
+	return func_3();
+}
+
+noinline int func_5(void)
+{
+	return func_4();
+}
+
+noinline int func_6(void)
+{
+	int i, val = 1;
+
+	for (i = 0; i < 100; i++)
+		val += func_5();
+
+	return val;
+}
+
+void test_perf_event_stackmap(void)
+{
+	struct perf_event_attr attr = {
+		/* .type = PERF_TYPE_SOFTWARE, */
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.precise_ip = 2,
+		.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK |
+			PERF_SAMPLE_CALLCHAIN,
+		.branch_sample_type = PERF_SAMPLE_BRANCH_USER |
+			PERF_SAMPLE_BRANCH_NO_FLAGS |
+			PERF_SAMPLE_BRANCH_NO_CYCLES |
+			PERF_SAMPLE_BRANCH_CALL_STACK,
+		.sample_period = 5000,
+		.size = sizeof(struct perf_event_attr),
+	};
+	struct perf_event_stackmap *skel;
+	__u32 duration = 0;
+	cpu_set_t cpu_set;
+	int pmu_fd, err;
+
+	skel = perf_event_stackmap__open();
+
+	if (CHECK(!skel, "skel_open", "skeleton open failed\n"))
+		return;
+
+	err = perf_event_stackmap__load(skel);
+	if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err))
+		goto cleanup;
+
+	CPU_ZERO(&cpu_set);
+	CPU_SET(0, &cpu_set);
+	err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+	if (CHECK(err, "set_affinity", "err %d, errno %d\n", err, errno))
+		goto cleanup;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (pmu_fd < 0) {
+		printf("%s:SKIP:cpu doesn't support the event\n", __func__);
+		test__skip();
+		goto cleanup;
+	}
+
+	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
+							   pmu_fd);
+	if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event",
+		  "err %ld\n", PTR_ERR(skel->links.oncpu))) {
+		close(pmu_fd);
+		goto cleanup;
+	}
+
+	/* create kernel and user stack traces for testing */
+	func_6();
+
+	CHECK(skel->data->stackid_kernel != 2, "get_stackid_kernel", "failed\n");
+	CHECK(skel->data->stackid_user != 2, "get_stackid_user", "failed\n");
+	CHECK(skel->data->stack_kernel != 2, "get_stack_kernel", "failed\n");
+	CHECK(skel->data->stack_user != 2, "get_stack_user", "failed\n");
+
+cleanup:
+	perf_event_stackmap__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/perf_event_stackmap.c b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c
new file mode 100644
index 000000000000..25467d13c356
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#include "vmlinux.h"
+#include
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH 127
+#endif
+
+typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH];
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(max_entries, 16384);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(stack_trace_t));
+} stackmap SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, stack_trace_t);
+} stackdata_map SEC(".maps");
+
+long stackid_kernel = 1;
+long stackid_user = 1;
+long stack_kernel = 1;
+long stack_user = 1;
+
+SEC("perf_event")
+int oncpu(void *ctx)
+{
+	stack_trace_t *trace;
+	__u32 key = 0;
+	long val;
+
+	val = bpf_get_stackid(ctx, &stackmap, 0);
+	if (val > 0)
+		stackid_kernel = 2;
+	val = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
+	if (val > 0)
+		stackid_user = 2;
+
+	trace = bpf_map_lookup_elem(&stackdata_map, &key);
+	if (!trace)
+		return 0;
+
+	val = bpf_get_stack(ctx, trace, sizeof(stack_trace_t), 0);
+	if (val > 0)
+		stack_kernel = 2;
+
+	val = bpf_get_stack(ctx, trace, sizeof(stack_trace_t), BPF_F_USER_STACK);
+	if (val > 0)
+		stack_user = 2;
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";