From a1f157c7a3bb342b972c2830a0ad53f627000a04 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 6 Sep 2023 17:18:37 +0800 Subject: [PATCH 01/34] tracing: Expand all ring buffers individually The ring buffer of global_trace is set to the minimum size in order to save memory on boot up and then it will be expand when some trace feature enabled. However currently operations under an instance can also cause global_trace ring buffer being expanded, and the expanded memory would be wasted if global_trace then not being used. See following case, we enable 'sched_switch' event in instance 'A', then ring buffer of global_trace is unexpectedly expanded to be 1410KB, also the '(expanded: 1408)' from 'buffer_size_kb' of instance is confusing. # cd /sys/kernel/tracing # mkdir instances/A # cat buffer_size_kb 7 (expanded: 1408) # cat instances/A/buffer_size_kb 1410 (expanded: 1408) # echo sched:sched_switch > instances/A/set_event # cat buffer_size_kb 1410 # cat instances/A/buffer_size_kb 1410 To fix it, we can: - Make 'ring_buffer_expanded' as a member of 'struct trace_array'; - Make 'ring_buffer_expanded' of instance is defaultly true, global_trace is defaultly false; - In order not to expose 'global_trace' outside of file 'kernel/trace/trace.c', introduce trace_set_ring_buffer_expanded() to set 'ring_buffer_expanded' as 'true'; - Pass the expected trace_array to tracing_update_buffers(). Link: https://lore.kernel.org/linux-trace-kernel/20230906091837.3998020-1-zhengyejian1@huawei.com Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 47 ++++++++++++++++++++----------------- kernel/trace/trace.h | 9 +++++-- kernel/trace/trace_events.c | 22 +++++++++-------- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abaaf516fcae..dd6395692ff9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -54,12 +54,6 @@ #include "trace.h" #include "trace_output.h" -/* - * On boot up, the ring buffer is set to the minimum size, so that - * we do not waste memory on systems that are not using tracing. - */ -bool ring_buffer_expanded; - #ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. @@ -202,7 +196,7 @@ static int __init set_cmdline_ftrace(char *str) strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); return 1; } __setup("ftrace=", set_cmdline_ftrace); @@ -247,7 +241,7 @@ static int __init boot_alloc_snapshot(char *str) } else { allocate_snapshot = true; /* We also need the main ring buffer expanded */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); } return 1; } @@ -490,6 +484,13 @@ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; +void trace_set_ring_buffer_expanded(struct trace_array *tr) +{ + if (!tr) + tr = &global_trace; + tr->ring_buffer_expanded = true; +} + LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) @@ -2012,7 +2013,7 @@ static int run_tracer_selftest(struct tracer *type) #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; @@ -2038,7 +2039,7 @@ static int run_tracer_selftest(struct tracer *type) tr->allocated_snapshot = false; /* Shrink the max buffer again */ - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } @@ -3403,7 +3404,7 @@ void trace_printk_init_buffers(void) pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ - tracing_update_buffers(); + tracing_update_buffers(&global_trace); buffers_allocated = 1; @@ -6374,7 +6375,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, * we use the size that was given, and we can forget about * expanding it later. */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(tr); /* May be called before buffers are initialized */ if (!tr->array_buffer.buffer) @@ -6452,6 +6453,7 @@ out: /** * tracing_update_buffers - used by tracing facility to expand ring buffers + * @tr: The tracing instance * * To save on memory when the tracing is never used on a system with it * configured in. The ring buffers are set to a minimum size. But once @@ -6460,13 +6462,13 @@ out: * * This function is to be called when a tracer is about to be used. */ -int tracing_update_buffers(void) +int tracing_update_buffers(struct trace_array *tr) { int ret = 0; mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, + if (!tr->ring_buffer_expanded) + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); @@ -6520,7 +6522,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) { + if (!tr->ring_buffer_expanded) { ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) @@ -7192,7 +7194,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } if (buf_size_same) { - if (!ring_buffer_expanded) + if (!tr->ring_buffer_expanded) r = sprintf(buf, "%lu (expanded: %lu)\n", size >> 10, trace_buf_size >> 10); @@ -7249,10 +7251,10 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10; - if (!ring_buffer_expanded) + if (!tr->ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) r = sprintf(buf, "%lu\n", size); else r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); @@ -7646,7 +7648,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, unsigned long val; int ret; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -9550,6 +9552,9 @@ static struct trace_array *trace_array_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; + /* The ring buffer is defaultly expanded */ + trace_set_ring_buffer_expanded(tr); + if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; @@ -10444,7 +10449,7 @@ __init static int tracer_alloc_buffers(void) trace_printk_init_buffers(); /* To save memory, keep the ring buffer size to its minimum */ - if (ring_buffer_expanded) + if (global_trace.ring_buffer_expanded) ring_buf_size = trace_buf_size; else ring_buf_size = 1; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 77debe53f07c..e92cb9c1292f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -410,6 +410,11 @@ struct trace_array { struct cond_snapshot *cond_snapshot; #endif struct trace_func_repeats __percpu *last_func_repeats; + /* + * On boot up, the ring buffer is set to the minimum size, so that + * we do not waste memory on systems that are not using tracing. + */ + bool ring_buffer_expanded; }; enum { @@ -761,7 +766,7 @@ extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -extern bool ring_buffer_expanded; +extern void trace_set_ring_buffer_expanded(struct trace_array *tr); extern bool tracing_selftest_disabled; #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1305,7 +1310,7 @@ static inline void trace_branch_disable(void) #endif /* CONFIG_BRANCH_TRACER */ /* set ring buffers to default size if not already done so */ -int tracing_update_buffers(void); +int tracing_update_buffers(struct trace_array *tr); union trace_synth_field { u8 as_u8; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f49d6ddb6342..099b9b6bbdc4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1166,7 +1166,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (!cnt) return 0; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -1397,18 +1397,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - ret = tracing_update_buffers(); - if (ret < 0) - return ret; - switch (val) { case 0: case 1: ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); - if (likely(file)) + if (likely(file)) { + ret = tracing_update_buffers(file->tr); + if (ret < 0) { + mutex_unlock(&event_mutex); + return ret; + } ret = ftrace_event_enable_disable(file, val); + } mutex_unlock(&event_mutex); break; @@ -1482,7 +1484,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(dir->tr); if (ret < 0) return ret; @@ -1956,7 +1958,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (!cnt) return 0; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -2824,7 +2826,7 @@ static __init int setup_trace_triggers(char *str) int i; strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event triggers"); buf = bootup_trigger_buf; @@ -3614,7 +3616,7 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event tracing"); return 1; From bdf4fb628093b4ab2f4eb312256233a2ae0594b7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Sep 2023 18:34:02 +0200 Subject: [PATCH 02/34] ring_buffer: Use try_cmpxchg instead of cmpxchg in rb_insert_pages Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in rb_insert_pages. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). No functional change intended. Link: https://lore.kernel.org/linux-trace-kernel/20230914163420.12923-1-ubizjak@gmail.com Cc: Steven Rostedt Cc: Masami Hiramatsu Signed-off-by: Uros Bizjak Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 515cafdb18d9..43cc47d7faaf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2056,7 +2056,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) retries = 10; success = false; while (retries--) { - struct list_head *head_page, *prev_page, *r; + struct list_head *head_page, *prev_page; struct list_head *last_page, *first_page; struct list_head *head_page_with_bit; struct buffer_page *hpage = rb_set_head_page(cpu_buffer); @@ -2075,9 +2075,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) last_page->next = head_page_with_bit; first_page->prev = prev_page; - r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); - - if (r == head_page_with_bit) { + /* caution: head_page_with_bit gets updated on cmpxchg failure */ + if (try_cmpxchg(&prev_page->next, + &head_page_with_bit, first_page)) { /* * yay, we replaced the page pointer to our new list, * now, we just have to update to head page's prev From 5dbd04eddb2c0841d1b3930e0a9944a2343c9cac Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 12 Sep 2023 18:07:02 +0000 Subject: [PATCH 03/34] tracing/user_events: Allow events to persist for perfmon_capable users There are several scenarios that have come up where having a user_event persist even if the process that registered it exits. The main one is having a daemon create events on bootup that shouldn't get deleted if the daemon has to exit or reload. Another is within OpenTelemetry exporters, they wish to potentially check if a user_event exists on the system to determine if exporting the data out should occur. The user_event in this case must exist even in the absence of the owning process running (such as the above daemon case). Expose the previously internal flag USER_EVENT_REG_PERSIST to user processes. Upon register or delete of events with this flag, ensure the user is perfmon_capable to prevent random user processes with access to tracefs from creating events that persist after exit. Link: https://lkml.kernel.org/r/20230912180704.1284-2-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/uapi/linux/user_events.h | 11 +++++++++- kernel/trace/trace_events_user.c | 36 +++++++++++++++++++------------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index 2984aae4a2b4..f74f3aedd49c 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -17,6 +17,15 @@ /* Create dynamic location entry within a 32-bit value */ #define DYN_LOC(offset, size) ((size) << 16 | (offset)) +/* List of supported registration flags */ +enum user_reg_flag { + /* Event will not delete upon last reference closing */ + USER_EVENT_REG_PERSIST = 1U << 0, + + /* This value or above is currently non-ABI */ + USER_EVENT_REG_MAX = 1U << 1, +}; + /* * Describes an event registration and stores the results of the registration. * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum @@ -33,7 +42,7 @@ struct user_reg { /* Input: Enable size in bytes at address */ __u8 enable_size; - /* Input: Flags for future use, set to 0 */ + /* Input: Flags to use, if any */ __u16 flags; /* Input: Address to update when enabled */ diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b87f41187c6a..9365ce407426 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -49,18 +49,6 @@ #define EVENT_STATUS_PERF BIT(1) #define EVENT_STATUS_OTHER BIT(7) -/* - * User register flags are not allowed yet, keep them here until we are - * ready to expose them out to the user ABI. - */ -enum user_reg_flag { - /* Event will not delete upon last reference closing */ - USER_EVENT_REG_PERSIST = 1U << 0, - - /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, -}; - /* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. @@ -220,6 +208,17 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static bool user_event_capable(u16 reg_flags) +{ + /* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */ + if (reg_flags & USER_EVENT_REG_PERSIST) { + if (!perfmon_capable()) + return false; + } + + return true; +} + static struct user_event *user_event_get(struct user_event *user) { refcount_inc(&user->refcnt); @@ -1811,6 +1810,9 @@ static int user_event_free(struct dyn_event *ev) if (!user_event_last_ref(user)) return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; + return destroy_user_event(user); } @@ -1926,10 +1928,13 @@ static int user_event_parse(struct user_event_group *group, char *name, int argc = 0; char **argv; - /* User register flags are not ready yet */ - if (reg_flags != 0 || flags != NULL) + /* Currently don't support any text based flags */ + if (flags != NULL) return -EINVAL; + if (!user_event_capable(reg_flags)) + return -EPERM; + /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); user = find_user_event(group, name, &key); @@ -2062,6 +2067,9 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user_event_last_ref(user)) return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; + return destroy_user_event(user); } From cf74c59c4fc10e45c6f415ee6624b07ed747f963 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 12 Sep 2023 18:07:03 +0000 Subject: [PATCH 04/34] selftests/user_events: Test persist flag cases Now that we have exposed USER_EVENT_REG_PERSIST events can persist both via the ABI and in the /sys/kernel/tracing/dynamic_events file. Ensure both the ABI and DYN cases work by calling both during the parse tests. Add new flags test that ensures only USER_EVENT_REG_PERSIST is honored and any other flag is invalid. Link: https://lkml.kernel.org/r/20230912180704.1284-3-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- .../testing/selftests/user_events/abi_test.c | 55 ++++++++++++++++++- .../testing/selftests/user_events/dyn_test.c | 54 +++++++++++++++++- 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index 8202f1327c39..3d539e556dcd 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -24,6 +24,18 @@ const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable"; +static bool event_exists(void) +{ + int fd = open(enable_file, O_RDWR); + + if (fd < 0) + return false; + + close(fd); + + return true; +} + static int change_event(bool enable) { int fd = open(enable_file, O_RDWR); @@ -47,7 +59,22 @@ static int change_event(bool enable) return ret; } -static int reg_enable(long *enable, int size, int bit) +static int event_delete(void) +{ + int fd = open(data_file, O_RDWR); + int ret; + + if (fd < 0) + return -1; + + ret = ioctl(fd, DIAG_IOCSDEL, "__abi_event"); + + close(fd); + + return ret; +} + +static int reg_enable_flags(long *enable, int size, int bit, int flags) { struct user_reg reg = {0}; int fd = open(data_file, O_RDWR); @@ -58,6 +85,7 @@ static int reg_enable(long *enable, int size, int bit) reg.size = sizeof(reg); reg.name_args = (__u64)"__abi_event"; + reg.flags = flags; reg.enable_bit = bit; reg.enable_addr = (__u64)enable; reg.enable_size = size; @@ -69,6 +97,11 @@ static int reg_enable(long *enable, int size, int bit) return ret; } +static int reg_enable(long *enable, int size, int bit) +{ + return reg_enable_flags(enable, size, bit, 0); +} + static int reg_disable(long *enable, int bit) { struct user_unreg reg = {0}; @@ -126,6 +159,26 @@ TEST_F(user, enablement) { ASSERT_EQ(0, change_event(false)); } +TEST_F(user, flags) { + /* USER_EVENT_REG_PERSIST is allowed */ + ASSERT_EQ(0, reg_enable_flags(&self->check, sizeof(int), 0, + USER_EVENT_REG_PERSIST)); + ASSERT_EQ(0, reg_disable(&self->check, 0)); + + /* Ensure it exists after close and disable */ + ASSERT_TRUE(event_exists()); + + /* Ensure we can delete it */ + ASSERT_EQ(0, event_delete()); + + /* USER_EVENT_REG_MAX or above is not allowed */ + ASSERT_EQ(-1, reg_enable_flags(&self->check, sizeof(int), 0, + USER_EVENT_REG_MAX)); + + /* Ensure it does not exist after invalid flags */ + ASSERT_FALSE(event_exists()); +} + TEST_F(user, bit_sizes) { /* Allow 0-31 bits for 32-bit */ ASSERT_EQ(0, reg_enable(&self->check, sizeof(int), 0)); diff --git a/tools/testing/selftests/user_events/dyn_test.c b/tools/testing/selftests/user_events/dyn_test.c index a85980190bea..bdf9ab127488 100644 --- a/tools/testing/selftests/user_events/dyn_test.c +++ b/tools/testing/selftests/user_events/dyn_test.c @@ -17,9 +17,25 @@ #include "../kselftest_harness.h" #include "user_events_selftests.h" +const char *dyn_file = "/sys/kernel/tracing/dynamic_events"; const char *abi_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__test_event/enable"; +static int event_delete(void) +{ + int fd = open(abi_file, O_RDWR); + int ret; + + if (fd < 0) + return -1; + + ret = ioctl(fd, DIAG_IOCSDEL, "__test_event"); + + close(fd); + + return ret; +} + static bool wait_for_delete(void) { int i; @@ -64,7 +80,31 @@ static int unreg_event(int fd, int *check, int bit) return ioctl(fd, DIAG_IOCSUNREG, &unreg); } -static int parse(int *check, const char *value) +static int parse_dyn(const char *value) +{ + int fd = open(dyn_file, O_RDWR | O_APPEND); + int len = strlen(value); + int ret; + + if (fd == -1) + return -1; + + ret = write(fd, value, len); + + if (ret == len) + ret = 0; + else + ret = -1; + + close(fd); + + if (ret == 0) + event_delete(); + + return ret; +} + +static int parse_abi(int *check, const char *value) { int fd = open(abi_file, O_RDWR); int ret; @@ -90,6 +130,18 @@ static int parse(int *check, const char *value) return ret; } +static int parse(int *check, const char *value) +{ + int abi_ret = parse_abi(check, value); + int dyn_ret = parse_dyn(value); + + /* Ensure both ABI and DYN parse the same way */ + if (dyn_ret != abi_ret) + return -1; + + return dyn_ret; +} + static int check_match(int *check, const char *first, const char *second, bool *match) { int fd = open(abi_file, O_RDWR); From 2c6d0950f65e537ddec96548e2f73c00e758f87e Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 12 Sep 2023 18:07:04 +0000 Subject: [PATCH 05/34] tracing/user_events: Document persist event flags Users need to know how to make events persist now that we allow for that. We also now allow the dynamic_events file to create events by utilizing the persist flag during event register. Add back in to documentation how /sys/kernel/tracing/dynamic_events can be used to create persistent user_events. Add a section under registering for the currently supported flags (USER_EVENT_REG_PERSIST) and the required permissions. Add a note under deleting that deleting a persistent event also requires sufficient permission. Link: https://lkml.kernel.org/r/20230912180704.1284-4-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/user_events.rst | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index f9530d0ac5d3..d8f12442aaa6 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -14,6 +14,11 @@ Programs can view status of the events via /sys/kernel/tracing/user_events_status and can both register and write data out via /sys/kernel/tracing/user_events_data. +Programs can also use /sys/kernel/tracing/dynamic_events to register and +delete user based events via the u: prefix. The format of the command to +dynamic_events is the same as the ioctl with the u: prefix applied. This +requires CAP_PERFMON due to the event persisting, otherwise -EPERM is returned. + Typically programs will register a set of events that they wish to expose to tools that can read trace_events (such as ftrace and perf). The registration process tells the kernel which address and bit to reflect if any tool has @@ -45,7 +50,7 @@ This command takes a packed struct user_reg as an argument:: /* Input: Enable size in bytes at address */ __u8 enable_size; - /* Input: Flags for future use, set to 0 */ + /* Input: Flags to use, if any */ __u16 flags; /* Input: Address to update when enabled */ @@ -69,7 +74,7 @@ The struct user_reg requires all the above inputs to be set appropriately. This must be 4 (32-bit) or 8 (64-bit). 64-bit values are only allowed to be used on 64-bit kernels, however, 32-bit can be used on all kernels. -+ flags: The flags to use, if any. For the initial version this must be 0. ++ flags: The flags to use, if any. Callers should first attempt to use flags and retry without flags to ensure support for lower versions of the kernel. If a flag is not supported -EINVAL is returned. @@ -80,6 +85,13 @@ The struct user_reg requires all the above inputs to be set appropriately. + name_args: The name and arguments to describe the event, see command format for details. +The following flags are currently supported. + ++ USER_EVENT_REG_PERSIST: The event will not delete upon the last reference + closing. Callers may use this if an event should exist even after the + process closes or unregisters the event. Requires CAP_PERFMON otherwise + -EPERM is returned. + Upon successful registration the following is set. + write_index: The index to use for this file descriptor that represents this @@ -141,7 +153,10 @@ event (in both user and kernel space). User programs should use a separate file to request deletes than the one used for registration due to this. **NOTE:** By default events will auto-delete when there are no references left -to the event. Flags in the future may change this logic. +to the event. If programs do not want auto-delete, they must use the +USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used +the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an +event that persists requires CAP_PERFMON, otherwise -EPERM is returned. Unregistering ------------- From 5790b1fb3d672d9a1fe3881a7181dfdbe741568f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 4 Oct 2023 16:50:07 -0400 Subject: [PATCH 06/34] eventfs: Remove eventfs_file and just use eventfs_inode Instead of having a descriptor for every file represented in the eventfs directory, only have the directory itself represented. Change the API to send in a list of entries that represent all the files in the directory (but not other directories). The entry list contains a name and a callback function that will be used to create the files when they are accessed. struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, const struct eventfs_entry *entries, int size, void *data); is used for the top level eventfs directory, and returns an eventfs_inode that will be used by: struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, const struct eventfs_entry *entries, int size, void *data); where both of the above take an array of struct eventfs_entry entries for every file that is in the directory. The entries are defined by: typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, const struct file_operations **fops); struct eventfs_entry { const char *name; eventfs_callback callback; }; Where the name is the name of the file and the callback gets called when the file is being created. The callback passes in the name (in case the same callback is used for multiple files), a pointer to the mode, data and fops. The data will be pointing to the data that was passed in eventfs_create_dir() or eventfs_create_events_dir() but may be overridden to point to something else, as it will be used to point to the inode->i_private that is created. The information passed back from the callback is used to create the dentry/inode. If the callback fills the data and the file should be created, it must return a positive number. On zero or negative, the file is ignored. This logic may also be used as a prototype to convert entire pseudo file systems into just-in-time allocation. The "show_events_dentry" file has been updated to show the directories, and any files they have. With just the eventfs_file allocations: Before after deltas for meminfo (in kB): MemFree: -14360 MemAvailable: -14260 Buffers: 40 Cached: 24 Active: 44 Inactive: 48 Inactive(anon): 28 Active(file): 44 Inactive(file): 20 Dirty: -4 AnonPages: 28 Mapped: 4 KReclaimable: 132 Slab: 1604 SReclaimable: 132 SUnreclaim: 1472 Committed_AS: 12 Before after deltas for slabinfo: : [ * = ] ext4_inode_cache 27 [* 1184 = 31968 ] extent_status 102 [* 40 = 4080 ] tracefs_inode_cache 144 [* 656 = 94464 ] buffer_head 39 [* 104 = 4056 ] shmem_inode_cache 49 [* 800 = 39200 ] filp -53 [* 256 = -13568 ] dentry 251 [* 192 = 48192 ] lsm_file_cache 277 [* 32 = 8864 ] vm_area_struct -14 [* 184 = -2576 ] trace_event_file 1748 [* 88 = 153824 ] kmalloc-1k 35 [* 1024 = 35840 ] kmalloc-256 49 [* 256 = 12544 ] kmalloc-192 -28 [* 192 = -5376 ] kmalloc-128 -30 [* 128 = -3840 ] kmalloc-96 10581 [* 96 = 1015776 ] kmalloc-64 3056 [* 64 = 195584 ] kmalloc-32 1291 [* 32 = 41312 ] kmalloc-16 2310 [* 16 = 36960 ] kmalloc-8 9216 [* 8 = 73728 ] Free memory dropped by 14,360 kB Available memory dropped by 14,260 kB Total slab additions in size: 1,771,032 bytes With this change: Before after deltas for meminfo (in kB): MemFree: -12084 MemAvailable: -11976 Buffers: 32 Cached: 32 Active: 72 Inactive: 168 Inactive(anon): 176 Active(file): 72 Inactive(file): -8 Dirty: 24 AnonPages: 196 Mapped: 8 KReclaimable: 148 Slab: 836 SReclaimable: 148 SUnreclaim: 688 Committed_AS: 324 Before after deltas for slabinfo: : [ * = ] tracefs_inode_cache 144 [* 656 = 94464 ] shmem_inode_cache -23 [* 800 = -18400 ] filp -92 [* 256 = -23552 ] dentry 179 [* 192 = 34368 ] lsm_file_cache -3 [* 32 = -96 ] vm_area_struct -13 [* 184 = -2392 ] trace_event_file 1748 [* 88 = 153824 ] kmalloc-1k -49 [* 1024 = -50176 ] kmalloc-256 -27 [* 256 = -6912 ] kmalloc-128 1864 [* 128 = 238592 ] kmalloc-64 4685 [* 64 = 299840 ] kmalloc-32 -72 [* 32 = -2304 ] kmalloc-16 256 [* 16 = 4096 ] total = 721352 Free memory dropped by 12,084 kB Available memory dropped by 11,976 kB Total slab additions in size: 721,352 bytes That's over 2 MB in savings per instance for free and available memory, and over 1 MB in savings per instance of slab memory. Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 859 ++++++++++++++++++----------------- fs/tracefs/inode.c | 2 +- fs/tracefs/internal.h | 37 +- include/linux/trace_events.h | 2 +- include/linux/tracefs.h | 29 +- kernel/trace/trace.c | 7 +- kernel/trace/trace.h | 4 +- kernel/trace/trace_events.c | 313 +++++++++---- 8 files changed, 711 insertions(+), 542 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 8c8d64e76103..eab18b157ef5 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -2,8 +2,9 @@ /* * event_inode.c - part of tracefs, a pseudo file system for activating tracing * - * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt (VMware) + * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt * Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher + * Copyright (C) 2023 Google, author: Steven Rostedt * * eventfs is used to dynamically create inodes and dentries based on the * meta data provided by the tracing system. @@ -23,46 +24,6 @@ #include #include "internal.h" -struct eventfs_inode { - struct list_head e_top_files; -}; - -/* - * struct eventfs_file - hold the properties of the eventfs files and - * directories. - * @name: the name of the file or directory to create - * @d_parent: holds parent's dentry - * @dentry: once accessed holds dentry - * @list: file or directory to be added to parent directory - * @ei: list of files and directories within directory - * @fop: file_operations for file or directory - * @iop: inode_operations for file or directory - * @data: something that the caller will want to get to later on - * @mode: the permission that the file or directory should have - */ -struct eventfs_file { - const char *name; - struct dentry *d_parent; - struct dentry *dentry; - struct list_head list; - struct eventfs_inode *ei; - const struct file_operations *fop; - const struct inode_operations *iop; - /* - * Union - used for deletion - * @del_list: list of eventfs_file to delete - * @rcu: eventfs_file to delete in RCU - * @is_freed: node is freed if one of the above is set - */ - union { - struct list_head del_list; - struct rcu_head rcu; - unsigned long is_freed; - }; - void *data; - umode_t mode; -}; - static DEFINE_MUTEX(eventfs_mutex); DEFINE_STATIC_SRCU(eventfs_srcu); @@ -93,16 +54,9 @@ static const struct file_operations eventfs_file_operations = { * @data: something that the caller will want to get to later on. * @fop: struct file_operations that should be used for this file. * - * This is the basic "create a file" function for tracefs. It allows for a - * wide range of flexibility in creating a file. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the tracefs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. - * - * If tracefs is not enabled in the kernel, the value -%ENODEV will be - * returned. + * This function creates a dentry that represents a file in the eventsfs_inode + * directory. The inode.i_private pointer will point to @data in the open() + * call. */ static struct dentry *create_file(const char *name, umode_t mode, struct dentry *parent, void *data, @@ -118,6 +72,7 @@ static struct dentry *create_file(const char *name, umode_t mode, if (WARN_ON_ONCE(!S_ISREG(mode))) return NULL; + WARN_ON_ONCE(!parent); dentry = eventfs_start_creating(name, parent); if (IS_ERR(dentry)) @@ -142,20 +97,11 @@ static struct dentry *create_file(const char *name, umode_t mode, * create_dir - create a dir in the tracefs filesystem * @name: the name of the file to create. * @parent: parent dentry for this file. - * @data: something that the caller will want to get to later on. * - * This is the basic "create a dir" function for eventfs. It allows for a - * wide range of flexibility in creating a dir. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the tracefs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. - * - * If tracefs is not enabled in the kernel, the value -%ENODEV will be - * returned. + * This function will create a dentry for a directory represented by + * a eventfs_inode. */ -static struct dentry *create_dir(const char *name, struct dentry *parent, void *data) +static struct dentry *create_dir(const char *name, struct dentry *parent) { struct tracefs_inode *ti; struct dentry *dentry; @@ -172,7 +118,6 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void * inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; - inode->i_private = data; ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; @@ -185,18 +130,18 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void * } /** - * eventfs_set_ef_status_free - set the ef->status to free + * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode * @ti: the tracefs_inode of the dentry - * @dentry: dentry who's status to be freed + * @dentry: dentry which has the reference to remove. * - * eventfs_set_ef_status_free will be called if no more - * references remain + * Remove the association between a dentry from an eventfs_inode. */ -void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) +void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) { struct tracefs_inode *ti_parent; + struct eventfs_inode *ei_child, *tmp; struct eventfs_inode *ei; - struct eventfs_file *ef, *tmp; + int i; /* The top level events directory may be freed by this */ if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) { @@ -207,9 +152,9 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) ei = ti->private; /* Record all the top level files */ - list_for_each_entry_srcu(ef, &ei->e_top_files, list, + list_for_each_entry_srcu(ei_child, &ei->children, list, lockdep_is_held(&eventfs_mutex)) { - list_add_tail(&ef->del_list, &ef_del_list); + list_add_tail(&ei_child->del_list, &ef_del_list); } /* Nothing should access this, but just in case! */ @@ -218,11 +163,13 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) mutex_unlock(&eventfs_mutex); /* Now safely free the top level files and their children */ - list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) { - list_del(&ef->del_list); - eventfs_remove(ef); + list_for_each_entry_safe(ei_child, tmp, &ef_del_list, del_list) { + list_del(&ei_child->del_list); + eventfs_remove_dir(ei_child); } + kfree_const(ei->name); + kfree(ei->d_children); kfree(ei); return; } @@ -233,68 +180,162 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE)) goto out; - ef = dentry->d_fsdata; - if (!ef) + ei = dentry->d_fsdata; + if (!ei) goto out; /* - * If ef was freed, then the LSB bit is set for d_fsdata. + * If ei was freed, then the LSB bit is set for d_fsdata. * But this should not happen, as it should still have a * ref count that prevents it. Warn in case it does. */ - if (WARN_ON_ONCE((unsigned long)ef & 1)) + if (WARN_ON_ONCE((unsigned long)ei & 1)) goto out; + /* This could belong to one of the files of the ei */ + if (ei->dentry != dentry) { + for (i = 0; i < ei->nr_entries; i++) { + if (ei->d_children[i] == dentry) + break; + } + if (WARN_ON_ONCE(i == ei->nr_entries)) + goto out; + ei->d_children[i] = NULL; + } else { + ei->dentry = NULL; + } + dentry->d_fsdata = NULL; - ef->dentry = NULL; -out: + out: mutex_unlock(&eventfs_mutex); } +/** + * create_file_dentry - create a dentry for a file of an eventfs_inode + * @ei: the eventfs_inode that the file will be created under + * @e_dentry: a pointer to the d_children[] of the @ei + * @parent: The parent dentry of the created file. + * @name: The name of the file to create + * @mode: The mode of the file. + * @data: The data to use to set the inode of the file with on open() + * @fops: The fops of the file to be created. + * @lookup: If called by the lookup routine, in which case, dput() the created dentry. + * + * Create a dentry for a file of an eventfs_inode @ei and place it into the + * address located at @e_dentry. If the @e_dentry already has a dentry, then + * just do a dget() on it and return. Otherwise create the dentry and attach it. + */ +static struct dentry * +create_file_dentry(struct eventfs_inode *ei, struct dentry **e_dentry, + struct dentry *parent, const char *name, umode_t mode, void *data, + const struct file_operations *fops, bool lookup) +{ + struct dentry *dentry; + bool invalidate = false; + + mutex_lock(&eventfs_mutex); + /* If the e_dentry already has a dentry, use it */ + if (*e_dentry) { + /* lookup does not need to up the ref count */ + if (!lookup) + dget(*e_dentry); + mutex_unlock(&eventfs_mutex); + return *e_dentry; + } + mutex_unlock(&eventfs_mutex); + + /* The lookup already has the parent->d_inode locked */ + if (!lookup) + inode_lock(parent->d_inode); + + dentry = create_file(name, mode, parent, data, fops); + + if (!lookup) + inode_unlock(parent->d_inode); + + mutex_lock(&eventfs_mutex); + + if (IS_ERR_OR_NULL(dentry)) { + /* + * When the mutex was released, something else could have + * created the dentry for this e_dentry. In which case + * use that one. + * + * Note, with the mutex held, the e_dentry cannot have content + * and the ei->is_freed be true at the same time. + */ + WARN_ON_ONCE(ei->is_freed); + dentry = *e_dentry; + /* The lookup does not need to up the dentry refcount */ + if (dentry && !lookup) + dget(dentry); + mutex_unlock(&eventfs_mutex); + return dentry; + } + + if (!*e_dentry && !ei->is_freed) { + *e_dentry = dentry; + dentry->d_fsdata = ei; + } else { + /* + * Should never happen unless we get here due to being freed. + * Otherwise it means two dentries exist with the same name. + */ + WARN_ON_ONCE(!ei->is_freed); + invalidate = true; + } + mutex_unlock(&eventfs_mutex); + + if (invalidate) + d_invalidate(dentry); + + if (lookup || invalidate) + dput(dentry); + + return invalidate ? NULL : dentry; +} + /** * eventfs_post_create_dir - post create dir routine - * @ef: eventfs_file of recently created dir + * @ei: eventfs_inode of recently created dir * * Map the meta-data of files within an eventfs dir to their parent dentry */ -static void eventfs_post_create_dir(struct eventfs_file *ef) +static void eventfs_post_create_dir(struct eventfs_inode *ei) { - struct eventfs_file *ef_child; + struct eventfs_inode *ei_child; struct tracefs_inode *ti; /* srcu lock already held */ /* fill parent-child relation */ - list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list, + list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { - ef_child->d_parent = ef->dentry; + ei_child->d_parent = ei->dentry; } - ti = get_tracefs(ef->dentry->d_inode); - ti->private = ef->ei; + ti = get_tracefs(ei->dentry->d_inode); + ti->private = ei; } /** - * create_dentry - helper function to create dentry - * @ef: eventfs_file of file or directory to create - * @parent: parent dentry - * @lookup: true if called from lookup routine + * create_dir_dentry - Create a directory dentry for the eventfs_inode + * @ei: The eventfs_inode to create the directory for + * @parent: The dentry of the parent of this directory + * @lookup: True if this is called by the lookup code * - * Used to create a dentry for file/dir, executes post dentry creation routine + * This creates and attaches a directory dentry to the eventfs_inode @ei. */ static struct dentry * -create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup) +create_dir_dentry(struct eventfs_inode *ei, struct dentry *parent, bool lookup) { bool invalidate = false; - struct dentry *dentry; + struct dentry *dentry = NULL; mutex_lock(&eventfs_mutex); - if (ef->is_freed) { - mutex_unlock(&eventfs_mutex); - return NULL; - } - if (ef->dentry) { - dentry = ef->dentry; - /* On dir open, up the ref count */ + if (ei->dentry) { + /* If the dentry already has a dentry, use it */ + dentry = ei->dentry; + /* lookup does not need to up the ref count */ if (!lookup) dget(dentry); mutex_unlock(&eventfs_mutex); @@ -302,42 +343,44 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup) } mutex_unlock(&eventfs_mutex); + /* The lookup already has the parent->d_inode locked */ if (!lookup) inode_lock(parent->d_inode); - if (ef->ei) - dentry = create_dir(ef->name, parent, ef->data); - else - dentry = create_file(ef->name, ef->mode, parent, - ef->data, ef->fop); + dentry = create_dir(ei->name, parent); if (!lookup) inode_unlock(parent->d_inode); mutex_lock(&eventfs_mutex); - if (IS_ERR_OR_NULL(dentry)) { - /* If the ef was already updated get it */ - dentry = ef->dentry; + + if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) { + /* + * When the mutex was released, something else could have + * created the dentry for this e_dentry. In which case + * use that one. + * + * Note, with the mutex held, the e_dentry cannot have content + * and the ei->is_freed be true at the same time. + */ + dentry = ei->dentry; if (dentry && !lookup) dget(dentry); mutex_unlock(&eventfs_mutex); return dentry; } - if (!ef->dentry && !ef->is_freed) { - ef->dentry = dentry; - if (ef->ei) - eventfs_post_create_dir(ef); - dentry->d_fsdata = ef; + if (!ei->dentry && !ei->is_freed) { + ei->dentry = dentry; + eventfs_post_create_dir(ei); + dentry->d_fsdata = ei; } else { - /* A race here, should try again (unless freed) */ - invalidate = true; - /* * Should never happen unless we get here due to being freed. * Otherwise it means two dentries exist with the same name. */ - WARN_ON_ONCE(!ef->is_freed); + WARN_ON_ONCE(!ei->is_freed); + invalidate = true; } mutex_unlock(&eventfs_mutex); if (invalidate) @@ -349,50 +392,85 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup) return invalidate ? NULL : dentry; } -static bool match_event_file(struct eventfs_file *ef, const char *name) -{ - bool ret; - - mutex_lock(&eventfs_mutex); - ret = !ef->is_freed && strcmp(ef->name, name) == 0; - mutex_unlock(&eventfs_mutex); - - return ret; -} - /** * eventfs_root_lookup - lookup routine to create file/dir * @dir: in which a lookup is being done * @dentry: file/dir dentry - * @flags: to pass as flags parameter to simple lookup + * @flags: Just passed to simple_lookup() * - * Used to create a dynamic file/dir within @dir. Use the eventfs_inode - * list of meta data to find the information needed to create the file/dir. + * Used to create dynamic file/dir with-in @dir, search with-in @ei + * list, if @dentry found go ahead and create the file/dir */ + static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { + const struct file_operations *fops; + const struct eventfs_entry *entry; + struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct eventfs_file *ef; + struct dentry *ei_dentry = NULL; struct dentry *ret = NULL; + const char *name = dentry->d_name.name; + bool created = false; + umode_t mode; + void *data; int idx; + int i; + int r; ti = get_tracefs(dir); if (!(ti->flags & TRACEFS_EVENT_INODE)) return NULL; - ei = ti->private; + /* Grab srcu to prevent the ei from going away */ idx = srcu_read_lock(&eventfs_srcu); - list_for_each_entry_srcu(ef, &ei->e_top_files, list, + + /* + * Grab the eventfs_mutex to consistent value from ti->private. + * This s + */ + mutex_lock(&eventfs_mutex); + ei = READ_ONCE(ti->private); + if (ei) + ei_dentry = READ_ONCE(ei->dentry); + mutex_unlock(&eventfs_mutex); + + if (!ei || !ei_dentry) + goto out; + + data = ei->data; + + list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { - if (!match_event_file(ef, dentry->d_name.name)) + if (strcmp(ei_child->name, name) != 0) continue; ret = simple_lookup(dir, dentry, flags); - create_dentry(ef, ef->d_parent, true); + create_dir_dentry(ei_child, ei_dentry, true); + created = true; break; } + + if (created) + goto out; + + for (i = 0; i < ei->nr_entries; i++) { + entry = &ei->entries[i]; + if (strcmp(name, entry->name) == 0) { + void *cdata = data; + r = entry->callback(name, &mode, &cdata, &fops); + if (r <= 0) + continue; + ret = simple_lookup(dir, dentry, flags); + create_file_dentry(ei, &ei->d_children[i], + ei_dentry, name, mode, cdata, + fops, true); + break; + } + } + out: srcu_read_unlock(&eventfs_srcu, idx); return ret; } @@ -432,29 +510,48 @@ static int eventfs_release(struct inode *inode, struct file *file) return dcache_dir_close(inode, file); } +static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt) +{ + struct dentry **tmp; + + tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_KERNEL); + if (!tmp) + return -1; + tmp[cnt] = d; + tmp[cnt + 1] = NULL; + *dentries = tmp; + return 0; +} + /** * dcache_dir_open_wrapper - eventfs open wrapper * @inode: not used - * @file: dir to be opened (to create its child) + * @file: dir to be opened (to create it's children) * - * Used to dynamically create the file/dir within @file. @file is really a - * directory and all the files/dirs of the children within @file will be - * created. If any of the files/dirs have already been created, their - * reference count will be incremented. + * Used to dynamic create file/dir with-in @file, all the + * file/dir will be created. If already created then references + * will be increased */ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) { + const struct file_operations *fops; + const struct eventfs_entry *entry; + struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct eventfs_file *ef; struct dentry_list *dlist; struct dentry **dentries = NULL; - struct dentry *dentry = file_dentry(file); + struct dentry *parent = file_dentry(file); struct dentry *d; struct inode *f_inode = file_inode(file); + const char *name = parent->d_name.name; + umode_t mode; + void *data; int cnt = 0; int idx; int ret; + int i; + int r; ti = get_tracefs(f_inode); if (!(ti->flags & TRACEFS_EVENT_INODE)) @@ -463,25 +560,51 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) if (WARN_ON_ONCE(file->private_data)) return -EINVAL; - dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); - if (!dlist) - return -ENOMEM; - - ei = ti->private; idx = srcu_read_lock(&eventfs_srcu); - list_for_each_entry_srcu(ef, &ei->e_top_files, list, - srcu_read_lock_held(&eventfs_srcu)) { - d = create_dentry(ef, dentry, false); - if (d) { - struct dentry **tmp; - tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL); - if (!tmp) + mutex_lock(&eventfs_mutex); + ei = READ_ONCE(ti->private); + mutex_unlock(&eventfs_mutex); + + if (!ei) { + srcu_read_unlock(&eventfs_srcu, idx); + return -EINVAL; + } + + + data = ei->data; + + dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); + if (!dlist) { + srcu_read_unlock(&eventfs_srcu, idx); + return -ENOMEM; + } + + list_for_each_entry_srcu(ei_child, &ei->children, list, + srcu_read_lock_held(&eventfs_srcu)) { + d = create_dir_dentry(ei_child, parent, false); + if (d) { + ret = add_dentries(&dentries, d, cnt); + if (ret < 0) + break; + cnt++; + } + } + + for (i = 0; i < ei->nr_entries; i++) { + void *cdata = data; + entry = &ei->entries[i]; + name = entry->name; + r = entry->callback(name, &mode, &cdata, &fops); + if (r <= 0) + continue; + d = create_file_dentry(ei, &ei->d_children[i], + parent, name, mode, cdata, fops, false); + if (d) { + ret = add_dentries(&dentries, d, cnt); + if (ret < 0) break; - tmp[cnt] = d; - tmp[cnt + 1] = NULL; cnt++; - dentries = tmp; } } srcu_read_unlock(&eventfs_srcu, idx); @@ -514,63 +637,90 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) } /** - * eventfs_prepare_ef - helper function to prepare eventfs_file - * @name: the name of the file/directory to create. - * @mode: the permission that the file should have. - * @fop: struct file_operations that should be used for this file/directory. - * @iop: struct inode_operations that should be used for this file/directory. - * @data: something that the caller will want to get to later on. The - * inode.i_private pointer will point to this value on the open() call. + * eventfs_create_dir - Create the eventfs_inode for this directory + * @name: The name of the directory to create. + * @parent: The eventfs_inode of the parent directory. + * @entries: A list of entries that represent the files under this directory + * @size: The number of @entries + * @data: The default data to pass to the files (an entry may override it). * - * This function allocates and fills the eventfs_file structure. + * This function creates the descriptor to represent a directory in the + * eventfs. This descriptor is an eventfs_inode, and it is returned to be + * used to create other children underneath. + * + * The @entries is an array of eventfs_entry structures which has: + * const char *name + * eventfs_callback callback; + * + * The name is the name of the file, and the callback is a pointer to a function + * that will be called when the file is reference (either by lookup or by + * reading a directory). The callback is of the prototype: + * + * int callback(const char *name, umode_t *mode, void **data, + * const struct file_operations **fops); + * + * When a file needs to be created, this callback will be called with + * name = the name of the file being created (so that the same callback + * may be used for multiple files). + * mode = a place to set the file's mode + * data = A pointer to @data, and the callback may replace it, which will + * cause the file created to pass the new data to the open() call. + * fops = the fops to use for the created file. */ -static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode, - const struct file_operations *fop, - const struct inode_operations *iop, - void *data) +struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, + const struct eventfs_entry *entries, + int size, void *data) { - struct eventfs_file *ef; + struct eventfs_inode *ei; - ef = kzalloc(sizeof(*ef), GFP_KERNEL); - if (!ef) + if (!parent) + return ERR_PTR(-EINVAL); + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) return ERR_PTR(-ENOMEM); - ef->name = kstrdup(name, GFP_KERNEL); - if (!ef->name) { - kfree(ef); + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) { + kfree(ei); return ERR_PTR(-ENOMEM); } - if (S_ISDIR(mode)) { - ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL); - if (!ef->ei) { - kfree(ef->name); - kfree(ef); + if (size) { + ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); + if (!ei->d_children) { + kfree_const(ei->name); + kfree(ei); return ERR_PTR(-ENOMEM); } - INIT_LIST_HEAD(&ef->ei->e_top_files); - } else { - ef->ei = NULL; } - ef->iop = iop; - ef->fop = fop; - ef->mode = mode; - ef->data = data; - return ef; + ei->entries = entries; + ei->nr_entries = size; + ei->data = data; + INIT_LIST_HEAD(&ei->children); + + mutex_lock(&eventfs_mutex); + list_add_tail(&ei->list, &parent->children); + ei->d_parent = parent->dentry; + mutex_unlock(&eventfs_mutex); + + return ei; } /** - * eventfs_create_events_dir - create the trace event structure - * @name: the name of the directory to create. - * @parent: parent dentry for this file. This should be a directory dentry - * if set. If this parameter is NULL, then the directory will be - * created in the root of the tracefs filesystem. + * eventfs_create_events_dir - create the top level events directory + * @name: The name of the top level directory to create. + * @parent: Parent dentry for this file in the tracefs directory. + * @entries: A list of entries that represent the files under this directory + * @size: The number of @entries + * @data: The default data to pass to the files (an entry may override it). * * This function creates the top of the trace event directory. */ -struct dentry *eventfs_create_events_dir(const char *name, - struct dentry *parent) +struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, + const struct eventfs_entry *entries, + int size, void *data) { struct dentry *dentry = tracefs_start_creating(name, parent); struct eventfs_inode *ei; @@ -581,19 +731,32 @@ struct dentry *eventfs_create_events_dir(const char *name, return NULL; if (IS_ERR(dentry)) - return dentry; + return (struct eventfs_inode *)dentry; ei = kzalloc(sizeof(*ei), GFP_KERNEL); if (!ei) - return ERR_PTR(-ENOMEM); + goto fail; + inode = tracefs_get_inode(dentry->d_sb); - if (unlikely(!inode)) { - kfree(ei); - tracefs_failed_creating(dentry); - return ERR_PTR(-ENOMEM); + if (unlikely(!inode)) + goto fail; + + if (size) { + ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); + if (!ei->d_children) + goto fail; } - INIT_LIST_HEAD(&ei->e_top_files); + ei->dentry = dentry; + ei->entries = entries; + ei->nr_entries = size; + ei->data = data; + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) + goto fail; + + INIT_LIST_HEAD(&ei->children); + INIT_LIST_HEAD(&ei->list); ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE; @@ -608,193 +771,41 @@ struct dentry *eventfs_create_events_dir(const char *name, d_instantiate(dentry, inode); inc_nlink(dentry->d_parent->d_inode); fsnotify_mkdir(dentry->d_parent->d_inode, dentry); - return tracefs_end_creating(dentry); + tracefs_end_creating(dentry); + + /* Will call dput when the directory is removed */ + dget(dentry); + + return ei; + + fail: + kfree(ei->d_children); + kfree(ei); + tracefs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); } -/** - * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later - * @name: the name of the file to create. - * @parent: parent dentry for this dir. - * - * This function adds eventfs subsystem dir to list. - * And all these dirs are created on the fly when they are looked up, - * and the dentry and inodes will be removed when they are done. - */ -struct eventfs_file *eventfs_add_subsystem_dir(const char *name, - struct dentry *parent) +static void free_ei(struct rcu_head *head) { - struct tracefs_inode *ti_parent; - struct eventfs_inode *ei_parent; - struct eventfs_file *ef; + struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu); - if (security_locked_down(LOCKDOWN_TRACEFS)) - return NULL; - - if (!parent) - return ERR_PTR(-EINVAL); - - ti_parent = get_tracefs(parent->d_inode); - ei_parent = ti_parent->private; - - ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL); - if (IS_ERR(ef)) - return ef; - - mutex_lock(&eventfs_mutex); - list_add_tail(&ef->list, &ei_parent->e_top_files); - ef->d_parent = parent; - mutex_unlock(&eventfs_mutex); - return ef; -} - -/** - * eventfs_add_dir - add eventfs dir to list to create later - * @name: the name of the file to create. - * @ef_parent: parent eventfs_file for this dir. - * - * This function adds eventfs dir to list. - * And all these dirs are created on the fly when they are looked up, - * and the dentry and inodes will be removed when they are done. - */ -struct eventfs_file *eventfs_add_dir(const char *name, - struct eventfs_file *ef_parent) -{ - struct eventfs_file *ef; - - if (security_locked_down(LOCKDOWN_TRACEFS)) - return NULL; - - if (!ef_parent) - return ERR_PTR(-EINVAL); - - ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL); - if (IS_ERR(ef)) - return ef; - - mutex_lock(&eventfs_mutex); - list_add_tail(&ef->list, &ef_parent->ei->e_top_files); - ef->d_parent = ef_parent->dentry; - mutex_unlock(&eventfs_mutex); - return ef; -} - -/** - * eventfs_add_events_file - add the data needed to create a file for later reference - * @name: the name of the file to create. - * @mode: the permission that the file should have. - * @parent: parent dentry for this file. - * @data: something that the caller will want to get to later on. - * @fop: struct file_operations that should be used for this file. - * - * This function is used to add the information needed to create a - * dentry/inode within the top level events directory. The file created - * will have the @mode permissions. The @data will be used to fill the - * inode.i_private when the open() call is done. The dentry and inodes are - * all created when they are referenced, and removed when they are no - * longer referenced. - */ -int eventfs_add_events_file(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fop) -{ - struct tracefs_inode *ti; - struct eventfs_inode *ei; - struct eventfs_file *ef; - - if (security_locked_down(LOCKDOWN_TRACEFS)) - return -ENODEV; - - if (!parent) - return -EINVAL; - - if (!(mode & S_IFMT)) - mode |= S_IFREG; - - if (!parent->d_inode) - return -EINVAL; - - ti = get_tracefs(parent->d_inode); - if (!(ti->flags & TRACEFS_EVENT_INODE)) - return -EINVAL; - - ei = ti->private; - ef = eventfs_prepare_ef(name, mode, fop, NULL, data); - - if (IS_ERR(ef)) - return -ENOMEM; - - mutex_lock(&eventfs_mutex); - list_add_tail(&ef->list, &ei->e_top_files); - ef->d_parent = parent; - mutex_unlock(&eventfs_mutex); - return 0; -} - -/** - * eventfs_add_file - add eventfs file to list to create later - * @name: the name of the file to create. - * @mode: the permission that the file should have. - * @ef_parent: parent eventfs_file for this file. - * @data: something that the caller will want to get to later on. - * @fop: struct file_operations that should be used for this file. - * - * This function is used to add the information needed to create a - * file within a subdirectory of the events directory. The file created - * will have the @mode permissions. The @data will be used to fill the - * inode.i_private when the open() call is done. The dentry and inodes are - * all created when they are referenced, and removed when they are no - * longer referenced. - */ -int eventfs_add_file(const char *name, umode_t mode, - struct eventfs_file *ef_parent, - void *data, - const struct file_operations *fop) -{ - struct eventfs_file *ef; - - if (security_locked_down(LOCKDOWN_TRACEFS)) - return -ENODEV; - - if (!ef_parent) - return -EINVAL; - - if (!(mode & S_IFMT)) - mode |= S_IFREG; - - ef = eventfs_prepare_ef(name, mode, fop, NULL, data); - if (IS_ERR(ef)) - return -ENOMEM; - - mutex_lock(&eventfs_mutex); - list_add_tail(&ef->list, &ef_parent->ei->e_top_files); - ef->d_parent = ef_parent->dentry; - mutex_unlock(&eventfs_mutex); - return 0; -} - -static void free_ef(struct rcu_head *head) -{ - struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu); - - kfree(ef->name); - kfree(ef->ei); - kfree(ef); + kfree_const(ei->name); + kfree(ei->d_children); + kfree(ei); } /** * eventfs_remove_rec - remove eventfs dir or file from list - * @ef: eventfs_file to be removed. - * @head: to create list of eventfs_file to be deleted - * @level: to check recursion depth + * @ei: eventfs_inode to be removed. * - * The helper function eventfs_remove_rec() is used to clean up and free the - * associated data from eventfs for both of the added functions. + * This function recursively remove eventfs_inode which + * contains info of file or dir. */ -static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level) +static void eventfs_remove_rec(struct eventfs_inode *ei, struct list_head *head, int level) { - struct eventfs_file *ef_child; + struct eventfs_inode *ei_child; - if (!ef) + if (!ei) return; /* * Check recursion depth. It should never be greater than 3: @@ -806,62 +817,68 @@ static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, if (WARN_ON_ONCE(level > 3)) return; - if (ef->ei) { - /* search for nested folders or files */ - list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list, - lockdep_is_held(&eventfs_mutex)) { - eventfs_remove_rec(ef_child, head, level + 1); - } + /* search for nested folders or files */ + list_for_each_entry_srcu(ei_child, &ei->children, list, + lockdep_is_held(&eventfs_mutex)) { + eventfs_remove_rec(ei_child, head, level + 1); } - list_del_rcu(&ef->list); - list_add_tail(&ef->del_list, head); + list_del_rcu(&ei->list); + list_add_tail(&ei->del_list, head); } +static void unhook_dentry(struct dentry **dentry, struct dentry **list) +{ + if (*dentry) { + unsigned long ptr = (unsigned long)*list; + + /* Keep the dentry from being freed yet */ + dget(*dentry); + + /* + * Paranoid: The dget() above should prevent the dentry + * from being freed and calling eventfs_set_ei_status_free(). + * But just in case, set the link list LSB pointer to 1 + * and have eventfs_set_ei_status_free() check that to + * make sure that if it does happen, it will not think + * the d_fsdata is an eventfs_inode. + * + * For this to work, no eventfs_inode should be allocated + * on a odd space, as the ef should always be allocated + * to be at least word aligned. Check for that too. + */ + WARN_ON_ONCE(ptr & 1); + + (*dentry)->d_fsdata = (void *)(ptr | 1); + *list = *dentry; + *dentry = NULL; + } +} /** * eventfs_remove - remove eventfs dir or file from list - * @ef: eventfs_file to be removed. + * @ei: eventfs_inode to be removed. * * This function acquire the eventfs_mutex lock and call eventfs_remove_rec() */ -void eventfs_remove(struct eventfs_file *ef) +void eventfs_remove_dir(struct eventfs_inode *ei) { - struct eventfs_file *tmp; - LIST_HEAD(ef_del_list); + struct eventfs_inode *tmp; + LIST_HEAD(ei_del_list); struct dentry *dentry_list = NULL; struct dentry *dentry; + int i; - if (!ef) + if (!ei) return; mutex_lock(&eventfs_mutex); - eventfs_remove_rec(ef, &ef_del_list, 0); - list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) { - if (ef->dentry) { - unsigned long ptr = (unsigned long)dentry_list; + eventfs_remove_rec(ei, &ei_del_list, 0); - /* Keep the dentry from being freed yet */ - dget(ef->dentry); - - /* - * Paranoid: The dget() above should prevent the dentry - * from being freed and calling eventfs_set_ef_status_free(). - * But just in case, set the link list LSB pointer to 1 - * and have eventfs_set_ef_status_free() check that to - * make sure that if it does happen, it will not think - * the d_fsdata is an event_file. - * - * For this to work, no event_file should be allocated - * on a odd space, as the ef should always be allocated - * to be at least word aligned. Check for that too. - */ - WARN_ON_ONCE(ptr & 1); - - ef->dentry->d_fsdata = (void *)(ptr | 1); - dentry_list = ef->dentry; - ef->dentry = NULL; - } - call_srcu(&eventfs_srcu, &ef->rcu, free_ef); + list_for_each_entry_safe(ei, tmp, &ei_del_list, del_list) { + for (i = 0; i < ei->nr_entries; i++) + unhook_dentry(&ei->d_children[i], &dentry_list); + unhook_dentry(&ei->dentry, &dentry_list); + call_srcu(&eventfs_srcu, &ei->rcu, free_ei); } mutex_unlock(&eventfs_mutex); @@ -876,8 +893,8 @@ void eventfs_remove(struct eventfs_file *ef) mutex_lock(&eventfs_mutex); /* dentry should now have at least a single reference */ WARN_ONCE((int)d_count(dentry) < 1, - "dentry %p less than one reference (%d) after invalidate\n", - dentry, d_count(dentry)); + "dentry %px (%s) less than one reference (%d) after invalidate\n", + dentry, dentry->d_name.name, d_count(dentry)); mutex_unlock(&eventfs_mutex); dput(dentry); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 891653ba9cf3..34ffb2f8114e 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) ti = get_tracefs(inode); if (ti && ti->flags & TRACEFS_EVENT_INODE) - eventfs_set_ef_status_free(ti, dentry); + eventfs_set_ei_status_free(ti, dentry); iput(inode); } diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 4f2e49e2197b..298d3ecaf621 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -13,6 +13,41 @@ struct tracefs_inode { struct inode vfs_inode; }; +/* + * struct eventfs_inode - hold the properties of the eventfs directories. + * @list: link list into the parent directory + * @entries: the array of entries representing the files in the directory + * @name: the name of the directory to create + * @children: link list into the child eventfs_inode + * @dentry: the dentry of the directory + * @d_parent: pointer to the parent's dentry + * @d_children: The array of dentries to represent the files when created + * @data: The private data to pass to the callbacks + * @nr_entries: The number of items in @entries + */ +struct eventfs_inode { + struct list_head list; + const struct eventfs_entry *entries; + const char *name; + struct list_head children; + struct dentry *dentry; + struct dentry *d_parent; + struct dentry **d_children; + void *data; + /* + * Union - used for deletion + * @del_list: list of eventfs_inode to delete + * @rcu: eventfs_indoe to delete in RCU + * @is_freed: node is freed if one of the above is set + */ + union { + struct list_head del_list; + struct rcu_head rcu; + unsigned long is_freed; + }; + int nr_entries; +}; + static inline struct tracefs_inode *get_tracefs(const struct inode *inode) { return container_of(inode, struct tracefs_inode, vfs_inode); @@ -25,6 +60,6 @@ struct inode *tracefs_get_inode(struct super_block *sb); struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); struct dentry *eventfs_failed_creating(struct dentry *dentry); struct dentry *eventfs_end_creating(struct dentry *dentry); -void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry); +void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 21ae37e49319..12207dc6722d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -649,7 +649,7 @@ struct trace_event_file { struct list_head list; struct trace_event_call *event_call; struct event_filter __rcu *filter; - struct eventfs_file *ef; + struct eventfs_inode *ei; struct trace_array *tr; struct trace_subsystem_dir *system; struct list_head triggers; diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 009072792fa3..0c39704455d9 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -23,26 +23,25 @@ struct file_operations; struct eventfs_file; -struct dentry *eventfs_create_events_dir(const char *name, - struct dentry *parent); +typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, + const struct file_operations **fops); -struct eventfs_file *eventfs_add_subsystem_dir(const char *name, - struct dentry *parent); +struct eventfs_entry { + const char *name; + eventfs_callback callback; +}; -struct eventfs_file *eventfs_add_dir(const char *name, - struct eventfs_file *ef_parent); +struct eventfs_inode; -int eventfs_add_file(const char *name, umode_t mode, - struct eventfs_file *ef_parent, void *data, - const struct file_operations *fops); +struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, + const struct eventfs_entry *entries, + int size, void *data); -int eventfs_add_events_file(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops); +struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, + const struct eventfs_entry *entries, + int size, void *data); -void eventfs_remove(struct eventfs_file *ef); - -void eventfs_remove_events_dir(struct dentry *dentry); +void eventfs_remove_dir(struct eventfs_inode *ei); struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dd6395692ff9..4383be8fa1b0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9764,7 +9764,6 @@ static __init void create_trace_instances(struct dentry *d_tracer) static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - struct trace_event_file *file; int cpu; trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, @@ -9797,11 +9796,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); - file = __find_event_file(tr, "ftrace", "print"); - if (file && file->ef) - eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, - file, &event_trigger_fops); - tr->trace_marker_file = file; + tr->trace_marker_file = __find_event_file(tr, "ftrace", "print"); trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e92cb9c1292f..0e1405abf4f7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -381,7 +381,7 @@ struct trace_array { struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; - struct dentry *event_dir; + struct eventfs_inode *event_dir; struct trace_options *topts; struct list_head systems; struct list_head events; @@ -1349,7 +1349,7 @@ struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; - struct eventfs_file *ef; + struct eventfs_inode *ei; int ref_count; int nr_events; }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 099b9b6bbdc4..a3b9d9423824 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -984,7 +984,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) return; if (!--dir->nr_events) { - eventfs_remove(dir->ef); + eventfs_remove_dir(dir->ei); list_del(&dir->list); __put_system_dir(dir); } @@ -992,7 +992,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) static void remove_event_file_dir(struct trace_event_file *file) { - eventfs_remove(file->ef); + eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); @@ -2282,14 +2282,40 @@ create_new_subsystem(const char *name) return NULL; } -static struct eventfs_file * +int system_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (strcmp(name, "filter") == 0) + *fops = &ftrace_subsystem_filter_fops; + + else if (strcmp(name, "enable") == 0) + *fops = &ftrace_system_enable_fops; + + else + return 0; + + *mode = TRACE_MODE_WRITE; + return 1; +} + +static struct eventfs_inode * event_subsystem_dir(struct trace_array *tr, const char *name, - struct trace_event_file *file, struct dentry *parent) + struct trace_event_file *file, struct eventfs_inode *parent) { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - struct eventfs_file *ef; - int res; + struct eventfs_inode *ei; + int nr_entries; + static struct eventfs_entry system_entries[] = { + { + .name = "filter", + .callback = system_callback, + }, + { + .name = "enable", + .callback = system_callback, + } + }; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { @@ -2297,7 +2323,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; - return dir->ef; + return dir->ei; } } @@ -2321,39 +2347,29 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - ef = eventfs_add_subsystem_dir(name, parent); - if (IS_ERR(ef)) { + /* ftrace only has directories no files */ + if (strcmp(name, "ftrace") == 0) + nr_entries = 0; + else + nr_entries = ARRAY_SIZE(system_entries); + + ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir); + if (!ei) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } - dir->ef = ef; + dir->ei = ei; dir->tr = tr; dir->ref_count = 1; dir->nr_events = 1; dir->subsystem = system; file->system = dir; - /* the ftrace system is special, do not create enable or filter files */ - if (strcmp(name, "ftrace") != 0) { - - res = eventfs_add_file("filter", TRACE_MODE_WRITE, - dir->ef, dir, - &ftrace_subsystem_filter_fops); - if (res) { - kfree(system->filter); - system->filter = NULL; - pr_warn("Could not create tracefs '%s/filter' entry\n", name); - } - - eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir, - &ftrace_system_enable_fops); - } - list_add(&dir->list, &tr->systems); - return dir->ef; + return dir->ei; out_free: kfree(dir); @@ -2402,15 +2418,134 @@ event_define_fields(struct trace_event_call *call) return ret; } +static int event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + struct trace_event_file *file = *data; + struct trace_event_call *call = file->event_call; + + if (strcmp(name, "format") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_event_format_fops; + *data = call; + return 1; + } + + /* + * Only event directories that can be enabled should have + * triggers or filters, with the exception of the "print" + * event that can have a "trigger" file. + */ + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { + if (call->class->reg && strcmp(name, "enable") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &ftrace_enable_fops; + return 1; + } + + if (strcmp(name, "filter") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &ftrace_event_filter_fops; + return 1; + } + } + + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || + strcmp(trace_event_name(call), "print") == 0) { + if (strcmp(name, "trigger") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &event_trigger_fops; + return 1; + } + } + +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && call->class->reg && + strcmp(name, "id") == 0) { + *mode = TRACE_MODE_READ; + *data = (void *)(long)call->event.type; + *fops = &ftrace_event_id_fops; + return 1; + } +#endif + +#ifdef CONFIG_HIST_TRIGGERS + if (strcmp(name, "hist") == 0) { + *mode = TRACE_MODE_READ; + *fops = &event_hist_fops; + return 1; + } +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + if (strcmp(name, "hist_debug") == 0) { + *mode = TRACE_MODE_READ; + *fops = &event_hist_debug_fops; + return 1; + } +#endif +#ifdef CONFIG_TRACE_EVENT_INJECT + if (call->event.type && call->class->reg && + strcmp(name, "inject") == 0) { + *mode = 0200; + *fops = &event_inject_fops; + return 1; + } +#endif + return 0; +} + static int -event_create_dir(struct dentry *parent, struct trace_event_file *file) +event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; - struct eventfs_file *ef_subsystem = NULL; struct trace_array *tr = file->tr; - struct eventfs_file *ef; + struct eventfs_inode *e_events; + struct eventfs_inode *ei; const char *name; + int nr_entries; int ret; + static struct eventfs_entry event_entries[] = { + { + .name = "enable", + .callback = event_callback, + }, + { + .name = "filter", + .callback = event_callback, + }, + { + .name = "trigger", + .callback = event_callback, + }, + { + .name = "format", + .callback = event_callback, + }, +#ifdef CONFIG_PERF_EVENTS + { + .name = "id", + .callback = event_callback, + }, +#endif +#ifdef CONFIG_HIST_TRIGGERS + { + .name = "hist", + .callback = event_callback, + }, +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + { + .name = "hist_debug", + .callback = event_callback, + }, +#endif +#ifdef CONFIG_TRACE_EVENT_INJECT + { + .name = "inject", + .callback = event_callback, + }, +#endif + }; /* * If the trace point header did not define TRACE_SYSTEM @@ -2420,29 +2555,20 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) return -ENODEV; - ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent); - if (!ef_subsystem) + e_events = event_subsystem_dir(tr, call->class->system, file, parent); + if (!e_events) return -ENOMEM; + nr_entries = ARRAY_SIZE(event_entries); + name = trace_event_name(call); - ef = eventfs_add_dir(name, ef_subsystem); - if (IS_ERR(ef)) { + ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file); + if (IS_ERR(ei)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } - file->ef = ef; - - if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file, - &ftrace_enable_fops); - -#ifdef CONFIG_PERF_EVENTS - if (call->event.type && call->class->reg) - eventfs_add_file("id", TRACE_MODE_READ, file->ef, - (void *)(long)call->event.type, - &ftrace_event_id_fops); -#endif + file->ei = ei; ret = event_define_fields(call); if (ret < 0) { @@ -2450,35 +2576,6 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) return ret; } - /* - * Only event directories that can be enabled should have - * triggers or filters. - */ - if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef, - file, &ftrace_event_filter_fops); - - eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, - file, &event_trigger_fops); - } - -#ifdef CONFIG_HIST_TRIGGERS - eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file, - &event_hist_fops); -#endif -#ifdef CONFIG_HIST_TRIGGERS_DEBUG - eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file, - &event_hist_debug_fops); -#endif - eventfs_add_file("format", TRACE_MODE_READ, file->ef, call, - &ftrace_event_format_fops); - -#ifdef CONFIG_TRACE_EVENT_INJECT - if (call->event.type && call->class->reg) - eventfs_add_file("inject", 0200, file->ef, file, - &event_inject_fops); -#endif - return 0; } @@ -3623,30 +3720,65 @@ static __init int setup_trace_event(char *str) } __setup("trace_event=", setup_trace_event); +static int events_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (strcmp(name, "enable") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &ftrace_tr_enable_fops; + return 1; + } + + if (strcmp(name, "header_page") == 0) + *data = ring_buffer_print_page_header; + + else if (strcmp(name, "header_event") == 0) + *data = ring_buffer_print_entry_header; + + else + return 0; + + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_fops; + return 1; +} + /* Expects to have event_mutex held when called */ static int create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { - struct dentry *d_events; + struct eventfs_inode *e_events; struct dentry *entry; - int error = 0; + int nr_entries; + static struct eventfs_entry events_entries[] = { + { + .name = "enable", + .callback = events_callback, + }, + { + .name = "header_page", + .callback = events_callback, + }, + { + .name = "header_event", + .callback = events_callback, + }, + }; entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) return -ENOMEM; - d_events = eventfs_create_events_dir("events", parent); - if (IS_ERR(d_events)) { + nr_entries = ARRAY_SIZE(events_entries); + + e_events = eventfs_create_events_dir("events", parent, events_entries, + nr_entries, tr); + if (IS_ERR(e_events)) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } - error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events, - tr, &ftrace_tr_enable_fops); - if (error) - return -ENOMEM; - /* There are not as crucial, just warn if they are not created */ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, @@ -3656,16 +3788,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_notrace_pid_fops); - /* ring buffer internal formats */ - eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); - - tr->event_dir = d_events; + tr->event_dir = e_events; return 0; } @@ -3749,7 +3872,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - eventfs_remove_events_dir(tr->event_dir); + eventfs_remove_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; From f5d9e8e08f81c9e7c723de7abcce106808f0770c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 14 Sep 2023 12:35:06 -0400 Subject: [PATCH 07/34] tracing/selftests: Update kprobe args char/string to match new functions The function that the kprobe_args_char and kprobes_arg_string attaches to for its test has changed its name once again. Now we need to check for eventfs_create_dir(), and if it exists, use that, otherwise check for eventfs_add_dir() and if that exists use that, otherwise use the original tracefs_create_dir()! Link: https://lore.kernel.org/linux-trace-kernel/20230914163535.487267410@goodmis.org Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- .../selftests/ftrace/test.d/kprobe/kprobe_args_char.tc | 4 +++- .../selftests/ftrace/test.d/kprobe/kprobe_args_string.tc | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc index ff7499eb98d6..c639c6c8ca03 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc @@ -34,7 +34,9 @@ mips*) esac : "Test get argument (1)" -if grep -q eventfs_add_dir available_filter_functions; then +if grep -q eventfs_create_dir available_filter_functions; then + DIR_NAME="eventfs_create_dir" +elif grep -q eventfs_add_dir available_filter_functions; then DIR_NAME="eventfs_add_dir" else DIR_NAME="tracefs_create_dir" diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc index a202b2ea4baf..a5ab4d5c74ac 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc @@ -37,7 +37,9 @@ loongarch*) esac : "Test get argument (1)" -if grep -q eventfs_add_dir available_filter_functions; then +if grep -q eventfs_create_dir available_filter_functions; then + DIR_NAME="eventfs_create_dir" +elif grep -q eventfs_add_dir available_filter_functions; then DIR_NAME="eventfs_add_dir" else DIR_NAME="tracefs_create_dir" From 2819f23ac12ce93ff79ca7a54597df9a4a1f6331 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 5 Oct 2023 09:13:48 -0400 Subject: [PATCH 08/34] eventfs: Use eventfs_remove_events_dir() The update to removing the eventfs_file changed the way the events top level directory was handled. Instead of returning a dentry, it now returns the eventfs_inode. In this changed, the removing of the events top level directory is not much different than removing any of the other directories. Because of this, the removal just called eventfs_remove_dir() instead of eventfs_remove_events_dir(). Although eventfs_remove_dir() does the clean up, it misses out on the dget() of the ei->dentry done in eventfs_create_events_dir(). It makes more sense to match eventfs_create_events_dir() with a specific function eventfs_remove_events_dir() and this specific function can then perform the dput() to the dentry that had the dget() when it was created. Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310051743.y9EobbUr-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 19 +++++++------------ include/linux/tracefs.h | 1 + kernel/trace/trace_events.c | 2 +- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index eab18b157ef5..1ccd100bc565 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -901,22 +901,17 @@ void eventfs_remove_dir(struct eventfs_inode *ei) } /** - * eventfs_remove_events_dir - remove eventfs dir or file from list - * @dentry: events's dentry to be removed. + * eventfs_remove_events_dir - remove the top level eventfs directory + * @ei: the event_inode returned by eventfs_create_events_dir(). * - * This function remove events main directory + * This function removes the events main directory */ -void eventfs_remove_events_dir(struct dentry *dentry) +void eventfs_remove_events_dir(struct eventfs_inode *ei) { - struct tracefs_inode *ti; + struct dentry *dentry = ei->dentry; - if (!dentry || !dentry->d_inode) - return; + eventfs_remove_dir(ei); - ti = get_tracefs(dentry->d_inode); - if (!ti || !(ti->flags & TRACEFS_EVENT_INODE)) - return; - - d_invalidate(dentry); + /* Matches the dget() from eventfs_create_events_dir() */ dput(dentry); } diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 0c39704455d9..13359b1a35d1 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -41,6 +41,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode const struct eventfs_entry *entries, int size, void *data); +void eventfs_remove_events_dir(struct eventfs_inode *ei); void eventfs_remove_dir(struct eventfs_inode *ei); struct dentry *tracefs_create_file(const char *name, umode_t mode, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a3b9d9423824..0e3a1c70e410 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3872,7 +3872,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - eventfs_remove_dir(tr->event_dir); + eventfs_remove_events_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; From 5ddd8baa4857709b4e5d84b376d735152851955b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 5 Oct 2023 10:47:45 -0400 Subject: [PATCH 09/34] tracing: Make system_callback() function static The system_callback() function in trace_events.c is only used within that file. The "static" annotation was missed. Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310051743.y9EobbUr-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0e3a1c70e410..db46d2116500 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2282,7 +2282,7 @@ create_new_subsystem(const char *name) return NULL; } -int system_callback(const char *name, umode_t *mode, void **data, +static int system_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { if (strcmp(name, "filter") == 0) From b8a555dc31e5aa18d976de0bc228006e398a2e7d Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 18 Oct 2023 11:10:31 -0700 Subject: [PATCH 10/34] eventfs: Use ERR_CAST() in eventfs_create_events_dir() When building with clang and CONFIG_RANDSTRUCT_FULL=y, there is an error due to a cast in eventfs_create_events_dir(): fs/tracefs/event_inode.c:734:10: error: casting from randomized structure pointer type 'struct dentry *' to 'struct eventfs_inode *' 734 | return (struct eventfs_inode *)dentry; | ^ 1 error generated. Use the ERR_CAST() function to resolve the error, as it was designed for this exact situation (casting an error pointer to another type). Link: https://lore.kernel.org/linux-trace-kernel/20231018-ftrace-fix-clang-randstruct-v1-1-338cb214abfb@kernel.org Closes: https://github.com/ClangBuiltLinux/linux/issues/1947 Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reviewed-by: Kees Cook Signed-off-by: Nathan Chancellor Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 1ccd100bc565..9f19b6608954 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -731,7 +731,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry return NULL; if (IS_ERR(dentry)) - return (struct eventfs_inode *)dentry; + return ERR_CAST(dentry); ei = kzalloc(sizeof(*ei), GFP_KERNEL); if (!ei) From 7e8ad67c9b5c11e990c320ed7e7563f2301672a7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 19 Oct 2023 20:41:32 -0400 Subject: [PATCH 11/34] eventfs: Fix failure path in eventfs_create_events_dir() The failure path of allocating ei goes to a path that dereferences ei. Add another label that skips over the ei dereferences to do the rest of the clean up. Link: https://lore.kernel.org/all/70e7bace-561c-95f-1117-706c2c220bc@inria.fr/ Link: https://lore.kernel.org/linux-trace-kernel/20231019204132.6662fef0@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: Julia Lawall Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 9f19b6608954..1885f1f1f339 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -735,7 +735,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry ei = kzalloc(sizeof(*ei), GFP_KERNEL); if (!ei) - goto fail; + goto fail_ei; inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) @@ -781,6 +781,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry fail: kfree(ei->d_children); kfree(ei); + fail_ei: tracefs_failed_creating(dentry); return ERR_PTR(-ENOMEM); } From 5264a2f4bb3baf712e19f1f053caaa8d7d3afa2e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 20 Oct 2023 16:52:45 +0300 Subject: [PATCH 12/34] tracing: Fix a NULL vs IS_ERR() bug in event_subsystem_dir() The eventfs_create_dir() function returns error pointers, it never returns NULL. Update the check to reflect that. Link: https://lore.kernel.org/linux-trace-kernel/ff641474-84e2-46a7-9d7a-62b251a1050c@moroto.mountain Cc: Masami Hiramatsu Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db46d2116500..f9e3e24d8796 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2354,7 +2354,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, nr_entries = ARRAY_SIZE(system_entries); ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir); - if (!ei) { + if (IS_ERR(ei)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; From 64bf2f685c795e75dd855761c75a193ee5998731 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 19 Oct 2023 11:13:53 +0800 Subject: [PATCH 13/34] tracefs/eventfs: Modify mismatched function name No functional modification involved. fs/tracefs/event_inode.c:864: warning: expecting prototype for eventfs_remove(). Prototype was for eventfs_remove_dir() instead. Link: https://lore.kernel.org/linux-trace-kernel/20231019031353.73846-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=6939 Signed-off-by: Jiapeng Chong Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 1885f1f1f339..09ab93357957 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -856,7 +856,7 @@ static void unhook_dentry(struct dentry **dentry, struct dentry **list) } } /** - * eventfs_remove - remove eventfs dir or file from list + * eventfs_remove_dir - remove eventfs dir or file from list * @ei: eventfs_inode to be removed. * * This function acquire the eventfs_mutex lock and call eventfs_remove_rec() From d0ed46b60396cfa7e0056f55e1ce0b43c7db57b6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 20 Oct 2023 04:35:45 +0100 Subject: [PATCH 14/34] tracing: Move readpos from seq_buf to trace_seq To make seq_buf more lightweight as a string buf, move the readpos member from seq_buf to its container, trace_seq. That puts the responsibility of maintaining the readpos entirely in the tracing code. If some future users want to package up the readpos with a seq_buf, we can define a new struct then. Link: https://lore.kernel.org/linux-trace-kernel/20231020033545.2587554-2-willy@infradead.org Cc: Kees Cook Cc: Justin Stitt Cc: Kent Overstreet Cc: Petr Mladek Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 5 +---- include/linux/trace_seq.h | 2 ++ kernel/trace/trace.c | 10 +++++----- kernel/trace/trace_seq.c | 6 +++++- lib/seq_buf.c | 22 ++++++++++------------ 5 files changed, 23 insertions(+), 22 deletions(-) diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 515d7fcb9634..a0fb013cebdf 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -14,19 +14,16 @@ * @buffer: pointer to the buffer * @size: size of the buffer * @len: the amount of data inside the buffer - * @readpos: The next position to read in the buffer. */ struct seq_buf { char *buffer; size_t size; size_t len; - loff_t readpos; }; static inline void seq_buf_clear(struct seq_buf *s) { s->len = 0; - s->readpos = 0; } static inline void @@ -143,7 +140,7 @@ extern __printf(2, 0) int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args); extern int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s); extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, - int cnt); + size_t start, int cnt); extern int seq_buf_puts(struct seq_buf *s, const char *str); extern int seq_buf_putc(struct seq_buf *s, unsigned char c); extern int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len); diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 6be92bf559fe..3691e0e76a1a 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -14,6 +14,7 @@ struct trace_seq { char buffer[PAGE_SIZE]; struct seq_buf seq; + size_t readpos; int full; }; @@ -22,6 +23,7 @@ trace_seq_init(struct trace_seq *s) { seq_buf_init(&s->seq, s->buffer, PAGE_SIZE); s->full = 0; + s->readpos = 0; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4383be8fa1b0..d629065c2383 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1731,15 +1731,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - if (trace_seq_used(s) <= s->seq.readpos) + if (trace_seq_used(s) <= s->readpos) return -EBUSY; - len = trace_seq_used(s) - s->seq.readpos; + len = trace_seq_used(s) - s->readpos; if (cnt > len) cnt = len; - memcpy(buf, s->buffer + s->seq.readpos, cnt); + memcpy(buf, s->buffer + s->readpos, cnt); - s->seq.readpos += cnt; + s->readpos += cnt; return cnt; } @@ -7008,7 +7008,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) + if (iter->seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index bac06ee3b98b..7be97229ddf8 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -370,8 +370,12 @@ EXPORT_SYMBOL_GPL(trace_seq_path); */ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) { + int ret; __trace_seq_init(s); - return seq_buf_to_user(&s->seq, ubuf, cnt); + ret = seq_buf_to_user(&s->seq, ubuf, s->readpos, cnt); + if (ret > 0) + s->readpos += ret; + return ret; } EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 45c450f423fa..b7477aefff53 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -324,23 +324,24 @@ int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc) * seq_buf_to_user - copy the sequence buffer to user space * @s: seq_buf descriptor * @ubuf: The userspace memory location to copy to + * @start: The first byte in the buffer to copy * @cnt: The amount to copy * * Copies the sequence buffer into the userspace memory pointed to - * by @ubuf. It starts from the last read position (@s->readpos) - * and writes up to @cnt characters or till it reaches the end of - * the content in the buffer (@s->len), which ever comes first. + * by @ubuf. It starts from @start and writes up to @cnt characters + * or until it reaches the end of the content in the buffer (@s->len), + * whichever comes first. * * On success, it returns a positive number of the number of bytes * it copied. * * On failure it returns -EBUSY if all of the content in the * sequence has been already read, which includes nothing in the - * sequence (@s->len == @s->readpos). + * sequence (@s->len == @start). * * Returns -EFAULT if the copy to userspace fails. */ -int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt) +int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, size_t start, int cnt) { int len; int ret; @@ -350,20 +351,17 @@ int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt) len = seq_buf_used(s); - if (len <= s->readpos) + if (len <= start) return -EBUSY; - len -= s->readpos; + len -= start; if (cnt > len) cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); + ret = copy_to_user(ubuf, s->buffer + start, cnt); if (ret == cnt) return -EFAULT; - cnt -= ret; - - s->readpos += cnt; - return cnt; + return cnt - ret; } /** From 845e31e1101fc8533be52aff42d8f1ff48636024 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 20 Oct 2023 14:38:49 -0600 Subject: [PATCH 15/34] seq_buf: fix a misleading comment The comment for seq_buf_has_overflowed() says that an overflow condition is marked by len == size, but that's not what the code is testing. Make the comment match reality. Link: https://lkml.kernel.org/r/87pm19kp0m.fsf@meer.lwn.net Fixes: 8cd709ae7658a ("tracing: Have seq_buf use full buffer") Signed-off-by: Jonathan Corbet Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index a0fb013cebdf..8483e4b2d0d2 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -36,7 +36,7 @@ seq_buf_init(struct seq_buf *s, char *buf, unsigned int size) /* * seq_buf have a buffer that might overflow. When this happens - * the len and size are set to be equal. + * len is set to be greater than size. */ static inline bool seq_buf_has_overflowed(struct seq_buf *s) From 545db7e21e64766e6b7cb987fbfd3e79419726ce Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 21 Oct 2023 16:42:41 +0200 Subject: [PATCH 16/34] tracing/histograms: Simplify last_cmd_set() Turn a kzalloc()+strcpy()+strncat() into an equivalent and less verbose kasprintf(). Link: https://lore.kernel.org/linux-trace-kernel/30b6fb04dadc10a03cc1ad08f5d8a93ef623a167.1697899346.git.christophe.jaillet@wanadoo.fr Cc: Masami Hiramatsu Signed-off-by: Christophe JAILLET Reviewed-by: Mukesh ojha Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d06938ae0717..1abc07fba1b9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -774,23 +774,16 @@ static void last_cmd_set(struct trace_event_file *file, char *str) { const char *system = NULL, *name = NULL; struct trace_event_call *call; - int len; if (!str) return; - /* sizeof() contains the nul byte */ - len = sizeof(HIST_PREFIX) + strlen(str); kfree(last_cmd); - last_cmd = kzalloc(len, GFP_KERNEL); + + last_cmd = kasprintf(GFP_KERNEL, HIST_PREFIX "%s", str); if (!last_cmd) return; - strcpy(last_cmd, HIST_PREFIX); - /* Again, sizeof() contains the nul byte */ - len -= sizeof(HIST_PREFIX); - strncat(last_cmd, str, len); - if (file) { call = file->event_call; system = call->class->system; From 0f7f544af60a6082cfaa3ed4c8f4ca1a858807ee Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 24 Oct 2023 15:55:59 +0100 Subject: [PATCH 17/34] powerpc: Remove initialisation of readpos While powerpc doesn't use the seq_buf readpos, it did explicitly initialise it for no good reason. Link: https://lore.kernel.org/linux-trace-kernel/20231024145600.739451-1-willy@infradead.org Cc: Christoph Hellwig Cc: Justin Stitt Cc: Kent Overstreet Cc: Petr Mladek Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Cc: Michael Ellerman Reviewed-by: Kees Cook Fixes: d0ed46b60396 ("tracing: Move readpos from seq_buf to trace_seq") Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Steven Rostedt (Google) --- arch/powerpc/kernel/setup-common.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2f1026fba00d..34975532e44c 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -601,7 +601,6 @@ struct seq_buf ppc_hw_desc __initdata = { .buffer = ppc_hw_desc_buf, .size = sizeof(ppc_hw_desc_buf), .len = 0, - .readpos = 0, }; static __init void probe_machine(void) From a9de4eb15ad430fe45747c211e367da745a90093 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 24 Oct 2023 12:36:28 -0400 Subject: [PATCH 18/34] eventfs: Fix WARN_ON() in create_file_dentry() As the comment right above a WARN_ON() in create_file_dentry() states: * Note, with the mutex held, the e_dentry cannot have content * and the ei->is_freed be true at the same time. But the WARN_ON() only has: WARN_ON_ONCE(ei->is_free); Where to match the comment (and what it should actually do) is: dentry = *e_dentry; WARN_ON_ONCE(dentry && ei->is_free) Also in that case, set dentry to NULL (although it should never happen). Link: https://lore.kernel.org/linux-trace-kernel/20231024123628.62b88755@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 09ab93357957..4d2da7480e5f 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -264,8 +264,9 @@ create_file_dentry(struct eventfs_inode *ei, struct dentry **e_dentry, * Note, with the mutex held, the e_dentry cannot have content * and the ei->is_freed be true at the same time. */ - WARN_ON_ONCE(ei->is_freed); dentry = *e_dentry; + if (WARN_ON_ONCE(dentry && ei->is_freed)) + dentry = NULL; /* The lookup does not need to up the dentry refcount */ if (dentry && !lookup) dget(dentry); From 29e06c10702e81a7d0b75020ca514d2f2962704a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 24 Oct 2023 13:10:24 -0400 Subject: [PATCH 19/34] eventfs: Fix typo in eventfs_inode union comment It's eventfs_inode not eventfs_indoe. There's no deer involved! Link: https://lore.kernel.org/linux-trace-kernel/20231024131024.5634c743@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 298d3ecaf621..64fde9490f52 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -37,7 +37,7 @@ struct eventfs_inode { /* * Union - used for deletion * @del_list: list of eventfs_inode to delete - * @rcu: eventfs_indoe to delete in RCU + * @rcu: eventfs_inode to delete in RCU * @is_freed: node is freed if one of the above is set */ union { From dcc4e5728eeaeda84878ca0018758cff1abfca21 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Oct 2023 08:56:38 -0700 Subject: [PATCH 20/34] seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str() Solve two ergonomic issues with struct seq_buf; 1) Too much boilerplate is required to initialize: struct seq_buf s; char buf[32]; seq_buf_init(s, buf, sizeof(buf)); Instead, we can build this directly on the stack. Provide DECLARE_SEQ_BUF() macro to do this: DECLARE_SEQ_BUF(s, 32); 2) %NUL termination is fragile and requires 2 steps to get a valid C String (and is a layering violation exposing the "internals" of seq_buf): seq_buf_terminate(s); do_something(s->buffer); Instead, we can just return s->buffer directly after terminating it in the refactored seq_buf_terminate(), now known as seq_buf_str(): do_something(seq_buf_str(s)); Link: https://lore.kernel.org/linux-trace-kernel/20231027155634.make.260-kees@kernel.org Link: https://lore.kernel.org/linux-trace-kernel/20231026194033.it.702-kees@kernel.org/ Cc: Yosry Ahmed Cc: "Matthew Wilcox (Oracle)" Cc: Christoph Hellwig Cc: Justin Stitt Cc: Kent Overstreet Cc: Petr Mladek Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Cc: Masami Hiramatsu Cc: Greg Kroah-Hartman Cc: Arnd Bergmann Cc: Jonathan Corbet Cc: Yun Zhou Cc: Jacob Keller Cc: Zhen Lei Signed-off-by: Kees Cook Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 21 +++++++++++++++++---- kernel/trace/trace.c | 11 +---------- lib/seq_buf.c | 4 +--- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 8483e4b2d0d2..5fb1f12c33f9 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -21,9 +21,18 @@ struct seq_buf { size_t len; }; +#define DECLARE_SEQ_BUF(NAME, SIZE) \ + char __ ## NAME ## _buffer[SIZE] = ""; \ + struct seq_buf NAME = { \ + .buffer = &__ ## NAME ## _buffer, \ + .size = SIZE, \ + } + static inline void seq_buf_clear(struct seq_buf *s) { s->len = 0; + if (s->size) + s->buffer[0] = '\0'; } static inline void @@ -69,8 +78,8 @@ static inline unsigned int seq_buf_used(struct seq_buf *s) } /** - * seq_buf_terminate - Make sure buffer is nul terminated - * @s: the seq_buf descriptor to terminate. + * seq_buf_str - get %NUL-terminated C string from seq_buf + * @s: the seq_buf handle * * This makes sure that the buffer in @s is nul terminated and * safe to read as a string. @@ -81,16 +90,20 @@ static inline unsigned int seq_buf_used(struct seq_buf *s) * * After this function is called, s->buffer is safe to use * in string operations. + * + * Returns @s->buf after making sure it is terminated. */ -static inline void seq_buf_terminate(struct seq_buf *s) +static inline const char *seq_buf_str(struct seq_buf *s) { if (WARN_ON(s->size == 0)) - return; + return ""; if (seq_buf_buffer_left(s)) s->buffer[s->len] = 0; else s->buffer[s->size - 1] = 0; + + return s->buffer; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d629065c2383..2539cfc20a97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3828,15 +3828,6 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static const char *show_buffer(struct trace_seq *s) -{ - struct seq_buf *seq = &s->seq; - - seq_buf_terminate(seq); - - return seq->buffer; -} - static DEFINE_STATIC_KEY_FALSE(trace_no_verify); static int test_can_verify_check(const char *fmt, ...) @@ -3976,7 +3967,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, */ if (WARN_ONCE(!trace_safe_str(iter, str, star, len), "fmt: '%s' current_buffer: '%s'", - fmt, show_buffer(&iter->seq))) { + fmt, seq_buf_str(&iter->seq.seq))) { int ret; /* Try to safely read the string */ diff --git a/lib/seq_buf.c b/lib/seq_buf.c index b7477aefff53..23518f77ea9c 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -109,9 +109,7 @@ void seq_buf_do_printk(struct seq_buf *s, const char *lvl) if (s->size == 0 || s->len == 0) return; - seq_buf_terminate(s); - - start = s->buffer; + start = seq_buf_str(s); while ((lf = strchr(start, '\n'))) { int len = lf - start + 1; From bb32500fb9b78215e4ef6ee8b4345c5f5d7eafb4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 31 Oct 2023 12:24:53 -0400 Subject: [PATCH 21/34] tracing: Have trace_event_file have ref counters The following can crash the kernel: # cd /sys/kernel/tracing # echo 'p:sched schedule' > kprobe_events # exec 5>>events/kprobes/sched/enable # > kprobe_events # exec 5>&- The above commands: 1. Change directory to the tracefs directory 2. Create a kprobe event (doesn't matter what one) 3. Open bash file descriptor 5 on the enable file of the kprobe event 4. Delete the kprobe event (removes the files too) 5. Close the bash file descriptor 5 The above causes a crash! BUG: kernel NULL pointer dereference, address: 0000000000000028 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 6 PID: 877 Comm: bash Not tainted 6.5.0-rc4-test-00008-g2c6b6b1029d4-dirty #186 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 RIP: 0010:tracing_release_file_tr+0xc/0x50 What happens here is that the kprobe event creates a trace_event_file "file" descriptor that represents the file in tracefs to the event. It maintains state of the event (is it enabled for the given instance?). Opening the "enable" file gets a reference to the event "file" descriptor via the open file descriptor. When the kprobe event is deleted, the file is also deleted from the tracefs system which also frees the event "file" descriptor. But as the tracefs file is still opened by user space, it will not be totally removed until the final dput() is called on it. But this is not true with the event "file" descriptor that is already freed. If the user does a write to or simply closes the file descriptor it will reference the event "file" descriptor that was just freed, causing a use-after-free bug. To solve this, add a ref count to the event "file" descriptor as well as a new flag called "FREED". The "file" will not be freed until the last reference is released. But the FREE flag will be set when the event is removed to prevent any more modifications to that event from happening, even if there's still a reference to the event "file" descriptor. Link: https://lore.kernel.org/linux-trace-kernel/20231031000031.1e705592@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231031122453.7a48b923@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Fixes: f5ca233e2e66d ("tracing: Increase trace array ref count on enable and filter files") Reported-by: Beau Belgrave Tested-by: Beau Belgrave Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 4 ++++ kernel/trace/trace.c | 15 +++++++++++++++ kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 31 ++++++++++++++++++++++++++---- kernel/trace/trace_events_filter.c | 3 +++ 5 files changed, 52 insertions(+), 4 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 12207dc6722d..696f8dc4aa53 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -492,6 +492,7 @@ enum { EVENT_FILE_FL_TRIGGER_COND_BIT, EVENT_FILE_FL_PID_FILTER_BIT, EVENT_FILE_FL_WAS_ENABLED_BIT, + EVENT_FILE_FL_FREED_BIT, }; extern struct trace_event_file *trace_get_event_file(const char *instance, @@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...); * TRIGGER_COND - When set, one or more triggers has an associated filter * PID_FILTER - When set, the event is filtered based on pid * WAS_ENABLED - Set when enabled to know to clear trace on module removal + * FREED - File descriptor is freed, all fields should be considered invalid */ enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), @@ -643,6 +645,7 @@ enum { EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT), EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT), + EVENT_FILE_FL_FREED = (1 << EVENT_FILE_FL_FREED_BIT), }; struct trace_event_file { @@ -671,6 +674,7 @@ struct trace_event_file { * caching and such. Which is mostly OK ;-) */ unsigned long flags; + atomic_t ref; /* ref count for opened files */ atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2539cfc20a97..9aebf904ff97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4978,6 +4978,20 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret; + mutex_lock(&event_mutex); + + /* Fail if the file is marked for removal */ + if (file->flags & EVENT_FILE_FL_FREED) { + trace_array_put(file->tr); + ret = -ENODEV; + } else { + event_file_get(file); + } + + mutex_unlock(&event_mutex); + if (ret) + return ret; + filp->private_data = inode->i_private; return 0; @@ -4988,6 +5002,7 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp) struct trace_event_file *file = inode->i_private; trace_array_put(file->tr); + event_file_put(file); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0e1405abf4f7..b7f4ea25a194 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1669,6 +1669,9 @@ extern void event_trigger_unregister(struct event_command *cmd_ops, char *glob, struct event_trigger_data *trigger_data); +extern void event_file_get(struct trace_event_file *file); +extern void event_file_put(struct trace_event_file *file); + /** * struct event_trigger_ops - callbacks for trace event triggers * diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f9e3e24d8796..f29e815ca5b2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -990,13 +990,35 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) } } +void event_file_get(struct trace_event_file *file) +{ + atomic_inc(&file->ref); +} + +void event_file_put(struct trace_event_file *file) +{ + if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (file->flags & EVENT_FILE_FL_FREED) + kmem_cache_free(file_cachep, file); + return; + } + + if (atomic_dec_and_test(&file->ref)) { + /* Count should only go to zero when it is freed */ + if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) + return; + kmem_cache_free(file_cachep, file); + } +} + static void remove_event_file_dir(struct trace_event_file *file) { eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); - kmem_cache_free(file_cachep, file); + file->flags |= EVENT_FILE_FL_FREED; + event_file_put(file); } /* @@ -1369,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, flags = file->flags; mutex_unlock(&event_mutex); - if (!file) + if (!file || flags & EVENT_FILE_FL_FREED) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && @@ -1403,7 +1425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); - if (likely(file)) { + if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { ret = tracing_update_buffers(file->tr); if (ret < 0) { mutex_unlock(&event_mutex); @@ -1683,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); file = event_file_data(filp); - if (file) + if (file && !(file->flags & EVENT_FILE_FL_FREED)) print_event_filter(file, s); mutex_unlock(&event_mutex); @@ -2902,6 +2924,7 @@ trace_create_new_event(struct trace_event_call *call, atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); + event_file_get(file); return file; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 33264e510d16..0c611b281a5b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) struct event_filter *filter = NULL; int err; + if (file->flags & EVENT_FILE_FL_FREED) + return -ENODEV; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable(file); filter = event_filter(file); From 77bc4d4921bd3497678ba8e7f4e480de35692f05 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 31 Oct 2023 12:42:29 -0400 Subject: [PATCH 22/34] eventfs: Remove extra dget() in eventfs_create_events_dir() The creation of the top events directory does a dget() at the end of the creation in eventfs_create_events_dir() with a comment saying the final dput() will happen when it is removed. The problem is that a dget() is already done on the dentry when it was created with tracefs_start_creating()! The dget() now just causes a memory leak of that dentry. Remove the extra dget() as the final dput() in the deletion of the events directory actually matches the one in tracefs_start_creating(). Link: https://lore.kernel.org/linux-trace-kernel/20231031124229.4f2e3fa1@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 4d2da7480e5f..5536860eb2ff 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -774,9 +774,6 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry fsnotify_mkdir(dentry->d_parent->d_inode, dentry); tracefs_end_creating(dentry); - /* Will call dput when the directory is removed */ - dget(dentry); - return ei; fail: From 4f7969bcd6d33042d62e249b41b5578161e4c868 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 31 Oct 2023 15:10:33 -0400 Subject: [PATCH 23/34] tracing: Have the user copy of synthetic event address use correct context A synthetic event is created by the synthetic event interface that can read both user or kernel address memory. In reality, it reads any arbitrary memory location from within the kernel. If the address space is in USER (where CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE is set) then it uses strncpy_from_user_nofault() to copy strings otherwise it uses strncpy_from_kernel_nofault(). But since both functions use the same variable there's no annotation to what that variable is (ie. __user). This makes sparse complain. Quiet sparse by typecasting the strncpy_from_user_nofault() variable to a __user pointer. Link: https://lore.kernel.org/linux-trace-kernel/20231031151033.73c42e23@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 0934ae9977c2 ("tracing: Fix reading strings from synthetic events"); Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311010013.fm8WTxa5-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 14cb275a0bab..846e02c0fb59 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -452,7 +452,7 @@ static unsigned int trace_string(struct synth_trace_event *entry, #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)str_val < TASK_SIZE) - ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX); + ret = strncpy_from_user_nofault(str_field, (const void __user *)str_val, STR_VAR_LEN_MAX); else #endif ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX); From 9037caa09ed345b35325200f0e4acf5a94ae0a65 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 30 Oct 2023 12:15:23 -0400 Subject: [PATCH 24/34] eventfs: Fix kerneldoc of eventfs_remove_rec() The eventfs_remove_rec() had some missing parameters in the kerneldoc comment above it. Also, rephrase the description a bit more to have a bit more correct grammar. Link: https://lore.kernel.org/linux-trace-kernel/20231030121523.0b2225a7@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode"); Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310052216.4SgqasWo-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 5536860eb2ff..9f612a8f009d 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -796,9 +796,11 @@ static void free_ei(struct rcu_head *head) /** * eventfs_remove_rec - remove eventfs dir or file from list * @ei: eventfs_inode to be removed. + * @head: the list head to place the deleted @ei and children + * @level: prevent recursion from going more than 3 levels deep. * - * This function recursively remove eventfs_inode which - * contains info of file or dir. + * This function recursively removes eventfs_inodes which + * contains info of files and/or directories. */ static void eventfs_remove_rec(struct eventfs_inode *ei, struct list_head *head, int level) { From f2f496370afcbc5227d7002da28c74b91fed12ff Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Nov 2023 13:25:42 -0400 Subject: [PATCH 25/34] eventfs: Remove "is_freed" union with rcu head The eventfs_inode->is_freed was a union with the rcu_head with the assumption that when it was on the srcu list the head would contain a pointer which would make "is_freed" true. But that was a wrong assumption as the rcu head is a single link list where the last element is NULL. Instead, split the nr_entries integer so that "is_freed" is one bit and the nr_entries is the next 31 bits. As there shouldn't be more than 10 (currently there's at most 5 to 7 depending on the config), this should not be a problem. Link: https://lkml.kernel.org/r/20231101172649.049758712@goodmis.org Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Fixes: 63940449555e7 ("eventfs: Implement eventfs lookup, read, open functions") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 2 ++ fs/tracefs/internal.h | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 9f612a8f009d..1ce73acf3df0 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -824,6 +824,8 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, struct list_head *head, eventfs_remove_rec(ei_child, head, level + 1); } + ei->is_freed = 1; + list_del_rcu(&ei->list); list_add_tail(&ei->del_list, head); } diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 64fde9490f52..c7d88aaa949f 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -23,6 +23,7 @@ struct tracefs_inode { * @d_parent: pointer to the parent's dentry * @d_children: The array of dentries to represent the files when created * @data: The private data to pass to the callbacks + * @is_freed: Flag set if the eventfs is on its way to be freed * @nr_entries: The number of items in @entries */ struct eventfs_inode { @@ -38,14 +39,13 @@ struct eventfs_inode { * Union - used for deletion * @del_list: list of eventfs_inode to delete * @rcu: eventfs_inode to delete in RCU - * @is_freed: node is freed if one of the above is set */ union { struct list_head del_list; struct rcu_head rcu; - unsigned long is_freed; }; - int nr_entries; + unsigned int is_freed:1; + unsigned int nr_entries:31; }; static inline struct tracefs_inode *get_tracefs(const struct inode *inode) From db3a397209b00d2e4e0a068608e5c546fc064b82 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Nov 2023 13:25:43 -0400 Subject: [PATCH 26/34] eventfs: Have a free_ei() that just frees the eventfs_inode As the eventfs_inode is freed in two different locations, make a helper function free_ei() to make sure all the allocated fields of the eventfs_inode is freed. This requires renaming the existing free_ei() which is called by the srcu handler to free_rcu_ei() and have free_ei() just do the freeing, where free_rcu_ei() will call it. Link: https://lkml.kernel.org/r/20231101172649.265214087@goodmis.org Cc: Ajay Kaher Cc: Mark Rutland Cc: Andrew Morton Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 1ce73acf3df0..dd5971855732 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -129,6 +129,13 @@ static struct dentry *create_dir(const char *name, struct dentry *parent) return eventfs_end_creating(dentry); } +static void free_ei(struct eventfs_inode *ei) +{ + kfree_const(ei->name); + kfree(ei->d_children); + kfree(ei); +} + /** * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode * @ti: the tracefs_inode of the dentry @@ -168,9 +175,7 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) eventfs_remove_dir(ei_child); } - kfree_const(ei->name); - kfree(ei->d_children); - kfree(ei); + free_ei(ei); return; } @@ -784,13 +789,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry return ERR_PTR(-ENOMEM); } -static void free_ei(struct rcu_head *head) +static void free_rcu_ei(struct rcu_head *head) { struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu); - kfree_const(ei->name); - kfree(ei->d_children); - kfree(ei); + free_ei(ei); } /** @@ -881,7 +884,7 @@ void eventfs_remove_dir(struct eventfs_inode *ei) for (i = 0; i < ei->nr_entries; i++) unhook_dentry(&ei->d_children[i], &dentry_list); unhook_dentry(&ei->dentry, &dentry_list); - call_srcu(&eventfs_srcu, &ei->rcu, free_ei); + call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); } mutex_unlock(&eventfs_mutex); From 77a06c33a22d13f3a6e31f06f6ee6bca666e6898 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Nov 2023 13:25:44 -0400 Subject: [PATCH 27/34] eventfs: Test for ei->is_freed when accessing ei->dentry The eventfs_inode (ei) is protected by SRCU, but the ei->dentry is not. It is protected by the eventfs_mutex. Anytime the eventfs_mutex is released, and access to the ei->dentry needs to be done, it should first check if ei->is_freed is set under the eventfs_mutex. If it is, then the ei->dentry is invalid and must not be used. The ei->dentry must only be accessed under the eventfs_mutex and after checking if ei->is_freed is set. When the ei is being freed, it will (under the eventfs_mutex) set is_freed and at the same time move the dentry to a free list to be cleared after the eventfs_mutex is released. This means that any access to the ei->dentry must check first if ei->is_freed is set, because if it is, then the dentry is on its way to be freed. Also add comments to describe this better. Link: https://lore.kernel.org/all/CA+G9fYt6pY+tMZEOg=SoEywQOe19fGP3uR15SGowkdK+_X85Cg@mail.gmail.com/ Link: https://lore.kernel.org/all/CA+G9fYuDP3hVQ3t7FfrBAjd_WFVSurMgCepTxunSJf=MTe=6aA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20231101172649.477608228@goodmis.org Cc: Ajay Kaher Cc: Mark Rutland Cc: Andrew Morton Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: Linux Kernel Functional Testing Reported-by: Naresh Kamboju Reported-by: Beau Belgrave Reviewed-by: Masami Hiramatsu (Google) Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Tested-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 45 ++++++++++++++++++++++++++++++++++------ fs/tracefs/internal.h | 3 ++- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index dd5971855732..e9625732c52d 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -24,7 +24,20 @@ #include #include "internal.h" +/* + * eventfs_mutex protects the eventfs_inode (ei) dentry. Any access + * to the ei->dentry must be done under this mutex and after checking + * if ei->is_freed is not set. The ei->dentry is released under the + * mutex at the same time ei->is_freed is set. If ei->is_freed is set + * then the ei->dentry is invalid. + */ static DEFINE_MUTEX(eventfs_mutex); + +/* + * The eventfs_inode (ei) itself is protected by SRCU. It is released from + * its parent's list and will have is_freed set (under eventfs_mutex). + * After the SRCU grace period is over, the ei may be freed. + */ DEFINE_STATIC_SRCU(eventfs_srcu); static struct dentry *eventfs_root_lookup(struct inode *dir, @@ -239,6 +252,10 @@ create_file_dentry(struct eventfs_inode *ei, struct dentry **e_dentry, bool invalidate = false; mutex_lock(&eventfs_mutex); + if (ei->is_freed) { + mutex_unlock(&eventfs_mutex); + return NULL; + } /* If the e_dentry already has a dentry, use it */ if (*e_dentry) { /* lookup does not need to up the ref count */ @@ -312,6 +329,8 @@ static void eventfs_post_create_dir(struct eventfs_inode *ei) struct eventfs_inode *ei_child; struct tracefs_inode *ti; + lockdep_assert_held(&eventfs_mutex); + /* srcu lock already held */ /* fill parent-child relation */ list_for_each_entry_srcu(ei_child, &ei->children, list, @@ -325,6 +344,7 @@ static void eventfs_post_create_dir(struct eventfs_inode *ei) /** * create_dir_dentry - Create a directory dentry for the eventfs_inode + * @pei: The eventfs_inode parent of ei. * @ei: The eventfs_inode to create the directory for * @parent: The dentry of the parent of this directory * @lookup: True if this is called by the lookup code @@ -332,12 +352,17 @@ static void eventfs_post_create_dir(struct eventfs_inode *ei) * This creates and attaches a directory dentry to the eventfs_inode @ei. */ static struct dentry * -create_dir_dentry(struct eventfs_inode *ei, struct dentry *parent, bool lookup) +create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, + struct dentry *parent, bool lookup) { bool invalidate = false; struct dentry *dentry = NULL; mutex_lock(&eventfs_mutex); + if (pei->is_freed || ei->is_freed) { + mutex_unlock(&eventfs_mutex); + return NULL; + } if (ei->dentry) { /* If the dentry already has a dentry, use it */ dentry = ei->dentry; @@ -440,7 +465,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, */ mutex_lock(&eventfs_mutex); ei = READ_ONCE(ti->private); - if (ei) + if (ei && !ei->is_freed) ei_dentry = READ_ONCE(ei->dentry); mutex_unlock(&eventfs_mutex); @@ -454,7 +479,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, if (strcmp(ei_child->name, name) != 0) continue; ret = simple_lookup(dir, dentry, flags); - create_dir_dentry(ei_child, ei_dentry, true); + create_dir_dentry(ei, ei_child, ei_dentry, true); created = true; break; } @@ -588,7 +613,7 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { - d = create_dir_dentry(ei_child, parent, false); + d = create_dir_dentry(ei, ei_child, parent, false); if (d) { ret = add_dentries(&dentries, d, cnt); if (ret < 0) @@ -705,12 +730,20 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode ei->nr_entries = size; ei->data = data; INIT_LIST_HEAD(&ei->children); + INIT_LIST_HEAD(&ei->list); mutex_lock(&eventfs_mutex); - list_add_tail(&ei->list, &parent->children); - ei->d_parent = parent->dentry; + if (!parent->is_freed) { + list_add_tail(&ei->list, &parent->children); + ei->d_parent = parent->dentry; + } mutex_unlock(&eventfs_mutex); + /* Was the parent freed? */ + if (list_empty(&ei->list)) { + free_ei(ei); + ei = NULL; + } return ei; } diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index c7d88aaa949f..5a98e87dd3d1 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -24,6 +24,7 @@ struct tracefs_inode { * @d_children: The array of dentries to represent the files when created * @data: The private data to pass to the callbacks * @is_freed: Flag set if the eventfs is on its way to be freed + * Note if is_freed is set, then dentry is corrupted. * @nr_entries: The number of items in @entries */ struct eventfs_inode { @@ -31,7 +32,7 @@ struct eventfs_inode { const struct eventfs_entry *entries; const char *name; struct list_head children; - struct dentry *dentry; + struct dentry *dentry; /* Check is_freed to access */ struct dentry *d_parent; struct dentry **d_children; void *data; From 28e12c09f5aa081b2d13d1340e3610070b6c624d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Nov 2023 13:25:45 -0400 Subject: [PATCH 28/34] eventfs: Save ownership and mode Now that inodes and dentries are created on the fly, they are also reclaimed on memory pressure. Since the ownership and file mode are saved in the inode, if they are freed, any changes to the ownership and mode will be lost. To counter this, if the user changes the permissions or ownership, save them, and when creating the inodes again, restore those changes. Link: https://lkml.kernel.org/r/20231101172649.691841445@goodmis.org Cc: stable@vger.kernel.org Cc: Ajay Kaher Cc: Mark Rutland Cc: Andrew Morton Fixes: 63940449555e7 ("eventfs: Implement eventfs lookup, read, open functions") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 148 +++++++++++++++++++++++++++++++++++---- fs/tracefs/internal.h | 16 +++++ 2 files changed, 151 insertions(+), 13 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index e9625732c52d..93d08e552483 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -40,6 +40,15 @@ static DEFINE_MUTEX(eventfs_mutex); */ DEFINE_STATIC_SRCU(eventfs_srcu); +/* Mode is unsigned short, use the upper bits for flags */ +enum { + EVENTFS_SAVE_MODE = BIT(16), + EVENTFS_SAVE_UID = BIT(17), + EVENTFS_SAVE_GID = BIT(18), +}; + +#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1) + static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); @@ -47,8 +56,89 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file); static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx); static int eventfs_release(struct inode *inode, struct file *file); +static void update_attr(struct eventfs_attr *attr, struct iattr *iattr) +{ + unsigned int ia_valid = iattr->ia_valid; + + if (ia_valid & ATTR_MODE) { + attr->mode = (attr->mode & ~EVENTFS_MODE_MASK) | + (iattr->ia_mode & EVENTFS_MODE_MASK) | + EVENTFS_SAVE_MODE; + } + if (ia_valid & ATTR_UID) { + attr->mode |= EVENTFS_SAVE_UID; + attr->uid = iattr->ia_uid; + } + if (ia_valid & ATTR_GID) { + attr->mode |= EVENTFS_SAVE_GID; + attr->gid = iattr->ia_gid; + } +} + +static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *iattr) +{ + const struct eventfs_entry *entry; + struct eventfs_inode *ei; + const char *name; + int ret; + + mutex_lock(&eventfs_mutex); + ei = dentry->d_fsdata; + /* The LSB is set when the eventfs_inode is being freed */ + if (((unsigned long)ei & 1UL) || ei->is_freed) { + /* Do not allow changes if the event is about to be removed. */ + mutex_unlock(&eventfs_mutex); + return -ENODEV; + } + + /* Preallocate the children mode array if necessary */ + if (!(dentry->d_inode->i_mode & S_IFDIR)) { + if (!ei->entry_attrs) { + ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries, + GFP_KERNEL); + if (!ei->entry_attrs) { + ret = -ENOMEM; + goto out; + } + } + } + + ret = simple_setattr(idmap, dentry, iattr); + if (ret < 0) + goto out; + + /* + * If this is a dir, then update the ei cache, only the file + * mode is saved in the ei->m_children, and the ownership is + * determined by the parent directory. + */ + if (dentry->d_inode->i_mode & S_IFDIR) { + update_attr(&ei->attr, iattr); + + } else { + name = dentry->d_name.name; + + for (int i = 0; i < ei->nr_entries; i++) { + entry = &ei->entries[i]; + if (strcmp(name, entry->name) == 0) { + update_attr(&ei->entry_attrs[i], iattr); + break; + } + } + } + out: + mutex_unlock(&eventfs_mutex); + return ret; +} + static const struct inode_operations eventfs_root_dir_inode_operations = { .lookup = eventfs_root_lookup, + .setattr = eventfs_set_attr, +}; + +static const struct inode_operations eventfs_file_inode_operations = { + .setattr = eventfs_set_attr, }; static const struct file_operations eventfs_file_operations = { @@ -59,10 +149,30 @@ static const struct file_operations eventfs_file_operations = { .release = eventfs_release, }; +static void update_inode_attr(struct inode *inode, struct eventfs_attr *attr, umode_t mode) +{ + if (!attr) { + inode->i_mode = mode; + return; + } + + if (attr->mode & EVENTFS_SAVE_MODE) + inode->i_mode = attr->mode & EVENTFS_MODE_MASK; + else + inode->i_mode = mode; + + if (attr->mode & EVENTFS_SAVE_UID) + inode->i_uid = attr->uid; + + if (attr->mode & EVENTFS_SAVE_GID) + inode->i_gid = attr->gid; +} + /** * create_file - create a file in the tracefs filesystem * @name: the name of the file to create. * @mode: the permission that the file should have. + * @attr: saved attributes changed by user * @parent: parent dentry for this file. * @data: something that the caller will want to get to later on. * @fop: struct file_operations that should be used for this file. @@ -72,6 +182,7 @@ static const struct file_operations eventfs_file_operations = { * call. */ static struct dentry *create_file(const char *name, umode_t mode, + struct eventfs_attr *attr, struct dentry *parent, void *data, const struct file_operations *fop) { @@ -95,7 +206,10 @@ static struct dentry *create_file(const char *name, umode_t mode, if (unlikely(!inode)) return eventfs_failed_creating(dentry); - inode->i_mode = mode; + /* If the user updated the directory's attributes, use them */ + update_inode_attr(inode, attr, mode); + + inode->i_op = &eventfs_file_inode_operations; inode->i_fop = fop; inode->i_private = data; @@ -108,19 +222,19 @@ static struct dentry *create_file(const char *name, umode_t mode, /** * create_dir - create a dir in the tracefs filesystem - * @name: the name of the file to create. + * @ei: the eventfs_inode that represents the directory to create * @parent: parent dentry for this file. * * This function will create a dentry for a directory represented by * a eventfs_inode. */ -static struct dentry *create_dir(const char *name, struct dentry *parent) +static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent) { struct tracefs_inode *ti; struct dentry *dentry; struct inode *inode; - dentry = eventfs_start_creating(name, parent); + dentry = eventfs_start_creating(ei->name, parent); if (IS_ERR(dentry)) return dentry; @@ -128,7 +242,9 @@ static struct dentry *create_dir(const char *name, struct dentry *parent) if (unlikely(!inode)) return eventfs_failed_creating(dentry); - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + /* If the user updated the directory's attributes, use them */ + update_inode_attr(inode, &ei->attr, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); + inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; @@ -146,6 +262,7 @@ static void free_ei(struct eventfs_inode *ei) { kfree_const(ei->name); kfree(ei->d_children); + kfree(ei->entry_attrs); kfree(ei); } @@ -231,7 +348,7 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) /** * create_file_dentry - create a dentry for a file of an eventfs_inode * @ei: the eventfs_inode that the file will be created under - * @e_dentry: a pointer to the d_children[] of the @ei + * @idx: the index into the d_children[] of the @ei * @parent: The parent dentry of the created file. * @name: The name of the file to create * @mode: The mode of the file. @@ -244,10 +361,12 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) * just do a dget() on it and return. Otherwise create the dentry and attach it. */ static struct dentry * -create_file_dentry(struct eventfs_inode *ei, struct dentry **e_dentry, +create_file_dentry(struct eventfs_inode *ei, int idx, struct dentry *parent, const char *name, umode_t mode, void *data, const struct file_operations *fops, bool lookup) { + struct eventfs_attr *attr = NULL; + struct dentry **e_dentry = &ei->d_children[idx]; struct dentry *dentry; bool invalidate = false; @@ -264,13 +383,18 @@ create_file_dentry(struct eventfs_inode *ei, struct dentry **e_dentry, mutex_unlock(&eventfs_mutex); return *e_dentry; } + + /* ei->entry_attrs are protected by SRCU */ + if (ei->entry_attrs) + attr = &ei->entry_attrs[idx]; + mutex_unlock(&eventfs_mutex); /* The lookup already has the parent->d_inode locked */ if (!lookup) inode_lock(parent->d_inode); - dentry = create_file(name, mode, parent, data, fops); + dentry = create_file(name, mode, attr, parent, data, fops); if (!lookup) inode_unlock(parent->d_inode); @@ -378,7 +502,7 @@ create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, if (!lookup) inode_lock(parent->d_inode); - dentry = create_dir(ei->name, parent); + dentry = create_dir(ei, parent); if (!lookup) inode_unlock(parent->d_inode); @@ -495,8 +619,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, if (r <= 0) continue; ret = simple_lookup(dir, dentry, flags); - create_file_dentry(ei, &ei->d_children[i], - ei_dentry, name, mode, cdata, + create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops, true); break; } @@ -629,8 +752,7 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) r = entry->callback(name, &mode, &cdata, &fops); if (r <= 0) continue; - d = create_file_dentry(ei, &ei->d_children[i], - parent, name, mode, cdata, fops, false); + d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false); if (d) { ret = add_dentries(&dentries, d, cnt); if (ret < 0) diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 5a98e87dd3d1..5f60bcd69289 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -13,6 +13,18 @@ struct tracefs_inode { struct inode vfs_inode; }; +/* + * struct eventfs_attr - cache the mode and ownership of a eventfs entry + * @mode: saved mode plus flags of what is saved + * @uid: saved uid if changed + * @gid: saved gid if changed + */ +struct eventfs_attr { + int mode; + kuid_t uid; + kgid_t gid; +}; + /* * struct eventfs_inode - hold the properties of the eventfs directories. * @list: link list into the parent directory @@ -22,6 +34,8 @@ struct tracefs_inode { * @dentry: the dentry of the directory * @d_parent: pointer to the parent's dentry * @d_children: The array of dentries to represent the files when created + * @entry_attrs: Saved mode and ownership of the @d_children + * @attr: Saved mode and ownership of eventfs_inode itself * @data: The private data to pass to the callbacks * @is_freed: Flag set if the eventfs is on its way to be freed * Note if is_freed is set, then dentry is corrupted. @@ -35,6 +49,8 @@ struct eventfs_inode { struct dentry *dentry; /* Check is_freed to access */ struct dentry *d_parent; struct dentry **d_children; + struct eventfs_attr *entry_attrs; + struct eventfs_attr attr; void *data; /* * Union - used for deletion From 44365329f8219fc379097c2c9a75ff53f123764f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Nov 2023 13:25:46 -0400 Subject: [PATCH 29/34] eventfs: Hold eventfs_mutex when calling callback functions The callback function that is used to create inodes and dentries is not protected by anything and the data that is passed to it could become stale. After eventfs_remove_dir() is called by the tracing system, it is free to remove the events that are associated to that directory. Unfortunately, that means the callbacks must not be called after that. CPU0 CPU1 ---- ---- eventfs_root_lookup() { eventfs_remove_dir() { mutex_lock(&event_mutex); ei->is_freed = set; mutex_unlock(&event_mutex); } kfree(event_call); for (...) { entry = &ei->entries[i]; r = entry->callback() { call = data; // call == event_call above if (call->flags ...) [ USE AFTER FREE BUG ] The safest way to protect this is to wrap the callback with: mutex_lock(&eventfs_mutex); if (!ei->is_freed) r = entry->callback(); else r = -1; mutex_unlock(&eventfs_mutex); This will make sure that the callback will not be called after it is freed. But now it needs to be known that the callback is called while holding internal eventfs locks, and that it must not call back into the eventfs / tracefs system. There's no reason it should anyway, but document that as well. Link: https://lore.kernel.org/all/CA+G9fYu9GOEbD=rR5eMR-=HJ8H6rMsbzDC2ZY5=Y50WpWAE7_Q@mail.gmail.com/ Link: https://lkml.kernel.org/r/20231101172649.906696613@goodmis.org Cc: Ajay Kaher Cc: Mark Rutland Cc: Andrew Morton Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: Linux Kernel Functional Testing Reported-by: Naresh Kamboju Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 22 ++++++++++++++++++-- include/linux/tracefs.h | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 93d08e552483..8ac9abf7a3d5 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -615,7 +615,13 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, entry = &ei->entries[i]; if (strcmp(name, entry->name) == 0) { void *cdata = data; - r = entry->callback(name, &mode, &cdata, &fops); + mutex_lock(&eventfs_mutex); + /* If ei->is_freed, then the event itself may be too */ + if (!ei->is_freed) + r = entry->callback(name, &mode, &cdata, &fops); + else + r = -1; + mutex_unlock(&eventfs_mutex); if (r <= 0) continue; ret = simple_lookup(dir, dentry, flags); @@ -749,7 +755,13 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) void *cdata = data; entry = &ei->entries[i]; name = entry->name; - r = entry->callback(name, &mode, &cdata, &fops); + mutex_lock(&eventfs_mutex); + /* If ei->is_freed, then the event itself may be too */ + if (!ei->is_freed) + r = entry->callback(name, &mode, &cdata, &fops); + else + r = -1; + mutex_unlock(&eventfs_mutex); if (r <= 0) continue; d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false); @@ -819,6 +831,10 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) * data = A pointer to @data, and the callback may replace it, which will * cause the file created to pass the new data to the open() call. * fops = the fops to use for the created file. + * + * NB. @callback is called while holding internal locks of the eventfs + * system. The callback must not call any code that might also call into + * the tracefs or eventfs system or it will risk creating a deadlock. */ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, const struct eventfs_entry *entries, @@ -878,6 +894,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode * @data: The default data to pass to the files (an entry may override it). * * This function creates the top of the trace event directory. + * + * See eventfs_create_dir() for use of @entries. */ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, const struct eventfs_entry *entries, diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 13359b1a35d1..7a5fe17b6bf9 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -23,9 +23,52 @@ struct file_operations; struct eventfs_file; +/** + * eventfs_callback - A callback function to create dynamic files in eventfs + * @name: The name of the file that is to be created + * @mode: return the file mode for the file (RW access, etc) + * @data: data to pass to the created file ops + * @fops: the file operations of the created file + * + * The evetnfs files are dynamically created. The struct eventfs_entry array + * is passed to eventfs_create_dir() or eventfs_create_events_dir() that will + * be used to create the files within those directories. When a lookup + * or access to a file within the directory is made, the struct eventfs_entry + * array is used to find a callback() with the matching name that is being + * referenced (for lookups, the entire array is iterated and each callback + * will be called). + * + * The callback will be called with @name for the name of the file to create. + * The callback can return less than 1 to indicate that no file should be + * created. + * + * If a file is to be created, then @mode should be populated with the file + * mode (permissions) for which the file is created for. This would be + * used to set the created inode i_mode field. + * + * The @data should be set to the data passed to the other file operations + * (read, write, etc). Note, @data will also point to the data passed in + * to eventfs_create_dir() or eventfs_create_events_dir(), but the callback + * can replace the data if it chooses to. Otherwise, the original data + * will be used for the file operation functions. + * + * The @fops should be set to the file operations that will be used to create + * the inode. + * + * NB. This callback is called while holding internal locks of the eventfs + * system. The callback must not call any code that might also call into + * the tracefs or eventfs system or it will risk creating a deadlock. + */ typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, const struct file_operations **fops); +/** + * struct eventfs_entry - dynamically created eventfs file call back handler + * @name: Then name of the dynamic file in an eventfs directory + * @callback: The callback to get the fops of the file when it is created + * + * See evenfs_callback() typedef for how to set up @callback. + */ struct eventfs_entry { const char *name; eventfs_callback callback; From 020010fbfa202aa528a52743eba4ab0da3400a4e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Nov 2023 13:25:47 -0400 Subject: [PATCH 30/34] eventfs: Delete eventfs_inode when the last dentry is freed There exists a race between holding a reference of an eventfs_inode dentry and the freeing of the eventfs_inode. If user space has a dentry held long enough, it may still be able to access the dentry's eventfs_inode after it has been freed. To prevent this, have he eventfs_inode freed via the last dput() (or via RCU if the eventfs_inode does not have a dentry). This means reintroducing the eventfs_inode del_list field at a temporary place to put the eventfs_inode. It needs to mark it as freed (via the list) but also must invalidate the dentry immediately as the return from eventfs_remove_dir() expects that they are. But the dentry invalidation must not be called under the eventfs_mutex, so it must be done after the eventfs_inode is marked as free (put on a deletion list). Link: https://lkml.kernel.org/r/20231101172650.123479767@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Fixes: 5bdcd5f5331a2 ("eventfs: Implement removal of meta data from eventfs") Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 148 ++++++++++++++++++--------------------- fs/tracefs/internal.h | 2 + 2 files changed, 70 insertions(+), 80 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 8ac9abf7a3d5..0a04ae0ca8c8 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -85,8 +85,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, mutex_lock(&eventfs_mutex); ei = dentry->d_fsdata; - /* The LSB is set when the eventfs_inode is being freed */ - if (((unsigned long)ei & 1UL) || ei->is_freed) { + if (ei->is_freed) { /* Do not allow changes if the event is about to be removed. */ mutex_unlock(&eventfs_mutex); return -ENODEV; @@ -276,35 +275,17 @@ static void free_ei(struct eventfs_inode *ei) void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) { struct tracefs_inode *ti_parent; - struct eventfs_inode *ei_child, *tmp; struct eventfs_inode *ei; int i; /* The top level events directory may be freed by this */ if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) { - LIST_HEAD(ef_del_list); - mutex_lock(&eventfs_mutex); - ei = ti->private; - - /* Record all the top level files */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - lockdep_is_held(&eventfs_mutex)) { - list_add_tail(&ei_child->del_list, &ef_del_list); - } - /* Nothing should access this, but just in case! */ ti->private = NULL; - mutex_unlock(&eventfs_mutex); - /* Now safely free the top level files and their children */ - list_for_each_entry_safe(ei_child, tmp, &ef_del_list, del_list) { - list_del(&ei_child->del_list); - eventfs_remove_dir(ei_child); - } - free_ei(ei); return; } @@ -319,14 +300,6 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) if (!ei) goto out; - /* - * If ei was freed, then the LSB bit is set for d_fsdata. - * But this should not happen, as it should still have a - * ref count that prevents it. Warn in case it does. - */ - if (WARN_ON_ONCE((unsigned long)ei & 1)) - goto out; - /* This could belong to one of the files of the ei */ if (ei->dentry != dentry) { for (i = 0; i < ei->nr_entries; i++) { @@ -336,6 +309,8 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) if (WARN_ON_ONCE(i == ei->nr_entries)) goto out; ei->d_children[i] = NULL; + } else if (ei->is_freed) { + free_ei(ei); } else { ei->dentry = NULL; } @@ -962,13 +937,65 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry return ERR_PTR(-ENOMEM); } +static LLIST_HEAD(free_list); + +static void eventfs_workfn(struct work_struct *work) +{ + struct eventfs_inode *ei, *tmp; + struct llist_node *llnode; + + llnode = llist_del_all(&free_list); + llist_for_each_entry_safe(ei, tmp, llnode, llist) { + /* This dput() matches the dget() from unhook_dentry() */ + for (int i = 0; i < ei->nr_entries; i++) { + if (ei->d_children[i]) + dput(ei->d_children[i]); + } + /* This should only get here if it had a dentry */ + if (!WARN_ON_ONCE(!ei->dentry)) + dput(ei->dentry); + } +} + +static DECLARE_WORK(eventfs_work, eventfs_workfn); + static void free_rcu_ei(struct rcu_head *head) { struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu); + if (ei->dentry) { + /* Do not free the ei until all references of dentry are gone */ + if (llist_add(&ei->llist, &free_list)) + queue_work(system_unbound_wq, &eventfs_work); + return; + } + + /* If the ei doesn't have a dentry, neither should its children */ + for (int i = 0; i < ei->nr_entries; i++) { + WARN_ON_ONCE(ei->d_children[i]); + } + free_ei(ei); } +static void unhook_dentry(struct dentry *dentry) +{ + if (!dentry) + return; + + /* Keep the dentry from being freed yet (see eventfs_workfn()) */ + dget(dentry); + + dentry->d_fsdata = NULL; + d_invalidate(dentry); + mutex_lock(&eventfs_mutex); + /* dentry should now have at least a single reference */ + WARN_ONCE((int)d_count(dentry) < 1, + "dentry %px (%s) less than one reference (%d) after invalidate\n", + dentry, dentry->d_name.name, d_count(dentry)); + mutex_unlock(&eventfs_mutex); +} + /** * eventfs_remove_rec - remove eventfs dir or file from list * @ei: eventfs_inode to be removed. @@ -1006,33 +1033,6 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, struct list_head *head, list_add_tail(&ei->del_list, head); } -static void unhook_dentry(struct dentry **dentry, struct dentry **list) -{ - if (*dentry) { - unsigned long ptr = (unsigned long)*list; - - /* Keep the dentry from being freed yet */ - dget(*dentry); - - /* - * Paranoid: The dget() above should prevent the dentry - * from being freed and calling eventfs_set_ei_status_free(). - * But just in case, set the link list LSB pointer to 1 - * and have eventfs_set_ei_status_free() check that to - * make sure that if it does happen, it will not think - * the d_fsdata is an eventfs_inode. - * - * For this to work, no eventfs_inode should be allocated - * on a odd space, as the ef should always be allocated - * to be at least word aligned. Check for that too. - */ - WARN_ON_ONCE(ptr & 1); - - (*dentry)->d_fsdata = (void *)(ptr | 1); - *list = *dentry; - *dentry = NULL; - } -} /** * eventfs_remove_dir - remove eventfs dir or file from list * @ei: eventfs_inode to be removed. @@ -1043,39 +1043,27 @@ void eventfs_remove_dir(struct eventfs_inode *ei) { struct eventfs_inode *tmp; LIST_HEAD(ei_del_list); - struct dentry *dentry_list = NULL; - struct dentry *dentry; - int i; if (!ei) return; + /* + * Move the deleted eventfs_inodes onto the ei_del_list + * which will also set the is_freed value. Note, this has to be + * done under the eventfs_mutex, but the deletions of + * the dentries must be done outside the eventfs_mutex. + * Hence moving them to this temporary list. + */ mutex_lock(&eventfs_mutex); eventfs_remove_rec(ei, &ei_del_list, 0); - - list_for_each_entry_safe(ei, tmp, &ei_del_list, del_list) { - for (i = 0; i < ei->nr_entries; i++) - unhook_dentry(&ei->d_children[i], &dentry_list); - unhook_dentry(&ei->dentry, &dentry_list); - call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); - } mutex_unlock(&eventfs_mutex); - while (dentry_list) { - unsigned long ptr; - - dentry = dentry_list; - ptr = (unsigned long)dentry->d_fsdata & ~1UL; - dentry_list = (struct dentry *)ptr; - dentry->d_fsdata = NULL; - d_invalidate(dentry); - mutex_lock(&eventfs_mutex); - /* dentry should now have at least a single reference */ - WARN_ONCE((int)d_count(dentry) < 1, - "dentry %px (%s) less than one reference (%d) after invalidate\n", - dentry, dentry->d_name.name, d_count(dentry)); - mutex_unlock(&eventfs_mutex); - dput(dentry); + list_for_each_entry_safe(ei, tmp, &ei_del_list, del_list) { + for (int i = 0; i < ei->nr_entries; i++) + unhook_dentry(ei->d_children[i]); + unhook_dentry(ei->dentry); + list_del(&ei->del_list); + call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); } } diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 5f60bcd69289..06a1f220b901 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -54,10 +54,12 @@ struct eventfs_inode { void *data; /* * Union - used for deletion + * @llist: for calling dput() if needed after RCU * @del_list: list of eventfs_inode to delete * @rcu: eventfs_inode to delete in RCU */ union { + struct llist_node llist; struct list_head del_list; struct rcu_head rcu; }; From 62d65cac119d08d39f751b4e3e2063ed996edc05 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Nov 2023 13:25:48 -0400 Subject: [PATCH 31/34] eventfs: Remove special processing of dput() of events directory The top level events directory is no longer special with regards to how it should be delete. Remove the extra processing for it in eventfs_set_ei_status_free(). Link: https://lkml.kernel.org/r/20231101172650.340876747@goodmis.org Cc: Ajay Kaher Cc: Mark Rutland Cc: Andrew Morton Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 0a04ae0ca8c8..0087a3f455f1 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -274,28 +274,11 @@ static void free_ei(struct eventfs_inode *ei) */ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) { - struct tracefs_inode *ti_parent; struct eventfs_inode *ei; int i; - /* The top level events directory may be freed by this */ - if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) { - mutex_lock(&eventfs_mutex); - ei = ti->private; - /* Nothing should access this, but just in case! */ - ti->private = NULL; - mutex_unlock(&eventfs_mutex); - - free_ei(ei); - return; - } - mutex_lock(&eventfs_mutex); - ti_parent = get_tracefs(dentry->d_parent->d_inode); - if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE)) - goto out; - ei = dentry->d_fsdata; if (!ei) goto out; @@ -920,6 +903,8 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; + dentry->d_fsdata = ei; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); From 407c6726ca71b33330d2d6345d9ea7ebc02575e9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Nov 2023 13:25:49 -0400 Subject: [PATCH 32/34] eventfs: Use simple_recursive_removal() to clean up dentries Looking at how dentry is removed via the tracefs system, I found that eventfs does not do everything that it did under tracefs. The tracefs removal of a dentry calls simple_recursive_removal() that does a lot more than a simple d_invalidate(). As it should be a requirement that any eventfs_inode that has a dentry, so does its parent. When removing a eventfs_inode, if it has a dentry, a call to simple_recursive_removal() on that dentry should clean up all the dentries underneath it. Add WARN_ON_ONCE() to check for the parent having a dentry if any children do. Link: https://lore.kernel.org/all/20231101022553.GE1957730@ZenIV/ Link: https://lkml.kernel.org/r/20231101172650.552471568@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Al Viro Fixes: 5bdcd5f5331a2 ("eventfs: Implement removal of meta data from eventfs") Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 77 +++++++++++++++++++++++----------------- fs/tracefs/internal.h | 2 -- 2 files changed, 44 insertions(+), 35 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 0087a3f455f1..f8a594a50ae6 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -967,30 +967,29 @@ static void unhook_dentry(struct dentry *dentry) { if (!dentry) return; - - /* Keep the dentry from being freed yet (see eventfs_workfn()) */ + /* + * Need to add a reference to the dentry that is expected by + * simple_recursive_removal(), which will include a dput(). + */ dget(dentry); - dentry->d_fsdata = NULL; - d_invalidate(dentry); - mutex_lock(&eventfs_mutex); - /* dentry should now have at least a single reference */ - WARN_ONCE((int)d_count(dentry) < 1, - "dentry %px (%s) less than one reference (%d) after invalidate\n", - dentry, dentry->d_name.name, d_count(dentry)); - mutex_unlock(&eventfs_mutex); + /* + * Also add a reference for the dput() in eventfs_workfn(). + * That is required as that dput() will free the ei after + * the SRCU grace period is over. + */ + dget(dentry); } /** * eventfs_remove_rec - remove eventfs dir or file from list * @ei: eventfs_inode to be removed. - * @head: the list head to place the deleted @ei and children * @level: prevent recursion from going more than 3 levels deep. * * This function recursively removes eventfs_inodes which * contains info of files and/or directories. */ -static void eventfs_remove_rec(struct eventfs_inode *ei, struct list_head *head, int level) +static void eventfs_remove_rec(struct eventfs_inode *ei, int level) { struct eventfs_inode *ei_child; @@ -1009,13 +1008,26 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, struct list_head *head, /* search for nested folders or files */ list_for_each_entry_srcu(ei_child, &ei->children, list, lockdep_is_held(&eventfs_mutex)) { - eventfs_remove_rec(ei_child, head, level + 1); + /* Children only have dentry if parent does */ + WARN_ON_ONCE(ei_child->dentry && !ei->dentry); + eventfs_remove_rec(ei_child, level + 1); } + ei->is_freed = 1; + for (int i = 0; i < ei->nr_entries; i++) { + if (ei->d_children[i]) { + /* Children only have dentry if parent does */ + WARN_ON_ONCE(!ei->dentry); + unhook_dentry(ei->d_children[i]); + } + } + + unhook_dentry(ei->dentry); + list_del_rcu(&ei->list); - list_add_tail(&ei->del_list, head); + call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); } /** @@ -1026,30 +1038,22 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, struct list_head *head, */ void eventfs_remove_dir(struct eventfs_inode *ei) { - struct eventfs_inode *tmp; - LIST_HEAD(ei_del_list); + struct dentry *dentry; if (!ei) return; - /* - * Move the deleted eventfs_inodes onto the ei_del_list - * which will also set the is_freed value. Note, this has to be - * done under the eventfs_mutex, but the deletions of - * the dentries must be done outside the eventfs_mutex. - * Hence moving them to this temporary list. - */ mutex_lock(&eventfs_mutex); - eventfs_remove_rec(ei, &ei_del_list, 0); + dentry = ei->dentry; + eventfs_remove_rec(ei, 0); mutex_unlock(&eventfs_mutex); - list_for_each_entry_safe(ei, tmp, &ei_del_list, del_list) { - for (int i = 0; i < ei->nr_entries; i++) - unhook_dentry(ei->d_children[i]); - unhook_dentry(ei->dentry); - list_del(&ei->del_list); - call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); - } + /* + * If any of the ei children has a dentry, then the ei itself + * must have a dentry. + */ + if (dentry) + simple_recursive_removal(dentry, NULL); } /** @@ -1060,10 +1064,17 @@ void eventfs_remove_dir(struct eventfs_inode *ei) */ void eventfs_remove_events_dir(struct eventfs_inode *ei) { - struct dentry *dentry = ei->dentry; + struct dentry *dentry; + dentry = ei->dentry; eventfs_remove_dir(ei); - /* Matches the dget() from eventfs_create_events_dir() */ + /* + * Matches the dget() done by tracefs_start_creating() + * in eventfs_create_events_dir() when it the dentry was + * created. In other words, it's a normal dentry that + * sticks around while the other ei->dentry are created + * and destroyed dynamically. + */ dput(dentry); } diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 06a1f220b901..ccee18ca66c7 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -55,12 +55,10 @@ struct eventfs_inode { /* * Union - used for deletion * @llist: for calling dput() if needed after RCU - * @del_list: list of eventfs_inode to delete * @rcu: eventfs_inode to delete in RCU */ union { struct llist_node llist; - struct list_head del_list; struct rcu_head rcu; }; unsigned int is_freed:1; From 685b38c7650a0f461f56761cf61936b453d5bb24 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 1 Nov 2023 18:59:05 +0100 Subject: [PATCH 33/34] seq_buf: Export seq_buf_putc() Mark seq_buf_putc() which is part of the seq_buf API to be exported to kernel loadable GPL modules. Link: https://lkml.kernel.org/r/5c9a5ed97ac37dbdcd9c1e7bcbdec9ac166e79be.1698861216.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Steven Rostedt (Google) --- lib/seq_buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 23518f77ea9c..fb99168c3309 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -208,6 +208,7 @@ int seq_buf_putc(struct seq_buf *s, unsigned char c) seq_buf_set_overflow(s); return -1; } +EXPORT_SYMBOL_GPL(seq_buf_putc); /** * seq_buf_putmem - write raw data into the sequenc buffer From 70a9affa930c7aeba27893c7d402ef1294f43aa2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 1 Nov 2023 18:59:06 +0100 Subject: [PATCH 34/34] seq_buf: Export seq_buf_puts() Mark seq_buf_puts() which is part of the seq_buf API to be exported to kernel loadable GPL modules. Link: https://lkml.kernel.org/r/b9e3737f66ec2450221b492048ce0d9c65c84953.1698861216.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Steven Rostedt (Google) --- lib/seq_buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/seq_buf.c b/lib/seq_buf.c index fb99168c3309..010c730ca7fc 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -187,6 +187,7 @@ int seq_buf_puts(struct seq_buf *s, const char *str) seq_buf_set_overflow(s); return -1; } +EXPORT_SYMBOL_GPL(seq_buf_puts); /** * seq_buf_putc - sequence printing of simple character