From 30d103f2d4602da07313cf43d7b746e89d7d9c1d Mon Sep 17 00:00:00 2001 From: zuoqilin Date: Wed, 14 Apr 2021 21:46:47 +0800 Subject: [PATCH 01/49] tools/bootconfig: Simplify expression It is not necessary to define the variable ret to receive the return value of the xbc_node_compose_key() method. Link: https://lkml.kernel.org/r/20210414134647.1870-1-zuoqilin1@163.com Acked-by: Masami Hiramatsu Signed-off-by: zuoqilin Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 6cd6080cac04..268b72f4cc92 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -84,11 +84,9 @@ static void xbc_show_list(void) char key[XBC_KEYLEN_MAX]; struct xbc_node *leaf; const char *val; - int ret = 0; xbc_for_each_key_value(leaf, val) { - ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX); - if (ret < 0) + if (xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX) < 0) break; printf("%s = ", key); if (!val || val[0] == '\0') { From 58b9987de86cc5f154b5e91923676f952fcf8a93 Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Thu, 22 Apr 2021 20:22:26 +0800 Subject: [PATCH 02/49] sched/tracing: Remove the redundant 'success' in the sched tracepoint 'success' is left here for a long time and also it is meaningless for the upper user. Just remove it. [ There were some tools expecting this, and this may break them. But hopefully they've been fixed in the mean time. 
Otherwise this may be likely reverted - SDR ] Link: https://lkml.kernel.org/r/20210422122226.9415-1-ed.tsai@mediatek.com Cc: Peter Zijlstra Signed-off-by: Ed Tsai Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/sched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 1eca2305ca42..94640482cfe7 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -148,7 +148,6 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) - __field( int, success ) __field( int, target_cpu ) ), @@ -156,7 +155,6 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ - __entry->success = 1; /* rudiment, kill when possible */ __entry->target_cpu = task_cpu(p); ), From 614db49c72db5ecf85ef94fe8bad7ebc88744ba9 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Sat, 8 May 2021 18:37:16 +0800 Subject: [PATCH 03/49] tracing: Remove redundant assignment to event_var Variable event_var is set to 'ERR_PTR(-EINVAL)', but this value is never read as it is overwritten or not used later on, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: kernel/trace/trace_events_hist.c:2437:21: warning: Value stored to 'event_var' during its initialization is never read [clang-analyzer-deadcode.DeadStores]. 
Link: https://lkml.kernel.org/r/1620470236-26562-1-git-send-email-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c1abd63f1d6c..dacd6fe0f60c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2434,12 +2434,12 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, char *subsys_name, char *event_name, char *field_name) { struct trace_array *tr = target_hist_data->event_file->tr; - struct hist_field *event_var = ERR_PTR(-EINVAL); struct hist_trigger_data *hist_data; unsigned int i, n, first = true; struct field_var_hist *var_hist; struct trace_event_file *file; struct hist_field *key_field; + struct hist_field *event_var; char *saved_filter; char *cmd; int ret; From 957cdcd9bd7e035dcf0f29e4124b8021ea2ed696 Mon Sep 17 00:00:00 2001 From: Wei Ming Chen Date: Tue, 11 May 2021 22:02:46 +0800 Subject: [PATCH 04/49] ring-buffer: Use fallthrough pseudo-keyword Replace /* fall through */ comment with pseudo-keyword macro fallthrough[1] [1] https://www.kernel.org/doc/html/latest/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through Link: https://lkml.kernel.org/r/20210511140246.18868-1-jj251510319013@gmail.com Signed-off-by: Wei Ming Chen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2c0ee6484990..d1463eac11a3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3391,7 +3391,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_PADDING: if (event->time_delta == 1) break; - /* fall through */ + fallthrough; case RINGBUF_TYPE_DATA: ts += 
event->time_delta; break; From 3d3d9c072e377a0c5fa0a1f661a95134e787db3a Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Sun, 2 May 2021 08:12:27 +0800 Subject: [PATCH 05/49] trace: replace WB_REASON_FOREIGN_FLUSH with a string Now WB_REASON_FOREIGN_FLUSH is displayed as a number, maybe a string is better. v2: replace some space with tab. Link: https://lkml.kernel.org/r/1619914347-21904-1-git-send-email-brookxu.cn@gmail.com Signed-off-by: Chunguang Xu Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/writeback.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 1efa463c4979..840d1ba84cf5 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -36,7 +36,8 @@ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ - EMe(WB_REASON_FORKER_THREAD, "forker_thread") + EM( WB_REASON_FORKER_THREAD, "forker_thread") \ + EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") WB_WORK_REASON From 08b0c9b4b922ccd1b7b54589942492cfa686214e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 13 May 2021 12:55:17 +0100 Subject: [PATCH 06/49] tracing: Remove redundant initialization of variable ret The variable ret is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. 
Link: https://lkml.kernel.org/r/20210513115517.58178-1-colin.king@canonical.com Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9299057feb56..2e592885a167 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6145,7 +6145,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { - int ret = size; + int ret; mutex_lock(&trace_types_lock); From 099dcc1801d981260aee9496dbeb55270dca70c1 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 15 May 2021 10:57:35 +0000 Subject: [PATCH 07/49] tracing: Fix set_named_trigger_data() kernel-doc comment Fix the description of the parameters. Link: https://lkml.kernel.org/r/20210515105735.52785-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_trigger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b8bfa8505b7b..cf84d0f6583a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -916,7 +916,8 @@ void unpause_named_trigger(struct event_trigger_data *data) /** * set_named_trigger_data - Associate common named trigger data - * @data: The trigger data of a named trigger to unpause + * @data: The trigger data to associate + * @named_data: The common named trigger to be associated * * Named triggers are sets of triggers that share a common set of * trigger data. 
The first named trigger registered with a given name From 6c610dba6e2beb1a16ac309672181d0090fb8d30 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Sat, 29 May 2021 15:14:23 +0900 Subject: [PATCH 08/49] tracing: Add WARN_ON_ONCE when returned value is negative ret is assigned return value of event_hist_trigger_func, but the value is unused. It is better to warn when returned value is negative, rather than just ignoring it. Link: https://lkml.kernel.org/r/20210529061423.GA103954@hyeyoo Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index dacd6fe0f60c..ba03b7d84fc2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5232,6 +5232,7 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data) cmd = hist_data->field_var_hists[i]->cmd; ret = event_hist_trigger_func(&trigger_hist_cmd, file, "!hist", "hist", cmd); + WARN_ON_ONCE(ret < 0); } } From 4f99f8489950c03c792f17ca2d55cbb591286174 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 2 Jun 2021 23:33:00 +0900 Subject: [PATCH 09/49] tracing/boot: Add per-group/all events enablement Add ftrace.event.GROUP.enable and ftrace.event.enable boot-time tracing, which enables all events under given GROUP and all events respectively.
Link: https://lkml.kernel.org/r/162264438005.302580.12019174481201855444.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a82f03f385f8..94ef2d099e32 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -225,14 +225,37 @@ static void __init trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) { struct xbc_node *gnode, *enode; + bool enable, enable_all = false; + const char *data; node = xbc_node_find_child(node, "event"); if (!node) return; /* per-event key starts with "event.GROUP.EVENT" */ - xbc_node_for_each_child(node, gnode) - xbc_node_for_each_child(gnode, enode) + xbc_node_for_each_child(node, gnode) { + data = xbc_node_get_data(gnode); + if (!strcmp(data, "enable")) { + enable_all = true; + continue; + } + enable = false; + xbc_node_for_each_child(gnode, enode) { + data = xbc_node_get_data(enode); + if (!strcmp(data, "enable")) { + enable = true; + continue; + } trace_boot_init_one_event(tr, gnode, enode); + } + /* Event enablement must be done after event settings */ + if (enable) { + data = xbc_node_get_data(gnode); + trace_array_set_clr_event(tr, data, NULL, true); + } + } + /* Ditto */ + if (enable_all) + trace_array_set_clr_event(tr, NULL, NULL, true); } #else #define trace_boot_enable_events(tr, node) do {} while (0) From ee0a07017ae34f2dbf5775956d8fc3f6d36be985 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 2 Jun 2021 23:33:09 +0900 Subject: [PATCH 10/49] Documentation: tracing: Add per-group/all events enablement description Add ftrace.event.GROUP.enable and ftrace.event.enable bootconfig description in the boot-time tracing document.
Link: https://lkml.kernel.org/r/162264438901.302580.10697703336929432947.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/boottime-trace.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/trace/boottime-trace.rst b/Documentation/trace/boottime-trace.rst index 89b64334929b..8053898cfeb4 100644 --- a/Documentation/trace/boottime-trace.rst +++ b/Documentation/trace/boottime-trace.rst @@ -99,6 +99,12 @@ These options are setting per-event options. ftrace.[instance.INSTANCE.]event.GROUP.EVENT.enable Enable GROUP:EVENT tracing. +ftrace.[instance.INSTANCE.]event.GROUP.enable + Enable all event tracing within GROUP. + +ftrace.[instance.INSTANCE.]event.enable + Enable all event tracing. + ftrace.[instance.INSTANCE.]event.GROUP.EVENT.filter = FILTER Set FILTER rule to the GROUP:EVENT. From ca24306d83a125df187ad53eddb038fe0cffb8ca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 2 Jun 2021 17:18:58 +0900 Subject: [PATCH 11/49] bootconfig: Change array value to use child node It is not possible to put an array value with subkeys under a key node, because both the subkeys and the array elements use the "next" field of the xbc_node. Thus this changes the array values to use the "child" field in the array case. This change is split into its own patch so that it can be tested easily.
Link: https://lkml.kernel.org/r/162262193838.264090.16044473274501498656.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- fs/proc/bootconfig.c | 2 +- include/linux/bootconfig.h | 6 +++--- lib/bootconfig.c | 23 +++++++++++++++++++---- tools/bootconfig/main.c | 2 +- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index ad31ec4ad627..6d8d4bf20837 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -49,7 +49,7 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) else q = '"'; ret = snprintf(dst, rest(dst, end), "%c%s%c%s", - q, val, q, vnode->next ? ", " : "\n"); + q, val, q, xbc_node_is_array(vnode) ? ", " : "\n"); if (ret < 0) goto out; dst += ret; diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 2696eb0fc149..3178a31fdabc 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -71,7 +71,7 @@ static inline __init bool xbc_node_is_key(struct xbc_node *node) */ static inline __init bool xbc_node_is_array(struct xbc_node *node) { - return xbc_node_is_value(node) && node->next != 0; + return xbc_node_is_value(node) && node->child != 0; } /** @@ -140,7 +140,7 @@ static inline struct xbc_node * __init xbc_find_node(const char *key) */ #define xbc_array_for_each_value(anode, value) \ for (value = xbc_node_get_data(anode); anode != NULL ; \ - anode = xbc_node_get_next(anode), \ + anode = xbc_node_get_child(anode), \ value = anode ? xbc_node_get_data(anode) : NULL) /** @@ -171,7 +171,7 @@ static inline struct xbc_node * __init xbc_find_node(const char *key) */ #define xbc_node_for_each_array_value(node, key, anode, value) \ for (value = xbc_node_find_value(node, key, &anode); value != NULL; \ - anode = xbc_node_get_next(anode), \ + anode = xbc_node_get_child(anode), \ value = anode ? 
xbc_node_get_data(anode) : NULL) /** diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 9f8c70a98fcf..44dcdcbd746a 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -367,6 +367,14 @@ static inline __init struct xbc_node *xbc_last_sibling(struct xbc_node *node) return node; } +static inline __init struct xbc_node *xbc_last_child(struct xbc_node *node) +{ + while (node->child) + node = xbc_node_get_child(node); + + return node; +} + static struct xbc_node * __init xbc_add_sibling(char *data, u32 flag) { struct xbc_node *sib, *node = xbc_add_node(data, flag); @@ -517,17 +525,20 @@ static int __init xbc_parse_array(char **__v) char *next; int c = 0; + if (last_parent->child) + last_parent = xbc_node_get_child(last_parent); + do { c = __xbc_parse_value(__v, &next); if (c < 0) return c; - node = xbc_add_sibling(*__v, XBC_VALUE); + node = xbc_add_child(*__v, XBC_VALUE); if (!node) return -ENOMEM; *__v = next; } while (c == ','); - node->next = 0; + node->child = 0; return c; } @@ -615,8 +626,12 @@ static int __init xbc_parse_kv(char **k, char *v, int op) if (op == ':' && child) { xbc_init_node(child, v, XBC_VALUE); - } else if (!xbc_add_sibling(v, XBC_VALUE)) - return -ENOMEM; + } else { + if (op == '+' && child) + last_parent = xbc_last_child(child); + if (!xbc_add_sibling(v, XBC_VALUE)) + return -ENOMEM; + } if (c == ',') { /* Array */ c = xbc_parse_array(&next); diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 268b72f4cc92..23569fb634f1 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -27,7 +27,7 @@ static int xbc_show_value(struct xbc_node *node, bool semicolon) q = '\''; else q = '"'; - printf("%c%s%c%s", q, val, q, node->next ? ", " : eol); + printf("%c%s%c%s", q, val, q, xbc_node_is_array(node) ? 
", " : eol); i++; } return i; From e5efaeb8a8f527d6e91289ff1f67fbcae452b2ca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 2 Jun 2021 17:19:07 +0900 Subject: [PATCH 12/49] bootconfig: Support mixing a value and subkeys under a key Support mixing a value and subkeys under a key. Since kernel cmdline options will support "aaa.bbb=value1 aaa.bbb.ccc=value2", it is better that the bootconfig supports such configuration too. Note that this does not change syntax itself but just accepts mixed value and subkeys e.g. key = value1 key.subkey = value2 But this is not accepted; key { value1 subkey = value2 } That will make value1 as a subkey. Also, the order of the value node under a key is fixed. If there are a value and subkeys, the value is always the first child node of the key. Thus if user specifies subkeys first, e.g. key.subkey = value1 key = value2 In the program (and /proc/bootconfig), it will be shown as below key = value2 key.subkey = value1 Link: https://lkml.kernel.org/r/162262194685.264090.7738574774030567419.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 32 ++++++++++++++++++ lib/bootconfig.c | 67 ++++++++++++++++++++++++++------------ tools/bootconfig/main.c | 45 ++++++++++++++++++++----- 3 files changed, 115 insertions(+), 29 deletions(-) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 3178a31fdabc..e49043ac77c9 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -80,6 +80,8 @@ static inline __init bool xbc_node_is_array(struct xbc_node *node) * * Test the @node is a leaf key node which is a key node and has a value node * or no child. Returns true if it is a leaf node, or false if not. + * Note that the leaf node can have subkey nodes in addition to the + * value node. 
*/ static inline __init bool xbc_node_is_leaf(struct xbc_node *node) { @@ -129,6 +131,23 @@ static inline struct xbc_node * __init xbc_find_node(const char *key) return xbc_node_find_child(NULL, key); } +/** + * xbc_node_get_subkey() - Return the first subkey node if exists + * @node: Parent node + * + * Return the first subkey node of the @node. If the @node has no child + * or only value node, this will return NULL. + */ +static inline struct xbc_node * __init xbc_node_get_subkey(struct xbc_node *node) +{ + struct xbc_node *child = xbc_node_get_child(node); + + if (child && xbc_node_is_value(child)) + return xbc_node_get_next(child); + else + return child; +} + /** * xbc_array_for_each_value() - Iterate value nodes on an array * @anode: An XBC arraied value node @@ -149,11 +168,24 @@ static inline struct xbc_node * __init xbc_find_node(const char *key) * @child: Iterated XBC node. * * Iterate child nodes of @parent. Each child nodes are stored to @child. + * The @child can be mixture of a value node and subkey nodes. */ #define xbc_node_for_each_child(parent, child) \ for (child = xbc_node_get_child(parent); child != NULL ; \ child = xbc_node_get_next(child)) +/** + * xbc_node_for_each_subkey() - Iterate child subkey nodes + * @parent: An XBC node. + * @child: Iterated XBC node. + * + * Iterate subkey nodes of @parent. Each child nodes are stored to @child. + * The @child is only the subkey node. + */ +#define xbc_node_for_each_subkey(parent, child) \ + for (child = xbc_node_get_subkey(parent); child != NULL ; \ + child = xbc_node_get_next(child)) + /** * xbc_node_for_each_array_value() - Iterate array entries of geven key * @node: An XBC node. 
diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 44dcdcbd746a..927017431fb6 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -156,7 +156,7 @@ xbc_node_find_child(struct xbc_node *parent, const char *key) struct xbc_node *node; if (parent) - node = xbc_node_get_child(parent); + node = xbc_node_get_subkey(parent); else node = xbc_root_node(); @@ -164,7 +164,7 @@ xbc_node_find_child(struct xbc_node *parent, const char *key) if (!xbc_node_match_prefix(node, &key)) node = xbc_node_get_next(node); else if (*key != '\0') - node = xbc_node_get_child(node); + node = xbc_node_get_subkey(node); else break; } @@ -274,6 +274,8 @@ int __init xbc_node_compose_key_after(struct xbc_node *root, struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root, struct xbc_node *node) { + struct xbc_node *next; + if (unlikely(!xbc_data)) return NULL; @@ -282,6 +284,13 @@ struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root, if (!node) node = xbc_nodes; } else { + /* Leaf node may have a subkey */ + next = xbc_node_get_subkey(node); + if (next) { + node = next; + goto found; + } + if (node == root) /* @root was a leaf, no child node. 
*/ return NULL; @@ -296,6 +305,7 @@ struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root, node = xbc_node_get_next(node); } +found: while (node && !xbc_node_is_leaf(node)) node = xbc_node_get_child(node); @@ -375,18 +385,20 @@ static inline __init struct xbc_node *xbc_last_child(struct xbc_node *node) return node; } -static struct xbc_node * __init xbc_add_sibling(char *data, u32 flag) +static struct xbc_node * __init __xbc_add_sibling(char *data, u32 flag, bool head) { struct xbc_node *sib, *node = xbc_add_node(data, flag); if (node) { if (!last_parent) { + /* Ignore @head in this case */ node->parent = XBC_NODE_MAX; sib = xbc_last_sibling(xbc_nodes); sib->next = xbc_node_index(node); } else { node->parent = xbc_node_index(last_parent); - if (!last_parent->child) { + if (!last_parent->child || head) { + node->next = last_parent->child; last_parent->child = xbc_node_index(node); } else { sib = xbc_node_get_child(last_parent); @@ -400,6 +412,16 @@ static struct xbc_node * __init xbc_add_sibling(char *data, u32 flag) return node; } +static inline struct xbc_node * __init xbc_add_sibling(char *data, u32 flag) +{ + return __xbc_add_sibling(data, flag, false); +} + +static inline struct xbc_node * __init xbc_add_head_sibling(char *data, u32 flag) +{ + return __xbc_add_sibling(data, flag, true); +} + static inline __init struct xbc_node *xbc_add_child(char *data, u32 flag) { struct xbc_node *node = xbc_add_sibling(data, flag); @@ -568,8 +590,9 @@ static int __init __xbc_add_key(char *k) node = find_match_node(xbc_nodes, k); else { child = xbc_node_get_child(last_parent); + /* Since the value node is the first child, skip it. 
*/ if (child && xbc_node_is_value(child)) - return xbc_parse_error("Subkey is mixed with value", k); + child = xbc_node_get_next(child); node = find_match_node(child, k); } @@ -612,27 +635,29 @@ static int __init xbc_parse_kv(char **k, char *v, int op) if (ret) return ret; - child = xbc_node_get_child(last_parent); - if (child) { - if (xbc_node_is_key(child)) - return xbc_parse_error("Value is mixed with subkey", v); - else if (op == '=') - return xbc_parse_error("Value is redefined", v); - } - c = __xbc_parse_value(&v, &next); if (c < 0) return c; - if (op == ':' && child) { - xbc_init_node(child, v, XBC_VALUE); - } else { - if (op == '+' && child) - last_parent = xbc_last_child(child); - if (!xbc_add_sibling(v, XBC_VALUE)) - return -ENOMEM; - } + child = xbc_node_get_child(last_parent); + if (child && xbc_node_is_value(child)) { + if (op == '=') + return xbc_parse_error("Value is redefined", v); + if (op == ':') { + unsigned short nidx = child->next; + xbc_init_node(child, v, XBC_VALUE); + child->next = nidx; /* keep subkeys */ + goto array; + } + /* op must be '+' */ + last_parent = xbc_last_child(child); + } + /* The value node should always be the first child */ + if (!xbc_add_head_sibling(v, XBC_VALUE)) + return -ENOMEM; + +array: if (c == ',') { /* Array */ c = xbc_parse_array(&next); if (c < 0) diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 23569fb634f1..62a3b5064b17 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -35,30 +35,55 @@ static int xbc_show_value(struct xbc_node *node, bool semicolon) static void xbc_show_compact_tree(void) { - struct xbc_node *node, *cnode; + struct xbc_node *node, *cnode = NULL, *vnode; int depth = 0, i; node = xbc_root_node(); while (node && xbc_node_is_key(node)) { for (i = 0; i < depth; i++) printf("\t"); - cnode = xbc_node_get_child(node); + if (!cnode) + cnode = xbc_node_get_child(node); while (cnode && xbc_node_is_key(cnode) && !cnode->next) { + vnode = xbc_node_get_child(cnode); 
+ /* + * If @cnode has value and subkeys, this + * should show it as below. + * + * key(@node) { + * key(@cnode) = value; + * key(@cnode) { + * subkeys; + * } + * } + */ + if (vnode && xbc_node_is_value(vnode) && vnode->next) + break; printf("%s.", xbc_node_get_data(node)); node = cnode; - cnode = xbc_node_get_child(node); + cnode = vnode; } if (cnode && xbc_node_is_key(cnode)) { printf("%s {\n", xbc_node_get_data(node)); depth++; node = cnode; + cnode = NULL; continue; } else if (cnode && xbc_node_is_value(cnode)) { printf("%s = ", xbc_node_get_data(node)); xbc_show_value(cnode, true); + /* + * If @node has value and subkeys, continue + * looping on subkeys with same node. + */ + if (cnode->next) { + cnode = xbc_node_get_next(cnode); + continue; + } } else { printf("%s;\n", xbc_node_get_data(node)); } + cnode = NULL; if (node->next) { node = xbc_node_get_next(node); @@ -70,10 +95,12 @@ static void xbc_show_compact_tree(void) return; if (!xbc_node_get_child(node)->next) continue; - depth--; - for (i = 0; i < depth; i++) - printf("\t"); - printf("}\n"); + if (depth) { + depth--; + for (i = 0; i < depth; i++) + printf("\t"); + printf("}\n"); + } } node = xbc_node_get_next(node); } @@ -86,8 +113,10 @@ static void xbc_show_list(void) const char *val; xbc_for_each_key_value(leaf, val) { - if (xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX) < 0) + if (xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX) < 0) { + fprintf(stderr, "Failed to compose key %d\n", ret); break; + } printf("%s = ", key); if (!val || val[0] == '\0') { printf("\"\"\n"); From 29e1c1ad3ff7f345d80c7b81b08175f5a8c84122 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 2 Jun 2021 17:19:15 +0900 Subject: [PATCH 13/49] tools/bootconfig: Support mixed value and subkey test cases Update test case to support mixed value and subkey on a key. 
Link: https://lkml.kernel.org/r/162262195568.264090.12431204030021242896.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/samples/bad-override.bconf | 3 --- tools/bootconfig/samples/bad-override2.bconf | 3 --- tools/bootconfig/samples/good-mixed-append.bconf | 4 ++++ .../samples/{bad-mixed-kv1.bconf => good-mixed-kv1.bconf} | 0 .../samples/{bad-mixed-kv2.bconf => good-mixed-kv2.bconf} | 0 tools/bootconfig/samples/good-mixed-kv3.bconf | 6 ++++++ tools/bootconfig/samples/good-mixed-override.bconf | 4 ++++ 7 files changed, 14 insertions(+), 6 deletions(-) delete mode 100644 tools/bootconfig/samples/bad-override.bconf delete mode 100644 tools/bootconfig/samples/bad-override2.bconf create mode 100644 tools/bootconfig/samples/good-mixed-append.bconf rename tools/bootconfig/samples/{bad-mixed-kv1.bconf => good-mixed-kv1.bconf} (100%) rename tools/bootconfig/samples/{bad-mixed-kv2.bconf => good-mixed-kv2.bconf} (100%) create mode 100644 tools/bootconfig/samples/good-mixed-kv3.bconf create mode 100644 tools/bootconfig/samples/good-mixed-override.bconf diff --git a/tools/bootconfig/samples/bad-override.bconf b/tools/bootconfig/samples/bad-override.bconf deleted file mode 100644 index fde6c561512e..000000000000 --- a/tools/bootconfig/samples/bad-override.bconf +++ /dev/null @@ -1,3 +0,0 @@ -key.subkey = value -# We can not override pre-defined subkeys with value -key := value diff --git a/tools/bootconfig/samples/bad-override2.bconf b/tools/bootconfig/samples/bad-override2.bconf deleted file mode 100644 index 688587cb023c..000000000000 --- a/tools/bootconfig/samples/bad-override2.bconf +++ /dev/null @@ -1,3 +0,0 @@ -key = value -# We can not override pre-defined value with subkey -key.subkey := value diff --git a/tools/bootconfig/samples/good-mixed-append.bconf b/tools/bootconfig/samples/good-mixed-append.bconf new file mode 100644 index 000000000000..b99a089a05f5 --- /dev/null +++ 
b/tools/bootconfig/samples/good-mixed-append.bconf @@ -0,0 +1,4 @@ +key = foo +keyx.subkey = value +key += bar + diff --git a/tools/bootconfig/samples/bad-mixed-kv1.bconf b/tools/bootconfig/samples/good-mixed-kv1.bconf similarity index 100% rename from tools/bootconfig/samples/bad-mixed-kv1.bconf rename to tools/bootconfig/samples/good-mixed-kv1.bconf diff --git a/tools/bootconfig/samples/bad-mixed-kv2.bconf b/tools/bootconfig/samples/good-mixed-kv2.bconf similarity index 100% rename from tools/bootconfig/samples/bad-mixed-kv2.bconf rename to tools/bootconfig/samples/good-mixed-kv2.bconf diff --git a/tools/bootconfig/samples/good-mixed-kv3.bconf b/tools/bootconfig/samples/good-mixed-kv3.bconf new file mode 100644 index 000000000000..2ce2b02224b8 --- /dev/null +++ b/tools/bootconfig/samples/good-mixed-kv3.bconf @@ -0,0 +1,6 @@ +# mixed key and subkeys with braces +key = value +key { + subkey1 + subkey2 = foo +} diff --git a/tools/bootconfig/samples/good-mixed-override.bconf b/tools/bootconfig/samples/good-mixed-override.bconf new file mode 100644 index 000000000000..18195b2873b6 --- /dev/null +++ b/tools/bootconfig/samples/good-mixed-override.bconf @@ -0,0 +1,4 @@ +key.foo = bar +key = value +# mixed key value can be overridden +key := value2 From 0ff2bb7d42c36ee60bbf3e60993666a8e0c06a24 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 2 Jun 2021 17:19:25 +0900 Subject: [PATCH 14/49] docs: bootconfig: Update for mixing value and subkeys Update document for the mixing value and subkeys on a key. 
Link: https://lkml.kernel.org/r/162262196510.264090.15044943496205914148.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/admin-guide/bootconfig.rst | 30 ++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst index 452b7dcd7f6b..6a79f2e59396 100644 --- a/Documentation/admin-guide/bootconfig.rst +++ b/Documentation/admin-guide/bootconfig.rst @@ -89,13 +89,35 @@ you can use ``+=`` operator. For example:: In this case, the key ``foo`` has ``bar``, ``baz`` and ``qux``. -However, a sub-key and a value can not co-exist under a parent key. -For example, following config is NOT allowed.:: +Moreover, sub-keys and a value can coexist under a parent key. +For example, following config is allowed.:: foo = value1 - foo.bar = value2 # !ERROR! subkey "bar" and value "value1" can NOT co-exist - foo.bar := value2 # !ERROR! even with the override operator, this is NOT allowed. + foo.bar = value2 + foo := value3 # This will update foo's value. +Note, since there is no syntax to put a raw value directly under a +structured key, you have to define it outside of the brace. For example:: + + foo { + bar = value1 + bar { + baz = value2 + qux = value3 + } + } + +Also, the order of the value node under a key is fixed. If there +are a value and subkeys, the value is always the first child node +of the key. Thus if user specifies subkeys first, e.g.:: + + foo.bar = value1 + foo = value2 + +In the program (and /proc/bootconfig), it will be shown as below:: + + foo = value2 + foo.bar = value1 Comments -------- From 99f4f5d62338cab9dcf45735344541574daedd20 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 2 Jun 2021 17:19:34 +0900 Subject: [PATCH 15/49] bootconfig: Share the checksum function with tools Move the checksum calculation function into the header for sharing it with tools/bootconfig. 
Link: https://lkml.kernel.org/r/162262197470.264090.16325743685807878807.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 20 ++++++++++++++++++++ init/main.c | 12 +----------- tools/bootconfig/main.c | 15 ++------------- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index e49043ac77c9..6bdd94cff4e2 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -16,6 +16,26 @@ #define BOOTCONFIG_ALIGN (1 << BOOTCONFIG_ALIGN_SHIFT) #define BOOTCONFIG_ALIGN_MASK (BOOTCONFIG_ALIGN - 1) +/** + * xbc_calc_checksum() - Calculate checksum of bootconfig + * @data: Bootconfig data. + * @size: The size of the bootconfig data. + * + * Calculate the checksum value of the bootconfig data. + * The checksum will be used with the BOOTCONFIG_MAGIC and the size for + * embedding the bootconfig in the initrd image. + */ +static inline __init u32 xbc_calc_checksum(void *data, u32 size) +{ + unsigned char *p = data; + u32 ret = 0; + + while (size--) + ret += *p++; + + return ret; +} + /* XBC tree node */ struct xbc_node { u16 next; diff --git a/init/main.c b/init/main.c index e9c42a183e33..7b150f0501e2 100644 --- a/init/main.c +++ b/init/main.c @@ -386,16 +386,6 @@ static char * __init xbc_make_cmdline(const char *key) return new_cmdline; } -static u32 boot_config_checksum(unsigned char *p, u32 size) -{ - u32 ret = 0; - - while (size--) - ret += *p++; - - return ret; -} - static int __init bootconfig_params(char *param, char *val, const char *unused, void *arg) { @@ -439,7 +429,7 @@ static void __init setup_boot_config(void) return; } - if (boot_config_checksum((unsigned char *)data, size) != csum) { + if (xbc_calc_checksum(data, size) != csum) { pr_err("bootconfig checksum failed\n"); return; } diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 62a3b5064b17..f45fa992e01d 100644 --- a/tools/bootconfig/main.c +++ 
b/tools/bootconfig/main.c @@ -126,17 +126,6 @@ static void xbc_show_list(void) } } -/* Simple real checksum */ -static int checksum(unsigned char *buf, int len) -{ - int i, sum = 0; - - for (i = 0; i < len; i++) - sum += buf[i]; - - return sum; -} - #define PAGE_SIZE 4096 static int load_xbc_fd(int fd, char **buf, int size) @@ -232,7 +221,7 @@ static int load_xbc_from_initrd(int fd, char **buf) return ret; /* Wrong Checksum */ - rcsum = checksum((unsigned char *)*buf, size); + rcsum = xbc_calc_checksum(*buf, size); if (csum != rcsum) { pr_err("checksum error: %d != %d\n", csum, rcsum); return -EINVAL; @@ -381,7 +370,7 @@ static int apply_xbc(const char *path, const char *xbc_path) return ret; } size = strlen(buf) + 1; - csum = checksum((unsigned char *)buf, size); + csum = xbc_calc_checksum(buf, size); /* Backup the bootconfig data */ data = calloc(size + BOOTCONFIG_ALIGN + From faa76a6c289f43c88affcb292bc02870921d93bf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 9 Jun 2021 18:04:58 -0400 Subject: [PATCH 16/49] tracing: Simplify the max length test when using the filtering temp buffer When filtering trace events, a temp buffer is used because the extra copy from the temp buffer into the ring buffer is still faster than the direct write into the ring buffer followed by a discard if the filter does not match. But the data that can be stored in the temp buffer is a PAGE_SIZE minus the ring buffer event header. The calculation of that header size is complex, but using the helper macro "struct_size()" can simplify it. 
Link: https://lore.kernel.org/stable/CAHk-=whKbJkuVmzb0hD3N6q7veprUrSpiBHRxVY=AffWZPtxmg@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e592885a167..a0a84ff46ecd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2735,8 +2735,10 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ + int max_len = PAGE_SIZE - struct_size(entry, array, 1); + val = this_cpu_inc_return(trace_buffered_event_cnt); - if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) { + if (val == 1 && likely(len <= max_len)) { trace_event_setup(entry, type, trace_ctx); entry->array[0] = len; return entry; From 8f0901cda14d3be38cd2196d8cf61cdf3b368e34 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 9 Jun 2021 18:04:59 -0400 Subject: [PATCH 17/49] tracing: Add better comments for the filtering temp buffer use case When filtering is enabled, the event is copied into a temp buffer instead of being written into the ring buffer directly, because the discarding of events from the ring buffer is very expensive, and doing the extra copy is much faster than having to discard most of the time. As that logic is subtle, add comments to explain in more detail to what is going on and how it works. 
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0a84ff46ecd..a0d66a056e59 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2734,10 +2734,44 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, if (!tr->no_filter_buffering_ref && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { - /* Try to use the per cpu buffer first */ + /* + * Filtering is on, so try to use the per cpu buffer first. + * This buffer will simulate a ring_buffer_event, + * where the type_len is zero and the array[0] will + * hold the full length. + * (see include/linux/ring-buffer.h for details on + * how the ring_buffer_event is structured). + * + * Using a temp buffer during filtering and copying it + * on a matched filter is quicker than writing directly + * into the ring buffer and then discarding it when + * it doesn't match. That is because the discard + * requires several atomic operations to get right. + * Copying on match and doing nothing on a failed match + * is still quicker than no copy on match, but having + * to discard out of the ring buffer on a failed match. + */ int max_len = PAGE_SIZE - struct_size(entry, array, 1); val = this_cpu_inc_return(trace_buffered_event_cnt); + + /* + * Preemption is disabled, but interrupts and NMIs + * can still come in now. If that happens after + * the above increment, then it will have to go + * back to the old method of allocating the event + * on the ring buffer, and if the filter fails, it + * will have to call ring_buffer_discard_commit() + * to remove it. + * + * Need to also check the unlikely case that the + * length is bigger than the temp buffer size. 
+ * If that happens, then the reserve is pretty much + * guaranteed to fail, as the ring buffer currently + * only allows events less than a page. But that may + * change in the future, so let the ring buffer reserve + * handle the failure in that case. + */ if (val == 1 && likely(len <= max_len)) { trace_event_setup(entry, type, trace_ctx); entry->array[0] = len; From f38601368f4a0c2a9f859511768dc3957e2e1769 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 17 Jun 2021 10:51:02 -0400 Subject: [PATCH 18/49] tracing: Add tp_printk_stop_on_boot option Add a kernel command line option that disables printing of events to console at late_initcall_sync(). This is useful when needing to see specific events written to console on boot up, but not wanting it when user space starts, as user space may make the console so noisy that the system becomes inoperable. Signed-off-by: Steven Rostedt (VMware) --- .../admin-guide/kernel-parameters.txt | 13 ++++++ kernel/trace/trace.c | 40 ++++++++++++++----- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cb89dbdedc46..0fcc725105cd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5605,12 +5605,25 @@ Note, echoing 1 into this file without the tracepoint_printk kernel cmdline option has no effect. + The tp_printk_stop_on_boot (see below) can also be used + to stop the printing of events to console at + late_initcall_sync. + ** CAUTION ** Having tracepoints sent to printk() and activating high frequency tracepoints such as irq or sched, can cause the system to live lock. + tp_printk_stop_on_boot[FTRACE] + When tp_printk (above) is set, it can cause a lot of noise + on the console. It may be useful to only include the + printing of events during boot up, as user space may + make the system inoperable. 
+ + This command line option will stop the printing of events + to console at the late_initcall_sync() time frame. + traceoff_on_warning [FTRACE] enable this option to disable tracing when a warning is hit. This turns off "tracing_on". Tracing can diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0d66a056e59..bbc63ac5b47f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -86,6 +86,7 @@ void __init disable_tracing_selftest(const char *reason) /* Pipe tracepoints to printk */ struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; +static bool tracepoint_printk_stop_on_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ @@ -256,6 +257,13 @@ static int __init set_tracepoint_printk(char *str) } __setup("tp_printk", set_tracepoint_printk); +static int __init set_tracepoint_printk_stop(char *str) +{ + tracepoint_printk_stop_on_boot = true; + return 1; +} +__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); + unsigned long long ns2usecs(u64 nsec) { nsec += 500; @@ -9578,6 +9586,8 @@ static __init int tracer_init_tracefs(void) return 0; } +fs_initcall(tracer_init_tracefs); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { @@ -9998,7 +10008,7 @@ void __init trace_init(void) trace_event_init(); } -__init static int clear_boot_tracer(void) +__init static void clear_boot_tracer(void) { /* * The default tracer at boot buffer is an init section. @@ -10008,26 +10018,21 @@ __init static int clear_boot_tracer(void) * about to be freed. 
*/ if (!default_bootup_tracer) - return 0; + return; printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", default_bootup_tracer); default_bootup_tracer = NULL; - - return 0; } -fs_initcall(tracer_init_tracefs); -late_initcall_sync(clear_boot_tracer); - #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -__init static int tracing_set_default_clock(void) +__init static void tracing_set_default_clock(void) { /* sched_clock_stable() is determined in late_initcall */ if (!trace_boot_clock && !sched_clock_stable()) { if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Can not set tracing clock due to lockdown\n"); - return -EPERM; + return; } printk(KERN_WARNING @@ -10037,8 +10042,21 @@ __init static int tracing_set_default_clock(void) "on the kernel command line\n"); tracing_set_clock(&global_trace, "global"); } +} +#else +static inline void tracing_set_default_clock(void) { } +#endif +__init static int late_trace_init(void) +{ + if (tracepoint_printk && tracepoint_printk_stop_on_boot) { + static_key_disable(&tracepoint_printk_key.key); + tracepoint_printk = 0; + } + + tracing_set_default_clock(); + clear_boot_tracer(); return 0; } -late_initcall_sync(tracing_set_default_clock); -#endif + +late_initcall_sync(late_trace_init); From 2db7ab6b4c962e2499c86e8fe9cb1369ebaf91d1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 17 Jun 2021 16:20:41 -0400 Subject: [PATCH 19/49] tracing: Have ftrace_dump_on_oops kernel parameter take numbers The kernel parameter for ftrace_dump_on_oops can take a single assignment. That is, it can be: ftrace_dump_on_oops or ftrace_dump_on_oops=orig_cpu But the content in the sysctl file is a number. 0 for disabled 1 for ftrace_dump_on_oops (all CPUs) 2 for ftrace_dump_on_oops (orig CPU) Allow the kernel command line to take a number as well to match the sysctl numbers. 
That is: ftrace_dump_on_oops=1 is the same as ftrace_dump_on_oops and ftrace_dump_on_oops=2 is the same as ftrace_dump_on_oops=orig_cpu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bbc63ac5b47f..d352fb4b7709 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -197,12 +197,12 @@ __setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { - if (*str++ != '=' || !*str) { + if (*str++ != '=' || !*str || !strcmp("1", str)) { ftrace_dump_on_oops = DUMP_ALL; return 1; } - if (!strcmp("orig_cpu", str)) { + if (!strcmp("orig_cpu", str) || !strcmp("2", str)) { ftrace_dump_on_oops = DUMP_ORIG; return 1; } From 171ec346fc8936f52e1184f1ab1377ee40052bfc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 18 Jun 2021 11:26:47 -0400 Subject: [PATCH 20/49] bootconfig/tracing/ktest: Add ktest examples of testing bootconfig bootconfig is a new feature that appends scripts onto the initrd, and the kernel executes the scripts as an extended kernel command line. Need to add tests to verify that this happened. To test the bootconfig properly, the initrd needs to be updated and the kernel rebooted. ktest is the perfect solution to perform these tests. Add an example bootconfig.conf in tools/testing/ktest/examples/include and example bootconfig scripts in tools/testing/ktest/examples/bootconfigs and also include verifier scripts that ktest will install on the target and run to make sure that the bootconfig options in the scripts took place after the target rebooted with the new initrd update. 
Link: https://lkml.kernel.org/r/20210618112647.6a81dec5@oasis.local.home Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- .../examples/bootconfigs/boottrace.bconf | 49 +++++++++++ .../examples/bootconfigs/config-bootconfig | 1 + .../examples/bootconfigs/functiongraph.bconf | 15 ++++ .../ktest/examples/bootconfigs/tracing.bconf | 33 ++++++++ .../examples/bootconfigs/verify-boottrace.sh | 84 +++++++++++++++++++ .../bootconfigs/verify-functiongraph.sh | 61 ++++++++++++++ .../examples/bootconfigs/verify-tracing.sh | 72 ++++++++++++++++ .../ktest/examples/include/bootconfig.conf | 69 +++++++++++++++ tools/testing/ktest/examples/kvm.conf | 1 + 9 files changed, 385 insertions(+) create mode 100644 tools/testing/ktest/examples/bootconfigs/boottrace.bconf create mode 100644 tools/testing/ktest/examples/bootconfigs/config-bootconfig create mode 100644 tools/testing/ktest/examples/bootconfigs/functiongraph.bconf create mode 100644 tools/testing/ktest/examples/bootconfigs/tracing.bconf create mode 100755 tools/testing/ktest/examples/bootconfigs/verify-boottrace.sh create mode 100755 tools/testing/ktest/examples/bootconfigs/verify-functiongraph.sh create mode 100755 tools/testing/ktest/examples/bootconfigs/verify-tracing.sh create mode 100644 tools/testing/ktest/examples/include/bootconfig.conf diff --git a/tools/testing/ktest/examples/bootconfigs/boottrace.bconf b/tools/testing/ktest/examples/bootconfigs/boottrace.bconf new file mode 100644 index 000000000000..9db64ec589d5 --- /dev/null +++ b/tools/testing/ktest/examples/bootconfigs/boottrace.bconf @@ -0,0 +1,49 @@ +ftrace.event { + task.task_newtask { + filter = "pid < 128" + enable + } + kprobes.vfs_read { + probes = "vfs_read $arg1 $arg2" + filter = "common_pid < 200" + enable + } + synthetic.initcall_latency { + fields = "unsigned long func", "u64 lat" + actions = "hist:keys=func.sym,lat:vals=lat:sort=lat" + } + initcall.initcall_start { + actions = "hist:keys=func:ts0=common_timestamp.usecs" + } + 
initcall.initcall_finish { + actions = "hist:keys=func:lat=common_timestamp.usecs-$ts0:onmatch(initcall.initcall_start).initcall_latency(func,$lat)" + } +} + +ftrace.instance { + foo { + tracer = "function" + ftrace.filters = "user_*" + cpumask = 1 + options = nosym-addr + buffer_size = 512KB + trace_clock = mono + event.signal.signal_deliver.actions=snapshot + } + bar { + tracer = "function" + ftrace.filters = "kernel_*" + cpumask = 2 + trace_clock = x86-tsc + } +} + +ftrace.alloc_snapshot + +kernel { + trace_options = sym-addr + trace_event = "initcall:*" + trace_buf_size = 1M + ftrace = function + ftrace_filter = "vfs*" +} diff --git a/tools/testing/ktest/examples/bootconfigs/config-bootconfig b/tools/testing/ktest/examples/bootconfigs/config-bootconfig new file mode 100644 index 000000000000..0685b6811388 --- /dev/null +++ b/tools/testing/ktest/examples/bootconfigs/config-bootconfig @@ -0,0 +1 @@ +CONFIG_CMDLINE="bootconfig" diff --git a/tools/testing/ktest/examples/bootconfigs/functiongraph.bconf b/tools/testing/ktest/examples/bootconfigs/functiongraph.bconf new file mode 100644 index 000000000000..68debfcbda76 --- /dev/null +++ b/tools/testing/ktest/examples/bootconfigs/functiongraph.bconf @@ -0,0 +1,15 @@ +ftrace { + tracing_on = 0 # off by default + tracer = function_graph + event.kprobes { + start_event { + probes = "pci_proc_init" + actions = "traceon" + } + end_event { + probes = "pci_proc_init%return" + actions = "traceoff" + } + } +} + diff --git a/tools/testing/ktest/examples/bootconfigs/tracing.bconf b/tools/testing/ktest/examples/bootconfigs/tracing.bconf new file mode 100644 index 000000000000..bf117c78115a --- /dev/null +++ b/tools/testing/ktest/examples/bootconfigs/tracing.bconf @@ -0,0 +1,33 @@ +ftrace { + tracer = function_graph; + options = event-fork, sym-addr, stacktrace; + buffer_size = 1M; + alloc_snapshot; + trace_clock = global; + events = "task:task_newtask", "initcall:*"; + event.sched.sched_process_exec { + filter = "pid < 128"; + } + 
instance.bar { + event.kprobes { + myevent { + probes = "vfs_read $arg2 $arg3"; + } + myevent2 { + probes = "vfs_write $arg2 +0($arg2):ustring $arg3"; + } + myevent3 { + probes = "initrd_load"; + } + enable + } + } + instance.foo { + tracer = function; + tracing_on = false; + }; +} +kernel { + ftrace_dump_on_oops = "orig_cpu" + traceoff_on_warning +} diff --git a/tools/testing/ktest/examples/bootconfigs/verify-boottrace.sh b/tools/testing/ktest/examples/bootconfigs/verify-boottrace.sh new file mode 100755 index 000000000000..f271940ce7fb --- /dev/null +++ b/tools/testing/ktest/examples/bootconfigs/verify-boottrace.sh @@ -0,0 +1,84 @@ +#!/bin/sh + +cd /sys/kernel/tracing + +compare_file() { + file="$1" + val="$2" + content=`cat $file` + if [ "$content" != "$val" ]; then + echo "FAILED: $file has '$content', expected '$val'" + exit 1 + fi +} + +compare_file_partial() { + file="$1" + val="$2" + content=`cat $file | sed -ne "/^$val/p"` + if [ -z "$content" ]; then + echo "FAILED: $file does not contain '$val'" + cat $file + exit 1 + fi +} + +file_contains() { + file=$1 + val="$2" + + if ! 
grep -q "$val" $file ; then + echo "FAILED: $file does not contain $val" + cat $file + exit 1 + fi +} + +compare_mask() { + file=$1 + val="$2" + + content=`cat $file | sed -ne "/^[0 ]*$val/p"` + if [ -z "$content" ]; then + echo "FAILED: $file does not have mask '$val'" + cat $file + exit 1 + fi +} + +compare_file "events/task/task_newtask/filter" "pid < 128" +compare_file "events/task/task_newtask/enable" "1" + +compare_file "events/kprobes/vfs_read/filter" "common_pid < 200" +compare_file "events/kprobes/vfs_read/enable" "1" + +compare_file_partial "events/synthetic/initcall_latency/trigger" "hist:keys=func.sym,lat:vals=hitcount,lat:sort=lat" +compare_file_partial "events/synthetic/initcall_latency/enable" "0" + +compare_file_partial "events/initcall/initcall_start/trigger" "hist:keys=func:vals=hitcount:ts0=common_timestamp.usecs" +compare_file_partial "events/initcall/initcall_start/enable" "1" + +compare_file_partial "events/initcall/initcall_finish/trigger" 'hist:keys=func:vals=hitcount:lat=common_timestamp.usecs-\$ts0:sort=hitcount:size=2048:clock=global:onmatch(initcall.initcall_start).initcall_latency(func,\$lat)' +compare_file_partial "events/initcall/initcall_finish/enable" "1" + +compare_file "instances/foo/current_tracer" "function" +file_contains "instances/foo/set_ftrace_filter" "^user" +compare_file "instances/foo/buffer_size_kb" "512" +compare_mask "instances/foo/tracing_cpumask" "1" +compare_file "instances/foo/options/sym-addr" "0" +file_contains "instances/foo/trace_clock" '\[mono\]' +compare_file_partial "instances/foo/events/signal/signal_deliver/trigger" "snapshot" + +compare_file "instances/bar/current_tracer" "function" +file_contains "instances/bar/set_ftrace_filter" "^kernel" +compare_mask "instances/bar/tracing_cpumask" "2" +file_contains "instances/bar/trace_clock" '\[x86-tsc\]' + +file_contains "snapshot" "Snapshot is allocated" +compare_file "options/sym-addr" "1" +compare_file "events/initcall/enable" "1" +compare_file 
"buffer_size_kb" "1027" +compare_file "current_tracer" "function" +file_contains "set_ftrace_filter" '^vfs' + +exit 0 diff --git a/tools/testing/ktest/examples/bootconfigs/verify-functiongraph.sh b/tools/testing/ktest/examples/bootconfigs/verify-functiongraph.sh new file mode 100755 index 000000000000..b50baa10fe97 --- /dev/null +++ b/tools/testing/ktest/examples/bootconfigs/verify-functiongraph.sh @@ -0,0 +1,61 @@ +#!/bin/sh + +cd /sys/kernel/tracing + +compare_file() { + file="$1" + val="$2" + content=`cat $file` + if [ "$content" != "$val" ]; then + echo "FAILED: $file has '$content', expected '$val'" + exit 1 + fi +} + +compare_file_partial() { + file="$1" + val="$2" + content=`cat $file | sed -ne "/^$val/p"` + if [ -z "$content" ]; then + echo "FAILED: $file does not contain '$val'" + cat $file + exit 1 + fi +} + +file_contains() { + file=$1 + val="$2" + + if ! grep -q "$val" $file ; then + echo "FAILED: $file does not contain $val" + cat $file + exit 1 + fi +} + +compare_mask() { + file=$1 + val="$2" + + content=`cat $file | sed -ne "/^[0 ]*$val/p"` + if [ -z "$content" ]; then + echo "FAILED: $file does not have mask '$val'" + cat $file + exit 1 + fi +} + + +compare_file "tracing_on" "0" +compare_file "current_tracer" "function_graph" + +compare_file_partial "events/kprobes/start_event/enable" "1" +compare_file_partial "events/kprobes/start_event/trigger" "traceon" +file_contains "kprobe_events" 'start_event.*pci_proc_init' + +compare_file_partial "events/kprobes/end_event/enable" "1" +compare_file_partial "events/kprobes/end_event/trigger" "traceoff" +file_contains "kprobe_events" '^r.*end_event.*pci_proc_init' + +exit 0 diff --git a/tools/testing/ktest/examples/bootconfigs/verify-tracing.sh b/tools/testing/ktest/examples/bootconfigs/verify-tracing.sh new file mode 100755 index 000000000000..01e111e36e63 --- /dev/null +++ b/tools/testing/ktest/examples/bootconfigs/verify-tracing.sh @@ -0,0 +1,72 @@ +#!/bin/sh + +cd /sys/kernel/tracing + +compare_file() { + 
file="$1" + val="$2" + content=`cat $file` + if [ "$content" != "$val" ]; then + echo "FAILED: $file has '$content', expected '$val'" + exit 1 + fi +} + +compare_file_partial() { + file="$1" + val="$2" + content=`cat $file | sed -ne "/^$val/p"` + if [ -z "$content" ]; then + echo "FAILED: $file does not contain '$val'" + cat $file + exit 1 + fi +} + +file_contains() { + file=$1 + val="$2" + + if ! grep -q "$val" $file ; then + echo "FAILED: $file does not contain $val" + cat $file + exit 1 + fi +} + +compare_mask() { + file=$1 + val="$2" + + content=`cat $file | sed -ne "/^[0 ]*$val/p"` + if [ -z "$content" ]; then + echo "FAILED: $file does not have mask '$val'" + cat $file + exit 1 + fi +} + +compare_file "current_tracer" "function_graph" +compare_file "options/event-fork" "1" +compare_file "options/sym-addr" "1" +compare_file "options/stacktrace" "1" +compare_file "buffer_size_kb" "1024" +file_contains "snapshot" "Snapshot is allocated" +file_contains "trace_clock" '\[global\]' + +compare_file "events/initcall/enable" "1" +compare_file "events/task/task_newtask/enable" "1" +compare_file "events/sched/sched_process_exec/filter" "pid < 128" +compare_file "events/kprobes/enable" "1" + +compare_file "instances/bar/events/kprobes/myevent/enable" "1" +compare_file "instances/bar/events/kprobes/myevent2/enable" "1" +compare_file "instances/bar/events/kprobes/myevent3/enable" "1" + +compare_file "instances/foo/current_tracer" "function" +compare_file "instances/foo/tracing_on" "0" + +compare_file "/proc/sys/kernel/ftrace_dump_on_oops" "2" +compare_file "/proc/sys/kernel/traceoff_on_warning" "1" + +exit 0 diff --git a/tools/testing/ktest/examples/include/bootconfig.conf b/tools/testing/ktest/examples/include/bootconfig.conf new file mode 100644 index 000000000000..3b885de085bd --- /dev/null +++ b/tools/testing/ktest/examples/include/bootconfig.conf @@ -0,0 +1,69 @@ +# bootconfig.conf +# +# Tests to test some bootconfig scripts + +# List where on the target machine the 
initrd is used +INITRD := /boot/initramfs-test.img + +# Install bootconfig on the target machine and define the path here. +BOOTCONFIG := /usr/bin/bootconfig + +# Currently we just build the .config in the BUILD_DIR +BUILD_TYPE := oldconfig + +# Helper macro to run bootconfig on the target +# SSH is defined in include/defaults.conf +ADD_BOOTCONFIG := ${SSH} "${BOOTCONFIG} -d ${INITRD} && ${BOOTCONFIG} -a /tmp/${BOOTCONFIG_FILE} ${INITRD}" + +# This copies a bootconfig script to the target and then will +# add it to the initrd. SSH_USER is defined in include/defaults.conf +# and MACHINE is defined in the example configs. +BOOTCONFIG_TEST_PREP = scp ${BOOTCONFIG_PATH}${BOOTCONFIG_FILE} ${SSH_USER}@${MACHINE}:/tmp && ${ADD_BOOTCONFIG} + +# When a test is complete, remove the bootconfig from the initrd. +CLEAR_BOOTCONFIG := ${SSH} "${BOOTCONFIG} -d ${INITRD}" + +# Run a verifier on the target after it has booted, to make sure that the +# bootconfig script did what it was expected to do +DO_TEST = scp ${BOOTCONFIG_PATH}${BOOTCONFIG_VERIFY} ${SSH_USER}@${MACHINE}:/tmp && ${SSH} /tmp/${BOOTCONFIG_VERIFY} + +# Comment this out to not run the boot configs +RUN_BOOTCONFIG := 1 + +TEST_START IF DEFINED RUN_BOOTCONFIG +TEST_TYPE = test +TEST_NAME = bootconfig boottrace +# Just testing the bootconfig on initrd, no need to build the kernel +BUILD_TYPE = nobuild +BOOTCONFIG_FILE = boottrace.bconf +BOOTCONFIG_VERIFY = verify-boottrace.sh +ADD_CONFIG = ${ADD_CONFIG} ${BOOTCONFIG_PATH}/config-bootconfig +PRE_TEST = ${BOOTCONFIG_TEST_PREP} +PRE_TEST_DIE = 1 +TEST = ${DO_TEST} +POST_TEST = ${CLEAR_BOOTCONFIG} + +TEST_START IF DEFINED RUN_BOOTCONFIG +TEST_TYPE = test +TEST_NAME = bootconfig function graph +BUILD_TYPE = nobuild +BOOTCONFIG_FILE = functiongraph.bconf +BOOTCONFIG_VERIFY = verify-functiongraph.sh +ADD_CONFIG = ${ADD_CONFIG} ${BOOTCONFIG_PATH}/config-bootconfig +PRE_TEST = ${BOOTCONFIG_TEST_PREP} +PRE_TEST_DIE = 1 +TEST = ${DO_TEST} +POST_TEST = ${CLEAR_BOOTCONFIG} + 
+TEST_START IF DEFINED RUN_BOOTCONFIG +TEST_TYPE = test +TEST_NAME = bootconfig tracing +BUILD_TYPE = nobuild +BOOTCONFIG_FILE = tracing.bconf +BOOTCONFIG_VERIFY = verify-tracing.sh +ADD_CONFIG = ${ADD_CONFIG} ${BOOTCONFIG_PATH}/config-bootconfig +PRE_TEST = ${BOOTCONFIG_TEST_PREP} +PRE_TEST_DIE = 1 +TEST = ${DO_TEST} +POST_TEST = ${CLEAR_BOOTCONFIG} + diff --git a/tools/testing/ktest/examples/kvm.conf b/tools/testing/ktest/examples/kvm.conf index fbc134f9ac6e..c700e8bb7fde 100644 --- a/tools/testing/ktest/examples/kvm.conf +++ b/tools/testing/ktest/examples/kvm.conf @@ -90,3 +90,4 @@ INCLUDE include/patchcheck.conf INCLUDE include/tests.conf INCLUDE include/bisect.conf INCLUDE include/min-config.conf +INCLUDE include/bootconfig.conf \ No newline at end of file From bb1b24cf41b5b3b96a921f80f9799e7be75f167d Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:19 +0200 Subject: [PATCH 21/49] trace/hwlat: Fix Clark's email Clark's email is williams@redhat.com. No functional change. Link: https://lkml.kernel.org/r/6fa4b49e17ab8a1ff19c335ab7cde38d8afb0e29.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Williams Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 632ef88131a9..0a5635401125 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -34,7 +34,7 @@ * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. 
* - * Includes useful feedback from Clark Williams + * Includes useful feedback from Clark Williams * */ #include From 8fa826b7344d6752f5cfd72380d9fe7bd8c6b928 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:20 +0200 Subject: [PATCH 22/49] trace/hwlat: Implement the mode config option Provides the "mode" config to the hardware latency detector. hwlatd has two different operation modes. The default mode is the "round-robin" one, in which a single hwlatd thread runs, migrating among the allowed CPUs in a "round-robin" fashion. This is the current behavior. The "none" sets the allowed cpumask for a single hwlatd thread at the startup, but skips the round-robin, letting the scheduler handle the migration. In preparation to the per-cpu mode. Link: https://lkml.kernel.org/r/f3b1271262aa030c680e26615c1b9b2d71e55e92.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Williams Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/hwlat_detector.rst | 12 +- kernel/trace/trace_hwlat.c | 179 +++++++++++++++++++++++-- 2 files changed, 175 insertions(+), 16 deletions(-) diff --git a/Documentation/trace/hwlat_detector.rst b/Documentation/trace/hwlat_detector.rst index 5739349649c8..4d952df0586a 100644 --- a/Documentation/trace/hwlat_detector.rst +++ b/Documentation/trace/hwlat_detector.rst @@ -76,8 +76,12 @@ in /sys/kernel/tracing: - tracing_cpumask - the CPUs to move the hwlat thread across - hwlat_detector/width - specified amount of time to spin within window (usecs) - hwlat_detector/window - amount of time between (width) runs (usecs) + - hwlat_detector/mode - the thread mode -The hwlat detector's kernel thread will migrate across each CPU specified in -tracing_cpumask between each window. To limit the migration, either modify -tracing_cpumask, or modify the hwlat kernel thread (named [hwlatd]) CPU -affinity directly, and the migration will stop. +By default, the hwlat detector's kernel thread will migrate across each CPU +specified in cpumask at the beginning of a new window, in a round-robin +fashion. 
This behavior can be changed by changing the thread mode, +the available options are: + + - none: do not force migration + - round-robin: migrate across each CPU specified in cpumask [default] diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 0a5635401125..43a436d85a01 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -59,6 +59,14 @@ static struct task_struct *hwlat_kthread; static struct dentry *hwlat_sample_width; /* sample width us */ static struct dentry *hwlat_sample_window; /* sample window us */ +static struct dentry *hwlat_thread_mode; /* hwlat thread mode */ + +enum { + MODE_NONE = 0, + MODE_ROUND_ROBIN, + MODE_MAX +}; +static char *thread_mode_str[] = { "none", "round-robin" }; /* Save the previous tracing_thresh value */ static unsigned long save_tracing_thresh; @@ -96,11 +104,16 @@ static struct hwlat_data { u64 sample_window; /* total sampling window (on+off) */ u64 sample_width; /* active sampling portion of window */ + int thread_mode; /* thread mode */ + } hwlat_data = { .sample_window = DEFAULT_SAMPLE_WINDOW, .sample_width = DEFAULT_SAMPLE_WIDTH, + .thread_mode = MODE_ROUND_ROBIN }; +static bool hwlat_busy; + static void trace_hwlat_sample(struct hwlat_sample *sample) { struct trace_array *tr = hwlat_trace; @@ -328,7 +341,8 @@ static int kthread_fn(void *data) while (!kthread_should_stop()) { - move_to_next_cpu(); + if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) + move_to_next_cpu(); local_irq_disable(); get_sample(); @@ -351,7 +365,7 @@ static int kthread_fn(void *data) return 0; } -/** +/* * start_kthread - Kick off the hardware latency sampling/detector kthread * * This starts the kernel thread that will sit and sample the CPU timestamp @@ -366,11 +380,6 @@ static int start_kthread(struct trace_array *tr) if (hwlat_kthread) return 0; - /* Just pick the first CPU on first iteration */ - get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); - put_online_cpus(); - 
next_cpu = cpumask_first(current_mask); kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { @@ -378,8 +387,19 @@ static int start_kthread(struct trace_array *tr) return -ENOMEM; } - cpumask_clear(current_mask); - cpumask_set_cpu(next_cpu, current_mask); + + /* Just pick the first CPU on first iteration */ + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); + put_online_cpus(); + + if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) { + next_cpu = cpumask_first(current_mask); + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + + } + sched_setaffinity(kthread->pid, current_mask); hwlat_kthread = kthread; @@ -388,7 +408,7 @@ static int start_kthread(struct trace_array *tr) return 0; } -/** +/* * stop_kthread - Inform the hardware latency sampling/detector kthread to stop * * This kicks the running hardware latency sampling/detector kernel thread and @@ -511,6 +531,129 @@ hwlat_window_write(struct file *filp, const char __user *ubuf, return cnt; } +static void *s_mode_start(struct seq_file *s, loff_t *pos) +{ + int mode = *pos; + + mutex_lock(&hwlat_data.lock); + + if (mode >= MODE_MAX) + return NULL; + + return pos; +} + +static void *s_mode_next(struct seq_file *s, void *v, loff_t *pos) +{ + int mode = ++(*pos); + + if (mode >= MODE_MAX) + return NULL; + + return pos; +} + +static int s_mode_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + int mode = *pos; + + if (mode == hwlat_data.thread_mode) + seq_printf(s, "[%s]", thread_mode_str[mode]); + else + seq_printf(s, "%s", thread_mode_str[mode]); + + if (mode != MODE_MAX) + seq_puts(s, " "); + + return 0; +} + +static void s_mode_stop(struct seq_file *s, void *v) +{ + seq_puts(s, "\n"); + mutex_unlock(&hwlat_data.lock); +} + +static const struct seq_operations thread_mode_seq_ops = { + .start = s_mode_start, + .next = s_mode_next, + .show = s_mode_show, + .stop = s_mode_stop +}; + +static int hwlat_mode_open(struct inode *inode, 
struct file *file) +{ + return seq_open(file, &thread_mode_seq_ops); +}; + +static void hwlat_tracer_start(struct trace_array *tr); +static void hwlat_tracer_stop(struct trace_array *tr); + +/** + * hwlat_mode_write - Write function for "mode" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "mode" interface + * to the hardware latency detector. hwlatd has different operation modes. + * The "none" sets the allowed cpumask for a single hwlatd thread at the + * startup and lets the scheduler handle the migration. The default mode is + * the "round-robin" one, in which a single hwlatd thread runs, migrating + * among the allowed CPUs in a round-robin fashion. + */ +static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = hwlat_trace; + const char *mode; + char buf[64]; + int ret, i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + mode = strstrip(buf); + + ret = -EINVAL; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop + * and hwlat_busy. 
+ */ + mutex_lock(&trace_types_lock); + if (hwlat_busy) + hwlat_tracer_stop(tr); + + mutex_lock(&hwlat_data.lock); + + for (i = 0; i < MODE_MAX; i++) { + if (strcmp(mode, thread_mode_str[i]) == 0) { + hwlat_data.thread_mode = i; + ret = cnt; + } + } + + mutex_unlock(&hwlat_data.lock); + + if (hwlat_busy) + hwlat_tracer_start(tr); + mutex_unlock(&trace_types_lock); + + *ppos += cnt; + + + + return ret; +} + static const struct file_operations width_fops = { .open = tracing_open_generic, .read = hwlat_read, @@ -523,6 +666,13 @@ static const struct file_operations window_fops = { .write = hwlat_window_write, }; +static const struct file_operations thread_mode_fops = { + .open = hwlat_mode_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = hwlat_mode_write +}; /** * init_tracefs - A function to initialize the tracefs interface files * @@ -558,6 +708,13 @@ static int init_tracefs(void) if (!hwlat_sample_width) goto err; + hwlat_thread_mode = trace_create_file("mode", 0644, + top_dir, + NULL, + &thread_mode_fops); + if (!hwlat_thread_mode) + goto err; + return 0; err: @@ -579,8 +736,6 @@ static void hwlat_tracer_stop(struct trace_array *tr) stop_kthread(); } -static bool hwlat_busy; - static int hwlat_tracer_init(struct trace_array *tr) { /* Only allow one instance to enable this */ From 7bb7d802af1d0b2608ef5afafcf968073a50acb7 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:21 +0200 Subject: [PATCH 23/49] trace/hwlat: Switch disable_migrate to mode none When in the round-robin mode, if the tracer detects a change in the hwlatd thread affinity by an external tool, e.g., taskset, the round-robin logic is disabled. The disable_migrate variable currently tracks this. With the addition of the "mode" config and the mode "none," the disable_migrate logic is equivalent to switch to the "none" mode. 
Hence, instead of using a hidden variable to track this behavior, switch the mode to none, informing the user about this change. Link: https://lkml.kernel.org/r/a679af672458d6b1f62252605905c5214030f247.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 43a436d85a01..bae74b95cf55 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -286,7 +286,6 @@ out: } static struct cpumask save_cpumask; -static bool disable_migrate; static void move_to_next_cpu(void) { @@ -294,15 +293,13 @@ static void move_to_next_cpu(void) struct trace_array *tr = hwlat_trace; int next_cpu; - if (disable_migrate) - return; /* * If for some reason the user modifies the CPU affinity * of this thread, then stop migrating for the duration * of the current test. */ if (!cpumask_equal(current_mask, current->cpus_ptr)) - goto disable; + goto change_mode; get_online_cpus(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); @@ -313,7 +310,7 @@ static void move_to_next_cpu(void) next_cpu = cpumask_first(current_mask); if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! 
*/ - goto disable; + goto change_mode; cpumask_clear(current_mask); cpumask_set_cpu(next_cpu, current_mask); @@ -321,8 +318,9 @@ static void move_to_next_cpu(void) sched_setaffinity(0, current_mask); return; - disable: - disable_migrate = true; + change_mode: + hwlat_data.thread_mode = MODE_NONE; + pr_info(BANNER "cpumask changed while in round-robin mode, switching to mode none\n"); } /* @@ -744,7 +742,6 @@ static int hwlat_tracer_init(struct trace_array *tr) hwlat_trace = tr; - disable_migrate = false; hwlat_data.count = 0; tr->max_latency = 0; save_tracing_thresh = tracing_thresh; From f46b16520a087e892a189db9c23ccf7e9bb5fa69 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:22 +0200 Subject: [PATCH 24/49] trace/hwlat: Implement the per-cpu mode Implements the per-cpu mode in which a sampling thread is created for each cpu in the "cpus" (and tracing_mask). The per-cpu mode has the potential to speed up the hwlat detection by running on multiple CPUs at the same time, at the cost of higher cpu usage with irqs disabled. Use with care. [ Changed get_cpu_data() to static. Reported-by: kernel test robot ] Link: https://lkml.kernel.org/r/ec06d0ab340e8460d293772faba19ad8a5c371aa.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/hwlat_detector.rst | 3 +- kernel/trace/trace_hwlat.c | 186 ++++++++++++++++++++----- 2 files changed, 152 insertions(+), 37 deletions(-) diff --git a/Documentation/trace/hwlat_detector.rst b/Documentation/trace/hwlat_detector.rst index 4d952df0586a..de94b499b0bc 100644 --- a/Documentation/trace/hwlat_detector.rst +++ b/Documentation/trace/hwlat_detector.rst @@ -78,10 +78,11 @@ in /sys/kernel/tracing: - hwlat_detector/window - amount of time between (width) runs (usecs) - hwlat_detector/mode - the thread mode -By default, the hwlat detector's kernel thread will migrate across each CPU +By default, one hwlat detector's kernel thread will migrate across each CPU specified in cpumask at the beginning of a new window, in a round-robin fashion. This behavior can be changed by changing the thread mode, the available options are: - none: do not force migration - round-robin: migrate across each CPU specified in cpumask [default] + - per-cpu: create one thread for each cpu in tracing_cpumask diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index bae74b95cf55..3957b36826e2 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -54,9 +54,6 @@ static struct trace_array *hwlat_trace; #define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ #define DEFAULT_LAT_THRESHOLD 10 /* 10us */ -/* sampling thread*/ -static struct task_struct *hwlat_kthread; - static struct dentry *hwlat_sample_width; /* sample width us */ static struct dentry *hwlat_sample_window; /* sample window us */ static struct dentry *hwlat_thread_mode; /* hwlat thread mode */ @@ -64,18 +61,26 @@ static struct dentry *hwlat_thread_mode; /* hwlat thread mode */ enum { MODE_NONE = 0, MODE_ROUND_ROBIN, + MODE_PER_CPU, MODE_MAX }; -static char *thread_mode_str[] = { "none", "round-robin" }; 
+static char *thread_mode_str[] = { "none", "round-robin", "per-cpu" }; /* Save the previous tracing_thresh value */ static unsigned long save_tracing_thresh; -/* NMI timestamp counters */ -static u64 nmi_ts_start; -static u64 nmi_total_ts; -static int nmi_count; -static int nmi_cpu; +/* runtime kthread data */ +struct hwlat_kthread_data { + struct task_struct *kthread; + /* NMI timestamp counters */ + u64 nmi_ts_start; + u64 nmi_total_ts; + int nmi_count; + int nmi_cpu; +}; + +struct hwlat_kthread_data hwlat_single_cpu_data; +DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data); /* Tells NMIs to call back to the hwlat tracer to record timestamps */ bool trace_hwlat_callback_enabled; @@ -112,6 +117,14 @@ static struct hwlat_data { .thread_mode = MODE_ROUND_ROBIN }; +static struct hwlat_kthread_data *get_cpu_data(void) +{ + if (hwlat_data.thread_mode == MODE_PER_CPU) + return this_cpu_ptr(&hwlat_per_cpu_data); + else + return &hwlat_single_cpu_data; +} + static bool hwlat_busy; static void trace_hwlat_sample(struct hwlat_sample *sample) @@ -149,7 +162,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) void trace_hwlat_callback(bool enter) { - if (smp_processor_id() != nmi_cpu) + struct hwlat_kthread_data *kdata = get_cpu_data(); + + if (!kdata->kthread) return; /* @@ -158,13 +173,13 @@ void trace_hwlat_callback(bool enter) */ if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { if (enter) - nmi_ts_start = time_get(); + kdata->nmi_ts_start = time_get(); else - nmi_total_ts += time_get() - nmi_ts_start; + kdata->nmi_total_ts += time_get() - kdata->nmi_ts_start; } if (enter) - nmi_count++; + kdata->nmi_count++; } /** @@ -176,6 +191,7 @@ void trace_hwlat_callback(bool enter) */ static int get_sample(void) { + struct hwlat_kthread_data *kdata = get_cpu_data(); struct trace_array *tr = hwlat_trace; struct hwlat_sample s; time_type start, t1, t2, last_t2; @@ -188,9 +204,8 @@ static int get_sample(void) do_div(thresh, NSEC_PER_USEC); /* modifies interval 
value */ - nmi_cpu = smp_processor_id(); - nmi_total_ts = 0; - nmi_count = 0; + kdata->nmi_total_ts = 0; + kdata->nmi_count = 0; /* Make sure NMIs see this first */ barrier(); @@ -260,15 +275,15 @@ static int get_sample(void) ret = 1; /* We read in microseconds */ - if (nmi_total_ts) - do_div(nmi_total_ts, NSEC_PER_USEC); + if (kdata->nmi_total_ts) + do_div(kdata->nmi_total_ts, NSEC_PER_USEC); hwlat_data.count++; s.seqnum = hwlat_data.count; s.duration = sample; s.outer_duration = outer_sample; - s.nmi_total_ts = nmi_total_ts; - s.nmi_count = nmi_count; + s.nmi_total_ts = kdata->nmi_total_ts; + s.nmi_count = kdata->nmi_count; s.count = count; trace_hwlat_sample(&s); @@ -364,21 +379,40 @@ static int kthread_fn(void *data) } /* - * start_kthread - Kick off the hardware latency sampling/detector kthread + * stop_stop_kthread - Inform the hardware latency sampling/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static void stop_single_kthread(void) +{ + struct hwlat_kthread_data *kdata = get_cpu_data(); + struct task_struct *kthread = kdata->kthread; + + if (!kthread) + return; + + kthread_stop(kthread); + kdata->kthread = NULL; +} + + +/* + * start_single_kthread - Kick off the hardware latency sampling/detector kthread * * This starts the kernel thread that will sit and sample the CPU timestamp * counter (TSC or similar) and look for potential hardware latencies. 
*/ -static int start_kthread(struct trace_array *tr) +static int start_single_kthread(struct trace_array *tr) { + struct hwlat_kthread_data *kdata = get_cpu_data(); struct cpumask *current_mask = &save_cpumask; struct task_struct *kthread; int next_cpu; - if (hwlat_kthread) + if (kdata->kthread) return 0; - kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); @@ -400,24 +434,97 @@ static int start_kthread(struct trace_array *tr) sched_setaffinity(kthread->pid, current_mask); - hwlat_kthread = kthread; + kdata->kthread = kthread; wake_up_process(kthread); return 0; } /* - * stop_kthread - Inform the hardware latency sampling/detector kthread to stop + * stop_cpu_kthread - Stop a hwlat cpu kthread + */ +static void stop_cpu_kthread(unsigned int cpu) +{ + struct task_struct *kthread; + + kthread = per_cpu(hwlat_per_cpu_data, cpu).kthread; + if (kthread) + kthread_stop(kthread); +} + +/* + * stop_per_cpu_kthreads - Inform the hardware latency sampling/detector kthread to stop * - * This kicks the running hardware latency sampling/detector kernel thread and + * This kicks the running hardware latency sampling/detector kernel threads and * tells it to stop sampling now. Use this on unload and at system shutdown. 
*/ -static void stop_kthread(void) +static void stop_per_cpu_kthreads(void) { - if (!hwlat_kthread) - return; - kthread_stop(hwlat_kthread); - hwlat_kthread = NULL; + unsigned int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + stop_cpu_kthread(cpu); + put_online_cpus(); +} + +/* + * start_cpu_kthread - Start a hwlat cpu kthread + */ +static int start_cpu_kthread(unsigned int cpu) +{ + struct task_struct *kthread; + char comm[24]; + + snprintf(comm, 24, "hwlatd/%d", cpu); + + kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm); + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + return -ENOMEM; + } + + per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread; + wake_up_process(kthread); + + return 0; +} + +/* + * start_per_cpu_kthreads - Kick off the hardware latency sampling/detector kthreads + * + * This starts the kernel threads that will sit on potentially all cpus and + * sample the CPU timestamp counter (TSC or similar) and look for potential + * hardware latencies. + */ +static int start_per_cpu_kthreads(struct trace_array *tr) +{ + struct cpumask *current_mask = &save_cpumask; + unsigned int cpu; + int retval; + + get_online_cpus(); + /* + * Run only on CPUs in which hwlat is allowed to run. + */ + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); + + for_each_online_cpu(cpu) + per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL; + + for_each_cpu(cpu, current_mask) { + retval = start_cpu_kthread(cpu); + if (retval) + goto out_error; + } + put_online_cpus(); + + return 0; + +out_error: + put_online_cpus(); + stop_per_cpu_kthreads(); + return retval; } /* @@ -600,7 +707,8 @@ static void hwlat_tracer_stop(struct trace_array *tr); * The "none" sets the allowed cpumask for a single hwlatd thread at the * startup and lets the scheduler handle the migration. The default mode is * the "round-robin" one, in which a single hwlatd thread runs, migrating - * among the allowed CPUs in a round-robin fashion. 
+ * among the allowed CPUs in a round-robin fashion. The "per-cpu" mode + * creates one hwlatd thread per allowed CPU. */ static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -724,14 +832,20 @@ static void hwlat_tracer_start(struct trace_array *tr) { int err; - err = start_kthread(tr); + if (hwlat_data.thread_mode == MODE_PER_CPU) + err = start_per_cpu_kthreads(tr); + else + err = start_single_kthread(tr); if (err) pr_err(BANNER "Cannot start hwlat kthread\n"); } static void hwlat_tracer_stop(struct trace_array *tr) { - stop_kthread(); + if (hwlat_data.thread_mode == MODE_PER_CPU) + stop_per_cpu_kthreads(); + else + stop_single_kthread(); } static int hwlat_tracer_init(struct trace_array *tr) @@ -760,7 +874,7 @@ static int hwlat_tracer_init(struct trace_array *tr) static void hwlat_tracer_reset(struct trace_array *tr) { - stop_kthread(); + hwlat_tracer_stop(tr); /* the tracing threshold is static between runs */ last_tracing_thresh = tracing_thresh; From bc87cf0a08d437ea192b15f0918cb581a8698f15 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:23 +0200 Subject: [PATCH 25/49] trace: Add a generic function to read/write u64 values from tracefs The hwlat detector and (in preparation for) the osnoise/timerlat tracers have a set of u64 parameters that the user can read/write via tracefs. For instance, we have hwlat_detector's window and width. To reduce the code duplication, hwlat's window and width share the same read function. However, they do not share the write functions because they do different parameter checks. For instance, the width needs to be smaller than the window, while the window needs to be larger than the width. The same pattern repeats on osnoise/timerlat, and a large portion of the code was devoted to the write function. 
Despite having different checks, the write functions have the same structure: read a user-space buffer take the lock that protects the value check for minimum and maximum acceptable values save the value release the lock return success or error To reduce the code duplication also in the write functions, this patch provides a generic read and write implementation for u64 values that need to be within some minimum and/or maximum parameters, while (potentially) being protected by a lock. To use this interface, the structure trace_min_max_param needs to be filled: struct trace_min_max_param { struct mutex *lock; u64 *val; u64 *min; u64 *max; }; The desired value is stored on the variable pointed to by *val. If *min points to a minimum acceptable value, it will be checked during the write operation. Likewise, if *max points to a maximum allowable value, it will be checked during the write operation. Finally, if *lock points to a mutex, it will be taken at the beginning of the operation and released at the end. The definition of a trace_min_max_param needs to be passed as the (private) *data for tracefs_create_file(), and the trace_min_max_fops (added by this patch) as the *fops file_operations. Link: https://lkml.kernel.org/r/3e35760a7c8b5c55f16ae5ad5fc54a0e71cbe647.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 85 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 18 ++++++++++ 2 files changed, 103 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d352fb4b7709..27bf203ef05a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7583,6 +7583,91 @@ static const struct file_operations snapshot_raw_fops = { #endif /* CONFIG_TRACER_SNAPSHOT */ +/* + * trace_min_max_write - Write a u64 value to a trace_min_max_param struct + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function implements the write interface for a struct trace_min_max_param. + * The filp->private_data must point to a trace_min_max_param structure that + * defines where to write the value, the min and the max acceptable values, + * and a lock to protect the write. 
+ */ +static ssize_t +trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_min_max_param *param = filp->private_data; + u64 val; + int err; + + if (!param) + return -EFAULT; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + if (param->lock) + mutex_lock(param->lock); + + if (param->min && val < *param->min) + err = -EINVAL; + + if (param->max && val > *param->max) + err = -EINVAL; + + if (!err) + *param->val = val; + + if (param->lock) + mutex_unlock(param->lock); + + if (err) + return err; + + return cnt; +} + +/* + * trace_min_max_read - Read a u64 value from a trace_min_max_param struct + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function implements the read interface for a struct trace_min_max_param. + * The filp->private_data must point to a trace_min_max_param struct with valid + * data. 
+ */ +static ssize_t +trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_min_max_param *param = filp->private_data; + char buf[U64_STR_SIZE]; + int len; + u64 val; + + if (!param) + return -EFAULT; + + val = *param->val; + + if (cnt > sizeof(buf)) + cnt = sizeof(buf); + + len = snprintf(buf, sizeof(buf), "%llu\n", val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +const struct file_operations trace_min_max_fops = { + .open = tracing_open_generic, + .read = trace_min_max_read, + .write = trace_min_max_write, +}; + #define TRACING_LOG_ERRS_MAX 8 #define TRACING_LOG_LOC_MAX 128 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cd80d046c7a5..22f8c652ef8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1952,4 +1952,22 @@ static inline bool is_good_name(const char *name) return true; } +/* + * This is a generic way to read and write a u64 value from a file in tracefs. + * + * The value is stored on the variable pointed by *val. The value needs + * to be at least *min and at most *max. The write is protected by an + * existing *lock. + */ +struct trace_min_max_param { + struct mutex *lock; + u64 *val; + u64 *min; + u64 *max; +}; + +#define U64_STR_SIZE 24 /* 20 digits max */ + +extern const struct file_operations trace_min_max_fops; + #endif /* _LINUX_KERNEL_TRACE_H */ From f27a1c9e1ba1e4f18f2c01e7bcbc400651ed821d Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:24 +0200 Subject: [PATCH 26/49] trace/hwlat: Use trace_min_max_param for width and window params Use the trace_min_max_param to reduce code duplication. No functional change. 
Link: https://lkml.kernel.org/r/b91accd5a7c6c14ea02d3379aae974ba22b47dd6.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 145 ++++++------------------------------- 1 file changed, 24 insertions(+), 121 deletions(-) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 3957b36826e2..44f46bc1140f 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -527,115 +527,6 @@ out_error: return retval; } -/* - * hwlat_read - Wrapper read function for reading both window and width - * @filp: The active open file structure - * @ubuf: The userspace provided buffer to read value into - * @cnt: The maximum number of bytes to read - * @ppos: The current "file" position - * - * This function provides a generic read implementation for the global state - * "hwlat_data" structure filesystem entries. 
- */ -static ssize_t hwlat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[U64STR_SIZE]; - u64 *entry = filp->private_data; - u64 val; - int len; - - if (!entry) - return -EFAULT; - - if (cnt > sizeof(buf)) - cnt = sizeof(buf); - - val = *entry; - - len = snprintf(buf, sizeof(buf), "%llu\n", val); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); -} - -/** - * hwlat_width_write - Write function for "width" entry - * @filp: The active open file structure - * @ubuf: The user buffer that contains the value to write - * @cnt: The maximum number of bytes to write to "file" - * @ppos: The current position in @file - * - * This function provides a write implementation for the "width" interface - * to the hardware latency detector. It can be used to configure - * for how many us of the total window us we will actively sample for any - * hardware-induced latency periods. Obviously, it is not possible to - * sample constantly and have the system respond to a sample reader, or, - * worse, without having the system appear to have gone out to lunch. It - * is enforced that width is less that the total window size. - */ -static ssize_t -hwlat_width_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - u64 val; - int err; - - err = kstrtoull_from_user(ubuf, cnt, 10, &val); - if (err) - return err; - - mutex_lock(&hwlat_data.lock); - if (val < hwlat_data.sample_window) - hwlat_data.sample_width = val; - else - err = -EINVAL; - mutex_unlock(&hwlat_data.lock); - - if (err) - return err; - - return cnt; -} - -/** - * hwlat_window_write - Write function for "window" entry - * @filp: The active open file structure - * @ubuf: The user buffer that contains the value to write - * @cnt: The maximum number of bytes to write to "file" - * @ppos: The current position in @file - * - * This function provides a write implementation for the "window" interface - * to the hardware latency detector. 
The window is the total time - * in us that will be considered one sample period. Conceptually, windows - * occur back-to-back and contain a sample width period during which - * actual sampling occurs. Can be used to write a new total window size. It - * is enforced that any value written must be greater than the sample width - * size, or an error results. - */ -static ssize_t -hwlat_window_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - u64 val; - int err; - - err = kstrtoull_from_user(ubuf, cnt, 10, &val); - if (err) - return err; - - mutex_lock(&hwlat_data.lock); - if (hwlat_data.sample_width < val) - hwlat_data.sample_window = val; - else - err = -EINVAL; - mutex_unlock(&hwlat_data.lock); - - if (err) - return err; - - return cnt; -} - static void *s_mode_start(struct seq_file *s, loff_t *pos) { int mode = *pos; @@ -760,16 +651,28 @@ static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf, return ret; } -static const struct file_operations width_fops = { - .open = tracing_open_generic, - .read = hwlat_read, - .write = hwlat_width_write, +/* + * The width parameter is read/write using the generic trace_min_max_param + * method. The *val is protected by the hwlat_data lock and is upper + * bounded by the window parameter. + */ +static struct trace_min_max_param hwlat_width = { + .lock = &hwlat_data.lock, + .val = &hwlat_data.sample_width, + .max = &hwlat_data.sample_window, + .min = NULL, }; -static const struct file_operations window_fops = { - .open = tracing_open_generic, - .read = hwlat_read, - .write = hwlat_window_write, +/* + * The window parameter is read/write using the generic trace_min_max_param + * method. The *val is protected by the hwlat_data lock and is lower + * bounded by the width parameter. 
+ */ +static struct trace_min_max_param hwlat_window = { + .lock = &hwlat_data.lock, + .val = &hwlat_data.sample_window, + .max = NULL, + .min = &hwlat_data.sample_width, }; static const struct file_operations thread_mode_fops = { @@ -802,15 +705,15 @@ static int init_tracefs(void) hwlat_sample_window = tracefs_create_file("window", 0640, top_dir, - &hwlat_data.sample_window, - &window_fops); + &hwlat_window, + &trace_min_max_fops); if (!hwlat_sample_window) goto err; hwlat_sample_width = tracefs_create_file("width", 0644, top_dir, - &hwlat_data.sample_width, - &width_fops); + &hwlat_width, + &trace_min_max_fops); if (!hwlat_sample_width) goto err; From aa892f8c887dd4331458d04de9425cde6664c694 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:25 +0200 Subject: [PATCH 27/49] trace/hwlat: Remove printk from sampling loop hwlat has some time operation checks on the sample loop, and it is currently using pr_err (printk) to report them. The problem is that this can lead the system to an unresponsive state due to an overflow of printk messages. This problem can be mitigated by writing the error message to the trace buffer. Remove the printk messages from the sampling loop, switching them to messages in the trace buffer. No functional change. Link: https://lkml.kernel.org/r/9d77c34869748aa105e965c769d24642914eea3a.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 44f46bc1140f..a625bfdb844e 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -182,6 +182,15 @@ void trace_hwlat_callback(bool enter) kdata->nmi_count++; } +/* + * hwlat_err - report a hwlat error. + */ +#define hwlat_err(msg) ({ \ + struct trace_array *tr = hwlat_trace; \ + \ + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \ +}) + /** * get_sample - sample the CPU TSC and look for likely hardware latencies * @@ -225,7 +234,7 @@ static int get_sample(void) outer_diff = time_to_us(time_sub(t1, last_t2)); /* This shouldn't happen */ if (outer_diff < 0) { - pr_err(BANNER "time running backwards\n"); + hwlat_err(BANNER "time running backwards\n"); goto out; } if (outer_diff > outer_sample) @@ -237,7 +246,7 @@ static int get_sample(void) /* Check for possible overflows */ if (total < last_total) { - pr_err("Time total overflowed\n"); + hwlat_err("Time total overflowed\n"); break; } last_total = total; @@ -253,7 +262,7 @@ static int get_sample(void) /* This shouldn't happen */ if (diff < 0) { - pr_err(BANNER "time running backwards\n"); + hwlat_err(BANNER "time running backwards\n"); goto out; } From 62de4f29e9174e67beb8d34ef5ced6730e087a31 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jun 2021 16:42:26 +0200 Subject: [PATCH 28/49] trace: Add __print_ns_to_secs() and __print_ns_without_secs() helpers To have nanosecond output displayed in a more human readable format, its nicer to convert it to a seconds format (XXX.YYYYYYYYY). The problem is that to do so, the numbers must be divided by NSEC_PER_SEC, and moded too. 
But as these numbers are 64 bit, this can not be done simply with '/' and '%' operators, but must use do_div() instead. Instead of performing the expensive do_div() in the hot path of the tracepoint, it is more efficient to perform it during the output phase. But passing in do_div() can confuse the parser, and do_div() doesn't work exactly like a normal C function. It modifies the number in place, and we don't want to modify the actual values in the ring buffer. Two helper functions are now created: __print_ns_to_secs() and __print_ns_without_secs() They both take a value of nanoseconds, and the former will return that number divided by NSEC_PER_SEC, and the latter will mod it with NSEC_PER_SEC giving a way to print a nice human readable format: __print_fmt("time=%llu.%09u", __print_ns_to_secs(REC->nsec_val), __print_ns_without_secs(REC->nsec_val)) Link: https://lkml.kernel.org/r/e503b903045496c4ccde52843e1e318b422f7a56.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Steven Rostedt Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- include/trace/trace_events.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 8268bf747d6f..acc17194c160 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -358,6 +358,21 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_hex_dump_seq(p, prefix_str, prefix_type, \ rowsize, groupsize, buf, len, ascii) +#undef __print_ns_to_secs +#define __print_ns_to_secs(value) \ + ({ \ + u64 ____val = (u64)(value); \ + do_div(____val, NSEC_PER_SEC); \ + ____val; \ + }) + +#undef __print_ns_without_secs +#define __print_ns_without_secs(value) \ + ({ \ + u64 ____val = (u64)(value); \ + (u32) do_div(____val, NSEC_PER_SEC); \ + }) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ @@ -736,6 +751,16 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_array #undef __print_hex_dump +/* + * The below is not executed in the kernel. It is only what is + * displayed in the print format for userspace to parse. + */ +#undef __print_ns_to_secs +#define __print_ns_to_secs(val) (val) / 1000000000UL + +#undef __print_ns_without_secs +#define __print_ns_without_secs(val) (val) % 1000000000UL + #undef TP_printk #define TP_printk(fmt, args...) 
"\"" fmt "\", " __stringify(args) From 6880c987e45172fdaca0b4c07b0990f5b3c74f70 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 25 Jun 2021 19:47:33 -0400 Subject: [PATCH 29/49] tracing: Add LATENCY_FS_NOTIFY to define if latency_fsnotify() is defined With the coming addition of the osnoise tracer, the configs needed to include the latency_fsnotify() has become more complex, and to keep the declaration in the header file the same as in the C file, just have the logic needed to define it in one place, and that defines LATENCY_FS_NOTIFY which will be used in the C code. Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +-- kernel/trace/trace.h | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 27bf203ef05a..60492464281e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1690,8 +1690,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) unsigned long __read_mostly tracing_thresh; static const struct file_operations tracing_max_lat_fops; -#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ - defined(CONFIG_FSNOTIFY) +#ifdef LATENCY_FS_NOTIFY static struct workqueue_struct *fsnotify_wq; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 22f8c652ef8b..87588d1e24ca 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,13 +677,13 @@ void update_max_tr_single(struct trace_array *tr, #if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ defined(CONFIG_FSNOTIFY) +#define LATENCY_FS_NOTIFY +#endif +#ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); - #else - static inline void latency_fsnotify(struct trace_array *tr) { } - #endif #ifdef CONFIG_STACKTRACE From bce29ac9ce0bb0b0b146b687ab978378c21e9078 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:27 +0200 Subject: 
[PATCH 30/49] trace: Add osnoise tracer In the context of high-performance computing (HPC), the Operating System Noise (*osnoise*) refers to the interference experienced by an application due to activities inside the operating system. In the context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the system. Moreover, hardware-related jobs can also cause noise, for example, via SMIs. The osnoise tracer leverages the hwlat_detector by running a similar loop with preemption, SoftIRQs and IRQs enabled, thus allowing all the sources of *osnoise* during its execution. Using the same approach of hwlat, osnoise takes note of the entry and exit point of any source of interferences, increasing a per-cpu interference counter. The osnoise tracer also saves an interference counter for each source of interference. The interference counter for NMI, IRQs, SoftIRQs, and threads is increased anytime the tool observes these interferences' entry events. When a noise happens without any interference from the operating system level, the hardware noise counter increases, pointing to a hardware-related noise. In this way, osnoise can account for any source of interference. At the end of the period, the osnoise tracer prints the sum of all noise, the max single noise, the percentage of CPU available for the thread, and the counters for the noise sources. Usage Write the ASCII text "osnoise" into the current_tracer file of the tracing system (generally mounted at /sys/kernel/tracing). 
For example:: [root@f32 ~]# cd /sys/kernel/tracing/ [root@f32 tracing]# echo osnoise > current_tracer It is possible to follow the trace by reading the trace file:: [root@f32 tracing]# cat trace # tracer: osnoise # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth MAX # || / SINGLE Interference counters: # |||| RUNTIME NOISE % OF CPU NOISE +-----------------------------+ # TASK-PID CPU# |||| TIMESTAMP IN US IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD # | | | |||| | | | | | | | | | | <...>-859 [000] .... 81.637220: 1000000 190 99.98100 9 18 0 1007 18 1 <...>-860 [001] .... 81.638154: 1000000 656 99.93440 74 23 0 1006 16 3 <...>-861 [002] .... 81.638193: 1000000 5675 99.43250 202 6 0 1013 25 21 <...>-862 [003] .... 81.638242: 1000000 125 99.98750 45 1 0 1011 23 0 <...>-863 [004] .... 81.638260: 1000000 1721 99.82790 168 7 0 1002 49 41 <...>-864 [005] .... 81.638286: 1000000 263 99.97370 57 6 0 1006 26 2 <...>-865 [006] .... 81.638302: 1000000 109 99.98910 21 3 0 1006 18 1 <...>-866 [007] .... 81.638326: 1000000 7816 99.21840 107 8 0 1016 39 19 In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the tracer prints a message at the end of each period for each CPU that is running an osnoise/CPU thread. The osnoise specific fields report: - The RUNTIME IN US reports the amount of time in microseconds that the osnoise thread kept looping reading the time. - The NOISE IN US reports the sum of noise in microseconds observed by the osnoise tracer during the associated runtime. - The % OF CPU AVAILABLE reports the percentage of CPU available for the osnoise thread during the runtime window. - The MAX SINGLE NOISE IN US reports the maximum single noise observed during the runtime window. - The Interference counters display how many of each of the respective interferences happened during the runtime window. Note that the example above shows a high number of HW noise samples. 
The reason is that this sample was taken on a virtual machine, and the host interference is detected as a hardware interference. Tracer options The tracer has a set of options inside the osnoise directory, they are: - osnoise/cpus: CPUs at which an osnoise thread will execute. - osnoise/period_us: the period of the osnoise thread. - osnoise/runtime_us: how long an osnoise thread will look for noise. - osnoise/stop_tracing_us: stop the system tracing if a single noise higher than the configured value happens. Writing 0 disables this option. - osnoise/stop_tracing_total_us: stop the system tracing if total noise higher than the configured value happens. Writing 0 disables this option. - tracing_threshold: the minimum delta between two time() reads to be considered as noise, in us. When set to 0, the default value will be used, which is currently 5 us. Additional Tracing In addition to the tracer, a set of tracepoints were added to facilitate the identification of the osnoise source. - osnoise:sample_threshold: printed anytime a noise is higher than the configurable tolerance_ns. - osnoise:nmi_noise: noise from NMI, including the duration. - osnoise:irq_noise: noise from an IRQ, including the duration. - osnoise:softirq_noise: noise from a SoftIRQ, including the duration. - osnoise:thread_noise: noise from a thread, including the duration. Note that all the values are *net values*. For example, if while osnoise is running, another thread preempts the osnoise thread, it will start a thread_noise duration at the start. Then, an IRQ takes place, preempting the thread_noise, starting an irq_noise. When the IRQ ends its execution, it will compute its duration, and this duration will be subtracted from the thread_noise, in such a way as to avoid the double accounting of the IRQ execution. This logic is valid for all sources of noise. Here is one example of the usage of these tracepoints:: osnoise/8-961 [008] d.h. 
5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns osnoise/8-961 [008] dNh. 5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns migration/8-54 [008] d... 5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns osnoise/8-961 [008] .... 5789.858413: sample_threshold: start 5789.858404555 duration 8723 ns interferences 2 In this example, a noise sample of 8 microseconds was reported in the last line, pointing to two interferences. Looking backward in the trace, the two previous entries were about the migration thread running after a timer IRQ execution. The first event is not part of the noise because it took place one millisecond before. It is worth noticing that the sum of the duration reported in the tracepoints is smaller than eight us reported in the sample_threshold. The reason roots in the overhead of the entry and exit code that happens before and after any interference execution. This justifies the dual approach: measuring thread and tracing. Link: https://lkml.kernel.org/r/e649467042d60e7b62714c9c6751a56299d15119.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira [ Made the following functions static: trace_irqentry_callback() trace_irqexit_callback() trace_intel_irqentry_callback() trace_intel_irqexit_callback() Added to include/trace.h: osnoise_arch_register() osnoise_arch_unregister() Fixed define logic for LATENCY_FS_NOTIFY Reported-by: kernel test robot ] Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/index.rst | 1 + Documentation/trace/osnoise-tracer.rst | 152 +++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/trace.c | 237 ++++ include/linux/ftrace_irq.h | 13 + include/linux/trace.h | 5 + include/trace/events/osnoise.h | 142 +++ kernel/trace/Kconfig | 34 + kernel/trace/Makefile | 1 + kernel/trace/trace.h | 9 +- kernel/trace/trace_entries.h | 25 + kernel/trace/trace_osnoise.c | 1384 ++++++++++++++++++++++++ kernel/trace/trace_output.c | 72 +- 13 files changed, 2072 insertions(+), 4 deletions(-) create mode 100644 Documentation/trace/osnoise-tracer.rst create mode 100644 arch/x86/kernel/trace.c create mode 100644 include/trace/events/osnoise.h create mode 100644 kernel/trace/trace_osnoise.c diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index f634b36fd3aa..608107b27cc0 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -23,6 +23,7 @@ Linux Tracing Technologies histogram-design boottime-trace hwlat_detector + osnoise-tracer intel_th ring-buffer-design stm diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst new file mode 100644 index 000000000000..37a3c10fb216 --- /dev/null +++ b/Documentation/trace/osnoise-tracer.rst @@ -0,0 +1,152 @@ +============== +OSNOISE Tracer +============== + +In the context of high-performance computing (HPC), the Operating System +Noise (*osnoise*) refers to the interference experienced by an application +due to activities inside the operating 
system. In the context of Linux, +NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the +system. Moreover, hardware-related jobs can also cause noise, for example, +via SMIs. + +hwlat_detector is one of the tools used to identify the most complex +source of noise: *hardware noise*. + +In a nutshell, the hwlat_detector creates a thread that runs +periodically for a given period. At the beginning of a period, the thread +disables interrupt and starts sampling. While running, the hwlatd +thread reads the time in a loop. As interrupts are disabled, threads, +IRQs, and SoftIRQs cannot interfere with the hwlatd thread. Hence, the +cause of any gap between two different reads of the time roots either on +NMI or in the hardware itself. At the end of the period, hwlatd enables +interrupts and reports the max observed gap between the reads. It also +prints a NMI occurrence counter. If the output does not report NMI +executions, the user can conclude that the hardware is the culprit for +the latency. The hwlat detects the NMI execution by observing +the entry and exit of a NMI. + +The osnoise tracer leverages the hwlat_detector by running a +similar loop with preemption, SoftIRQs and IRQs enabled, thus allowing +all the sources of *osnoise* during its execution. Using the same approach +of hwlat, osnoise takes note of the entry and exit point of any +source of interferences, increasing a per-cpu interference counter. The +osnoise tracer also saves an interference counter for each source of +interference. The interference counter for NMI, IRQs, SoftIRQs, and +threads is increased anytime the tool observes these interferences' entry +events. When a noise happens without any interference from the operating +system level, the hardware noise counter increases, pointing to a +hardware-related noise. In this way, osnoise can account for any +source of interference. 
At the end of the period, the osnoise tracer +prints the sum of all noise, the max single noise, the percentage of CPU +available for the thread, and the counters for the noise sources. + +Usage +----- + +Write the ASCII text "osnoise" into the current_tracer file of the +tracing system (generally mounted at /sys/kernel/tracing). + +For example:: + + [root@f32 ~]# cd /sys/kernel/tracing/ + [root@f32 tracing]# echo osnoise > current_tracer + +It is possible to follow the trace by reading the trace file:: + + [root@f32 tracing]# cat trace + # tracer: osnoise + # + # _-----=> irqs-off + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth MAX + # || / SINGLE Interference counters: + # |||| RUNTIME NOISE % OF CPU NOISE +-----------------------------+ + # TASK-PID CPU# |||| TIMESTAMP IN US IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD + # | | | |||| | | | | | | | | | | + <...>-859 [000] .... 81.637220: 1000000 190 99.98100 9 18 0 1007 18 1 + <...>-860 [001] .... 81.638154: 1000000 656 99.93440 74 23 0 1006 16 3 + <...>-861 [002] .... 81.638193: 1000000 5675 99.43250 202 6 0 1013 25 21 + <...>-862 [003] .... 81.638242: 1000000 125 99.98750 45 1 0 1011 23 0 + <...>-863 [004] .... 81.638260: 1000000 1721 99.82790 168 7 0 1002 49 41 + <...>-864 [005] .... 81.638286: 1000000 263 99.97370 57 6 0 1006 26 2 + <...>-865 [006] .... 81.638302: 1000000 109 99.98910 21 3 0 1006 18 1 + <...>-866 [007] .... 81.638326: 1000000 7816 99.21840 107 8 0 1016 39 19 + +In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the +tracer prints a message at the end of each period for each CPU that is +running an osnoise/CPU thread. The osnoise specific fields report: + + - The RUNTIME IN US reports the amount of time in microseconds that + the osnoise thread kept looping reading the time. + - The NOISE IN US reports the sum of noise in microseconds observed + by the osnoise tracer during the associated runtime. 
+ - The % OF CPU AVAILABLE reports the percentage of CPU available for + the osnoise thread during the runtime window. + - The MAX SINGLE NOISE IN US reports the maximum single noise observed + during the runtime window. + - The Interference counters display how many of each of the respective + interferences happened during the runtime window. + +Note that the example above shows a high number of HW noise samples. +The reason is that this sample was taken on a virtual machine, +and the host interference is detected as a hardware interference. + +Tracer options +--------------------- + +The tracer has a set of options inside the osnoise directory, they are: + + - osnoise/cpus: CPUs at which an osnoise thread will execute. + - osnoise/period_us: the period of the osnoise thread. + - osnoise/runtime_us: how long an osnoise thread will look for noise. + - osnoise/stop_tracing_us: stop the system tracing if a single noise + higher than the configured value happens. Writing 0 disables this + option. + - osnoise/stop_tracing_total_us: stop the system tracing if total noise + higher than the configured value happens. Writing 0 disables this + option. + - tracing_threshold: the minimum delta between two time() reads to be + considered as noise, in us. When set to 0, the default value will + be used, which is currently 5 us. + +Additional Tracing +------------------ + +In addition to the tracer, a set of tracepoints were added to +facilitate the identification of the osnoise source. + + - osnoise:sample_threshold: printed anytime a noise is higher than + the configurable tolerance_ns. + - osnoise:nmi_noise: noise from NMI, including the duration. + - osnoise:irq_noise: noise from an IRQ, including the duration. + - osnoise:softirq_noise: noise from a SoftIRQ, including the + duration. + - osnoise:thread_noise: noise from a thread, including the duration. + +Note that all the values are *net values*. 
For example, if while osnoise +is running, another thread preempts the osnoise thread, it will start a +thread_noise duration at the start. Then, an IRQ takes place, preempting +the thread_noise, starting an irq_noise. When the IRQ ends its execution, +it will compute its duration, and this duration will be subtracted from +the thread_noise, in such a way as to avoid the double accounting of the +IRQ execution. This logic is valid for all sources of noise. + +Here is one example of the usage of these tracepoints:: + + osnoise/8-961 [008] d.h. 5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns + osnoise/8-961 [008] dNh. 5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns + migration/8-54 [008] d... 5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns + osnoise/8-961 [008] .... 5789.858413: sample_threshold: start 5789.858404555 duration 8812 ns interferences 2 + +In this example, a noise sample of 8 microseconds was reported in the last +line, pointing to two interferences. Looking backward in the trace, the +two previous entries were about the migration thread running after a +timer IRQ execution. The first event is not part of the noise because +it took place one millisecond before. + +It is worth noticing that the sum of the duration reported in the +tracepoints is smaller than the eight us reported in the sample_threshold. +The reason roots in the overhead of the entry and exit code that happens +before and after any interference execution. This justifies the dual +approach: measuring thread and tracing. 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0f66682ac02a..3e625c61f008 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o +obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c new file mode 100644 index 000000000000..6912672c33a7 --- /dev/null +++ b/arch/x86/kernel/trace.c @@ -0,0 +1,237 @@ +#include +#include + +#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC) +extern void osnoise_trace_irq_entry(int id); +extern void osnoise_trace_irq_exit(int id, const char *desc); + +/* + * trace_intel_irq_entry - record intel specific IRQ entry + */ +static void trace_intel_irq_entry(void *data, int vector) +{ + osnoise_trace_irq_entry(vector); +} + +/* + * trace_intel_irq_exit - record intel specific IRQ exit + */ +static void trace_intel_irq_exit(void *data, int vector) +{ + char *vector_desc = (char *) data; + + osnoise_trace_irq_exit(vector, vector_desc); +} + +/* + * register_intel_irq_tp - Register intel specific IRQ entry tracepoints + */ +int osnoise_arch_register(void) +{ + int ret; + + ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_err; + + ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); + if (ret) + goto out_timer_entry; + +#ifdef CONFIG_X86_THERMAL_VECTOR + ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_timer_exit; + + ret = register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); + if (ret) + goto out_thermal_entry; +#endif /* CONFIG_X86_THERMAL_VECTOR */ + +#ifdef 
CONFIG_X86_MCE_AMD + ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_thermal_exit; + + ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); + if (ret) + goto out_deferred_entry; +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + ret = register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_deferred_exit; + + ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); + if (ret) + goto out_threshold_entry; +#endif /* CONFIG_X86_MCE_THRESHOLD */ + +#ifdef CONFIG_SMP + ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_threshold_exit; + + ret = register_trace_call_function_single_exit(trace_intel_irq_exit, + "call_function_single"); + if (ret) + goto out_call_function_single_entry; + + ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_single_exit; + + ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function"); + if (ret) + goto out_call_function_entry; + + ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_exit; + + ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); + if (ret) + goto out_reschedule_entry; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_IRQ_WORK + ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_reschedule_exit; + + ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); + if (ret) + goto out_irq_work_entry; +#endif + + ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_irq_work_exit; + + ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); + if (ret) + goto out_x86_ipi_entry; + + ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_x86_ipi_exit; + + ret = 
register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); + if (ret) + goto out_error_apic_entry; + + ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_error_apic_exit; + + ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic"); + if (ret) + goto out_spurious_apic_entry; + + return 0; + +out_spurious_apic_entry: + unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); +out_error_apic_exit: + unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); +out_error_apic_entry: + unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL); +out_x86_ipi_exit: + unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); +out_x86_ipi_entry: + unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); +out_irq_work_exit: + +#ifdef CONFIG_IRQ_WORK + unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); +out_irq_work_entry: + unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL); +out_reschedule_exit: +#endif + +#ifdef CONFIG_SMP + unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); +out_reschedule_entry: + unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL); +out_call_function_exit: + unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function"); +out_call_function_entry: + unregister_trace_call_function_entry(trace_intel_irq_entry, NULL); +out_call_function_single_exit: + unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single"); +out_call_function_single_entry: + unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL); +out_threshold_exit: +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); +out_threshold_entry: + unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); +out_deferred_exit: +#endif + +#ifdef CONFIG_X86_MCE_AMD + 
unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); +out_deferred_entry: + unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); +out_thermal_exit: +#endif /* CONFIG_X86_MCE_AMD */ + +#ifdef CONFIG_X86_THERMAL_VECTOR + unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); +out_thermal_entry: + unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); +out_timer_exit: +#endif /* CONFIG_X86_THERMAL_VECTOR */ + + unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); +out_timer_entry: + unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); +out_err: + return -EINVAL; +} + +void osnoise_arch_unregister(void) +{ + unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic"); + unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); + unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); + unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL); + unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); + unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); + +#ifdef CONFIG_IRQ_WORK + unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); + unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_SMP + unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); + unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL); + unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function"); + unregister_trace_call_function_entry(trace_intel_irq_entry, NULL); + unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single"); + unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); + unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); 
+#endif + +#ifdef CONFIG_X86_MCE_AMD + unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); + unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); + unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); +#endif /* CONFIG_X86_THERMAL_VECTOR */ + + unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); + unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); +} +#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */ diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 0abd9a1d2852..f6faa31289ba 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -7,12 +7,21 @@ extern bool trace_hwlat_callback_enabled; extern void trace_hwlat_callback(bool enter); #endif +#ifdef CONFIG_OSNOISE_TRACER +extern bool trace_osnoise_callback_enabled; +extern void trace_osnoise_callback(bool enter); +#endif + static inline void ftrace_nmi_enter(void) { #ifdef CONFIG_HWLAT_TRACER if (trace_hwlat_callback_enabled) trace_hwlat_callback(true); #endif +#ifdef CONFIG_OSNOISE_TRACER + if (trace_osnoise_callback_enabled) + trace_osnoise_callback(true); +#endif } static inline void ftrace_nmi_exit(void) @@ -21,6 +30,10 @@ static inline void ftrace_nmi_exit(void) if (trace_hwlat_callback_enabled) trace_hwlat_callback(false); #endif +#ifdef CONFIG_OSNOISE_TRACER + if (trace_osnoise_callback_enabled) + trace_osnoise_callback(false); +#endif } #endif /* _LINUX_FTRACE_IRQ_H */ diff --git a/include/linux/trace.h b/include/linux/trace.h index be1e130ed87c..4e3858640c47 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -41,6 +41,11 @@ int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); struct trace_array *trace_array_get_by_name(const char *name); int trace_array_destroy(struct trace_array 
*tr); + +/* For osnoise tracer */ +int osnoise_arch_register(void); +void osnoise_arch_unregister(void); + #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_H */ diff --git a/include/trace/events/osnoise.h b/include/trace/events/osnoise.h new file mode 100644 index 000000000000..28762c69f6c9 --- /dev/null +++ b/include/trace/events/osnoise.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM osnoise + +#if !defined(_OSNOISE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _OSNOISE_TRACE_H + +#include +TRACE_EVENT(thread_noise, + + TP_PROTO(struct task_struct *t, u64 start, u64 duration), + + TP_ARGS(t, start, duration), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN) + __field( u64, start ) + __field( u64, duration) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("%8s:%d start %llu.%09u duration %llu ns", + __entry->comm, + __entry->pid, + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(softirq_noise, + + TP_PROTO(int vector, u64 start, u64 duration), + + TP_ARGS(vector, start, duration), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->vector = vector; + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("%8s:%d start %llu.%09u duration %llu ns", + show_softirq_name(__entry->vector), + __entry->vector, + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(irq_noise, + + TP_PROTO(int vector, const char *desc, u64 start, u64 duration), + + TP_ARGS(vector, desc, start, duration), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + __string( desc, desc ) + __field( int, vector ) + + ), + + 
TP_fast_assign( + __assign_str(desc, desc); + __entry->vector = vector; + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("%s:%d start %llu.%09u duration %llu ns", + __get_str(desc), + __entry->vector, + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(nmi_noise, + + TP_PROTO(u64 start, u64 duration), + + TP_ARGS(start, duration), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + ), + + TP_fast_assign( + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("start %llu.%09u duration %llu ns", + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(sample_threshold, + + TP_PROTO(u64 start, u64 duration, u64 interference), + + TP_ARGS(start, duration, interference), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + __field( u64, interference) + ), + + TP_fast_assign( + __entry->start = start; + __entry->duration = duration; + __entry->interference = interference; + ), + + TP_printk("start %llu.%09u duration %llu ns interferences %llu", + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration, + __entry->interference) +); + +#endif /* _TRACE_OSNOISE_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7fa82778c3e6..41582ae4682b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -356,6 +356,40 @@ config HWLAT_TRACER file. Every time a latency is greater than tracing_thresh, it will be recorded into the ring buffer. +config OSNOISE_TRACER + bool "OS Noise tracer" + select GENERIC_TRACER + help + In the context of high-performance computing (HPC), the Operating + System Noise (osnoise) refers to the interference experienced by an + application due to activities inside the operating system. 
In the + context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread + can cause noise to the system. Moreover, hardware-related jobs can + also cause noise, for example, via SMIs. + + The osnoise tracer leverages the hwlat_detector by running a similar + loop with preemption, SoftIRQs and IRQs enabled, thus allowing all + the sources of osnoise during its execution. The osnoise tracer takes + note of the entry and exit point of any source of interferences, + increasing a per-cpu interference counter. It saves an interference + counter for each source of interference. The interference counter for + NMI, IRQs, SoftIRQs, and threads is increased anytime the tool + observes these interferences' entry events. When a noise happens + without any interference from the operating system level, the + hardware noise counter increases, pointing to a hardware-related + noise. In this way, osnoise can account for any source of + interference. At the end of the period, the osnoise tracer prints + the sum of all noise, the max single noise, the percentage of CPU + available for the thread, and the counters for the noise sources. + + In addition to the tracer, a set of tracepoints were added to + facilitate the identification of the osnoise source. + + The output will appear in the trace and trace_pipe files. + + To enable this tracer, echo in "osnoise" into the current_tracer + file. 
+ config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b28d3e5013cd..b1c47ccf4f73 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o +obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 87588d1e24ca..b959c9ec9711 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -44,6 +44,7 @@ enum trace_type { TRACE_BLK, TRACE_BPUTS, TRACE_HWLAT, + TRACE_OSNOISE, TRACE_RAW_DATA, TRACE_FUNC_REPEATS, @@ -297,7 +298,8 @@ struct trace_array { struct array_buffer max_buffer; bool allocated_snapshot; #endif -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ + || defined(CONFIG_OSNOISE_TRACER) unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -445,6 +447,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ + IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\ IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ @@ -675,8 +678,8 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ -#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ - defined(CONFIG_FSNOTIFY) +#if (defined(CONFIG_TRACER_MAX_TRACE) || 
defined(CONFIG_HWLAT_TRACER) \ + || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY) #define LATENCY_FS_NOTIFY #endif diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 251c819cf0c5..158c0984b59b 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -360,3 +360,28 @@ FTRACE_ENTRY(func_repeats, func_repeats_entry, __entry->count, FUNC_REPEATS_GET_DELTA_TS(__entry)) ); + +FTRACE_ENTRY(osnoise, osnoise_entry, + + TRACE_OSNOISE, + + F_STRUCT( + __field( u64, noise ) + __field( u64, runtime ) + __field( u64, max_sample ) + __field( unsigned int, hw_count ) + __field( unsigned int, nmi_count ) + __field( unsigned int, irq_count ) + __field( unsigned int, softirq_count ) + __field( unsigned int, thread_count ) + ), + + F_printk("noise:%llu\tmax_sample:%llu\thw:%u\tnmi:%u\tirq:%u\tsoftirq:%u\tthread:%u\n", + __entry->noise, + __entry->max_sample, + __entry->hw_count, + __entry->nmi_count, + __entry->irq_count, + __entry->softirq_count, + __entry->thread_count) +); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c new file mode 100644 index 000000000000..4e2c47dc4f19 --- /dev/null +++ b/kernel/trace/trace_osnoise.c @@ -0,0 +1,1384 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OS Noise Tracer: computes the OS Noise suffered by a running thread. + * + * Based on "hwlat_detector" tracer by: + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. + * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. + * With feedback from Clark Williams + * + * And also based on the rtsl tracer presented on: + * DE OLIVEIRA, Daniel Bristot, et al. Demystifying the real-time linux + * scheduling latency. In: 32nd Euromicro Conference on Real-Time Systems + * (ECRTS 2020). Schloss Dagstuhl-Leibniz-Zentrum fur Informatik, 2020. + * + * Copyright (C) 2021 Daniel Bristot de Oliveira, Red Hat, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +#ifdef CONFIG_X86_LOCAL_APIC +#include +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#endif /* CONFIG_X86_LOCAL_APIC */ + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +static struct trace_array *osnoise_trace; + +/* + * Default values. + */ +#define BANNER "osnoise: " +#define DEFAULT_SAMPLE_PERIOD 1000000 /* 1s */ +#define DEFAULT_SAMPLE_RUNTIME 1000000 /* 1s */ + +/* + * NMI runtime info. + */ +struct osn_nmi { + u64 count; + u64 delta_start; +}; + +/* + * IRQ runtime info. + */ +struct osn_irq { + u64 count; + u64 arrival_time; + u64 delta_start; +}; + +/* + * softirq runtime info. + */ +struct osn_softirq { + u64 count; + u64 arrival_time; + u64 delta_start; +}; + +/* + * thread runtime info. + */ +struct osn_thread { + u64 count; + u64 arrival_time; + u64 delta_start; +}; + +/* + * Runtime information: this structure saves the runtime information used by + * one sampling thread. + */ +struct osnoise_variables { + struct task_struct *kthread; + bool sampling; + pid_t pid; + struct osn_nmi nmi; + struct osn_irq irq; + struct osn_softirq softirq; + struct osn_thread thread; + local_t int_counter; +}; + +/* + * Per-cpu runtime information. + */ +DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var); + +/* + * this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU + */ +static inline struct osnoise_variables *this_cpu_osn_var(void) +{ + return this_cpu_ptr(&per_cpu_osnoise_var); +} + +/* + * osn_var_reset - Reset the values of the given osnoise_variables + */ +static inline void osn_var_reset(struct osnoise_variables *osn_var) +{ + /* + * So far, all the values are initialized as 0, so + * zeroing the structure is perfect. 
+ */ + memset(osn_var, 0, sizeof(*osn_var)); +} + +/* + * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables + */ +static inline void osn_var_reset_all(void) +{ + struct osnoise_variables *osn_var; + int cpu; + + for_each_cpu(cpu, cpu_online_mask) { + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + osn_var_reset(osn_var); + } +} + +/* + * Tells NMIs to call back to the osnoise tracer to record timestamps. + */ +bool trace_osnoise_callback_enabled; + +/* + * osnoise sample structure definition. Used to store the statistics of a + * sample run. + */ +struct osnoise_sample { + u64 runtime; /* runtime */ + u64 noise; /* noise */ + u64 max_sample; /* max single noise sample */ + int hw_count; /* # HW (incl. hypervisor) interference */ + int nmi_count; /* # NMIs during this sample */ + int irq_count; /* # IRQs during this sample */ + int softirq_count; /* # softirqs during this sample */ + int thread_count; /* # threads during this sample */ +}; + +/* + * Protect the interface. + */ +struct mutex interface_lock; + +/* + * Tracer data. + */ +static struct osnoise_data { + u64 sample_period; /* total sampling period */ + u64 sample_runtime; /* active sampling portion of period */ + u64 stop_tracing; /* stop trace in the inside operation (loop) */ + u64 stop_tracing_total; /* stop trace in the outside operation (report) */ + bool tainted; /* inform users and developers about a problem */ +} osnoise_data = { + .sample_period = DEFAULT_SAMPLE_PERIOD, + .sample_runtime = DEFAULT_SAMPLE_RUNTIME, + .stop_tracing = 0, + .stop_tracing_total = 0, +}; + +/* + * Boolean variable used to inform that the tracer is currently sampling. + */ +static bool osnoise_busy; + +/* + * Print the osnoise header info. 
+ */ +static void print_osnoise_headers(struct seq_file *s) +{ + if (osnoise_data.tainted) + seq_puts(s, "# osnoise is tainted!\n"); + + seq_puts(s, "# _-----=> irqs-off\n"); + seq_puts(s, "# / _----=> need-resched\n"); + seq_puts(s, "# | / _---=> hardirq/softirq\n"); + seq_puts(s, "# || / _--=> preempt-depth "); + seq_puts(s, " MAX\n"); + + seq_puts(s, "# || / "); + seq_puts(s, " SINGLE Interference counters:\n"); + + seq_puts(s, "# |||| RUNTIME "); + seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); + + seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP IN US "); + seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); + + seq_puts(s, "# | | | |||| | | "); + seq_puts(s, " | | | | | | | |\n"); +} + +/* + * osnoise_taint - report an osnoise error. + */ +#define osnoise_taint(msg) ({ \ + struct trace_array *tr = osnoise_trace; \ + \ + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \ + osnoise_data.tainted = true; \ +}) + +/* + * Record an osnoise_sample into the tracer buffer. + */ +static void trace_osnoise_sample(struct osnoise_sample *sample) +{ + struct trace_array *tr = osnoise_trace; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct trace_event_call *call = &event_osnoise; + struct ring_buffer_event *event; + struct osnoise_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_OSNOISE, sizeof(*entry), + tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->runtime = sample->runtime; + entry->noise = sample->noise; + entry->max_sample = sample->max_sample; + entry->hw_count = sample->hw_count; + entry->nmi_count = sample->nmi_count; + entry->irq_count = sample->irq_count; + entry->softirq_count = sample->softirq_count; + entry->thread_count = sample->thread_count; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +/* + * Macros to encapsulate the time capturing infrastructure. 
+ */ +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) + +/* + * cond_move_irq_delta_start - Forward the delta_start of a running IRQ + * + * If an IRQ is preempted by an NMI, its delta_start is pushed forward + * to discount the NMI interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_irq_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->irq.delta_start) + osn_var->irq.delta_start += duration; +} + +#ifndef CONFIG_PREEMPT_RT +/* + * cond_move_softirq_delta_start - Forward the delta_start of a running softirq. + * + * If a softirq is preempted by an IRQ or NMI, its delta_start is pushed + * forward to discount the interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_softirq_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->softirq.delta_start) + osn_var->softirq.delta_start += duration; +} +#else /* CONFIG_PREEMPT_RT */ +#define cond_move_softirq_delta_start(osn_var, duration) do {} while (0) +#endif + +/* + * cond_move_thread_delta_start - Forward the delta_start of a running thread + * + * If a noisy thread is preempted by a softirq, IRQ or NMI, its delta_start + * is pushed forward to discount the interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->thread.delta_start) + osn_var->thread.delta_start += duration; +} + +/* + * get_int_safe_duration - Get the duration of a window + * + * The irq, softirq and thread variables need to have their duration without + * the interference from higher priority interrupts. Instead of keeping a + * variable to discount the interrupt interference from these variables, the + * starting time of these variables is pushed forward with the interrupt's + * duration. 
In this way, a single variable is used to: + * + * - Know if a given window is being measured. + * - Account its duration. + * - Discount the interference. + * + * To avoid getting inconsistent values, e.g.,: + * + * now = time_get() + * ---> interrupt! + * delta_start += int duration; + * <--- + * duration = now - delta_start; + * + * result: negative duration if the variable duration before the + * interrupt was smaller than the interrupt execution. + * + * A counter of interrupts is used. If the counter increased, try + * to capture an interference safe duration. + */ +static inline s64 +get_int_safe_duration(struct osnoise_variables *osn_var, u64 *delta_start) +{ + u64 int_counter, now; + s64 duration; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + now = time_get(); + duration = (now - *delta_start); + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + /* + * This is evidence of race conditions that cause + * a value to be "discounted" too much. + */ + if (duration < 0) + osnoise_taint("Negative duration!\n"); + + *delta_start = 0; + + return duration; +} + +/* + * + * set_int_safe_time - Save the current time on *time, aware of interference + * + * Get the time, taking into consideration a possible interference from + * higher priority interrupts. + * + * See get_int_safe_duration() for an explanation. + */ +static u64 +set_int_safe_time(struct osnoise_variables *osn_var, u64 *time) +{ + u64 int_counter; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + *time = time_get(); + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + return int_counter; +} + +/* + * trace_osnoise_callback - NMI entry/exit callback + * + * This function is called at the entry and exit NMI code. 
The bool enter + * distinguishes between either case. This function is used to note an NMI + * occurrence, compute the noise caused by the NMI, and to remove the noise + * it is potentially causing on other interference variables. + */ +void trace_osnoise_callback(bool enter) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + u64 duration; + + if (!osn_var->sampling) + return; + + /* + * Currently trace_clock_local() calls sched_clock() and the + * generic version is not NMI safe. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { + if (enter) { + osn_var->nmi.delta_start = time_get(); + local_inc(&osn_var->int_counter); + } else { + duration = time_get() - osn_var->nmi.delta_start; + + trace_nmi_noise(osn_var->nmi.delta_start, duration); + + cond_move_irq_delta_start(osn_var, duration); + cond_move_softirq_delta_start(osn_var, duration); + cond_move_thread_delta_start(osn_var, duration); + } + } + + if (enter) + osn_var->nmi.count++; +} + +/* + * osnoise_trace_irq_entry - Note the starting of an IRQ + * + * Save the starting time of an IRQ. As IRQs are non-preemptive to other IRQs, + * it is safe to use a single variable (osn_var->irq) to save the statistics. + * The arrival_time is used to report... the arrival time. The delta_start + * is used to compute the duration at the IRQ exit handler. See + * cond_move_irq_delta_start(). + */ +void osnoise_trace_irq_entry(int id) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (!osn_var->sampling) + return; + /* + * This value will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->irq.arrival_time = time_get(); + set_int_safe_time(osn_var, &osn_var->irq.delta_start); + osn_var->irq.count++; + + local_inc(&osn_var->int_counter); +} + +/* + * osnoise_trace_irq_exit - Note the end of an IRQ, save data and trace + * + * Computes the duration of the IRQ noise, and trace it. 
Also discounts the + * interference from other sources of noise that could be currently being accounted. + */ +void osnoise_trace_irq_exit(int id, const char *desc) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + s64 duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->irq.delta_start); + trace_irq_noise(id, desc, osn_var->irq.arrival_time, duration); + osn_var->irq.arrival_time = 0; + cond_move_softirq_delta_start(osn_var, duration); + cond_move_thread_delta_start(osn_var, duration); +} + +/* + * trace_irqentry_callback - Callback to the irq:irq_entry traceevent + * + * Used to note the starting of an IRQ occurrence. + */ +static void trace_irqentry_callback(void *data, int irq, + struct irqaction *action) +{ + osnoise_trace_irq_entry(irq); +} + +/* + * trace_irqexit_callback - Callback to the irq:irq_exit traceevent + * + * Used to note the end of an IRQ occurrence. + */ +static void trace_irqexit_callback(void *data, int irq, + struct irqaction *action, int ret) +{ + osnoise_trace_irq_exit(irq, action->name); +} + +/* + * arch specific register function. + */ +int __weak osnoise_arch_register(void) +{ + return 0; +} + +/* + * arch specific unregister function. + */ +void __weak osnoise_arch_unregister(void) +{ + return; +} + +/* + * hook_irq_events - Hook IRQ handling events + * + * This function hooks the IRQ related callbacks to the respective trace + * events. 
+ */ +int hook_irq_events(void) +{ + int ret; + + ret = register_trace_irq_handler_entry(trace_irqentry_callback, NULL); + if (ret) + goto out_err; + + ret = register_trace_irq_handler_exit(trace_irqexit_callback, NULL); + if (ret) + goto out_unregister_entry; + + ret = osnoise_arch_register(); + if (ret) + goto out_irq_exit; + + return 0; + +out_irq_exit: + unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); +out_unregister_entry: + unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL); +out_err: + return -EINVAL; +} + +/* + * unhook_irq_events - Unhook IRQ handling events + * + * This function unhooks the IRQ related callbacks to the respective trace + * events. + */ +void unhook_irq_events(void) +{ + osnoise_arch_unregister(); + unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); + unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL); +} + +#ifndef CONFIG_PREEMPT_RT +/* + * trace_softirq_entry_callback - Note the starting of a softirq + * + * Save the starting time of a softirq. As softirqs are non-preemptive to + * other softirqs, it is safe to use a single variable (osn_var->softirq) + * to save the statistics. The arrival_time is used to report... the + * arrival time. The delta_start is used to compute the duration at the + * softirq exit handler. See cond_move_softirq_delta_start(). + */ +void trace_softirq_entry_callback(void *data, unsigned int vec_nr) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (!osn_var->sampling) + return; + /* + * This value will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->softirq.arrival_time = time_get(); + set_int_safe_time(osn_var, &osn_var->softirq.delta_start); + osn_var->softirq.count++; + + local_inc(&osn_var->int_counter); +} + +/* + * trace_softirq_exit_callback - Note the end of a softirq + * + * Computes the duration of the softirq noise, and trace it. 
Also discounts the + * interference from other sources of noise that could be currently being accounted. + */ +void trace_softirq_exit_callback(void *data, unsigned int vec_nr) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + s64 duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start); + trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration); + cond_move_thread_delta_start(osn_var, duration); + osn_var->softirq.arrival_time = 0; +} + +/* + * hook_softirq_events - Hook softirq handling events + * + * This function hooks the softirq related callbacks to the respective trace + * events. + */ +static int hook_softirq_events(void) +{ + int ret; + + ret = register_trace_softirq_entry(trace_softirq_entry_callback, NULL); + if (ret) + goto out_err; + + ret = register_trace_softirq_exit(trace_softirq_exit_callback, NULL); + if (ret) + goto out_unreg_entry; + + return 0; + +out_unreg_entry: + unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL); +out_err: + return -EINVAL; +} + +/* + * unhook_softirq_events - Unhook softirq handling events + * + * This function unhooks the softirq related callbacks to the respective trace + * events. + */ +static void unhook_softirq_events(void) +{ + unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL); + unregister_trace_softirq_exit(trace_softirq_exit_callback, NULL); +} +#else /* CONFIG_PREEMPT_RT */ +/* + * softirq are threads on the PREEMPT_RT mode. + */ +static int hook_softirq_events(void) +{ + return 0; +} +static void unhook_softirq_events(void) +{ +} +#endif + +/* + * thread_entry - Record the starting of a thread noise window + * + * It saves the context switch time for a noisy thread, and increments + * the interference counters. 
+ */ +static void +thread_entry(struct osnoise_variables *osn_var, struct task_struct *t) +{ + if (!osn_var->sampling) + return; + /* + * The arrival time will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->thread.arrival_time = time_get(); + + set_int_safe_time(osn_var, &osn_var->thread.delta_start); + + osn_var->thread.count++; + local_inc(&osn_var->int_counter); +} + +/* + * thread_exit - Report the end of a thread noise window + * + * It computes the total noise from a thread, tracing if needed. + */ +static void +thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) +{ + s64 duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start); + + trace_thread_noise(t, osn_var->thread.arrival_time, duration); + + osn_var->thread.arrival_time = 0; +} + +/* + * trace_sched_switch_callback - sched:sched_switch trace event handler + * + * This function is hooked to the sched:sched_switch trace event, and it is + * used to record the beginning and to report the end of a thread noise window. + */ +void +trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, + struct task_struct *n) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (p->pid != osn_var->pid) + thread_exit(osn_var, p); + + if (n->pid != osn_var->pid) + thread_entry(osn_var, n); +} + +/* + * hook_thread_events - Hook the instrumentation for thread noise + * + * Hook the osnoise tracer callbacks to handle the noise from other + * threads on the necessary kernel events. + */ +int hook_thread_events(void) +{ + int ret; + + ret = register_trace_sched_switch(trace_sched_switch_callback, NULL); + if (ret) + return -EINVAL; + + return 0; +} + +/* + * unhook_thread_events - Unhook the instrumentation for thread noise + * + * Unhook the osnoise tracer callbacks to handle the noise from other + * threads on the necessary kernel events. 
+ */ +void unhook_thread_events(void) +{ + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); +} + +/* + * save_osn_sample_stats - Save the osnoise_sample statistics + * + * Save the osnoise_sample statistics before the sampling phase. These + * values will be used later to compute the diff between the statistics + * before and after the osnoise sampling. + */ +void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +{ + s->nmi_count = osn_var->nmi.count; + s->irq_count = osn_var->irq.count; + s->softirq_count = osn_var->softirq.count; + s->thread_count = osn_var->thread.count; +} + +/* + * diff_osn_sample_stats - Compute the osnoise_sample statistics + * + * After a sample period, compute the difference on the osnoise_sample + * statistics. The struct osnoise_sample *s contains the statistics saved via + * save_osn_sample_stats() before the osnoise sampling. + */ +void diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +{ + s->nmi_count = osn_var->nmi.count - s->nmi_count; + s->irq_count = osn_var->irq.count - s->irq_count; + s->softirq_count = osn_var->softirq.count - s->softirq_count; + s->thread_count = osn_var->thread.count - s->thread_count; +} + +/* + * osnoise_stop_tracing - Stop tracing and the tracer. + */ +static void osnoise_stop_tracing(void) +{ + struct trace_array *tr = osnoise_trace; + tracer_tracing_off(tr); +} + +/* + * run_osnoise - Sample the time and look for osnoise + * + * Used to capture the time, looking for potential osnoise latency repeatedly. + * Different from hwlat_detector, it is called with preemption and interrupts + * enabled. This allows irqs, softirqs and threads to run, interfering on the + * osnoise sampling thread, as they would do with a regular thread. 
+ */ +static int run_osnoise(void) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + u64 sum_noise = 0, max_noise = 0; + struct trace_array *tr = osnoise_trace; + u64 start, sample, last_sample; + u64 last_int_count, int_count; + s64 noise = 0, total, last_total = 0; + struct osnoise_sample s; + unsigned int threshold; + int hw_count = 0; + u64 runtime, stop_in; + int ret = -1; + + /* + * Considers the current thread as the workload. + */ + osn_var->pid = current->pid; + + /* + * Save the current stats for the diff + */ + save_osn_sample_stats(osn_var, &s); + + /* + * if threshold is 0, use the default value of 5 us. + */ + threshold = tracing_thresh ? : 5000; + + /* + * Make sure NMIs see sampling first + */ + osn_var->sampling = true; + barrier(); + + /* + * Transform the *_us config to nanoseconds to avoid the + * division on the main loop. + */ + runtime = osnoise_data.sample_runtime * NSEC_PER_USEC; + stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC; + + /* + * Start timestamp + */ + start = time_get(); + + /* + * "previous" loop. + */ + last_int_count = set_int_safe_time(osn_var, &last_sample); + + do { + /* + * Get sample! + */ + int_count = set_int_safe_time(osn_var, &sample); + + noise = time_sub(sample, last_sample); + + /* + * This shouldn't happen. + */ + if (noise < 0) { + osnoise_taint("negative noise!"); + goto out; + } + + /* + * Sample runtime. + */ + total = time_sub(sample, start); + + /* + * Check for possible overflows. 
+ */ + if (total < last_total) { + osnoise_taint("total overflow!"); + break; + } + + last_total = total; + + if (noise >= threshold) { + int interference = int_count - last_int_count; + + if (noise > max_noise) + max_noise = noise; + + if (!interference) + hw_count++; + + sum_noise += noise; + + trace_sample_threshold(last_sample, noise, interference); + + if (osnoise_data.stop_tracing) + if (noise > stop_in) + osnoise_stop_tracing(); + } + + /* + * For the non-preemptive kernel config: let threads runs, if + * they so wish. + */ + cond_resched(); + + last_sample = sample; + last_int_count = int_count; + + } while (total < runtime && !kthread_should_stop()); + + /* + * Finish the above in the view for interrupts. + */ + barrier(); + + osn_var->sampling = false; + + /* + * Make sure sampling data is no longer updated. + */ + barrier(); + + /* + * Save noise info. + */ + s.noise = time_to_us(sum_noise); + s.runtime = time_to_us(total); + s.max_sample = time_to_us(max_noise); + s.hw_count = hw_count; + + /* Save interference stats info */ + diff_osn_sample_stats(osn_var, &s); + + trace_osnoise_sample(&s); + + /* Keep a running maximum ever recorded osnoise "latency" */ + if (max_noise > tr->max_latency) { + tr->max_latency = max_noise; + latency_fsnotify(tr); + } + + if (osnoise_data.stop_tracing_total) + if (s.noise > osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + + return 0; +out: + return ret; +} + +static struct cpumask osnoise_cpumask; +static struct cpumask save_cpumask; + +/* + * osnoise_main - The osnoise detection kernel thread + * + * Calls run_osnoise() function to measure the osnoise for the configured runtime, + * every period. 
+ */ +static int osnoise_main(void *data) +{ + s64 interval; + + while (!kthread_should_stop()) { + + run_osnoise(); + + mutex_lock(&interface_lock); + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + do_div(interval, USEC_PER_MSEC); + + /* + * Differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (interval < 1) + continue; + + if (msleep_interruptible(interval)) + break; + } + + return 0; +} + +/* + * stop_per_cpu_kthreads - stop per-cpu threads + * + * Stop the osnoise sampling threads. Use this on unload and at system + * shutdown. + */ +static void stop_per_cpu_kthreads(void) +{ + struct task_struct *kthread; + int cpu; + + for_each_online_cpu(cpu) { + kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + if (kthread) + kthread_stop(kthread); + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } +} + +/* + * start_per_cpu_kthreads - Kick off per-cpu osnoise sampling kthreads + * + * This starts the kernel thread that will look for osnoise on many + * cpus. + */ +static int start_per_cpu_kthreads(struct trace_array *tr) +{ + struct cpumask *current_mask = &save_cpumask; + struct task_struct *kthread; + char comm[24]; + int cpu; + + get_online_cpus(); + /* + * Run only on CPUs in which trace and osnoise are allowed to run. + */ + cpumask_and(current_mask, tr->tracing_cpumask, &osnoise_cpumask); + /* + * And the CPU is online. 
+ */ + cpumask_and(current_mask, cpu_online_mask, current_mask); + put_online_cpus(); + + for_each_online_cpu(cpu) + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + + for_each_cpu(cpu, current_mask) { + snprintf(comm, 24, "osnoise/%d", cpu); + + kthread = kthread_create_on_cpu(osnoise_main, NULL, cpu, comm); + + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + stop_per_cpu_kthreads(); + return -ENOMEM; + } + + per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; + wake_up_process(kthread); + } + + return 0; +} + +/* + * osnoise_cpus_read - Read function for reading the "cpus" file + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @count: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints the "cpus" output into the user-provided buffer. + */ +static ssize_t +osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, + loff_t *ppos) +{ + char *mask_str; + int len; + + mutex_lock(&interface_lock); + + len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) { + count = -ENOMEM; + goto out_unlock; + } + + len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); + if (len >= count) { + count = -EINVAL; + goto out_free; + } + + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); + +out_free: + kfree(mask_str); +out_unlock: + mutex_unlock(&interface_lock); + + return count; +} + +static void osnoise_tracer_start(struct trace_array *tr); +static void osnoise_tracer_stop(struct trace_array *tr); + +/* + * osnoise_cpus_write - Write function for "cpus" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @count: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "cpus" + * 
interface to the osnoise trace. By default, it lists all CPUs, + * in this way, allowing osnoise threads to run on any online CPU + * of the system. It serves to restrict the execution of osnoise to the + * set of CPUs writing via this interface. Note that osnoise also + * respects the "tracing_cpumask." Hence, osnoise threads will run only + * on the set of CPUs allowed here AND on "tracing_cpumask." Why not + * have just "tracing_cpumask?" Because the user might be interested + * in tracing what is running on other CPUs. For instance, one might + * run osnoise in one HT CPU while observing what is running on the + * sibling HT CPU. + */ +static ssize_t +osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, + loff_t *ppos) +{ + struct trace_array *tr = osnoise_trace; + cpumask_var_t osnoise_cpumask_new; + int running, err; + char buf[256]; + + if (count >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, count)) + return -EFAULT; + + if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) + return -ENOMEM; + + err = cpulist_parse(buf, osnoise_cpumask_new); + if (err) + goto err_free; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop + * and osnoise_busy. + */ + mutex_lock(&trace_types_lock); + running = osnoise_busy; + if (running) + osnoise_tracer_stop(tr); + + mutex_lock(&interface_lock); + cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); + mutex_unlock(&interface_lock); + + if (running) + osnoise_tracer_start(tr); + mutex_unlock(&trace_types_lock); + + free_cpumask_var(osnoise_cpumask_new); + return count; + +err_free: + free_cpumask_var(osnoise_cpumask_new); + + return err; +} + +/* + * osnoise/runtime_us: cannot be greater than the period. + */ +static struct trace_min_max_param osnoise_runtime = { + .lock = &interface_lock, + .val = &osnoise_data.sample_runtime, + .max = &osnoise_data.sample_period, + .min = NULL, +}; + +/* + * osnoise/period_us: cannot be smaller than the runtime. 
+ */ +static struct trace_min_max_param osnoise_period = { + .lock = &interface_lock, + .val = &osnoise_data.sample_period, + .max = NULL, + .min = &osnoise_data.sample_runtime, +}; + +/* + * osnoise/stop_tracing_us: no limit. + */ +static struct trace_min_max_param osnoise_stop_tracing_in = { + .lock = &interface_lock, + .val = &osnoise_data.stop_tracing, + .max = NULL, + .min = NULL, +}; + +/* + * osnoise/stop_tracing_total_us: no limit. + */ +static struct trace_min_max_param osnoise_stop_tracing_total = { + .lock = &interface_lock, + .val = &osnoise_data.stop_tracing_total, + .max = NULL, + .min = NULL, +}; + +static const struct file_operations cpus_fops = { + .open = tracing_open_generic, + .read = osnoise_cpus_read, + .write = osnoise_cpus_write, + .llseek = generic_file_llseek, +}; + +/* + * init_tracefs - A function to initialize the tracefs interface files + * + * This function creates entries in tracefs for "osnoise". It creates the + * "osnoise" directory in the tracing directory, and within that + * directory is the count, runtime and period files to change and view + * those values. 
+ */ +static int init_tracefs(void) +{ + struct dentry *top_dir; + struct dentry *tmp; + int ret; + + ret = tracing_init_dentry(); + if (ret) + return -ENOMEM; + + top_dir = tracefs_create_dir("osnoise", NULL); + if (!top_dir) + return -ENOMEM; + + tmp = tracefs_create_file("period_us", 0640, top_dir, + &osnoise_period, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("runtime_us", 0644, top_dir, + &osnoise_runtime, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir, + &osnoise_stop_tracing_in, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir, + &osnoise_stop_tracing_total, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops); + if (!tmp) + goto err; + + return 0; + +err: + tracefs_remove(top_dir); + return -ENOMEM; +} + +static int osnoise_hook_events(void) +{ + int retval; + + /* + * Trace is already hooked, we are re-enabling from + * a stop_tracing_*. + */ + if (trace_osnoise_callback_enabled) + return 0; + + retval = hook_irq_events(); + if (retval) + return -EINVAL; + + retval = hook_softirq_events(); + if (retval) + goto out_unhook_irq; + + retval = hook_thread_events(); + /* + * All fine! + */ + if (!retval) + return 0; + + unhook_softirq_events(); +out_unhook_irq: + unhook_irq_events(); + return -EINVAL; +} + +static void osnoise_tracer_start(struct trace_array *tr) +{ + int retval; + + if (osnoise_busy) + return; + + osn_var_reset_all(); + + retval = osnoise_hook_events(); + if (retval) + goto out_err; + /* + * Make sure NMIs see reseted values. + */ + barrier(); + trace_osnoise_callback_enabled = true; + + retval = start_per_cpu_kthreads(tr); + /* + * all fine! 
+ */ + if (!retval) + return; + +out_err: + unhook_irq_events(); + pr_err(BANNER "Error starting osnoise tracer\n"); +} + +static void osnoise_tracer_stop(struct trace_array *tr) +{ + if (!osnoise_busy) + return; + + trace_osnoise_callback_enabled = false; + barrier(); + + stop_per_cpu_kthreads(); + + unhook_irq_events(); + unhook_softirq_events(); + unhook_thread_events(); + + osnoise_busy = false; +} + +static int osnoise_tracer_init(struct trace_array *tr) +{ + /* Only allow one instance to enable this */ + if (osnoise_busy) + return -EBUSY; + + osnoise_trace = tr; + + tr->max_latency = 0; + + osnoise_tracer_start(tr); + + osnoise_busy = true; + + return 0; +} + +static void osnoise_tracer_reset(struct trace_array *tr) +{ + osnoise_tracer_stop(tr); +} + +static struct tracer osnoise_tracer __read_mostly = { + .name = "osnoise", + .init = osnoise_tracer_init, + .reset = osnoise_tracer_reset, + .start = osnoise_tracer_start, + .stop = osnoise_tracer_stop, + .print_header = print_osnoise_headers, + .allow_instances = true, +}; + +__init static int init_osnoise_tracer(void) +{ + int ret; + + mutex_init(&interface_lock); + + cpumask_copy(&osnoise_cpumask, cpu_all_mask); + + ret = register_tracer(&osnoise_tracer); + if (ret) + return ret; + + init_tracefs(); + + return 0; +} +late_initcall(init_osnoise_tracer); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d0368a569bfa..642b6584eba5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1202,7 +1202,6 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, return trace_handle_return(s); } - static enum print_line_t trace_hwlat_raw(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -1232,6 +1231,76 @@ static struct trace_event trace_hwlat_event = { .funcs = &trace_hwlat_funcs, }; +/* TRACE_OSNOISE */ +static enum print_line_t +trace_osnoise_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry 
*entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct osnoise_entry *field; + u64 ratio, ratio_dec; + u64 net_runtime; + + trace_assign_type(field, entry); + + /* + * compute the available % of cpu time. + */ + net_runtime = field->runtime - field->noise; + ratio = net_runtime * 10000000; + do_div(ratio, field->runtime); + ratio_dec = do_div(ratio, 100000); + + trace_seq_printf(s, "%llu %10llu %3llu.%05llu %7llu", + field->runtime, + field->noise, + ratio, ratio_dec, + field->max_sample); + + trace_seq_printf(s, " %6u", field->hw_count); + trace_seq_printf(s, " %6u", field->nmi_count); + trace_seq_printf(s, " %6u", field->irq_count); + trace_seq_printf(s, " %6u", field->softirq_count); + trace_seq_printf(s, " %6u", field->thread_count); + + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_osnoise_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct osnoise_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%lld %llu %llu %u %u %u %u %u\n", + field->runtime, + field->noise, + field->max_sample, + field->hw_count, + field->nmi_count, + field->irq_count, + field->softirq_count, + field->thread_count); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_osnoise_funcs = { + .trace = trace_osnoise_print, + .raw = trace_osnoise_raw, +}; + +static struct trace_event trace_osnoise_event = { + .type = TRACE_OSNOISE, + .funcs = &trace_osnoise_funcs, +}; + /* TRACE_BPUTS */ static enum print_line_t trace_bputs_print(struct trace_iterator *iter, int flags, @@ -1442,6 +1511,7 @@ static struct trace_event *events[] __initdata = { &trace_bprint_event, &trace_print_event, &trace_hwlat_event, + &trace_osnoise_event, &trace_raw_data_event, &trace_func_repeats_event, NULL From a955d7eac1779b437ceb24fc352026a2cbcec140 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 
16:42:28 +0200 Subject: [PATCH 31/49] trace: Add timerlat tracer The timerlat tracer aims to help the preemptive kernel developers to find sources of wakeup latencies of real-time threads. Like cyclictest, the tracer sets a periodic timer that wakes up a thread. The thread then computes a *wakeup latency* value as the difference between the *current time* and the *absolute time* that the timer was set to expire. The main goal of timerlat is tracing in such a way to help kernel developers. Usage Write the ASCII text "timerlat" into the current_tracer file of the tracing system (generally mounted at /sys/kernel/tracing). For example: [root@f32 ~]# cd /sys/kernel/tracing/ [root@f32 tracing]# echo timerlat > current_tracer It is possible to follow the trace by reading the trace file: [root@f32 tracing]# cat trace # tracer: timerlat # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # || / # |||| ACTIVATION # TASK-PID CPU# |||| TIMESTAMP ID CONTEXT LATENCY # | | | |||| | | | | -0 [000] d.h1 54.029328: #1 context irq timer_latency 932 ns <...>-867 [000] .... 54.029339: #1 context thread timer_latency 11700 ns -0 [001] dNh1 54.029346: #1 context irq timer_latency 2833 ns <...>-868 [001] .... 54.029353: #1 context thread timer_latency 9820 ns -0 [000] d.h1 54.030328: #2 context irq timer_latency 769 ns <...>-867 [000] .... 54.030330: #2 context thread timer_latency 3070 ns -0 [001] d.h1 54.030344: #2 context irq timer_latency 935 ns <...>-868 [001] .... 54.030347: #2 context thread timer_latency 4351 ns The tracer creates a per-cpu kernel thread with real-time priority that prints two lines at every activation. The first is the *timer latency* observed at the *hardirq* context before the activation of the thread. The second is the *timer latency* observed by the thread, which is the same level that cyclictest reports. The ACTIVATION ID field serves to relate the *irq* execution to its respective *thread* execution. 
The irq/thread splitting is important to clarify at which context the unexpected high value is coming from. The *irq* context can be delayed by hardware related actions, such as SMIs, NMIs, IRQs or by a thread masking interrupts. Once the timer happens, the delay can also be influenced by blocking caused by threads. For example, by postponing the scheduler execution via preempt_disable(), by the scheduler execution, or by masking interrupts. Threads can also be delayed by the interference from other threads and IRQs. The timerlat can also take advantage of the osnoise: traceevents. For example: [root@f32 ~]# cd /sys/kernel/tracing/ [root@f32 tracing]# echo timerlat > current_tracer [root@f32 tracing]# echo osnoise > set_event [root@f32 tracing]# echo 25 > osnoise/stop_tracing_total_us [root@f32 tracing]# tail -10 trace cc1-87882 [005] d..h... 548.771078: #402268 context irq timer_latency 1585 ns cc1-87882 [005] dNLh1.. 548.771082: irq_noise: local_timer:236 start 548.771077442 duration 4597 ns cc1-87882 [005] dNLh2.. 548.771083: irq_noise: reschedule:253 start 548.771083017 duration 56 ns cc1-87882 [005] dNLh2.. 548.771086: irq_noise: call_function_single:251 start 548.771083811 duration 2048 ns cc1-87882 [005] dNLh2.. 548.771088: irq_noise: call_function_single:251 start 548.771086814 duration 1495 ns cc1-87882 [005] dNLh2.. 548.771091: irq_noise: call_function_single:251 start 548.771089194 duration 1558 ns cc1-87882 [005] dNLh2.. 548.771094: irq_noise: call_function_single:251 start 548.771091719 duration 1932 ns cc1-87882 [005] dNLh2.. 548.771096: irq_noise: call_function_single:251 start 548.771094696 duration 1050 ns cc1-87882 [005] d...3.. 548.771101: thread_noise: cc1:87882 start 548.771078243 duration 10909 ns timerlat/5-1035 [005] ....... 
548.771103: #402268 context thread timer_latency 25960 ns For further information see: Documentation/trace/timerlat-tracer.rst Link: https://lkml.kernel.org/r/71f18efc013e1194bcaea1e54db957de2b19ba62.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/index.rst | 1 + Documentation/trace/timerlat-tracer.rst | 181 +++++++ kernel/trace/Kconfig | 28 ++ kernel/trace/trace.h | 2 + kernel/trace/trace_entries.h | 16 + kernel/trace/trace_osnoise.c | 634 ++++++++++++++++++++++-- kernel/trace/trace_output.c | 47 ++ 7 files changed, 874 insertions(+), 35 deletions(-) create mode 100644 Documentation/trace/timerlat-tracer.rst diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index 608107b27cc0..3769b9b7aed8 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -24,6 +24,7 @@ Linux Tracing Technologies boottime-trace hwlat_detector osnoise-tracer + timerlat-tracer intel_th ring-buffer-design stm diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst new file mode 100644 index 000000000000..c7cbb557aee7 --- /dev/null +++ b/Documentation/trace/timerlat-tracer.rst @@ -0,0 +1,181 @@ +############### +Timerlat tracer +############### + +The timerlat tracer aims to help the preemptive kernel developers to +find sources of wakeup latencies of real-time threads. Like cyclictest, +the tracer sets a periodic timer that wakes up a thread. The thread then +computes a *wakeup latency* value as the difference between the *current +time* and the *absolute time* that the timer was set to expire. 
The main +goal of timerlat is tracing in such a way to help kernel developers. + +Usage +----- + +Write the ASCII text "timerlat" into the current_tracer file of the +tracing system (generally mounted at /sys/kernel/tracing). + +For example:: + + [root@f32 ~]# cd /sys/kernel/tracing/ + [root@f32 tracing]# echo timerlat > current_tracer + +It is possible to follow the trace by reading the trace file:: + + [root@f32 tracing]# cat trace + # tracer: timerlat + # + # _-----=> irqs-off + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # || / + # |||| ACTIVATION + # TASK-PID CPU# |||| TIMESTAMP ID CONTEXT LATENCY + # | | | |||| | | | | + -0 [000] d.h1 54.029328: #1 context irq timer_latency 932 ns + <...>-867 [000] .... 54.029339: #1 context thread timer_latency 11700 ns + -0 [001] dNh1 54.029346: #1 context irq timer_latency 2833 ns + <...>-868 [001] .... 54.029353: #1 context thread timer_latency 9820 ns + -0 [000] d.h1 54.030328: #2 context irq timer_latency 769 ns + <...>-867 [000] .... 54.030330: #2 context thread timer_latency 3070 ns + -0 [001] d.h1 54.030344: #2 context irq timer_latency 935 ns + <...>-868 [001] .... 54.030347: #2 context thread timer_latency 4351 ns + + +The tracer creates a per-cpu kernel thread with real-time priority that +prints two lines at every activation. The first is the *timer latency* +observed at the *hardirq* context before the activation of the thread. +The second is the *timer latency* observed by the thread. The ACTIVATION +ID field serves to relate the *irq* execution to its respective *thread* +execution. + +The *irq*/*thread* splitting is important to clarify at which context +the unexpected high value is coming from. The *irq* context can be +delayed by hardware related actions, such as SMIs, NMIs, IRQs +or by a thread masking interrupts. Once the timer happens, the delay +can also be influenced by blocking caused by threads. 
For example, by +postponing the scheduler execution via preempt_disable(), by the +scheduler execution, or by masking interrupts. Threads can +also be delayed by the interference from other threads and IRQs. + +Tracer options +--------------------- + +The timerlat tracer is built on top of osnoise tracer. +So its configuration is also done in the osnoise/ config +directory. The timerlat configs are: + + - cpus: CPUs at which a timerlat thread will execute. + - timerlat_period_us: the period of the timerlat thread. + - stop_tracing_us: stop the system tracing if a + timer latency at the *irq* context higher than the configured + value happens. Writing 0 disables this option. + - stop_tracing_total_us: stop the system tracing if a + timer latency at the *thread* context higher than the configured + value happens. Writing 0 disables this option. + - print_stack: save the stack of the IRQ occurrence, and print + it after the *thread context* event. + +timerlat and osnoise +---------------------------- + +The timerlat can also take advantage of the osnoise: traceevents. +For example:: + + [root@f32 ~]# cd /sys/kernel/tracing/ + [root@f32 tracing]# echo timerlat > current_tracer + [root@f32 tracing]# echo 1 > events/osnoise/enable + [root@f32 tracing]# echo 25 > osnoise/stop_tracing_total_us + [root@f32 tracing]# tail -10 trace + cc1-87882 [005] d..h... 548.771078: #402268 context irq timer_latency 13585 ns + cc1-87882 [005] dNLh1.. 548.771082: irq_noise: local_timer:236 start 548.771077442 duration 7597 ns + cc1-87882 [005] dNLh2.. 548.771099: irq_noise: qxl:21 start 548.771085017 duration 7139 ns + cc1-87882 [005] d...3.. 548.771102: thread_noise: cc1:87882 start 548.771078243 duration 9909 ns + timerlat/5-1035 [005] ....... 548.771104: #402268 context thread timer_latency 39960 ns + +In this case, the root cause of the timer latency does not point to a +single cause, but to multiple ones. 
Firstly, the timer IRQ was delayed +for 13 us, which may point to a long IRQ disabled section (see IRQ +stacktrace section). Then the timer interrupt that wakes up the timerlat +thread took 7597 ns, and the qxl:21 device IRQ took 7139 ns. Finally, +the cc1 thread noise took 9909 ns of time before the context switch. +Such pieces of evidence are useful for the developer to use other +tracing methods to figure out how to debug and optimize the system. + +It is worth mentioning that the *duration* values reported +by the osnoise: events are *net* values. For example, the +thread_noise does not include the duration of the overhead caused +by the IRQ execution (which indeed accounted for 12736 ns). But +the values reported by the timerlat tracer (timerlat_latency) +are *gross* values. + +The art below illustrates a CPU timeline and how the timerlat tracer +observes it at the top and the osnoise: events at the bottom. Each "-" +in the timelines means circa 1 us, and the time moves ==>:: + + External timer irq thread + clock latency latency + event 13585 ns 39960 ns + | ^ ^ + v | | + |-------------| | + |-------------+-------------------------| + ^ ^ + ======================================================================== + [tmr irq] [dev irq] + [another thread...^ v..^ v.......][timerlat/ thread] <-- CPU timeline + ========================================================================= + |-------| |-------| + |--^ v-------| + | | | + | | + thread_noise: 9909 ns + | +-> irq_noise: 6139 ns + +-> irq_noise: 7597 ns + +IRQ stacktrace +--------------------------- + +The osnoise/print_stack option is helpful for the cases in which a thread +noise causes the major factor for the timer latency, because of preempt or +irq disabled. 
For example:: + + [root@f32 tracing]# echo 500 > osnoise/stop_tracing_total_us + [root@f32 tracing]# echo 500 > osnoise/print_stack + [root@f32 tracing]# echo timerlat > current_tracer + [root@f32 tracing]# tail -21 per_cpu/cpu7/trace + insmod-1026 [007] dN.h1.. 200.201948: irq_noise: local_timer:236 start 200.201939376 duration 7872 ns + insmod-1026 [007] d..h1.. 200.202587: #29800 context irq timer_latency 1616 ns + insmod-1026 [007] dN.h2.. 200.202598: irq_noise: local_timer:236 start 200.202586162 duration 11855 ns + insmod-1026 [007] dN.h3.. 200.202947: irq_noise: local_timer:236 start 200.202939174 duration 7318 ns + insmod-1026 [007] d...3.. 200.203444: thread_noise: insmod:1026 start 200.202586933 duration 838681 ns + timerlat/7-1001 [007] ....... 200.203445: #29800 context thread timer_latency 859978 ns + timerlat/7-1001 [007] ....1.. 200.203446: + => timerlat_irq + => __hrtimer_run_queues + => hrtimer_interrupt + => __sysvec_apic_timer_interrupt + => asm_call_irq_on_stack + => sysvec_apic_timer_interrupt + => asm_sysvec_apic_timer_interrupt + => delay_tsc + => dummy_load_1ms_pd_init + => do_one_initcall + => do_init_module + => __do_sys_finit_module + => do_syscall_64 + => entry_SYSCALL_64_after_hwframe + +In this case, it is possible to see that the thread added the highest +contribution to the *timer latency* and the stack trace, saved during +the timerlat IRQ handler, points to a function named +dummy_load_1ms_pd_init, which had the following code (on purpose):: + + static int __init dummy_load_1ms_pd_init(void) + { + preempt_disable(); + mdelay(1); + preempt_enable(); + return 0; + + } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 41582ae4682b..d567b1717c4c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -390,6 +390,34 @@ config OSNOISE_TRACER To enable this tracer, echo in "osnoise" into the current_tracer file. 
+config TIMERLAT_TRACER + bool "Timerlat tracer" + select OSNOISE_TRACER + select GENERIC_TRACER + help + The timerlat tracer aims to help the preemptive kernel developers + to find sources of wakeup latencies of real-time threads. + + The tracer creates a per-cpu kernel thread with real-time priority. + The tracer thread sets a periodic timer to wake itself up, and goes + to sleep waiting for the timer to fire. At the wakeup, the thread + then computes a wakeup latency value as the difference between + the current time and the absolute time that the timer was set + to expire. + + The tracer prints two lines at every activation. The first is the + timer latency observed at the hardirq context before the + activation of the thread. The second is the timer latency observed + by the thread, which is the same level that cyclictest reports. The + ACTIVATION ID field serves to relate the irq execution to its + respective thread execution. + + The tracer is built on top of osnoise tracer, and the osnoise: + events can be used to trace the source of interference from NMI, + IRQs and other threads. It also enables the capture of the + stacktrace at the IRQ context, which helps to identify the code + path that can cause thread delay. 
+ config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b959c9ec9711..d83bbb6859b4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,6 +45,7 @@ enum trace_type { TRACE_BPUTS, TRACE_HWLAT, TRACE_OSNOISE, + TRACE_TIMERLAT, TRACE_RAW_DATA, TRACE_FUNC_REPEATS, @@ -448,6 +449,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\ + IF_ASSIGN(var, ent, struct timerlat_entry, TRACE_TIMERLAT);\ IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 158c0984b59b..cd41e863b51c 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -385,3 +385,19 @@ FTRACE_ENTRY(osnoise, osnoise_entry, __entry->softirq_count, __entry->thread_count) ); + +FTRACE_ENTRY(timerlat, timerlat_entry, + + TRACE_TIMERLAT, + + F_STRUCT( + __field( unsigned int, seqnum ) + __field( int, context ) + __field( u64, timer_latency ) + ), + + F_printk("seq:%u\tcontext:%d\ttimer_latency:%llu\n", + __entry->seqnum, + __entry->context, + __entry->timer_latency) +); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4e2c47dc4f19..8546e66bafcb 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * OS Noise Tracer: computes the OS Noise suffered by a running thread. + * Timerlat Tracer: measures the wakeup latency of a timer triggered IRQ and thread. * * Based on "hwlat_detector" tracer by: * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. 
@@ -21,6 +22,7 @@ #include #include #include +#include #include #include "trace.h" @@ -45,6 +47,9 @@ static struct trace_array *osnoise_trace; #define DEFAULT_SAMPLE_PERIOD 1000000 /* 1s */ #define DEFAULT_SAMPLE_RUNTIME 1000000 /* 1s */ +#define DEFAULT_TIMERLAT_PERIOD 1000 /* 1ms */ +#define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */ + /* * NMI runtime info. */ @@ -62,6 +67,8 @@ struct osn_irq { u64 delta_start; }; +#define IRQ_CONTEXT 0 +#define THREAD_CONTEXT 1 /* * sofirq runtime info. */ @@ -108,16 +115,65 @@ static inline struct osnoise_variables *this_cpu_osn_var(void) return this_cpu_ptr(&per_cpu_osnoise_var); } +#ifdef CONFIG_TIMERLAT_TRACER /* - * osn_var_reset - Reset the values of the given osnoise_variables + * Runtime information for the timer mode. */ -static inline void osn_var_reset(struct osnoise_variables *osn_var) +struct timerlat_variables { + struct task_struct *kthread; + struct hrtimer timer; + u64 rel_period; + u64 abs_period; + bool tracing_thread; + u64 count; +}; + +DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); + +/* + * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU + */ +static inline struct timerlat_variables *this_cpu_tmr_var(void) { + return this_cpu_ptr(&per_cpu_timerlat_var); +} + +/* + * tlat_var_reset - Reset the values of the given timerlat_variables + */ +static inline void tlat_var_reset(void) +{ + struct timerlat_variables *tlat_var; + int cpu; /* * So far, all the values are initialized as 0, so * zeroing the structure is perfect. 
*/ - memset(osn_var, 0, sizeof(*osn_var)); + for_each_cpu(cpu, cpu_online_mask) { + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + memset(tlat_var, 0, sizeof(*tlat_var)); + } +} +#else /* CONFIG_TIMERLAT_TRACER */ +#define tlat_var_reset() do {} while (0) +#endif /* CONFIG_TIMERLAT_TRACER */ + +/* + * osn_var_reset - Reset the values of the given osnoise_variables + */ +static inline void osn_var_reset(void) +{ + struct osnoise_variables *osn_var; + int cpu; + + /* + * So far, all the values are initialized as 0, so + * zeroing the structure is perfect. + */ + for_each_cpu(cpu, cpu_online_mask) { + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + memset(osn_var, 0, sizeof(*osn_var)); + } } /* @@ -125,13 +181,8 @@ static inline void osn_var_reset(struct osnoise_variables *osn_var) */ static inline void osn_var_reset_all(void) { - struct osnoise_variables *osn_var; - int cpu; - - for_each_cpu(cpu, cpu_online_mask) { - osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); - osn_var_reset(osn_var); - } + osn_var_reset(); + tlat_var_reset(); } /* @@ -154,6 +205,18 @@ struct osnoise_sample { int thread_count; /* # threads during this sample */ }; +#ifdef CONFIG_TIMERLAT_TRACER +/* + * timerlat sample structure definition. Used to store the statistics of + * a sample run. + */ +struct timerlat_sample { + u64 timer_latency; /* timer_latency */ + unsigned int seqnum; /* unique sequence */ + int context; /* timer context */ +}; +#endif + /* * Protect the interface. 
*/ @@ -165,14 +228,24 @@ struct mutex interface_lock; static struct osnoise_data { u64 sample_period; /* total sampling period */ u64 sample_runtime; /* active sampling portion of period */ - u64 stop_tracing; /* stop trace in the inside operation (loop) */ - u64 stop_tracing_total; /* stop trace in the outside operation (report) */ + u64 stop_tracing; /* stop trace in the internal operation (loop/irq) */ + u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */ +#ifdef CONFIG_TIMERLAT_TRACER + u64 timerlat_period; /* timerlat period */ + u64 print_stack; /* print IRQ stack if total > */ + int timerlat_tracer; /* timerlat tracer */ +#endif bool tainted; /* infor users and developers about a problem */ } osnoise_data = { .sample_period = DEFAULT_SAMPLE_PERIOD, .sample_runtime = DEFAULT_SAMPLE_RUNTIME, .stop_tracing = 0, .stop_tracing_total = 0, +#ifdef CONFIG_TIMERLAT_TRACER + .print_stack = 0, + .timerlat_period = DEFAULT_TIMERLAT_PERIOD, + .timerlat_tracer = 0, +#endif }; /* @@ -246,6 +319,128 @@ static void trace_osnoise_sample(struct osnoise_sample *sample) trace_buffer_unlock_commit_nostack(buffer, event); } +#ifdef CONFIG_TIMERLAT_TRACER +/* + * Print the timerlat header info. + */ +static void print_timerlat_headers(struct seq_file *s) +{ + seq_puts(s, "# _-----=> irqs-off\n"); + seq_puts(s, "# / _----=> need-resched\n"); + seq_puts(s, "# | / _---=> hardirq/softirq\n"); + seq_puts(s, "# || / _--=> preempt-depth\n"); + seq_puts(s, "# || /\n"); + seq_puts(s, "# |||| ACTIVATION\n"); + seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP ID "); + seq_puts(s, " CONTEXT LATENCY\n"); + seq_puts(s, "# | | | |||| | | "); + seq_puts(s, " | |\n"); +} + +/* + * Record an timerlat_sample into the tracer buffer. 
+ */ +static void trace_timerlat_sample(struct timerlat_sample *sample) +{ + struct trace_array *tr = osnoise_trace; + struct trace_event_call *call = &event_osnoise; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct timerlat_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_TIMERLAT, sizeof(*entry), + tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->seqnum = sample->seqnum; + entry->context = sample->context; + entry->timer_latency = sample->timer_latency; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +#ifdef CONFIG_STACKTRACE + +#define MAX_CALLS 256 + +/* + * Stack trace will take place only at IRQ level, so, no need + * to control nesting here. + */ +struct trace_stack { + int stack_size; + int nr_entries; + unsigned long calls[MAX_CALLS]; +}; + +static DEFINE_PER_CPU(struct trace_stack, trace_stack); + +/* + * timerlat_save_stack - save a stack trace without printing + * + * Save the current stack trace without printing. The + * stack will be printed later, after the end of the measurement. + */ +static void timerlat_save_stack(int skip) +{ + unsigned int size, nr_entries; + struct trace_stack *fstack; + + fstack = this_cpu_ptr(&trace_stack); + + size = ARRAY_SIZE(fstack->calls); + + nr_entries = stack_trace_save(fstack->calls, size, skip); + + fstack->stack_size = nr_entries * sizeof(unsigned long); + fstack->nr_entries = nr_entries; + + return; + +} +/* + * timerlat_dump_stack - dump a stack trace previously saved + * + * Dump a saved stack trace into the trace buffer. 
+ */ +static void timerlat_dump_stack(void) +{ + struct trace_event_call *call = &event_osnoise; + struct trace_array *tr = osnoise_trace; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct trace_stack *fstack; + struct stack_entry *entry; + unsigned int size; + + preempt_disable_notrace(); + fstack = this_cpu_ptr(&trace_stack); + size = fstack->stack_size; + + event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size, + tracing_gen_ctx()); + if (!event) + goto out; + + entry = ring_buffer_event_data(event); + + memcpy(&entry->caller, fstack->calls, size); + entry->size = fstack->nr_entries; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); + +out: + preempt_enable_notrace(); +} +#else +#define timerlat_dump_stack() do {} while (0) +#define timerlat_save_stack(a) do {} while (0) +#endif /* CONFIG_STACKTRACE */ +#endif /* CONFIG_TIMERLAT_TRACER */ + /* * Macros to encapsulate the time capturing infrastructure. 
*/ @@ -387,6 +582,30 @@ set_int_safe_time(struct osnoise_variables *osn_var, u64 *time) return int_counter; } +#ifdef CONFIG_TIMERLAT_TRACER +/* + * copy_int_safe_time - Copy *src into *desc aware of interference + */ +static u64 +copy_int_safe_time(struct osnoise_variables *osn_var, u64 *dst, u64 *src) +{ + u64 int_counter; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + *dst = *src; + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + return int_counter; +} +#endif /* CONFIG_TIMERLAT_TRACER */ + /* * trace_osnoise_callback - NMI entry/exit callback * @@ -597,6 +816,22 @@ void trace_softirq_exit_callback(void *data, unsigned int vec_nr) if (!osn_var->sampling) return; +#ifdef CONFIG_TIMERLAT_TRACER + /* + * If the timerlat is enabled, but the irq handler did + * not run yet enabling timerlat_tracer, do not trace. + */ + if (unlikely(osnoise_data.timerlat_tracer)) { + struct timerlat_variables *tlat_var; + tlat_var = this_cpu_tmr_var(); + if (!tlat_var->tracing_thread) { + osn_var->softirq.arrival_time = 0; + osn_var->softirq.delta_start = 0; + return; + } + } +#endif + duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start); trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration); cond_move_thread_delta_start(osn_var, duration); @@ -689,6 +924,18 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) if (!osn_var->sampling) return; +#ifdef CONFIG_TIMERLAT_TRACER + if (osnoise_data.timerlat_tracer) { + struct timerlat_variables *tlat_var; + tlat_var = this_cpu_tmr_var(); + if (!tlat_var->tracing_thread) { + osn_var->thread.delta_start = 0; + osn_var->thread.arrival_time = 0; + return; + } + } +#endif + duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start); trace_thread_noise(t, osn_var->thread.arrival_time, duration); @@ -979,6 +1226,195 @@ static int osnoise_main(void 
*data) return 0; } +#ifdef CONFIG_TIMERLAT_TRACER +/* + * timerlat_irq - hrtimer handler for timerlat. + */ +static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + struct trace_array *tr = osnoise_trace; + struct timerlat_variables *tlat; + struct timerlat_sample s; + u64 now; + u64 diff; + + /* + * I am not sure if the timer was armed for this CPU. So, get + * the timerlat struct from the timer itself, not from this + * CPU. + */ + tlat = container_of(timer, struct timerlat_variables, timer); + + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + + /* + * Enable the osnoise: events for thread an softirq. + */ + tlat->tracing_thread = true; + + osn_var->thread.arrival_time = time_get(); + + /* + * A hardirq is running: the timer IRQ. It is for sure preempting + * a thread, and potentially preempting a softirq. + * + * At this point, it is not interesting to know the duration of the + * preempted thread (and maybe softirq), but how much time they will + * delay the beginning of the execution of the timer thread. + * + * To get the correct (net) delay added by the softirq, its delta_start + * is set as the IRQ one. In this way, at the return of the IRQ, the delta + * start of the sofitrq will be zeroed, accounting then only the time + * after that. + * + * The thread follows the same principle. However, if a softirq is + * running, the thread needs to receive the softirq delta_start. The + * reason being is that the softirq will be the last to be unfolded, + * resseting the thread delay to zero. 
+ */ +#ifndef CONFIG_PREEMPT_RT + if (osn_var->softirq.delta_start) { + copy_int_safe_time(osn_var, &osn_var->thread.delta_start, + &osn_var->softirq.delta_start); + + copy_int_safe_time(osn_var, &osn_var->softirq.delta_start, + &osn_var->irq.delta_start); + } else { + copy_int_safe_time(osn_var, &osn_var->thread.delta_start, + &osn_var->irq.delta_start); + } +#else /* CONFIG_PREEMPT_RT */ + /* + * The sofirqs run as threads on RT, so there is not need + * to keep track of it. + */ + copy_int_safe_time(osn_var, &osn_var->thread.delta_start, &osn_var->irq.delta_start); +#endif /* CONFIG_PREEMPT_RT */ + + /* + * Compute the current time with the expected time. + */ + diff = now - tlat->abs_period; + + tlat->count++; + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = IRQ_CONTEXT; + + trace_timerlat_sample(&s); + + /* Keep a running maximum ever recorded os noise "latency" */ + if (diff > tr->max_latency) { + tr->max_latency = diff; + latency_fsnotify(tr); + } + + if (osnoise_data.stop_tracing) + if (time_to_us(diff) >= osnoise_data.stop_tracing) + osnoise_stop_tracing(); + + wake_up_process(tlat->kthread); + + if (osnoise_data.print_stack) + timerlat_save_stack(0); + + return HRTIMER_NORESTART; +} + +/* + * wait_next_period - Wait for the next period for timerlat + */ +static int wait_next_period(struct timerlat_variables *tlat) +{ + ktime_t next_abs_period, now; + u64 rel_period = osnoise_data.timerlat_period * 1000; + + now = hrtimer_cb_get_time(&tlat->timer); + next_abs_period = ns_to_ktime(tlat->abs_period + rel_period); + + /* + * Save the next abs_period. + */ + tlat->abs_period = (u64) ktime_to_ns(next_abs_period); + + /* + * If the new abs_period is in the past, skip the activation. 
+ */ + while (ktime_compare(now, next_abs_period) > 0) { + next_abs_period = ns_to_ktime(tlat->abs_period + rel_period); + tlat->abs_period = (u64) ktime_to_ns(next_abs_period); + } + + set_current_state(TASK_INTERRUPTIBLE); + + hrtimer_start(&tlat->timer, next_abs_period, HRTIMER_MODE_ABS_PINNED_HARD); + schedule(); + return 1; +} + +/* + * timerlat_main- Timerlat main + */ +static int timerlat_main(void *data) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + struct timerlat_variables *tlat = this_cpu_tmr_var(); + struct timerlat_sample s; + struct sched_param sp; + u64 now, diff; + + /* + * Make the thread RT, that is how cyclictest is usually used. + */ + sp.sched_priority = DEFAULT_TIMERLAT_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + + tlat->count = 0; + tlat->tracing_thread = false; + + hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + tlat->timer.function = timerlat_irq; + tlat->kthread = current; + osn_var->pid = current->pid; + /* + * Anotate the arrival time. 
+ */ + tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); + + wait_next_period(tlat); + + osn_var->sampling = 1; + + while (!kthread_should_stop()) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_CONTEXT; + + trace_timerlat_sample(&s); + +#ifdef CONFIG_STACKTRACE + if (osnoise_data.print_stack) + if (osnoise_data.print_stack <= time_to_us(diff)) + timerlat_dump_stack(); +#endif /* CONFIG_STACKTRACE */ + + tlat->tracing_thread = false; + if (osnoise_data.stop_tracing_total) + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + + wait_next_period(tlat); + } + + hrtimer_cancel(&tlat->timer); + return 0; +} +#endif /* CONFIG_TIMERLAT_TRACER */ + /* * stop_per_cpu_kthread - stop per-cpu threads * @@ -1009,6 +1445,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) struct cpumask *current_mask = &save_cpumask; struct task_struct *kthread; char comm[24]; + void *main = osnoise_main; int cpu; get_online_cpus(); @@ -1026,9 +1463,17 @@ static int start_per_cpu_kthreads(struct trace_array *tr) per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; for_each_cpu(cpu, current_mask) { +#ifdef CONFIG_TIMERLAT_TRACER + if (osnoise_data.timerlat_tracer) { + snprintf(comm, 24, "timerlat/%d", cpu); + main = timerlat_main; + } else { + snprintf(comm, 24, "osnoise/%d", cpu); + } +#else snprintf(comm, 24, "osnoise/%d", cpu); - - kthread = kthread_create_on_cpu(osnoise_main, NULL, cpu, comm); +#endif + kthread = kthread_create_on_cpu(main, NULL, cpu, comm); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); @@ -1194,6 +1639,31 @@ static struct trace_min_max_param osnoise_stop_tracing_total = { .min = NULL, }; +#ifdef CONFIG_TIMERLAT_TRACER +/* + * osnoise/print_stack: print the stacktrace of the IRQ handler if the total + * latency is higher than val. 
+ */ +static struct trace_min_max_param osnoise_print_stack = { + .lock = &interface_lock, + .val = &osnoise_data.print_stack, + .max = NULL, + .min = NULL, +}; + +/* + * osnoise/timerlat_period: min 100 us, max 1 s + */ +u64 timerlat_min_period = 100; +u64 timerlat_max_period = 1000000; +static struct trace_min_max_param timerlat_period = { + .lock = &interface_lock, + .val = &osnoise_data.timerlat_period, + .max = &timerlat_max_period, + .min = &timerlat_min_period, +}; +#endif + static const struct file_operations cpus_fops = { .open = tracing_open_generic, .read = osnoise_cpus_read, @@ -1204,10 +1674,9 @@ static const struct file_operations cpus_fops = { /* * init_tracefs - A function to initialize the tracefs interface files * - * This function creates entries in tracefs for "osnoise". It creates the - * "osnoise" directory in the tracing directory, and within that - * directory is the count, runtime and period files to change and view - * those values. + * This function creates entries in tracefs for "osnoise" and "timerlat". + * It creates these directories in the tracing directory, and within that + * directory the use can change and view the configs. 
*/ static int init_tracefs(void) { @@ -1221,7 +1690,7 @@ static int init_tracefs(void) top_dir = tracefs_create_dir("osnoise", NULL); if (!top_dir) - return -ENOMEM; + return 0; tmp = tracefs_create_file("period_us", 0640, top_dir, &osnoise_period, &trace_min_max_fops); @@ -1246,6 +1715,19 @@ static int init_tracefs(void) tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops); if (!tmp) goto err; +#ifdef CONFIG_TIMERLAT_TRACER +#ifdef CONFIG_STACKTRACE + tmp = tracefs_create_file("print_stack", 0640, top_dir, + &osnoise_print_stack, &trace_min_max_fops); + if (!tmp) + goto err; +#endif + + tmp = tracefs_create_file("timerlat_period_us", 0640, top_dir, + &timerlat_period, &trace_min_max_fops); + if (!tmp) + goto err; +#endif return 0; @@ -1286,18 +1768,15 @@ out_unhook_irq: return -EINVAL; } -static void osnoise_tracer_start(struct trace_array *tr) +static int __osnoise_tracer_start(struct trace_array *tr) { int retval; - if (osnoise_busy) - return; - osn_var_reset_all(); retval = osnoise_hook_events(); if (retval) - goto out_err; + return retval; /* * Make sure NMIs see reseted values. */ @@ -1305,15 +1784,27 @@ static void osnoise_tracer_start(struct trace_array *tr) trace_osnoise_callback_enabled = true; retval = start_per_cpu_kthreads(tr); - /* - * all fine! 
- */ - if (!retval) + if (retval) { + unhook_irq_events(); + return retval; + } + + osnoise_busy = true; + + return 0; +} + +static void osnoise_tracer_start(struct trace_array *tr) +{ + int retval; + + if (osnoise_busy) return; -out_err: - unhook_irq_events(); - pr_err(BANNER "Error starting osnoise tracer\n"); + retval = __osnoise_tracer_start(tr); + if (retval) + pr_err(BANNER "Error starting osnoise tracer\n"); + } static void osnoise_tracer_stop(struct trace_array *tr) @@ -1335,18 +1826,16 @@ static void osnoise_tracer_stop(struct trace_array *tr) static int osnoise_tracer_init(struct trace_array *tr) { + /* Only allow one instance to enable this */ if (osnoise_busy) return -EBUSY; osnoise_trace = tr; - tr->max_latency = 0; osnoise_tracer_start(tr); - osnoise_busy = true; - return 0; } @@ -1365,6 +1854,71 @@ static struct tracer osnoise_tracer __read_mostly = { .allow_instances = true, }; +#ifdef CONFIG_TIMERLAT_TRACER +static void timerlat_tracer_start(struct trace_array *tr) +{ + int retval; + + if (osnoise_busy) + return; + + osnoise_data.timerlat_tracer = 1; + + retval = __osnoise_tracer_start(tr); + if (retval) + goto out_err; + + return; +out_err: + pr_err(BANNER "Error starting timerlat tracer\n"); +} + +static void timerlat_tracer_stop(struct trace_array *tr) +{ + int cpu; + + if (!osnoise_busy) + return; + + for_each_online_cpu(cpu) + per_cpu(per_cpu_osnoise_var, cpu).sampling = 0; + + osnoise_tracer_stop(tr); + + osnoise_data.timerlat_tracer = 0; +} + +static int timerlat_tracer_init(struct trace_array *tr) +{ + /* Only allow one instance to enable this */ + if (osnoise_busy) + return -EBUSY; + + osnoise_trace = tr; + + tr->max_latency = 0; + + timerlat_tracer_start(tr); + + return 0; +} + +static void timerlat_tracer_reset(struct trace_array *tr) +{ + timerlat_tracer_stop(tr); +} + +static struct tracer timerlat_tracer __read_mostly = { + .name = "timerlat", + .init = timerlat_tracer_init, + .reset = timerlat_tracer_reset, + .start = 
timerlat_tracer_start, + .stop = timerlat_tracer_stop, + .print_header = print_timerlat_headers, + .allow_instances = true, +}; +#endif /* CONFIG_TIMERLAT_TRACER */ + __init static int init_osnoise_tracer(void) { int ret; @@ -1374,8 +1928,18 @@ __init static int init_osnoise_tracer(void) cpumask_copy(&osnoise_cpumask, cpu_all_mask); ret = register_tracer(&osnoise_tracer); - if (ret) + if (ret) { + pr_err(BANNER "Error registering osnoise!\n"); return ret; + } + +#ifdef CONFIG_TIMERLAT_TRACER + ret = register_tracer(&timerlat_tracer); + if (ret) { + pr_err(BANNER "Error registering timerlat\n"); + return ret; + } +#endif init_tracefs(); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 642b6584eba5..a0bf446bb034 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1301,6 +1301,52 @@ static struct trace_event trace_osnoise_event = { .funcs = &trace_osnoise_funcs, }; +/* TRACE_TIMERLAT */ +static enum print_line_t +trace_timerlat_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct timerlat_entry *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n", + field->seqnum, + field->context ? 
"thread" : "irq", + field->timer_latency); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_timerlat_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct timerlat_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%u %d %llu\n", + field->seqnum, + field->context, + field->timer_latency); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_timerlat_funcs = { + .trace = trace_timerlat_print, + .raw = trace_timerlat_raw, +}; + +static struct trace_event trace_timerlat_event = { + .type = TRACE_TIMERLAT, + .funcs = &trace_timerlat_funcs, +}; + /* TRACE_BPUTS */ static enum print_line_t trace_bputs_print(struct trace_iterator *iter, int flags, @@ -1512,6 +1558,7 @@ static struct trace_event *events[] __initdata = { &trace_print_event, &trace_hwlat_event, &trace_osnoise_event, + &trace_timerlat_event, &trace_raw_data_event, &trace_func_repeats_event, NULL From 039a602db393c00665e7a3b968b15fb6fd7042b1 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:29 +0200 Subject: [PATCH 32/49] trace/hwlat: Protect kdata->kthread with get/put_online_cpus In preparation to the hotplug support, protect kdata->kthread with get/put_online_cpus() to avoid concurrency with hotplug operations. Link: https://lore.kernel.org/linux-doc/20210621134636.5b332226@oasis.local.home/ Link: https://lkml.kernel.org/r/8bdb2a56f46abfd301d6fffbf43448380c09a6f5.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Steven Rostedt (VMware) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index a625bfdb844e..20e31f79ebd9 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -396,13 +396,19 @@ static int kthread_fn(void *data) static void stop_single_kthread(void) { struct hwlat_kthread_data *kdata = get_cpu_data(); - struct task_struct *kthread = kdata->kthread; + struct task_struct *kthread; + + get_online_cpus(); + kthread = kdata->kthread; if (!kthread) - return; + goto out_put_cpus; kthread_stop(kthread); kdata->kthread = NULL; + +out_put_cpus: + put_online_cpus(); } @@ -419,20 +425,19 @@ static int start_single_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; + get_online_cpus(); if (kdata->kthread) - return 0; + goto out_put_cpus; kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); + put_online_cpus(); return -ENOMEM; } - /* Just pick the first CPU on first iteration */ - get_online_cpus(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); - put_online_cpus(); if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) { next_cpu = cpumask_first(current_mask); @@ -446,6 +451,8 @@ static int start_single_kthread(struct trace_array *tr) kdata->kthread = kthread; wake_up_process(kthread); +out_put_cpus: + put_online_cpus(); return 0; } From ba998f7d9531ef4ce462cabd2ce57a7558c33ede Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:31 +0200 Subject: [PATCH 33/49] trace/hwlat: Support hotplug operations Enable and disable hwlat thread during cpu hotplug online and offline operations, respectively. 
Link: https://lore.kernel.org/linux-doc/20210621134636.5b332226@oasis.local.home/ Link: https://lkml.kernel.org/r/52012d25ea35491a0f8088b947864d8df8e25157.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Steven Rostedt (VMware) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 65 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 20e31f79ebd9..a6c0cdaf4b87 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -466,6 +466,7 @@ static void stop_cpu_kthread(unsigned int cpu) kthread = per_cpu(hwlat_per_cpu_data, cpu).kthread; if (kthread) kthread_stop(kthread); + per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL; } /* @@ -506,6 +507,68 @@ static int start_cpu_kthread(unsigned int cpu) return 0; } +#ifdef CONFIG_HOTPLUG_CPU +static void hwlat_hotplug_workfn(struct work_struct *dummy) +{ + struct trace_array *tr = hwlat_trace; + unsigned int cpu = smp_processor_id(); + + mutex_lock(&trace_types_lock); + mutex_lock(&hwlat_data.lock); + get_online_cpus(); + + if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) + goto out_unlock; + + if (!cpumask_test_cpu(cpu, tr->tracing_cpumask)) + goto out_unlock; + + start_cpu_kthread(cpu); + +out_unlock: + put_online_cpus(); + mutex_unlock(&hwlat_data.lock); + mutex_unlock(&trace_types_lock); +} + +static DECLARE_WORK(hwlat_hotplug_work, hwlat_hotplug_workfn); + +/* + * hwlat_cpu_init - CPU hotplug online callback function + */ +static int hwlat_cpu_init(unsigned int cpu) +{ + schedule_work_on(cpu, &hwlat_hotplug_work); + return 
0; +} + +/* + * hwlat_cpu_die - CPU hotplug offline callback function + */ +static int hwlat_cpu_die(unsigned int cpu) +{ + stop_cpu_kthread(cpu); + return 0; +} + +static void hwlat_init_hotplug_support(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/hwlat:online", + hwlat_cpu_init, hwlat_cpu_die); + if (ret < 0) + pr_warn(BANNER "Error to init cpu hotplug support\n"); + + return; +} +#else /* CONFIG_HOTPLUG_CPU */ +static void hwlat_init_hotplug_support(void) +{ + return; +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * start_per_cpu_kthreads - Kick off the hardware latency sampling/detector kthreads * @@ -822,6 +885,8 @@ __init static int init_hwlat_tracer(void) if (ret) return ret; + hwlat_init_hotplug_support(); + init_tracefs(); return 0; From c8895e271f7994a3ecb13b8a280e39aa53879545 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:32 +0200 Subject: [PATCH 34/49] trace/osnoise: Support hotplug operations Enable and disable the osnoise/timerlat threads during CPU hotplug online and offline operations, respectively. Link: https://lore.kernel.org/linux-doc/20210621134636.5b332226@oasis.local.home/ Link: https://lkml.kernel.org/r/39f98590b3caeb3c32f09526214058efe0e9272a.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Steven Rostedt (VMware) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 165 ++++++++++++++++++++++++++++------- 1 file changed, 135 insertions(+), 30 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 8546e66bafcb..38aa5e208ffd 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1416,22 +1416,67 @@ static int timerlat_main(void *data) #endif /* CONFIG_TIMERLAT_TRACER */ /* - * stop_per_cpu_kthread - stop per-cpu threads + * stop_kthread - stop a workload thread + */ +static void stop_kthread(unsigned int cpu) +{ + struct task_struct *kthread; + + kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + if (kthread) + kthread_stop(kthread); + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; +} + +/* + * stop_per_cpu_kthread - Stop per-cpu threads * * Stop the osnoise sampling htread. Use this on unload and at system * shutdown. 
*/ static void stop_per_cpu_kthreads(void) { - struct task_struct *kthread; int cpu; - for_each_online_cpu(cpu) { - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; - if (kthread) - kthread_stop(kthread); - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + get_online_cpus(); + + for_each_online_cpu(cpu) + stop_kthread(cpu); + + put_online_cpus(); +} + +/* + * start_kthread - Start a workload tread + */ +static int start_kthread(unsigned int cpu) +{ + struct task_struct *kthread; + void *main = osnoise_main; + char comm[24]; + +#ifdef CONFIG_TIMERLAT_TRACER + if (osnoise_data.timerlat_tracer) { + snprintf(comm, 24, "timerlat/%d", cpu); + main = timerlat_main; + } else { + snprintf(comm, 24, "osnoise/%d", cpu); } +#else + snprintf(comm, 24, "osnoise/%d", cpu); +#endif + kthread = kthread_create_on_cpu(main, NULL, cpu, comm); + + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + stop_per_cpu_kthreads(); + return -ENOMEM; + } + + per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; + wake_up_process(kthread); + + return 0; } /* @@ -1443,9 +1488,7 @@ static void stop_per_cpu_kthreads(void) static int start_per_cpu_kthreads(struct trace_array *tr) { struct cpumask *current_mask = &save_cpumask; - struct task_struct *kthread; - char comm[24]; - void *main = osnoise_main; + int retval; int cpu; get_online_cpus(); @@ -1457,37 +1500,91 @@ static int start_per_cpu_kthreads(struct trace_array *tr) * And the CPU is online. 
*/ cpumask_and(current_mask, cpu_online_mask, current_mask); - put_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; for_each_cpu(cpu, current_mask) { -#ifdef CONFIG_TIMERLAT_TRACER - if (osnoise_data.timerlat_tracer) { - snprintf(comm, 24, "timerlat/%d", cpu); - main = timerlat_main; - } else { - snprintf(comm, 24, "osnoise/%d", cpu); - } -#else - snprintf(comm, 24, "osnoise/%d", cpu); -#endif - kthread = kthread_create_on_cpu(main, NULL, cpu, comm); - - if (IS_ERR(kthread)) { - pr_err(BANNER "could not start sampling thread\n"); + retval = start_kthread(cpu); + if (retval) { stop_per_cpu_kthreads(); - return -ENOMEM; + return retval; } - - per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; - wake_up_process(kthread); } + put_online_cpus(); + return 0; } +#ifdef CONFIG_HOTPLUG_CPU +static void osnoise_hotplug_workfn(struct work_struct *dummy) +{ + struct trace_array *tr = osnoise_trace; + unsigned int cpu = smp_processor_id(); + + + mutex_lock(&trace_types_lock); + + if (!osnoise_busy) + goto out_unlock_trace; + + mutex_lock(&interface_lock); + get_online_cpus(); + + if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) + goto out_unlock; + + if (!cpumask_test_cpu(cpu, tr->tracing_cpumask)) + goto out_unlock; + + start_kthread(cpu); + +out_unlock: + put_online_cpus(); + mutex_unlock(&interface_lock); +out_unlock_trace: + mutex_unlock(&trace_types_lock); +} + +static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn); + +/* + * osnoise_cpu_init - CPU hotplug online callback function + */ +static int osnoise_cpu_init(unsigned int cpu) +{ + schedule_work_on(cpu, &osnoise_hotplug_work); + return 0; +} + +/* + * osnoise_cpu_die - CPU hotplug offline callback function + */ +static int osnoise_cpu_die(unsigned int cpu) +{ + stop_kthread(cpu); + return 0; +} + +static void osnoise_init_hotplug_support(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/osnoise:online", + 
osnoise_cpu_init, osnoise_cpu_die); + if (ret < 0) + pr_warn(BANNER "Error to init cpu hotplug support\n"); + + return; +} +#else /* CONFIG_HOTPLUG_CPU */ +static void osnoise_init_hotplug_support(void) +{ + return; +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * osnoise_cpus_read - Read function for reading the "cpus" file * @filp: The active open file structure @@ -1583,7 +1680,14 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, osnoise_tracer_stop(tr); mutex_lock(&interface_lock); + /* + * osnoise_cpumask is read by CPU hotplug operations. + */ + get_online_cpus(); + cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); + + put_online_cpus(); mutex_unlock(&interface_lock); if (running) @@ -1940,6 +2044,7 @@ __init static int init_osnoise_tracer(void) return ret; } #endif + osnoise_init_hotplug_support(); init_tracefs(); From d3b16034a24a112bb83aeb669ac5b9b01f744bb7 Mon Sep 17 00:00:00 2001 From: Yun Zhou Date: Sat, 26 Jun 2021 11:21:55 +0800 Subject: [PATCH 35/49] seq_buf: Fix overflow in seq_buf_putmem_hex() There's two variables being increased in that loop (i and j), and i follows the raw data, and j follows what is being written into the buffer. We should compare 'i' to MAX_MEMHEX_BYTES or compare 'j' to HEX_CHARS. Otherwise, if 'j' goes bigger than HEX_CHARS, it will overflow the destination buffer. 
Link: https://lore.kernel.org/lkml/20210625122453.5e2fe304@oasis.local.home/ Link: https://lkml.kernel.org/r/20210626032156.47889-1-yun.zhou@windriver.com Cc: stable@vger.kernel.org Fixes: 5e3ca0ec76fce ("ftrace: introduce the "hex" output method") Signed-off-by: Yun Zhou Signed-off-by: Steven Rostedt (VMware) --- lib/seq_buf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 707453f5d58e..62c20596ef42 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -229,8 +229,10 @@ int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, WARN_ON(s->size == 0); + BUILD_BUG_ON(MAX_MEMHEX_BYTES * 2 >= HEX_CHARS); + while (len) { - start_len = min(len, HEX_CHARS - 1); + start_len = min(len, MAX_MEMHEX_BYTES); #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < start_len; i++) { #else From 6a2cbc58d6c9d90cd74288cc497c2b45815bc064 Mon Sep 17 00:00:00 2001 From: Yun Zhou Date: Sat, 26 Jun 2021 11:21:56 +0800 Subject: [PATCH 36/49] seq_buf: Make trace_seq_putmem_hex() support data longer than 8 Since the raw memory 'data' does not go forward, it will dump repeated data if the data length is more than 8. If we want to dump longer data blocks, we need to repeatedly call macro SEQ_PUT_HEX_FIELD. I think it is a bit redundant, and multiple function calls also affect the performance. 
Link: https://lore.kernel.org/lkml/20210625122453.5e2fe304@oasis.local.home/ Link: https://lkml.kernel.org/r/20210626032156.47889-2-yun.zhou@windriver.com Cc: stable@vger.kernel.org Fixes: 6d2289f3faa7 ("tracing: Make trace_seq_putmem_hex() more robust") Signed-off-by: Yun Zhou Signed-off-by: Steven Rostedt (VMware) --- lib/seq_buf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 62c20596ef42..6dafde851333 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -245,12 +245,14 @@ int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, break; /* j increments twice per loop */ - len -= j / 2; hex[j++] = ' '; seq_buf_putmem(s, hex, j); if (seq_buf_has_overflowed(s)) return -1; + + len -= start_len; + data += start_len; } return 0; } From b96285e10aad234acfa0628f7e8336990f778c03 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 28 Jun 2021 11:49:53 -0400 Subject: [PATCH 37/49] tracing: Have osnoise_main() add a quiescent state for task rcu ftracetest triggered: INFO: rcu_tasks detected stalls on tasks: 00000000b92b832d: .. nvcsw: 1/1 holdout: 1 idle_cpu: -1/7 task:osnoise/7 state:R running task stack: 0 pid: 2133 ppid: 2 flags:0x00004000 Call Trace: ? asm_sysvec_apic_timer_interrupt+0x12/0x20 ? asm_sysvec_apic_timer_interrupt+0x12/0x20 ? trace_hardirqs_on+0x2b/0xe0 ? asm_sysvec_apic_timer_interrupt+0x12/0x20 ? trace_clock_local+0xc/0x20 ? osnoise_main+0x10e/0x450 ? trace_softirq_entry_callback+0x50/0x50 ? kthread+0x153/0x170 ? __kthread_bind_mask+0x60/0x60 ? ret_from_fork+0x22/0x30 While running osnoise tracer with other tracers that rely on synchronize_rcu_tasks(), where that just hung. The reason is that osnoise_main() never schedules out if the interval is less than 1, and this will cause synchronize_rcu_tasks() to never return. Link: https://lkml.kernel.org/r/20210628114953.6dc06a91@oasis.local.home Fixes: bce29ac9ce0bb ("trace: Add osnoise tracer") Acked-by: Paul E. 
McKenney Reviewed-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 38aa5e208ffd..556d530af805 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1216,8 +1216,11 @@ static int osnoise_main(void *data) * differently from hwlat_detector, the osnoise tracer can run * without a pause because preemption is on. */ - if (interval < 1) + if (interval < 1) { + /* Let synchronize_rcu_tasks() make progress */ + cond_resched_tasks_rcu_qs(); continue; + } if (msleep_interruptible(interval)) break; From f7d9f6370e006400655ff96cb148f56598492d91 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 28 Jun 2021 11:45:47 +0200 Subject: [PATCH 38/49] trace/osnoise: Fix 'no previous prototype' warnings kernel test robot reported some osnoise functions with "no previous prototype." Fix these warnings by making local functions static, and by adding: void osnoise_trace_irq_entry(int id); void osnoise_trace_irq_exit(int id, const char *desc); to include/linux/trace.h. 
Link: https://lkml.kernel.org/r/e40d3cb4be8bde921f4b40fa6a095cf85ab807bd.1624872608.git.bristot@redhat.com Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Reported-by: kernel test robot Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/trace.c | 3 --- include/linux/trace.h | 2 ++ kernel/trace/trace_osnoise.c | 20 +++++++++++--------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c index 6912672c33a7..6b73b6f92ad3 100644 --- a/arch/x86/kernel/trace.c +++ b/arch/x86/kernel/trace.c @@ -2,9 +2,6 @@ #include #if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC) -extern void osnoise_trace_irq_entry(int id); -extern void osnoise_trace_irq_exit(int id, const char *desc); - /* * trace_intel_irq_entry - record intel specific IRQ entry */ diff --git a/include/linux/trace.h b/include/linux/trace.h index 4e3858640c47..bf169612ffe1 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -45,6 +45,8 @@ int trace_array_destroy(struct trace_array *tr); /* For osnoise tracer */ int osnoise_arch_register(void); void osnoise_arch_unregister(void); +void osnoise_trace_irq_entry(int id); +void osnoise_trace_irq_exit(int id, const char *desc); #endif /* CONFIG_TRACING */ diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 556d530af805..9c3109e3ffeb 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -736,7 +736,7 @@ void __weak osnoise_arch_unregister(void) * This function hooks the IRQ related callbacks to the respective trace * events. */ -int hook_irq_events(void) +static int hook_irq_events(void) { int ret; @@ -768,7 +768,7 @@ out_err: * This function unhooks the IRQ related callbacks to the respective trace * events. 
*/ -void unhook_irq_events(void) +static void unhook_irq_events(void) { osnoise_arch_unregister(); unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); @@ -785,7 +785,7 @@ void unhook_irq_events(void) * arrival time. The delta_start is used to compute the duration at the * softirq exit handler. See cond_move_softirq_delta_start(). */ -void trace_softirq_entry_callback(void *data, unsigned int vec_nr) +static void trace_softirq_entry_callback(void *data, unsigned int vec_nr) { struct osnoise_variables *osn_var = this_cpu_osn_var(); @@ -808,7 +808,7 @@ void trace_softirq_entry_callback(void *data, unsigned int vec_nr) * Computes the duration of the softirq noise, and trace it. Also discounts the * interference from other sources of noise could be currently being accounted. */ -void trace_softirq_exit_callback(void *data, unsigned int vec_nr) +static void trace_softirq_exit_callback(void *data, unsigned int vec_nr) { struct osnoise_variables *osn_var = this_cpu_osn_var(); int duration; @@ -949,7 +949,7 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) * This function is hooked to the sched:sched_switch trace event, and it is * used to record the beginning and to report the end of a thread noise window. */ -void +static void trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, struct task_struct *n) { @@ -968,7 +968,7 @@ trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, * Hook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. */ -int hook_thread_events(void) +static int hook_thread_events(void) { int ret; @@ -985,7 +985,7 @@ int hook_thread_events(void) * Unook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. 
*/ -void unhook_thread_events(void) +static void unhook_thread_events(void) { unregister_trace_sched_switch(trace_sched_switch_callback, NULL); } @@ -997,7 +997,8 @@ void unhook_thread_events(void) * values will be used later to compute the diff betwneen the statistics * before and after the osnoise sampling. */ -void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +static void +save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) { s->nmi_count = osn_var->nmi.count; s->irq_count = osn_var->irq.count; @@ -1012,7 +1013,8 @@ void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sam * statistics. The struct osnoise_sample *s contains the statistics saved via * save_osn_sample_stats() before the osnoise sampling. */ -void diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +static void +diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) { s->nmi_count = osn_var->nmi.count - s->nmi_count; s->irq_count = osn_var->irq.count - s->irq_count; From 2a81afa326fd23add336cfd7e35e1d699d11d9c4 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 28 Jun 2021 11:45:48 +0200 Subject: [PATCH 39/49] trace/osnoise: Make interval u64 on osnoise_main kernel test robot reported: >> kernel/trace/trace_osnoise.c:966:3: warning: comparison of distinct pointer types ('typeof ((interval)) *' (aka 'long long *') and 'uint64_t *' (aka 'unsigned long long *')) [-Wcompare-distinct-pointer-types] do_div(interval, USEC_PER_MSEC); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/asm-generic/div64.h:228:28: note: expanded from macro 'do_div' (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ ~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~ As interval cannot be negative because sample_period >= sample_runtime, making interval u64 on osnoise_main() is enough to fix this problem. 
Link: https://lkml.kernel.org/r/4ae1e7780563598563de079a3ef6d4d10b5f5546.1624872608.git.bristot@redhat.com Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Reported-by: kernel test robot Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 9c3109e3ffeb..79be14380581 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1202,7 +1202,7 @@ static struct cpumask save_cpumask; */ static int osnoise_main(void *data) { - s64 interval; + u64 interval; while (!kthread_should_stop()) { From 498627b4ac85780b9962ed9b5c5abbefd884ef8e Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 28 Jun 2021 11:45:49 +0200 Subject: [PATCH 40/49] trace/osnoise: Fix return value on osnoise_init_hotplug_support kernel test robot reported: >> kernel/trace/trace_osnoise.c:1584:2: error: void function 'osnoise_init_hotplug_support' should not return a value [-Wreturn-type] return 0; When !CONFIG_HOTPLUG_CPU. Fix this problem by removing the return value.
Link: https://lkml.kernel.org/r/c7fc67f1a117cc88bab2e508c898634872795341.1624872608.git.bristot@redhat.com Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Reported-by: kernel test robot Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 79be14380581..085a83de98ad 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1586,7 +1586,7 @@ static void osnoise_init_hotplug_support(void) #else /* CONFIG_HOTPLUG_CPU */ static void osnoise_init_hotplug_support(void) { - return 0; + return; } #endif /* CONFIG_HOTPLUG_CPU */ From bd09c0556eca17f55fb09a26b6ed27bedd1b42ef Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 28 Jun 2021 11:45:50 +0200 Subject: [PATCH 41/49] Documentation: Fix a typo on trace/osnoise-tracer s/RUNTIME IN USE/RUNTIME IN US/ Link: https://lkml.kernel.org/r/43e5160422a967218aa651c47f523e8d32d6a59e.1624872608.git.bristot@redhat.com Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/osnoise-tracer.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst index 37a3c10fb216..b648cb9bf1f0 100644 --- a/Documentation/trace/osnoise-tracer.rst +++ b/Documentation/trace/osnoise-tracer.rst @@ -77,7 +77,7 @@ In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the tracer prints a message at the end of each period for each CPU that is running an osnoise/ thread. The osnoise specific fields report: - - The RUNTIME IN USE reports the amount of time in microseconds that + - The RUNTIME IN US reports the amount of time in microseconds that the osnoise thread kept looping reading the time. 
 - The NOISE IN US reports the sum of noise in microseconds observed by the osnoise tracer during the associated runtime. From b62613b431bdababc90bf1440b2c7427172d94f4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 28 Jun 2021 13:55:22 +0100 Subject: [PATCH 42/49] tracing: Fix spelling in osnoise tracer "interferences" -> "interference" There is a spelling mistake in a TP_printk message, the word interferences is not the plural of interference. Fix this. Link: https://lkml.kernel.org/r/20210628125522.56361-1-colin.king@canonical.com Reviewed-by: Daniel Bristot de Oliveira Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/osnoise.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/osnoise.h b/include/trace/events/osnoise.h index 28762c69f6c9..82f741ec0f57 100644 --- a/include/trace/events/osnoise.h +++ b/include/trace/events/osnoise.h @@ -129,7 +129,7 @@ TRACE_EVENT(sample_threshold, __entry->interference = interference; ), - TP_printk("start %llu.%09u duration %llu ns interferences %llu", + TP_printk("start %llu.%09u duration %llu ns interference %llu", __print_ns_to_secs(__entry->start), __print_ns_without_secs(__entry->start), __entry->duration, From 9913d5745bd720c4266805c8d29952a3702e4eca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 29 Jun 2021 09:40:10 -0400 Subject: [PATCH 43/49] tracepoint: Add tracepoint_probe_register_may_exist() for BPF tracing All internal use cases for tracepoint_probe_register() are set to never be called with the same function and data. If it is, it is considered a bug, as that means the accounting of handling tracepoints is corrupted. If the function and data for a tracepoint are already registered when tracepoint_probe_register() is called, it will call WARN_ON_ONCE() and return with EEXIST.
The BPF system call can end up calling tracepoint_probe_register() with the same data, which now means that this can trigger the warning because of a user space process. As WARN_ON_ONCE() should not be called because user space called a system call with bad data, there needs to be a way to register a tracepoint without triggering a warning. Enter tracepoint_probe_register_may_exist(), which can be called, but will not cause a WARN_ON() if the probe already exists. It will still error out with EEXIST, which will then be sent to the user space that performed the BPF system call. This keeps the previous testing for issues with other users of the tracepoint code, while letting BPF call it with duplicated data and not warn about it. Link: https://lore.kernel.org/lkml/20210626135845.4080-1-penguin-kernel@I-love.SAKURA.ne.jp/ Link: https://syzkaller.appspot.com/bug?id=41f4318cf01762389f4d1c1c459da4f542fe5153 Cc: stable@vger.kernel.org Fixes: c4f6699dfcb85 ("bpf: introduce BPF_RAW_TRACEPOINT") Reported-by: syzbot Reported-by: Tetsuo Handa Tested-by: syzbot+721aa903751db87aa244@syzkaller.appspotmail.com Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 10 ++++++++++ kernel/trace/bpf_trace.c | 3 ++- kernel/tracepoint.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 13f65420f188..ab58696d0ddd 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -41,7 +41,17 @@ extern int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data, int prio); extern int +tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, void *data, + int prio); +extern int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data); +static inline int +tracepoint_probe_register_may_exist(struct tracepoint *tp, void *probe, + void *data) +{ + return 
tracepoint_probe_register_prio_may_exist(tp, probe, data, + TRACEPOINT_DEFAULT_PRIO); +} extern void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv), void *priv); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7a52bc172841..f0568b3d6bd1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1840,7 +1840,8 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_tp_access > btp->writable_size) return -EINVAL; - return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); + return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, + prog); } int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9f478d29b926..976bf8ce8039 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -273,7 +273,8 @@ static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func * Add the probe function to a tracepoint. */ static int tracepoint_add_func(struct tracepoint *tp, - struct tracepoint_func *func, int prio) + struct tracepoint_func *func, int prio, + bool warn) { struct tracepoint_func *old, *tp_funcs; int ret; @@ -288,7 +289,7 @@ static int tracepoint_add_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { - WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); + WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } @@ -343,6 +344,32 @@ static int tracepoint_remove_func(struct tracepoint *tp, return 0; } +/** + * tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority + * @tp: tracepoint + * @probe: probe handler + * @data: tracepoint data + * @prio: priority of this function over other registered functions + * + * Same as tracepoint_probe_register_prio() except that it will not warn + * if the tracepoint is already registered. 
+ */ +int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, + void *data, int prio) +{ + struct tracepoint_func tp_func; + int ret; + + mutex_lock(&tracepoints_mutex); + tp_func.func = probe; + tp_func.data = data; + tp_func.prio = prio; + ret = tracepoint_add_func(tp, &tp_func, prio, false); + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist); + /** * tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority * @tp: tracepoint @@ -366,7 +393,7 @@ int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, tp_func.func = probe; tp_func.data = data; tp_func.prio = prio; - ret = tracepoint_add_func(tp, &tp_func, prio); + ret = tracepoint_add_func(tp, &tp_func, prio, true); mutex_unlock(&tracepoints_mutex); return ret; } From 19c3eaa72288ce161441dd6b74b765a094d73488 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 29 Jun 2021 19:10:26 +0200 Subject: [PATCH 44/49] trace/osnoise: Make 'noise' variable s64 in run_osnoise() Dan Carpenter reported that: The patch bce29ac9ce0b: "trace: Add osnoise tracer" from Jun 22, 2021, leads to the following static checker warning: kernel/trace/trace_osnoise.c:1103 run_osnoise() warn: unsigned 'noise' is never less than zero. In this part of the code: 1100 /* 1101 * This shouldn't happen. 1102 */ 1103 if (noise < 0) { ^^^^^^^^^ 1104 osnoise_taint("negative noise!"); 1105 goto out; 1106 } 1107 And the static checker is right because 'noise' is u64. Make noise s64 and keep the check. It is important to check if the time read is behaving correctly - so we can trust the results. I also re-arranged some variable declarations.
Link: https://lkml.kernel.org/r/acd7cd6e7d56b798a298c3bc8139a390b3c4ab52.1624986368.git.bristot@redhat.com Cc: Ingo Molnar Cc: Daniel Bristot de Oliveira Cc: Dan Carpenter Cc: linux-kernel@vger.kernel.org Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Reported-by: Dan Carpenter Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 085a83de98ad..cc1faca1a620 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1042,15 +1042,16 @@ static void osnoise_stop_tracing(void) static int run_osnoise(void) { struct osnoise_variables *osn_var = this_cpu_osn_var(); - u64 noise = 0, sum_noise = 0, max_noise = 0; struct trace_array *tr = osnoise_trace; u64 start, sample, last_sample; u64 last_int_count, int_count; + s64 noise = 0, max_noise = 0; s64 total, last_total = 0; struct osnoise_sample s; unsigned int threshold; - int hw_count = 0; u64 runtime, stop_in; + u64 sum_noise = 0; + int hw_count = 0; int ret = -1; /* From 6a82f42a2e55d43bd99f1b457982be6c856c800d Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 29 Jun 2021 19:10:27 +0200 Subject: [PATCH 45/49] trace/timerlat: Fix indentation on timerlat_main() Dan Carpenter reported that: The patch a955d7eac177: "trace: Add timerlat tracer" from Jun 22, 2021, leads to the following static checker warning: kernel/trace/trace_osnoise.c:1400 timerlat_main() warn: inconsistent indenting here: 1389 while (!kthread_should_stop()) { 1390 now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); 1391 diff = now - tlat->abs_period; 1392 1393 s.seqnum = tlat->count; 1394 s.timer_latency = diff; 1395 s.context = THREAD_CONTEXT; 1396 1397 trace_timerlat_sample(&s); 1398 1399 #ifdef CONFIG_STACKTRACE 1400 if (osnoise_data.print_stack) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This should be indented another tab? 
1401 if (osnoise_data.print_stack <= time_to_us(diff)) 1402 timerlat_dump_stack(); 1403 #endif /* CONFIG_STACKTRACE */ 1404 1405 tlat->tracing_thread = false; 1406 if (osnoise_data.stop_tracing_total) 1407 if (time_to_us(diff) >= osnoise_data.stop_tracing_total) 1408 osnoise_stop_tracing(); 1409 1410 wait_next_period(tlat); 1411 } And the static checker is right. Fix the indentation. Link: https://lkml.kernel.org/r/3d5d8c9258fbdcfa9d3c7362941b3d13a2a28d9d.1624986368.git.bristot@redhat.com Cc: Ingo Molnar Cc: Daniel Bristot de Oliveira Cc: Dan Carpenter Cc: linux-kernel@vger.kernel.org Fixes: a955d7eac177 ("trace: Add timerlat tracer") Reported-by: Dan Carpenter Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index cc1faca1a620..a7e3c24dee13 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1403,9 +1403,9 @@ static int timerlat_main(void *data) trace_timerlat_sample(&s); #ifdef CONFIG_STACKTRACE - if (osnoise_data.print_stack) - if (osnoise_data.print_stack <= time_to_us(diff)) - timerlat_dump_stack(); + if (osnoise_data.print_stack) + if (osnoise_data.print_stack <= time_to_us(diff)) + timerlat_dump_stack(); #endif /* CONFIG_STACKTRACE */ tlat->tracing_thread = false; From bfbf8d157a8e85beca929f1959f567e03eb72c34 Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Tue, 29 Jun 2021 15:05:48 +0100 Subject: [PATCH 46/49] tracing: Change variable type as bool for clean-up The wakeup_rt, wakeup_dl and tracing_dl variables are only ever set to 0 or 1, so changing their type to bool makes the relevant routines more readable.
Link: https://lkml.kernel.org/r/20210629140548.GA1627@raspberrypi Signed-off-by: Austin Kim [ Removed unneeded initialization of static bool tracing_dl ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_sched_wakeup.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e5778d1d7a5b..2402de520eca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -26,9 +26,9 @@ static struct task_struct *wakeup_task; static int wakeup_cpu; static int wakeup_current_cpu; static unsigned wakeup_prio = -1; -static int wakeup_rt; -static int wakeup_dl; -static int tracing_dl = 0; +static bool wakeup_rt; +static bool wakeup_dl; +static bool tracing_dl; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -498,7 +498,7 @@ static void __wakeup_reset(struct trace_array *tr) { wakeup_cpu = -1; wakeup_prio = -1; - tracing_dl = 0; + tracing_dl = false; if (wakeup_task) put_task_struct(wakeup_task); @@ -572,9 +572,9 @@ probe_wakeup(void *ignore, struct task_struct *p) * another task until the first one wakes up. 
*/ if (dl_task(p)) - tracing_dl = 1; + tracing_dl = true; else - tracing_dl = 0; + tracing_dl = false; wakeup_task = get_task_struct(p); @@ -685,8 +685,8 @@ static int wakeup_tracer_init(struct trace_array *tr) if (wakeup_busy) return -EBUSY; - wakeup_dl = 0; - wakeup_rt = 0; + wakeup_dl = false; + wakeup_rt = false; return __wakeup_tracer_init(tr); } @@ -695,8 +695,8 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) if (wakeup_busy) return -EBUSY; - wakeup_dl = 0; - wakeup_rt = 1; + wakeup_dl = false; + wakeup_rt = true; return __wakeup_tracer_init(tr); } @@ -705,8 +705,8 @@ static int wakeup_dl_tracer_init(struct trace_array *tr) if (wakeup_busy) return -EBUSY; - wakeup_dl = 1; - wakeup_rt = 0; + wakeup_dl = true; + wakeup_rt = false; return __wakeup_tracer_init(tr); } From 78c14b385c195d4f25ab7c19186b8897a5b9ae3f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 12 Jun 2021 08:42:27 -0700 Subject: [PATCH 47/49] treewide: Add missing semicolons to __assign_str uses The __assign_str macro has an unusual ending semicolon but the vast majority of uses of the macro already have semicolon termination. $ git grep -P '\b__assign_str\b' | wc -l 551 $ git grep -P '\b__assign_str\b.*;' | wc -l 480 Add semicolons to the __assign_str() uses without semicolon termination and all the other uses without semicolon termination via additional defines that are equivalent to __assign_str() with the eventual goal of removing the semicolon from the __assign_str() macro definition. 
Link: https://lore.kernel.org/lkml/1e068d21106bb6db05b735b4916bb420e6c9842a.camel@perches.com/ Link: https://lkml.kernel.org/r/48a056adabd8f70444475352f617914cef504a45.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Steven Rostedt (VMware) --- drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 14 +++---- drivers/gpu/drm/lima/lima_trace.h | 2 +- drivers/infiniband/hw/hfi1/trace_misc.h | 4 +- drivers/infiniband/hw/hfi1/trace_rc.h | 4 +- drivers/infiniband/hw/hfi1/trace_tid.h | 6 +-- drivers/infiniband/hw/hfi1/trace_tx.h | 8 ++-- drivers/infiniband/sw/rdmavt/trace_cq.h | 4 +- drivers/infiniband/sw/rdmavt/trace_mr.h | 2 +- drivers/infiniband/sw/rdmavt/trace_qp.h | 4 +- drivers/infiniband/sw/rdmavt/trace_rc.h | 2 +- drivers/infiniband/sw/rdmavt/trace_tx.h | 4 +- drivers/misc/mei/mei-trace.h | 6 +-- .../ethernet/marvell/octeontx2/af/rvu_trace.h | 12 +++--- drivers/net/fjes/fjes_trace.h | 4 +- drivers/usb/cdns3/cdnsp-trace.h | 2 +- fs/nfs/nfs4trace.h | 6 +-- fs/nfs/nfstrace.h | 4 +- include/trace/events/btrfs.h | 2 +- include/trace/events/dma_fence.h | 4 +- include/trace/events/rpcgss.h | 4 +- include/trace/events/sunrpc.h | 40 +++++++++---------- 21 files changed, 69 insertions(+), 69 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h index 792d20261846..e8361210575e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h @@ -176,10 +176,10 @@ TRACE_EVENT(amdgpu_cs_ioctl, TP_fast_assign( __entry->sched_job_id = job->base.id; - __assign_str(timeline, AMDGPU_JOB_GET_TIMELINE_NAME(job)) + __assign_str(timeline, AMDGPU_JOB_GET_TIMELINE_NAME(job)); __entry->context = job->base.s_fence->finished.context; __entry->seqno = job->base.s_fence->finished.seqno; - __assign_str(ring, to_amdgpu_ring(job->base.sched)->name) + __assign_str(ring, to_amdgpu_ring(job->base.sched)->name); __entry->num_ibs = job->num_ibs; ), TP_printk("sched_job=%llu, timeline=%s, context=%u, 
seqno=%u, ring_name=%s, num_ibs=%u", @@ -201,10 +201,10 @@ TRACE_EVENT(amdgpu_sched_run_job, TP_fast_assign( __entry->sched_job_id = job->base.id; - __assign_str(timeline, AMDGPU_JOB_GET_TIMELINE_NAME(job)) + __assign_str(timeline, AMDGPU_JOB_GET_TIMELINE_NAME(job)); __entry->context = job->base.s_fence->finished.context; __entry->seqno = job->base.s_fence->finished.seqno; - __assign_str(ring, to_amdgpu_ring(job->base.sched)->name) + __assign_str(ring, to_amdgpu_ring(job->base.sched)->name); __entry->num_ibs = job->num_ibs; ), TP_printk("sched_job=%llu, timeline=%s, context=%u, seqno=%u, ring_name=%s, num_ibs=%u", @@ -229,7 +229,7 @@ TRACE_EVENT(amdgpu_vm_grab_id, TP_fast_assign( __entry->pasid = vm->pasid; - __assign_str(ring, ring->name) + __assign_str(ring, ring->name); __entry->vmid = job->vmid; __entry->vm_hub = ring->funcs->vmhub, __entry->pd_addr = job->vm_pd_addr; @@ -424,7 +424,7 @@ TRACE_EVENT(amdgpu_vm_flush, ), TP_fast_assign( - __assign_str(ring, ring->name) + __assign_str(ring, ring->name); __entry->vmid = vmid; __entry->vm_hub = ring->funcs->vmhub; __entry->pd_addr = pd_addr; @@ -525,7 +525,7 @@ TRACE_EVENT(amdgpu_ib_pipe_sync, ), TP_fast_assign( - __assign_str(ring, sched_job->base.sched->name) + __assign_str(ring, sched_job->base.sched->name); __entry->id = sched_job->base.id; __entry->fence = fence; __entry->ctx = fence->context; diff --git a/drivers/gpu/drm/lima/lima_trace.h b/drivers/gpu/drm/lima/lima_trace.h index 3a430e93d384..494b9790b1da 100644 --- a/drivers/gpu/drm/lima/lima_trace.h +++ b/drivers/gpu/drm/lima/lima_trace.h @@ -24,7 +24,7 @@ DECLARE_EVENT_CLASS(lima_task, __entry->task_id = task->base.id; __entry->context = task->base.s_fence->finished.context; __entry->seqno = task->base.s_fence->finished.seqno; - __assign_str(pipe, task->base.sched->name) + __assign_str(pipe, task->base.sched->name); ), TP_printk("task=%llu, context=%u seqno=%u pipe=%s", diff --git a/drivers/infiniband/hw/hfi1/trace_misc.h 
b/drivers/infiniband/hw/hfi1/trace_misc.h index 8db2253523ff..93338988b922 100644 --- a/drivers/infiniband/hw/hfi1/trace_misc.h +++ b/drivers/infiniband/hw/hfi1/trace_misc.h @@ -63,7 +63,7 @@ TRACE_EVENT(hfi1_interrupt, __array(char, buf, 64) __field(int, src) ), - TP_fast_assign(DD_DEV_ASSIGN(dd) + TP_fast_assign(DD_DEV_ASSIGN(dd); is_entry->is_name(__entry->buf, 64, src - is_entry->start); __entry->src = src; @@ -100,7 +100,7 @@ TRACE_EVENT(hfi1_fault_opcode, __field(u32, qpn) __field(u8, opcode) ), - TP_fast_assign(DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + TP_fast_assign(DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->opcode = opcode; ), diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h index 1ebca37862e0..5f49e1eeb211 100644 --- a/drivers/infiniband/hw/hfi1/trace_rc.h +++ b/drivers/infiniband/hw/hfi1/trace_rc.h @@ -70,7 +70,7 @@ DECLARE_EVENT_CLASS(hfi1_rc_template, __field(u32, r_psn) ), TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; __entry->psn = psn; @@ -130,7 +130,7 @@ DECLARE_EVENT_CLASS(/* rc_ack */ __field(u32, lpsn) ), TP_fast_assign(/* assign */ - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->aeth = aeth; __entry->psn = psn; diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index 985ffa9cc958..d129b8195959 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -886,7 +886,7 @@ DECLARE_EVENT_CLASS(/* sender_info */ __field(u8, s_retry) ), TP_fast_assign(/* assign */ - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->state = qp->state; __entry->s_cur = qp->s_cur; @@ -1285,7 
+1285,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __field(int, diff) ), TP_fast_assign(/* assign */ - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; __entry->state = qp->state; @@ -1574,7 +1574,7 @@ DECLARE_EVENT_CLASS(/* tid_ack */ __field(u32, resync_psn) ), TP_fast_assign(/* assign */ - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->aeth = aeth; __entry->psn = psn; diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index d44fc54858b9..f1922a7619fe 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, __field(unsigned long, iow_flags) ), TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->flags = flags; __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; @@ -868,7 +868,7 @@ TRACE_EVENT( __field(int, send_flags) ), TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->wqe = wqe; __entry->wr_id = wqe->wr.wr_id; __entry->qpn = qp->ibqp.qp_num; @@ -904,7 +904,7 @@ DECLARE_EVENT_CLASS( __field(bool, flag) ), TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->flag = flag; ), @@ -952,7 +952,7 @@ DECLARE_EVENT_CLASS(/* AIP */ __field(u8, stopped) ), TP_fast_assign(/* assign */ - DD_DEV_ASSIGN(txq->priv->dd) + DD_DEV_ASSIGN(txq->priv->dd); __entry->txq = txq; __entry->sde = txq->sde; __entry->head = txq->tx_ring.head; diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h index e3c416c6f900..91bc192cee5e 100644 --- 
a/drivers/infiniband/sw/rdmavt/trace_cq.h +++ b/drivers/infiniband/sw/rdmavt/trace_cq.h @@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(rvt_cq_template, __field(int, comp_vector_cpu) __field(u32, flags) ), - TP_fast_assign(RDI_DEV_ASSIGN(cq->rdi) + TP_fast_assign(RDI_DEV_ASSIGN(cq->rdi); __entry->ip = cq->ip; __entry->cqe = attr->cqe; __entry->comp_vector = attr->comp_vector; @@ -123,7 +123,7 @@ DECLARE_EVENT_CLASS( __field(u32, imm) ), TP_fast_assign( - RDI_DEV_ASSIGN(cq->rdi) + RDI_DEV_ASSIGN(cq->rdi); __entry->wr_id = wc->wr_id; __entry->status = wc->status; __entry->opcode = wc->opcode; diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h index 95b8a0e3b8bd..c5b675ca4fa0 100644 --- a/drivers/infiniband/sw/rdmavt/trace_mr.h +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -195,7 +195,7 @@ TRACE_EVENT( __field(uint, sg_offset) ), TP_fast_assign( - RDI_DEV_ASSIGN(ib_to_rvt(to_imr(ibmr)->mr.pd->device)) + RDI_DEV_ASSIGN(ib_to_rvt(to_imr(ibmr)->mr.pd->device)); __entry->ibmr_iova = ibmr->iova; __entry->iova = to_imr(ibmr)->mr.iova; __entry->user_base = to_imr(ibmr)->mr.user_base; diff --git a/drivers/infiniband/sw/rdmavt/trace_qp.h b/drivers/infiniband/sw/rdmavt/trace_qp.h index c32d21cc615e..800cec8bb3c7 100644 --- a/drivers/infiniband/sw/rdmavt/trace_qp.h +++ b/drivers/infiniband/sw/rdmavt/trace_qp.h @@ -65,7 +65,7 @@ DECLARE_EVENT_CLASS(rvt_qphash_template, __field(u32, bucket) ), TP_fast_assign( - RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->bucket = bucket; ), @@ -97,7 +97,7 @@ DECLARE_EVENT_CLASS( __field(u32, to) ), TP_fast_assign( - RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->hrtimer = &qp->s_rnr_timer; __entry->s_flags = qp->s_flags; diff --git a/drivers/infiniband/sw/rdmavt/trace_rc.h b/drivers/infiniband/sw/rdmavt/trace_rc.h index 
c47357af2099..9de52e138025 100644 --- a/drivers/infiniband/sw/rdmavt/trace_rc.h +++ b/drivers/infiniband/sw/rdmavt/trace_rc.h @@ -71,7 +71,7 @@ DECLARE_EVENT_CLASS(rvt_rc_template, __field(u32, r_psn) ), TP_fast_assign( - RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; __entry->psn = psn; diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h b/drivers/infiniband/sw/rdmavt/trace_tx.h index d963ca755828..cb96be0f8f19 100644 --- a/drivers/infiniband/sw/rdmavt/trace_tx.h +++ b/drivers/infiniband/sw/rdmavt/trace_tx.h @@ -111,7 +111,7 @@ TRACE_EVENT( __field(int, wr_num_sge) ), TP_fast_assign( - RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)); __entry->wqe = wqe; __entry->wr_id = wqe->wr.wr_id; __entry->qpn = qp->ibqp.qp_num; @@ -170,7 +170,7 @@ TRACE_EVENT( __field(int, send_flags) ), TP_fast_assign( - RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)); __entry->wqe = wqe; __entry->wr_id = wqe->wr.wr_id; __entry->qpn = qp->ibqp.qp_num; diff --git a/drivers/misc/mei/mei-trace.h b/drivers/misc/mei/mei-trace.h index df758033dc93..fe46ff2b9d69 100644 --- a/drivers/misc/mei/mei-trace.h +++ b/drivers/misc/mei/mei-trace.h @@ -26,7 +26,7 @@ TRACE_EVENT(mei_reg_read, __field(u32, val) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)) + __assign_str(dev, dev_name(dev)); __entry->reg = reg; __entry->offs = offs; __entry->val = val; @@ -45,7 +45,7 @@ TRACE_EVENT(mei_reg_write, __field(u32, val) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)) + __assign_str(dev, dev_name(dev)); __entry->reg = reg; __entry->offs = offs; __entry->val = val; @@ -64,7 +64,7 @@ TRACE_EVENT(mei_pci_cfg_read, __field(u32, val) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)) + __assign_str(dev, dev_name(dev)); __entry->reg = reg; __entry->offs = offs; __entry->val = val; diff --git 
a/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h index e6609068e81b..64aa7d350df1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h @@ -21,7 +21,7 @@ TRACE_EVENT(otx2_msg_alloc, __field(u16, id) __field(u64, size) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)) + TP_fast_assign(__assign_str(dev, pci_name(pdev)); __entry->id = id; __entry->size = size; ), @@ -36,7 +36,7 @@ TRACE_EVENT(otx2_msg_send, __field(u16, num_msgs) __field(u64, msg_size) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)) + TP_fast_assign(__assign_str(dev, pci_name(pdev)); __entry->num_msgs = num_msgs; __entry->msg_size = msg_size; ), @@ -52,7 +52,7 @@ TRACE_EVENT(otx2_msg_check, __field(u16, rspid) __field(int, rc) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)) + TP_fast_assign(__assign_str(dev, pci_name(pdev)); __entry->reqid = reqid; __entry->rspid = rspid; __entry->rc = rc; @@ -69,8 +69,8 @@ TRACE_EVENT(otx2_msg_interrupt, __string(str, msg) __field(u64, intr) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)) - __assign_str(str, msg) + TP_fast_assign(__assign_str(dev, pci_name(pdev)); + __assign_str(str, msg); __entry->intr = intr; ), TP_printk("[%s] mbox interrupt %s (0x%llx)\n", __get_str(dev), @@ -84,7 +84,7 @@ TRACE_EVENT(otx2_msg_process, __field(u16, id) __field(int, err) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)) + TP_fast_assign(__assign_str(dev, pci_name(pdev)); __entry->id = id; __entry->err = err; ), diff --git a/drivers/net/fjes/fjes_trace.h b/drivers/net/fjes/fjes_trace.h index 9237b69d8e21..6437ddbd7842 100644 --- a/drivers/net/fjes/fjes_trace.h +++ b/drivers/net/fjes/fjes_trace.h @@ -232,7 +232,7 @@ TRACE_EVENT(fjes_hw_start_debug_err, __string(err, err) ), TP_fast_assign( - __assign_str(err, err) + __assign_str(err, err); ), TP_printk("%s", __get_str(err)) ); @@ -258,7 +258,7 @@ 
TRACE_EVENT(fjes_hw_stop_debug_err, __string(err, err) ), TP_fast_assign( - __assign_str(err, err) + __assign_str(err, err); ), TP_printk("%s", __get_str(err)) ); diff --git a/drivers/usb/cdns3/cdnsp-trace.h b/drivers/usb/cdns3/cdnsp-trace.h index 5aa88ca012de..6a2571c6aa9e 100644 --- a/drivers/usb/cdns3/cdnsp-trace.h +++ b/drivers/usb/cdns3/cdnsp-trace.h @@ -138,7 +138,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_simple, __string(text, msg) ), TP_fast_assign( - __assign_str(text, msg) + __assign_str(text, msg); ), TP_printk("%s", __get_str(text)) ); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2ef75caad6da..7a2567aa2b86 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -625,7 +625,7 @@ TRACE_EVENT(nfs4_state_mgr, TP_fast_assign( __entry->state = clp->cl_state; - __assign_str(hostname, clp->cl_hostname) + __assign_str(hostname, clp->cl_hostname); ), TP_printk( @@ -1637,7 +1637,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown"); ), TP_printk( @@ -1694,7 +1694,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") + __assign_str(dstaddr, clp ? 
clp->cl_hostname : "unknown"); __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index eb1ef3462e84..dd0df132772a 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1431,8 +1431,8 @@ DECLARE_EVENT_CLASS(nfs_xdr_event, __entry->version = task->tk_client->cl_vers; __entry->error = error; __assign_str(program, - task->tk_client->cl_program->name) - __assign_str(procedure, task->tk_msg.rpc_proc->p_name) + task->tk_client->cl_program->name); + __assign_str(procedure, task->tk_msg.rpc_proc->p_name); ), TP_printk( diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index a41dd8a0c730..2b2d8bc66901 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1097,7 +1097,7 @@ TRACE_EVENT(btrfs_trigger_flush, __entry->flags = flags; __entry->bytes = bytes; __entry->flush = flush; - __assign_str(reason, reason) + __assign_str(reason, reason); ), TP_printk_btrfs("%s: flush=%d(%s) flags=%llu(%s) bytes=%llu", diff --git a/include/trace/events/dma_fence.h b/include/trace/events/dma_fence.h index 64e92d56c6a8..3963e79ca7b4 100644 --- a/include/trace/events/dma_fence.h +++ b/include/trace/events/dma_fence.h @@ -23,8 +23,8 @@ DECLARE_EVENT_CLASS(dma_fence, ), TP_fast_assign( - __assign_str(driver, fence->ops->get_driver_name(fence)) - __assign_str(timeline, fence->ops->get_timeline_name(fence)) + __assign_str(driver, fence->ops->get_driver_name(fence)); + __assign_str(timeline, fence->ops->get_timeline_name(fence)); __entry->context = fence->context; __entry->seqno = fence->seqno; ), diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index ffdbe6f85da8..b2a2672e6632 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -152,7 +152,7 @@ DECLARE_EVENT_CLASS(rpcgss_ctx_class, TP_fast_assign( __entry->cred = gc; __entry->service = gc->gc_service; - __assign_str(principal, gc->gc_principal) + 
__assign_str(principal, gc->gc_principal); ), TP_printk("cred=%p service=%s principal='%s'", @@ -535,7 +535,7 @@ TRACE_EVENT(rpcgss_upcall_msg, ), TP_fast_assign( - __assign_str(msg, buf) + __assign_str(msg, buf); ), TP_printk("msg='%s'", __get_str(msg)) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index d02e01a27b69..861f199896c6 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -154,8 +154,8 @@ TRACE_EVENT(rpc_clnt_new, __entry->client_id = clnt->cl_clid; __assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]); __assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]); - __assign_str(program, program) - __assign_str(server, server) + __assign_str(program, program); + __assign_str(server, server); ), TP_printk("client=%u peer=[%s]:%s program=%s server=%s", @@ -180,8 +180,8 @@ TRACE_EVENT(rpc_clnt_new_err, TP_fast_assign( __entry->error = error; - __assign_str(program, program) - __assign_str(server, server) + __assign_str(program, program); + __assign_str(server, server); ), TP_printk("program=%s server=%s error=%d", @@ -284,8 +284,8 @@ TRACE_EVENT(rpc_request, __entry->client_id = task->tk_client->cl_clid; __entry->version = task->tk_client->cl_vers; __entry->async = RPC_IS_ASYNC(task); - __assign_str(progname, task->tk_client->cl_program->name) - __assign_str(procname, rpc_proc_name(task)) + __assign_str(progname, task->tk_client->cl_program->name); + __assign_str(procname, rpc_proc_name(task)); ), TP_printk("task:%u@%u %sv%d %s (%ssync)", @@ -494,10 +494,10 @@ DECLARE_EVENT_CLASS(rpc_reply_event, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; __entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid); - __assign_str(progname, task->tk_client->cl_program->name) + __assign_str(progname, task->tk_client->cl_program->name); __entry->version = task->tk_client->cl_vers; - __assign_str(procname, rpc_proc_name(task)) - __assign_str(servername, task->tk_xprt->servername) + 
__assign_str(procname, rpc_proc_name(task)); + __assign_str(servername, task->tk_xprt->servername); ), TP_printk("task:%u@%d server=%s xid=0x%08x %sv%d %s", @@ -622,8 +622,8 @@ TRACE_EVENT(rpc_stats_latency, __entry->task_id = task->tk_pid; __entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid); __entry->version = task->tk_client->cl_vers; - __assign_str(progname, task->tk_client->cl_program->name) - __assign_str(procname, rpc_proc_name(task)) + __assign_str(progname, task->tk_client->cl_program->name); + __assign_str(procname, rpc_proc_name(task)); __entry->backlog = ktime_to_us(backlog); __entry->rtt = ktime_to_us(rtt); __entry->execute = ktime_to_us(execute); @@ -669,15 +669,15 @@ TRACE_EVENT(rpc_xdr_overflow, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; __assign_str(progname, - task->tk_client->cl_program->name) + task->tk_client->cl_program->name); __entry->version = task->tk_client->cl_vers; - __assign_str(procedure, task->tk_msg.rpc_proc->p_name) + __assign_str(procedure, task->tk_msg.rpc_proc->p_name); } else { __entry->task_id = 0; __entry->client_id = 0; - __assign_str(progname, "unknown") + __assign_str(progname, "unknown"); __entry->version = 0; - __assign_str(procedure, "unknown") + __assign_str(procedure, "unknown"); } __entry->requested = requested; __entry->end = xdr->end; @@ -735,9 +735,9 @@ TRACE_EVENT(rpc_xdr_alignment, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; __assign_str(progname, - task->tk_client->cl_program->name) + task->tk_client->cl_program->name); __entry->version = task->tk_client->cl_vers; - __assign_str(procedure, task->tk_msg.rpc_proc->p_name) + __assign_str(procedure, task->tk_msg.rpc_proc->p_name); __entry->offset = offset; __entry->copied = copied; @@ -1107,9 +1107,9 @@ TRACE_EVENT(xprt_retransmit, __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->ntrans = rqst->rq_ntrans; __assign_str(progname, - task->tk_client->cl_program->name) + 
task->tk_client->cl_program->name); __entry->version = task->tk_client->cl_vers; - __assign_str(procedure, task->tk_msg.rpc_proc->p_name) + __assign_str(procedure, task->tk_msg.rpc_proc->p_name); ), TP_printk( @@ -1842,7 +1842,7 @@ TRACE_EVENT(svc_xprt_accept, TP_fast_assign( __assign_str(addr, xprt->xpt_remotebuf); - __assign_str(protocol, xprt->xpt_class->xcl_name) + __assign_str(protocol, xprt->xpt_class->xcl_name); __assign_str(service, service); ), From b81b3e959adb107cd5b36c7dc5ba1364bbd31eb2 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 29 Jun 2021 17:34:05 -0700 Subject: [PATCH 48/49] tracing: Simplify & fix saved_tgids logic The tgid_map array records a mapping from pid to tgid, where the index of an entry within the array is the pid & the value stored at that index is the tgid. The saved_tgids_next() function iterates over pointers into the tgid_map array & dereferences the pointers which results in the tgid, but then it passes that dereferenced value to trace_find_tgid() which treats it as a pid & does a further lookup within the tgid_map array. It seems likely that the intent here was to skip over entries in tgid_map for which the recorded tgid is zero, but instead we end up skipping over entries for which the thread group leader hasn't yet had its own tgid recorded in tgid_map. A minimal fix would be to remove the call to trace_find_tgid, turning: if (trace_find_tgid(*ptr)) into: if (*ptr) ..but it seems like this logic can be much simpler if we simply let seq_read() iterate over the whole tgid_map array & filter out empty entries by returning SEQ_SKIP from saved_tgids_show(). Here we take that approach, removing the incorrect logic here entirely. 
Link: https://lkml.kernel.org/r/20210630003406.4013668-1-paulburton@google.com Fixes: d914ba37d714 ("tracing: Add support for recording tgid of tasks") Cc: Ingo Molnar Cc: Joel Fernandes Cc: Signed-off-by: Paul Burton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60492464281e..4843076d67d3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5662,37 +5662,20 @@ static const struct file_operations tracing_readme_fops = { static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) { - int *ptr = v; + int pid = ++(*pos); - if (*pos || m->count) - ptr++; + if (pid > PID_MAX_DEFAULT) + return NULL; - (*pos)++; - - for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) { - if (trace_find_tgid(*ptr)) - return ptr; - } - - return NULL; + return &tgid_map[pid]; } static void *saved_tgids_start(struct seq_file *m, loff_t *pos) { - void *v; - loff_t l = 0; - - if (!tgid_map) + if (!tgid_map || *pos > PID_MAX_DEFAULT) return NULL; - v = &tgid_map[0]; - while (l <= *pos) { - v = saved_tgids_next(m, v, &l); - if (!v) - return NULL; - } - - return v; + return &tgid_map[*pos]; } static void saved_tgids_stop(struct seq_file *m, void *v) @@ -5701,9 +5684,14 @@ static void saved_tgids_stop(struct seq_file *m, void *v) static int saved_tgids_show(struct seq_file *m, void *v) { - int pid = (int *)v - tgid_map; + int *entry = (int *)v; + int pid = entry - tgid_map; + int tgid = *entry; - seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid)); + if (tgid == 0) + return SEQ_SKIP; + + seq_printf(m, "%d %d\n", pid, tgid); return 0; } From 4030a6e6a6a4a42ff8c18414c9e0c93e24cc70b8 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 1 Jul 2021 10:24:07 -0700 Subject: [PATCH 49/49] tracing: Resize tgid_map to pid_max, not PID_MAX_DEFAULT Currently tgid_map is sized at PID_MAX_DEFAULT entries, which means 
that on systems where pid_max is configured higher than PID_MAX_DEFAULT the ftrace record-tgid option doesn't work so well. Any tasks with PIDs higher than PID_MAX_DEFAULT are simply not recorded in tgid_map, and don't show up in the saved_tgids file. In particular, since systemd v243 & above configure pid_max to its highest possible 1<<22 value by default on 64 bit systems, this renders the record-tgid option of little use. Increase the size of tgid_map to the configured pid_max instead, allowing it to cover the full range of PIDs up to the maximum value of PID_MAX_LIMIT if the system is configured that way. On 64 bit systems with pid_max == PID_MAX_LIMIT this will increase the size of tgid_map from 256KiB to 16MiB. Whilst this 64x increase in memory overhead sounds significant, 64 bit systems are presumably best placed to accommodate it, and since tgid_map is only allocated when the record-tgid option is actually used, presumably the user would rather it spends sufficient memory to actually record the tgids they expect. The size of tgid_map could also increase for CONFIG_BASE_SMALL=y configurations, but these seem unlikely to be systems upon which people are both configuring a large pid_max and running ftrace with record-tgid anyway. Of note is that we only allocate tgid_map once, the first time that the record-tgid option is enabled. Therefore its size is only set once, to the value of pid_max at the time the record-tgid option is first enabled. If a user increases pid_max after that point, the saved_tgids file will not contain entries for any tasks with pids beyond the earlier value of pid_max.
Link: https://lkml.kernel.org/r/20210701172407.889626-2-paulburton@google.com Fixes: d914ba37d714 ("tracing: Add support for recording tgid of tasks") Cc: Ingo Molnar Cc: Joel Fernandes Cc: Signed-off-by: Paul Burton [ Fixed comment coding style ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 63 +++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4843076d67d3..14f56e9fa001 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2191,8 +2191,15 @@ void tracing_reset_all_online_cpus(void) } } +/* + * The tgid_map array maps from pid to tgid; i.e. the value stored at index i + * is the tgid last observed corresponding to pid=i. + */ static int *tgid_map; +/* The maximum valid index into tgid_map. */ +static size_t tgid_map_max; + #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -2468,24 +2475,41 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } +static int *trace_find_tgid_ptr(int pid) +{ + /* + * Pairs with the smp_store_release in set_tracer_flag() to ensure that + * if we observe a non-NULL tgid_map then we also observe the correct + * tgid_map_max. + */ + int *map = smp_load_acquire(&tgid_map); + + if (unlikely(!map || pid > tgid_map_max)) + return NULL; + + return &map[pid]; +} + int trace_find_tgid(int pid) { - if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT)) - return 0; + int *ptr = trace_find_tgid_ptr(pid); - return tgid_map[pid]; + return ptr ? 
*ptr : 0; } static int trace_save_tgid(struct task_struct *tsk) { + int *ptr; + /* treat recording of idle task as a success */ if (!tsk->pid) return 1; - if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT)) + ptr = trace_find_tgid_ptr(tsk->pid); + if (!ptr) return 0; - tgid_map[tsk->pid] = tsk->tgid; + *ptr = tsk->tgid; return 1; } @@ -5225,6 +5249,8 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { + int *map; + if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD)) lockdep_assert_held(&event_mutex); @@ -5247,10 +5273,19 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_RECORD_TGID) { - if (!tgid_map) - tgid_map = kvcalloc(PID_MAX_DEFAULT + 1, - sizeof(*tgid_map), - GFP_KERNEL); + if (!tgid_map) { + tgid_map_max = pid_max; + map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), + GFP_KERNEL); + + /* + * Pairs with smp_load_acquire() in + * trace_find_tgid_ptr() to ensure that if it observes + * the tgid_map we just allocated then it also observes + * the corresponding tgid_map_max value. + */ + smp_store_release(&tgid_map, map); + } if (!tgid_map) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; return -ENOMEM; @@ -5664,18 +5699,14 @@ static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) { int pid = ++(*pos); - if (pid > PID_MAX_DEFAULT) - return NULL; - - return &tgid_map[pid]; + return trace_find_tgid_ptr(pid); } static void *saved_tgids_start(struct seq_file *m, loff_t *pos) { - if (!tgid_map || *pos > PID_MAX_DEFAULT) - return NULL; + int pid = *pos; - return &tgid_map[*pos]; + return trace_find_tgid_ptr(pid); } static void saved_tgids_stop(struct seq_file *m, void *v)