From 04204cd9b0b450bfe561a5f8d0fc91288c6427ab Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Thu, 1 Feb 2024 12:33:46 -0500
Subject: [PATCH 01/34] eventfs: Add WARN_ON_ONCE() to checks in
 eventfs_root_lookup()

There are a couple of if statements in eventfs_root_lookup() whose
conditions should never be true. Instead of removing them, wrap them in
WARN_ON_ONCE(). One is a tracefs_inode not being for eventfs. The other
is a child being freed but still on the parent's children list. When a
child is freed, it is removed from the list under the same mutex that is
held during the iteration.

Link: https://lore.kernel.org/linux-trace-kernel/20240201002719.GS2087318@ZenIV/
Link: https://lore.kernel.org/linux-trace-kernel/20240201123346.724afa46@gandalf.local.home

Cc: Linus Torvalds
Cc: Al Viro
Cc: Christian Brauner
Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Cc: Mark Rutland
Cc: Ajay Kaher
Reported-by: Al Viro
Signed-off-by: Steven Rostedt (Google)
---
 fs/tracefs/event_inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 110e8a272189..9d9c7dc3114b 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -483,7 +483,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
 	struct dentry *result = NULL;
 
 	ti = get_tracefs(dir);
-	if (!(ti->flags & TRACEFS_EVENT_INODE))
+	if (WARN_ON_ONCE(!(ti->flags & TRACEFS_EVENT_INODE)))
 		return ERR_PTR(-EIO);
 
 	mutex_lock(&eventfs_mutex);
@@ -495,7 +495,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
 	list_for_each_entry(ei_child, &ei->children, list) {
 		if (strcmp(ei_child->name, name) != 0)
 			continue;
-		if (ei_child->is_freed)
+		/* A child is freed and removed from the list at the same time */
+		if (WARN_ON_ONCE(ei_child->is_freed))
 			goto out;
 		result = lookup_dir_entry(dentry, ei, ei_child);
 		goto out;

From c3137ab6318d56370dd5541ebf027ddfc0c8557c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Thu, 1 Feb 2024 10:34:52 -0500
Subject: [PATCH 02/34] eventfs: Create eventfs_root_inode to store dentry

Only the root "events" directory stores a dentry. There's no reason to
hold a dentry pointer for every eventfs_inode, as it is never set except
for the root "events" eventfs_inode.

Create an eventfs_root_inode structure that holds the events_dir dentry.
The "events" eventfs_inode *is* special; let it have its own descriptor.
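
As background, a minimal sketch of the container_of() idiom this layout
relies on (shown simplified; the kernel's macro adds type checking):
embedding the generic eventfs_inode as a member lets the root-only
wrapper be recovered from a pointer to that member, so non-root
eventfs_inodes never carry the unused dentry pointer at all.

	/* simplified relative to the kernel's type-checked macro */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct eventfs_root_inode {
		struct eventfs_inode	ei;		/* generic part */
		struct dentry		*events_dir;	/* root-only data */
	};

	/* given an ei known to be the "events" inode: */
	struct eventfs_root_inode *rei =
		container_of(ei, struct eventfs_root_inode, ei);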
Link: https://lore.kernel.org/linux-trace-kernel/20240201161617.658992558@goodmis.org Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Christian Brauner Cc: Al Viro Cc: Ajay Kaher Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 65 +++++++++++++++++++++++++++++++++------- fs/tracefs/internal.h | 2 -- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 9d9c7dc3114b..dc067eeb6387 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -35,6 +35,17 @@ static DEFINE_MUTEX(eventfs_mutex); /* Choose something "unique" ;-) */ #define EVENTFS_FILE_INODE_INO 0x12c4e37 +struct eventfs_root_inode { + struct eventfs_inode ei; + struct dentry *events_dir; +}; + +static struct eventfs_root_inode *get_root_inode(struct eventfs_inode *ei) +{ + WARN_ON_ONCE(!ei->is_events); + return container_of(ei, struct eventfs_root_inode, ei); +} + /* Just try to make something consistent and unique */ static int eventfs_dir_ino(struct eventfs_inode *ei) { @@ -73,12 +84,18 @@ enum { static void release_ei(struct kref *ref) { struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref); + struct eventfs_root_inode *rei; WARN_ON_ONCE(!ei->is_freed); kfree(ei->entry_attrs); kfree_const(ei->name); - kfree_rcu(ei, rcu); + if (ei->is_events) { + rei = get_root_inode(ei); + kfree_rcu(rei, ei.rcu); + } else { + kfree_rcu(ei, rcu); + } } static inline void put_ei(struct eventfs_inode *ei) @@ -408,19 +425,43 @@ static struct dentry *lookup_dir_entry(struct dentry *dentry, return NULL; } +static inline struct eventfs_inode *init_ei(struct eventfs_inode *ei, const char *name) +{ + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) + return NULL; + kref_init(&ei->kref); + return ei; +} + static inline struct eventfs_inode *alloc_ei(const char *name) { struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL); + struct eventfs_inode *result; if (!ei) return NULL; - ei->name = kstrdup_const(name, GFP_KERNEL); - if (!ei->name) { + result = init_ei(ei, name); + if (!result) kfree(ei); + + return result; +} + +static inline struct eventfs_inode *alloc_root_ei(const char *name) +{ + struct eventfs_root_inode *rei = kzalloc(sizeof(*rei), GFP_KERNEL); + struct eventfs_inode *ei; + + if (!rei) return NULL; - } - kref_init(&ei->kref); + + rei->ei.is_events = 1; + ei = init_ei(&rei->ei, name); + if (!ei) + kfree(rei); + return ei; } @@ -710,6 +751,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry int size, void *data) { struct dentry *dentry = tracefs_start_creating(name, parent); + struct eventfs_root_inode *rei; struct eventfs_inode *ei; struct tracefs_inode *ti; struct inode *inode; @@ -722,7 +764,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry if (IS_ERR(dentry)) return ERR_CAST(dentry); - ei = alloc_ei(name); + ei = alloc_root_ei(name); if (!ei) goto fail; @@ -731,10 +773,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry goto fail; // Note: we have a ref to the dentry from tracefs_start_creating() - ei->events_dir = dentry; + rei = get_root_inode(ei); + rei->events_dir = dentry; + ei->entries = entries; ei->nr_entries = size; - ei->is_events = 1; ei->data = data; /* Save the ownership of this directory */ @@ -845,13 +888,15 @@ void eventfs_remove_dir(struct eventfs_inode *ei) */ void eventfs_remove_events_dir(struct eventfs_inode *ei) { + struct eventfs_root_inode *rei; 
 	struct dentry *dentry;
 
-	dentry = ei->events_dir;
+	rei = get_root_inode(ei);
+	dentry = rei->events_dir;
 	if (!dentry)
 		return;
 
-	ei->events_dir = NULL;
+	rei->events_dir = NULL;
 	eventfs_remove_dir(ei);
 
 	/*
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index beb3dcd0e434..15c26f9aaad4 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -36,7 +36,6 @@ struct eventfs_attr {
  * @children:	link list into the child eventfs_inode
  * @entries:	the array of entries representing the files in the directory
  * @name:	the name of the directory to create
- * @events_dir:	the dentry of the events directory
  * @entry_attrs: Saved mode and ownership of the @d_children
  * @data:	The private data to pass to the callbacks
  * @attr:	Saved mode and ownership of eventfs_inode itself
@@ -54,7 +53,6 @@ struct eventfs_inode {
 	struct list_head		children;
 	const struct eventfs_entry	*entries;
 	const char			*name;
-	struct dentry			*events_dir;
 	struct eventfs_attr		*entry_attrs;
 	void				*data;
 	struct eventfs_attr		attr;

From 0b18c852cc6fb8284ac0ab97e3e840974a6a8a64 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Tue, 20 Feb 2024 09:06:14 -0500
Subject: [PATCH 03/34] tracing: Have saved_cmdlines arrays all in one
 allocation

The saved_cmdlines code has three arrays for mapping PIDs to COMMs:

 - map_pid_to_cmdline[]
 - map_cmdline_to_pid[]
 - saved_cmdlines

The map_pid_to_cmdline[] array has PID_MAX_DEFAULT+1 entries and holds
the index into the other arrays. The map_cmdline_to_pid[] array maps back
to the full pid, as a pid can be larger than PID_MAX_DEFAULT. And the
saved_cmdlines[] array just holds the COMMs associated with the pids.

Currently the map_pid_to_cmdline[] and saved_cmdlines[] arrays are
allocated together (in reality, saved_cmdlines[] simply occupies the
slack left by rounding the structure's allocation up to a power of two).
The map_cmdline_to_pid[] array is allocated separately.

Since the rounding to a power of two leaves rather a lot of slack (enough
for 8000 saved_cmdlines entries), also place the map_cmdline_to_pid[]
array there. (This drops the default to about 6000 entries, which is
still plenty for most use cases.) This saves even more memory, as the
map_cmdline_to_pid[] array no longer needs its own allocation.
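
As a rough worked example of the 8000-to-6000 drop described above
(assuming the usual defaults of TASK_COMM_LEN = 16, a 4-byte unsigned,
and PID_MAX_DEFAULT = 32768, so the header with its map_pid_to_cmdline[]
array is a bit over 128K):

	old element size = TASK_COMM_LEN                    = 16 bytes
	new element size = TASK_COMM_LEN + sizeof(unsigned) = 20 bytes

	the allocation rounds up to 256K, leaving ~128K past the header:
	  ~128K / 16 bytes ~= 8000 saved_cmdlines entries before
	  ~128K / 20 bytes ~= 6500 entries now, with map_cmdline_to_pid[]
	  riding along in the same pages instead of a separate kmalloc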
Link: https://lore.kernel.org/linux-trace-kernel/20240212174011.068211d9@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20240220140703.182330529@goodmis.org

Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Tim Chen
Cc: Vincent Donnefort
Cc: Sven Schnelle
Cc: Mete Durlu
Fixes: 44dc5c41b5b1 ("tracing: Fix wasted memory in saved_cmdlines logic")
Acked-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ab4c1a1fbda8..70202e60a60a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2346,6 +2346,10 @@ struct saved_cmdlines_buffer {
 };
 static struct saved_cmdlines_buffer *savedcmd;
 
+/* Holds the size of a cmdline and pid element */
+#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s)			\
+	(TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))
+
 static inline char *get_saved_cmdlines(int idx)
 {
 	return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
@@ -2360,7 +2364,6 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
 {
 	int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
 
-	kfree(s->map_cmdline_to_pid);
 	kmemleak_free(s);
 	free_pages((unsigned long)s, order);
 }
@@ -2373,7 +2376,7 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
 	int order;
 
 	/* Figure out how much is needed to hold the given number of cmdlines */
-	orig_size = sizeof(*s) + val * TASK_COMM_LEN;
+	orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
 	order = get_order(orig_size);
 	size = 1 << (order + PAGE_SHIFT);
 	page = alloc_pages(GFP_KERNEL, order);
@@ -2385,16 +2388,11 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
 	memset(s, 0, sizeof(*s));
 
 	/* Round up to actual allocation */
-	val = (size - sizeof(*s)) / TASK_COMM_LEN;
+	val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
 	s->cmdline_num = val;
 
-	s->map_cmdline_to_pid = kmalloc_array(val,
-					      sizeof(*s->map_cmdline_to_pid),
-					      GFP_KERNEL);
-	if (!s->map_cmdline_to_pid) {
-		free_saved_cmdlines_buffer(s);
-		return NULL;
-	}
+	/* Place map_cmdline_to_pid array right after saved_cmdlines */
+	s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];
 
 	s->cmdline_idx = 0;
 	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,

From e85d471c2be5d3d4dadee1110db316f352daaf0b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Tue, 20 Feb 2024 09:06:15 -0500
Subject: [PATCH 04/34] tracing: Move open coded processing of tgid_map into
 helper function

In preparation for moving the saved_cmdlines logic out of trace.c and
into trace_sched_switch.c, replace the open coded manipulation of
tgid_map in set_tracer_flag() with a helper function,
trace_alloc_tgid_map(), so that it can be easily moved into
trace_sched_switch.c without changing existing functions in trace.c.

No functional changes.
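
For illustration, a sketch of the release/acquire pairing the new helper
keeps intact (publish side from trace_alloc_tgid_map(), consume side from
trace_find_tgid_ptr(); both appear in full in the diff below):

	/* publish: store the size first, then the pointer, with release */
	tgid_map_max = pid_max;
	smp_store_release(&tgid_map, map);

	/* consume: acquire the pointer; a non-NULL map implies a valid max */
	int *map = smp_load_acquire(&tgid_map);
	if (unlikely(!map || pid > tgid_map_max))
		return NULL;
	return &map[pid];

Keeping the publish side inside one helper makes the pairing easier to
audit once the code moves files.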
Link: https://lore.kernel.org/linux-trace-kernel/20240220140703.338116216@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tim Chen Cc: Vincent Donnefort Cc: Sven Schnelle Cc: Mete Durlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 70202e60a60a..d3e630dd36f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5453,10 +5453,31 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) return 0; } -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) +static int trace_alloc_tgid_map(void) { int *map; + if (tgid_map) + return 0; + + tgid_map_max = pid_max; + map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), + GFP_KERNEL); + if (!map) + return -ENOMEM; + + /* + * Pairs with smp_load_acquire() in + * trace_find_tgid_ptr() to ensure that if it observes + * the tgid_map we just allocated then it also observes + * the corresponding tgid_map_max value. + */ + smp_store_release(&tgid_map, map); + return 0; +} + +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) +{ if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD)) lockdep_assert_held(&event_mutex); @@ -5479,20 +5500,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_RECORD_TGID) { - if (!tgid_map) { - tgid_map_max = pid_max; - map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), - GFP_KERNEL); - - /* - * Pairs with smp_load_acquire() in - * trace_find_tgid_ptr() to ensure that if it observes - * the tgid_map we just allocated then it also observes - * the corresponding tgid_map_max value. - */ - smp_store_release(&tgid_map, map); - } - if (!tgid_map) { + if (trace_alloc_tgid_map() < 0) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; return -ENOMEM; } From 2cc621fd2e9b8494df06de459c14738cf76add91 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 20 Feb 2024 09:06:16 -0500 Subject: [PATCH 05/34] tracing: Move saved_cmdline code into trace_sched_switch.c The code that handles saved_cmdlines is split between the trace.c file and the trace_sched_switch.c. There's some history to this. The trace_sched_switch.c was originally created to handle the sched_switch tracer that was deprecated due to sched_switch trace event making it obsolete. But that file did not get deleted as it had some code to help with saved_cmdlines. But trace.c has grown tremendously since then. Just move all the saved_cmdlines code into trace_sched_switch.c as that's the only reason that file still exists, and trace.c has gotten too big. No functional changes. 
Link: https://lore.kernel.org/linux-trace-kernel/20240220140703.497966629@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tim Chen Cc: Vincent Donnefort Cc: Sven Schnelle Cc: Mete Durlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 515 +----------------------------- kernel/trace/trace.h | 10 + kernel/trace/trace_sched_switch.c | 515 ++++++++++++++++++++++++++++++ 3 files changed, 528 insertions(+), 512 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3e630dd36f8..1e5f80cda39a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -105,7 +104,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) * tracing is active, only save the comm when a trace event * occurred. */ -static DEFINE_PER_CPU(bool, trace_taskinfo_save); +DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). @@ -2320,96 +2319,6 @@ void tracing_reset_all_online_cpus(void) mutex_unlock(&trace_types_lock); } -/* - * The tgid_map array maps from pid to tgid; i.e. the value stored at index i - * is the tgid last observed corresponding to pid=i. - */ -static int *tgid_map; - -/* The maximum valid index into tgid_map. */ -static size_t tgid_map_max; - -#define SAVED_CMDLINES_DEFAULT 128 -#define NO_CMDLINE_MAP UINT_MAX -/* - * Preemption must be disabled before acquiring trace_cmdline_lock. - * The various trace_arrays' max_lock must be acquired in a context - * where interrupt is disabled. - */ -static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; -struct saved_cmdlines_buffer { - unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; - unsigned *map_cmdline_to_pid; - unsigned cmdline_num; - int cmdline_idx; - char saved_cmdlines[]; -}; -static struct saved_cmdlines_buffer *savedcmd; - -/* Holds the size of a cmdline and pid element */ -#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \ - (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0])) - -static inline char *get_saved_cmdlines(int idx) -{ - return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; -} - -static inline void set_cmdline(int idx, const char *cmdline) -{ - strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); -} - -static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -{ - int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); - - kmemleak_free(s); - free_pages((unsigned long)s, order); -} - -static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) -{ - struct saved_cmdlines_buffer *s; - struct page *page; - int orig_size, size; - int order; - - /* Figure out how much is needed to hold the given number of cmdlines */ - orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); - order = get_order(orig_size); - size = 1 << (order + PAGE_SHIFT); - page = alloc_pages(GFP_KERNEL, order); - if (!page) - return NULL; - - s = page_address(page); - kmemleak_alloc(s, size, 1, GFP_KERNEL); - memset(s, 0, sizeof(*s)); - - /* Round up to actual allocation */ - val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); - s->cmdline_num = val; - - /* Place map_cmdline_to_pid array right after saved_cmdlines */ - s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN]; - - s->cmdline_idx = 0; - memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, - sizeof(s->map_pid_to_cmdline)); - memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, - val * 
sizeof(*s->map_cmdline_to_pid)); - - return s; -} - -static int trace_create_savedcmd(void) -{ - savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); - - return savedcmd ? 0 : -ENOMEM; -} - int is_tracing_stopped(void) { return global_trace.stop_count; @@ -2502,201 +2411,6 @@ void tracing_stop(void) return tracing_stop_tr(&global_trace); } -static int trace_save_cmdline(struct task_struct *tsk) -{ - unsigned tpid, idx; - - /* treat recording of idle task as a success */ - if (!tsk->pid) - return 1; - - tpid = tsk->pid & (PID_MAX_DEFAULT - 1); - - /* - * It's not the end of the world if we don't get - * the lock, but we also don't want to spin - * nor do we want to disable interrupts, - * so if we miss here, then better luck next time. - * - * This is called within the scheduler and wake up, so interrupts - * had better been disabled and run queue lock been held. - */ - lockdep_assert_preemption_disabled(); - if (!arch_spin_trylock(&trace_cmdline_lock)) - return 0; - - idx = savedcmd->map_pid_to_cmdline[tpid]; - if (idx == NO_CMDLINE_MAP) { - idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; - - savedcmd->map_pid_to_cmdline[tpid] = idx; - savedcmd->cmdline_idx = idx; - } - - savedcmd->map_cmdline_to_pid[idx] = tsk->pid; - set_cmdline(idx, tsk->comm); - - arch_spin_unlock(&trace_cmdline_lock); - - return 1; -} - -static void __trace_find_cmdline(int pid, char comm[]) -{ - unsigned map; - int tpid; - - if (!pid) { - strcpy(comm, ""); - return; - } - - if (WARN_ON_ONCE(pid < 0)) { - strcpy(comm, ""); - return; - } - - tpid = pid & (PID_MAX_DEFAULT - 1); - map = savedcmd->map_pid_to_cmdline[tpid]; - if (map != NO_CMDLINE_MAP) { - tpid = savedcmd->map_cmdline_to_pid[map]; - if (tpid == pid) { - strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); - return; - } - } - strcpy(comm, "<...>"); -} - -void trace_find_cmdline(int pid, char comm[]) -{ - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - - __trace_find_cmdline(pid, comm); - - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); -} - -static int *trace_find_tgid_ptr(int pid) -{ - /* - * Pairs with the smp_store_release in set_tracer_flag() to ensure that - * if we observe a non-NULL tgid_map then we also observe the correct - * tgid_map_max. - */ - int *map = smp_load_acquire(&tgid_map); - - if (unlikely(!map || pid > tgid_map_max)) - return NULL; - - return &map[pid]; -} - -int trace_find_tgid(int pid) -{ - int *ptr = trace_find_tgid_ptr(pid); - - return ptr ? *ptr : 0; -} - -static int trace_save_tgid(struct task_struct *tsk) -{ - int *ptr; - - /* treat recording of idle task as a success */ - if (!tsk->pid) - return 1; - - ptr = trace_find_tgid_ptr(tsk->pid); - if (!ptr) - return 0; - - *ptr = tsk->tgid; - return 1; -} - -static bool tracing_record_taskinfo_skip(int flags) -{ - if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) - return true; - if (!__this_cpu_read(trace_taskinfo_save)) - return true; - return false; -} - -/** - * tracing_record_taskinfo - record the task info of a task - * - * @task: task to record - * @flags: TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid - */ -void tracing_record_taskinfo(struct task_struct *task, int flags) -{ - bool done; - - if (tracing_record_taskinfo_skip(flags)) - return; - - /* - * Record as much task information as possible. If some fail, continue - * to try to record the others. 
- */ - done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); - - /* If recording any information failed, retry again soon. */ - if (!done) - return; - - __this_cpu_write(trace_taskinfo_save, false); -} - -/** - * tracing_record_taskinfo_sched_switch - record task info for sched_switch - * - * @prev: previous task during sched_switch - * @next: next task during sched_switch - * @flags: TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid - */ -void tracing_record_taskinfo_sched_switch(struct task_struct *prev, - struct task_struct *next, int flags) -{ - bool done; - - if (tracing_record_taskinfo_skip(flags)) - return; - - /* - * Record as much task information as possible. If some fail, continue - * to try to record the others. - */ - done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); - done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); - - /* If recording any information failed, retry again soon. */ - if (!done) - return; - - __this_cpu_write(trace_taskinfo_save, false); -} - -/* Helpers to record a specific task information */ -void tracing_record_cmdline(struct task_struct *task) -{ - tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); -} - -void tracing_record_tgid(struct task_struct *task) -{ - tracing_record_taskinfo(task, TRACE_RECORD_TGID); -} - /* * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function @@ -5453,29 +5167,6 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) return 0; } -static int trace_alloc_tgid_map(void) -{ - int *map; - - if (tgid_map) - return 0; - - tgid_map_max = pid_max; - map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), - GFP_KERNEL); - if (!map) - return -ENOMEM; - - /* - * Pairs with smp_load_acquire() in - * trace_find_tgid_ptr() to ensure that if it observes - * the tgid_map we just allocated then it also observes - * the corresponding tgid_map_max value. 
- */ - smp_store_release(&tgid_map, map); - return 0; -} - int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { if ((mask == TRACE_ITER_RECORD_TGID) || @@ -5500,6 +5191,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_RECORD_TGID) { + if (trace_alloc_tgid_map() < 0) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; return -ENOMEM; @@ -5944,207 +5636,6 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; -static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) -{ - int pid = ++(*pos); - - return trace_find_tgid_ptr(pid); -} - -static void *saved_tgids_start(struct seq_file *m, loff_t *pos) -{ - int pid = *pos; - - return trace_find_tgid_ptr(pid); -} - -static void saved_tgids_stop(struct seq_file *m, void *v) -{ -} - -static int saved_tgids_show(struct seq_file *m, void *v) -{ - int *entry = (int *)v; - int pid = entry - tgid_map; - int tgid = *entry; - - if (tgid == 0) - return SEQ_SKIP; - - seq_printf(m, "%d %d\n", pid, tgid); - return 0; -} - -static const struct seq_operations tracing_saved_tgids_seq_ops = { - .start = saved_tgids_start, - .stop = saved_tgids_stop, - .next = saved_tgids_next, - .show = saved_tgids_show, -}; - -static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = tracing_check_open_get_tr(NULL); - if (ret) - return ret; - - return seq_open(filp, &tracing_saved_tgids_seq_ops); -} - - -static const struct file_operations tracing_saved_tgids_fops = { - .open = tracing_saved_tgids_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) -{ - unsigned int *ptr = v; - - if (*pos || m->count) - ptr++; - - (*pos)++; - - for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; - ptr++) { - if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) - continue; - - return ptr; - } - - return NULL; -} - -static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) -{ - void *v; - loff_t l = 0; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - - v = &savedcmd->map_cmdline_to_pid[0]; - while (l <= *pos) { - v = saved_cmdlines_next(m, v, &l); - if (!v) - return NULL; - } - - return v; -} - -static void saved_cmdlines_stop(struct seq_file *m, void *v) -{ - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); -} - -static int saved_cmdlines_show(struct seq_file *m, void *v) -{ - char buf[TASK_COMM_LEN]; - unsigned int *pid = v; - - __trace_find_cmdline(*pid, buf); - seq_printf(m, "%d %s\n", *pid, buf); - return 0; -} - -static const struct seq_operations tracing_saved_cmdlines_seq_ops = { - .start = saved_cmdlines_start, - .next = saved_cmdlines_next, - .stop = saved_cmdlines_stop, - .show = saved_cmdlines_show, -}; - -static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = tracing_check_open_get_tr(NULL); - if (ret) - return ret; - - return seq_open(filp, &tracing_saved_cmdlines_seq_ops); -} - -static const struct file_operations tracing_saved_cmdlines_fops = { - .open = tracing_saved_cmdlines_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static ssize_t -tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - r = scnprintf(buf, 
sizeof(buf), "%u\n", savedcmd->cmdline_num); - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static int tracing_resize_saved_cmdlines(unsigned int val) -{ - struct saved_cmdlines_buffer *s, *savedcmd_temp; - - s = allocate_cmdlines_buffer(val); - if (!s) - return -ENOMEM; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - savedcmd_temp = savedcmd; - savedcmd = s; - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); - free_saved_cmdlines_buffer(savedcmd_temp); - - return 0; -} - -static ssize_t -tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - /* must have at least 1 entry or less than PID_MAX_DEFAULT */ - if (!val || val > PID_MAX_DEFAULT) - return -EINVAL; - - ret = tracing_resize_saved_cmdlines((unsigned int)val); - if (ret < 0) - return ret; - - *ppos += cnt; - - return cnt; -} - -static const struct file_operations tracing_saved_cmdlines_size_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_size_read, - .write = tracing_saved_cmdlines_size_write, -}; - #ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) @@ -10720,7 +10211,7 @@ __init static int tracer_alloc_buffers(void) out_free_pipe_cpumask: free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: - free_saved_cmdlines_buffer(savedcmd); + trace_free_saved_cmdlines_buffer(); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_rm_hp_state: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 00f873910c5d..e4f0714d7a49 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1375,6 +1375,16 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr, trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); } +DECLARE_PER_CPU(bool, trace_taskinfo_save); +int trace_save_cmdline(struct task_struct *tsk); +int trace_create_savedcmd(void); +int trace_alloc_tgid_map(void); +void trace_free_saved_cmdlines_buffer(void); + +extern const struct file_operations tracing_saved_cmdlines_fops; +extern const struct file_operations tracing_saved_tgids_fops; +extern const struct file_operations tracing_saved_cmdlines_size_fops; + DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DECLARE_PER_CPU(int, trace_buffered_event_cnt); void trace_buffered_event_disable(void); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index c9ffdcfe622e..8a407adb0e1c 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -148,3 +149,517 @@ void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); } + +/* + * The tgid_map array maps from pid to tgid; i.e. the value stored at index i + * is the tgid last observed corresponding to pid=i. + */ +static int *tgid_map; + +/* The maximum valid index into tgid_map. */ +static size_t tgid_map_max; + +#define SAVED_CMDLINES_DEFAULT 128 +#define NO_CMDLINE_MAP UINT_MAX +/* + * Preemption must be disabled before acquiring trace_cmdline_lock. + * The various trace_arrays' max_lock must be acquired in a context + * where interrupt is disabled. 
+ */ +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char saved_cmdlines[]; +}; +static struct saved_cmdlines_buffer *savedcmd; + +/* Holds the size of a cmdline and pid element */ +#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \ + (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0])) + +static inline char *get_saved_cmdlines(int idx) +{ + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) +{ + strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); + + kmemleak_free(s); + free_pages((unsigned long)s, order); +} + +static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) +{ + struct saved_cmdlines_buffer *s; + struct page *page; + int orig_size, size; + int order; + + /* Figure out how much is needed to hold the given number of cmdlines */ + orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); + order = get_order(orig_size); + size = 1 << (order + PAGE_SHIFT); + page = alloc_pages(GFP_KERNEL, order); + if (!page) + return NULL; + + s = page_address(page); + kmemleak_alloc(s, size, 1, GFP_KERNEL); + memset(s, 0, sizeof(*s)); + + /* Round up to actual allocation */ + val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); + s->cmdline_num = val; + + /* Place map_cmdline_to_pid array right after saved_cmdlines */ + s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN]; + + s->cmdline_idx = 0; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return s; +} + +int trace_create_savedcmd(void) +{ + savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); + + return savedcmd ? 0 : -ENOMEM; +} + +int trace_save_cmdline(struct task_struct *tsk) +{ + unsigned tpid, idx; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + * + * This is called within the scheduler and wake up, so interrupts + * had better been disabled and run queue lock been held. 
+ */ + lockdep_assert_preemption_disabled(); + if (!arch_spin_trylock(&trace_cmdline_lock)) + return 0; + + idx = savedcmd->map_pid_to_cmdline[tpid]; + if (idx == NO_CMDLINE_MAP) { + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; + + savedcmd->map_pid_to_cmdline[tpid] = idx; + savedcmd->cmdline_idx = idx; + } + + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + set_cmdline(idx, tsk->comm); + + arch_spin_unlock(&trace_cmdline_lock); + + return 1; +} + +static void __trace_find_cmdline(int pid, char comm[]) +{ + unsigned map; + int tpid; + + if (!pid) { + strcpy(comm, ""); + return; + } + + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, ""); + return; + } + + tpid = pid & (PID_MAX_DEFAULT - 1); + map = savedcmd->map_pid_to_cmdline[tpid]; + if (map != NO_CMDLINE_MAP) { + tpid = savedcmd->map_cmdline_to_pid[map]; + if (tpid == pid) { + strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + return; + } + } + strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); + + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int *trace_find_tgid_ptr(int pid) +{ + /* + * Pairs with the smp_store_release in set_tracer_flag() to ensure that + * if we observe a non-NULL tgid_map then we also observe the correct + * tgid_map_max. + */ + int *map = smp_load_acquire(&tgid_map); + + if (unlikely(!map || pid > tgid_map_max)) + return NULL; + + return &map[pid]; +} + +int trace_find_tgid(int pid) +{ + int *ptr = trace_find_tgid_ptr(pid); + + return ptr ? *ptr : 0; +} + +static int trace_save_tgid(struct task_struct *tsk) +{ + int *ptr; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + ptr = trace_find_tgid_ptr(tsk->pid); + if (!ptr) + return 0; + + *ptr = tsk->tgid; + return 1; +} + +static bool tracing_record_taskinfo_skip(int flags) +{ + if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) + return true; + if (!__this_cpu_read(trace_taskinfo_save)) + return true; + return false; +} + +/** + * tracing_record_taskinfo - record the task info of a task + * + * @task: task to record + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo(struct task_struct *task, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. */ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/** + * tracing_record_taskinfo_sched_switch - record task info for sched_switch + * + * @prev: previous task during sched_switch + * @next: next task during sched_switch + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. 
+ */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); + + /* If recording any information failed, retry again soon. */ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/* Helpers to record a specific task information */ +void tracing_record_cmdline(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); +} + +void tracing_record_tgid(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_TGID); +} + +int trace_alloc_tgid_map(void) +{ + int *map; + + if (tgid_map) + return 0; + + tgid_map_max = pid_max; + map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), + GFP_KERNEL); + if (!map) + return -ENOMEM; + + /* + * Pairs with smp_load_acquire() in + * trace_find_tgid_ptr() to ensure that if it observes + * the tgid_map we just allocated then it also observes + * the corresponding tgid_map_max value. + */ + smp_store_release(&tgid_map, map); + return 0; +} + +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int pid = ++(*pos); + + return trace_find_tgid_ptr(pid); +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + int pid = *pos; + + return trace_find_tgid_ptr(pid); +} + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int *entry = (int *)v; + int pid = entry - tgid_map; + int tgid = *entry; + + if (tgid == 0) + return SEQ_SKIP; + + seq_printf(m, "%d %d\n", pid, tgid); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; + + return ptr; + } + + return NULL; +} + +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + v = &savedcmd->map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + __trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode 
*inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); +} + +const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static ssize_t +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +void trace_free_saved_cmdlines_buffer(void) +{ + free_saved_cmdlines_buffer(savedcmd); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = allocate_cmdlines_buffer(val); + if (!s) + return -ENOMEM; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + free_saved_cmdlines_buffer(savedcmd_temp); + + return 0; +} + +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; + + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, +}; From 6b76323e5a483e53568254146c3d141123f3b839 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 20 Feb 2024 20:23:05 +0000 Subject: [PATCH 06/34] ring-buffer: Zero ring-buffer sub-buffers In preparation for the ring-buffer memory mapping where each subbuf will be accessible to user-space, zero all the page allocations. 
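
For illustration, the hunks below amount to adding __GFP_ZERO at each
sub-buffer allocation site; a sketch (node/order are placeholders here):

	/* __GFP_ZERO makes the allocator hand back already-cleared pages, */
	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, order);
	/* equivalent in effect to allocating and then doing:              */
	memset(page_address(page), 0, PAGE_SIZE << order);

so stale kernel data can never end up in a sub-buffer that is later
mapped into user-space.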
Link: https://lore.kernel.org/linux-trace-kernel/20240220202310.2489614-2-vdonnefort@google.com

Signed-off-by: Vincent Donnefort
Reviewed-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 09df645ab9c7..67d32af48640 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1515,7 +1515,8 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 
 		list_add(&bpage->list, pages);
 
-		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+					mflags | __GFP_ZERO,
 					cpu_buffer->buffer->subbuf_order);
 		if (!page)
 			goto free_pages;
@@ -1600,7 +1601,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 
 	cpu_buffer->reader_page = bpage;
 
-	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
+	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO,
+				cpu_buffer->buffer->subbuf_order);
 	if (!page)
 		goto fail_free_reader;
 	bpage->page = page_address(page);
@@ -5568,7 +5570,8 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
 	if (bpage->data)
 		goto out;
 
-	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+	page = alloc_pages_node(cpu_to_node(cpu),
+				GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO,
 				cpu_buffer->buffer->subbuf_order);
 	if (!page) {
 		kfree(bpage);

From f1e30cb6369251c03f63c564006f96a54197dcc4 Mon Sep 17 00:00:00 2001
From: linke li
Date: Sat, 2 Mar 2024 12:42:21 +0800
Subject: [PATCH 07/34] ring-buffer: use READ_ONCE() to read
 cpu_buffer->commit_page in a concurrent environment

In ring_buffer_iter_empty(), cpu_buffer->commit_page is read while other
threads may be changing it, so the time_stamp read on the next line may
come from a different page. Use READ_ONCE() to avoid having to reason
about compiler optimizations now and in the future.

Link: https://lore.kernel.org/linux-trace-kernel/tencent_DFF7D3561A0686B5E8FC079150A02505180A@qq.com

Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Signed-off-by: linke li
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 67d32af48640..788d321036bd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4382,7 +4382,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
 	cpu_buffer = iter->cpu_buffer;
 	reader = cpu_buffer->reader_page;
 	head_page = cpu_buffer->head_page;
-	commit_page = cpu_buffer->commit_page;
+	commit_page = READ_ONCE(cpu_buffer->commit_page);
 	commit_ts = commit_page->page->time_stamp;
 
 	/*

From b70f2938242a028f8e9473781ede175486a59dc8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Fri, 15 Mar 2024 06:31:15 -0400
Subject: [PATCH 08/34] ring-buffer: Make wake once of ring_buffer_wait() more
 robust

The default behavior of ring_buffer_wait() when passed a NULL "cond"
parameter is to exit the function the first time it is woken up.

The current implementation uses a counter that starts at zero and exits
the wait_event_interruptible() once the counter is greater than one. But
this relies on the internal workings of wait_event_interruptible(), as
that code basically has:

	if (cond)
		return;
	prepare_to_wait();
	if (!cond)
		schedule();
	finish_wait();

That is, cond is called twice before it sleeps.
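
For illustration, tracing the old default callback through that sequence
(reconstructed from the rb_wait_once() lines removed in the diff below,
where a counter "once" starts at 0 and the callback returns true only
when once > 1):

	/* 1st call, wait_event()'s initial check: once 0 -> 1, returns false */
	/* 2nd call, just before schedule():       once 1 -> 2, returns false */
	/* 3rd call, after a wakeup:               once == 2 > 1, returns true */

so the "wake once" behavior only holds if the condition really is
evaluated exactly twice before sleeping.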
The default cond of ring_buffer_wait() needs to account for that and wait for its counter to increment twice before exiting. Instead, use the seq/atomic_inc logic that is used by the tracing code that calls this function. Add an atomic_t seq to rb_irq_work and when cond is NULL, have the default callback take a descriptor as its data that holds the rbwork and the value of the seq when it started. The wakeups will now increment the rbwork->seq and the cond callback will simply check if that number is different, and no longer have to rely on the implementation of wait_event_interruptible(). Link: https://lore.kernel.org/linux-trace-kernel/20240315063115.6cb5d205@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 7af9ded0c2ca ("ring-buffer: Use wait_event_interruptible() in ring_buffer_wait()") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 788d321036bd..25476ead681b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -384,6 +384,7 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; + atomic_t seq; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -753,6 +754,9 @@ static void rb_wake_up_waiters(struct irq_work *work) { struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + /* For waiters waiting for the first wake up */ + (void)atomic_fetch_inc_release(&rbwork->seq); + wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { /* Only cpu_buffer sets the above flags */ @@ -881,20 +885,21 @@ rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, return false; } +struct rb_wait_data { + struct rb_irq_work *irq_work; + int seq; +}; + /* * The default wait condition for ring_buffer_wait() is to just to exit the * wait loop the first time it is woken up. */ static bool rb_wait_once(void *data) { - long *once = data; + struct rb_wait_data *rdata = data; + struct rb_irq_work *rbwork = rdata->irq_work; - /* wait_event() actually calls this twice before scheduling*/ - if (*once > 1) - return true; - - (*once)++; - return false; + return atomic_read_acquire(&rbwork->seq) != rdata->seq; } /** @@ -915,14 +920,9 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, struct ring_buffer_per_cpu *cpu_buffer; struct wait_queue_head *waitq; struct rb_irq_work *rbwork; - long once = 0; + struct rb_wait_data rdata; int ret = 0; - if (!cond) { - cond = rb_wait_once; - data = &once; - } - /* * Depending on what the caller is waiting for, either any * data in any cpu buffer, or a specific buffer, put the @@ -944,6 +944,14 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, else waitq = &rbwork->waiters; + /* Set up to exit loop as soon as it is woken */ + if (!cond) { + cond = rb_wait_once; + rdata.irq_work = rbwork; + rdata.seq = atomic_read_acquire(&rbwork->seq); + data = &rdata; + } + ret = wait_event_interruptible((*waitq), rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); From 180e4e390978af9d0cc060e87920c462276453b9 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 20 Feb 2024 20:23:07 +0000 Subject: [PATCH 09/34] tracing: Add snapshot refcount When a ring-buffer is memory mapped by user-space, no trace or ring-buffer swap is possible. 
This means the snapshot feature is mutually exclusive with the memory mapping. Having a refcount on snapshot users will help to know if a mapping is possible or not. Instead of relying on the global trace_types_lock, a new spinlock is introduced to serialize accesses to trace_array->snapshot. This intends to allow access to that variable in a context where the mmap lock is already held. Link: https://lore.kernel.org/linux-trace-kernel/20240220202310.2489614-4-vdonnefort@google.com Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 99 ++++++++++++++++++++++++----- kernel/trace/trace.h | 8 ++- kernel/trace/trace_events_trigger.c | 58 ++++++++++++----- 3 files changed, 129 insertions(+), 36 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1e5f80cda39a..c66ca2f6944e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1300,6 +1300,50 @@ static void free_snapshot(struct trace_array *tr) tr->allocated_snapshot = false; } +static int tracing_arm_snapshot_locked(struct trace_array *tr) +{ + int ret; + + lockdep_assert_held(&trace_types_lock); + + spin_lock(&tr->snapshot_trigger_lock); + if (tr->snapshot == UINT_MAX) { + spin_unlock(&tr->snapshot_trigger_lock); + return -EBUSY; + } + + tr->snapshot++; + spin_unlock(&tr->snapshot_trigger_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) { + spin_lock(&tr->snapshot_trigger_lock); + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); + } + + return ret; +} + +int tracing_arm_snapshot(struct trace_array *tr) +{ + int ret; + + mutex_lock(&trace_types_lock); + ret = tracing_arm_snapshot_locked(tr); + mutex_unlock(&trace_types_lock); + + return ret; +} + +void tracing_disarm_snapshot(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->snapshot)) + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); +} + /** * tracing_alloc_snapshot - allocate snapshot buffer. 
* @@ -1373,10 +1417,6 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, mutex_lock(&trace_types_lock); - ret = tracing_alloc_snapshot_instance(tr); - if (ret) - goto fail_unlock; - if (tr->current_trace->use_max_tr) { ret = -EBUSY; goto fail_unlock; @@ -1395,6 +1435,10 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, goto fail_unlock; } + ret = tracing_arm_snapshot_locked(tr); + if (ret) + goto fail_unlock; + local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = cond_snapshot; @@ -1439,6 +1483,8 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) arch_spin_unlock(&tr->max_lock); local_irq_enable(); + tracing_disarm_snapshot(tr); + return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); @@ -1481,6 +1527,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #define free_snapshot(tr) do { } while (0) +#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -6112,11 +6159,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) */ synchronize_rcu(); free_snapshot(tr); + tracing_disarm_snapshot(tr); } - if (t->use_max_tr && !tr->allocated_snapshot) { - ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) + if (t->use_max_tr) { + ret = tracing_arm_snapshot_locked(tr); + if (ret) goto out; } #else @@ -6125,8 +6173,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t->init) { ret = tracer_init(t, tr); - if (ret) + if (ret) { +#ifdef CONFIG_TRACER_MAX_TRACE + if (t->use_max_tr) + tracing_disarm_snapshot(tr); +#endif goto out; + } } tr->current_trace = t; @@ -7228,10 +7281,11 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, iter->cpu_file); - else - ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) + + ret = tracing_arm_snapshot_locked(tr); + if (ret) break; + /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { local_irq_disable(); @@ -7241,6 +7295,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, (void *)tr, 1); } + tracing_disarm_snapshot(tr); break; default: if (tr->allocated_snapshot) { @@ -8372,8 +8427,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - if (glob[0] == '!') - return unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (glob[0] == '!') { + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (!ret) + tracing_disarm_snapshot(tr); + + return ret; + } if (!param) goto out_reg; @@ -8392,12 +8452,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, return ret; out_reg: - ret = tracing_alloc_snapshot_instance(tr); + ret = tracing_arm_snapshot(tr); if (ret < 0) goto out; ret = register_ftrace_function_probe(glob, tr, ops, count); - + if (ret < 0) + tracing_disarm_snapshot(tr); out: return ret < 0 ? 
ret : 0; } @@ -9204,7 +9265,9 @@ trace_array_create_systems(const char *name, const char *systems) raw_spin_lock_init(&tr->start_lock); tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&tr->snapshot_trigger_lock); +#endif tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); @@ -10174,7 +10237,9 @@ __init static int tracer_alloc_buffers(void) global_trace.current_trace = &nop_trace; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&global_trace.snapshot_trigger_lock); +#endif ftrace_init_global_array_ops(&global_trace); init_trace_flags_index(&global_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e4f0714d7a49..64450615ca0c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,8 +334,8 @@ struct trace_array { */ struct array_buffer max_buffer; bool allocated_snapshot; -#endif -#ifdef CONFIG_TRACER_MAX_TRACE + spinlock_t snapshot_trigger_lock; + unsigned int snapshot; unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -1983,12 +1983,16 @@ static inline void trace_event_eval_update(struct trace_eval_map **map, int len) #ifdef CONFIG_TRACER_SNAPSHOT void tracing_snapshot_instance(struct trace_array *tr); int tracing_alloc_snapshot_instance(struct trace_array *tr); +int tracing_arm_snapshot(struct trace_array *tr); +void tracing_disarm_snapshot(struct trace_array *tr); #else static inline void tracing_snapshot_instance(struct trace_array *tr) { } static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) { return 0; } +static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; } +static inline void tracing_disarm_snapshot(struct trace_array *tr) { } #endif #ifdef CONFIG_PREEMPT_TRACER diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b33c3861fbbb..62e4f58b8671 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -597,20 +597,12 @@ out: return ret; } -/** - * unregister_trigger - Generic event_command @unreg implementation - * @glob: The raw string used to register the trigger - * @test: Trigger-specific data used to find the trigger to remove - * @file: The trace_event_file associated with the event - * - * Common implementation for event trigger unregistration. - * - * Usually used directly as the @unreg method in event command - * implementations. +/* + * True if the trigger was found and unregistered, else false. */ -static void unregister_trigger(char *glob, - struct event_trigger_data *test, - struct trace_event_file *file) +static bool try_unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data = NULL, *iter; @@ -626,8 +618,32 @@ static void unregister_trigger(char *glob, } } - if (data && data->ops->free) - data->ops->free(data); + if (data) { + if (data->ops->free) + data->ops->free(data); + + return true; + } + + return false; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The trace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. 
+ */ +static void unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) +{ + try_unregister_trigger(glob, test, file); } /* @@ -1470,7 +1486,7 @@ register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { - int ret = tracing_alloc_snapshot_instance(file->tr); + int ret = tracing_arm_snapshot(file->tr); if (ret < 0) return ret; @@ -1478,6 +1494,14 @@ register_snapshot_trigger(char *glob, return register_trigger(glob, data, file); } +static void unregister_snapshot_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + if (try_unregister_trigger(glob, data, file)) + tracing_disarm_snapshot(file->tr); +} + static int snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) { @@ -1510,7 +1534,7 @@ static struct event_command trigger_snapshot_cmd = { .trigger_type = ETT_SNAPSHOT, .parse = event_trigger_parse, .reg = register_snapshot_trigger, - .unreg = unregister_trigger, + .unreg = unregister_snapshot_trigger, .get_trigger_ops = snapshot_get_trigger_ops, .set_filter = set_trigger_filter, }; From 1e953de9e9b4ca77a9ce0fc17a0778eba3a4ca64 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 22 Feb 2024 00:18:04 +0000 Subject: [PATCH 10/34] tracing/user_events: Prepare find/delete for same name events The current code for finding and deleting events assumes that there will never be cases when user_events are registered with the same name, but different formats. Scenarios exist where programs want to use the same name but have different formats. An example is multiple versions of a program running side-by-side using the same event name, but with updated formats in each version. This change does not yet allow for multi-format events. If user_events are registered with the same name but different arguments the programs see the same return values as before. This change simply makes it possible to easily accommodate for this. Update find_user_event() to take in argument parameters and register flags to accommodate future multi-format event scenarios. Have find validate argument matching and return error pointers to cover when an existing event has the same name but different format. Update callers to handle error pointer logic. Move delete_user_event() to use hash walking directly now that find_user_event() has changed. Delete all events found that match the register name, stop if an error occurs and report back to the user. Update user_fields_match() to cover list_empty() scenarios now that find_user_event() uses it directly. This makes the logic consistent across several callsites. 
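To make the scenario concrete, here is a minimal user-space sketch of the
collision this change prepares for. It is modeled on the
tools/testing/selftests/user_events helpers; the reg() helper and the
error handling are illustrative, not part of this patch. Two registrations
share the name "test" but carry different formats, and with the current
single-format behavior the second one still fails with -EADDRINUSE:

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/user_events.h>

  static int reg(int fd, const char *name_args, int bit, int *enabled)
  {
  	struct user_reg reg = {0};

  	reg.size = sizeof(reg);
  	reg.name_args = (__u64)name_args;
  	reg.enable_bit = bit;
  	reg.enable_addr = (__u64)enabled;
  	reg.enable_size = sizeof(*enabled);

  	return ioctl(fd, DIAG_IOCSREG, &reg);
  }

  int main(void)
  {
  	int enabled = 0;
  	int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);

  	if (fd < 0)
  		return 1;

  	/* First registration claims the name "test" */
  	if (reg(fd, "test u32 value", 0, &enabled) < 0)
  		perror("first register");

  	/* Same name, different format: still fails with EADDRINUSE */
  	if (reg(fd, "test u64 value", 1, &enabled) < 0)
  		perror("second register");

  	return 0;
  }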
Link: https://lore.kernel.org/linux-trace-kernel/20240222001807.1463-2-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 107 +++++++++++++++++-------------- 1 file changed, 59 insertions(+), 48 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index e76f5e1efdf2..fce5ed5fec50 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); static int destroy_user_event(struct user_event *user); +static bool user_fields_match(struct user_event *user, int argc, + const char **argv); static u32 user_event_key(char *name) { @@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event *user) } static struct user_event *find_user_event(struct user_event_group *group, - char *name, u32 *outkey) + char *name, int argc, const char **argv, + u32 flags, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) + hash_for_each_possible(group->register_table, user, node, key) { + if (strcmp(EVENT_NAME(user), name)) + continue; + + if (user_fields_match(user, argc, argv)) return user_event_get(user); + return ERR_PTR(-EADDRINUSE); + } + return NULL; } @@ -1860,6 +1869,9 @@ static bool user_fields_match(struct user_event *user, int argc, struct list_head *head = &user->fields; int i = 0; + if (argc == 0) + return list_empty(head); + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, &i)) return false; @@ -1880,10 +1892,8 @@ static bool user_event_match(const char *system, const char *event, match = strcmp(EVENT_NAME(user), event) == 0 && (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); - if (match && argc > 0) + if (match) match = user_fields_match(user, argc, argv); - else if (match && argc == 0) - match = list_empty(&user->fields); return match; } @@ -1922,11 +1932,11 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser, int reg_flags) { + struct user_event *user; + char **argv = NULL; + int argc = 0; int ret; u32 key; - struct user_event *user; - int argc = 0; - char **argv; /* Currently don't support any text based flags */ if (flags != NULL) @@ -1935,41 +1945,34 @@ static int user_event_parse(struct user_event_group *group, char *name, if (!user_event_capable(reg_flags)) return -EPERM; + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + + if (!argv) + return -ENOMEM; + } + /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); - user = find_user_event(group, name, &key); + user = find_user_event(group, name, argc, (const char **)argv, + reg_flags, &key); mutex_unlock(&event_mutex); + if (argv) + argv_free(argv); + + if (IS_ERR(user)) + return PTR_ERR(user); + if (user) { - if (args) { - argv = argv_split(GFP_KERNEL, args, &argc); - if (!argv) { - ret = -ENOMEM; - goto error; - } - - ret = user_fields_match(user, argc, (const char **)argv); - argv_free(argv); - - } else - ret = list_empty(&user->fields); - - if (ret) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. 
- * Caller only worries about failure cases for freeing. - */ - kfree(name); - } else { - ret = -EADDRINUSE; - goto error; - } + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. + */ + kfree(name); return 0; -error: - user_event_put(user, false); - return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); @@ -2052,25 +2055,33 @@ put_user: } /* - * Deletes a previously created event if it is no longer being used. + * Deletes previously created events if they are no longer being used. */ static int delete_user_event(struct user_event_group *group, char *name) { - u32 key; - struct user_event *user = find_user_event(group, name, &key); + struct user_event *user; + struct hlist_node *tmp; + u32 key = user_event_key(name); + int ret = -ENOENT; - if (!user) - return -ENOENT; + /* Attempt to delete all event(s) with the name passed in */ + hash_for_each_possible_safe(group->register_table, user, tmp, node, key) { + if (strcmp(EVENT_NAME(user), name)) + continue; - user_event_put(user, true); + if (!user_event_last_ref(user)) + return -EBUSY; - if (!user_event_last_ref(user)) - return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; - if (!user_event_capable(user->reg_flags)) - return -EPERM; + ret = destroy_user_event(user); - return destroy_user_event(user); + if (ret) + goto out; + } +out: + return ret; } /* From 64805e4039f1687b9857034123a9ec10bb9abddd Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 22 Feb 2024 00:18:05 +0000 Subject: [PATCH 11/34] tracing/user_events: Introduce multi-format events Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicious or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Change system name of the multi-format event tracepoint to ensure that multi-format events are isolated completely from single-format events. This prevents single-format names from conflicting with multi-format events if they end with the same suffix as the multi-format events. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. 
This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}.{unique_id}. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test.0 test.1 Both would be under the new user_events_multi system name to prevent the older ABI from being used to squat on multi-formatted events and block their use. Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Link: https://lore.kernel.org/linux-trace-kernel/20240222001807.1463-3-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/uapi/linux/user_events.h | 6 +- kernel/trace/trace_events_user.c | 102 +++++++++++++++++++++++++++---- 2 files changed, 95 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index f74f3aedd49c..a03de03dccbc 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -12,6 +12,7 @@ #include #define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_MULTI_SYSTEM "user_events_multi" #define USER_EVENTS_PREFIX "u:" /* Create dynamic location entry within a 32-bit value */ @@ -22,8 +23,11 @@ enum user_reg_flag { /* Event will not delete upon last reference closing */ USER_EVENT_REG_PERSIST = 1U << 0, + /* Event will be allowed to have multiple formats */ + USER_EVENT_REG_MULTI_FORMAT = 1U << 1, + /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, + USER_EVENT_REG_MAX = 1U << 2, }; /* diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index fce5ed5fec50..70d428c394b6 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -34,7 +34,8 @@ /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 -#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define EVENT_NAME(user_event) ((user_event)->reg_name) +#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 /* @@ -54,10 +55,13 @@ * allows isolation for events by various means. 
*/ struct user_event_group { - char *system_name; - struct hlist_node node; - struct mutex reg_mutex; + char *system_name; + char *system_multi_name; + struct hlist_node node; + struct mutex reg_mutex; DECLARE_HASHTABLE(register_table, 8); + /* ID that moves forward within the group for multi-event names */ + u64 multi_id; }; /* Group for init_user_ns mapping, top-most group */ @@ -78,6 +82,7 @@ static unsigned int current_user_events; */ struct user_event { struct user_event_group *group; + char *reg_name; struct tracepoint tracepoint; struct trace_event_call call; struct trace_event_class class; @@ -127,6 +132,8 @@ struct user_event_enabler { #define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) +#define EVENT_MULTI_FORMAT(f) ((f) & USER_EVENT_REG_MULTI_FORMAT) + /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { struct work_struct work; @@ -330,6 +337,7 @@ out: static void user_event_group_destroy(struct user_event_group *group) { kfree(group->system_name); + kfree(group->system_multi_name); kfree(group); } @@ -348,6 +356,11 @@ static char *user_event_group_system_name(void) return system_name; } +static char *user_event_group_system_multi_name(void) +{ + return kstrdup(USER_EVENTS_MULTI_SYSTEM, GFP_KERNEL); +} + static struct user_event_group *current_user_event_group(void) { return init_group; @@ -367,6 +380,11 @@ static struct user_event_group *user_event_group_create(void) if (!group->system_name) goto error; + group->system_multi_name = user_event_group_system_multi_name(); + + if (!group->system_multi_name) + goto error; + mutex_init(&group->reg_mutex); hash_init(group->register_table); @@ -1482,6 +1500,11 @@ static int destroy_user_event(struct user_event *user) hash_del(&user->node); user_event_destroy_validators(user); + + /* If we have different names, both must be freed */ + if (EVENT_NAME(user) != EVENT_TP_NAME(user)) + kfree(EVENT_TP_NAME(user)); + kfree(user->call.print_fmt); kfree(EVENT_NAME(user)); kfree(user); @@ -1504,12 +1527,24 @@ static struct user_event *find_user_event(struct user_event_group *group, *outkey = key; hash_for_each_possible(group->register_table, user, node, key) { + /* + * Single-format events shouldn't return multi-format + * events. Callers expect the underlying tracepoint to match + * the name exactly in these cases. Only check like-formats. 
+ */ + if (EVENT_MULTI_FORMAT(flags) != EVENT_MULTI_FORMAT(user->reg_flags)) + continue; + if (strcmp(EVENT_NAME(user), name)) continue; if (user_fields_match(user, argc, argv)) return user_event_get(user); + /* Scan others if this is a multi-format event */ + if (EVENT_MULTI_FORMAT(flags)) + continue; + return ERR_PTR(-EADDRINUSE); } @@ -1889,8 +1924,12 @@ static bool user_event_match(const char *system, const char *event, struct user_event *user = container_of(ev, struct user_event, devent); bool match; - match = strcmp(EVENT_NAME(user), event) == 0 && - (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); + match = strcmp(EVENT_NAME(user), event) == 0; + + if (match && system) { + match = strcmp(system, user->group->system_name) == 0 || + strcmp(system, user->group->system_multi_name) == 0; + } if (match) match = user_fields_match(user, argc, argv); @@ -1923,6 +1962,33 @@ static int user_event_trace_register(struct user_event *user) return ret; } +static int user_event_set_tp_name(struct user_event *user) +{ + lockdep_assert_held(&user->group->reg_mutex); + + if (EVENT_MULTI_FORMAT(user->reg_flags)) { + char *multi_name; + + multi_name = kasprintf(GFP_KERNEL_ACCOUNT, "%s.%llx", + user->reg_name, user->group->multi_id); + + if (!multi_name) + return -ENOMEM; + + user->call.name = multi_name; + user->tracepoint.name = multi_name; + + /* Inc to ensure unique multi-event name next time */ + user->group->multi_id++; + } else { + /* Non Multi-format uses register name */ + user->call.name = user->reg_name; + user->tracepoint.name = user->reg_name; + } + + return 0; +} + /* * Parses the event name, arguments and flags then registers if successful. * The name buffer lifetime is owned by this method for success cases only. @@ -1985,7 +2051,13 @@ static int user_event_parse(struct user_event_group *group, char *name, INIT_LIST_HEAD(&user->validators); user->group = group; - user->tracepoint.name = name; + user->reg_name = name; + user->reg_flags = reg_flags; + + ret = user_event_set_tp_name(user); + + if (ret) + goto put_user; ret = user_event_parse_fields(user, args); @@ -1999,11 +2071,14 @@ static int user_event_parse(struct user_event_group *group, char *name, user->call.data = user; user->call.class = &user->class; - user->call.name = name; user->call.flags = TRACE_EVENT_FL_TRACEPOINT; user->call.tp = &user->tracepoint; user->call.event.funcs = &user_event_funcs; - user->class.system = group->system_name; + + if (EVENT_MULTI_FORMAT(user->reg_flags)) + user->class.system = group->system_multi_name; + else + user->class.system = group->system_name; user->class.fields_array = user_event_fields_array; user->class.get_fields = user_event_get_fields; @@ -2025,8 +2100,6 @@ static int user_event_parse(struct user_event_group *group, char *name, if (ret) goto put_user_lock; - user->reg_flags = reg_flags; - if (user->reg_flags & USER_EVENT_REG_PERSIST) { /* Ensure we track self ref and caller ref (2) */ refcount_set(&user->refcnt, 2); @@ -2050,6 +2123,11 @@ put_user: user_event_destroy_fields(user); user_event_destroy_validators(user); kfree(user->call.print_fmt); + + /* Caller frees reg_name on error, but not multi-name */ + if (EVENT_NAME(user) != EVENT_TP_NAME(user)) + kfree(EVENT_TP_NAME(user)); + kfree(user); return ret; } @@ -2639,7 +2717,7 @@ static int user_seq_show(struct seq_file *m, void *p) hash_for_each(group->register_table, i, user, node) { status = user->status; - seq_printf(m, "%s", EVENT_NAME(user)); + seq_printf(m, "%s", EVENT_TP_NAME(user)); if (status != 0) seq_puts(m, " #"); From 
bcb7bdcc17e0e8d79b19748d912ae6991f087345 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 22 Feb 2024 00:18:06 +0000 Subject: [PATCH 12/34] selftests/user_events: Test multi-format events User_events now has multi-format events which allow for the same register name, but with different formats. When this occurs, different tracepoints are created with unique names. Add a new test that ensures the same name can be used for two different formats. Ensure they are isolated from each other and that name and arg matching still works if yet another register comes in with the same format as one of the two. Link: https://lore.kernel.org/linux-trace-kernel/20240222001807.1463-4-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- .../testing/selftests/user_events/abi_test.c | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index cef1ff1af223..7288a05136ba 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include "../kselftest_harness.h" @@ -23,6 +25,62 @@ const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable"; +const char *multi_dir_glob = "/sys/kernel/tracing/events/user_events_multi/__abi_event.*"; + +static int wait_for_delete(char *dir) +{ + struct stat buf; + int i; + + for (i = 0; i < 10000; ++i) { + if (stat(dir, &buf) == -1 && errno == ENOENT) + return 0; + + usleep(1000); + } + + return -1; +} + +static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len) +{ + char path[256]; + glob_t buf; + int i, ret; + + ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, &buf); + + if (ret) + return -1; + + ret = -1; + + for (i = 0; i < buf.gl_pathc; ++i) { + FILE *fp; + + snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]); + fp = fopen(path, "r"); + + if (!fp) + continue; + + while (fgets(path, sizeof(path), fp) != NULL) { + if (strstr(path, unique_field)) { + fclose(fp); + /* strscpy is not available, use snprintf */ + snprintf(out_dir, dir_len, "%s", buf.gl_pathv[i]); + ret = 0; + goto out; + } + } + + fclose(fp); + } +out: + globfree(&buf); + + return ret; +} static bool event_exists(void) { @@ -74,6 +132,39 @@ static int event_delete(void) return ret; } +static int reg_enable_multi(void *enable, int size, int bit, int flags, + char *args) +{ + struct user_reg reg = {0}; + char full_args[512] = {0}; + int fd = open(data_file, O_RDWR); + int len; + int ret; + + if (fd < 0) + return -1; + + len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args); + + if (len > sizeof(full_args)) { + ret = -E2BIG; + goto out; + } + + reg.size = sizeof(reg); + reg.name_args = (__u64)full_args; + reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags; + reg.enable_bit = bit; + reg.enable_addr = (__u64)enable; + reg.enable_size = size; + + ret = ioctl(fd, DIAG_IOCSREG, ®); +out: + close(fd); + + return ret; +} + static int reg_enable_flags(void *enable, int size, int bit, int flags) { struct user_reg reg = {0}; @@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) { ASSERT_NE(0, reg_enable(&self->check, 128, 0)); } +TEST_F(user, multi_format) { + char first_dir[256]; + char second_dir[256]; + struct stat buf; + + /* Multiple formats for the same name should work */ + ASSERT_EQ(0, 
reg_enable_multi(&self->check, sizeof(int), 0, + 0, "u32 multi_first")); + + ASSERT_EQ(0, reg_enable_multi(&self->check, sizeof(int), 1, + 0, "u64 multi_second")); + + /* Same name with same format should also work */ + ASSERT_EQ(0, reg_enable_multi(&self->check, sizeof(int), 2, + 0, "u64 multi_second")); + + ASSERT_EQ(0, find_multi_event_dir("multi_first", + first_dir, sizeof(first_dir))); + + ASSERT_EQ(0, find_multi_event_dir("multi_second", + second_dir, sizeof(second_dir))); + + /* Should not be found in the same dir */ + ASSERT_NE(0, strcmp(first_dir, second_dir)); + + /* First dir should still exist */ + ASSERT_EQ(0, stat(first_dir, &buf)); + + /* Disabling first register should remove first dir */ + ASSERT_EQ(0, reg_disable(&self->check, 0)); + ASSERT_EQ(0, wait_for_delete(first_dir)); + + /* Second dir should still exist */ + ASSERT_EQ(0, stat(second_dir, &buf)); + + /* Disabling second register should remove second dir */ + ASSERT_EQ(0, reg_disable(&self->check, 1)); + /* Ensure bit 1 and 2 are tied together, should not delete yet */ + ASSERT_EQ(0, stat(second_dir, &buf)); + ASSERT_EQ(0, reg_disable(&self->check, 2)); + ASSERT_EQ(0, wait_for_delete(second_dir)); +} + TEST_F(user, forks) { int i; From 3727db1c09b44c5bdcf2497a538da24beee16a3a Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 22 Feb 2024 00:18:07 +0000 Subject: [PATCH 13/34] tracing/user_events: Document multi-format flag User programs can now ask user_events to handle the synchronization of multiple different formats for an event with the same name via the new USER_EVENT_REG_MULTI_FORMAT flag. Add a section for USER_EVENT_REG_MULTI_FORMAT that explains the intended purpose and caveats of using it. Explain how deletion works in these cases and how to use /sys/kernel/tracing/dynamic_events for per-version deletion. Link: https://lore.kernel.org/linux-trace-kernel/20240222001807.1463-5-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/user_events.rst | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index d8f12442aaa6..1d5a7626e6a6 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -92,6 +92,24 @@ The following flags are currently supported. process closes or unregisters the event. Requires CAP_PERFMON otherwise -EPERM is returned. ++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This + allows programs to prevent themselves from being blocked when their event + format changes and they wish to use the same name. When this flag is used the + tracepoint name will be in the new format of "name.unique_id" vs the older + format of "name". A tracepoint will be created for each unique pair of name + and format. This means if several processes use the same name and format, + they will use the same tracepoint. If yet another process uses the same name, + but a different format than the other processes, it will use a different + tracepoint with a new unique id. Recording programs need to scan tracefs for + the various different formats of the event name they are interested in + recording. The system name of the tracepoint will also use "user_events_multi" + instead of "user_events". This prevents single-format event names conflicting + with any multi-format event names within tracefs. The unique_id is output as + a hex string. 
Recording programs should ensure the tracepoint name starts with + the event name they registered and has a suffix that starts with . and only + has hex characters. For example to find all versions of the event "test" you + can use the regex "^test\.[0-9a-fA-F]+$". + Upon successful registration the following is set. + write_index: The index to use for this file descriptor that represents this @@ -106,6 +124,9 @@ or perf record -e user_events:[name] when attaching/recording. **NOTE:** The event subsystem name by default is "user_events". Callers should not assume it will always be "user_events". Operators reserve the right in the future to change the subsystem name per-process to accommodate event isolation. +In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name +will have a unique id appended to it and the system name will be +"user_events_multi" as described above. Command Format ^^^^^^^^^^^^^^ @@ -156,7 +177,11 @@ to request deletes than the one used for registration due to this. to the event. If programs do not want auto-delete, they must use the USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an -event that persists requires CAP_PERFMON, otherwise -EPERM is returned. +event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When +there are multiple formats of the same event name, all events with the same +name will be attempted to be deleted. If only a specific version is wanted to +be deleted then the /sys/kernel/tracing/dynamic_events file should be used for +that specific format of the event. Unregistering ------------- From ed89683763a12e8289cce6f233dd07b4eb42fb96 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 22 Feb 2024 12:46:39 +0000 Subject: [PATCH 14/34] tracing: Use init_utsname()->release Instead of using UTS_RELEASE, use init_utsname()->release, which means that we don't need to rebuild the code just for the git head commit changing. Link: https://lore.kernel.org/linux-trace-kernel/20240222124639.65629-1-john.g.garry@oracle.com Signed-off-by: John Garry Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c66ca2f6944e..a7c0dc0aaf14 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -13,7 +13,7 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include -#include +#include #include #include #include @@ -4148,7 +4148,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", - name, UTS_RELEASE); + name, init_utsname()->release); seq_puts(m, "# -----------------------------------" "---------------------------------\n"); seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" From 9388a2aa453321bcf1ad2603959debea9e6ab6d4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 22 Feb 2024 12:28:28 -0500 Subject: [PATCH 15/34] NFSD: Fix nfsd_clid_class use of __string_len() macro I'm working on restructuring the __string* macros so that it doesn't need to recalculate the string twice. That is, it will save it off when processing __string() and the __assign_str() will not need to do the work again as it currently does. 
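As a schematic illustration of the pattern the rework depends on (this is
the general shape, not the nfsd diff itself): the source pointer is
captured when __string_len() runs, and __assign_str_len() later copies
from that saved pointer, so the second argument must name real data:

  TP_STRUCT__entry(
  	__string_len(name, clp->cl_name.data, clp->cl_name.len)
  ),
  TP_fast_assign(
  	/* copies from the pointer saved by __string_len() above */
  	__assign_str_len(name, clp->cl_name.data, clp->cl_name.len);
  )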
Currently __string_len(item, src, len) doesn't actually use "src", but my
changes will require src to be correct, as that is where __assign_str()
will get its value from.

The event class nfsd_clid_class has:

  __string_len(name, name, clp->cl_name.len)

But the second "name" does not exist and causes my changes to fail to
build. That second parameter should be: clp->cl_name.data.

Link: https://lore.kernel.org/linux-trace-kernel/20240222122828.3d8d213c@gandalf.local.home

Cc: Neil Brown
Cc: Olga Kornievskaia
Cc: Dai Ngo
Cc: Tom Talpey
Cc: stable@vger.kernel.org
Fixes: d27b74a8675ca ("NFSD: Use new __string_len C macros for nfsd_clid_class")
Acked-by: Chuck Lever
Acked-by: Jeff Layton
Signed-off-by: Steven Rostedt (Google)
---
 fs/nfsd/trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index e545e92c4408..288a796b2878 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -896,7 +896,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
 		__array(unsigned char, addr, sizeof(struct sockaddr_in6))
 		__field(unsigned long, flavor)
 		__array(unsigned char, verifier, NFS4_VERIFIER_SIZE)
-		__string_len(name, name, clp->cl_name.len)
+		__string_len(name, clp->cl_name.data, clp->cl_name.len)
 	),
 	TP_fast_assign(
 		__entry->cl_boot = clp->cl_clientid.cl_boot;

From 0df4c388a1e310400a6e90fb10b286e2673756f0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Thu, 22 Feb 2024 13:30:57 -0500
Subject: [PATCH 16/34] drm/i915: Add missing ; to __assign_str() macros in tracepoint code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I'm working on improving the __assign_str() and __string() macros to be
more efficient, and removed some unneeded semicolons. This triggered a
bug in the build, as some of the __assign_str() macros in
intel_display_trace were missing a terminating semicolon.
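A toy example, outside the kernel macros, shows why the missing
semicolons went unnoticed: when a macro expansion already ends in ';', a
call site that omits its own terminator still compiles, and stripping the
trailing ';' from the macro exposes it:

  #include <string.h>

  /* old style: the expansion supplies the ';', hiding sloppy call sites */
  #define ASSIGN_OLD(dst, src) strcpy(dst, src);
  /* new style: the call site must supply the ';' itself */
  #define ASSIGN_NEW(dst, src) strcpy(dst, src)

  void demo(char *dst, const char *src)
  {
  	ASSIGN_OLD(dst, src)	/* compiled before, via the hidden ';' */
  	ASSIGN_NEW(dst, src);	/* after the cleanup, this ';' is required */
  }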
Link: https://lore.kernel.org/linux-trace-kernel/20240222133057.2af72a19@gandalf.local.home

Cc: Daniel Vetter
Cc: David Airlie
Cc: stable@vger.kernel.org
Fixes: 2ceea5d88048b ("drm/i915: Print plane name in fbc tracepoints")
Reviewed-by: Ville Syrjälä
Acked-by: Rodrigo Vivi
Signed-off-by: Steven Rostedt (Google)
---
 drivers/gpu/drm/i915/display/intel_display_trace.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_display_trace.h b/drivers/gpu/drm/i915/display/intel_display_trace.h
index 99bdb833591c..7862e7cefe02 100644
--- a/drivers/gpu/drm/i915/display/intel_display_trace.h
+++ b/drivers/gpu/drm/i915/display/intel_display_trace.h
@@ -411,7 +411,7 @@ TRACE_EVENT(intel_fbc_activate,
 		struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev), plane->pipe);
 		__assign_str(dev, __dev_name_kms(plane));
-		__assign_str(name, plane->base.name)
+		__assign_str(name, plane->base.name);
 		__entry->pipe = crtc->pipe;
 		__entry->frame = intel_crtc_get_vblank_counter(crtc);
 		__entry->scanline = intel_get_crtc_scanline(crtc);
@@ -438,7 +438,7 @@ TRACE_EVENT(intel_fbc_deactivate,
 		struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev), plane->pipe);
 		__assign_str(dev, __dev_name_kms(plane));
-		__assign_str(name, plane->base.name)
+		__assign_str(name, plane->base.name);
 		__entry->pipe = crtc->pipe;
 		__entry->frame = intel_crtc_get_vblank_counter(crtc);
 		__entry->scanline = intel_get_crtc_scanline(crtc);
@@ -465,7 +465,7 @@ TRACE_EVENT(intel_fbc_nuke,
 		struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev), plane->pipe);
 		__assign_str(dev, __dev_name_kms(plane));
-		__assign_str(name, plane->base.name)
+		__assign_str(name, plane->base.name);
 		__entry->pipe = crtc->pipe;
 		__entry->frame = intel_crtc_get_vblank_counter(crtc);
 		__entry->scanline = intel_get_crtc_scanline(crtc);

From 3f9952e8d80cca2da3b47ecd5ad9ec16cfd1a649 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Wed, 13 Mar 2024 09:34:54 -0400
Subject: [PATCH 17/34] net: hns3: tracing: fix hclgevf trace event strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The __string() and __assign_str() helper macros of the TRACE_EVENT()
macro are going through some optimizations where only the source string
of __string() will be used and the __assign_str() source will be ignored
and later removed.

To make sure that there are no issues, a new check is added between the
__string() src argument and the __assign_str() src argument that does a
strcmp() to make sure they are the same string.

The hclgevf trace events have:

  __assign_str(devname, &hdev->nic.kinfo.netdev->name);

which triggers the warning:

hclgevf_trace.h:34:39: error: passing argument 1 of ‘strcmp’ from incompatible pointer type [-Werror=incompatible-pointer-types]
   34 |  __assign_str(devname, &hdev->nic.kinfo.netdev->name);
[..]
arch/x86/include/asm/string_64.h:75:24: note: expected ‘const char *’ but argument is of type ‘char (*)[16]’
   75 | int strcmp(const char *cs, const char *ct);
      |            ~~~~~~~~~~~~^~

Because __assign_str() now has:

	WARN_ON_ONCE(__builtin_constant_p(src) ?		\
		     strcmp((src), __data_offsets.dst##_ptr_) :	\
		     (src) != __data_offsets.dst##_ptr_);	\

The problem is the '&' on hdev->nic.kinfo.netdev->name. That's because
that name is:

	char name[IFNAMSIZ]

and passing the address ('&') of a char array is not compatible with
strcmp(). The '&' is not necessary, remove it.
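The type mismatch reproduces outside the kernel in a few lines (struct
netdev_like below is a stand-in for the real netdev field, not kernel
code):

  #include <string.h>

  struct netdev_like {
  	char name[16];			/* like char name[IFNAMSIZ] */
  };

  int demo(const struct netdev_like *dev)
  {
  	/* OK: the array decays to 'const char *' */
  	return strcmp(dev->name, "eth0");
  	/*
  	 * Broken: '&dev->name' has type 'const char (*)[16]', which is
  	 * exactly what the new strcmp() check rejects:
  	 *
  	 *	return strcmp(&dev->name, "eth0");
  	 */
  }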
Link: https://lore.kernel.org/linux-trace-kernel/20240313093454.3909afe7@gandalf.local.home Cc: netdev Cc: Yisen Zhuang Cc: Salil Mehta Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Yufeng Mo Cc: Huazhong Tan Cc: stable@vger.kernel.org Acked-by: Paolo Abeni Reviewed-by: Jijie Shao Fixes: d8355240cf8fb ("net: hns3: add trace event support for PF/VF mailbox") Signed-off-by: Steven Rostedt (Google) --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h | 8 ++++---- .../net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h index 8510b88d4982..f3cd5a376eca 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h @@ -24,7 +24,7 @@ TRACE_EVENT(hclge_pf_mbx_get, __field(u8, code) __field(u8, subcode) __string(pciname, pci_name(hdev->pdev)) - __string(devname, &hdev->vport[0].nic.kinfo.netdev->name) + __string(devname, hdev->vport[0].nic.kinfo.netdev->name) __array(u32, mbx_data, PF_GET_MBX_LEN) ), @@ -33,7 +33,7 @@ TRACE_EVENT(hclge_pf_mbx_get, __entry->code = req->msg.code; __entry->subcode = req->msg.subcode; __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, &hdev->vport[0].nic.kinfo.netdev->name); + __assign_str(devname, hdev->vport[0].nic.kinfo.netdev->name); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_vf_to_pf_cmd)); ), @@ -56,7 +56,7 @@ TRACE_EVENT(hclge_pf_mbx_send, __field(u8, vfid) __field(u16, code) __string(pciname, pci_name(hdev->pdev)) - __string(devname, &hdev->vport[0].nic.kinfo.netdev->name) + __string(devname, hdev->vport[0].nic.kinfo.netdev->name) __array(u32, mbx_data, PF_SEND_MBX_LEN) ), @@ -64,7 +64,7 @@ TRACE_EVENT(hclge_pf_mbx_send, __entry->vfid = req->dest_vfid; __entry->code = le16_to_cpu(req->msg.code); __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, &hdev->vport[0].nic.kinfo.netdev->name); + __assign_str(devname, hdev->vport[0].nic.kinfo.netdev->name); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_pf_to_vf_cmd)); ), diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h index 5d4895bb57a1..b259e95dd53c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h @@ -23,7 +23,7 @@ TRACE_EVENT(hclge_vf_mbx_get, __field(u8, vfid) __field(u16, code) __string(pciname, pci_name(hdev->pdev)) - __string(devname, &hdev->nic.kinfo.netdev->name) + __string(devname, hdev->nic.kinfo.netdev->name) __array(u32, mbx_data, VF_GET_MBX_LEN) ), @@ -31,7 +31,7 @@ TRACE_EVENT(hclge_vf_mbx_get, __entry->vfid = req->dest_vfid; __entry->code = le16_to_cpu(req->msg.code); __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, &hdev->nic.kinfo.netdev->name); + __assign_str(devname, hdev->nic.kinfo.netdev->name); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_pf_to_vf_cmd)); ), @@ -55,7 +55,7 @@ TRACE_EVENT(hclge_vf_mbx_send, __field(u8, code) __field(u8, subcode) __string(pciname, pci_name(hdev->pdev)) - __string(devname, &hdev->nic.kinfo.netdev->name) + __string(devname, hdev->nic.kinfo.netdev->name) __array(u32, mbx_data, VF_SEND_MBX_LEN) ), @@ -64,7 +64,7 @@ TRACE_EVENT(hclge_vf_mbx_send, __entry->code = req->msg.code; __entry->subcode = req->msg.subcode; __assign_str(pciname, 
pci_name(hdev->pdev)); - __assign_str(devname, &hdev->nic.kinfo.netdev->name); + __assign_str(devname, hdev->nic.kinfo.netdev->name); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_vf_to_pf_cmd)); ), From 6c871260965255a1c142fb77ccee58b172d1690b Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Thu, 14 Mar 2024 13:12:17 -0700 Subject: [PATCH 18/34] cxl/trace: Properly initialize cxl_poison region name The TP_STRUCT__entry that gets assigned the region name, or an empty string if no region is present, is erroneously initialized to the cxl_region pointer. It needs to be properly initialized otherwise it's length is wrong and garbage chars can appear in the kernel trace output: /sys/kernel/tracing/trace The bad initialization was due in part to a naming conflict with the parameter: struct cxl_region *region. The field 'region' is already exposed externally as the region name, so changing that to something logical, like 'region_name' is not an option. Instead rename the internal only struct cxl_region to the commonly used 'cxlr'. Impact is that tooling depending on that trace data can miss picking up a valid event when searching by region name. The TP_printk() output, if enabled, does emit the correct region names in the dmesg log. This was found during testing of the cxl-list option to report media-errors for a region. Cc: Davidlohr Bueso Cc: Jonathan Cameron Cc: Dave Jiang Cc: Vishal Verma Cc: stable@vger.kernel.org Fixes: ddf49d57b841 ("cxl/trace: Add TRACE support for CXL media-error records") Signed-off-by: Alison Schofield Reviewed-by: Ira Weiny Acked-by: Dan Williams Signed-off-by: Steven Rostedt (Google) --- drivers/cxl/core/trace.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index bdf117a33744..e5f13260fc52 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -646,18 +646,18 @@ u64 cxl_trace_hpa(struct cxl_region *cxlr, struct cxl_memdev *memdev, u64 dpa); TRACE_EVENT(cxl_poison, - TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *region, + TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *cxlr, const struct cxl_poison_record *record, u8 flags, __le64 overflow_ts, enum cxl_poison_trace_type trace_type), - TP_ARGS(cxlmd, region, record, flags, overflow_ts, trace_type), + TP_ARGS(cxlmd, cxlr, record, flags, overflow_ts, trace_type), TP_STRUCT__entry( __string(memdev, dev_name(&cxlmd->dev)) __string(host, dev_name(cxlmd->dev.parent)) __field(u64, serial) __field(u8, trace_type) - __string(region, region) + __string(region, cxlr ? 
dev_name(&cxlr->dev) : "") __field(u64, overflow_ts) __field(u64, hpa) __field(u64, dpa) @@ -677,10 +677,10 @@ TRACE_EVENT(cxl_poison, __entry->source = cxl_poison_record_source(record); __entry->trace_type = trace_type; __entry->flags = flags; - if (region) { - __assign_str(region, dev_name(®ion->dev)); - memcpy(__entry->uuid, ®ion->params.uuid, 16); - __entry->hpa = cxl_trace_hpa(region, cxlmd, + if (cxlr) { + __assign_str(region, dev_name(&cxlr->dev)); + memcpy(__entry->uuid, &cxlr->params.uuid, 16); + __entry->hpa = cxl_trace_hpa(cxlr, cxlmd, __entry->dpa); } else { __assign_str(region, ""); From c1fa617caeb005e7e3db60826cff6dddebb0363f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 22 Feb 2024 16:14:16 -0500 Subject: [PATCH 19/34] tracing: Rework __assign_str() and __string() to not duplicate getting the string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TRACE_EVENT() macro handles dynamic strings by having: TP_PROTO(struct some_struct *s), TP_ARGS(s), TP_STRUCT__entry( __string(my_string, s->string) ), TP_fast_assign( __assign_str(my_string, s->string); ) TP_printk("%s", __get_str(my_string)) There's even some code that may call a function helper to find the s->string value. The problem with the above is that the work to get the s->string is done twice. Once at the __string() and again in the __assign_str(). But the __string() uses dynamic_array() which has a helper structure that is created holding the offsets and length of the string fields. Instead of finding the string twice, just save it off in another field from that helper structure, and have __assign_str() use that instead. Note, this also means that the second parameter of __assign_str() isn't even used anymore, and may be removed in the future. Link: https://lore.kernel.org/linux-trace-kernel/20240222211442.634192653@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Ville Syrjälä Cc: Rodrigo Vivi Cc: Chuck Lever Signed-off-by: Steven Rostedt (Google) --- include/trace/stages/stage2_data_offsets.h | 4 ++-- include/trace/stages/stage5_get_offsets.h | 15 ++++++++++----- include/trace/stages/stage6_event_callback.h | 12 ++++++++---- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/include/trace/stages/stage2_data_offsets.h b/include/trace/stages/stage2_data_offsets.h index 469b6a64293d..8b0cff06d346 100644 --- a/include/trace/stages/stage2_data_offsets.h +++ b/include/trace/stages/stage2_data_offsets.h @@ -24,7 +24,7 @@ #define __array(type, item, len) #undef __dynamic_array -#define __dynamic_array(type, item, len) u32 item; +#define __dynamic_array(type, item, len) u32 item; const void *item##_ptr_; #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -45,7 +45,7 @@ #define __sockaddr(field, len) __dynamic_array(u8, field, len) #undef __rel_dynamic_array -#define __rel_dynamic_array(type, item, len) u32 item; +#define __rel_dynamic_array(type, item, len) u32 item; const void *item##_ptr_; #undef __rel_string #define __rel_string(item, src) __rel_dynamic_array(char, item, -1) diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h index e30a13be46ba..45f8151cf622 100644 --- a/include/trace/stages/stage5_get_offsets.h +++ b/include/trace/stages/stage5_get_offsets.h @@ -47,10 +47,12 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, \ - strlen((src) ? (const char *)(src) : "(null)") + 1) + strlen((src) ? 
(const char *)(src) : "(null)") + 1)	\
+	__data_offsets->item##_ptr_ = src;
 
 #undef __string_len
-#define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1)
+#define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1)\
+	__data_offsets->item##_ptr_ = src;
 
 #undef __vstring
 #define __vstring(item, fmt, ap) __dynamic_array(char, item,		\
@@ -67,11 +69,14 @@
 	__data_size += __item_length;
 
 #undef __rel_string
-#define __rel_string(item, src) __rel_dynamic_array(char, item,	\
-		    strlen((src) ? (const char *)(src) : "(null)") + 1)
+#define __rel_string(item, src) __rel_dynamic_array(char, item,	\
+		    strlen((src) ? (const char *)(src) : "(null)") + 1)	\
+	__data_offsets->item##_ptr_ = src;
 
 #undef __rel_string_len
-#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, (len) + 1)
+#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, (len) + 1)\
+	__data_offsets->item##_ptr_ = src;
+
 /*
  * __bitmask_size_in_bytes_raw is the number of bytes needed to hold
  * num_possible_cpus().

diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h
index 919b1a4da980..b3e2f321e787 100644
--- a/include/trace/stages/stage6_event_callback.h
+++ b/include/trace/stages/stage6_event_callback.h
@@ -32,12 +32,14 @@
 
 #undef __assign_str
 #define __assign_str(dst, src)						\
-	strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)");
+	strcpy(__get_str(dst), __data_offsets.dst##_ptr_ ?		\
+	       __data_offsets.dst##_ptr_ : "(null)")
 
 #undef __assign_str_len
 #define __assign_str_len(dst, src, len)					\
 	do {								\
-		memcpy(__get_str(dst), (src), (len));			\
+		memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ?	\
+		       __data_offsets.dst##_ptr_ : "(null)", len);	\
 		__get_str(dst)[len] = '\0';				\
 	} while(0)
 
@@ -92,12 +94,14 @@
 
 #undef __assign_rel_str
 #define __assign_rel_str(dst, src)					\
-	strcpy(__get_rel_str(dst), (src) ? (const char *)(src) : "(null)");
+	strcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ?		\
+	       __data_offsets.dst##_ptr_ : "(null)")
 
 #undef __assign_rel_str_len
 #define __assign_rel_str_len(dst, src, len)				\
 	do {								\
-		memcpy(__get_rel_str(dst), (src), (len));		\
+		memcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ?	\
+		       __data_offsets.dst##_ptr_ : "(null)", len);	\
 		__get_rel_str(dst)[len] = '\0';				\
 	} while (0)

From e8b737bfb16a0d540413173e8d1574e3bf8cc0e9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Thu, 22 Feb 2024 16:14:17 -0500
Subject: [PATCH 20/34] tracing: Do not calculate strlen() twice for __string() fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TRACE_EVENT() macro handles dynamic strings by having:

  TP_PROTO(struct some_struct *s),
  TP_ARGS(s),
  TP_STRUCT__entry(
	__string(my_string, s->string)
  ),
  TP_fast_assign(
	__assign_str(my_string, s->string);
  )
  TP_printk("%s", __get_str(my_string))

There's even some code that may call a function helper to find the
s->string value. The problem with the above is that the work to get the
s->string is done twice. Once at the __string() and again in the
__assign_str(). The length of the string is calculated via a strlen(),
not once, but twice. Once during the __string() macro and again in
__assign_str(). But the length is actually already recorded in the data
location, so there's no reason to call strlen() again. Just use the
length that was saved by the __string() code for the __assign_str() code.
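Stripped of the macro plumbing, the optimization is a save-once,
copy-with-saved-length pattern. A minimal stand-alone sketch (plain C,
not the trace macros themselves):

  #include <string.h>

  struct saved_str {
  	const char *ptr;	/* captured once, at "__string()" time */
  	size_t len;		/* strlen() + 1, also computed only once */
  };

  static void save_str(struct saved_str *s, const char *src)
  {
  	s->ptr = src ? src : "(null)";
  	s->len = strlen(s->ptr) + 1;
  }

  static void assign_str(char *dst, const struct saved_str *s)
  {
  	memcpy(dst, s->ptr, s->len);	/* no strcpy(), no second strlen() */
  }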
Link: https://lore.kernel.org/linux-trace-kernel/20240222211442.793074999@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Ville Syrjälä Cc: Rodrigo Vivi Cc: Chuck Lever Signed-off-by: Steven Rostedt (Google) --- include/trace/stages/stage6_event_callback.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index b3e2f321e787..c0e5d097324e 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -32,8 +32,9 @@ #undef __assign_str #define __assign_str(dst, src) \ - strcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? \ - __data_offsets.dst##_ptr_ : "(null)") + memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? \ + __data_offsets.dst##_ptr_ : "(null)", \ + __get_dynamic_array_len(dst)) #undef __assign_str_len #define __assign_str_len(dst, src, len) \ @@ -94,8 +95,9 @@ #undef __assign_rel_str #define __assign_rel_str(dst, src) \ - strcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ? \ - __data_offsets.dst##_ptr_ : "(null)") + memcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ? \ + __data_offsets.dst##_ptr_ : "(null)", \ + __get_rel_dynamic_array_len(dst)) #undef __assign_rel_str_len #define __assign_rel_str_len(dst, src, len) \ From 916849860fa9c7d3caeb144cb5dec8831cf23bfc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 22 Feb 2024 16:14:18 -0500 Subject: [PATCH 21/34] tracing: Use ? : shortcut in trace macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of having: #define __assign_str(dst, src) \ memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? \ __data_offsets.dst##_ptr_ : "(null)", \ __get_dynamic_array_len(dst)) Use the ? : shortcut and compact it down to: #define __assign_str(dst, src) \ memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? : "(null)", \ __get_dynamic_array_len(dst)) Link: https://lore.kernel.org/linux-trace-kernel/20240222211442.949327725@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ville Syrjälä Cc: Rodrigo Vivi Cc: Chuck Lever Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/stages/stage5_get_offsets.h | 4 ++-- include/trace/stages/stage6_event_callback.h | 14 ++++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h index 45f8151cf622..20b801ed3fd4 100644 --- a/include/trace/stages/stage5_get_offsets.h +++ b/include/trace/stages/stage5_get_offsets.h @@ -47,7 +47,7 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, \ - strlen((src) ? (const char *)(src) : "(null)") + 1) \ + strlen((const char *)(src) ? : "(null)") + 1) \ __data_offsets->item##_ptr_ = src; #undef __string_len @@ -70,7 +70,7 @@ #undef __rel_string #define __rel_string(item, src) __rel_dynamic_array(char, item, \ - strlen((src) ? (const char *)(src) : "(null)") + 1) \ + strlen((const char *)(src) ? 
: "(null)") + 1) \ __data_offsets->item##_ptr_ = src; #undef __rel_string_len diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index c0e5d097324e..38732855eadb 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -32,15 +32,14 @@ #undef __assign_str #define __assign_str(dst, src) \ - memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? \ - __data_offsets.dst##_ptr_ : "(null)", \ + memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? : "(null)", \ __get_dynamic_array_len(dst)) #undef __assign_str_len #define __assign_str_len(dst, src, len) \ do { \ - memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? \ - __data_offsets.dst##_ptr_ : "(null)", len); \ + memcpy(__get_str(dst), \ + __data_offsets.dst##_ptr_ ? : "(null)", len); \ __get_str(dst)[len] = '\0'; \ } while(0) @@ -95,15 +94,14 @@ #undef __assign_rel_str #define __assign_rel_str(dst, src) \ - memcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ? \ - __data_offsets.dst##_ptr_ : "(null)", \ + memcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ? : "(null)", \ __get_rel_dynamic_array_len(dst)) #undef __assign_rel_str_len #define __assign_rel_str_len(dst, src, len) \ do { \ - memcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ? \ - __data_offsets.dst##_ptr_ : "(null)", len); \ + memcpy(__get_rel_str(dst), \ + __data_offsets.dst##_ptr_ ? : "(null)", len); \ __get_rel_str(dst)[len] = '\0'; \ } while (0) From 70a6ed553f7d3504febac467cb4a0bae621ba3c6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 22 Feb 2024 16:14:19 -0500 Subject: [PATCH 22/34] tracing: Use EVENT_NULL_STR macro instead of open coding "(null)" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TRACE_EVENT macros has some dependency if a __string() field is NULL, where it will save "(null)" as the string. This string is also used by __assign_str(). It's better to create a single macro instead of having something that will not be caught by the compiler if there is an unfortunate typo. Link: https://lore.kernel.org/linux-trace-kernel/20240222211443.106216915@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ville Syrjälä Cc: Rodrigo Vivi Cc: Chuck Lever Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 3 +++ include/trace/events/sunrpc.h | 12 ++++++------ include/trace/stages/stage5_get_offsets.h | 4 ++-- include/trace/stages/stage6_event_callback.h | 8 ++++---- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index fc6d0af56bb1..6f9bdfb09d1d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -17,6 +17,9 @@ struct dentry; struct bpf_prog; union bpf_attr; +/* Used for event string fields when they are NULL */ +#define EVENT_NULL_STR "(null)" + const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index cdd3a45e6003..ce6a85b82afa 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1327,18 +1327,18 @@ TRACE_EVENT(xs_stream_read_data, __field(ssize_t, err) __field(size_t, total) __string(addr, xprt ? xprt->address_strings[RPC_DISPLAY_ADDR] : - "(null)") + EVENT_NULL_STR) __string(port, xprt ? 
xprt->address_strings[RPC_DISPLAY_PORT] : - "(null)") + EVENT_NULL_STR) ), TP_fast_assign( __entry->err = err; __entry->total = total; __assign_str(addr, xprt ? - xprt->address_strings[RPC_DISPLAY_ADDR] : "(null)"); + xprt->address_strings[RPC_DISPLAY_ADDR] : EVENT_NULL_STR); __assign_str(port, xprt ? - xprt->address_strings[RPC_DISPLAY_PORT] : "(null)"); + xprt->address_strings[RPC_DISPLAY_PORT] : EVENT_NULL_STR); ), TP_printk("peer=[%s]:%s err=%zd total=%zu", __get_str(addr), @@ -1783,7 +1783,7 @@ TRACE_EVENT(svc_process, __string(service, name) __string(procedure, svc_proc_name(rqst)) __string(addr, rqst->rq_xprt ? - rqst->rq_xprt->xpt_remotebuf : "(null)") + rqst->rq_xprt->xpt_remotebuf : EVENT_NULL_STR) ), TP_fast_assign( @@ -1793,7 +1793,7 @@ TRACE_EVENT(svc_process, __assign_str(service, name); __assign_str(procedure, svc_proc_name(rqst)); __assign_str(addr, rqst->rq_xprt ? - rqst->rq_xprt->xpt_remotebuf : "(null)"); + rqst->rq_xprt->xpt_remotebuf : EVENT_NULL_STR); ), TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%s", diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h index 20b801ed3fd4..e6b96608f452 100644 --- a/include/trace/stages/stage5_get_offsets.h +++ b/include/trace/stages/stage5_get_offsets.h @@ -47,7 +47,7 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, \ - strlen((const char *)(src) ? : "(null)") + 1) \ + strlen((const char *)(src) ? : EVENT_NULL_STR) + 1) \ __data_offsets->item##_ptr_ = src; #undef __string_len @@ -70,7 +70,7 @@ #undef __rel_string #define __rel_string(item, src) __rel_dynamic_array(char, item, \ - strlen((const char *)(src) ? : "(null)") + 1) \ + strlen((const char *)(src) ? : EVENT_NULL_STR) + 1) \ __data_offsets->item##_ptr_ = src; #undef __rel_string_len diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index 38732855eadb..2bfd49713b42 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -32,14 +32,14 @@ #undef __assign_str #define __assign_str(dst, src) \ - memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? : "(null)", \ + memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? : EVENT_NULL_STR, \ __get_dynamic_array_len(dst)) #undef __assign_str_len #define __assign_str_len(dst, src, len) \ do { \ memcpy(__get_str(dst), \ - __data_offsets.dst##_ptr_ ? : "(null)", len); \ + __data_offsets.dst##_ptr_ ? : EVENT_NULL_STR, len); \ __get_str(dst)[len] = '\0'; \ } while(0) @@ -94,14 +94,14 @@ #undef __assign_rel_str #define __assign_rel_str(dst, src) \ - memcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ? : "(null)", \ + memcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ? : EVENT_NULL_STR, \ __get_rel_dynamic_array_len(dst)) #undef __assign_rel_str_len #define __assign_rel_str_len(dst, src, len) \ do { \ memcpy(__get_rel_str(dst), \ - __data_offsets.dst##_ptr_ ? : "(null)", len); \ + __data_offsets.dst##_ptr_ ? : EVENT_NULL_STR, len); \ __get_rel_str(dst)[len] = '\0'; \ } while (0) From cca990c7b565af0dc61a8f647c00833453cf5bff Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 22 Feb 2024 20:33:24 -0500 Subject: [PATCH 23/34] tracing: Fix snapshot counter going between two tracers that use it Running the ftrace selftests caused the ring buffer mapping test to fail. Investigating, I found that the snapshot counter would be incremented every time a tracer that uses the snapshot is enabled even if the snapshot was used by the previous tracer. 
That is: # cd /sys/kernel/tracing # echo wakeup_rt > current_tracer # echo wakeup_dl > current_tracer # echo nop > current_tracer would leave the snapshot counter at 1 and not zero. That's because the enabling of wakeup_dl would increment the counter again but setting the tracer to nop would only decrement it once. Do not arm the snapshot for a tracer if the previous tracer already had it armed. Link: https://lore.kernel.org/linux-trace-kernel/20240223013344.570525723@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Fixes: 16f7e48ffc53a ("tracing: Add snapshot refcount") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a7c0dc0aaf14..1e1fd377a1cf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6162,7 +6162,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) tracing_disarm_snapshot(tr); } - if (t->use_max_tr) { + if (!had_max_tr && t->use_max_tr) { ret = tracing_arm_snapshot_locked(tr); if (ret) goto out; From 2048fdc27525c6b32c63f8429d9335b7fd2f90c2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 22 Feb 2024 20:33:25 -0500 Subject: [PATCH 24/34] tracing: Decrement the snapshot if the snapshot trigger fails to register Running the ftrace selftests caused the ring buffer mapping test to fail. Investigating, I found that the snapshot counter would be incremented every time a snapshot trigger was added, even if that snapshot trigger failed. # cd /sys/kernel/tracing # echo "snapshot" > events/sched/sched_process_fork/trigger # echo "snapshot" > events/sched/sched_process_fork/trigger -bash: echo: write error: File exists That second write fails, but it still increments the snapshot counter without decrementing it. The counter needs to be decremented when the snapshot trigger fails to register. Link: https://lore.kernel.org/linux-trace-kernel/20240223013344.729055907@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Fixes: 16f7e48ffc53a ("tracing: Add snapshot refcount") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 62e4f58b8671..4bec043c8690 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1491,7 +1491,10 @@ register_snapshot_trigger(char *glob, if (ret < 0) return ret; - return register_trigger(glob, data, file); + ret = register_trigger(glob, data, file); + if (ret < 0) + tracing_disarm_snapshot(file->tr); + return ret; } static void unregister_snapshot_trigger(char *glob, From d15304135c7f6cbcbf9e6e37814de495a56d51f8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 22 Feb 2024 21:48:33 -0800 Subject: [PATCH 25/34] ftrace: Fix most kernel-doc warnings Reduce the number of kernel-doc warnings from 52 down to 10, i.e., fix 42 kernel-doc warnings by (a) using the Returns: format for function return values or (b) using "@var:" instead of "@var -" for function parameter descriptions. Fix one return-values list so that it is formatted correctly when rendered for output. Spell "non-zero" with a hyphen in several places.
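For example, the preferred kernel-doc style looks like this (a sketch with a hypothetical function, shown only to illustrate the two conventions this patch applies):

/**
 * my_example_func - a hypothetical function, used only to show the style
 * @ops: the ops to operate on (note "@var:" rather than "@var -")
 *
 * Returns: 0 on success or a negative errno value on failure.
 */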
Link: https://lore.kernel.org/linux-trace-kernel/20240223054833.15471-1-rdunlap@infradead.org Cc: Mathieu Desnoyers Cc: Mark Rutland Link: https://lore.kernel.org/oe-kbuild-all/202312180518.X6fRyDSN-lkp@intel.com/ Reported-by: kernel test robot Acked-by: Masami Hiramatsu (Google) Signed-off-by: Randy Dunlap Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 90 ++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83ba342aef31..da1710499698 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1160,7 +1160,7 @@ __ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) * Search a given @hash to see if a given instruction pointer (@ip) * exists in it. * - * Returns the entry that holds the @ip if found. NULL otherwise. + * Returns: the entry that holds the @ip if found. NULL otherwise. */ struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) @@ -1282,7 +1282,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) /** * ftrace_free_filter - remove all filters for an ftrace_ops - * @ops - the ops to remove the filters from + * @ops: the ops to remove the filters from */ void ftrace_free_filter(struct ftrace_ops *ops) { @@ -1587,7 +1587,7 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) * @end: end of range to search (inclusive). @end points to the last byte * to check. * - * Returns rec->ip if the related ftrace location is a least partly within + * Returns: rec->ip if the related ftrace location is a least partly within * the given address range. That is, the first address of the instruction * that is either a NOP or call to the function tracer. It checks the ftrace * internal tables to determine if the address belongs or not. @@ -1607,9 +1607,10 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) * ftrace_location - return the ftrace location * @ip: the instruction pointer to check * - * If @ip matches the ftrace location, return @ip. - * If @ip matches sym+0, return sym's ftrace location. - * Otherwise, return 0. + * Returns: + * * If @ip matches the ftrace location, return @ip. + * * If @ip matches sym+0, return sym's ftrace location. + * * Otherwise, return 0. */ unsigned long ftrace_location(unsigned long ip) { @@ -1639,7 +1640,7 @@ out: * @start: start of range to search * @end: end of range to search (inclusive). @end points to the last byte to check. * - * Returns 1 if @start and @end contains a ftrace location. + * Returns: 1 if @start and @end contains a ftrace location. * That is, the instruction that is either a NOP or call to * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. @@ -2574,7 +2575,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS * is not set, then it wants to convert to the normal callback. * - * Returns the address of the trampoline to set to + * Returns: the address of the trampoline to set to */ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { @@ -2615,7 +2616,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) * a function that saves all the regs. Basically the '_EN' version * represents the current state of the function. 
* - * Returns the address of the trampoline that is currently being called + * Returns: the address of the trampoline that is currently being called */ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { @@ -2719,7 +2720,7 @@ struct ftrace_rec_iter { /** * ftrace_rec_iter_start - start up iterating over traced functions * - * Returns an iterator handle that is used to iterate over all + * Returns: an iterator handle that is used to iterate over all * the records that represent address locations where functions * are traced. * @@ -2751,7 +2752,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void) * ftrace_rec_iter_next - get the next record to process. * @iter: The handle to the iterator. * - * Returns the next iterator after the given iterator @iter. + * Returns: the next iterator after the given iterator @iter. */ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) { @@ -2776,7 +2777,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) * ftrace_rec_iter_record - get the record at the iterator location * @iter: The current iterator location * - * Returns the record that the current @iter is at. + * Returns: the record that the current @iter is at. */ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) { @@ -4010,6 +4011,8 @@ ftrace_avail_addrs_open(struct inode *inode, struct file *file) * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. * tracing_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). + * + * Returns: 0 on success or a negative errno value on failure */ int ftrace_regex_open(struct ftrace_ops *ops, int flag, @@ -4626,7 +4629,7 @@ struct ftrace_func_mapper { /** * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper * - * Returns a ftrace_func_mapper descriptor that can be used to map ips to data. + * Returns: a ftrace_func_mapper descriptor that can be used to map ips to data. */ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) { @@ -4646,7 +4649,7 @@ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) * @mapper: The mapper that has the ip maps * @ip: the instruction pointer to find the data for * - * Returns the data mapped to @ip if found otherwise NULL. The return + * Returns: the data mapped to @ip if found otherwise NULL. The return * is actually the address of the mapper data pointer. The address is * returned for use cases where the data is no bigger than a long, and * the user can use the data pointer as its data instead of having to @@ -4672,7 +4675,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to map @data to * @data: The data to map to @ip * - * Returns 0 on success otherwise an error. + * Returns: 0 on success otherwise an error. */ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, unsigned long ip, void *data) @@ -4701,7 +4704,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, * @mapper: The mapper that has the ip maps * @ip: The instruction pointer address to remove the data from * - * Returns the data if it is found, otherwise NULL. + * Returns: the data if it is found, otherwise NULL. * Note, if the data pointer is used as the data itself, (see * ftrace_func_mapper_find_ip(), then the return value may be meaningless, * if the data pointer was set to zero. 
@@ -5625,10 +5628,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct); /** * ftrace_set_filter_ip - set a function to filter on in ftrace by address - * @ops - the ops to set the filter with - * @ip - the address to add to or remove from the filter. - * @remove - non zero to remove the ip from the filter - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @ip: the address to add to or remove from the filter. + * @remove: non zero to remove the ip from the filter + * @reset: non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled * If @ip is NULL, it fails to update filter. @@ -5647,11 +5650,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); /** * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses - * @ops - the ops to set the filter with - * @ips - the array of addresses to add to or remove from the filter. - * @cnt - the number of addresses in @ips - * @remove - non zero to remove ips from the filter - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @ips: the array of addresses to add to or remove from the filter. + * @cnt: the number of addresses in @ips + * @remove: non zero to remove ips from the filter + * @reset: non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled * If @ips array or any ip specified within is NULL , it fails to update filter. @@ -5670,7 +5673,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ips); /** * ftrace_ops_set_global_filter - setup ops to use global filters - * @ops - the ops which will use the global filters + * @ops: the ops which will use the global filters * * ftrace users who need global function trace filtering should call this. * It can set the global filter only if ops were not initialized before. @@ -5694,10 +5697,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, /** * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @buf: the string that holds the function filter text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. @@ -5716,10 +5719,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); /** * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the notrace filter with + * @buf: the string that holds the function notrace text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Notrace Filters denote which functions should not be enabled when tracing * is enabled. 
If @buf is NULL and reset is set, all functions will be enabled @@ -5738,9 +5741,9 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** * ftrace_set_global_filter - set a function to filter on with global tracers - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @buf: the string that holds the function filter text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. @@ -5753,9 +5756,9 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** * ftrace_set_global_notrace - set a function to not trace with global tracers - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @buf: the string that holds the function notrace text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Notrace Filters denote which functions should not be enabled when tracing * is enabled. If @buf is NULL and reset is set, all functions will be enabled @@ -7443,7 +7446,7 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func); * have its own recursion protection, then it should call the * ftrace_ops_assist_func() instead. * - * Returns the function that the trampoline should call for @ops. + * Returns: the function that the trampoline should call for @ops. */ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { @@ -7897,7 +7900,7 @@ void ftrace_kill(void) /** * ftrace_is_dead - Test if ftrace is dead or not. * - * Returns 1 if ftrace is "dead", zero otherwise. + * Returns: 1 if ftrace is "dead", zero otherwise. */ int ftrace_is_dead(void) { @@ -8142,8 +8145,7 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr) * @addrs array, which needs to be big enough to store at least @cnt * addresses. * - * This function returns 0 if all provided symbols are found, - * -ESRCH otherwise. + * Returns: 0 if all provided symbols are found, -ESRCH otherwise. */ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) { From c759e609030ca37e59866cbc849fdc611cc56292 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 23 Feb 2024 15:22:06 -0500 Subject: [PATCH 26/34] tracing: Remove __assign_str_len() Now that __assign_str() gets the length from the __string() (and __string_len()) macros, there's no reason to have a separate __assign_str_len() macro; __assign_str() already knows the length of the string it needs. Also remove __assign_rel_str_len(), although it had no users anyway.
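A conversion then looks roughly like this (a sketch modeled on the nfsd_dirent change below; only the assignment side changes, the __string_len() declaration stays as is):

	TP_STRUCT__entry(
		__string_len(name, name, namlen)	/* still records the length */
	),
	TP_fast_assign(
		/* was: __assign_str_len(name, name, namlen);
		 * __assign_str() now picks up the length saved by __string_len() */
		__assign_str(name, name);
	)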
Link: https://lore.kernel.org/linux-trace-kernel/20240223152206.0b650659@gandalf.local.home Cc: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Steven Rostedt (Google) --- fs/nfsd/trace.h | 8 +++--- include/trace/stages/stage6_event_callback.h | 28 ++++++++------------ samples/trace_events/trace-events-sample.h | 9 ++++--- 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 288a796b2878..1cd2076210b1 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -104,7 +104,7 @@ TRACE_EVENT(nfsd_compound, TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->opcnt = opcnt; - __assign_str_len(tag, tag, taglen); + __assign_str(tag, tag); ), TP_printk("xid=0x%08x opcnt=%u tag=%s", __entry->xid, __entry->opcnt, __get_str(tag) @@ -485,7 +485,7 @@ TRACE_EVENT(nfsd_dirent, TP_fast_assign( __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0; __entry->ino = ino; - __assign_str_len(name, name, namlen) + __assign_str(name, name); ), TP_printk("fh_hash=0x%08x ino=%llu name=%s", __entry->fh_hash, __entry->ino, __get_str(name) @@ -906,7 +906,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __entry->flavor = clp->cl_cred.cr_flavor; memcpy(__entry->verifier, (void *)&clp->cl_verifier, NFS4_VERIFIER_SIZE); - __assign_str_len(name, clp->cl_name.data, clp->cl_name.len); + __assign_str(name, clp->cl_name.data); ), TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", __entry->addr, __get_str(name), @@ -1976,7 +1976,7 @@ TRACE_EVENT(nfsd_ctl_time, TP_fast_assign( __entry->netns_ino = net->ns.inum; __entry->time = time; - __assign_str_len(name, name, namelen); + __assign_str(name, name); ), TP_printk("file=%s time=%d\n", __get_str(name), __entry->time diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index 2bfd49713b42..0c0f50bcdc56 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -32,16 +32,13 @@ #undef __assign_str #define __assign_str(dst, src) \ - memcpy(__get_str(dst), __data_offsets.dst##_ptr_ ? : EVENT_NULL_STR, \ - __get_dynamic_array_len(dst)) - -#undef __assign_str_len -#define __assign_str_len(dst, src, len) \ do { \ - memcpy(__get_str(dst), \ - __data_offsets.dst##_ptr_ ? : EVENT_NULL_STR, len); \ - __get_str(dst)[len] = '\0'; \ - } while(0) + char *__str__ = __get_str(dst); \ + int __len__ = __get_dynamic_array_len(dst) - 1; \ + memcpy(__str__, __data_offsets.dst##_ptr_ ? : \ + EVENT_NULL_STR, __len__); \ + __str__[__len__] = '\0'; \ + } while (0) #undef __assign_vstr #define __assign_vstr(dst, fmt, va) \ @@ -94,15 +91,12 @@ #undef __assign_rel_str #define __assign_rel_str(dst, src) \ - memcpy(__get_rel_str(dst), __data_offsets.dst##_ptr_ ? : EVENT_NULL_STR, \ - __get_rel_dynamic_array_len(dst)) - -#undef __assign_rel_str_len -#define __assign_rel_str_len(dst, src, len) \ do { \ - memcpy(__get_rel_str(dst), \ - __data_offsets.dst##_ptr_ ? : EVENT_NULL_STR, len); \ - __get_rel_str(dst)[len] = '\0'; \ + char *__str__ = __get_rel_str(dst); \ + int __len__ = __get_rel_dynamic_array_len(dst) - 1; \ + memcpy(__str__, __data_offsets.dst##_ptr_ ? : \ + EVENT_NULL_STR, __len__); \ + __str__[__len__] = '\0'; \ } while (0) #undef __rel_bitmask diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 23f923ccd529..f2d2d56ce8e2 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -163,8 +163,7 @@ * __string(). 
* * __string_len: This is a helper to a __dynamic_array, but it understands - * that the array has characters in it, and with the combined - * use of __assign_str_len(), it will allocate 'len' + 1 bytes + * that the array has characters in it, it will allocate 'len' + 1 bytes * in the ring buffer and add a '\0' to the string. This is * useful if the string being saved has no terminating '\0' byte. * It requires that the length of the string is known as it acts @@ -174,9 +173,11 @@ * * __string_len(foo, bar, len) * - * To assign this string, use the helper macro __assign_str_len(). + * To assign this string, use the helper macro __assign_str(). + * The length is saved via the __string_len() and is retrieved in + * __assign_str(). * - * __assign_str_len(foo, bar, len); + * __assign_str(foo, bar); * * Then len + 1 is allocated to the ring buffer, and a nul terminating * byte is added. This is similar to: From dd6ae6d90a84d4bec49887c7aa2b22aa1c8b2897 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 23 Feb 2024 15:28:27 -0500 Subject: [PATCH 27/34] tracing: Add __string_len() example There's no example code that uses __string_len(), and since the sample code is used for testing the event logic, add a use case. Link: https://lore.kernel.org/linux-trace-kernel/20240223152827.5f9f78e2@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- samples/trace_events/trace-events-sample.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index f2d2d56ce8e2..2dfaf7fc7bfa 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -303,6 +303,7 @@ TRACE_EVENT(foo_bar, __bitmask( cpus, num_possible_cpus() ) __cpumask( cpum ) __vstring( vstr, fmt, va ) + __string_len( lstr, foo, bar / 2 < strlen(foo) ? bar / 2 : strlen(foo) ) ), TP_fast_assign( @@ -311,12 +312,13 @@ TRACE_EVENT(foo_bar, memcpy(__get_dynamic_array(list), lst, __length_of(lst) * sizeof(int)); __assign_str(str, string); + __assign_str(lstr, foo); __assign_vstr(vstr, fmt, va); __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); __assign_cpumask(cpum, cpumask_bits(mask)); ), - TP_printk("foo %s %d %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar, + TP_printk("foo %s %d %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar, /* * Notice here the use of some helper functions. This includes: @@ -360,7 +362,8 @@ TRACE_EVENT(foo_bar, __print_array(__get_dynamic_array(list), __get_dynamic_array_len(list) / sizeof(int), sizeof(int)), - __get_str(str), __get_bitmask(cpus), __get_cpumask(cpum), + __get_str(str), __get_str(lstr), + __get_bitmask(cpus), __get_cpumask(cpum), __get_str(vstr)) ); From cf986e57d606849288eecf572800f2bd476fb1ab Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 23 Feb 2024 16:13:56 -0500 Subject: [PATCH 28/34] tracing: Add warning if string in __assign_str() does not match __string() In preparation to remove the second parameter of __assign_str(), make sure it is really a duplicate of __string() by adding a WARN_ON_ONCE(). 
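For example, with a hypothetical event field (sketched only to show what the new check catches):

	TP_STRUCT__entry(
		__string(name, dev->name)	/* stores dev->name in __data_offsets.name_ptr_ */
	),
	TP_fast_assign(
		__assign_str(name, dev->name);	/* OK: same source as __string() */
		/* __assign_str(name, other->name) would now trigger the warning */
	)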
Link: https://lore.kernel.org/linux-trace-kernel/20240223161356.63b72403@gandalf.local.home Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/trace/stages/stage6_event_callback.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index 0c0f50bcdc56..935608971899 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -35,6 +35,7 @@ do { \ char *__str__ = __get_str(dst); \ int __len__ = __get_dynamic_array_len(dst) - 1; \ + WARN_ON_ONCE((src) != __data_offsets.dst##_ptr_); \ memcpy(__str__, __data_offsets.dst##_ptr_ ? : \ EVENT_NULL_STR, __len__); \ __str__[__len__] = '\0'; \ From 0bdfb68c845edd7e2dbe815266bb09641576e22f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 23 Feb 2024 16:25:19 -0500 Subject: [PATCH 29/34] tracing: Remove second parameter to __assign_rel_str() The second parameter of __assign_rel_str() is no longer used. It can be removed. Note, the only real user of rel_string is user events. This use is just in the sample code, for testing purposes. This makes __assign_rel_str() different from __assign_str(), but that's fine. __assign_str() is used in over 700 places and has a larger impact. That change will come later. Link: https://lore.kernel.org/linux-trace-kernel/20240223162519.2beb8112@gandalf.local.home Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/trace/stages/stage6_event_callback.h | 2 +- samples/trace_events/trace-events-sample.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index 935608971899..a0c15f67eabe 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -91,7 +91,7 @@ #define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) #undef __assign_rel_str -#define __assign_rel_str(dst, src) \ +#define __assign_rel_str(dst) \ do { \ char *__str__ = __get_rel_str(dst); \ int __len__ = __get_rel_dynamic_array_len(dst) - 1; \ diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 2dfaf7fc7bfa..500981eca74d 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -574,7 +574,7 @@ TRACE_EVENT(foo_rel_loc, ), TP_fast_assign( - __assign_rel_str(foo, foo); + __assign_rel_str(foo); __entry->bar = bar; __assign_rel_bitmask(bitmask, mask, BITS_PER_BYTE * sizeof(unsigned long)); From 19f0423fd55c301c8edaea286e568ec657f42750 Mon Sep 17 00:00:00 2001 From: Huang Yiwei Date: Fri, 23 Feb 2024 16:31:26 +0800 Subject: [PATCH 30/34] tracing: Support to dump instance traces by ftrace_dump_on_oops Currently ftrace only dumps the global trace buffer on an oops. For debugging a production use case, dumping an instance's trace buffer is helpful for checking specific problems, since the global trace buffer may be used for other purposes.
This patch extends the ftrace_dump_on_oops parameter to dump a specific or multiple trace instances: - ftrace_dump_on_oops=0: as before -- don't dump - ftrace_dump_on_oops[=1]: as before -- dump the global trace buffer on all CPUs - ftrace_dump_on_oops=2 or =orig_cpu: as before -- dump the global trace buffer on CPU that triggered the oops - ftrace_dump_on_oops=<instance_name>: new behavior -- dump the tracing instance matching <instance_name> - ftrace_dump_on_oops[=2/orig_cpu],<instance_name>[=2/orig_cpu], <instance_name2>[=2/orig_cpu]: new behavior -- dump the global trace buffer and multiple instance buffers on all CPUs, or only dump on CPU that triggered the oops if =2 or =orig_cpu is given Also, the sysctl node can handle the input accordingly. Link: https://lore.kernel.org/linux-trace-kernel/20240223083126.1817731-1-quic_hyiwei@quicinc.com Cc: Ross Zwisler Signed-off-by: Huang Yiwei Signed-off-by: Steven Rostedt (Google) --- .../admin-guide/kernel-parameters.txt | 26 ++- Documentation/admin-guide/sysctl/kernel.rst | 28 +++- include/linux/ftrace.h | 4 +- include/linux/kernel.h | 1 + kernel/sysctl.c | 4 +- kernel/trace/trace.c | 156 +++++++++++++----- kernel/trace/trace_selftest.c | 2 +- 7 files changed, 167 insertions(+), 54 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 479ff1737c2f..fa871d53641c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1572,12 +1572,28 @@ The above will cause the "foo" tracing instance to trigger a snapshot at the end of boot up. - ftrace_dump_on_oops[=orig_cpu] + ftrace_dump_on_oops[=2(orig_cpu) | =<instance>][,<instance> | + ,<instance>=2(orig_cpu)] [FTRACE] will dump the trace buffers on oops. - If no parameter is passed, ftrace will dump - buffers of all CPUs, but if you pass orig_cpu, it will - dump only the buffer of the CPU that triggered the - oops. + If no parameter is passed, ftrace will dump global + buffers of all CPUs, if you pass 2 or orig_cpu, it + will dump only the buffer of the CPU that triggered + the oops, or the specific instance will be dumped if + its name is passed. Multiple instance dump is also + supported, and instances are separated by commas. Each + instance supports only dump on CPU that triggered the + oops by passing 2 or orig_cpu to it. + + ftrace_dump_on_oops=foo=orig_cpu + + The above will dump only the buffer of "foo" instance + on CPU that triggered the oops. + + ftrace_dump_on_oops,foo,bar=orig_cpu + + The above will dump global buffer on all CPUs, the + buffer of "foo" instance on all CPUs and the buffer + of "bar" instance on CPU that triggered the oops. ftrace_filter=[function-list] [FTRACE] Limit the functions traced by the function diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 6584a1f9bfe3..ea8e5f152edc 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -296,12 +296,30 @@ kernel panic). This will output the contents of the ftrace buffers to the console. This is very useful for capturing traces that lead to crashes and outputting them to a serial console. -= =================================================== -0 Disabled (default). -1 Dump buffers of all CPUs. -2 Dump the buffer of the CPU that triggered the oops. -= =================================================== +======================= =========================================== +0 Disabled (default). +1 Dump buffers of all CPUs.
+2(orig_cpu) Dump the buffer of the CPU that triggered the + oops. +<instance> Dump the specific instance buffer on all CPUs. +<instance>=2(orig_cpu) Dump the specific instance buffer on the CPU + that triggered the oops. +======================= =========================================== +Multiple instance dump is also supported, and instances are separated +by commas. If global buffer also needs to be dumped, please specify +the dump mode (1/2/orig_cpu) first for global buffer. + +So for example to dump "foo" and "bar" instance buffer on all CPUs, +user can:: + + echo "foo,bar" > /proc/sys/kernel/ftrace_dump_on_oops + +To dump global buffer and "foo" instance buffer on all +CPUs along with the "bar" instance buffer on CPU that triggered the +oops, user can:: + + echo "1,foo,bar=2" > /proc/sys/kernel/ftrace_dump_on_oops ftrace_enabled, stack_tracer_enabled ==================================== diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e8921871ef9a..54d53f345d14 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1151,7 +1151,9 @@ static inline void unpause_graph_tracing(void) { } #ifdef CONFIG_TRACING enum ftrace_dump_mode; -extern enum ftrace_dump_mode ftrace_dump_on_oops; +#define MAX_TRACER_SIZE 100 +extern char ftrace_dump_on_oops[]; +extern int ftrace_dump_on_oops_enabled(void); extern int tracepoint_printk; extern void disable_trace_on_warning(void); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d718fbec72dd..be2e8c0a187e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -215,6 +215,7 @@ enum ftrace_dump_mode { DUMP_NONE, DUMP_ALL, DUMP_ORIG, + DUMP_PARAM, }; #ifdef CONFIG_TRACING diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 157f7ce2942d..81cc974913bb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1710,9 +1710,9 @@ static struct ctl_table kern_table[] = { { .procname = "ftrace_dump_on_oops", .data = &ftrace_dump_on_oops, - .maxlen = sizeof(int), + .maxlen = MAX_TRACER_SIZE, .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dostring, }, { .procname = "traceoff_on_warning", diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1e1fd377a1cf..233d1af39fff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -130,9 +130,12 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * /proc/sys/kernel/ftrace_dump_on_oops * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops + * Set instance name if you want to dump the specific trace instance + * Multiple instance dump is also supported, and instances are separated + * by commas.
*/ - -enum ftrace_dump_mode ftrace_dump_on_oops; +/* Set to string format zero to disable by default */ +char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; @@ -178,7 +181,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx); -#define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; @@ -201,19 +203,33 @@ static int __init set_cmdline_ftrace(char *str) } __setup("ftrace=", set_cmdline_ftrace); +int ftrace_dump_on_oops_enabled(void) +{ + if (!strcmp("0", ftrace_dump_on_oops)) + return 0; + else + return 1; +} + static int __init set_ftrace_dump_on_oops(char *str) { - if (*str++ != '=' || !*str || !strcmp("1", str)) { - ftrace_dump_on_oops = DUMP_ALL; + if (!*str) { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); return 1; } - if (!strcmp("orig_cpu", str) || !strcmp("2", str)) { - ftrace_dump_on_oops = DUMP_ORIG; - return 1; - } + if (*str == ',') { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); + strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1); + return 1; + } - return 0; + if (*str++ == '=') { + strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE); + return 1; + } + + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); @@ -9832,14 +9848,14 @@ static struct notifier_block trace_die_notifier = { static int trace_die_panic_handler(struct notifier_block *self, unsigned long ev, void *unused) { - if (!ftrace_dump_on_oops) + if (!ftrace_dump_on_oops_enabled()) return NOTIFY_DONE; /* The die notifier requires DIE_OOPS to trigger */ if (self == &trace_die_notifier && ev != DIE_OOPS) return NOTIFY_DONE; - ftrace_dump(ftrace_dump_on_oops); + ftrace_dump(DUMP_PARAM); return NOTIFY_DONE; } @@ -9880,12 +9896,12 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -void trace_init_global_iter(struct trace_iterator *iter) +static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr) { - iter->tr = &global_trace; + iter->tr = tr; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; - iter->array_buffer = &global_trace.array_buffer; + iter->array_buffer = &tr->array_buffer; if (iter->trace && iter->trace->open) iter->trace->open(iter); @@ -9905,22 +9921,19 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->fmt_size = STATIC_FMT_BUF_SIZE; } -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +void trace_init_global_iter(struct trace_iterator *iter) +{ + trace_init_iter(iter, &global_trace); +} + +static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - static atomic_t dump_running; - struct trace_array *tr = &global_trace; unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; - /* Only allow one dump user at a time. */ - if (atomic_inc_return(&dump_running) != 1) { - atomic_dec(&dump_running); - return; - } - /* * Always turn off tracing when we dump. * We don't need to show trace output of what happens @@ -9929,12 +9942,12 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) * If the user does a sysrq-z, then they can re-enable * tracing with echo 1 > tracing_on. 
*/ - tracing_off(); + tracer_tracing_off(tr); local_irq_save(flags); /* Simulate the iterator */ - trace_init_global_iter(&iter); + trace_init_iter(&iter, tr); for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); @@ -9945,21 +9958,15 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - switch (oops_dump_mode) { - case DUMP_ALL: - iter.cpu_file = RING_BUFFER_ALL_CPUS; - break; - case DUMP_ORIG: + if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); - break; - case DUMP_NONE: - goto out_enable; - default: - printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + else iter.cpu_file = RING_BUFFER_ALL_CPUS; - } - printk(KERN_TRACE "Dumping ftrace buffer:\n"); + if (tr == &global_trace) + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + else + printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name); /* Did function tracer already get disabled? */ if (ftrace_is_dead()) { @@ -10001,15 +10008,84 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) else printk(KERN_TRACE "---------------------------------\n"); - out_enable: tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } - atomic_dec(&dump_running); local_irq_restore(flags); } + +static void ftrace_dump_by_param(void) +{ + bool first_param = true; + char dump_param[MAX_TRACER_SIZE]; + char *buf, *token, *inst_name; + struct trace_array *tr; + + strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE); + buf = dump_param; + + while ((token = strsep(&buf, ",")) != NULL) { + if (first_param) { + first_param = false; + if (!strcmp("0", token)) + continue; + else if (!strcmp("1", token)) { + ftrace_dump_one(&global_trace, DUMP_ALL); + continue; + } + else if (!strcmp("2", token) || + !strcmp("orig_cpu", token)) { + ftrace_dump_one(&global_trace, DUMP_ORIG); + continue; + } + } + + inst_name = strsep(&token, "="); + tr = trace_array_find(inst_name); + if (!tr) { + printk(KERN_TRACE "Instance %s not found\n", inst_name); + continue; + } + + if (token && (!strcmp("2", token) || + !strcmp("orig_cpu", token))) + ftrace_dump_one(tr, DUMP_ORIG); + else + ftrace_dump_one(tr, DUMP_ALL); + } +} + +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +{ + static atomic_t dump_running; + + /* Only allow one dump user at a time. 
*/ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + + switch (oops_dump_mode) { + case DUMP_ALL: + ftrace_dump_one(&global_trace, DUMP_ALL); + break; + case DUMP_ORIG: + ftrace_dump_one(&global_trace, DUMP_ORIG); + break; + case DUMP_PARAM: + ftrace_dump_by_param(); + break; + case DUMP_NONE: + break; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + ftrace_dump_one(&global_trace, DUMP_ALL); + } + + atomic_dec(&dump_running); +} EXPORT_SYMBOL_GPL(ftrace_dump); #define WRITE_BUFSIZE 4096 diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 529590499b1f..e9c5058a8efd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -768,7 +768,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); - if (ftrace_dump_on_oops) { + if (ftrace_dump_on_oops_enabled()) { ftrace_dump(DUMP_ALL); /* ftrace_dump() disables tracing */ tracing_on(); From d6cb38e10810743addf8cac0b277861d614de1e9 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 25 Feb 2024 17:45:08 +0100 Subject: [PATCH 31/34] tracing: Use div64_u64() instead of do_div() Fixes Coccinelle/coccicheck warnings reported by do_div.cocci. Compared to do_div(), div64_u64() does not implicitly cast the divisor and does not unnecessarily calculate the remainder. Link: https://lore.kernel.org/linux-trace-kernel/20240225164507.232942-2-thorsten.blum@toblux.com Cc: Mathieu Desnoyers Signed-off-by: Thorsten Blum Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_benchmark.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 54d5fa35c90a..811b08439406 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -92,7 +92,6 @@ static void trace_do_benchmark(void) bm_total += delta; bm_totalsq += delta * delta; - if (bm_cnt > 1) { /* * Apply Welford's method to calculate standard deviation: @@ -105,7 +104,7 @@ static void trace_do_benchmark(void) stddev = 0; delta = bm_total; - do_div(delta, bm_cnt); + delta = div64_u64(delta, bm_cnt); avg = delta; if (stddev > 0) { @@ -127,7 +126,7 @@ static void trace_do_benchmark(void) seed = stddev; if (!last_seed) break; - do_div(seed, last_seed); + seed = div64_u64(seed, last_seed); seed += last_seed; do_div(seed, 2); } while (i++ < 10 && last_seed != seed); From 1b273124107cc8b9dd52228eba701efa516a3d92 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 28 Feb 2024 13:31:12 -0500 Subject: [PATCH 32/34] tracepoints: Use WARN() and not WARN_ON() for warnings There are two WARN_ON*() warnings in tracepoint.h that deal with RCU usage. But when they trigger, especially from using a TRACE_EVENT() macro, the information is not very helpful and is confusing: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at include/trace/events/lock.h:24 lock_acquire+0x2b2/0x2d0 Where the above warning takes you to: TRACE_EVENT(lock_acquire, <<<--- line 24 in lock.h TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), [..] Change the WARN_ON_ONCE() to WARN_ONCE() and add a string that allows someone to search for exactly where the bug happened. 
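The difference is only in what is printed along with the backtrace; WARN_ONCE() takes a printf-style message, which gives a string to search for (compare, from the change below):

	WARN_ON_ONCE(!rcu_is_watching());	/* file and line number only */
	WARN_ONCE(!rcu_is_watching(), "RCU not watching for tracepoint");	/* adds a greppable message */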
Link: https://lore.kernel.org/linux-trace-kernel/20240228133112.0d64fb1b@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Thomas Gleixner Reported-by: Borislav Petkov Tested-by: Borislav Petkov (AMD) Reviewed-by: Paul E. McKenney Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 88c0ba623ee6..689b6d71590e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -199,7 +199,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (!(cond)) \ return; \ \ - if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \ + if (WARN_ONCE(RCUIDLE_COND(rcuidle), \ + "Bad RCU usage for tracepoint")) \ return; \ \ /* keep srcu and sched-rcu usage consistent */ \ @@ -259,7 +260,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) TP_ARGS(args), \ TP_CONDITION(cond), 0); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ - WARN_ON_ONCE(!rcu_is_watching()); \ + WARN_ONCE(!rcu_is_watching(), \ + "RCU not watching for tracepoint"); \ } \ } \ __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ From b1afefa62ca9d77c8b1d386c3512fb4f21cb7db1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Mar 2024 11:30:02 -0400 Subject: [PATCH 33/34] tracing: Use strcmp() in __assign_str() WARN_ON() check The WARN_ON() check in __assign_str() to catch where the source variable to the macro doesn't match the source variable to __string() gives an error in clang: >> include/trace/events/sunrpc.h:703:4: warning: result of comparison against a string literal is unspecified (use an explicit string comparison function instead) [-Wstring-compare] 670 | __assign_str(progname, "unknown"); That's because the __assign_str() macro has: WARN_ON_ONCE((src) != __data_offsets.dst##_ptr_); Where "src" is a string literal. Clang warns when comparing a string literal directly, as the result of such a comparison is unspecified. Since the check still needs to make sure that the string passed to __string() is the same string passed to __assign_str(), test whether the source is a string literal and use strcmp() in that case. Note that this depends on commit 51270d573a8d ("tracing/net_sched: Fix tracepoints that save qdisc_dev() as a string") being applied, as this was what found that bug. Link: https://lore.kernel.org/linux-trace-kernel/20240312113002.00031668@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nathan Chancellor Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402292111.KIdExylU-lkp@intel.com/ Fixes: 433e1d88a3be ("tracing: Add warning if string in __assign_str() does not match __string()") Signed-off-by: Steven Rostedt (Google) --- include/trace/stages/stage6_event_callback.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index a0c15f67eabe..83da83a0c14f 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -35,7 +35,9 @@ do { \ char *__str__ = __get_str(dst); \ int __len__ = __get_dynamic_array_len(dst) - 1; \ - WARN_ON_ONCE((src) != __data_offsets.dst##_ptr_); \ + WARN_ON_ONCE(__builtin_constant_p(src) ? \ + strcmp((src), __data_offsets.dst##_ptr_) : \ + (src) != __data_offsets.dst##_ptr_); \ memcpy(__str__, __data_offsets.dst##_ptr_ ?
: \ EVENT_NULL_STR, __len__); \ __str__[__len__] = '\0'; \ From 7604256cecef34a82333d9f78262d3180f4eb525 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 14 Mar 2024 23:27:54 -0400 Subject: [PATCH 34/34] tracing: Add __string_src() helper to help compilers not to get confused The __string() helper macro of the TRACE_EVENT() macro is used to determine how much of the ring buffer needs to be allocated to fit the given source string. Some trace events have a string that is dependent on another variable that could be NULL, and in those cases NULL is passed in as the string. The __string() macro can handle being passed a NULL pointer, which it will turn into "(null)". It does that with: strlen((src) ? (const char *)(src) : "(null)") + 1 But if src itself has the same conditional type it can confuse the compiler. That is: __string(r ? dev(r)->name : NULL) Would turn into: strlen((r ? dev(r)->name : NULL) ? (r ? dev(r)->name : NULL) : "(null)") + 1 For which the compiler thinks that NULL is being passed to strlen() and gives this kind of warning: ./include/trace/stages/stage5_get_offsets.h:50:21: warning: argument 1 null where non-null expected [-Wnonnull] 50 | strlen((src) ? (const char *)(src) : "(null)") + 1) Instead, create a static inline function that takes the src string and will return the string if it is not NULL and will return "(null)" if it is. This will then make the strlen() line: strlen(__string_src(src)) + 1 Where the compiler can see that strlen() will not end up with NULL and does not warn about it. Note that this depends on commit 51270d573a8d ("tracing/net_sched: Fix tracepoints that save qdisc_dev() as a string") being applied, as passing the qdisc_dev() into __string_src() will give an error. Link: https://lore.kernel.org/all/ZfNmfCmgCs4Nc+EH@aschofie-mobl2/ Link: https://lore.kernel.org/linux-trace-kernel/20240314232754.345cea82@rorschach.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Reported-by: Alison Schofield Signed-off-by: Steven Rostedt (Google) --- include/trace/stages/stage5_get_offsets.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h index e6b96608f452..c6a62dfb18ef 100644 --- a/include/trace/stages/stage5_get_offsets.h +++ b/include/trace/stages/stage5_get_offsets.h @@ -9,6 +9,16 @@ #undef __entry #define __entry entry +#ifndef __STAGE5_STRING_SRC_H +#define __STAGE5_STRING_SRC_H +static inline const char *__string_src(const char *str) +{ + if (!str) + return EVENT_NULL_STR; + return str; +} +#endif /* __STAGE5_STRING_SRC_H */ + /* * Fields should never declare an array: i.e. __field(int, arr[5]) * If they do, it will cause issues in parsing and possibly corrupt the @@ -47,7 +57,7 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, \ - strlen((const char *)(src) ? : EVENT_NULL_STR) + 1) \ + strlen(__string_src(src)) + 1) \ __data_offsets->item##_ptr_ = src; #undef __string_len @@ -70,7 +80,7 @@ #undef __rel_string #define __rel_string(item, src) __rel_dynamic_array(char, item, \ - strlen((const char *)(src) ? : EVENT_NULL_STR) + 1) \ + strlen(__string_src(src)) + 1) \ __data_offsets->item##_ptr_ = src; #undef __rel_string_len