From 3a301d7c1c68fd96bbcf82db82d9508918e0dc22 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 8 Aug 2011 21:39:39 -0400 Subject: [PATCH 01/20] tracing: Clean up tb_fmt to not give faulty compile warning gcc incorrectly states that the variable "fmt" is uninitialized when CC_OPITMIZE_FOR_SIZE is set. Instead of just blindly setting fmt to NULL, the code is cleaned up a little to be a bit easier for humans to follow, as well as gcc to know the variables are initialized. Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 1f06468a10d7..6fd4ffd042f9 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) continue; } + fmt = NULL; tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); - if (tb_fmt) + if (tb_fmt) { fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt && fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(fmt, *iter); - tb_fmt->fmt = fmt; - *iter = tb_fmt->fmt; - } else { - kfree(tb_fmt); - *iter = NULL; + if (fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; + } else + kfree(tb_fmt); } + *iter = fmt; + } mutex_unlock(&btrace_mutex); } From b75ef8b44b1cb95f5a26484b0e2fe37a63b12b44 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 10 Aug 2011 15:18:39 -0400 Subject: [PATCH 02/20] Tracepoint: Dissociate from module mutex Copy the information needed from struct module into a local module list held within tracepoint.c from within the module coming/going notifier. This vastly simplifies locking of tracepoint registration / unregistration, because we don't have to take the module mutex to register and unregister tracepoints anymore. Steven Rostedt ran into dependency problems related to modules mutex vs kprobes mutex vs ftrace mutex vs tracepoint mutex that seems to be hard to fix without removing this dependency between tracepoint and module mutex. (note: it should be investigated whether kprobes could benefit of being dissociated from the modules mutex too.) This also fixes module handling of tracepoint list iterators, because it was expecting the list to be sorted by pointer address. Given we have control on our own list now, it's OK to sort this list which has tracepoints as its only purpose. The reason why this sorting is required is to handle the fact that seq files (and any read() operation from user-space) cannot hold the tracepoint mutex across multiple calls, so list entries may vanish between calls. With sorting, the tracepoint iterator becomes usable even if the list don't contain the exact item pointed to by the iterator anymore. Signed-off-by: Mathieu Desnoyers Acked-by: Jason Baron CC: Ingo Molnar CC: Lai Jiangshan CC: Peter Zijlstra CC: Thomas Gleixner CC: Masami Hiramatsu Link: http://lkml.kernel.org/r/20110810191839.GC8525@Krystal Signed-off-by: Steven Rostedt --- include/linux/module.h | 12 --- include/linux/tracepoint.h | 25 +++--- kernel/module.c | 47 ----------- kernel/tracepoint.c | 169 ++++++++++++++++++++++++++++++++----- 4 files changed, 158 insertions(+), 95 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index 1c30087a2d81..863921637d9f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -580,9 +580,6 @@ int unregister_module_notifier(struct notifier_block * nb); extern void print_modules(void); -extern void module_update_tracepoints(void); -extern int module_get_iter_tracepoints(struct tracepoint_iter *iter); - #else /* !CONFIG_MODULES... */ #define EXPORT_SYMBOL(sym) #define EXPORT_SYMBOL_GPL(sym) @@ -698,15 +695,6 @@ static inline int unregister_module_notifier(struct notifier_block * nb) static inline void print_modules(void) { } - -static inline void module_update_tracepoints(void) -{ -} - -static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) -{ - return 0; -} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d530a4460a0b..df0a779c1bbd 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -54,8 +54,18 @@ extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe, void *data); extern void tracepoint_probe_update_all(void); +#ifdef CONFIG_MODULES +struct tp_module { + struct list_head list; + unsigned int num_tracepoints; + struct tracepoint * const *tracepoints_ptrs; +}; +#endif /* CONFIG_MODULES */ + struct tracepoint_iter { - struct module *module; +#ifdef CONFIG_MODULES + struct tp_module *module; +#endif /* CONFIG_MODULES */ struct tracepoint * const *tracepoint; }; @@ -63,8 +73,6 @@ extern void tracepoint_iter_start(struct tracepoint_iter *iter); extern void tracepoint_iter_next(struct tracepoint_iter *iter); extern void tracepoint_iter_stop(struct tracepoint_iter *iter); extern void tracepoint_iter_reset(struct tracepoint_iter *iter); -extern int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, - struct tracepoint * const *begin, struct tracepoint * const *end); /* * tracepoint_synchronize_unregister must be called between the last tracepoint @@ -78,17 +86,6 @@ static inline void tracepoint_synchronize_unregister(void) #define PARAMS(args...) args -#ifdef CONFIG_TRACEPOINTS -extern -void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end); -#else -static inline -void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) -{ } -#endif /* CONFIG_TRACEPOINTS */ - #endif /* _LINUX_TRACEPOINT_H */ /* diff --git a/kernel/module.c b/kernel/module.c index 04379f92f843..93342d992f34 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3487,50 +3487,3 @@ void module_layout(struct module *mod, } EXPORT_SYMBOL(module_layout); #endif - -#ifdef CONFIG_TRACEPOINTS -void module_update_tracepoints(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) - if (!mod->taints) - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); - mutex_unlock(&module_mutex); -} - -/* - * Returns 0 if current not found. - * Returns 1 if current found. - */ -int module_get_iter_tracepoints(struct tracepoint_iter *iter) -{ - struct module *iter_mod; - int found = 0; - - mutex_lock(&module_mutex); - list_for_each_entry(iter_mod, &modules, list) { - if (!iter_mod->taints) { - /* - * Sorted module list - */ - if (iter_mod < iter->module) - continue; - else if (iter_mod > iter->module) - iter->tracepoint = NULL; - found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints_ptrs, - iter_mod->tracepoints_ptrs - + iter_mod->num_tracepoints); - if (found) { - iter->module = iter_mod; - break; - } - } - } - mutex_unlock(&module_mutex); - return found; -} -#endif diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index b219f1449c54..db110b8ae030 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[]; static const int tracepoint_debug; /* - * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the - * builtin and module tracepoints and the hash table. + * Tracepoints mutex protects the builtin and module tracepoints and the hash + * table, as well as the local module list. */ static DEFINE_MUTEX(tracepoints_mutex); +#ifdef CONFIG_MODULES +/* Local list of struct module */ +static LIST_HEAD(tracepoint_module_list); +#endif /* CONFIG_MODULES */ + /* * Tracepoint hash table, containing the active tracepoints. * Protected by tracepoints_mutex. @@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem) * @end: end of the range * * Updates the probe callback corresponding to a range of tracepoints. + * Called with tracepoints_mutex held. */ -void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) +static void tracepoint_update_probe_range(struct tracepoint * const *begin, + struct tracepoint * const *end) { struct tracepoint * const *iter; struct tracepoint_entry *mark_entry; @@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, if (!begin) return; - mutex_lock(&tracepoints_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_tracepoint((*iter)->name); if (mark_entry) { @@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, disable_tracepoint(*iter); } } - mutex_unlock(&tracepoints_mutex); } +#ifdef CONFIG_MODULES +void module_update_tracepoints(void) +{ + struct tp_module *tp_mod; + + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, + tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); +} +#else /* CONFIG_MODULES */ +void module_update_tracepoints(void) +{ +} +#endif /* CONFIG_MODULES */ + + /* * Update probes, removing the faulty probes. + * Called with tracepoints_mutex held. */ static void tracepoint_update_probes(void) { @@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data) mutex_lock(&tracepoints_mutex); old = tracepoint_add_probe(name, probe, data); - mutex_unlock(&tracepoints_mutex); - if (IS_ERR(old)) + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); - + } tracepoint_update_probes(); /* may update entry */ + mutex_unlock(&tracepoints_mutex); release_probes(old); return 0; } @@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data) mutex_lock(&tracepoints_mutex); old = tracepoint_remove_probe(name, probe, data); - mutex_unlock(&tracepoints_mutex); - if (IS_ERR(old)) + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); - + } tracepoint_update_probes(); /* may update entry */ + mutex_unlock(&tracepoints_mutex); release_probes(old); return 0; } @@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void) if (!list_empty(&old_probes)) list_replace_init(&old_probes, &release_probes); need_update = 0; - mutex_unlock(&tracepoints_mutex); - tracepoint_update_probes(); + mutex_unlock(&tracepoints_mutex); list_for_each_entry_safe(pos, next, &release_probes, u.list) { list_del(&pos->u.list); call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); @@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); * Will return the first tracepoint in the range if the input tracepoint is * NULL. */ -int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, +static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, struct tracepoint * const *begin, struct tracepoint * const *end) { if (!*tracepoint && begin != end) { @@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, return 1; return 0; } -EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); +#ifdef CONFIG_MODULES static void tracepoint_get_iter(struct tracepoint_iter *iter) { int found = 0; + struct tp_module *iter_mod; /* Core kernel tracepoints */ if (!iter->module) { @@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) if (found) goto end; } - /* tracepoints in modules. */ - found = module_get_iter_tracepoints(iter); + /* Tracepoints in modules */ + mutex_lock(&tracepoints_mutex); + list_for_each_entry(iter_mod, &tracepoint_module_list, list) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->tracepoint = NULL; + found = tracepoint_get_iter_range(&iter->tracepoint, + iter_mod->tracepoints_ptrs, + iter_mod->tracepoints_ptrs + + iter_mod->num_tracepoints); + if (found) { + iter->module = iter_mod; + break; + } + } + mutex_unlock(&tracepoints_mutex); end: if (!found) tracepoint_iter_reset(iter); } +#else /* CONFIG_MODULES */ +static void tracepoint_get_iter(struct tracepoint_iter *iter) +{ + int found = 0; + + /* Core kernel tracepoints */ + found = tracepoint_get_iter_range(&iter->tracepoint, + __start___tracepoints_ptrs, + __stop___tracepoints_ptrs); + if (!found) + tracepoint_iter_reset(iter); +} +#endif /* CONFIG_MODULES */ void tracepoint_iter_start(struct tracepoint_iter *iter) { @@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop); void tracepoint_iter_reset(struct tracepoint_iter *iter) { +#ifdef CONFIG_MODULES iter->module = NULL; +#endif /* CONFIG_MODULES */ iter->tracepoint = NULL; } EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +static int tracepoint_module_coming(struct module *mod) +{ + struct tp_module *tp_mod, *iter; + int ret = 0; + + /* + * We skip modules that tain the kernel, especially those with different + * module header (for forced load), to make sure we don't cause a crash. + */ + if (mod->taints) + return 0; + mutex_lock(&tracepoints_mutex); + tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); + if (!tp_mod) { + ret = -ENOMEM; + goto end; + } + tp_mod->num_tracepoints = mod->num_tracepoints; + tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; + + /* + * tracepoint_module_list is kept sorted by struct module pointer + * address for iteration on tracepoints from a seq_file that can release + * the mutex between calls. + */ + list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { + BUG_ON(iter == tp_mod); /* Should never be in the list twice */ + if (iter < tp_mod) { + /* We belong to the location right after iter. */ + list_add(&tp_mod->list, &iter->list); + goto module_added; + } + } + /* We belong to the beginning of the list */ + list_add(&tp_mod->list, &tracepoint_module_list); +module_added: + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} + +static int tracepoint_module_going(struct module *mod) +{ + struct tp_module *pos; + + mutex_lock(&tracepoints_mutex); + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); + list_for_each_entry(pos, &tracepoint_module_list, list) { + if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { + list_del(&pos->list); + kfree(pos); + break; + } + } + /* + * In the case of modules that were tainted at "coming", we'll simply + * walk through the list without finding it. We cannot use the "tainted" + * flag on "going", in case a module taints the kernel only after being + * loaded. + */ + mutex_unlock(&tracepoints_mutex); + return 0; +} int tracepoint_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; + int ret = 0; switch (val) { case MODULE_STATE_COMING: + ret = tracepoint_module_coming(mod); + break; + case MODULE_STATE_LIVE: + break; case MODULE_STATE_GOING: - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + ret = tracepoint_module_going(mod); break; } - return 0; + return ret; } struct notifier_block tracepoint_module_nb = { @@ -598,7 +724,6 @@ static int init_tracepoints(void) return register_module_notifier(&tracepoint_module_nb); } __initcall(init_tracepoints); - #endif /* CONFIG_MODULES */ #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS From 2f3aa7a06f6f48d6f78a90595b17e6beafa7abf6 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Tue, 16 Aug 2011 21:04:38 -0300 Subject: [PATCH 03/20] x86: jump_label: arch_jump_label_text_poke_early: add missing __init arch_jump_label_text_poke_early calls text_poke_early, which is an __init function. Thus arch_jump_label_text_poke_early should be the same. Signed-off-by: Kevin Winchester Cc: jbaron@redhat.com Cc: tglx@linutronix.de Cc: mingo@redhat.com Cc: hpa@zytor.com Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1313539478-30303-1-git-send-email-kjwinchester@gmail.com [ Use __init_or_module instead of __init ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 3fee346ef545..cacdd46d184d 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -42,7 +42,7 @@ void arch_jump_label_transform(struct jump_entry *entry, put_online_cpus(); } -void arch_jump_label_text_poke_early(jump_label_t addr) +void __init_or_module arch_jump_label_text_poke_early(jump_label_t addr) { text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); From 81570d9caaad46a056580c9af078c5c55e6c764f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:45 +0200 Subject: [PATCH 04/20] tracing/filter: Use static allocation for filter predicates Don't dynamically allocate filter_pred struct, use static memory. This way we can get rid of the code managing the dynamic filter_pred struct object. The create_pred function integrates create_logical_pred function. This way the static predicate memory is returned only from one place. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-2-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 57 +++++++++--------------------- 1 file changed, 16 insertions(+), 41 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 256764ecccd6..cb295a117ee7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -630,11 +630,7 @@ find_event_field(struct ftrace_event_call *call, char *name) static void filter_free_pred(struct filter_pred *pred) { - if (!pred) - return; - kfree(pred->field_name); - kfree(pred); } static void filter_clear_pred(struct filter_pred *pred) @@ -1302,39 +1298,30 @@ static int filter_parse(struct filter_parse_state *ps) return 0; } -static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +static struct filter_pred *create_pred(struct filter_parse_state *ps, + int op, char *operand1, char *operand2) { - struct filter_pred *pred; + static struct filter_pred pred; - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - return NULL; + memset(&pred, 0, sizeof(pred)); + pred.op = op; - pred->field_name = kstrdup(operand1, GFP_KERNEL); - if (!pred->field_name) { - kfree(pred); + if (op == OP_AND || op == OP_OR) + return &pred; + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); return NULL; } - strcpy(pred->regex.pattern, operand2); - pred->regex.len = strlen(pred->regex.pattern); - - pred->op = op; - - return pred; -} - -static struct filter_pred *create_logical_pred(int op) -{ - struct filter_pred *pred; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + pred.field_name = kstrdup(operand1, GFP_KERNEL); + if (!pred.field_name) return NULL; - pred->op = op; + strcpy(pred.regex.pattern, operand2); + pred.regex.len = strlen(pred.regex.pattern); - return pred; + return &pred; } static int check_preds(struct filter_parse_state *ps) @@ -1643,19 +1630,7 @@ static int replace_preds(struct ftrace_event_call *call, goto fail; } - if (elt->op == OP_AND || elt->op == OP_OR) { - pred = create_logical_pred(elt->op); - goto add_pred; - } - - if (!operand1 || !operand2) { - parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - err = -EINVAL; - goto fail; - } - - pred = create_pred(elt->op, operand1, operand2); -add_pred: + pred = create_pred(ps, elt->op, operand1, operand2); if (!pred) { err = -ENOMEM; goto fail; From 9d96cd1743547f07a8a6c51a3f7741cfca0a0bee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:46 +0200 Subject: [PATCH 05/20] tracing/filter: Separate predicate init and filter addition Making the code cleaner by having one function to fully prepare the predicate (create_pred), and another to add the predicate to the filter (filter_add_pred). As a benefit, this way the dry_run flag stays only inside the replace_preds function and is not passed deeper. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-3-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 56 ++++++++++++------------------ 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index cb295a117ee7..61c8dec3b2b7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -685,8 +685,7 @@ __pop_pred_stack(struct pred_stack *stack) static int filter_set_pred(struct event_filter *filter, int idx, struct pred_stack *stack, - struct filter_pred *src, - filter_pred_fn_t fn) + struct filter_pred *src) { struct filter_pred *dest = &filter->preds[idx]; struct filter_pred *left; @@ -698,7 +697,6 @@ static int filter_set_pred(struct event_filter *filter, if (!dest->field_name) return -ENOMEM; } - dest->fn = fn; dest->index = idx; if (dest->op == OP_OR || dest->op == OP_AND) { @@ -836,12 +834,10 @@ static void filter_free_subsystem_filters(struct event_subsystem *system) } } -static int filter_add_pred_fn(struct filter_parse_state *ps, - struct ftrace_event_call *call, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack, - filter_pred_fn_t fn) +static int filter_add_pred(struct filter_parse_state *ps, + struct event_filter *filter, + struct filter_pred *pred, + struct pred_stack *stack) { int idx, err; @@ -852,7 +848,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, idx = filter->n_preds; filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(filter, idx, stack, pred, fn); + err = filter_set_pred(filter, idx, stack, pred); if (err) return err; @@ -933,25 +929,16 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, return fn; } -static int filter_add_pred(struct filter_parse_state *ps, - struct ftrace_event_call *call, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack, - bool dry_run) +static int init_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred) + { struct ftrace_event_field *field; - filter_pred_fn_t fn; + filter_pred_fn_t fn = filter_pred_none; unsigned long long val; int ret; - fn = pred->fn = filter_pred_none; - - if (pred->op == OP_AND) - goto add_pred_fn; - else if (pred->op == OP_OR) - goto add_pred_fn; - field = find_event_field(call, pred->field_name); if (!field) { parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); @@ -997,9 +984,7 @@ static int filter_add_pred(struct filter_parse_state *ps, if (pred->op == OP_NE) pred->not = 1; -add_pred_fn: - if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, stack, fn); + pred->fn = fn; return 0; } @@ -1299,6 +1284,7 @@ static int filter_parse(struct filter_parse_state *ps) } static struct filter_pred *create_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, int op, char *operand1, char *operand2) { static struct filter_pred pred; @@ -1321,7 +1307,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, strcpy(pred.regex.pattern, operand2); pred.regex.len = strlen(pred.regex.pattern); - return &pred; + return init_pred(ps, call, &pred) ? NULL : &pred; } static int check_preds(struct filter_parse_state *ps) @@ -1630,16 +1616,20 @@ static int replace_preds(struct ftrace_event_call *call, goto fail; } - pred = create_pred(ps, elt->op, operand1, operand2); + pred = create_pred(ps, call, elt->op, operand1, operand2); if (!pred) { err = -ENOMEM; goto fail; } - err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); - filter_free_pred(pred); - if (err) - goto fail; + if (!dry_run) { + err = filter_add_pred(ps, filter, pred, &stack); + if (err) { + filter_free_pred(pred); + goto fail; + } + } + filter_free_pred(pred); operand1 = operand2 = NULL; } From 61aaef55300088e12d7f853adeea65d1aa1562db Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:47 +0200 Subject: [PATCH 06/20] tracing/filter: Remove field_name from filter_pred struct The field_name was used just for finding event's fields. This way we don't need to care about field_name allocation/free. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-4-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 11 +------ kernel/trace/trace_events_filter.c | 53 +++++++----------------------- 2 files changed, 13 insertions(+), 51 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 616846bcfee5..2eb3cf6d37bc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -761,16 +761,7 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - /* - * Leaf nodes use field_name, ops is used by AND and OR - * nodes. The field_name is always freed when freeing a pred. - * We can overload field_name for ops and have it freed - * as well. - */ - union { - char *field_name; - unsigned short *ops; - }; + unsigned short *ops; int offset; int not; int op; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 61c8dec3b2b7..97b93f31884b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -628,18 +628,6 @@ find_event_field(struct ftrace_event_call *call, char *name) return __find_event_field(head, name); } -static void filter_free_pred(struct filter_pred *pred) -{ - kfree(pred->field_name); -} - -static void filter_clear_pred(struct filter_pred *pred) -{ - kfree(pred->field_name); - pred->field_name = NULL; - pred->regex.len = 0; -} - static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); @@ -692,11 +680,6 @@ static int filter_set_pred(struct event_filter *filter, struct filter_pred *right; *dest = *src; - if (src->field_name) { - dest->field_name = kstrdup(src->field_name, GFP_KERNEL); - if (!dest->field_name) - return -ENOMEM; - } dest->index = idx; if (dest->op == OP_OR || dest->op == OP_AND) { @@ -737,11 +720,7 @@ static int filter_set_pred(struct event_filter *filter, static void __free_preds(struct event_filter *filter) { - int i; - if (filter->preds) { - for (i = 0; i < filter->a_preds; i++) - kfree(filter->preds[i].field_name); kfree(filter->preds); filter->preds = NULL; } @@ -839,16 +818,14 @@ static int filter_add_pred(struct filter_parse_state *ps, struct filter_pred *pred, struct pred_stack *stack) { - int idx, err; + int err; if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } - idx = filter->n_preds; - filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(filter, idx, stack, pred); + err = filter_set_pred(filter, filter->n_preds, stack, pred); if (err) return err; @@ -930,21 +907,14 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, } static int init_pred(struct filter_parse_state *ps, - struct ftrace_event_call *call, + struct ftrace_event_field *field, struct filter_pred *pred) { - struct ftrace_event_field *field; filter_pred_fn_t fn = filter_pred_none; unsigned long long val; int ret; - field = find_event_field(call, pred->field_name); - if (!field) { - parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); - return -EINVAL; - } - pred->offset = field->offset; if (!is_legal_op(field, pred->op)) { @@ -1287,6 +1257,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, int op, char *operand1, char *operand2) { + struct ftrace_event_field *field; static struct filter_pred pred; memset(&pred, 0, sizeof(pred)); @@ -1300,14 +1271,16 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, return NULL; } - pred.field_name = kstrdup(operand1, GFP_KERNEL); - if (!pred.field_name) + field = find_event_field(call, operand1); + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return NULL; + } strcpy(pred.regex.pattern, operand2); pred.regex.len = strlen(pred.regex.pattern); - return init_pred(ps, call, &pred) ? NULL : &pred; + return init_pred(ps, field, &pred) ? NULL : &pred; } static int check_preds(struct filter_parse_state *ps) @@ -1618,18 +1591,16 @@ static int replace_preds(struct ftrace_event_call *call, pred = create_pred(ps, call, elt->op, operand1, operand2); if (!pred) { - err = -ENOMEM; + err = -EINVAL; goto fail; } + if (!dry_run) { err = filter_add_pred(ps, filter, pred, &stack); - if (err) { - filter_free_pred(pred); + if (err) goto fail; - } } - filter_free_pred(pred); operand1 = operand2 = NULL; } From 3f78f935e7fec3067b7990f9e4d324e1de70f79c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:48 +0200 Subject: [PATCH 07/20] tracing/filter: Simplify tracepoint event lookup We dont need to perform lookup through the ftrace_events list, instead we can use the 'tp_event' field. Each perf_event contains tracepoint event field 'tp_event', which got initialized during the tracepoint event initialization. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-5-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 97b93f31884b..0948905dd39c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1894,17 +1894,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, int err; struct event_filter *filter; struct filter_parse_state *ps; - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (call->event.type == event_id) - break; - } + call = event->tp_event; err = -EINVAL; - if (&call->list == &ftrace_events) + if (!call) goto out_unlock; err = -EEXIST; From f03f5979945c573801c25ba3089ef17c4d7edc61 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:49 +0200 Subject: [PATCH 08/20] tracing/filter: Unify predicate tree walking, change check_pred_tree function to use it Adding walk_pred_tree function to be used for walking throught the filter predicates. For each predicate the callback function is called, allowing users to add their own functionality or customize their way through the filter predicates. Changing check_pred_tree function to use walk_pred_tree. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-6-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 137 ++++++++++++++++++----------- 1 file changed, 86 insertions(+), 51 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0948905dd39c..5b889d43d856 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -381,6 +381,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, return pred; } +enum walk_return { + WALK_PRED_ABORT, + WALK_PRED_PARENT, + WALK_PRED_DEFAULT, +}; + +typedef int (*filter_pred_walkcb_t) (enum move_type move, + struct filter_pred *pred, + int *err, void *data); + +static int walk_pred_tree(struct filter_pred *preds, + struct filter_pred *root, + filter_pred_walkcb_t cb, void *data) +{ + struct filter_pred *pred = root; + enum move_type move = MOVE_DOWN; + int done = 0; + + if (!preds) + return -EINVAL; + + do { + int err = 0, ret; + + ret = cb(move, pred, &err, data); + if (ret == WALK_PRED_ABORT) + return err; + if (ret == WALK_PRED_PARENT) + goto get_parent; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + goto get_parent; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + get_parent: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, + &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + /* * A series of AND or ORs where found together. Instead of * climbing up and down the tree branches, an array of the @@ -1321,6 +1378,23 @@ static int count_preds(struct filter_parse_state *ps) return n_preds; } +struct check_pred_data { + int count; + int max; +}; + +static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct check_pred_data *d = data; + + if (WARN_ON(d->count++ > d->max)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + return WALK_PRED_DEFAULT; +} + /* * The tree is walked at filtering of an event. If the tree is not correctly * built, it may cause an infinite loop. Check here that the tree does @@ -1329,58 +1403,19 @@ static int count_preds(struct filter_parse_state *ps) static int check_pred_tree(struct event_filter *filter, struct filter_pred *root) { - struct filter_pred *preds; - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; - int done = 0; - int max; + struct check_pred_data data = { + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + .max = 3 * filter->n_preds, + .count = 0, + }; - /* - * The max that we can hit a node is three times. - * Once going down, once coming up from left, and - * once coming up from right. This is more than enough - * since leafs are only hit a single time. - */ - max = 3 * filter->n_preds; - - preds = filter->preds; - if (!preds) - return -EINVAL; - pred = root; - - do { - if (WARN_ON(count++ > max)) - return -EINVAL; - - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - /* We are fine. */ - return 0; + return walk_pred_tree(filter->preds, root, + check_pred_tree_cb, &data); } static int count_leafs(struct filter_pred *preds, struct filter_pred *root) From c00b060f36e1238816ebcf2c8cccd5e9fa068980 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:50 +0200 Subject: [PATCH 09/20] tracing/filter: Change count_leafs function to use walk_pred_tree Changing count_leafs function to use unified predicates tree processing. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-7-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 49 +++++++++--------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5b889d43d856..ebbb2611982e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1418,43 +1418,24 @@ static int check_pred_tree(struct event_filter *filter, check_pred_tree_cb, &data); } +static int count_leafs_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + int *count = data; + + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) + (*count)++; + + return WALK_PRED_DEFAULT; +} + static int count_leafs(struct filter_pred *preds, struct filter_pred *root) { - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; - int done = 0; - - pred = root; - - do { - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - return 1; - count++; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); + int count = 0, ret; + ret = walk_pred_tree(preds, root, count_leafs_cb, &count); + WARN_ON(ret); return count; } From 1b797fe5aaac11e60fce1592119d0517e95aba95 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:51 +0200 Subject: [PATCH 10/20] tracing/filter: Change fold_pred_tree function to use walk_pred_tree Changing fold_pred_tree function to use unified predicates tree processing. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-8-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 65 +++++++++--------------------- 1 file changed, 20 insertions(+), 45 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index ebbb2611982e..d8aa100cb22e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1496,6 +1496,24 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) return 0; } +static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_pred *preds = data; + + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (!(pred->index & FILTER_PRED_FOLD)) + return WALK_PRED_DEFAULT; + + *err = fold_pred(preds, pred); + if (*err) + return WALK_PRED_ABORT; + + /* eveyrhing below is folded, continue with parent */ + return WALK_PRED_PARENT; +} + /* * To optimize the processing of the ops, if we have several "ors" or * "ands" together, we can put them in an array and process them all @@ -1504,51 +1522,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) static int fold_pred_tree(struct event_filter *filter, struct filter_pred *root) { - struct filter_pred *preds; - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int done = 0; - int err; - - preds = filter->preds; - if (!preds) - return -EINVAL; - pred = root; - - do { - switch (move) { - case MOVE_DOWN: - if (pred->index & FILTER_PRED_FOLD) { - err = fold_pred(preds, pred); - if (err) - return err; - /* Folded nodes are like leafs */ - } else if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - return 0; + return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, + filter->preds); } static int replace_preds(struct ftrace_event_call *call, From 96bc293a97e7f1651977976be7dd42031f6d8ea3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:52 +0200 Subject: [PATCH 11/20] tracing/filter: Change fold_pred function to use walk_pred_tree Changing fold_pred_tree function to use unified predicates tree processing. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-9-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 68 +++++++++++++++--------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d8aa100cb22e..f44e68b89f15 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1439,13 +1439,40 @@ static int count_leafs(struct filter_pred *preds, struct filter_pred *root) return count; } +struct fold_pred_data { + struct filter_pred *root; + int count; + int children; +}; + +static int fold_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct fold_pred_data *d = data; + struct filter_pred *root = d->root; + + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (pred->left != FILTER_PRED_INVALID) + return WALK_PRED_DEFAULT; + + if (WARN_ON(d->count == d->children)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + + pred->index &= ~FILTER_PRED_FOLD; + root->ops[d->count++] = pred->index; + return WALK_PRED_DEFAULT; +} + static int fold_pred(struct filter_pred *preds, struct filter_pred *root) { - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; + struct fold_pred_data data = { + .root = root, + .count = 0, + }; int children; - int done = 0; /* No need to keep the fold flag */ root->index &= ~FILTER_PRED_FOLD; @@ -1463,37 +1490,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) return -ENOMEM; root->val = children; - - pred = root; - do { - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - if (WARN_ON(count == children)) - return -EINVAL; - pred->index &= ~FILTER_PRED_FOLD; - root->ops[count++] = pred->index; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - return 0; + data.children = children; + return walk_pred_tree(preds, root, fold_pred_cb, &data); } static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, From f30120fce1efaa426f340a354d5ace36dab59f0e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:53 +0200 Subject: [PATCH 12/20] tracing/filter: Change filter_match_preds function to use walk_pred_tree Changing filter_match_preds function to use unified predicates tree processing. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-10-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 124 ++++++++++++++--------------- 1 file changed, 58 insertions(+), 66 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f44e68b89f15..319c3cac7d95 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -467,99 +467,91 @@ static int process_ops(struct filter_pred *preds, for (i = 0; i < op->val; i++) { pred = &preds[op->ops[i]]; - match = pred->fn(pred, rec); + if (!WARN_ON_ONCE(!pred->fn)) + match = pred->fn(pred, rec); if (!!match == type) return match; } return match; } +struct filter_match_preds_data { + struct filter_pred *preds; + int match; + void *rec; +}; + +static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_match_preds_data *d = data; + + *err = 0; + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. */ + if (!pred->ops) + return WALK_PRED_DEFAULT; + /* We can treat folded ops as a leaf node */ + d->match = process_ops(d->preds, pred, d->rec); + } else { + if (!WARN_ON_ONCE(!pred->fn)) + d->match = pred->fn(pred, d->rec); + } + + return WALK_PRED_PARENT; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!d->match == (pred->op == OP_OR)) + return WALK_PRED_PARENT; + break; + case MOVE_UP_FROM_RIGHT: + break; + } + + return WALK_PRED_DEFAULT; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1; - enum move_type move = MOVE_DOWN; struct filter_pred *preds; - struct filter_pred *pred; struct filter_pred *root; - int n_preds; - int done = 0; + struct filter_match_preds_data data = { + /* match is currently meaningless */ + .match = -1, + .rec = rec, + }; + int n_preds, ret; /* no filter is considered a match */ if (!filter) return 1; n_preds = filter->n_preds; - if (!n_preds) return 1; /* * n_preds, root and filter->preds are protect with preemption disabled. */ - preds = rcu_dereference_sched(filter->preds); root = rcu_dereference_sched(filter->root); if (!root) return 1; - pred = root; - - /* match is currently meaningless */ - match = -1; - - do { - switch (move) { - case MOVE_DOWN: - /* only AND and OR have children */ - if (pred->left != FILTER_PRED_INVALID) { - /* If ops is set, then it was folded. */ - if (!pred->ops) { - /* keep going to down the left side */ - pred = &preds[pred->left]; - continue; - } - /* We can treat folded ops as a leaf node */ - match = process_ops(preds, pred, rec); - } else - match = pred->fn(pred, rec); - /* If this pred is the only pred */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - /* - * Check for short circuits. - * - * Optimization: !!match == (pred->op == OP_OR) - * is the same as: - * if ((match && pred->op == OP_OR) || - * (!match && pred->op == OP_AND)) - */ - if (!!match == (pred->op == OP_OR)) { - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - /* now go down the right side of the tree. */ - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - /* We finished this equation. */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - return match; + data.preds = preds = rcu_dereference_sched(filter->preds); + ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); + WARN_ON(ret); + return data.match; } EXPORT_SYMBOL_GPL(filter_match_preds); From 1d0e78e380cd2802aa603a50e08220dfc681141c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Aug 2011 16:25:54 +0200 Subject: [PATCH 13/20] tracing/filter: Add startup tests for events filter Adding automated tests running as late_initcall. Tests are compiled in with CONFIG_FTRACE_STARTUP_TEST option. Adding test event "ftrace_test_filter" used to simulate filter processing during event occurance. String filters are compiled and tested against several test events with different values. Also testing that evaluation of explicit predicates is ommited due to the lazy filter evaluation. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1313072754-4620-11-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/Makefile | 2 + kernel/trace/trace.h | 3 + kernel/trace/trace_events_filter.c | 209 ++++++++++++++++++++++++ kernel/trace/trace_events_filter_test.h | 50 ++++++ 4 files changed, 264 insertions(+) create mode 100644 kernel/trace/trace_events_filter_test.h diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c5..b384ed512bac 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_trace_events_filter.o := -I$(src) + # # Make the trace clocks available generally: it's infrastructure # relied on by ptrace for example: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2eb3cf6d37bc..4c7540ad5dcb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -762,6 +762,9 @@ struct filter_pred { u64 val; struct regex regex; unsigned short *ops; +#ifdef CONFIG_FTRACE_STARTUP_TEST + struct ftrace_event_field *field; +#endif int offset; int not; int op; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 319c3cac7d95..6a642e278241 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1329,6 +1329,9 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, strcpy(pred.regex.pattern, operand2); pred.regex.len = strlen(pred.regex.pattern); +#ifdef CONFIG_FTRACE_STARTUP_TEST + pred.field = field; +#endif return init_pred(ps, field, &pred) ? NULL : &pred; } @@ -1926,3 +1929,209 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, #endif /* CONFIG_PERF_EVENTS */ +#ifdef CONFIG_FTRACE_STARTUP_TEST + +#include +#include + +#define CREATE_TRACE_POINTS +#include "trace_events_filter_test.h" + +static int test_get_filter(char *filter_str, struct ftrace_event_call *call, + struct event_filter **pfilter) +{ + struct event_filter *filter; + struct filter_parse_state *ps; + int err = -ENOMEM; + + filter = __alloc_filter(); + if (!filter) + goto out; + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_filter; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + *pfilter = filter; + + free_ps: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + free_filter: + if (err) + __free_filter(filter); + + out: + return err; +} + +#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ +{ \ + .filter = FILTER, \ + .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ + .e = ve, .f = vf, .g = vg, .h = vh }, \ + .match = m, \ + .not_visited = nvisit, \ +} +#define YES 1 +#define NO 0 + +static struct test_filter_data_t { + char *filter; + struct ftrace_raw_ftrace_test_filter rec; + int match; + char *not_visited; +} test_filter_data[] = { +#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ + "e == 1 && f == 1 && g == 1 && h == 1" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), + DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), + DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), +#undef FILTER +#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ + "e == 1 || f == 1 || g == 1 || h == 1" + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), +#undef FILTER +#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ + "(e == 1 || f == 1) && (g == 1 || h == 1)" + DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), + DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), +#undef FILTER +#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), +#undef FILTER +#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), +#undef FILTER +#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ + "(e == 1 || f == 1)) && (g == 1 || h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), +#undef FILTER +#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ + "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), + DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), +#undef FILTER +#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ + "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), +}; + +#undef DATA_REC +#undef FILTER +#undef YES +#undef NO + +#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) + +static int test_pred_visited; + +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + struct ftrace_event_field *field = pred->field; + + test_pred_visited = 1; + printk(KERN_INFO "\npred visited %s\n", field->name); + return 1; +} + +static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + char *fields = data; + + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) { + struct ftrace_event_field *field = pred->field; + + if (!field) { + WARN(1, "all leafs should have field defined"); + return WALK_PRED_DEFAULT; + } + if (!strchr(fields, *field->name)) + return WALK_PRED_DEFAULT; + + WARN_ON(!pred->fn); + pred->fn = test_pred_visited_fn; + } + return WALK_PRED_DEFAULT; +} + +static __init int ftrace_test_event_filter(void) +{ + int i; + + printk(KERN_INFO "Testing ftrace filter: "); + + for (i = 0; i < DATA_CNT; i++) { + struct event_filter *filter = NULL; + struct test_filter_data_t *d = &test_filter_data[i]; + int err; + + err = test_get_filter(d->filter, &event_ftrace_test_filter, + &filter); + if (err) { + printk(KERN_INFO + "Failed to get filter for '%s', err %d\n", + d->filter, err); + break; + } + + if (*d->not_visited) + walk_pred_tree(filter->preds, filter->root, + test_walk_pred_cb, + d->not_visited); + + test_pred_visited = 0; + err = filter_match_preds(filter, &d->rec); + + __free_filter(filter); + + if (test_pred_visited) { + printk(KERN_INFO + "Failed, unwanted pred visited for filter %s\n", + d->filter); + break; + } + + if (err != d->match) { + printk(KERN_INFO + "Failed to match filter '%s', expected %d\n", + d->filter, d->match); + break; + } + } + + if (i == DATA_CNT) + printk(KERN_CONT "OK\n"); + + return 0; +} + +late_initcall(ftrace_test_event_filter); + +#endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h new file mode 100644 index 000000000000..bfd4dba0d603 --- /dev/null +++ b/kernel/trace/trace_events_filter_test.h @@ -0,0 +1,50 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM test + +#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TEST_H + +#include + +TRACE_EVENT(ftrace_test_filter, + + TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h), + + TP_ARGS(a, b, c, d, e, f, g, h), + + TP_STRUCT__entry( + __field(int, a) + __field(int, b) + __field(int, c) + __field(int, d) + __field(int, e) + __field(int, f) + __field(int, g) + __field(int, h) + ), + + TP_fast_assign( + __entry->a = a; + __entry->b = b; + __entry->c = c; + __entry->d = d; + __entry->e = e; + __entry->f = f; + __entry->g = g; + __entry->h = h; + ), + + TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d", + __entry->a, __entry->b, __entry->c, __entry->d, + __entry->e, __entry->f, __entry->g, __entry->h) +); + +#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_events_filter_test + +/* This part must be outside protection */ +#include From 86b6ef21b80ac6565d172cdab4384404de007eea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 22 Aug 2011 09:41:46 -0400 Subject: [PATCH 14/20] tracing: Add preempt disable for filter self test The self testing for event filters does not really need preemption disabled as there are no races at the time of testing, but the functions it calls uses rcu_dereference_sched() which will complain if preemption is not disabled. Cc: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 6a642e278241..816d3d074979 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2101,6 +2101,11 @@ static __init int ftrace_test_event_filter(void) break; } + /* + * The preemption disabling is not really needed for self + * tests, but the rcu dereference will complain without it. + */ + preempt_disable(); if (*d->not_visited) walk_pred_tree(filter->preds, filter->root, test_walk_pred_cb, @@ -2108,6 +2113,7 @@ static __init int ftrace_test_event_filter(void) test_pred_visited = 0; err = filter_match_preds(filter, &d->rec); + preempt_enable(); __free_filter(filter); From f81ab074c30234b07c8309c542cafd07bed721f7 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 16 Aug 2011 14:46:15 -0700 Subject: [PATCH 15/20] trace: Add a new readonly entry to report total buffer size The current file "buffer_size_kb" reports the size of per-cpu buffer and not the overall memory allocated which could be misleading. A new file "buffer_total_size_kb" adds up all the enabled CPU buffer sizes and reports it. This is only a readonly entry. Signed-off-by: Vaibhav Nagarnaik Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1313531179-9323-2-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5df02c69b1d..01176788c387 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3568,6 +3568,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t +tracing_total_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r, cpu; + unsigned long size = 0, expanded_size = 0; + + mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + size += tr->entries >> 10; + if (!ring_buffer_expanded) + expanded_size += trace_buf_size >> 10; + } + if (ring_buffer_expanded) + r = sprintf(buf, "%lu\n", size); + else + r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3739,6 +3763,12 @@ static const struct file_operations tracing_entries_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations tracing_total_entries_fops = { + .open = tracing_open_generic, + .read = tracing_total_entries_read, + .llseek = generic_file_llseek, +}; + static const struct file_operations tracing_free_buffer_fops = { .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, @@ -4450,6 +4480,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); + trace_create_file("buffer_total_size_kb", 0444, d_tracer, + &global_trace, &tracing_total_entries_fops); + trace_create_file("free_buffer", 0644, d_tracer, &global_trace, &tracing_free_buffer_fops); From c64e148a3be3cb786534ad38298c25c833116c26 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 16 Aug 2011 14:46:16 -0700 Subject: [PATCH 16/20] trace: Add ring buffer stats to measure rate of events The stats file under per_cpu folder provides the number of entries, overruns and other statistics about the CPU ring buffer. However, the numbers do not provide any indication of how full the ring buffer is in bytes compared to the overall size in bytes. Also, it is helpful to know the rate at which the cpu buffer is filling up. This patch adds an entry "bytes: " in printed stats for per_cpu ring buffer which provides the actual bytes consumed in the ring buffer. This field includes the number of bytes used by recorded events and the padding bytes added when moving the tail pointer to next page. It also adds the following time stamps: "oldest event ts:" - the oldest timestamp in the ring buffer "now ts:" - the timestamp at the time of reading The field "now ts" provides a consistent time snapshot to the userspace when being read. This is read from the same trace clock used by tracing event timestamps. Together, these values provide the rate at which the buffer is filling up, from the formula: bytes / (now_ts - oldest_event_ts) Signed-off-by: Vaibhav Nagarnaik Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1313531179-9323-3-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 70 ++++++++++++++++++++++++++++++++++++- kernel/trace/trace.c | 13 +++++++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b891de96000f..67be0376d8e3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -154,6 +154,8 @@ void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_entries(struct ring_buffer *buffer); unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 731201bf4acc..acf6b68dc4a8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -488,12 +488,14 @@ struct ring_buffer_per_cpu { struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; + local_t entries_bytes; local_t commit_overrun; local_t overrun; local_t entries; local_t committing; local_t commits; unsigned long read; + unsigned long read_bytes; u64 write_stamp; u64 read_stamp; }; @@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); /* * The entries will be zeroed out when we move the @@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (!tail) tail_page->page->time_stamp = ts; + /* account for these added bytes */ + local_add(length, &cpu_buffer->entries_bytes); + return event; } @@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); /* * This is on the tail page. It is possible that * a write could come in and move the tail page @@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, old_index += write_mask; new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); return 1; + } } /* could not discard */ @@ -2660,6 +2673,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) (local_read(&cpu_buffer->overrun) + cpu_buffer->read); } +/** + * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +{ + unsigned long flags; + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* + * if the tail is on reader_page, oldest time stamp is on the reader + * page + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + bpage = cpu_buffer->reader_page; + else + bpage = rb_set_head_page(cpu_buffer); + ret = bpage->page->time_stamp; + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); + +/** + * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); + /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer @@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read = 0; local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); + cpu_buffer->read_bytes += BUF_PAGE_SIZE; /* swap the pages */ rb_init_page(bpage); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 01176788c387..b41907000bc6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4056,6 +4056,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, struct trace_array *tr = &global_trace; struct trace_seq *s; unsigned long cnt; + unsigned long long t; + unsigned long usec_rem; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -4072,6 +4074,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); + cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + trace_seq_printf(s, "bytes: %ld\n", cnt); + + t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); From 6249687f76b69cc0b2ad34636f4a18d693ef3262 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Sep 2011 11:35:58 -0400 Subject: [PATCH 17/20] tracing: Add a counter clock for those that do not trust clocks When debugging tight race conditions, it can be helpful to have a synchronized tracing method. Although in most cases the global clock provides this functionality, if timings is not the issue, it is more comforting to know that the order of events really happened in a precise order. Instead of using a clock, add a "counter" that is simply an incrementing atomic 64bit counter that orders the events as they are perceived to happen. The trace_clock_counter() is added from the attempt by Peter Zijlstra trying to convert the trace_clock_global() to it. I took Peter's counter code and made trace_clock_counter() instead, and added it to the choice of clocks. Just echo counter > /debug/tracing/trace_clock to activate it. Requested-by: Thomas Gleixner Requested-by: Peter Zijlstra Reviewed-By: Valdis Kletnieks Signed-off-by: Steven Rostedt --- include/linux/trace_clock.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_clock.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h index 7a8130384087..4eb490237d4c 100644 --- a/include/linux/trace_clock.h +++ b/include/linux/trace_clock.h @@ -15,5 +15,6 @@ extern u64 notrace trace_clock_local(void); extern u64 notrace trace_clock(void); extern u64 notrace trace_clock_global(void); +extern u64 notrace trace_clock_counter(void); #endif /* _LINUX_TRACE_CLOCK_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b41907000bc6..4b8df0dc9358 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -435,6 +435,7 @@ static struct { } trace_clocks[] = { { trace_clock_local, "local" }, { trace_clock_global, "global" }, + { trace_clock_counter, "counter" }, }; int trace_clock_id; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6302747a1398..394783531cbb 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void) return now; } + +static atomic64_t trace_counter; + +/* + * trace_clock_counter(): simply an atomic counter. + * Use the trace_counter "counter" for cases where you do not care + * about timings, but are interested in strict ordering. + */ +u64 notrace trace_clock_counter(void) +{ + return atomic64_add_return(1, &trace_counter); +} From e36de1de4a5f95b7cb3e5c37d10e6bbb91833ef0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Sep 2011 11:11:51 -0400 Subject: [PATCH 18/20] tracing: Fix preemptirqsoff tracer to not stop at preempt off If irqs are disabled when preemption count reaches zero, the preemptirqsoff tracer should not flag that as the end. When interrupts are enabled and preemption count is not zero the preemptirqsoff correctly continues its tracing. Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 667aa8cc0cfc..a1a3359996a7 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -505,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); } #endif /* CONFIG_PREEMPT_TRACER */ From e0a413f619ef8bc366dafc6f8221674993b8d85f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 Sep 2011 21:26:16 -0400 Subject: [PATCH 19/20] tracing: Warn on output if the function tracer was found corrupted As the function tracer is very intrusive, lots of self checks are performed on the tracer and if something is found to be strange it will shut itself down keeping it from corrupting the rest of the kernel. This shutdown may still allow functions to be traced, as the tracing only stops new modifications from happening. Trying to stop the function tracer itself can cause more harm as it requires code modification. Although a WARN_ON() is executed, a user may not notice it. To help the user see that something isn't right with the tracing of the system a big warning is added to the output of the tracer that lets the user know that their data may be incomplete. Reported-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++++++ kernel/trace/trace.c | 15 +++++++++++++++ kernel/trace/trace.h | 2 ++ 3 files changed, 25 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c3e4575e7829..077d85387908 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3862,6 +3862,14 @@ void ftrace_kill(void) clear_ftrace_function(); } +/** + * Test if ftrace is dead or not. + */ +int ftrace_is_dead(void) +{ + return ftrace_disabled; +} + /** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4b8df0dc9358..13f2b8472fed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2160,6 +2160,14 @@ void trace_default_header(struct seq_file *m) } } +static void test_ftrace_alive(struct seq_file *m) +{ + if (!ftrace_is_dead()) + return; + seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2169,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v) if (iter->tr) { seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); + test_ftrace_alive(m); } if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); @@ -4613,6 +4622,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) tracing_off(); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + if (disable_tracing) ftrace_kill(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4c7540ad5dcb..092e1f8d18dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } +extern int ftrace_is_dead(void); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } +static inline int ftrace_is_dead(void) { return 0; } #endif /* From d696b58ca2c3ca76e784ef89a7e0453d9b7ab187 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Sep 2011 11:50:27 -0400 Subject: [PATCH 20/20] tracing: Do not allocate buffer for trace_marker When doing intense tracing, the kmalloc inside trace_marker can introduce side effects to what is being traced. As trace_marker() is used by userspace to inject data into the kernel ring buffer, it needs to do so with the least amount of intrusion to the operations of the kernel or the user space application. As the ring buffer is designed to write directly into the buffer without the need to make a temporary buffer, and userspace already went through the hassle of knowing how big the write will be, we can simply pin the userspace pages and write the data directly into the buffer. This improves the impact of tracing via trace_marker tremendously! Thanks to Peter Zijlstra and Thomas Gleixner for pointing out the use of get_user_pages_fast() and kmap_atomic(). Suggested-by: Thomas Gleixner Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 113 ++++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 29 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 13f2b8472fed..f86efe90ca45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3628,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static int mark_printk(const char *fmt, ...) -{ - int ret; - va_list args; - va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); - va_end(args); - return ret; -} - static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { - char *buf; - size_t written; + unsigned long addr = (unsigned long)ubuf; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + int nr_pages = 1; + ssize_t written; + void *page1; + void *page2; + int offset; + int size; + int len; + int ret; if (tracing_disabled) return -EINVAL; @@ -3651,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - buf = kmalloc(cnt + 2, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; + /* + * Userspace is injecting traces into the kernel trace buffer. + * We want to be as non intrusive as possible. + * To do so, we do not want to allocate any special buffers + * or take any locks, but instead write the userspace data + * straight into the ring buffer. + * + * First we need to pin the userspace buffer into memory, + * which, most likely it is, because it just referenced it. + * But there's no guarantee that it is. By using get_user_pages_fast() + * and kmap_atomic/kunmap_atomic() we can get access to the + * pages directly. We then write the data directly into the + * ring buffer. + */ + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - if (copy_from_user(buf, ubuf, cnt)) { - kfree(buf); - return -EFAULT; + /* check if we cross pages */ + if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) + nr_pages = 2; + + offset = addr & (PAGE_SIZE - 1); + addr &= PAGE_MASK; + + ret = get_user_pages_fast(addr, nr_pages, 0, pages); + if (ret < nr_pages) { + while (--ret >= 0) + put_page(pages[ret]); + written = -EFAULT; + goto out; } - if (buf[cnt-1] != '\n') { - buf[cnt] = '\n'; - buf[cnt+1] = '\0'; - } else - buf[cnt] = '\0'; - written = mark_printk("%s", buf); - kfree(buf); + page1 = kmap_atomic(pages[0]); + if (nr_pages == 2) + page2 = kmap_atomic(pages[1]); + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* possible \n added */ + buffer = global_trace.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, preempt_count()); + if (!event) { + /* Ring buffer disabled, return as if not open for write */ + written = -EBADF; + goto out_unlock; + } + + entry = ring_buffer_event_data(event); + entry->ip = _THIS_IP_; + + if (nr_pages == 2) { + len = PAGE_SIZE - offset; + memcpy(&entry->buf, page1 + offset, len); + memcpy(&entry->buf[len], page2, cnt - len); + } else + memcpy(&entry->buf, page1 + offset, cnt); + + if (entry->buf[cnt - 1] != '\n') { + entry->buf[cnt] = '\n'; + entry->buf[cnt + 1] = '\0'; + } else + entry->buf[cnt] = '\0'; + + ring_buffer_unlock_commit(buffer, event); + + written = cnt; + *fpos += written; - /* don't tell userspace we wrote more - it might confuse them */ - if (written > cnt) - written = cnt; - + out_unlock: + if (nr_pages == 2) + kunmap_atomic(page2); + kunmap_atomic(page1); + while (nr_pages > 0) + put_page(pages[--nr_pages]); + out: return written; }