diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8a12a25fa40d..41fdbb7953c6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1249,7 +1249,13 @@ __acquires(hlist_lock) *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); + /* + * Nested is a workaround that will soon not be needed. + * There's other protections that make sure the same lock + * is not taken on the same CPU that lockdep is unaware of. + * Differentiate when it is taken in NMI context. + */ + raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi()); } NOKPROBE_SYMBOL(kretprobe_hash_lock); @@ -1258,7 +1264,13 @@ static void kretprobe_table_lock(unsigned long hash, __acquires(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); + /* + * Nested is a workaround that will soon not be needed. + * There's other protections that make sure the same lock + * is not taken on the same CPU that lockdep is unaware of. + * Differentiate when it is taken in NMI context. + */ + raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi()); } NOKPROBE_SYMBOL(kretprobe_table_lock); @@ -2028,7 +2040,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); - raw_spin_lock_irqsave(&rp->lock, flags); + /* + * Nested is a workaround that will soon not be needed. + * There's other protections that make sure the same lock + * is not taken on the same CPU that lockdep is unaware of. + */ + raw_spin_lock_irqsave_nested(&rp->lock, flags, 1); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); @@ -2039,7 +2056,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) ri->task = current; if (rp->entry_handler && rp->entry_handler(ri, regs)) { - raw_spin_lock_irqsave(&rp->lock, flags); + raw_spin_lock_irqsave_nested(&rp->lock, flags, 1); hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f45fd9d5a45..dc83b3fa9fe7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,14 +438,16 @@ enum { }; /* * Used for which event context the event is in. - * NMI = 0 - * IRQ = 1 - * SOFTIRQ = 2 - * NORMAL = 3 + * TRANSITION = 0 + * NMI = 1 + * IRQ = 2 + * SOFTIRQ = 3 + * NORMAL = 4 * * See trace_recursive_lock() comment below for more details. */ enum { + RB_CTX_TRANSITION, RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, @@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * a bit of overhead in something as critical as function tracing, * we use a bitmask trick. * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. + * bit 1 = NMI context + * bit 2 = IRQ context + * bit 3 = SoftIRQ context + * bit 4 = normal context. * * This works because this is the order of contexts that can * preempt other contexts. A SoftIRQ never preempts an IRQ @@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The least significant bit can be cleared this way, and it * just so happens that it is the same bit corresponding to * the current context. + * + * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit + * is set when a recursion is detected at the current context, and if + * the TRANSITION bit is already set, it will fail the recursion. + * This is needed because there's a lag between the changing of + * interrupt context and updating the preempt count. In this case, + * a false positive will be found. To handle this, one extra recursion + * is allowed, and this is done by the TRANSITION bit. If the TRANSITION + * bit is already set, then it is considered a recursion and the function + * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. + * + * On the trace_recursive_unlock(), the TRANSITION bit will be the first + * to be cleared. Even if it wasn't the context that set it. That is, + * if an interrupt comes in while NORMAL bit is set and the ring buffer + * is called before preempt_count() is updated, since the check will + * be on the NORMAL bit, the TRANSITION bit will then be set. If an + * NMI then comes in, it will set the NMI bit, but when the NMI code + * does the trace_recursive_unlock() it will clear the TRANSTION bit + * and leave the NMI bit set. But this is fine, because the interrupt + * code that set the TRANSITION bit will then clear the NMI bit when it + * calls trace_recursive_unlock(). If another NMI comes in, it will + * set the TRANSITION bit and continue. + * + * Note: The TRANSITION bit only handles a single transition between context. */ static __always_inline int @@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = pc & NMI_MASK ? RB_CTX_NMI : pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; - if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) - return 1; + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { + /* + * It is possible that this was called by transitioning + * between interrupt context, and preempt_count() has not + * been updated yet. In this case, use the TRANSITION bit. + */ + bit = RB_CTX_TRANSITION; + if (val & (1 << (bit + cpu_buffer->nest))) + return 1; + } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; @@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->current_context - (1 << cpu_buffer->nest); } -/* The recursive locking above uses 4 bits */ -#define NESTED_BITS 4 +/* The recursive locking above uses 5 bits */ +#define NESTED_BITS 5 /** * ring_buffer_nest_start - Allow to trace while nested diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 528971714fc6..410cfeb16db5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2750,7 +2750,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer - * to store the trace event for the tigger to use. It's recusive + * to store the trace event for the trigger to use. It's recursive * safe and will not be recorded anywhere. */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { @@ -2952,7 +2952,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; /* This should never happen. If it does, yell once and skip */ - if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) + if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) goto out; /* @@ -3132,7 +3132,7 @@ static char *get_trace_buf(void) /* Interrupts must see nesting incremented before we use the buffer */ barrier(); - return &buffer->buffer[buffer->nesting][0]; + return &buffer->buffer[buffer->nesting - 1][0]; } static void put_trace_buf(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f3f5e77123ad..1dadef445cd1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -637,6 +637,12 @@ enum { * function is called to clear it. */ TRACE_GRAPH_NOTRACE_BIT, + + /* + * When transitioning between context, the preempt_count() may + * not be correct. Allow for a single recursion to cover this case. + */ + TRACE_TRANSITION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -691,14 +697,27 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) return 0; bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) - return -1; + if (unlikely(val & (1 << bit))) { + /* + * It could be that preempt_count has not been updated during + * a switch between contexts. Allow for a single recursion. + */ + bit = TRACE_TRANSITION_BIT; + if (trace_recursion_test(bit)) + return -1; + trace_recursion_set(bit); + barrier(); + return bit + 1; + } + + /* Normal check passed, clear the transition to allow it again */ + trace_recursion_clear(TRACE_TRANSITION_BIT); val |= 1 << bit; current->trace_recursion = val; barrier(); - return bit; + return bit + 1; } static __always_inline void trace_clear_recursion(int bit) @@ -708,6 +727,7 @@ static __always_inline void trace_clear_recursion(int bit) if (!bit) return; + bit--; bit = 1 << bit; val &= ~bit; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 84b7cab55291..881df991742a 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -584,7 +584,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, { struct synth_field *field; const char *prefix = NULL, *field_type = argv[0], *field_name, *array; - int len, ret = 0; + int len, ret = -ENOMEM; struct seq_buf s; ssize_t size; @@ -617,10 +617,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, len--; field->name = kmemdup_nul(field_name, len, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; + if (!field->name) goto free; - } + if (!is_good_name(field->name)) { synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name)); ret = -EINVAL; @@ -638,10 +637,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, len += strlen(prefix); field->type = kzalloc(len, GFP_KERNEL); - if (!field->type) { - ret = -ENOMEM; + if (!field->type) goto free; - } + seq_buf_init(&s, field->type, len); if (prefix) seq_buf_puts(&s, prefix); @@ -653,6 +651,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, } if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) goto free; + s.buffer[s.len] = '\0'; size = synth_field_size(field->type); @@ -666,10 +665,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, len = sizeof("__data_loc ") + strlen(field->type) + 1; type = kzalloc(len, GFP_KERNEL); - if (!type) { - ret = -ENOMEM; + if (!type) goto free; - } seq_buf_init(&s, type, len); seq_buf_puts(&s, "__data_loc "); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b5e3496cf803..4738ad48a667 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -492,8 +492,13 @@ trace_selftest_function_recursion(void) unregister_ftrace_function(&test_rec_probe); ret = -1; - if (trace_selftest_recursion_cnt != 1) { - pr_cont("*callback not called once (%d)* ", + /* + * Recursion allows for transitions between context, + * and may call the callback twice. + */ + if (trace_selftest_recursion_cnt != 1 && + trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called once (or twice) (%d)* ", trace_selftest_recursion_cnt); goto out; }