diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 027437b745a0..f606c5bd1c0d 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -324,6 +324,12 @@ of ftrace. Here is a list of some of the key files: "set_graph_function", or "set_graph_notrace". (See the section "dynamic ftrace" below for more details.) + available_filter_functions_addrs: + + Similar to available_filter_functions, but with address displayed + for each function. The displayed address is the patch-site address + and can differ from /proc/kallsyms address. + dyn_ftrace_total_info: This file is for debugging purposes. The number of functions that @@ -1359,6 +1365,19 @@ Options for function_graph tracer: only a closing curly bracket "}" is displayed for the return of a function. + funcgraph-retval + When set, the return value of each traced function + will be printed after an equal sign "=". By default + this is off. + + funcgraph-retval-hex + When set, the return value will always be printed + in hexadecimal format. If the option is not set and + the return value is an error code, it will be printed + in signed decimal format; otherwise it will also be + printed in hexadecimal format. By default, this option + is off. + sleep-time When running function graph tracer, to include the time a task schedules out in its function. @@ -2704,6 +2723,119 @@ It is default disabled. 0) 1.757 us | } /* kmem_cache_free() */ 0) 2.861 us | } /* putname() */ +The return value of each traced function can be displayed after +an equal sign "=". When encountering system call failures, it +can be verfy helpful to quickly locate the function that first +returns an error code. + + - hide: echo nofuncgraph-retval > trace_options + - show: echo funcgraph-retval > trace_options + + Example with funcgraph-retval:: + + 1) | cgroup_migrate() { + 1) 0.651 us | cgroup_migrate_add_task(); /* = 0xffff93fcfd346c00 */ + 1) | cgroup_migrate_execute() { + 1) | cpu_cgroup_can_attach() { + 1) | cgroup_taskset_first() { + 1) 0.732 us | cgroup_taskset_next(); /* = 0xffff93fc8fb20000 */ + 1) 1.232 us | } /* cgroup_taskset_first = 0xffff93fc8fb20000 */ + 1) 0.380 us | sched_rt_can_attach(); /* = 0x0 */ + 1) 2.335 us | } /* cpu_cgroup_can_attach = -22 */ + 1) 4.369 us | } /* cgroup_migrate_execute = -22 */ + 1) 7.143 us | } /* cgroup_migrate = -22 */ + +The above example shows that the function cpu_cgroup_can_attach +returned the error code -22 firstly, then we can read the code +of this function to get the root cause. + +When the option funcgraph-retval-hex is not set, the return value can +be displayed in a smart way. Specifically, if it is an error code, +it will be printed in signed decimal format, otherwise it will +printed in hexadecimal format. + + - smart: echo nofuncgraph-retval-hex > trace_options + - hexadecimal: echo funcgraph-retval-hex > trace_options + + Example with funcgraph-retval-hex:: + + 1) | cgroup_migrate() { + 1) 0.651 us | cgroup_migrate_add_task(); /* = 0xffff93fcfd346c00 */ + 1) | cgroup_migrate_execute() { + 1) | cpu_cgroup_can_attach() { + 1) | cgroup_taskset_first() { + 1) 0.732 us | cgroup_taskset_next(); /* = 0xffff93fc8fb20000 */ + 1) 1.232 us | } /* cgroup_taskset_first = 0xffff93fc8fb20000 */ + 1) 0.380 us | sched_rt_can_attach(); /* = 0x0 */ + 1) 2.335 us | } /* cpu_cgroup_can_attach = 0xffffffea */ + 1) 4.369 us | } /* cgroup_migrate_execute = 0xffffffea */ + 1) 7.143 us | } /* cgroup_migrate = 0xffffffea */ + +At present, there are some limitations when using the funcgraph-retval +option, and these limitations will be eliminated in the future: + +- Even if the function return type is void, a return value will still + be printed, and you can just ignore it. + +- Even if return values are stored in multiple registers, only the + value contained in the first register will be recorded and printed. + To illustrate, in the x86 architecture, eax and edx are used to store + a 64-bit return value, with the lower 32 bits saved in eax and the + upper 32 bits saved in edx. However, only the value stored in eax + will be recorded and printed. + +- In certain procedure call standards, such as arm64's AAPCS64, when a + type is smaller than a GPR, it is the responsibility of the consumer + to perform the narrowing, and the upper bits may contain UNKNOWN values. + Therefore, it is advisable to check the code for such cases. For instance, + when using a u8 in a 64-bit GPR, bits [63:8] may contain arbitrary values, + especially when larger types are truncated, whether explicitly or implicitly. + Here are some specific cases to illustrate this point: + + **Case One**: + + The function narrow_to_u8 is defined as follows:: + + u8 narrow_to_u8(u64 val) + { + // implicitly truncated + return val; + } + + It may be compiled to:: + + narrow_to_u8: + < ... ftrace instrumentation ... > + RET + + If you pass 0x123456789abcdef to this function and want to narrow it, + it may be recorded as 0x123456789abcdef instead of 0xef. + + **Case Two**: + + The function error_if_not_4g_aligned is defined as follows:: + + int error_if_not_4g_aligned(u64 val) + { + if (val & GENMASK(31, 0)) + return -EINVAL; + + return 0; + } + + It could be compiled to:: + + error_if_not_4g_aligned: + CBNZ w0, .Lnot_aligned + RET // bits [31:0] are zero, bits + // [63:32] are UNKNOWN + .Lnot_aligned: + MOV x0, #-EINVAL + RET + + When passing 0x2_0000_0000 to it, the return value may be recorded as + 0x2_0000_0000 instead of 0. + You can put some comments on specific functions by using trace_printk() For example, if you want to put a comment inside the __might_sleep() function, you just have to include diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst index db17df312bc8..53a56823e903 100644 --- a/Documentation/trace/timerlat-tracer.rst +++ b/Documentation/trace/timerlat-tracer.rst @@ -180,3 +180,81 @@ dummy_load_1ms_pd_init, which had the following code (on purpose):: return 0; } + +User-space interface +--------------------------- + +Timerlat allows user-space threads to use timerlat infra-structure to +measure scheduling latency. This interface is accessible via a per-CPU +file descriptor inside $tracing_dir/osnoise/per_cpu/cpu$ID/timerlat_fd. + +This interface is accessible under the following conditions: + + - timerlat tracer is enable + - osnoise workload option is set to NO_OSNOISE_WORKLOAD + - The user-space thread is affined to a single processor + - The thread opens the file associated with its single processor + - Only one thread can access the file at a time + +The open() syscall will fail if any of these conditions are not met. +After opening the file descriptor, the user space can read from it. + +The read() system call will run a timerlat code that will arm the +timer in the future and wait for it as the regular kernel thread does. + +When the timer IRQ fires, the timerlat IRQ will execute, report the +IRQ latency and wake up the thread waiting in the read. The thread will be +scheduled and report the thread latency via tracer - as for the kernel +thread. + +The difference from the in-kernel timerlat is that, instead of re-arming +the timer, timerlat will return to the read() system call. At this point, +the user can run any code. + +If the application rereads the file timerlat file descriptor, the tracer +will report the return from user-space latency, which is the total +latency. If this is the end of the work, it can be interpreted as the +response time for the request. + +After reporting the total latency, timerlat will restart the cycle, arm +a timer, and go to sleep for the following activation. + +If at any time one of the conditions is broken, e.g., the thread migrates +while in user space, or the timerlat tracer is disabled, the SIG_KILL +signal will be sent to the user-space thread. + +Here is an basic example of user-space code for timerlat:: + + int main(void) + { + char buffer[1024]; + int timerlat_fd; + int retval; + long cpu = 0; /* place in CPU 0 */ + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu, &set); + + if (sched_setaffinity(gettid(), sizeof(set), &set) == -1) + return 1; + + snprintf(buffer, sizeof(buffer), + "/sys/kernel/tracing/osnoise/per_cpu/cpu%ld/timerlat_fd", + cpu); + + timerlat_fd = open(buffer, O_RDONLY); + if (timerlat_fd < 0) { + printf("error opening %s: %s\n", buffer, strerror(errno)); + exit(1); + } + + for (;;) { + retval = read(timerlat_fd, buffer, 1024); + if (retval < 0) + break; + } + + close(timerlat_fd); + exit(0); + } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 595028bd9160..891ab530a665 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -202,6 +202,7 @@ config ARM64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_ERROR_INJECTION + select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_GCC_PLUGINS select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && \ diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index b87d70b693c6..21ac1c5c71d3 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -192,4 +192,26 @@ static inline bool arch_syscall_match_sym_name(const char *sym, } #endif /* ifndef __ASSEMBLY__ */ +#ifndef __ASSEMBLY__ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ret_regs { + /* x0 - x7 */ + unsigned long regs[8]; + + unsigned long fp; + unsigned long __unused; +}; + +static inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->regs[0]; +} + +static inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->fp; +} +#endif /* ifdef CONFIG_FUNCTION_GRAPH_TRACER */ +#endif + #endif /* __ASM_FTRACE_H */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 0996094b0d22..757d01a68ffd 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -200,6 +200,19 @@ int main(void) #endif #ifdef CONFIG_FUNCTION_TRACER DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); +#endif + BLANK(); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + DEFINE(FGRET_REGS_X0, offsetof(struct fgraph_ret_regs, regs[0])); + DEFINE(FGRET_REGS_X1, offsetof(struct fgraph_ret_regs, regs[1])); + DEFINE(FGRET_REGS_X2, offsetof(struct fgraph_ret_regs, regs[2])); + DEFINE(FGRET_REGS_X3, offsetof(struct fgraph_ret_regs, regs[3])); + DEFINE(FGRET_REGS_X4, offsetof(struct fgraph_ret_regs, regs[4])); + DEFINE(FGRET_REGS_X5, offsetof(struct fgraph_ret_regs, regs[5])); + DEFINE(FGRET_REGS_X6, offsetof(struct fgraph_ret_regs, regs[6])); + DEFINE(FGRET_REGS_X7, offsetof(struct fgraph_ret_regs, regs[7])); + DEFINE(FGRET_REGS_FP, offsetof(struct fgraph_ret_regs, fp)); + DEFINE(FGRET_REGS_SIZE, sizeof(struct fgraph_ret_regs)); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); #endif diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 1c38a60575aa..f0c16640ef21 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -330,22 +330,23 @@ SYM_FUNC_END(ftrace_stub_graph) */ SYM_CODE_START(return_to_handler) /* save return value regs */ - sub sp, sp, #64 - stp x0, x1, [sp] - stp x2, x3, [sp, #16] - stp x4, x5, [sp, #32] - stp x6, x7, [sp, #48] + sub sp, sp, #FGRET_REGS_SIZE + stp x0, x1, [sp, #FGRET_REGS_X0] + stp x2, x3, [sp, #FGRET_REGS_X2] + stp x4, x5, [sp, #FGRET_REGS_X4] + stp x6, x7, [sp, #FGRET_REGS_X6] + str x29, [sp, #FGRET_REGS_FP] // parent's fp - mov x0, x29 // parent's fp - bl ftrace_return_to_handler// addr = ftrace_return_to_hander(fp); - mov x30, x0 // restore the original return address + mov x0, sp + bl ftrace_return_to_handler // addr = ftrace_return_to_hander(regs); + mov x30, x0 // restore the original return address /* restore return value regs */ - ldp x0, x1, [sp] - ldp x2, x3, [sp, #16] - ldp x4, x5, [sp, #32] - ldp x6, x7, [sp, #48] - add sp, sp, #64 + ldp x0, x1, [sp, #FGRET_REGS_X0] + ldp x2, x3, [sp, #FGRET_REGS_X2] + ldp x4, x5, [sp, #FGRET_REGS_X4] + ldp x6, x7, [sp, #FGRET_REGS_X6] + add sp, sp, #FGRET_REGS_SIZE ret SYM_CODE_END(return_to_handler) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 0c7fb52727ba..e55511af4c77 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -111,6 +111,7 @@ config LOONGARCH select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION + select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GENERIC_VDSO diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h index 23e2ba78dcb0..a11996eb5892 100644 --- a/arch/loongarch/include/asm/ftrace.h +++ b/arch/loongarch/include/asm/ftrace.h @@ -100,4 +100,26 @@ __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) #endif /* CONFIG_FUNCTION_TRACER */ +#ifndef __ASSEMBLY__ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ret_regs { + /* a0 - a1 */ + unsigned long regs[2]; + + unsigned long fp; + unsigned long __unused; +}; + +static inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->regs[0]; +} + +static inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->fp; +} +#endif /* ifdef CONFIG_FUNCTION_GRAPH_TRACER */ +#endif + #endif /* _ASM_LOONGARCH_FTRACE_H */ diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index 4bdb203fc66e..505e4bf59603 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -12,6 +12,7 @@ #include #include #include +#include void output_ptreg_defines(void) { @@ -264,7 +265,7 @@ void output_smpboot_defines(void) #ifdef CONFIG_HIBERNATION void output_pbe_defines(void) { - COMMENT(" Linux struct pbe offsets. "); + COMMENT("Linux struct pbe offsets."); OFFSET(PBE_ADDRESS, pbe, address); OFFSET(PBE_ORIG_ADDRESS, pbe, orig_address); OFFSET(PBE_NEXT, pbe, next); @@ -272,3 +273,15 @@ void output_pbe_defines(void) BLANK(); } #endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void output_fgraph_ret_regs_defines(void) +{ + COMMENT("LoongArch fgraph_ret_regs offsets."); + OFFSET(FGRET_REGS_A0, fgraph_ret_regs, regs[0]); + OFFSET(FGRET_REGS_A1, fgraph_ret_regs, regs[1]); + OFFSET(FGRET_REGS_FP, fgraph_ret_regs, fp); + DEFINE(FGRET_REGS_SIZE, sizeof(struct fgraph_ret_regs)); + BLANK(); +} +#endif diff --git a/arch/loongarch/kernel/mcount.S b/arch/loongarch/kernel/mcount.S index 8cdc1563cd33..cb8e5803de4b 100644 --- a/arch/loongarch/kernel/mcount.S +++ b/arch/loongarch/kernel/mcount.S @@ -79,18 +79,20 @@ SYM_FUNC_START(ftrace_graph_caller) SYM_FUNC_END(ftrace_graph_caller) SYM_FUNC_START(return_to_handler) - PTR_ADDI sp, sp, -2 * SZREG - PTR_S a0, sp, 0 - PTR_S a1, sp, SZREG + PTR_ADDI sp, sp, -FGRET_REGS_SIZE + PTR_S a0, sp, FGRET_REGS_A0 + PTR_S a1, sp, FGRET_REGS_A1 + PTR_S zero, sp, FGRET_REGS_FP + move a0, sp bl ftrace_return_to_handler /* Restore the real parent address: a0 -> ra */ move ra, a0 - PTR_L a0, sp, 0 - PTR_L a1, sp, SZREG - PTR_ADDI sp, sp, 2 * SZREG + PTR_L a0, sp, FGRET_REGS_A0 + PTR_L a1, sp, FGRET_REGS_A1 + PTR_ADDI sp, sp, FGRET_REGS_SIZE jr ra SYM_FUNC_END(return_to_handler) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/loongarch/kernel/mcount_dyn.S b/arch/loongarch/kernel/mcount_dyn.S index c7d961fc72c2..e16ab0b98e5a 100644 --- a/arch/loongarch/kernel/mcount_dyn.S +++ b/arch/loongarch/kernel/mcount_dyn.S @@ -136,18 +136,19 @@ SYM_CODE_END(ftrace_graph_caller) SYM_CODE_START(return_to_handler) /* Save return value regs */ - PTR_ADDI sp, sp, -2 * SZREG - PTR_S a0, sp, 0 - PTR_S a1, sp, SZREG + PTR_ADDI sp, sp, -FGRET_REGS_SIZE + PTR_S a0, sp, FGRET_REGS_A0 + PTR_S a1, sp, FGRET_REGS_A1 + PTR_S zero, sp, FGRET_REGS_FP - move a0, zero + move a0, sp bl ftrace_return_to_handler move ra, a0 /* Restore return value regs */ - PTR_L a0, sp, 0 - PTR_L a1, sp, SZREG - PTR_ADDI sp, sp, 2 * SZREG + PTR_L a0, sp, FGRET_REGS_A0 + PTR_L a1, sp, FGRET_REGS_A1 + PTR_ADDI sp, sp, FGRET_REGS_SIZE jr ra SYM_CODE_END(return_to_handler) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 08f3a01fa1ea..b49793cf34eb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -152,6 +152,7 @@ config RISCV select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION config CLANG_SUPPORTS_DYNAMIC_FTRACE diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index d47d87c2d7e3..740a979171e5 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -111,4 +111,25 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); #endif /* CONFIG_DYNAMIC_FTRACE */ +#ifndef __ASSEMBLY__ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ret_regs { + unsigned long a1; + unsigned long a0; + unsigned long s0; + unsigned long ra; +}; + +static inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->a0; +} + +static inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->s0; +} +#endif /* ifdef CONFIG_FUNCTION_GRAPH_TRACER */ +#endif + #endif /* _ASM_RISCV_FTRACE_H */ diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 30102aadc4d7..8a6e5a9e842a 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -65,13 +65,8 @@ ENTRY(return_to_handler) * So alternatively we check the *old* frame pointer position, that is, the * value stored in -16(s0) on entry, and the s0 on return. */ -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - mv t6, s0 -#endif SAVE_RET_ABI_STATE -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - mv a0, t6 -#endif + mv a0, sp call ftrace_return_to_handler mv a2, a0 RESTORE_RET_ABI_STATE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b3ebf58cf77e..7422db409770 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -217,6 +217,7 @@ config X86 select HAVE_FAST_GUP select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE) select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b8d4a07f9595..897cf02c20b1 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -150,4 +150,24 @@ static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) #endif /* !COMPILE_OFFSETS */ #endif /* !__ASSEMBLY__ */ +#ifndef __ASSEMBLY__ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ret_regs { + unsigned long ax; + unsigned long dx; + unsigned long bp; +}; + +static inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->ax; +} + +static inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->bp; +} +#endif /* ifdef CONFIG_FUNCTION_GRAPH_TRACER */ +#endif + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 0d9a14528176..24c1175a47e2 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -187,12 +187,14 @@ SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl %eax + pushl $0 pushl %edx - movl $0, %eax + pushl %eax + movl %esp, %eax call ftrace_return_to_handler movl %eax, %ecx - popl %edx popl %eax + popl %edx + addl $4, %esp # skip ebp JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index b8c720b5dab2..945cfa5f7239 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -348,12 +348,13 @@ STACK_FRAME_NON_STANDARD_FP(__fentry__) SYM_CODE_START(return_to_handler) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR - subq $16, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) movq %rdx, 8(%rsp) - movq %rbp, %rdi + movq %rbp, 16(%rsp) + movq %rsp, %rdi call ftrace_return_to_handler @@ -361,7 +362,7 @@ SYM_CODE_START(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax - addq $16, %rsp + addq $24, %rsp /* * Jump back to the old return address. This cannot be JMP_NOSPEC rdi * since IBT would demand that contain ENDBR, which simply isn't so for diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b23bdd414394..8e59bd954153 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -633,6 +633,7 @@ enum { FTRACE_ITER_MOD = (1 << 5), FTRACE_ITER_ENABLED = (1 << 6), FTRACE_ITER_TOUCHED = (1 << 7), + FTRACE_ITER_ADDRS = (1 << 8), }; void arch_ftrace_update_code(int command); @@ -1018,6 +1019,9 @@ struct ftrace_graph_ent { */ struct ftrace_graph_ret { unsigned long func; /* Current function */ +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + unsigned long retval; +#endif int depth; /* Number of functions that overran the depth limit for current task */ unsigned int overrun; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8cf97fa4a4b3..abe5c583bd59 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,9 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.rst +config HAVE_FUNCTION_GRAPH_RETVAL + bool + config HAVE_DYNAMIC_FTRACE bool help @@ -227,6 +230,18 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. +config FUNCTION_GRAPH_RETVAL + bool "Kernel Function Graph Return Value" + depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on FUNCTION_GRAPH_TRACER + default n + help + Support recording and printing the function return value when + using function graph tracer. It can be helpful to locate functions + that return errors. This feature is off by default, and you can + enable it via the trace option funcgraph-retval. + See Documentation/trace/ftrace.rst + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 218cd95bf8e4..cd2c35b1dd8f 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -236,16 +236,23 @@ static struct notifier_block ftrace_suspend_notifier = { .notifier_call = ftrace_suspend_notifier_call, }; +/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ +struct fgraph_ret_regs; + /* * Send the trace to the ring-buffer. * @return the original return address. */ -unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, + unsigned long frame_pointer) { struct ftrace_graph_ret trace; unsigned long ret; ftrace_pop_return_trace(&trace, &ret, frame_pointer); +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + trace.retval = fgraph_ret_regs_return_value(ret_regs); +#endif trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); /* @@ -266,6 +273,23 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/* + * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can + * leave only ftrace_return_to_handler(ret_regs). + */ +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL +unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +{ + return __ftrace_return_to_handler(ret_regs, + fgraph_ret_regs_frame_pointer(ret_regs)); +} +#else +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + return __ftrace_return_to_handler(NULL, frame_pointer); +} +#endif + /** * ftrace_graph_get_ret_stack - return the entry of the shadow stack * @task: The task to read the shadow stack from diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6a77edb51f18..3740aca79fe7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3861,6 +3861,9 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; + if (iter->flags & FTRACE_ITER_ADDRS) + seq_printf(m, "%lx ", rec->ip); + if (print_rec(m, rec->ip)) { /* This should only happen when a rec is disabled */ WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); @@ -3996,6 +3999,30 @@ ftrace_touched_open(struct inode *inode, struct file *file) return 0; } +static int +ftrace_avail_addrs_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ADDRS; + iter->ops = &global_ops; + + return 0; +} + /** * ftrace_regex_open - initialize function tracer filter files * @ops: The ftrace_ops that hold the hash filters @@ -5916,6 +5943,13 @@ static const struct file_operations ftrace_touched_fops = { .release = seq_release_private, }; +static const struct file_operations ftrace_avail_addrs_fops = { + .open = ftrace_avail_addrs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -6377,6 +6411,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) trace_create_file("available_filter_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_avail_fops); + trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_avail_addrs_fops); + trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..e6407a27d644 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -832,6 +832,8 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_TAIL 0x100 #define TRACE_GRAPH_SLEEP_TIME 0x200 #define TRACE_GRAPH_GRAPH_TIME 0x400 +#define TRACE_GRAPH_PRINT_RETVAL 0x800 +#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 778200dd8ede..5fe525f1b8cc 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) /* Common ftrace options */ xbc_node_for_each_array_value(node, "options", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { pr_err("String is too long: %s\n", p); continue; } @@ -87,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) const char *p; xbc_node_for_each_array_value(node, "events", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { pr_err("String is too long: %s\n", p); continue; } @@ -486,7 +486,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, p = xbc_node_find_value(enode, "filter", NULL); if (p && *p != '\0') { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) pr_err("filter string is too long: %s\n", p); else if (apply_event_filter(file, buf) < 0) pr_err("Failed to apply filter: %s\n", buf); @@ -494,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index cd41e863b51c..340b2fa98218 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -86,6 +86,30 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function return entry */ +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_packed( unsigned long, ret, func ) + __field_packed( unsigned long, ret, retval ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) + __field_packed( unsigned long long, ret, calltime) + __field_packed( unsigned long long, ret, rettime ) + ), + + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + (void *)__entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth, __entry->retval) +); + +#else + FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, @@ -105,6 +129,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __entry->depth) ); +#endif + /* * Context switch trace entry - which task (and prio) we switched from/to: * diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 203204cadf92..c35fbaab2a47 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -58,6 +58,12 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, /* Display function name after trailing } */ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + /* Display function return value ? */ + { TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) }, + /* Display function return value in hexadecimal format ? */ + { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -619,6 +625,56 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, trace_seq_puts(s, "| "); } +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL + +static void print_graph_retval(struct trace_seq *s, unsigned long retval, + bool leaf, void *func, bool hex_format) +{ + unsigned long err_code = 0; + + if (retval == 0 || hex_format) + goto done; + + /* Check if the return value matches the negative format */ + if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && + (((u64)retval) >> 32) == 0) { + /* sign extension */ + err_code = (unsigned long)(s32)retval; + } else { + err_code = retval; + } + + if (!IS_ERR_VALUE(err_code)) + err_code = 0; + +done: + if (leaf) { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "%ps(); /* = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "%ps(); /* = %ld */\n", + func, err_code); + } else { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "} /* %ps = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "} /* %ps = %ld */\n", + func, err_code); + } +} + +#else + +#define __TRACE_GRAPH_PRINT_RETVAL 0 + +#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0) + +#endif + /* Case of a leaf function on its call entry */ static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, @@ -663,7 +719,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); - trace_seq_printf(s, "%ps();\n", (void *)call->func); + /* + * Write out the function return value if the option function-retval is + * enabled. + */ + if (flags & __TRACE_GRAPH_PRINT_RETVAL) + print_graph_retval(s, graph_ret->retval, true, (void *)call->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + else + trace_seq_printf(s, "%ps();\n", (void *)call->func); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -942,16 +1006,25 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, trace_seq_putc(s, ' '); /* - * If the return function does not have a matching entry, - * then the entry was lost. Instead of just printing - * the '}' and letting the user guess what function this - * belongs to, write out the function name. Always do - * that if the funcgraph-tail option is enabled. + * Always write out the function name and its return value if the + * function-retval option is enabled. */ - if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); - else - trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + if (flags & __TRACE_GRAPH_PRINT_RETVAL) { + print_graph_retval(s, trace->retval, false, (void *)trace->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + } else { + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. + */ + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + } /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e97e3fa5cbed..bd0d01d00fb9 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -181,6 +181,7 @@ struct osn_irq { #define IRQ_CONTEXT 0 #define THREAD_CONTEXT 1 +#define THREAD_URET 2 /* * sofirq runtime info. */ @@ -238,6 +239,7 @@ struct timerlat_variables { u64 abs_period; bool tracing_thread; u64 count; + bool uthread_migrate; }; static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); @@ -1181,6 +1183,78 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) osn_var->thread.arrival_time = 0; } +#ifdef CONFIG_TIMERLAT_TRACER +/* + * osnoise_stop_exception - Stop tracing and the tracer. + */ +static __always_inline void osnoise_stop_exception(char *msg, int cpu) +{ + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + tracer_tracing_off(tr); + } + rcu_read_unlock(); +} + +/* + * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler + * + * his function is hooked to the sched:sched_migrate_task trace event, and monitors + * timerlat user-space thread migration. + */ +static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu) +{ + struct osnoise_variables *osn_var; + long cpu = task_cpu(p); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + if (osn_var->pid == p->pid && dest_cpu != cpu) { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user-thread migrated\n"); + osnoise_stop_exception("timerlat user-thread migrated", cpu); + } +} + +static int register_migration_monitor(void) +{ + int ret = 0; + + /* + * Timerlat thread migration check is only required when running timerlat in user-space. + * Thus, enable callback only if timerlat is set with no workload. + */ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + + return ret; +} + +static void unregister_migration_monitor(void) +{ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); +} +#else +static int register_migration_monitor(void) +{ + return 0; +} +static void unregister_migration_monitor(void) {} +#endif /* * trace_sched_switch - sched:sched_switch trace event handler * @@ -1204,7 +1278,7 @@ trace_sched_switch_callback(void *data, bool preempt, } /* - * hook_thread_events - Hook the insturmentation for thread noise + * hook_thread_events - Hook the instrumentation for thread noise * * Hook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1217,11 +1291,19 @@ static int hook_thread_events(void) if (ret) return -EINVAL; + ret = register_migration_monitor(); + if (ret) + goto out_unreg; + return 0; + +out_unreg: + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + return -EINVAL; } /* - * unhook_thread_events - *nhook the insturmentation for thread noise + * unhook_thread_events - unhook the instrumentation for thread noise * * Unook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1229,6 +1311,7 @@ static int hook_thread_events(void) static void unhook_thread_events(void) { unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + unregister_migration_monitor(); } /* @@ -1285,6 +1368,22 @@ static __always_inline void osnoise_stop_tracing(void) rcu_read_unlock(); } +/* + * osnoise_has_tracing_on - Check if there is at least one instance on + */ +static __always_inline int osnoise_has_tracing_on(void) +{ + struct osnoise_instance *inst; + int trace_is_on = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) + trace_is_on += tracer_tracing_is_on(inst->tr); + rcu_read_unlock(); + + return trace_is_on; +} + /* * notify_new_max_latency - Notify a new max latency via fsnotify interface. */ @@ -1517,13 +1616,16 @@ static struct cpumask save_cpumask; /* * osnoise_sleep - sleep until the next period */ -static void osnoise_sleep(void) +static void osnoise_sleep(bool skip_period) { u64 interval; ktime_t wake_time; mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + if (skip_period) + interval = osnoise_data.sample_period; + else + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; mutex_unlock(&interface_lock); /* @@ -1545,6 +1647,39 @@ static void osnoise_sleep(void) } } +/* + * osnoise_migration_pending - checks if the task needs to migrate + * + * osnoise/timerlat threads are per-cpu. If there is a pending request to + * migrate the thread away from the current CPU, something bad has happened. + * Play the good citizen and leave. + * + * Returns 0 if it is safe to continue, 1 otherwise. + */ +static inline int osnoise_migration_pending(void) +{ + if (!current->migration_pending) + return 0; + + /* + * If migration is pending, there is a task waiting for the + * tracer to enable migration. The tracer does not allow migration, + * thus: taint and leave to unblock the blocked thread. + */ + osnoise_taint("migration requested to osnoise threads, leaving."); + + /* + * Unset this thread from the threads managed by the interface. + * The tracers are responsible for cleaning their env before + * exiting. + */ + mutex_lock(&interface_lock); + this_cpu_osn_var()->kthread = NULL; + mutex_unlock(&interface_lock); + + return 1; +} + /* * osnoise_main - The osnoise detection kernel thread * @@ -1553,12 +1688,35 @@ static void osnoise_sleep(void) */ static int osnoise_main(void *data) { + unsigned long flags; + + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); while (!kthread_should_stop()) { + if (osnoise_migration_pending()) + break; + + /* skip a period if tracing is off on all instances */ + if (!osnoise_has_tracing_on()) { + osnoise_sleep(true); + continue; + } + run_osnoise(); - osnoise_sleep(); + osnoise_sleep(false); } + migrate_enable(); return 0; } @@ -1706,6 +1864,7 @@ static int timerlat_main(void *data) struct timerlat_variables *tlat = this_cpu_tmr_var(); struct timerlat_sample s; struct sched_param sp; + unsigned long flags; u64 now, diff; /* @@ -1714,6 +1873,18 @@ static int timerlat_main(void *data) sp.sched_priority = DEFAULT_TIMERLAT_PRIO; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + tlat->count = 0; tlat->tracing_thread = false; @@ -1731,6 +1902,7 @@ static int timerlat_main(void *data) osn_var->sampling = 1; while (!kthread_should_stop()) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); diff = now - tlat->abs_period; @@ -1749,10 +1921,14 @@ static int timerlat_main(void *data) if (time_to_us(diff) >= osnoise_data.stop_tracing_total) osnoise_stop_tracing(); + if (osnoise_migration_pending()) + break; + wait_next_period(tlat); } hrtimer_cancel(&tlat->timer); + migrate_enable(); return 0; } #else /* CONFIG_TIMERLAT_TRACER */ @@ -1771,10 +1947,24 @@ static void stop_kthread(unsigned int cpu) kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; if (kthread) { - kthread_stop(kthread); + if (test_bit(OSN_WORKLOAD, &osnoise_options)) { + kthread_stop(kthread); + } else { + /* + * This is a user thread waiting on the timerlat_fd. We need + * to close all users, and the best way to guarantee this is + * by killing the thread. NOTE: this is a purpose specific file. + */ + kill_pid(kthread->thread_pid, SIGKILL, 1); + put_task_struct(kthread); + } per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } else { + /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + /* + * This is set in the osnoise tracer case. + */ per_cpu(per_cpu_osnoise_var, cpu).sampling = false; barrier(); return; @@ -1819,7 +2009,6 @@ static int start_kthread(unsigned int cpu) barrier(); return 0; } - snprintf(comm, 24, "osnoise/%d", cpu); } @@ -1848,6 +2037,11 @@ static int start_per_cpu_kthreads(void) int retval = 0; int cpu; + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (timerlat_enabled()) + return 0; + } + cpus_read_lock(); /* * Run only on online CPUs in which osnoise is allowed to run. @@ -2188,6 +2382,223 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, return err; } +#ifdef CONFIG_TIMERLAT_TRACER +static int timerlat_fd_open(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + long cpu = (long) inode->i_cdev; + + mutex_lock(&interface_lock); + + /* + * This file is accessible only if timerlat is enabled, and + * NO_OSNOISE_WORKLOAD is set. + */ + if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) { + mutex_unlock(&interface_lock); + return -EINVAL; + } + + migrate_disable(); + + osn_var = this_cpu_osn_var(); + + /* + * The osn_var->pid holds the single access to this file. + */ + if (osn_var->pid) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EBUSY; + } + + /* + * timerlat tracer is a per-cpu tracer. Check if the user-space too + * is pinned to a single CPU. The tracer laters monitor if the task + * migrates and then disables tracer if it does. However, it is + * worth doing this basic acceptance test to avoid obviusly wrong + * setup. + */ + if (current->nr_cpus_allowed > 1 || cpu != smp_processor_id()) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EPERM; + } + + /* + * From now on, it is good to go. + */ + file->private_data = inode->i_cdev; + + get_task_struct(current); + + osn_var->kthread = current; + osn_var->pid = current->pid; + + /* + * Setup is done. + */ + mutex_unlock(&interface_lock); + + tlat = this_cpu_tmr_var(); + tlat->count = 0; + + migrate_enable(); + return 0; +}; + +/* + * timerlat_fd_read - Read function for "timerlat_fd" file + * @file: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints 1 on timerlat, the number of interferences on osnoise, -1 on error. + */ +static ssize_t +timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, + loff_t *ppos) +{ + long cpu = (long) file->private_data; + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + struct timerlat_sample s; + s64 diff; + u64 now; + + migrate_disable(); + + tlat = this_cpu_tmr_var(); + + /* + * While in user-space, the thread is migratable. There is nothing + * we can do about it. + * So, if the thread is running on another CPU, stop the machinery. + */ + if (cpu == smp_processor_id()) { + if (tlat->uthread_migrate) { + migrate_enable(); + return -EINVAL; + } + } else { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user thread migrate\n"); + osnoise_stop_tracing(); + migrate_enable(); + return -EINVAL; + } + + osn_var = this_cpu_osn_var(); + + /* + * The timerlat in user-space runs in a different order: + * the read() starts from the execution of the previous occurrence, + * sleeping for the next occurrence. + * + * So, skip if we are entering on read() before the first wakeup + * from timerlat IRQ: + */ + if (likely(osn_var->sampling)) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_URET; + + trace_timerlat_sample(&s); + + notify_new_max_latency(diff); + + tlat->tracing_thread = false; + if (osnoise_data.stop_tracing_total) + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + } else { + tlat->tracing_thread = false; + tlat->kthread = current; + + hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + tlat->timer.function = timerlat_irq; + + /* Annotate now to drift new period */ + tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); + + osn_var->sampling = 1; + } + + /* wait for the next period */ + wait_next_period(tlat); + + /* This is the wakeup from this cycle */ + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_CONTEXT; + + trace_timerlat_sample(&s); + + if (osnoise_data.stop_tracing_total) { + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { + timerlat_dump_stack(time_to_us(diff)); + notify_new_max_latency(diff); + osnoise_stop_tracing(); + } + } + +out: + migrate_enable(); + return 0; +} + +static int timerlat_fd_release(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat_var; + long cpu = (long) file->private_data; + + migrate_disable(); + mutex_lock(&interface_lock); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + + hrtimer_cancel(&tlat_var->timer); + memset(tlat_var, 0, sizeof(*tlat_var)); + + osn_var->sampling = 0; + osn_var->pid = 0; + + /* + * We are leaving, not being stopped... see stop_kthread(); + */ + if (osn_var->kthread) { + put_task_struct(osn_var->kthread); + osn_var->kthread = NULL; + } + + mutex_unlock(&interface_lock); + migrate_enable(); + return 0; +} +#endif + /* * osnoise/runtime_us: cannot be greater than the period. */ @@ -2251,6 +2662,13 @@ static struct trace_min_max_param timerlat_period = { .max = &timerlat_max_period, .min = &timerlat_min_period, }; + +static const struct file_operations timerlat_fd_fops = { + .open = timerlat_fd_open, + .read = timerlat_fd_read, + .release = timerlat_fd_release, + .llseek = generic_file_llseek, +}; #endif static const struct file_operations cpus_fops = { @@ -2288,18 +2706,63 @@ static int init_timerlat_stack_tracefs(struct dentry *top_dir) } #endif /* CONFIG_STACKTRACE */ +static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) +{ + struct dentry *timerlat_fd; + struct dentry *per_cpu; + struct dentry *cpu_dir; + char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */ + long cpu; + + /* + * Why not using tracing instance per_cpu/ dir? + * + * Because osnoise/timerlat have a single workload, having + * multiple files like these are wast of memory. + */ + per_cpu = tracefs_create_dir("per_cpu", top_dir); + if (!per_cpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + snprintf(cpu_str, 30, "cpu%ld", cpu); + cpu_dir = tracefs_create_dir(cpu_str, per_cpu); + if (!cpu_dir) + goto out_clean; + + timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ, + cpu_dir, NULL, &timerlat_fd_fops); + if (!timerlat_fd) + goto out_clean; + + /* Record the CPU */ + d_inode(timerlat_fd)->i_cdev = (void *)(cpu); + } + + return 0; + +out_clean: + tracefs_remove(per_cpu); + return -ENOMEM; +} + /* * init_timerlat_tracefs - A function to initialize the timerlat interface files */ static int init_timerlat_tracefs(struct dentry *top_dir) { struct dentry *tmp; + int retval; tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) return -ENOMEM; + retval = osnoise_create_cpu_timerlat_fd(top_dir); + if (retval) + return retval; + return init_timerlat_stack_tracefs(top_dir); } #else /* CONFIG_TIMERLAT_TRACER */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1e33f367783e..db575094c498 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1446,6 +1446,8 @@ static struct trace_event trace_osnoise_event = { }; /* TRACE_TIMERLAT */ + +static char *timerlat_lat_context[] = {"irq", "thread", "user-ret"}; static enum print_line_t trace_timerlat_print(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -1458,7 +1460,7 @@ trace_timerlat_print(struct trace_iterator *iter, int flags, trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n", field->seqnum, - field->context ? "thread" : "irq", + timerlat_lat_context[field->context], field->timer_latency); return trace_handle_return(s); diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc new file mode 100644 index 000000000000..e34c0bdef3ed --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc @@ -0,0 +1,44 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: ftrace - function graph print function return value +# requires: options/funcgraph-retval options/funcgraph-retval-hex function_graph:tracer + +# Make sure that funcgraph-retval works + +fail() { # msg + echo $1 + exit_fail +} + +disable_tracing +clear_trace + +# get self PID, can not use $$, because it is PPID +read PID _ < /proc/self/stat + +[ -f set_ftrace_filter ] && echo proc_reg_write > set_ftrace_filter +[ -f set_ftrace_pid ] && echo ${PID} > set_ftrace_pid +echo function_graph > current_tracer +echo 1 > options/funcgraph-retval + +set +e +enable_tracing +echo > /proc/interrupts +disable_tracing +set -e + +: "Test printing the error code in signed decimal format" +echo 0 > options/funcgraph-retval-hex +count=`cat trace | grep 'proc_reg_write' | grep '= -5' | wc -l` +if [ $count -eq 0 ]; then + fail "Return value can not be printed in signed decimal format" +fi + +: "Test printing the error code in hexadecimal format" +echo 1 > options/funcgraph-retval-hex +count=`cat trace | grep 'proc_reg_write' | grep 'fffffffb' | wc -l` +if [ $count -eq 0 ]; then + fail "Return value can not be printed in hexadecimal format" +fi + +exit 0