linux-stable/include/linux/rethook.h
wuqiang.matt 4bbd934556 kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.

Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):

OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s

         1T       2T       4T       8T      16T      24T
   24150045 29317964 15446741 12494489 18287272 17708768
        32T      48T      64T      72T      96T     128T
   16200682 13737658 11645677 11269858 10470118  9931051

This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:

                  1T         2T         4T         8T        16T
native:     41186213   82336866  164250978  328662645  658810299
freelist:   24150045   29317964   15446741   12494489   18287272
objpool:    23926730   48010314   96125218  191782984  385091769
                 32T        48T        64T        96T       128T
native:   1330338351 1969957941 2512291791 2615754135 2671040914
freelist:   16200682   13737658   11645677   10470118    9931051
objpool:   764481096 1147149781 1456220214 1502109662 1579015050

Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:

OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s

                  1T         2T         4T         8T        16T
native: .   30066096   63569843  126194076  257447289  505800181
freelist:   16152090   11064397   11124068    7215768    5663013
objpool:    13997541   28032100   55726624  110099926  221498787
                 24T        32T        48T        64T        96T
native:    763305277 1015925192 1521075123 2033009392 3021013752
freelist:    5015810    4602893    3766792    3382478    2945292
objpool:   328192025  439439564  668534502  887401381 1319972072

Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/

Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-18 23:59:54 +09:00

93 lines
2.9 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* Return hooking with list-based shadow stack.
*/
#ifndef _LINUX_RETHOOK_H
#define _LINUX_RETHOOK_H
#include <linux/compiler.h>
#include <linux/objpool.h>
#include <linux/kallsyms.h>
#include <linux/llist.h>
#include <linux/rcupdate.h>
struct rethook_node;
typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long, struct pt_regs *);
/**
* struct rethook - The rethook management data structure.
* @data: The user-defined data storage.
* @handler: The user-defined return hook handler.
* @pool: The pool of struct rethook_node.
* @ref: The reference counter.
* @rcu: The rcu_head for deferred freeing.
*
* Don't embed to another data structure, because this is a self-destructive
* data structure when all rethook_node are freed.
*/
struct rethook {
void *data;
rethook_handler_t handler;
struct objpool_head pool;
struct rcu_head rcu;
};
/**
* struct rethook_node - The rethook shadow-stack entry node.
* @rcu: The rcu_head for deferred freeing.
* @llist: The llist, linked to a struct task_struct::rethooks.
* @rethook: The pointer to the struct rethook.
* @ret_addr: The storage for the real return address.
* @frame: The storage for the frame pointer.
*
* You can embed this to your extended data structure to store any data
* on each entry of the shadow stack.
*/
struct rethook_node {
struct rcu_head rcu;
struct llist_node llist;
struct rethook *rethook;
unsigned long ret_addr;
unsigned long frame;
};
struct rethook *rethook_alloc(void *data, rethook_handler_t handler, int size, int num);
void rethook_stop(struct rethook *rh);
void rethook_free(struct rethook *rh);
struct rethook_node *rethook_try_get(struct rethook *rh);
void rethook_recycle(struct rethook_node *node);
void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount);
unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
struct llist_node **cur);
/* Arch dependent code must implement arch_* and trampoline code */
void arch_rethook_prepare(struct rethook_node *node, struct pt_regs *regs, bool mcount);
void arch_rethook_trampoline(void);
/**
* is_rethook_trampoline() - Check whether the address is rethook trampoline
* @addr: The address to be checked
*
* Return true if the @addr is the rethook trampoline address.
*/
static inline bool is_rethook_trampoline(unsigned long addr)
{
return addr == (unsigned long)dereference_symbol_descriptor(arch_rethook_trampoline);
}
/* If the architecture needs to fixup the return address, implement it. */
void arch_rethook_fixup_return(struct pt_regs *regs,
unsigned long correct_ret_addr);
/* Generic trampoline handler, arch code must prepare asm stub */
unsigned long rethook_trampoline_handler(struct pt_regs *regs,
unsigned long frame);
#ifdef CONFIG_RETHOOK
void rethook_flush_task(struct task_struct *tk);
#else
#define rethook_flush_task(tsk) do { } while (0)
#endif
#endif