selftests/bpf: Add uprobe triggering overhead benchmarks

Add benchmark to measure overhead of uprobes and uretprobes. Also have
a baseline (no uprobe attached) benchmark.

On my dev machine, baseline benchmark can trigger 130M user_target()
invocations per second. When uprobe is attached, this falls to just 700K
per second. With uretprobe, we get down to 520K per second:

  $ sudo ./bench trig-uprobe-base -a
  Summary: hits  131.289 ± 2.872M/s

  # UPROBE
  $ sudo ./bench -a trig-uprobe-without-nop
  Summary: hits    0.729 ± 0.007M/s

  $ sudo ./bench -a trig-uprobe-with-nop
  Summary: hits    1.798 ± 0.017M/s

  # URETPROBE
  $ sudo ./bench -a trig-uretprobe-without-nop
  Summary: hits    0.508 ± 0.012M/s

  $ sudo ./bench -a trig-uretprobe-with-nop
  Summary: hits    0.883 ± 0.008M/s

So there is almost 2.5x performance difference between probing nop vs
non-nop instruction for entry uprobe. And 1.7x difference for uretprobe.

This means that the overhead of probing a non-nop instruction is around
1.4 microseconds for uprobe and 2 microseconds for uretprobe.

For nop variants, uprobe and uretprobe overhead is down to 0.556 and
1.13 microseconds, respectively.

For comparison, just doing a very low-overhead syscall (with no BPF
programs attached anywhere) gives:

  $ sudo ./bench trig-base -a
  Summary: hits    4.830 ± 0.036M/s

So uprobes are about 2.67x slower than pure context switch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211116013041.4072571-1-andrii@kernel.org
This commit is contained in:
Andrii Nakryiko 2021-11-15 17:30:41 -08:00 committed by Daniel Borkmann
parent ebf7f6f0a6
commit d41bc48bfa
4 changed files with 166 additions and 1 deletions

View file

@ -533,7 +533,9 @@ $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
$(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \
$(OUTPUT)/bench: $(OUTPUT)/bench.o \
$(OUTPUT)/testing_helpers.o \
$(OUTPUT)/trace_helpers.o \
$(OUTPUT)/bench_count.o \
$(OUTPUT)/bench_rename.o \
$(OUTPUT)/bench_trigger.o \

View file

@ -359,6 +359,11 @@ extern const struct bench bench_trig_kprobe;
extern const struct bench bench_trig_fentry;
extern const struct bench bench_trig_fentry_sleep;
extern const struct bench bench_trig_fmodret;
/* uprobe/uretprobe triggering benchmarks: baseline plus nop/non-nop targets */
extern const struct bench bench_trig_uprobe_base;
extern const struct bench bench_trig_uprobe_with_nop;
extern const struct bench bench_trig_uretprobe_with_nop;
extern const struct bench bench_trig_uprobe_without_nop;
extern const struct bench bench_trig_uretprobe_without_nop;
extern const struct bench bench_rb_libbpf;
extern const struct bench bench_rb_custom;
extern const struct bench bench_pb_libbpf;
@ -385,6 +390,11 @@ static const struct bench *benchs[] = {
&bench_trig_fentry,
&bench_trig_fentry_sleep,
&bench_trig_fmodret,
&bench_trig_uprobe_base,
&bench_trig_uprobe_with_nop,
&bench_trig_uretprobe_with_nop,
&bench_trig_uprobe_without_nop,
&bench_trig_uretprobe_without_nop,
&bench_rb_libbpf,
&bench_rb_custom,
&bench_pb_libbpf,

View file

@ -2,6 +2,7 @@
/* Copyright (c) 2020 Facebook */
#include "bench.h"
#include "trigger_bench.skel.h"
#include "trace_helpers.h"
/* BPF triggering benchmarks */
static struct trigger_ctx {
@ -107,6 +108,101 @@ static void *trigger_consumer(void *input)
return NULL;
}
/* make sure call is not inlined and not avoided by compiler, so __weak and
* inline asm volatile in the body of the function
*
* There is a performance difference between uprobing at nop location vs other
* instructions. So use two different targets, one of which starts with nop
* and another doesn't.
*
* GCC doesn't generate stack setup preamble for these functions due to them
* having no input arguments and doing nothing in the body.
*/
/* Probe target whose body consists of a single explicit "nop" instruction. */
__weak void uprobe_target_with_nop(void)
{
	__asm__ __volatile__("nop");
}
/*
 * Probe target with an empty inline-asm body: the volatile asm statement
 * emits no instruction of its own but keeps the function from being
 * optimized away.
 */
__weak void uprobe_target_without_nop(void)
{
	__asm__ __volatile__("");
}
/*
 * Baseline producer thread: calls the nop target in an endless loop and
 * counts every call itself in base_hits.value. The loop never exits, so the
 * trailing return is only there to satisfy the thread-function signature.
 */
static void *uprobe_base_producer(void *input)
{
	for (;;) {
		uprobe_target_with_nop();
		atomic_inc(&base_hits.value);
	}

	return NULL;
}
/*
 * Producer thread for the "with-nop" benchmarks: calls
 * uprobe_target_with_nop() forever. It does no counting of its own.
 */
static void *uprobe_producer_with_nop(void *input)
{
	for (;;) {
		uprobe_target_with_nop();
	}

	return NULL;
}
/*
 * Producer thread for the "without-nop" benchmarks: calls
 * uprobe_target_without_nop() forever. It does no counting of its own.
 */
static void *uprobe_producer_without_nop(void *input)
{
	for (;;) {
		uprobe_target_without_nop();
	}

	return NULL;
}
/*
 * Common setup for the uprobe/uretprobe trigger benchmarks.
 *
 * Loads the trigger_bench skeleton, computes the file offset of the chosen
 * target function inside the running executable and attaches the
 * bench_trigger_uprobe program there for all PIDs.
 *
 * @use_retprobe: forwarded to bpf_program__attach_uprobe() as its retprobe
 *                flag (true selects a uretprobe, false an entry uprobe)
 * @use_nop:      true probes uprobe_target_with_nop(), false probes
 *                uprobe_target_without_nop()
 *
 * Every failure is reported on stderr and ends the process with exit(1).
 */
static void usetup(bool use_retprobe, bool use_nop)
{
	size_t uprobe_offset;
	ssize_t base_addr;
	struct bpf_link *link;

	setup_libbpf();

	ctx.skel = trigger_bench__open_and_load();
	if (!ctx.skel) {
		fprintf(stderr, "failed to open skeleton\n");
		exit(1);
	}

	base_addr = get_base_addr();
	/*
	 * The result is signed; treat a negative value as a failed lookup
	 * instead of folding it into the offset computation below.
	 * NOTE(review): confirm against get_base_addr()'s error convention.
	 */
	if (base_addr < 0) {
		fprintf(stderr, "failed to find base addr: %zd\n", base_addr);
		exit(1);
	}

	if (use_nop)
		uprobe_offset = get_uprobe_offset(&uprobe_target_with_nop, base_addr);
	else
		uprobe_offset = get_uprobe_offset(&uprobe_target_without_nop, base_addr);

	link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
					  use_retprobe,
					  -1 /* all PIDs */,
					  "/proc/self/exe",
					  uprobe_offset);
	if (!link) {
		fprintf(stderr, "failed to attach uprobe!\n");
		exit(1);
	}

	/* store the link in the skeleton's links struct */
	ctx.skel->links.bench_trigger_uprobe = link;
}
/* Setup hook: entry uprobe on uprobe_target_with_nop(). */
static void uprobe_setup_with_nop(void)
{
	usetup(false, true);
}
/* Setup hook: uretprobe on uprobe_target_with_nop(). */
static void uretprobe_setup_with_nop(void)
{
	usetup(true, true);
}
/* Setup hook: entry uprobe on uprobe_target_without_nop(). */
static void uprobe_setup_without_nop(void)
{
	usetup(false, false);
}
/* Setup hook: uretprobe on uprobe_target_without_nop(). */
static void uretprobe_setup_without_nop(void)
{
	usetup(true, false);
}
const struct bench bench_trig_base = {
.name = "trig-base",
.validate = trigger_validate,
@ -182,3 +278,53 @@ const struct bench bench_trig_fmodret = {
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
/*
 * Baseline: the producer calls the nop target with nothing attached and
 * counts the calls itself, so no setup hook is needed.
 */
const struct bench bench_trig_uprobe_base = {
	.name			= "trig-uprobe-base",
	.setup			= NULL, /* no uprobe/uretprobe is attached */
	.producer_thread	= uprobe_base_producer,
	.consumer_thread	= trigger_consumer,
	.measure		= trigger_base_measure,
	.report_final		= hits_drops_report_final,
	.report_progress	= hits_drops_report_progress,
};
/* Entry uprobe attached to the target that contains a nop. */
const struct bench bench_trig_uprobe_with_nop = {
	.name			= "trig-uprobe-with-nop",
	.setup			= uprobe_setup_with_nop,
	.producer_thread	= uprobe_producer_with_nop,
	.consumer_thread	= trigger_consumer,
	.measure		= trigger_measure,
	.report_final		= hits_drops_report_final,
	.report_progress	= hits_drops_report_progress,
};
/* Uretprobe attached to the target that contains a nop. */
const struct bench bench_trig_uretprobe_with_nop = {
	.name			= "trig-uretprobe-with-nop",
	.setup			= uretprobe_setup_with_nop,
	.producer_thread	= uprobe_producer_with_nop,
	.consumer_thread	= trigger_consumer,
	.measure		= trigger_measure,
	.report_final		= hits_drops_report_final,
	.report_progress	= hits_drops_report_progress,
};
/* Entry uprobe attached to the target that has no nop. */
const struct bench bench_trig_uprobe_without_nop = {
	.name			= "trig-uprobe-without-nop",
	.setup			= uprobe_setup_without_nop,
	.producer_thread	= uprobe_producer_without_nop,
	.consumer_thread	= trigger_consumer,
	.measure		= trigger_measure,
	.report_final		= hits_drops_report_final,
	.report_progress	= hits_drops_report_progress,
};
/* Uretprobe attached to the target that has no nop. */
const struct bench bench_trig_uretprobe_without_nop = {
	.name			= "trig-uretprobe-without-nop",
	.setup			= uretprobe_setup_without_nop,
	.producer_thread	= uprobe_producer_without_nop,
	.consumer_thread	= trigger_consumer,
	.measure		= trigger_measure,
	.report_final		= hits_drops_report_final,
	.report_progress	= hits_drops_report_progress,
};

View file

@ -52,3 +52,10 @@ int bench_trigger_fmodret(void *ctx)
__sync_add_and_fetch(&hits, 1);
return -22;
}
/*
 * BPF program for the uprobe/uretprobe trigger benchmarks: each invocation
 * atomically adds one to the file-scope hits counter and returns 0.
 * The ctx argument is unused.
 */
SEC("uprobe/self/uprobe_target")
int bench_trigger_uprobe(void *ctx)
{
/* atomic builtin; the returned new value is discarded */
__sync_add_and_fetch(&hits, 1);
return 0;
}