From 29788f39a4171dd48a6d19eb78cf2ab168c4349a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Mon, 29 Jan 2024 11:33:26 -0300
Subject: [PATCH 001/119] bpftool: Be more portable by using POSIX's basename()

musl libc used to provide the basename() prototype in string.h, but that is a glibc-ism; they have now removed the _GNU_SOURCE bits from their devel distro, Alpine Linux edge:

https://git.musl-libc.org/cgit/musl/commit/?id=725e17ed6dff4d0cd22487bb64470881e86a92e7

So let's use the POSIX version; the whole rationale is spelled out at:

https://gitlab.alpinelinux.org/alpine/aports/-/issues/15643

Signed-off-by: Arnaldo Carvalho de Melo
Signed-off-by: Daniel Borkmann
Acked-by: Jiri Olsa
Acked-by: Quentin Monnet
Link: https://lore.kernel.org/lkml/ZZhsPs00TI75RdAr@kernel.org
Link: https://lore.kernel.org/bpf/Zbe3NuOgaupvUcpF@kernel.org
---
 tools/bpf/bpftool/gen.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index ee3ce2b8000d..a9334c57e859 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -7,6 +7,7 @@
 #include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <libgen.h>
 #include <linux/err.h>
 #include <stdbool.h>
 #include <stdio.h>
@@ -56,9 +57,11 @@ static bool str_has_suffix(const char *str, const char *suffix)
 
 static void get_obj_name(char *name, const char *file)
 {
-	/* Using basename() GNU version which doesn't modify arg. */
-	strncpy(name, basename(file), MAX_OBJ_NAME_LEN - 1);
-	name[MAX_OBJ_NAME_LEN - 1] = '\0';
+	char file_copy[PATH_MAX];
+
+	/* Using basename() POSIX version to be more portable. */
+	strncpy(file_copy, file, PATH_MAX - 1)[PATH_MAX - 1] = '\0';
+	strncpy(name, basename(file_copy), MAX_OBJ_NAME_LEN - 1)[MAX_OBJ_NAME_LEN - 1] = '\0';
 	if (str_has_suffix(name, ".o"))
 		name[strlen(name) - 2] = '\0';
 	sanitize_identifier(name);

From ad57654053805bf9a62602aaec74cc78edb6f235 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Fri, 26 Jan 2024 14:09:44 -0800
Subject: [PATCH 002/119] libbpf: Fix faccessat() usage on Android

Android's libc implementation errors out with -EINVAL in faccessat() if passed AT_EACCESS ([0]); this leads to the ridiculous issue of libbpf refusing to load /sys/kernel/btf/vmlinux on Android ([1]). Fix this by detecting Android and redefining AT_EACCESS to 0, which is equivalent on Android.

[0] https://android.googlesource.com/platform/bionic/+/refs/heads/android13-release/libc/bionic/faccessat.cpp#50
[1] https://github.com/libbpf/libbpf-bootstrap/issues/250#issuecomment-1911324250

Fixes: 6a4ab8869d0b ("libbpf: Fix the case of running as non-root with capabilities")
Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Acked-by: Jiri Olsa
Link: https://lore.kernel.org/bpf/20240126220944.2497665-1-andrii@kernel.org
---
 tools/lib/bpf/libbpf_internal.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 930cc9616527..5b30f3b67a02 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -19,6 +19,20 @@
 #include <libelf.h>
 #include "relo_core.h"
 
+/* Android's libc doesn't support AT_EACCESS in faccessat() implementation
+ * ([0]), and just returns -EINVAL even if file exists and is accessible.
+ * See [1] for issues caused by this.
+ *
+ * So just redefine it to 0 on Android.
+ * + * [0] https://android.googlesource.com/platform/bionic/+/refs/heads/android13-release/libc/bionic/faccessat.cpp#50 + * [1] https://github.com/libbpf/libbpf-bootstrap/issues/250#issuecomment-1911324250 + */ +#ifdef __ANDROID__ +#undef AT_EACCESS +#define AT_EACCESS 0 +#endif + /* make sure libbpf doesn't use kernel-only integer typedefs */ #pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 From e33758f7493c9ad8cf6960bcf7c70f5761f3acfb Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 15 Jan 2024 13:12:30 +0000 Subject: [PATCH 003/119] riscv, bpf: Unify 32-bit sign-extension to emit_sextw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For code unification, add emit_sextw wrapper to unify all the 32-bit sign-extension operations. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240115131235.2914289-2-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit.h | 5 +++++ arch/riscv/net/bpf_jit_comp64.c | 10 +++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index a5ce1ab76ece..f9f8d86e762f 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -1087,6 +1087,11 @@ static inline void emit_subw(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) emit(rv_subw(rd, rs1, rs2), ctx); } +static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx) +{ + emit_addiw(rd, rs, 0, ctx); +} + #endif /* __riscv_xlen == 64 */ void bpf_jit_build_prologue(struct rv_jit_context *ctx); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 719a97e7edb2..46aaf5c1f95c 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -417,8 +417,8 @@ static void emit_zext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx) static void emit_sext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx) { - emit_addiw(RV_REG_T2, *rd, 0, ctx); - emit_addiw(RV_REG_T1, *rs, 0, ctx); + emit_sextw(RV_REG_T2, *rd, ctx); + emit_sextw(RV_REG_T1, *rs, ctx); *rd = RV_REG_T2; *rs = RV_REG_T1; } @@ -433,7 +433,7 @@ static void emit_zext_32_rd_t1(u8 *rd, struct rv_jit_context *ctx) static void emit_sext_32_rd(u8 *rd, struct rv_jit_context *ctx) { - emit_addiw(RV_REG_T2, *rd, 0, ctx); + emit_sextw(RV_REG_T2, *rd, ctx); *rd = RV_REG_T2; } @@ -1104,7 +1104,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit_srai(rd, RV_REG_T1, 64 - insn->off, ctx); break; case 32: - emit_addiw(rd, rs, 0, ctx); + emit_sextw(rd, rs, ctx); break; } if (!is64 && !aux->verifier_zext) @@ -1504,7 +1504,7 @@ out_be: * as t1 is used only in comparison against zero. */ if (!is64 && imm < 0) - emit_addiw(RV_REG_T1, RV_REG_T1, 0, ctx); + emit_sextw(RV_REG_T1, RV_REG_T1, ctx); e = ctx->ninsns; rvoff -= ninsns_rvoff(e - s); emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, ctx); From 914c7a5ff18a225f7df254ae3433574f3d47b711 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 15 Jan 2024 13:12:31 +0000 Subject: [PATCH 004/119] riscv, bpf: Unify 32-bit zero-extension to emit_zextw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For code unification, add emit_zextw wrapper to unify all the 32-bit zero-extension operations. 
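For reference, the slli/srli pair emitted by emit_zextw() is the standard shift-based zero-extension idiom. A minimal C sketch of the semantics on a 64-bit register value (illustrative only, not part of the JIT itself):

  #include <stdint.h>

  static inline uint64_t zextw(uint64_t rs)
  {
          /* slli rd, rs, 32: shift out the upper 32 bits */
          /* srli rd, rd, 32: shift back down, filling with zeroes */
          return (rs << 32) >> 32;
  }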
Signed-off-by: Pu Lehui
Signed-off-by: Daniel Borkmann
Tested-by: Björn Töpel
Acked-by: Björn Töpel
Link: https://lore.kernel.org/bpf/20240115131235.2914289-3-pulehui@huaweicloud.com
---
 arch/riscv/net/bpf_jit.h        |  6 +++
 arch/riscv/net/bpf_jit_comp64.c | 80 +++++++++++++++------------------
 2 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index f9f8d86e762f..e30501b46f8f 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -1092,6 +1092,12 @@ static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx)
 	emit_addiw(rd, rs, 0, ctx);
 }
 
+static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx)
+{
+	emit_slli(rd, rs, 32, ctx);
+	emit_srli(rd, rd, 32, ctx);
+}
+
 #endif /* __riscv_xlen == 64 */
 
 void bpf_jit_build_prologue(struct rv_jit_context *ctx);

diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 46aaf5c1f95c..d3d764ed32a2 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -326,12 +326,6 @@ static void emit_branch(u8 cond, u8 rd, u8 rs, int rvoff,
 	emit(rv_jalr(RV_REG_ZERO, RV_REG_T1, lower), ctx);
 }
 
-static void emit_zext_32(u8 reg, struct rv_jit_context *ctx)
-{
-	emit_slli(reg, reg, 32, ctx);
-	emit_srli(reg, reg, 32, ctx);
-}
-
 static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
 {
 	int tc_ninsn, off, start_insn = ctx->ninsns;
@@ -346,7 +340,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
 	 */
 	tc_ninsn = insn ? ctx->offset[insn] - ctx->offset[insn - 1] :
 		   ctx->offset[0];
-	emit_zext_32(RV_REG_A2, ctx);
+	emit_zextw(RV_REG_A2, RV_REG_A2, ctx);
 
 	off = offsetof(struct bpf_array, map.max_entries);
 	if (is_12b_check(off, insn))
@@ -408,9 +402,9 @@ static void init_regs(u8 *rd, u8 *rs, const struct bpf_insn *insn,
 
 static void emit_zext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
 {
 	emit_mv(RV_REG_T2, *rd, ctx);
-	emit_zext_32(RV_REG_T2, ctx);
+	emit_zextw(RV_REG_T2, RV_REG_T2, ctx);
 	emit_mv(RV_REG_T1, *rs, ctx);
-	emit_zext_32(RV_REG_T1, ctx);
+	emit_zextw(RV_REG_T1, RV_REG_T1, ctx);
 	*rd = RV_REG_T2;
 	*rs = RV_REG_T1;
 }
@@ -426,8 +420,8 @@ static void emit_sext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
 
 static void emit_zext_32_rd_t1(u8 *rd, struct rv_jit_context *ctx)
 {
 	emit_mv(RV_REG_T2, *rd, ctx);
-	emit_zext_32(RV_REG_T2, ctx);
-	emit_zext_32(RV_REG_T1, ctx);
+	emit_zextw(RV_REG_T2, RV_REG_T2, ctx);
+	emit_zextw(RV_REG_T1, RV_REG_T1, ctx);
 	*rd = RV_REG_T2;
 }
@@ -519,32 +513,32 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
 		emit(is64 ? rv_amoadd_d(rs, rs, rd, 0, 0) :
 		     rv_amoadd_w(rs, rs, rd, 0, 0), ctx);
 		if (!is64)
-			emit_zext_32(rs, ctx);
+			emit_zextw(rs, rs, ctx);
 		break;
 	case BPF_AND | BPF_FETCH:
 		emit(is64 ? rv_amoand_d(rs, rs, rd, 0, 0) :
 		     rv_amoand_w(rs, rs, rd, 0, 0), ctx);
 		if (!is64)
-			emit_zext_32(rs, ctx);
+			emit_zextw(rs, rs, ctx);
 		break;
 	case BPF_OR | BPF_FETCH:
 		emit(is64 ? rv_amoor_d(rs, rs, rd, 0, 0) :
 		     rv_amoor_w(rs, rs, rd, 0, 0), ctx);
 		if (!is64)
-			emit_zext_32(rs, ctx);
+			emit_zextw(rs, rs, ctx);
 		break;
 	case BPF_XOR | BPF_FETCH:
 		emit(is64 ? rv_amoxor_d(rs, rs, rd, 0, 0) :
 		     rv_amoxor_w(rs, rs, rd, 0, 0), ctx);
 		if (!is64)
-			emit_zext_32(rs, ctx);
+			emit_zextw(rs, rs, ctx);
 		break;
 	/* src_reg = atomic_xchg(dst_reg + off16, src_reg); */
 	case BPF_XCHG:
 		emit(is64 ?
rv_amoswap_d(rs, rs, rd, 0, 0) : rv_amoswap_w(rs, rs, rd, 0, 0), ctx); if (!is64) - emit_zext_32(rs, ctx); + emit_zextw(rs, rs, ctx); break; /* r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg); */ case BPF_CMPXCHG: @@ -1091,7 +1085,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_ALU64 | BPF_MOV | BPF_X: if (imm == 1) { /* Special mov32 for zext */ - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; } switch (insn->off) { @@ -1108,7 +1102,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, break; } if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; /* dst = dst OP src */ @@ -1116,7 +1110,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_ALU64 | BPF_ADD | BPF_X: emit_add(rd, rd, rs, ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU64 | BPF_SUB | BPF_X: @@ -1126,31 +1120,31 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit_subw(rd, rd, rs, ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU64 | BPF_AND | BPF_X: emit_and(rd, rd, rs, ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU64 | BPF_OR | BPF_X: emit_or(rd, rd, rs, ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU64 | BPF_XOR | BPF_X: emit_xor(rd, rd, rs, ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU64 | BPF_MUL | BPF_X: emit(is64 ? rv_mul(rd, rd, rs) : rv_mulw(rd, rd, rs), ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU64 | BPF_DIV | BPF_X: @@ -1159,7 +1153,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, else emit(is64 ? rv_divu(rd, rd, rs) : rv_divuw(rd, rd, rs), ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU64 | BPF_MOD | BPF_X: @@ -1168,25 +1162,25 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, else emit(is64 ? rv_remu(rd, rd, rs) : rv_remuw(rd, rd, rs), ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU64 | BPF_LSH | BPF_X: emit(is64 ? rv_sll(rd, rd, rs) : rv_sllw(rd, rd, rs), ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU64 | BPF_RSH | BPF_X: emit(is64 ? rv_srl(rd, rd, rs) : rv_srlw(rd, rd, rs), ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_ARSH | BPF_X: case BPF_ALU64 | BPF_ARSH | BPF_X: emit(is64 ? 
rv_sra(rd, rd, rs) : rv_sraw(rd, rd, rs), ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; /* dst = -dst */ @@ -1194,7 +1188,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_ALU64 | BPF_NEG: emit_sub(rd, RV_REG_ZERO, rd, ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; /* dst = BSWAP##imm(dst) */ @@ -1206,7 +1200,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, break; case 32: if (!aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case 64: /* Do nothing */ @@ -1268,7 +1262,7 @@ out_be: case BPF_ALU64 | BPF_MOV | BPF_K: emit_imm(rd, imm, ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; /* dst = dst OP imm */ @@ -1281,7 +1275,7 @@ out_be: emit_add(rd, rd, RV_REG_T1, ctx); } if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU64 | BPF_SUB | BPF_K: @@ -1292,7 +1286,7 @@ out_be: emit_sub(rd, rd, RV_REG_T1, ctx); } if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU64 | BPF_AND | BPF_K: @@ -1303,7 +1297,7 @@ out_be: emit_and(rd, rd, RV_REG_T1, ctx); } if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU64 | BPF_OR | BPF_K: @@ -1314,7 +1308,7 @@ out_be: emit_or(rd, rd, RV_REG_T1, ctx); } if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: @@ -1325,7 +1319,7 @@ out_be: emit_xor(rd, rd, RV_REG_T1, ctx); } if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_K: @@ -1333,7 +1327,7 @@ out_be: emit(is64 ? rv_mul(rd, rd, RV_REG_T1) : rv_mulw(rd, rd, RV_REG_T1), ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU64 | BPF_DIV | BPF_K: @@ -1345,7 +1339,7 @@ out_be: emit(is64 ? rv_divu(rd, rd, RV_REG_T1) : rv_divuw(rd, rd, RV_REG_T1), ctx); if (!is64 && !aux->verifier_zext) - emit_zext_32(rd, ctx); + emit_zextw(rd, rd, ctx); break; case BPF_ALU | BPF_MOD | BPF_K: case BPF_ALU64 | BPF_MOD | BPF_K: @@ -1357,14 +1351,14 @@ out_be: emit(is64 ? 
rv_remu(rd, rd, RV_REG_T1) :
 		     rv_remuw(rd, rd, RV_REG_T1), ctx);
 		if (!is64 && !aux->verifier_zext)
-			emit_zext_32(rd, ctx);
+			emit_zextw(rd, rd, ctx);
 		break;
 	case BPF_ALU | BPF_LSH | BPF_K:
 	case BPF_ALU64 | BPF_LSH | BPF_K:
 		emit_slli(rd, rd, imm, ctx);
 		if (!is64 && !aux->verifier_zext)
-			emit_zext_32(rd, ctx);
+			emit_zextw(rd, rd, ctx);
 		break;
 	case BPF_ALU | BPF_RSH | BPF_K:
 	case BPF_ALU64 | BPF_RSH | BPF_K:
@@ -1374,7 +1368,7 @@ out_be:
 			emit(rv_srliw(rd, rd, imm), ctx);
 
 		if (!is64 && !aux->verifier_zext)
-			emit_zext_32(rd, ctx);
+			emit_zextw(rd, rd, ctx);
 		break;
 	case BPF_ALU | BPF_ARSH | BPF_K:
 	case BPF_ALU64 | BPF_ARSH | BPF_K:
@@ -1384,7 +1378,7 @@ out_be:
 			emit(rv_sraiw(rd, rd, imm), ctx);
 
 		if (!is64 && !aux->verifier_zext)
-			emit_zext_32(rd, ctx);
+			emit_zextw(rd, rd, ctx);
 		break;
 
 	/* JUMP off */

From 361db44c3c59cde05e9926647f16255e274a37f4 Mon Sep 17 00:00:00 2001
From: Pu Lehui
Date: Mon, 15 Jan 2024 13:12:32 +0000
Subject: [PATCH 005/119] riscv, bpf: Simplify sext and zext logic in branch instructions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are many extension helpers in the current branch instructions, and the implementation is a bit complicated. Simplify this logic through two simple extension helpers that use an alternate register.

Signed-off-by: Pu Lehui
Signed-off-by: Daniel Borkmann
Tested-by: Björn Töpel
Acked-by: Björn Töpel
Link: https://lore.kernel.org/bpf/20240115131235.2914289-4-pulehui@huaweicloud.com
---
 arch/riscv/net/bpf_jit_comp64.c | 79 +++++++++++++--------------------
 1 file changed, 31 insertions(+), 48 deletions(-)

diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index d3d764ed32a2..0a26842535ea 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -141,6 +141,19 @@ static bool in_auipc_jalr_range(s64 val)
 		val < ((1L << 31) - (1L << 11));
 }
 
+/* Modify rd pointer to alternate reg to avoid corrupting original reg */
+static void emit_sextw_alt(u8 *rd, u8 ra, struct rv_jit_context *ctx)
+{
+	emit_sextw(ra, *rd, ctx);
+	*rd = ra;
+}
+
+static void emit_zextw_alt(u8 *rd, u8 ra, struct rv_jit_context *ctx)
+{
+	emit_zextw(ra, *rd, ctx);
+	*rd = ra;
+}
+
 /* Emit fixed-length instructions for address */
 static int emit_addr(u8 rd, u64 addr, bool extra_pass, struct rv_jit_context *ctx)
 {
@@ -399,38 +412,6 @@ static void init_regs(u8 *rd, u8 *rs, const struct bpf_insn *insn,
 	*rs = bpf_to_rv_reg(insn->src_reg, ctx);
 }
 
-static void emit_zext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
-{
-	emit_mv(RV_REG_T2, *rd, ctx);
-	emit_zextw(RV_REG_T2, RV_REG_T2, ctx);
-	emit_mv(RV_REG_T1, *rs, ctx);
-	emit_zextw(RV_REG_T1, RV_REG_T1, ctx);
-	*rd = RV_REG_T2;
-	*rs = RV_REG_T1;
-}
-
-static void emit_sext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
-{
-	emit_sextw(RV_REG_T2, *rd, ctx);
-	emit_sextw(RV_REG_T1, *rs, ctx);
-	*rd = RV_REG_T2;
-	*rs = RV_REG_T1;
-}
-
-static void emit_zext_32_rd_t1(u8 *rd, struct rv_jit_context *ctx)
-{
-	emit_mv(RV_REG_T2, *rd, ctx);
-	emit_zextw(RV_REG_T2, RV_REG_T2, ctx);
-	emit_zextw(RV_REG_T1, RV_REG_T1, ctx);
-	*rd = RV_REG_T2;
-}
-
-static void emit_sext_32_rd(u8 *rd, struct rv_jit_context *ctx)
-{
-	emit_sextw(RV_REG_T2, *rd, ctx);
-	*rd = RV_REG_T2;
-}
-
 static int emit_jump_and_link(u8 rd, s64 rvoff, bool fixed_addr,
 			      struct rv_jit_context *ctx)
 {
@@ -1419,10 +1400,13 @@ out_be:
 		rvoff = rv_offset(i, off, ctx);
 		if (!is64) {
 			s = ctx->ninsns;
-			if (is_signed_bpf_cond(BPF_OP(code)))
-				emit_sext_32_rd_rs(&rd, &rs, ctx);
-			else
-				emit_zext_32_rd_rs(&rd, &rs, ctx);
+			if (is_signed_bpf_cond(BPF_OP(code))) {
+				emit_sextw_alt(&rs, RV_REG_T1, ctx);
+				emit_sextw_alt(&rd, RV_REG_T2, ctx);
+			} else {
+				emit_zextw_alt(&rs, RV_REG_T1, ctx);
+				emit_zextw_alt(&rd, RV_REG_T2, ctx);
+			}
 			e = ctx->ninsns;
 
 			/* Adjust for extra insns */
@@ -1433,8 +1417,7 @@ out_be:
 			/* Adjust for and */
 			rvoff -= 4;
 			emit_and(RV_REG_T1, rd, rs, ctx);
-			emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff,
-				    ctx);
+			emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, ctx);
 		} else {
 			emit_branch(BPF_OP(code), rd, rs, rvoff, ctx);
 		}
@@ -1463,18 +1446,18 @@ out_be:
 	case BPF_JMP32 | BPF_JSLE | BPF_K:
 		rvoff = rv_offset(i, off, ctx);
 		s = ctx->ninsns;
-		if (imm) {
+		if (imm)
 			emit_imm(RV_REG_T1, imm, ctx);
-			rs = RV_REG_T1;
-		} else {
-			/* If imm is 0, simply use zero register. */
-			rs = RV_REG_ZERO;
-		}
+		rs = imm ? RV_REG_T1 : RV_REG_ZERO;
 		if (!is64) {
-			if (is_signed_bpf_cond(BPF_OP(code)))
-				emit_sext_32_rd(&rd, ctx);
-			else
-				emit_zext_32_rd_t1(&rd, ctx);
+			if (is_signed_bpf_cond(BPF_OP(code))) {
+				emit_sextw_alt(&rd, RV_REG_T2, ctx);
+				/* rs has been sign extended */
+			} else {
+				emit_zextw_alt(&rd, RV_REG_T2, ctx);
+				if (imm)
+					emit_zextw(rs, rs, ctx);
+			}
 		}
 		e = ctx->ninsns;

From 647b93f65daa128d9a0e4aac744a5fcf5f58b2d2 Mon Sep 17 00:00:00 2001
From: Pu Lehui
Date: Mon, 15 Jan 2024 13:12:33 +0000
Subject: [PATCH 006/119] riscv, bpf: Add necessary Zbb instructions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add necessary Zbb instructions introduced by [0] to reduce code size and improve performance of the RV64 JIT. Meanwhile, a runtime detection helper is added to check whether the CPU supports Zbb instructions.

Signed-off-by: Pu Lehui
Signed-off-by: Daniel Borkmann
Tested-by: Björn Töpel
Acked-by: Björn Töpel
Link: https://github.com/riscv/riscv-bitmanip/releases/download/1.0.0/bitmanip-1.0.0-38-g865e7a7.pdf [0]
Link: https://lore.kernel.org/bpf/20240115131235.2914289-5-pulehui@huaweicloud.com
---
 arch/riscv/net/bpf_jit.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index e30501b46f8f..51f6d214086f 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -18,6 +18,11 @@ static inline bool rvc_enabled(void)
 	return IS_ENABLED(CONFIG_RISCV_ISA_C);
 }
 
+static inline bool rvzbb_enabled(void)
+{
+	return IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBB);
+}
+
 enum {
 	RV_REG_ZERO = 0,	/* The constant value 0 */
 	RV_REG_RA = 1,		/* Return address */
@@ -730,6 +735,33 @@ static inline u16 rvc_swsp(u32 imm8, u8 rs2)
 	return rv_css_insn(0x6, imm, rs2, 0x2);
 }
 
+/* RVZBB instructions. */
+static inline u32 rvzbb_sextb(u8 rd, u8 rs1)
+{
+	return rv_i_insn(0x604, rs1, 1, rd, 0x13);
+}
+
+static inline u32 rvzbb_sexth(u8 rd, u8 rs1)
+{
+	return rv_i_insn(0x605, rs1, 1, rd, 0x13);
+}
+
+static inline u32 rvzbb_zexth(u8 rd, u8 rs)
+{
+	if (IS_ENABLED(CONFIG_64BIT))
+		return rv_i_insn(0x80, rs, 4, rd, 0x3b);
+
+	return rv_i_insn(0x80, rs, 4, rd, 0x33);
+}
+
+static inline u32 rvzbb_rev8(u8 rd, u8 rs)
+{
+	if (IS_ENABLED(CONFIG_64BIT))
+		return rv_i_insn(0x6b8, rs, 5, rd, 0x13);
+
+	return rv_i_insn(0x698, rs, 5, rd, 0x13);
+}
+
 /*
  * RV64-only instructions.
 *

From 519fb722bea09ae2664ad21f8ef4360fb799eb2f Mon Sep 17 00:00:00 2001
From: Pu Lehui
Date: Mon, 15 Jan 2024 13:12:34 +0000
Subject: [PATCH 007/119] riscv, bpf: Optimize sign-extension mov insns with Zbb support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add 8-bit and 16-bit sign-extension wrappers with Zbb support to optimize sign-extension mov instructions.

Signed-off-by: Pu Lehui
Signed-off-by: Daniel Borkmann
Tested-by: Björn Töpel
Acked-by: Björn Töpel
Link: https://lore.kernel.org/bpf/20240115131235.2914289-6-pulehui@huaweicloud.com
---
 arch/riscv/net/bpf_jit.h        | 22 ++++++++++++++++++++++
 arch/riscv/net/bpf_jit_comp64.c |  5 +++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index 51f6d214086f..b00c5c0591d2 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -1119,6 +1119,28 @@ static inline void emit_subw(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx)
 	emit(rv_subw(rd, rs1, rs2), ctx);
 }
 
+static inline void emit_sextb(u8 rd, u8 rs, struct rv_jit_context *ctx)
+{
+	if (rvzbb_enabled()) {
+		emit(rvzbb_sextb(rd, rs), ctx);
+		return;
+	}
+
+	emit_slli(rd, rs, 56, ctx);
+	emit_srai(rd, rd, 56, ctx);
+}
+
+static inline void emit_sexth(u8 rd, u8 rs, struct rv_jit_context *ctx)
+{
+	if (rvzbb_enabled()) {
+		emit(rvzbb_sexth(rd, rs), ctx);
+		return;
+	}
+
+	emit_slli(rd, rs, 48, ctx);
+	emit_srai(rd, rd, 48, ctx);
+}
+
 static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx)
 {
 	emit_addiw(rd, rs, 0, ctx);

diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 0a26842535ea..fc1a334e2f70 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1074,9 +1074,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 			emit_mv(rd, rs, ctx);
 			break;
 		case 8:
+			emit_sextb(rd, rs, ctx);
+			break;
 		case 16:
-			emit_slli(RV_REG_T1, rs, 64 - insn->off, ctx);
-			emit_srai(rd, RV_REG_T1, 64 - insn->off, ctx);
+			emit_sexth(rd, rs, ctx);
 			break;
 		case 32:
 			emit_sextw(rd, rs, ctx);

From 06a33d024838414432b6c0f51f994e7f1695b74f Mon Sep 17 00:00:00 2001
From: Pu Lehui
Date: Mon, 15 Jan 2024 13:12:35 +0000
Subject: [PATCH 008/119] riscv, bpf: Optimize bswap insns with Zbb support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Optimize bswap instructions with the rev8 Zbb instruction combined with the srli instruction, and optimize 16-bit zero-extension with Zbb support.
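For reference, the rev8-based strategy reverses all eight bytes of the source register and then shifts right so that only the requested width remains in the low bits. A C sketch of the semantics (illustrative only; imm is 16, 32 or 64, as in the BPF BSWAP encoding):

  #include <stdint.h>

  static inline uint64_t bswap_via_rev8(uint64_t rd, int imm)
  {
          uint64_t r = __builtin_bswap64(rd); /* rev8 rd, rd */
          int bits = 64 - imm;

          return bits ? r >> bits : r;        /* srli rd, rd, bits */
  }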
Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240115131235.2914289-7-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit.h | 69 +++++++++++++++++++++++++++++++++ arch/riscv/net/bpf_jit_comp64.c | 50 +----------------------- 2 files changed, 71 insertions(+), 48 deletions(-) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index b00c5c0591d2..8b35f12a4452 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -1146,12 +1146,81 @@ static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx) emit_addiw(rd, rs, 0, ctx); } +static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) +{ + if (rvzbb_enabled()) { + emit(rvzbb_zexth(rd, rs), ctx); + return; + } + + emit_slli(rd, rs, 48, ctx); + emit_srli(rd, rd, 48, ctx); +} + static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) { emit_slli(rd, rs, 32, ctx); emit_srli(rd, rd, 32, ctx); } +static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx) +{ + if (rvzbb_enabled()) { + int bits = 64 - imm; + + emit(rvzbb_rev8(rd, rd), ctx); + if (bits) + emit_srli(rd, rd, bits, ctx); + return; + } + + emit_li(RV_REG_T2, 0, ctx); + + emit_andi(RV_REG_T1, rd, 0xff, ctx); + emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); + emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); + emit_srli(rd, rd, 8, ctx); + if (imm == 16) + goto out_be; + + emit_andi(RV_REG_T1, rd, 0xff, ctx); + emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); + emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); + emit_srli(rd, rd, 8, ctx); + + emit_andi(RV_REG_T1, rd, 0xff, ctx); + emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); + emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); + emit_srli(rd, rd, 8, ctx); + if (imm == 32) + goto out_be; + + emit_andi(RV_REG_T1, rd, 0xff, ctx); + emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); + emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); + emit_srli(rd, rd, 8, ctx); + + emit_andi(RV_REG_T1, rd, 0xff, ctx); + emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); + emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); + emit_srli(rd, rd, 8, ctx); + + emit_andi(RV_REG_T1, rd, 0xff, ctx); + emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); + emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); + emit_srli(rd, rd, 8, ctx); + + emit_andi(RV_REG_T1, rd, 0xff, ctx); + emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); + emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); + emit_srli(rd, rd, 8, ctx); +out_be: + emit_andi(RV_REG_T1, rd, 0xff, ctx); + emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); + + emit_mv(rd, RV_REG_T2, ctx); +} + #endif /* __riscv_xlen == 64 */ void bpf_jit_build_prologue(struct rv_jit_context *ctx); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index fc1a334e2f70..fda6b4f6a4c1 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1177,8 +1177,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_ALU | BPF_END | BPF_FROM_LE: switch (imm) { case 16: - emit_slli(rd, rd, 48, ctx); - emit_srli(rd, rd, 48, ctx); + emit_zexth(rd, rd, ctx); break; case 32: if (!aux->verifier_zext) @@ -1189,54 +1188,9 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, break; } break; - case BPF_ALU | BPF_END | BPF_FROM_BE: case BPF_ALU64 | BPF_END | BPF_FROM_LE: - emit_li(RV_REG_T2, 0, ctx); - - emit_andi(RV_REG_T1, rd, 0xff, ctx); - emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); - emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); - emit_srli(rd, rd, 
8, ctx); - if (imm == 16) - goto out_be; - - emit_andi(RV_REG_T1, rd, 0xff, ctx); - emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); - emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); - emit_srli(rd, rd, 8, ctx); - - emit_andi(RV_REG_T1, rd, 0xff, ctx); - emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); - emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); - emit_srli(rd, rd, 8, ctx); - if (imm == 32) - goto out_be; - - emit_andi(RV_REG_T1, rd, 0xff, ctx); - emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); - emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); - emit_srli(rd, rd, 8, ctx); - - emit_andi(RV_REG_T1, rd, 0xff, ctx); - emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); - emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); - emit_srli(rd, rd, 8, ctx); - - emit_andi(RV_REG_T1, rd, 0xff, ctx); - emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); - emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); - emit_srli(rd, rd, 8, ctx); - - emit_andi(RV_REG_T1, rd, 0xff, ctx); - emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); - emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx); - emit_srli(rd, rd, 8, ctx); -out_be: - emit_andi(RV_REG_T1, rd, 0xff, ctx); - emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx); - - emit_mv(rd, RV_REG_T2, ctx); + emit_bswap(rd, imm, ctx); break; /* dst = imm */ From f149d03f450b4afab11f5e1ebd8fdfaf7eb24a28 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 28 Jan 2024 19:43:57 +0800 Subject: [PATCH 009/119] selftests/bpf: Drop return in bpf_testmod_exit bpf_testmod_exit() does not need to have a return value (given the void), so this patch drops this useless 'return' in it. Signed-off-by: Geliang Tang Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/5765b287ea088f0c820f2a834faf9b20fb2f8215.1706442113.git.tanggeliang@kylinos.cn --- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 8befaf17d454..6f163a0f1c94 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -619,7 +619,7 @@ static void bpf_testmod_exit(void) while (refcount_read(&prog_test_struct.cnt) > 1) msleep(20); - return sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); + sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } module_init(bpf_testmod_init); From efaa47db92451608499ab7edf108bf30141c33db Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 28 Jan 2024 13:54:43 +0800 Subject: [PATCH 010/119] bpf: Remove unused field "mod" in struct bpf_trampoline It seems that the field "mod" in struct bpf_trampoline is not used anywhere after the commit 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules"). So we can just remove it now. 
Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules") Signed-off-by: Menglong Dong Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240128055443.413291-1-dongmenglong.8@bytedance.com --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b86bd15a051d..1ebbee1d648e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1189,7 +1189,6 @@ struct bpf_trampoline { int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image; - struct module *mod; }; struct bpf_attach_target_info { From ced33f2cfa21a14a292a00e31dc9f85c1bfbda1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 27 Jan 2024 11:46:29 -0800 Subject: [PATCH 011/119] docs/bpf: Improve documentation of 64-bit immediate instructions For 64-bit immediate instruction, 'BPF_IMM | BPF_DW | BPF_LD' and src_reg=[0-6], the current documentation describes the 64-bit immediate is constructed by: imm64 = (next_imm << 32) | imm But actually imm64 is only used when src_reg=0. For all other variants (src_reg != 0), 'imm' and 'next_imm' have separate special encoding requirement and imm64 cannot be easily used to describe instruction semantics. This patch clarifies that 64-bit immediate instructions use two 32-bit immediate values instead of a 64-bit immediate value, so later describing individual 64-bit immediate instructions becomes less confusing. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Dave Thaler Link: https://lore.kernel.org/bpf/20240127194629.737589-1-yonghong.song@linux.dev --- .../bpf/standardization/instruction-set.rst | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index af43227b6ee4..fceacca46299 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -166,7 +166,7 @@ Note that most instructions do not use all of the fields. Unused fields shall be cleared to zero. As discussed below in `64-bit immediate instructions`_, a 64-bit immediate -instruction uses a 64-bit immediate value that is constructed as follows. +instruction uses two 32-bit immediate values that are constructed as follows. The 64 bits following the basic instruction contain a pseudo instruction using the same format but with opcode, dst_reg, src_reg, and offset all set to zero, and imm containing the high 32 bits of the immediate value. @@ -181,13 +181,8 @@ This is depicted in the following figure:: '--------------' pseudo instruction -Thus the 64-bit immediate value is constructed as follows: - - imm64 = (next_imm << 32) | imm - -where 'next_imm' refers to the imm value of the pseudo instruction -following the basic instruction. The unused bytes in the pseudo -instruction are reserved and shall be cleared to zero. +Here, the imm value of the pseudo instruction is called 'next_imm'. The unused +bytes in the pseudo instruction are reserved and shall be cleared to zero. 
Instruction classes ------------------- @@ -590,7 +585,7 @@ defined further below: ========================= ====== === ========================================= =========== ============== opcode construction opcode src pseudocode imm type dst type ========================= ====== === ========================================= =========== ============== -BPF_IMM | BPF_DW | BPF_LD 0x18 0x0 dst = imm64 integer integer +BPF_IMM | BPF_DW | BPF_LD 0x18 0x0 dst = (next_imm << 32) | imm integer integer BPF_IMM | BPF_DW | BPF_LD 0x18 0x1 dst = map_by_fd(imm) map fd map BPF_IMM | BPF_DW | BPF_LD 0x18 0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data pointer BPF_IMM | BPF_DW | BPF_LD 0x18 0x3 dst = var_addr(imm) variable id data pointer From 0e6d0a9d2348b64df74239e859fa9d6e86cdcdef Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 25 Jan 2024 12:55:04 -0800 Subject: [PATCH 012/119] libbpf: integrate __arg_ctx feature detector into kernel_supports() Now that feature detection code is in bpf-next tree, integrate __arg_ctx kernel-side support into kernel_supports() framework. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240125205510.3642094-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/features.c | 58 +++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.c | 65 +-------------------------------- tools/lib/bpf/libbpf_internal.h | 2 + 3 files changed, 61 insertions(+), 64 deletions(-) diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index 5a5c766bf615..6b0738ad7063 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -407,6 +407,61 @@ static int probe_kern_btf_enum64(int token_fd) strs, sizeof(strs), token_fd)); } +static int probe_kern_arg_ctx_tag(int token_fd) +{ + static const char strs[] = "\0a\0b\0arg:ctx\0"; + const __u32 types[] = { + /* [1] INT */ + BTF_TYPE_INT_ENC(1 /* "a" */, BTF_INT_SIGNED, 0, 32, 4), + /* [2] PTR -> VOID */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0), + /* [3] FUNC_PROTO `int(void *a)` */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), + BTF_PARAM_ENC(1 /* "a" */, 2), + /* [4] FUNC 'a' -> FUNC_PROTO (main prog) */ + BTF_TYPE_ENC(1 /* "a" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 3), + /* [5] FUNC_PROTO `int(void *b __arg_ctx)` */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), + BTF_PARAM_ENC(3 /* "b" */, 2), + /* [6] FUNC 'b' -> FUNC_PROTO (subprog) */ + BTF_TYPE_ENC(3 /* "b" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 5), + /* [7] DECL_TAG 'arg:ctx' -> func 'b' arg 'b' */ + BTF_TYPE_DECL_TAG_ENC(5 /* "arg:ctx" */, 6, 0), + }; + const struct bpf_insn insns[] = { + /* main prog */ + BPF_CALL_REL(+1), + BPF_EXIT_INSN(), + /* global subprog */ + BPF_EMIT_CALL(BPF_FUNC_get_func_ip), /* needs PTR_TO_CTX */ + BPF_EXIT_INSN(), + }; + const struct bpf_func_info_min func_infos[] = { + { 0, 4 }, /* main prog -> FUNC 'a' */ + { 2, 6 }, /* subprog -> FUNC 'b' */ + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .token_fd = token_fd, + .prog_flags = token_fd ? 
BPF_F_TOKEN_FD : 0, + ); + int prog_fd, btf_fd, insn_cnt = ARRAY_SIZE(insns); + + btf_fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs), token_fd); + if (btf_fd < 0) + return 0; + + opts.prog_btf_fd = btf_fd; + opts.func_info = &func_infos; + opts.func_info_cnt = ARRAY_SIZE(func_infos); + opts.func_info_rec_size = sizeof(func_infos[0]); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, "det_arg_ctx", + "GPL", insns, insn_cnt, &opts); + close(btf_fd); + + return probe_fd(prog_fd); +} + typedef int (*feature_probe_fn)(int /* token_fd */); static struct kern_feature_cache feature_cache; @@ -476,6 +531,9 @@ static struct kern_feature_desc { [FEAT_UPROBE_MULTI_LINK] = { "BPF multi-uprobe link support", probe_uprobe_multi_link, }, + [FEAT_ARG_CTX_TAG] = { + "kernel-side __arg_ctx tag", probe_kern_arg_ctx_tag, + }, }; bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fa7094ff3e66..41ab7a21f868 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6462,69 +6462,6 @@ static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_progr return fn_id; } -static int probe_kern_arg_ctx_tag(void) -{ - /* To minimize merge conflicts with BPF token series that refactors - * feature detection code a lot, we don't integrate - * probe_kern_arg_ctx_tag() into kernel_supports() feature-detection - * framework yet, doing our own caching internally. - * This will be cleaned up a bit later when bpf/bpf-next trees settle. - */ - static int cached_result = -1; - static const char strs[] = "\0a\0b\0arg:ctx\0"; - const __u32 types[] = { - /* [1] INT */ - BTF_TYPE_INT_ENC(1 /* "a" */, BTF_INT_SIGNED, 0, 32, 4), - /* [2] PTR -> VOID */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0), - /* [3] FUNC_PROTO `int(void *a)` */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), - BTF_PARAM_ENC(1 /* "a" */, 2), - /* [4] FUNC 'a' -> FUNC_PROTO (main prog) */ - BTF_TYPE_ENC(1 /* "a" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 3), - /* [5] FUNC_PROTO `int(void *b __arg_ctx)` */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), - BTF_PARAM_ENC(3 /* "b" */, 2), - /* [6] FUNC 'b' -> FUNC_PROTO (subprog) */ - BTF_TYPE_ENC(3 /* "b" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 5), - /* [7] DECL_TAG 'arg:ctx' -> func 'b' arg 'b' */ - BTF_TYPE_DECL_TAG_ENC(5 /* "arg:ctx" */, 6, 0), - }; - const struct bpf_insn insns[] = { - /* main prog */ - BPF_CALL_REL(+1), - BPF_EXIT_INSN(), - /* global subprog */ - BPF_EMIT_CALL(BPF_FUNC_get_func_ip), /* needs PTR_TO_CTX */ - BPF_EXIT_INSN(), - }; - const struct bpf_func_info_min func_infos[] = { - { 0, 4 }, /* main prog -> FUNC 'a' */ - { 2, 6 }, /* subprog -> FUNC 'b' */ - }; - LIBBPF_OPTS(bpf_prog_load_opts, opts); - int prog_fd, btf_fd, insn_cnt = ARRAY_SIZE(insns); - - if (cached_result >= 0) - return cached_result; - - btf_fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs), 0); - if (btf_fd < 0) - return 0; - - opts.prog_btf_fd = btf_fd; - opts.func_info = &func_infos; - opts.func_info_cnt = ARRAY_SIZE(func_infos); - opts.func_info_rec_size = sizeof(func_infos[0]); - - prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, "det_arg_ctx", - "GPL", insns, insn_cnt, &opts); - close(btf_fd); - - cached_result = probe_fd(prog_fd); - return cached_result; -} - /* Check if main program or global subprog's function prototype has `arg:ctx` * argument tags, and, if necessary, substitute 
correct type to match what BPF * verifier would expect, taking into account specific program type. This @@ -6549,7 +6486,7 @@ static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_progra return 0; /* don't do any fix ups if kernel natively supports __arg_ctx */ - if (probe_kern_arg_ctx_tag() > 0) + if (kernel_supports(obj, FEAT_ARG_CTX_TAG)) return 0; /* some BPF program types just don't have named context structs, so diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 5b30f3b67a02..ad936ac5e639 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -372,6 +372,8 @@ enum kern_feature_id { FEAT_SYSCALL_WRAPPER, /* BPF multi-uprobe link support */ FEAT_UPROBE_MULTI_LINK, + /* Kernel supports arg:ctx tag (__arg_ctx) for global subprogs natively */ + FEAT_ARG_CTX_TAG, __FEAT_CNT, }; From 9eea8fafe33eb70868f6ace2fc1e17c4ff5539c3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 25 Jan 2024 12:55:05 -0800 Subject: [PATCH 013/119] libbpf: fix __arg_ctx type enforcement for perf_event programs Adjust PERF_EVENT type enforcement around __arg_ctx to match exactly what kernel is doing. Fixes: 76ec90a996e3 ("libbpf: warn on unexpected __arg_ctx type when rewriting BTF") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240125205510.3642094-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 41ab7a21f868..db65ea59a05a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -6339,6 +6340,14 @@ static struct { /* all other program types don't have "named" context structs */ }; +/* forward declarations for arch-specific underlying types of bpf_user_pt_regs_t typedef, + * for below __builtin_types_compatible_p() checks; + * with this approach we don't need any extra arch-specific #ifdef guards + */ +struct pt_regs; +struct user_pt_regs; +struct user_regs_struct; + static bool need_func_arg_type_fixup(const struct btf *btf, const struct bpf_program *prog, const char *subprog_name, int arg_idx, int arg_type_id, const char *ctx_name) @@ -6379,11 +6388,21 @@ static bool need_func_arg_type_fixup(const struct btf *btf, const struct bpf_pro /* special cases */ switch (prog->type) { case BPF_PROG_TYPE_KPROBE: - case BPF_PROG_TYPE_PERF_EVENT: /* `struct pt_regs *` is expected, but we need to fix up */ if (btf_is_struct(t) && strcmp(tname, "pt_regs") == 0) return true; break; + case BPF_PROG_TYPE_PERF_EVENT: + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) && + btf_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) && + btf_is_struct(t) && strcmp(tname, "user_pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) && + btf_is_struct(t) && strcmp(tname, "user_regs_struct") == 0) + return 0; + break; case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: /* allow u64* as ctx */ From add9c58cd44e88a15f285945e26bf0d9d81c5890 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 25 Jan 2024 12:55:06 -0800 Subject: [PATCH 014/119] bpf: move arg:ctx type enforcement check inside the main logic loop Now that bpf and bpf-next trees converged and we don't run the risk of merge 
conflicts, move btf_validate_prog_ctx_type() into its most logical place inside the main logic loop.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20240125205510.3642094-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/btf.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index edef96ceffa3..9ec08cfb2967 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7112,6 +7112,10 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 				bpf_log(log, "arg#%d has invalid combination of tags\n", i);
 				return -EINVAL;
 			}
+			if ((tags & ARG_TAG_CTX) &&
+			    btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
+						       prog->expected_attach_type))
+				return -EINVAL;
 			sub->args[i].arg_type = ARG_PTR_TO_CTX;
 			continue;
 		}
@@ -7156,23 +7160,6 @@ skip_pointer:
 		return -EINVAL;
 	}
 
-	for (i = 0; i < nargs; i++) {
-		const char *tag;
-
-		if (sub->args[i].arg_type != ARG_PTR_TO_CTX)
-			continue;
-
-		/* check if arg has "arg:ctx" tag */
-		t = btf_type_by_id(btf, args[i].type);
-		tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
-		if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0)
-			continue;
-
-		if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
-					       prog->expected_attach_type))
-			return -EINVAL;
-	}
-
 	sub->arg_cnt = nargs;
 	sub->args_cached = true;

From fbaf59a9f513416c05f4b4e87d26898d3dccd1cc Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Fri, 26 Jan 2024 18:50:17 -0800
Subject: [PATCH 015/119] selftests/bpf: Remove "&>" usage in the selftests

In s390, CI reported that the sock_iter_batch selftest hits this error very often:

2024-01-26T16:56:49.3091804Z Bind /proc/self/ns/net -> /run/netns/sock_iter_batch_netns failed: No such file or directory
2024-01-26T16:56:49.3149524Z Cannot remove namespace file "/run/netns/sock_iter_batch_netns": No such file or directory
2024-01-26T16:56:49.3772213Z test_sock_iter_batch:FAIL:ip netns add sock_iter_batch_netns unexpected error: 256 (errno 0)

It happens very often on s390, but Manu also noticed it happening sparsely on other arches as well. It turns out the default dash shell does not recognize "&>" as a redirection operator, so the command goes to the background. In the sock_iter_batch selftest, the "ip netns delete" went into the background and then raced with the following "ip netns add" command.

This patch replaces the "&> /dev/null" usage with ">/dev/null 2>&1" and does this redirection in the SYS_NOFAIL macro instead of doing it individually in each caller. The SYS_NOFAIL callers do not care about failure, so it does no harm to do this redirection even if some of the existing callers do not redirect to /dev/null now.

It touches different test files, so I skipped the Fixes tags in this patch. Some of the changed tests do not use "&>" but do use SYS_NOFAIL; these tests are also changed to avoid doing their own redirection, because SYS_NOFAIL does it internally now.
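To illustrate the failure mode (a minimal standalone sketch, not part of the patch; the netns name is taken from the CI log above): when system() runs commands through dash, "&>" splits into "&" (background) plus ">", so the first command below can still be running when the second one starts:

  #include <stdlib.h>

  int main(void)
  {
          /* Under dash this parses as "ip netns delete ... &" followed by
           * "> /dev/null", i.e. the delete is backgrounded and races with
           * the add below.
           */
          system("ip netns delete sock_iter_batch_netns &> /dev/null");
          /* May observe the netns mid-deletion and fail, as in the CI error. */
          return system("ip netns add sock_iter_batch_netns");
  }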
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240127025017.950825-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/decap_sanity.c | 2 +- .../selftests/bpf/prog_tests/fib_lookup.c | 2 +- .../selftests/bpf/prog_tests/ip_check_defrag.c | 4 ++-- .../selftests/bpf/prog_tests/lwt_redirect.c | 2 +- .../selftests/bpf/prog_tests/lwt_reroute.c | 2 +- tools/testing/selftests/bpf/prog_tests/mptcp.c | 2 +- .../selftests/bpf/prog_tests/sock_destroy.c | 2 +- .../selftests/bpf/prog_tests/sock_iter_batch.c | 4 ++-- .../selftests/bpf/prog_tests/test_tunnel.c | 18 +++++++++--------- tools/testing/selftests/bpf/test_progs.h | 7 ++++++- 10 files changed, 25 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c index 5c0ebe6ba866..dcb9e5070cc3 100644 --- a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c +++ b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c @@ -72,6 +72,6 @@ fail: bpf_tc_hook_destroy(&qdisc_hook); close_netns(nstoken); } - SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null"); + SYS_NOFAIL("ip netns del " NS_TEST); decap_sanity__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c index 4ad4cd69152e..3379df2d4cf2 100644 --- a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c @@ -298,6 +298,6 @@ void test_fib_lookup(void) fail: if (nstoken) close_netns(nstoken); - SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null"); + SYS_NOFAIL("ip netns del " NS_TEST); fib_lookup__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c index 57c814f5f6a7..8dd2af9081f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c +++ b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c @@ -59,9 +59,9 @@ static int setup_topology(bool ipv6) /* Wait for up to 5s for links to come up */ for (i = 0; i < 5; ++i) { if (ipv6) - up = !system("ip netns exec " NS0 " ping -6 -c 1 -W 1 " VETH1_ADDR6 " &>/dev/null"); + up = !SYS_NOFAIL("ip netns exec " NS0 " ping -6 -c 1 -W 1 " VETH1_ADDR6); else - up = !system("ip netns exec " NS0 " ping -c 1 -W 1 " VETH1_ADDR " &>/dev/null"); + up = !SYS_NOFAIL("ip netns exec " NS0 " ping -c 1 -W 1 " VETH1_ADDR); if (up) break; diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c index 59b38569f310..beeb3ac1c361 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c @@ -85,7 +85,7 @@ static void ping_dev(const char *dev, bool is_ingress) snprintf(ip, sizeof(ip), "20.0.0.%d", link_index); /* We won't get a reply. Don't fail here */ - SYS_NOFAIL("ping %s -c1 -W1 -s %d >/dev/null 2>&1", + SYS_NOFAIL("ping %s -c1 -W1 -s %d", ip, ICMP_PAYLOAD_SIZE); } diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c index f4bb2d5fcae0..5610bc76928d 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c @@ -63,7 +63,7 @@ static void ping_once(const char *ip) { /* We won't get a reply. 
Don't fail here */ - SYS_NOFAIL("ping %s -c1 -W1 -s %d >/dev/null 2>&1", + SYS_NOFAIL("ping %s -c1 -W1 -s %d", ip, ICMP_PAYLOAD_SIZE); } diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index 7c0be7cf550b..8f8d792307c1 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -79,7 +79,7 @@ static void cleanup_netns(struct nstoken *nstoken) if (nstoken) close_netns(nstoken); - SYS_NOFAIL("ip netns del %s &> /dev/null", NS_TEST); + SYS_NOFAIL("ip netns del %s", NS_TEST); } static int verify_tsk(int map_fd, int client_fd) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_destroy.c b/tools/testing/selftests/bpf/prog_tests/sock_destroy.c index b0583309a94e..9c11938fe597 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_destroy.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_destroy.c @@ -214,7 +214,7 @@ void test_sock_destroy(void) cleanup: if (nstoken) close_netns(nstoken); - SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null"); + SYS_NOFAIL("ip netns del " TEST_NS); if (cgroup_fd >= 0) close(cgroup_fd); sock_destroy_prog__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c index 0c365f36c73b..d56e18b25528 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c @@ -112,7 +112,7 @@ void test_sock_iter_batch(void) { struct nstoken *nstoken = NULL; - SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null"); + SYS_NOFAIL("ip netns del " TEST_NS); SYS(done, "ip netns add %s", TEST_NS); SYS(done, "ip -net %s link set dev lo up", TEST_NS); @@ -131,5 +131,5 @@ void test_sock_iter_batch(void) close_netns(nstoken); done: - SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null"); + SYS_NOFAIL("ip netns del " TEST_NS); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 2b3c6dd66259..5f1fb0a2ea56 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -118,9 +118,9 @@ fail: static void cleanup(void) { SYS_NOFAIL("test -f /var/run/netns/at_ns0 && ip netns delete at_ns0"); - SYS_NOFAIL("ip link del veth1 2> /dev/null"); - SYS_NOFAIL("ip link del %s 2> /dev/null", VXLAN_TUNL_DEV1); - SYS_NOFAIL("ip link del %s 2> /dev/null", IP6VXLAN_TUNL_DEV1); + SYS_NOFAIL("ip link del veth1"); + SYS_NOFAIL("ip link del %s", VXLAN_TUNL_DEV1); + SYS_NOFAIL("ip link del %s", IP6VXLAN_TUNL_DEV1); } static int add_vxlan_tunnel(void) @@ -265,9 +265,9 @@ fail: static void delete_ipip_tunnel(void) { SYS_NOFAIL("ip -n at_ns0 link delete dev %s", IPIP_TUNL_DEV0); - SYS_NOFAIL("ip -n at_ns0 fou del port 5555 2> /dev/null"); + SYS_NOFAIL("ip -n at_ns0 fou del port 5555"); SYS_NOFAIL("ip link delete dev %s", IPIP_TUNL_DEV1); - SYS_NOFAIL("ip fou del port 5555 2> /dev/null"); + SYS_NOFAIL("ip fou del port 5555"); } static int add_xfrm_tunnel(void) @@ -346,13 +346,13 @@ fail: static void delete_xfrm_tunnel(void) { - SYS_NOFAIL("ip xfrm policy delete dir out src %s/32 dst %s/32 2> /dev/null", + SYS_NOFAIL("ip xfrm policy delete dir out src %s/32 dst %s/32", IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0); - SYS_NOFAIL("ip xfrm policy delete dir in src %s/32 dst %s/32 2> /dev/null", + SYS_NOFAIL("ip xfrm policy delete dir in src %s/32 dst %s/32", IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1); - SYS_NOFAIL("ip xfrm 
state delete src %s dst %s proto esp spi %d 2> /dev/null",
+	SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d",
 		   IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT);
-	SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
+	SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d",
 		   IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN);
 }

diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index 2f9f6f250f17..80df51244886 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -385,10 +385,15 @@ int test__join_cgroup(const char *path);
 		goto goto_label; \
 	})
 
+#define ALL_TO_DEV_NULL " >/dev/null 2>&1"
+
 #define SYS_NOFAIL(fmt, ...)						\
 	({								\
 		char cmd[1024];						\
-		snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__);		\
+		int n;							\
+		n = snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__);	\
+		if (n < sizeof(cmd) && sizeof(cmd) - n >= sizeof(ALL_TO_DEV_NULL)) \
+			strcat(cmd, ALL_TO_DEV_NULL);			\
 		system(cmd);						\
 	})

From 646751d523587cfd7ebcf1733298ecd470879eda Mon Sep 17 00:00:00 2001
From: "Jose E. Marchesi"
Date: Sat, 27 Jan 2024 11:07:02 +0100
Subject: [PATCH 016/119] bpf: Use -Wno-error in certain tests when building with GCC

Certain BPF selftests contain code that, albeit being legal C, triggers warnings in GCC that cannot be disabled. This is the case, for example, for the tests

progs/btf_dump_test_case_bitfields.c
progs/btf_dump_test_case_namespacing.c
progs/btf_dump_test_case_packing.c
progs/btf_dump_test_case_padding.c
progs/btf_dump_test_case_syntax.c

which contain struct type declarations inside function parameter lists.

This is problematic, because:

- The BPF selftests are built with -Werror.

- The Clang and GCC compilers sometimes differ in their handling of warnings. One compiler may emit warnings for code that the other compiles silently, and one compiler may offer the possibility to disable certain warnings, while the other doesn't.

In order to overcome this problem, this patch modifies the tools/testing/selftests/bpf/Makefile in order to:

1. Enable the possibility of specifying per-source-file extra CFLAGS. This is done by defining a make variable like:

   <source-filename>-CFLAGS := <whatever-cflags>

   and then modifying the proper Make rule in order to use these flags when compiling <source-filename>.

2. Use the mechanism above to add -Wno-error to CFLAGS for the following selftests:

   progs/btf_dump_test_case_bitfields.c
   progs/btf_dump_test_case_namespacing.c
   progs/btf_dump_test_case_packing.c
   progs/btf_dump_test_case_padding.c
   progs/btf_dump_test_case_syntax.c

Note the corresponding <source-filename>-CFLAGS variables for these files are defined only if the selftests are being built with GCC.

Note that, while compiler pragmas can generally be used to disable particular warnings per file, this 1) is only possible for warnings that actually can be disabled in the command line, i.e. that have -Wno-FOO options, and 2) doesn't apply to -Wno-error.

Tested in bpf-next master branch. No regressions.

Signed-off-by: Jose E.
Marchesi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240127100702.21549-1-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/Makefile | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index fd15017ed3b1..1a3654bcb5dd 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -64,6 +64,15 @@ TEST_INST_SUBDIRS := no_alu32 ifneq ($(BPF_GCC),) TEST_GEN_PROGS += test_progs-bpf_gcc TEST_INST_SUBDIRS += bpf_gcc + +# The following tests contain C code that, although technically legal, +# triggers GCC warnings that cannot be disabled: declaration of +# anonymous struct types in function parameter lists. +progs/btf_dump_test_case_bitfields.c-CFLAGS := -Wno-error +progs/btf_dump_test_case_namespacing.c-CFLAGS := -Wno-error +progs/btf_dump_test_case_packing.c-CFLAGS := -Wno-error +progs/btf_dump_test_case_padding.c-CFLAGS := -Wno-error +progs/btf_dump_test_case_syntax.c-CFLAGS := -Wno-error endif ifneq ($(CLANG_CPUV4),) @@ -504,7 +513,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $(wildcard $(BPFDIR)/*.bpf.h) \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ - $(TRUNNER_BPF_CFLAGS)) + $(TRUNNER_BPF_CFLAGS) \ + $$($$<-CFLAGS)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) From aecaa3ed48c3ae74c06f5e8ef0746b69c62397f1 Mon Sep 17 00:00:00 2001 From: Florian Lehner Date: Sat, 20 Jan 2024 16:09:20 +0100 Subject: [PATCH 017/119] perf/bpf: Fix duplicate type check Remove the duplicate check on type and unify result. Signed-off-by: Florian Lehner Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20240120150920.3370-1-dev@der-flo.net --- kernel/events/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f0f0f71213a1..5ecfa57e3b97 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9302,10 +9302,6 @@ void perf_event_bpf_event(struct bpf_prog *prog, { struct perf_bpf_event bpf_event; - if (type <= PERF_BPF_EVENT_UNKNOWN || - type >= PERF_BPF_EVENT_MAX) - return; - switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case PERF_BPF_EVENT_PROG_UNLOAD: @@ -9313,7 +9309,7 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_event_bpf_emit_ksymbols(prog, type); break; default: - break; + return; } if (!atomic_read(&nr_bpf_events)) From f2e4040c82d3fddd11fa7c64e8f810e6f9cb7460 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 25 Jan 2024 15:18:40 -0800 Subject: [PATCH 018/119] libbpf: Add some details for BTF parsing failures As CONFIG_DEBUG_INFO_BTF is default off the existing "failed to find valid kernel BTF" message makes diagnosing the kernel build issue somewhat cryptic. Add a little more detail with the hope of helping users. Before: ``` libbpf: failed to find valid kernel BTF libbpf: Error loading vmlinux BTF: -3 ``` After not accessible: ``` libbpf: kernel BTF is missing at '/sys/kernel/btf/vmlinux', was CONFIG_DEBUG_INFO_BTF enabled? 
libbpf: failed to find valid kernel BTF libbpf: Error loading vmlinux BTF: -3 ``` After not readable: ``` libbpf: failed to read kernel BTF from (/sys/kernel/btf/vmlinux): -1 ``` Closes: https://lore.kernel.org/bpf/CAP-5=fU+DN_+Y=Y4gtELUsJxKNDDCOvJzPHvjUVaUoeFAzNnig@mail.gmail.com/ Signed-off-by: Ian Rogers Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240125231840.1647951-1-irogers@google.com --- tools/lib/bpf/btf.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index ec92b87cae01..95db88b36cf3 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4932,10 +4932,9 @@ static int btf_dedup_remap_types(struct btf_dedup *d) */ struct btf *btf__load_vmlinux_btf(void) { + const char *sysfs_btf_path = "/sys/kernel/btf/vmlinux"; + /* fall back locations, trying to find vmlinux on disk */ const char *locations[] = { - /* try canonical vmlinux BTF through sysfs first */ - "/sys/kernel/btf/vmlinux", - /* fall back to trying to find vmlinux on disk otherwise */ "/boot/vmlinux-%1$s", "/lib/modules/%1$s/vmlinux-%1$s", "/lib/modules/%1$s/build/vmlinux", @@ -4949,8 +4948,23 @@ struct btf *btf__load_vmlinux_btf(void) struct btf *btf; int i, err; - uname(&buf); + /* is canonical sysfs location accessible? */ + if (faccessat(AT_FDCWD, sysfs_btf_path, F_OK, AT_EACCESS) < 0) { + pr_warn("kernel BTF is missing at '%s', was CONFIG_DEBUG_INFO_BTF enabled?\n", + sysfs_btf_path); + } else { + btf = btf__parse(sysfs_btf_path, NULL); + if (!btf) { + err = -errno; + pr_warn("failed to read kernel BTF from '%s': %d\n", sysfs_btf_path, err); + return libbpf_err_ptr(err); + } + pr_debug("loaded kernel BTF from '%s'\n", sysfs_btf_path); + return btf; + } + /* try fallback locations */ + uname(&buf); for (i = 0; i < ARRAY_SIZE(locations); i++) { snprintf(path, PATH_MAX, locations[i], buf.release);

From ff2071a7b7fd77908417603c4a785822939b3841 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Sat, 27 Jan 2024 19:50:31 +0100 Subject: [PATCH 019/119] bpf: Generate const static pointers for kernel helpers

The generated bpf_helper_defs.h file currently contains definitions like this for the kernel helpers, which are static objects:

  static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1;

These work well in both clang and GCC because both compilers do constant propagation with -O1 and higher optimization, resulting in `call 1' BPF instructions being generated, which are calls to kernel helpers.

However, there is a discrepancy in how the -Wunused-variable warning (activated by -Wall) is handled in these compilers:

- clang will not emit -Wunused-variable warnings for static variables defined in C header files, whether constant or not.

- GCC will not emit -Wunused-variable warnings for _constant_ static variables defined in header files, but it will emit warnings for non-constant static variables defined in header files.

There is no reason for these bpf_helper_defs.h pointers to not be declared constant, and it is actually desirable to do so, since their values are not to be changed. So this patch modifies bpf_doc.py to generate prototypes like:

  static void *(* const bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1;

This allows GCC to not error while compiling BPF programs with `-Wall -Werror', while still being able to detect and error on legitimate unused variables in the programs themselves.
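To illustrate the difference the patch relies on (a minimal sketch, not taken from the generated header; the names are made up), compiling a translation unit that includes a header with the two definitions below and never uses them makes GCC with -Wall diagnose the first but not the second:

  /* non-constant static pointer: GCC emits -Wunused-variable if unused */
  static void *(*helper_var)(void *map, const void *key) = (void *) 1;

  /* constant static pointer: neither GCC nor clang warns */
  static void *(* const helper_const)(void *map, const void *key) = (void *) 1;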
This change doesn't impact the desired constant propagation in either Clang or GCC with -O1 and higher. On the contrary, being declared as constant may increase the odds they get constant folded when used/referred to in certain circumstances.

Tested in bpf-next master. No regressions.

Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240127185031.29854-1-jose.marchesi@oracle.com --- scripts/bpf_doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 61b7dddedc46..2b94749c99cc 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -827,7 +827,7 @@ class PrinterHelpers(Printer): print(' *{}{}'.format(' \t' if line else '', line)) print(' */') - print('static %s %s(*%s)(' % (self.map_type(proto['ret_type']), + print('static %s %s(* const %s)(' % (self.map_type(proto['ret_type']), proto['ret_star'], proto['name']), end='') comma = '' for i, a in enumerate(proto['args']):

From 6668e818f960b0f32110a9efa7c97351a5771b35 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Sat, 27 Jan 2024 21:48:56 +0800 Subject: [PATCH 020/119] bpf,token: Use BIT_ULL() to convert the bit mask

Replace the '(1ULL << *)' with the macro BIT_ULL(nr).

Signed-off-by: Haiyue Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240127134901.3698613-1-haiyue.wang@intel.com --- kernel/bpf/token.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 0bca93b60c43..d6ccf8d00eab 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -72,28 +72,28 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) u64 mask; BUILD_BUG_ON(__MAX_BPF_CMD >= 64); - mask = (1ULL << __MAX_BPF_CMD) - 1; + mask = BIT_ULL(__MAX_BPF_CMD) - 1; if ((token->allowed_cmds & mask) == mask) seq_printf(m, "allowed_cmds:\tany\n"); else seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); - mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; + mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1; if ((token->allowed_maps & mask) == mask) seq_printf(m, "allowed_maps:\tany\n"); else seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); - mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; + mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1; if ((token->allowed_progs & mask) == mask) seq_printf(m, "allowed_progs:\tany\n"); else seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); - mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; + mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1; if ((token->allowed_attachs & mask) == mask) seq_printf(m, "allowed_attachs:\tany\n"); else @@ -253,7 +253,7 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) { if (!token) return false; - if (!(token->allowed_cmds & (1ULL << cmd))) + if (!(token->allowed_cmds & BIT_ULL(cmd))) return false; return security_bpf_token_cmd(token, cmd) == 0; } @@ -263,7 +263,7 @@ bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type t if (!token || type >= __MAX_BPF_MAP_TYPE) return false; - return token->allowed_maps & (1ULL << type); + return token->allowed_maps & BIT_ULL(type); } bool bpf_token_allow_prog_type(const struct bpf_token *token, @@ -273,6 +273,6 @@ bool bpf_token_allow_prog_type(const struct bpf_token *token, if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >=
__MAX_BPF_ATTACH_TYPE) return false; - return (token->allowed_progs & (1ULL << prog_type)) && - (token->allowed_attachs & (1ULL << attach_type)); + return (token->allowed_progs & BIT_ULL(prog_type)) && + (token->allowed_attachs & BIT_ULL(attach_type)); }

From 27a90b14b93d3b2e1efd10764e456af7e2a42991 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 30 Jan 2024 12:03:43 +0100 Subject: [PATCH 021/119] bpf: Build type-punning BPF selftests with -fno-strict-aliasing

A few BPF selftests perform type punning and they may break strict aliasing rules, which are exploited by both GCC and clang by default while optimizing. This can lead to broken compiled programs.

This patch disables strict aliasing for these particular tests, by means of the -fno-strict-aliasing command line option. This will make sure these tests are optimized properly even if some strict aliasing rule gets violated.

After this patch, GCC is able to build all the selftests without warning about potential strict aliasing issues.

bpf@vger discussion on strict aliasing and BPF selftests: https://lore.kernel.org/bpf/bae1205a-b6e5-4e46-8e20-520d7c327f7a@linux.dev/T/#t

Tested in bpf-next master. No regressions.

Signed-off-by: Jose E. Marchesi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/bae1205a-b6e5-4e46-8e20-520d7c327f7a@linux.dev Link: https://lore.kernel.org/bpf/20240130110343.11217-1-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1a3654bcb5dd..79253a376fc8 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -41,6 +41,19 @@ CFLAGS += -g $(OPT_FLAGS) -rdynamic \ LDFLAGS += $(SAN_LDFLAGS) LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread +# The following tests perform type punning and they may break strict +# aliasing rules, which are exploited by both GCC and clang by default +# while optimizing. This can lead to broken programs. +progs/bind4_prog.c-CFLAGS := -fno-strict-aliasing +progs/bind6_prog.c-CFLAGS := -fno-strict-aliasing +progs/dynptr_fail.c-CFLAGS := -fno-strict-aliasing +progs/linked_list_fail.c-CFLAGS := -fno-strict-aliasing +progs/map_kptr_fail.c-CFLAGS := -fno-strict-aliasing +progs/syscall.c-CFLAGS := -fno-strict-aliasing +progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing +progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing +progs/timer_crash.c-CFLAGS := -fno-strict-aliasing + ifneq ($(LLVM),) # Silence some warnings when compiled with clang CFLAGS += -Wno-unused-command-line-argument

From 24219056805f3988bf93e494499b2329453fc706 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 30 Jan 2024 12:36:24 +0100 Subject: [PATCH 022/119] bpf: Move -Wno-compare-distinct-pointer-types to BPF_CFLAGS

Clang supports enabling/disabling certain conversion diagnostics via the -W[no-]compare-distinct-pointer-types command line options. Disabling this warning is required by some BPF selftests due to -Werror. Until very recently GCC would emit these warnings unconditionally, which was a problem for gcc-bpf, but we added support for the command-line options to GCC upstream [1].

This patch moves the -Wno-compare-distinct-pointer-types from CLANG_CFLAGS to BPF_CFLAGS in selftests/bpf/Makefile so the option is also used in gcc-bpf builds, not just in clang builds.

Tested in bpf-next master. No regressions.
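For reference, a minimal sketch (not taken from the selftests) of the kind of code this diagnostic fires on; under -Werror the warning becomes a build failure unless the -Wno- option is passed:

  static int cmp(int *pi, long *pl)
  {
  	return pi == pl; /* warning: comparison of distinct pointer types */
  }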
[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627769.html

Signed-off-by: Jose E. Marchesi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240130113624.24940-1-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 79253a376fc8..a38a3001527c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -404,11 +404,11 @@ endif CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ - -I$(abspath $(OUTPUT)/../usr/include) + -I$(abspath $(OUTPUT)/../usr/include) \ + -Wno-compare-distinct-pointer-types # TODO: enable me -Wsign-compare -CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ - -Wno-compare-distinct-pointer-types +CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) $(OUTPUT)/test_l4lb_noinline.o: BPF_CFLAGS += -fno-inline $(OUTPUT)/test_xdp_noinline.o: BPF_CFLAGS += -fno-inline

From e2b3c4ff5d183da6d1863c2321413406a2752e7a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jan 2024 16:06:45 -0800 Subject: [PATCH 023/119] bpf: add __arg_trusted global func arg tag

Add support for passing PTR_TO_BTF_ID registers to global subprogs. Currently only the PTR_TRUSTED flavor of PTR_TO_BTF_ID is supported. Non-NULL semantics are assumed, so the caller will be forced to prove the PTR_TO_BTF_ID can't be NULL.

Note, we disallow global subprogs from destroying passed-in PTR_TO_BTF_ID arguments, even trusted ones. We achieve that by not setting ref_obj_id when validating subprog code. This basically enforces (in Rust terms) borrowing semantics vs move semantics. Borrowing semantics seems to be a better fit for the isolated global subprog validation approach.

Implementation-wise, we utilize the existing logic for matching a user-provided BTF type to a kernel-side BTF type, used by BPF CO-RE logic and following the same matching rules. We enforce a unique match for types.
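As a usage sketch (illustrative only; the function name is made up, and the convenience macro wrapping this tag is only added to bpf_helpers.h later in this series), a global subprog opts in by spelling the raw BTF decl tag on the parameter:

  __weak int subprog_task_pid(struct task_struct *task
  			__attribute__((btf_decl_tag("arg:trusted"))))
  {
  	/* non-NULL semantics: the verifier makes callers prove task != NULL */
  	return task->pid;
  }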
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240130000648.2144827-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/btf.c | 99 +++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 24 +++++++++ 3 files changed, 111 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7f5816482a10..0dcde339dc7e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -610,6 +610,7 @@ struct bpf_subprog_arg_info { enum bpf_arg_type arg_type; union { u32 mem_size; + u32 btf_id; }; }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9ec08cfb2967..ed7a05815984 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6985,9 +6985,77 @@ static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t) return false; } +struct bpf_cand_cache { + const char *name; + u32 name_len; + u16 kind; + u16 cnt; + struct { + const struct btf *btf; + u32 id; + } cands[]; +}; + +static DEFINE_MUTEX(cand_cache_mutex); + +static struct bpf_cand_cache * +bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id); + +static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx, + const struct btf *btf, const struct btf_type *t) +{ + struct bpf_cand_cache *cc; + struct bpf_core_ctx ctx = { + .btf = btf, + .log = log, + }; + u32 kern_type_id, type_id; + int err = 0; + + /* skip PTR and modifiers */ + type_id = t->type; + t = btf_type_by_id(btf, t->type); + while (btf_type_is_modifier(t)) { + type_id = t->type; + t = btf_type_by_id(btf, t->type); + } + + mutex_lock(&cand_cache_mutex); + cc = bpf_core_find_cands(&ctx, type_id); + if (IS_ERR(cc)) { + err = PTR_ERR(cc); + bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n", + arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), + err); + goto cand_cache_unlock; + } + if (cc->cnt != 1) { + bpf_log(log, "arg#%d reference type('%s %s') %s\n", + arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), + cc->cnt == 0 ? "has no matches" : "is ambiguous"); + err = cc->cnt == 0 ? 
-ENOENT : -ESRCH; + goto cand_cache_unlock; + } + if (btf_is_module(cc->cands[0].btf)) { + bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n", + arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); + err = -EOPNOTSUPP; + goto cand_cache_unlock; + } + kern_type_id = cc->cands[0].id; + +cand_cache_unlock: + mutex_unlock(&cand_cache_mutex); + if (err) + return err; + + return kern_type_id; +} + enum btf_arg_tag { ARG_TAG_CTX = 0x1, ARG_TAG_NONNULL = 0x2, + ARG_TAG_TRUSTED = 0x4, }; /* Process BTF of a function to produce high-level expectation of function @@ -7089,6 +7157,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) if (strcmp(tag, "ctx") == 0) { tags |= ARG_TAG_CTX; + } else if (strcmp(tag, "trusted") == 0) { + tags |= ARG_TAG_TRUSTED; } else if (strcmp(tag, "nonnull") == 0) { tags |= ARG_TAG_NONNULL; } else { @@ -7127,6 +7197,22 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; continue; } + if (tags & ARG_TAG_TRUSTED) { + int kern_type_id; + + if (tags & ARG_TAG_NONNULL) { + bpf_log(log, "arg#%d has invalid combination of tags\n", i); + return -EINVAL; + } + + kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); + if (kern_type_id < 0) + return kern_type_id; + + sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED; + sub->args[i].btf_id = kern_type_id; + continue; + } if (is_global) { /* generic user data pointer */ u32 mem_size; @@ -8229,17 +8315,6 @@ size_t bpf_core_essential_name_len(const char *name) return n; } -struct bpf_cand_cache { - const char *name; - u32 name_len; - u16 kind; - u16 cnt; - struct { - const struct btf *btf; - u32 id; - } cands[]; -}; - static void bpf_free_cands(struct bpf_cand_cache *cands) { if (!cands->cnt) @@ -8260,8 +8335,6 @@ static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE]; #define MODULE_CAND_CACHE_SIZE 31 static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE]; -static DEFINE_MUTEX(cand_cache_mutex); - static void __print_cand_cache(struct bpf_verifier_log *log, struct bpf_cand_cache **cache, int cache_size) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c5d68a9d8acc..cd4d780e5400 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9336,6 +9336,18 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); if (ret) return ret; + } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { + struct bpf_call_arg_meta meta; + int err; + + if (register_is_null(reg) && type_may_be_null(arg->arg_type)) + continue; + + memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ + err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); + err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); + if (err) + return err; } else { bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", i, arg->arg_type); @@ -20137,6 +20149,18 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); reg->mem_size = arg->mem_size; reg->id = ++env->id_gen; + } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { + reg->type = PTR_TO_BTF_ID; + if (arg->arg_type & PTR_MAYBE_NULL) + reg->type |= PTR_MAYBE_NULL; + if (arg->arg_type & PTR_UNTRUSTED) + reg->type |= PTR_UNTRUSTED; + if (arg->arg_type & PTR_TRUSTED) + reg->type |= PTR_TRUSTED; + mark_reg_known_zero(env, regs, i); + reg->btf = 
bpf_get_btf_vmlinux(); /* can't fail at this point */ + reg->btf_id = arg->btf_id; + reg->id = ++env->id_gen; } else { WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", i - BPF_REG_1, arg->arg_type);

From 8f2b44cd9d69ec36c9ce9623993978babb575ee8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jan 2024 16:06:46 -0800 Subject: [PATCH 024/119] bpf: add arg:nullable tag to be combined with trusted pointers

Add the ability to mark arg:trusted arguments with an optional arg:nullable tag, turning them into the PTR_TO_BTF_ID_OR_NULL variant. This allows callers to pass NULL, and subsequently forces the global subprog's code to do a NULL check. It makes it possible to have "optional" PTR_TO_BTF_ID values passed into global subprogs.

For now arg:nullable cannot be combined with anything else.

Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240130000648.2144827-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 10 ++++++++++ 1 file changed, 10 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed7a05815984..c8c6e6cf18e7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7056,6 +7056,7 @@ enum btf_arg_tag { ARG_TAG_CTX = 0x1, ARG_TAG_NONNULL = 0x2, ARG_TAG_TRUSTED = 0x4, + ARG_TAG_NULLABLE = 0x8, }; /* Process BTF of a function to produce high-level expectation of function @@ -7161,6 +7162,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tags |= ARG_TAG_TRUSTED; } else if (strcmp(tag, "nonnull") == 0) { tags |= ARG_TAG_NONNULL; + } else if (strcmp(tag, "nullable") == 0) { + tags |= ARG_TAG_NULLABLE; } else { bpf_log(log, "arg#%d has unsupported set of tags\n", i); return -EOPNOTSUPP; @@ -7210,12 +7213,19 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return kern_type_id; sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED; + if (tags & ARG_TAG_NULLABLE) + sub->args[i].arg_type |= PTR_MAYBE_NULL; sub->args[i].btf_id = kern_type_id; continue; } if (is_global) { /* generic user data pointer */ u32 mem_size; + if (tags & ARG_TAG_NULLABLE) { + bpf_log(log, "arg#%d has invalid combination of tags\n", i); + return -EINVAL; + } + t = btf_type_skip_modifiers(btf, t->type, NULL); ref_t = btf_resolve_size(btf, t, &mem_size); if (IS_ERR(ref_t)) {

From d28bb1a86e68a3d523e0acee8281bb904dd7f451 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jan 2024 16:06:47 -0800 Subject: [PATCH 025/119] libbpf: add __arg_trusted and __arg_nullable tag macros

Add __arg_trusted to annotate global func args that accept trusted PTR_TO_BTF_ID arguments. Also add __arg_nullable to combine with __arg_trusted (and maybe other tags in the future) to force the global subprog itself (i.e., the callee) to do NULL checks, as opposed to the default non-NULL semantics (and thus the caller's responsibility to ensure non-NULL values).
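A short usage sketch combining both macros (mirroring the selftests added at the end of this series; the subprog name is illustrative):

  __weak int subprog_check_task(struct task_struct *task __arg_trusted __arg_nullable)
  {
  	if (!task) /* the verifier forces this NULL check in the callee */
  		return 0;
  	return task->pid;
  }

A caller may then pass either a trusted non-NULL task pointer or a literal NULL.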
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240130000648.2144827-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf_helpers.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 2324cc42b017..79eaa581be98 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -190,6 +190,8 @@ enum libbpf_tristate { #define __arg_ctx __attribute__((btf_decl_tag("arg:ctx"))) #define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull"))) +#define __arg_nullable __attribute((btf_decl_tag("arg:nullable"))) +#define __arg_trusted __attribute((btf_decl_tag("arg:trusted"))) #ifndef ___bpf_concat #define ___bpf_concat(a, b) a ## b From c381203eadb76d5601fc04b814317e7608af5f5c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jan 2024 16:06:48 -0800 Subject: [PATCH 026/119] selftests/bpf: add trusted global subprog arg tests Add a bunch of test cases validating behavior of __arg_trusted and its combination with __arg_nullable tag. We also validate CO-RE flavor support by kernel for __arg_trusted args. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240130000648.2144827-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../bpf/progs/verifier_global_ptr_args.c | 156 ++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index d62c5bf00e71..9c6072a19745 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -28,6 +28,7 @@ #include "verifier_div0.skel.h" #include "verifier_div_overflow.skel.h" #include "verifier_global_subprogs.skel.h" +#include "verifier_global_ptr_args.skel.h" #include "verifier_gotol.skel.h" #include "verifier_helper_access_var_len.skel.h" #include "verifier_helper_packet_access.skel.h" @@ -140,6 +141,7 @@ void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_st void test_verifier_div0(void) { RUN(verifier_div0); } void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } +void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); } void test_verifier_gotol(void) { RUN(verifier_gotol); } void test_verifier_helper_access_var_len(void) { RUN(verifier_helper_access_var_len); } void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_access); } diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c new file mode 100644 index 000000000000..484d6262363f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include +#include +#include "bpf_misc.h" +#include "xdp_metadata.h" +#include "bpf_kfuncs.h" + +extern struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym __weak; +extern void bpf_task_release(struct task_struct *p) __ksym __weak; + +__weak int subprog_trusted_task_nullable(struct task_struct *task __arg_trusted __arg_nullable) +{ + if (!task) + return 0; + return task->pid + task->tgid; +} + +SEC("?kprobe") +__success __log_level(2) +__msg("Validating subprog_trusted_task_nullable() func#1...") +__msg(": R1=trusted_ptr_or_null_task_struct(") +int trusted_task_arg_nullable(void *ctx) +{ + struct task_struct *t = bpf_get_current_task_btf(); + + return subprog_trusted_task_nullable(t) + subprog_trusted_task_nullable(NULL); +} + +__weak int subprog_trusted_task_nonnull(struct task_struct *task __arg_trusted) +{ + return task->pid + task->tgid; +} + +SEC("?kprobe") +__failure __log_level(2) +__msg("R1 type=scalar expected=ptr_, trusted_ptr_, rcu_ptr_") +__msg("Caller passes invalid args into func#1 ('subprog_trusted_task_nonnull')") +int trusted_task_arg_nonnull_fail1(void *ctx) +{ + return subprog_trusted_task_nonnull(NULL); +} + +SEC("?tp_btf/task_newtask") +__failure __log_level(2) +__msg("R1 type=ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") +__msg("Caller passes invalid args into func#1 ('subprog_trusted_task_nonnull')") +int trusted_task_arg_nonnull_fail2(void *ctx) +{ + struct task_struct *t = bpf_get_current_task_btf(); + struct task_struct *nullable; + int res; + + nullable = bpf_task_acquire(t); + + /* should fail, PTR_TO_BTF_ID_OR_NULL */ + res = subprog_trusted_task_nonnull(nullable); + + if (nullable) + bpf_task_release(nullable); + + return res; +} + +SEC("?kprobe") +__success __log_level(2) +__msg("Validating subprog_trusted_task_nonnull() func#1...") +__msg(": R1=trusted_ptr_task_struct(") +int trusted_task_arg_nonnull(void *ctx) +{ + struct task_struct *t = bpf_get_current_task_btf(); + + return subprog_trusted_task_nonnull(t); +} + +struct task_struct___local {} __attribute__((preserve_access_index)); + +__weak int subprog_nullable_task_flavor( + struct task_struct___local *task __arg_trusted __arg_nullable) +{ + char buf[16]; + + if (!task) + return 0; + + return bpf_copy_from_user_task(&buf, sizeof(buf), NULL, (void *)task, 0); +} + +SEC("?uprobe.s") +__success __log_level(2) +__msg("Validating subprog_nullable_task_flavor() func#1...") +__msg(": R1=trusted_ptr_or_null_task_struct(") +int flavor_ptr_nullable(void *ctx) +{ + struct task_struct___local *t = (void *)bpf_get_current_task_btf(); + + return subprog_nullable_task_flavor(t); +} + +__weak int subprog_nonnull_task_flavor(struct task_struct___local *task __arg_trusted) +{ + char buf[16]; + + return bpf_copy_from_user_task(&buf, sizeof(buf), NULL, (void *)task, 0); +} + +SEC("?uprobe.s") +__success __log_level(2) +__msg("Validating subprog_nonnull_task_flavor() func#1...") +__msg(": R1=trusted_ptr_task_struct(") +int flavor_ptr_nonnull(void *ctx) +{ + struct task_struct *t = bpf_get_current_task_btf(); + + return subprog_nonnull_task_flavor((void *)t); +} + +__weak int subprog_trusted_destroy(struct task_struct *task __arg_trusted) +{ + bpf_task_release(task); /* should be rejected */ + + return 0; +} + +SEC("?tp_btf/task_newtask") +__failure __log_level(2) +__msg("release kernel function bpf_task_release expects refcounted PTR_TO_BTF_ID") +int BPF_PROG(trusted_destroy_fail, struct task_struct *task, u64 clone_flags) +{ + return subprog_trusted_destroy(task); +} + 
+__weak int subprog_trusted_acq_rel(struct task_struct *task __arg_trusted) +{ + struct task_struct *owned; + + owned = bpf_task_acquire(task); + if (!owned) + return 0; + + bpf_task_release(owned); /* this one is OK, we acquired it locally */ + + return 0; +} + +SEC("?tp_btf/task_newtask") +__success __log_level(2) +int BPF_PROG(trusted_acq_rel, struct task_struct *task, u64 clone_flags) +{ + return subprog_trusted_acq_rel(task); +} + +char _license[] SEC("license") = "GPL"; From 20d59ee55172fdf6072abf871fa62b2070d6383f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Jan 2024 13:20:22 -0800 Subject: [PATCH 027/119] libbpf: add bpf_core_cast() macro Add bpf_core_cast() macro that wraps bpf_rdonly_cast() kfunc. It's more ergonomic than kfunc, as it automatically extracts btf_id with bpf_core_type_id_kernel(), and works with type names. It also casts result to (T *) pointer. See the definition of the macro, it's self-explanatory. libbpf declares bpf_rdonly_cast() extern as __weak __ksym and should be safe to not conflict with other possible declarations in user code. But we do have a conflict with current BPF selftests that declare their externs with first argument as `void *obj`, while libbpf opts into more permissive `const void *obj`. This causes conflict, so we fix up BPF selftests uses in the same patch. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240130212023.183765-2-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/bpf_core_read.h | 13 +++++++++++++ tools/testing/selftests/bpf/bpf_kfuncs.h | 2 +- .../selftests/bpf/progs/sk_storage_omem_uncharge.c | 2 -- tools/testing/selftests/bpf/progs/type_cast.c | 4 +--- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 5aec301e9585..0d3e88bd7d5f 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -2,6 +2,8 @@ #ifndef __BPF_CORE_READ_H__ #define __BPF_CORE_READ_H__ +#include + /* * enum bpf_field_info_kind is passed as a second argument into * __builtin_preserve_field_info() built-in to get a specific aspect of @@ -292,6 +294,17 @@ enum bpf_enum_value_kind { #define bpf_core_read_user_str(dst, sz, src) \ bpf_probe_read_user_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) +extern void *bpf_rdonly_cast(const void *obj, __u32 btf_id) __ksym __weak; + +/* + * Cast provided pointer *ptr* into a pointer to a specified *type* in such + * a way that BPF verifier will become aware of associated kernel-side BTF + * type. This allows to access members of kernel types directly without the + * need to use BPF_CORE_READ() macros. + */ +#define bpf_core_cast(ptr, type) \ + ((typeof(type) *)bpf_rdonly_cast((ptr), bpf_core_type_id_kernel(type))) + #define ___concat(a, b) a ## b #define ___apply(fn, n) ___concat(fn, n) #define ___nth(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, __11, N, ...) 
N diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 23c24f852f4f..0cd4111f954c 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -63,7 +63,7 @@ extern int bpf_sk_assign_tcp_reqsk(struct __sk_buff *skb, struct sock *sk, void *bpf_cast_to_kern_ctx(void *) __ksym; -void *bpf_rdonly_cast(void *obj, __u32 btf_id) __ksym; +extern void *bpf_rdonly_cast(const void *obj, __u32 btf_id) __ksym __weak; extern int bpf_get_file_xattr(struct file *file, const char *name, struct bpf_dynptr *value_ptr) __ksym; diff --git a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c index 3e745793b27a..934c4e5ada5b 100644 --- a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c +++ b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c @@ -12,8 +12,6 @@ int cookie_found = 0; __u64 cookie = 0; __u32 omem = 0; -void *bpf_rdonly_cast(void *, __u32) __ksym; - struct { __uint(type, BPF_MAP_TYPE_SK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); diff --git a/tools/testing/selftests/bpf/progs/type_cast.c b/tools/testing/selftests/bpf/progs/type_cast.c index a9629ac230fd..98ab35893f51 100644 --- a/tools/testing/selftests/bpf/progs/type_cast.c +++ b/tools/testing/selftests/bpf/progs/type_cast.c @@ -4,6 +4,7 @@ #include #include #include +#include "bpf_kfuncs.h" struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); @@ -19,9 +20,6 @@ char name[IFNAMSIZ]; unsigned int inum; unsigned int meta_len, frag0_len, kskb_len, kskb2_len; -void *bpf_cast_to_kern_ctx(void *) __ksym; -void *bpf_rdonly_cast(void *, __u32) __ksym; - SEC("?xdp") int md_xdp(struct xdp_md *ctx) { From ea9d561686fbd0e1ddf05d861d8f2c1ae8291870 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Jan 2024 13:20:23 -0800 Subject: [PATCH 028/119] selftests/bpf: convert bpf_rdonly_cast() uses to bpf_core_cast() macro Use more ergonomic bpf_core_cast() macro instead of bpf_rdonly_cast() in selftests code. 
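Per the macro definition from the previous patch, the two spellings are equivalent; a sketch, with `ptr` standing in for any kernel pointer:

  struct sock *sk;

  /* old, explicit form */
  sk = bpf_rdonly_cast(ptr, bpf_core_type_id_kernel(struct sock));

  /* new form used throughout the conversions below */
  sk = bpf_core_cast(ptr, struct sock);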
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240130212023.183765-3-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/connect_unix_prog.c | 3 +-- .../testing/selftests/bpf/progs/getpeername_unix_prog.c | 3 +-- .../testing/selftests/bpf/progs/getsockname_unix_prog.c | 3 +-- tools/testing/selftests/bpf/progs/recvmsg_unix_prog.c | 3 +-- tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c | 3 +-- .../selftests/bpf/progs/sk_storage_omem_uncharge.c | 2 +- tools/testing/selftests/bpf/progs/sock_iter_batch.c | 4 ++-- tools/testing/selftests/bpf/progs/type_cast.c | 9 ++++----- 8 files changed, 12 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/connect_unix_prog.c b/tools/testing/selftests/bpf/progs/connect_unix_prog.c index ca8aa2f116b3..2ef0e0c46d17 100644 --- a/tools/testing/selftests/bpf/progs/connect_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/connect_unix_prog.c @@ -28,8 +28,7 @@ int connect_unix_prog(struct bpf_sock_addr *ctx) if (sa_kern->uaddrlen != unaddrlen) return 0; - sa_kern_unaddr = bpf_rdonly_cast(sa_kern->uaddr, - bpf_core_type_id_kernel(struct sockaddr_un)); + sa_kern_unaddr = bpf_core_cast(sa_kern->uaddr, struct sockaddr_un); if (memcmp(sa_kern_unaddr->sun_path, SERVUN_REWRITE_ADDRESS, sizeof(SERVUN_REWRITE_ADDRESS) - 1) != 0) return 0; diff --git a/tools/testing/selftests/bpf/progs/getpeername_unix_prog.c b/tools/testing/selftests/bpf/progs/getpeername_unix_prog.c index 9c078f34bbb2..5a76754f846b 100644 --- a/tools/testing/selftests/bpf/progs/getpeername_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/getpeername_unix_prog.c @@ -27,8 +27,7 @@ int getpeername_unix_prog(struct bpf_sock_addr *ctx) if (sa_kern->uaddrlen != unaddrlen) return 1; - sa_kern_unaddr = bpf_rdonly_cast(sa_kern->uaddr, - bpf_core_type_id_kernel(struct sockaddr_un)); + sa_kern_unaddr = bpf_core_cast(sa_kern->uaddr, struct sockaddr_un); if (memcmp(sa_kern_unaddr->sun_path, SERVUN_REWRITE_ADDRESS, sizeof(SERVUN_REWRITE_ADDRESS) - 1) != 0) return 1; diff --git a/tools/testing/selftests/bpf/progs/getsockname_unix_prog.c b/tools/testing/selftests/bpf/progs/getsockname_unix_prog.c index ac7145111497..7867113c696f 100644 --- a/tools/testing/selftests/bpf/progs/getsockname_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/getsockname_unix_prog.c @@ -27,8 +27,7 @@ int getsockname_unix_prog(struct bpf_sock_addr *ctx) if (sa_kern->uaddrlen != unaddrlen) return 1; - sa_kern_unaddr = bpf_rdonly_cast(sa_kern->uaddr, - bpf_core_type_id_kernel(struct sockaddr_un)); + sa_kern_unaddr = bpf_core_cast(sa_kern->uaddr, struct sockaddr_un); if (memcmp(sa_kern_unaddr->sun_path, SERVUN_REWRITE_ADDRESS, sizeof(SERVUN_REWRITE_ADDRESS) - 1) != 0) return 1; diff --git a/tools/testing/selftests/bpf/progs/recvmsg_unix_prog.c b/tools/testing/selftests/bpf/progs/recvmsg_unix_prog.c index 4dfbc8552558..1c7ab44bccfa 100644 --- a/tools/testing/selftests/bpf/progs/recvmsg_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/recvmsg_unix_prog.c @@ -27,8 +27,7 @@ int recvmsg_unix_prog(struct bpf_sock_addr *ctx) if (sa_kern->uaddrlen != unaddrlen) return 1; - sa_kern_unaddr = bpf_rdonly_cast(sa_kern->uaddr, - bpf_core_type_id_kernel(struct sockaddr_un)); + sa_kern_unaddr = bpf_core_cast(sa_kern->uaddr, struct sockaddr_un); if (memcmp(sa_kern_unaddr->sun_path, SERVUN_ADDRESS, sizeof(SERVUN_ADDRESS) - 1) != 0) return 1; diff --git a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c index 
1f67e832666e..d8869b03dda9 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c @@ -28,8 +28,7 @@ int sendmsg_unix_prog(struct bpf_sock_addr *ctx) if (sa_kern->uaddrlen != unaddrlen) return 0; - sa_kern_unaddr = bpf_rdonly_cast(sa_kern->uaddr, - bpf_core_type_id_kernel(struct sockaddr_un)); + sa_kern_unaddr = bpf_core_cast(sa_kern->uaddr, struct sockaddr_un); if (memcmp(sa_kern_unaddr->sun_path, SERVUN_REWRITE_ADDRESS, sizeof(SERVUN_REWRITE_ADDRESS) - 1) != 0) return 0; diff --git a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c index 934c4e5ada5b..46d6eb2a3b17 100644 --- a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c +++ b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c @@ -27,7 +27,7 @@ int BPF_PROG(bpf_local_storage_destroy, struct bpf_local_storage *local_storage) if (local_storage_ptr != local_storage) return 0; - sk = bpf_rdonly_cast(sk_ptr, bpf_core_type_id_kernel(struct sock)); + sk = bpf_core_cast(sk_ptr, struct sock); if (sk->sk_cookie.counter != cookie) return 0; diff --git a/tools/testing/selftests/bpf/progs/sock_iter_batch.c b/tools/testing/selftests/bpf/progs/sock_iter_batch.c index ffbbfe1fa1c1..96531b0d9d55 100644 --- a/tools/testing/selftests/bpf/progs/sock_iter_batch.c +++ b/tools/testing/selftests/bpf/progs/sock_iter_batch.c @@ -32,7 +32,7 @@ int iter_tcp_soreuse(struct bpf_iter__tcp *ctx) if (!sk) return 0; - sk = bpf_rdonly_cast(sk, bpf_core_type_id_kernel(struct sock)); + sk = bpf_core_cast(sk, struct sock); if (sk->sk_family != AF_INET6 || sk->sk_state != TCP_LISTEN || !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) @@ -68,7 +68,7 @@ int iter_udp_soreuse(struct bpf_iter__udp *ctx) if (!sk) return 0; - sk = bpf_rdonly_cast(sk, bpf_core_type_id_kernel(struct sock)); + sk = bpf_core_cast(sk, struct sock); if (sk->sk_family != AF_INET6 || !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) return 0; diff --git a/tools/testing/selftests/bpf/progs/type_cast.c b/tools/testing/selftests/bpf/progs/type_cast.c index 98ab35893f51..9d808b8f4ab0 100644 --- a/tools/testing/selftests/bpf/progs/type_cast.c +++ b/tools/testing/selftests/bpf/progs/type_cast.c @@ -46,13 +46,12 @@ int md_skb(struct __sk_buff *skb) /* Simulate the following kernel macro: * #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) */ - shared_info = bpf_rdonly_cast(kskb->head + kskb->end, - bpf_core_type_id_kernel(struct skb_shared_info)); + shared_info = bpf_core_cast(kskb->head + kskb->end, struct skb_shared_info); meta_len = shared_info->meta_len; frag0_len = shared_info->frag_list->len; /* kskb2 should be equal to kskb */ - kskb2 = bpf_rdonly_cast(kskb, bpf_core_type_id_kernel(struct sk_buff)); + kskb2 = bpf_core_cast(kskb, typeof(*kskb2)); kskb2_len = kskb2->len; return 0; } @@ -63,7 +62,7 @@ int BPF_PROG(untrusted_ptr, struct pt_regs *regs, long id) struct task_struct *task, *task_dup; task = bpf_get_current_task_btf(); - task_dup = bpf_rdonly_cast(task, bpf_core_type_id_kernel(struct task_struct)); + task_dup = bpf_core_cast(task, struct task_struct); (void)bpf_task_storage_get(&enter_id, task_dup, 0, 0); return 0; } @@ -71,7 +70,7 @@ int BPF_PROG(untrusted_ptr, struct pt_regs *regs, long id) SEC("?tracepoint/syscalls/sys_enter_nanosleep") int kctx_u64(void *ctx) { - u64 *kctx = bpf_rdonly_cast(ctx, bpf_core_type_id_kernel(u64)); + u64 *kctx = bpf_core_cast(ctx, u64); (void)kctx; return 0; From 
2ef61296d2844c6a4211e07ab70ef2fb412b2c30 Mon Sep 17 00:00:00 2001 From: Manu Bretelle Date: Tue, 30 Jan 2024 21:32:12 -0800 Subject: [PATCH 029/119] selftests/bpf: Disable IPv6 for lwt_redirect test

After a recent change in the vmtest runner, this test started failing sporadically.

Investigation showed that this test was subject to a race condition which got exacerbated after the vm runner change. The symptom is that the logic waiting for an ICMPv4 packet is naive and will break if 5 or more non-ICMPv4 packets make it to tap0. When ICMPv6 is enabled, the kernel will generate traffic such as ICMPv6 router solicitation... On a system with good performance, the expected ICMPv4 packet would very likely make it to the network interface promptly, but on a system with poor performance, those "guarantees" do not hold true anymore.

Given that the test is IPv4 only, this change disables IPv6 in the test netns by setting `net.ipv6.conf.all.disable_ipv6` to 1. This essentially leaves "ping" as the sole generator of traffic in the network namespace. If this test were to be made IPv6 compatible, the logic in `wait_for_packet` would need to be modified.

In more detail... At a high level, the test does:
- create a new namespace
- in `setup_redirect_target` set up lo, tap0, and link_err interfaces as well as add 2 routes that attach ingress/egress sections of `test_lwt_redirect.bpf.o` to the xmit path.
- in `send_and_capture_test_packets` send an ICMP packet and read off the tap interface (using `wait_for_packet`) to check that an ICMP packet with the right size is read.

`wait_for_packet` will try to read `max_retry` (5) times from the tap0 fd looking for an ICMPv4 packet matching some criteria.

The problem is that when we set up the `tap0` interface, because IPv6 is enabled by default, traffic such as Router solicitation is sent through tap0, as in:

  # tcpdump -r /tmp/lwt_redirect.pcap
  reading from file /tmp/lwt_redirect.pcap, link-type EN10MB (Ethernet)
  04:46:23.578352 IP6 :: > ff02::1:ffc0:4427: ICMP6, neighbor solicitation, who has fe80::fcba:dff:fec0:4427, length 32
  04:46:23.659522 IP6 :: > ff02::16: HBH ICMP6, multicast listener report v2, 1 group record(s), length 28
  04:46:24.389169 IP 10.0.0.1 > 20.0.0.9: ICMP echo request, id 122, seq 1, length 108
  04:46:24.618599 IP6 fe80::fcba:dff:fec0:4427 > ff02::16: HBH ICMP6, multicast listener report v2, 1 group record(s), length 28
  04:46:24.619985 IP6 fe80::fcba:dff:fec0:4427 > ff02::2: ICMP6, router solicitation, length 16
  04:46:24.767326 IP6 fe80::fcba:dff:fec0:4427 > ff02::16: HBH ICMP6, multicast listener report v2, 1 group record(s), length 28
  04:46:28.936402 IP6 fe80::fcba:dff:fec0:4427 > ff02::2: ICMP6, router solicitation, length 16

If `wait_for_packet` sees 5 non-ICMPv4 packets, it will return 0, which is what we see in:

2024-01-31T03:51:25.0336992Z test_lwt_redirect_run:PASS:netns_create 0 nsec
2024-01-31T03:51:25.0341309Z open_netns:PASS:malloc token 0 nsec
2024-01-31T03:51:25.0344844Z open_netns:PASS:open /proc/self/ns/net 0 nsec
2024-01-31T03:51:25.0350071Z open_netns:PASS:open netns fd 0 nsec
2024-01-31T03:51:25.0353516Z open_netns:PASS:setns 0 nsec
2024-01-31T03:51:25.0356560Z test_lwt_redirect_run:PASS:setns 0 nsec
2024-01-31T03:51:25.0360140Z open_tuntap:PASS:open(/dev/net/tun) 0 nsec
2024-01-31T03:51:25.0363822Z open_tuntap:PASS:ioctl(TUNSETIFF) 0 nsec
2024-01-31T03:51:25.0367402Z open_tuntap:PASS:fcntl(O_NONBLOCK) 0 nsec
2024-01-31T03:51:25.0371167Z setup_redirect_target:PASS:open_tuntap 0 nsec
2024-01-31T03:51:25.0375180Z
setup_redirect_target:PASS:if_nametoindex 0 nsec
2024-01-31T03:51:25.0379929Z setup_redirect_target:PASS:ip link add link_err type dummy 0 nsec
2024-01-31T03:51:25.0384874Z setup_redirect_target:PASS:ip link set lo up 0 nsec
2024-01-31T03:51:25.0389678Z setup_redirect_target:PASS:ip addr add dev lo 10.0.0.1/32 0 nsec
2024-01-31T03:51:25.0394814Z setup_redirect_target:PASS:ip link set link_err up 0 nsec
2024-01-31T03:51:25.0399874Z setup_redirect_target:PASS:ip link set tap0 up 0 nsec
2024-01-31T03:51:25.0407731Z setup_redirect_target:PASS:ip route add 10.0.0.0/24 dev link_err encap bpf xmit obj test_lwt_redirect.bpf.o sec redir_ingress 0 nsec
2024-01-31T03:51:25.0419105Z setup_redirect_target:PASS:ip route add 20.0.0.0/24 dev link_err encap bpf xmit obj test_lwt_redirect.bpf.o sec redir_egress 0 nsec
2024-01-31T03:51:25.0427209Z test_lwt_redirect_normal:PASS:setup_redirect_target 0 nsec
2024-01-31T03:51:25.0431424Z ping_dev:PASS:if_nametoindex 0 nsec
2024-01-31T03:51:25.0437222Z send_and_capture_test_packets:FAIL:wait_for_epacket unexpected wait_for_epacket: actual 0 != expected 1
2024-01-31T03:51:25.0448298Z (/tmp/work/bpf/bpf/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c:175: errno: Success) test_lwt_redirect_normal egress test fails
2024-01-31T03:51:25.0457124Z close_netns:PASS:setns 0 nsec

When running in a VM with potential resource constraints, the odds that calling `ping` is not scheduled very soon after bringing `tap0` up increase, and with them the chances of our ICMP packet being pushed to position 6+ in the network trace.

To confirm this indeed solves the issue, I ran the test 100 times in a row with:

  errors=0
  successes=0
  for i in `seq 1 100`
  do
    ./test_progs -t lwt_redirect/lwt_redirect_normal
    if [ $? -eq 0 ]; then
      successes=$((successes+1))
    else
      errors=$((errors+1))
    fi
  done
  echo "successes: $successes/errors: $errors"

While this test would fail at least a couple of times every 10 runs, here it ran 100 times with no error.

Fixes: 43a7c3ef8a15 ("selftests/bpf: Add lwt_xmit tests for BPF_REDIRECT") Signed-off-by: Manu Bretelle Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240131053212.2247527-1-chantr4@gmail.com --- tools/testing/selftests/bpf/prog_tests/lwt_redirect.c | 1 + 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c index beeb3ac1c361..b5b9e74b1044 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c @@ -203,6 +203,7 @@ static int setup_redirect_target(const char *target_dev, bool need_mac) if (!ASSERT_GE(target_index, 0, "if_nametoindex")) goto fail; + SYS(fail, "sysctl -w net.ipv6.conf.all.disable_ipv6=1"); SYS(fail, "ip link add link_err type dummy"); SYS(fail, "ip link set lo up"); SYS(fail, "ip addr add dev lo " LOCAL_SRC "/32");

From 79b47344bbc5a693a92ed6b2b09dac59254bfac8 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Sun, 28 Jan 2024 18:24:06 -0700 Subject: [PATCH 030/119] bpf: btf: Support flags for BTF_SET8 sets

This commit adds support for flags on BTF_SET8s. struct btf_id_set8 already supported 32 bits worth of flags, but the field was only used for alignment purposes before. We now use these bits to encode flags. The first use case is tagging kfunc sets with a flag so that pahole can recognize which BTF_ID_FLAGS(func, ..) are actual kfuncs.
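Viewed from C, the 8-byte set8 header emitted into .BTF_ids now decomposes as follows (a sketch mirroring the start of struct btf_id_set8, not new code):

  struct btf_id_set8_hdr {	/* illustrative only */
  	u32 cnt;	/* from ".zero 4"; filled in by resolve_btfids */
  	u32 flags;	/* from ".long <flags>"; e.g. the kfunc flag introduced next */
  };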
Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/7bb152ec76d6c2c930daec88e995bf18484a5ebb.1706491398.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index a9cb10b0e2e9..dca09b7f21dc 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -21,6 +21,7 @@ struct btf_id_set8 { #include /* for __PASTE */ #include /* for __maybe_unused */ +#include /* * Following macros help to define lists of BTF IDs placed @@ -183,17 +184,18 @@ extern struct btf_id_set name; * .word (1 << 3) | (1 << 1) | (1 << 2) * */ -#define __BTF_SET8_START(name, scope) \ +#define __BTF_SET8_START(name, scope, flags) \ +__BTF_ID_LIST(name, local) \ asm( \ ".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ "." #scope " __BTF_ID__set8__" #name "; \n" \ "__BTF_ID__set8__" #name ":; \n" \ -".zero 8 \n" \ +".zero 4 \n" \ +".long " __stringify(flags) "\n" \ ".popsection; \n"); #define BTF_SET8_START(name) \ -__BTF_ID_LIST(name, local) \ -__BTF_SET8_START(name, local) +__BTF_SET8_START(name, local, 0) #define BTF_SET8_END(name) \ asm( \ From a05e90427ef6706f59188b379ad6366b9d298bc5 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Sun, 28 Jan 2024 18:24:07 -0700 Subject: [PATCH 031/119] bpf: btf: Add BTF_KFUNCS_START/END macro pair This macro pair is functionally equivalent to BTF_SET8_START/END, except with BTF_SET8_KFUNCS flag set in the btf_id_set8 flags field. The next commit will codemod all kfunc set8s to this new variant such that all kfuncs are tagged as such in .BTF_ids section. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/d536c57c7c2af428686853cc7396b7a44faa53b7.1706491398.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index dca09b7f21dc..e24aabfe8ecc 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -8,6 +8,9 @@ struct btf_id_set { u32 ids[]; }; +/* This flag implies BTF_SET8 holds kfunc(s) */ +#define BTF_SET8_KFUNCS (1 << 0) + struct btf_id_set8 { u32 cnt; u32 flags; @@ -204,6 +207,12 @@ asm( \ ".popsection; \n"); \ extern struct btf_id_set8 name; +#define BTF_KFUNCS_START(name) \ +__BTF_SET8_START(name, local, BTF_SET8_KFUNCS) + +#define BTF_KFUNCS_END(name) \ +BTF_SET8_END(name) + #else #define BTF_ID_LIST(name) static u32 __maybe_unused name[64]; @@ -218,6 +227,8 @@ extern struct btf_id_set8 name; #define BTF_SET_END(name) #define BTF_SET8_START(name) static struct btf_id_set8 __maybe_unused name = { 0 }; #define BTF_SET8_END(name) +#define BTF_KFUNCS_START(name) static struct btf_id_set8 __maybe_unused name = { .flags = BTF_SET8_KFUNCS }; +#define BTF_KFUNCS_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ From 6f3189f38a3e995232e028a4c341164c4aca1b20 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Sun, 28 Jan 2024 18:24:08 -0700 Subject: [PATCH 032/119] bpf: treewide: Annotate BPF kfuncs in BTF This commit marks kfuncs as such inside the .BTF_ids section. The upshot of these annotations is that we'll be able to automatically generate kfunc prototypes for downstream users. The process is as follows: 1. In source, use BTF_KFUNCS_START/END macro pair to mark kfuncs 2. During build, pahole injects into BTF a "bpf_kfunc" BTF_DECL_TAG for each function inside BTF_KFUNCS sets 3. At runtime, vmlinux or module BTF is made available in sysfs 4. 
At runtime, bpftool (or similar) can look at provided BTF and generate appropriate prototypes for functions with "bpf_kfunc" tag To ensure future kfunc are similarly tagged, we now also return error inside kfunc registration for untagged kfuncs. For vmlinux kfuncs, we also WARN(), as initcall machinery does not handle errors. Signed-off-by: Daniel Xu Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/e55150ceecbf0a5d961e608941165c0bee7bc943.1706491398.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 8 ++++---- drivers/hid/bpf/hid_bpf_dispatch.c | 8 ++++---- fs/verity/measure.c | 4 ++-- kernel/bpf/btf.c | 8 ++++++++ kernel/bpf/cpumask.c | 4 ++-- kernel/bpf/helpers.c | 8 ++++---- kernel/bpf/map_iter.c | 4 ++-- kernel/cgroup/rstat.c | 4 ++-- kernel/trace/bpf_trace.c | 8 ++++---- net/bpf/test_run.c | 8 ++++---- net/core/filter.c | 20 +++++++++---------- net/core/xdp.c | 4 ++-- net/ipv4/bpf_tcp_ca.c | 4 ++-- net/ipv4/fou_bpf.c | 4 ++-- net/ipv4/tcp_bbr.c | 4 ++-- net/ipv4/tcp_cubic.c | 4 ++-- net/ipv4/tcp_dctcp.c | 4 ++-- net/netfilter/nf_conntrack_bpf.c | 4 ++-- net/netfilter/nf_nat_bpf.c | 4 ++-- net/xfrm/xfrm_interface_bpf.c | 4 ++-- net/xfrm/xfrm_state_bpf.c | 4 ++-- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 8 ++++---- 22 files changed, 70 insertions(+), 62 deletions(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 7985c6615f3c..a8f5782bd833 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -177,10 +177,10 @@ In addition to kfuncs' arguments, verifier may need more information about the type of kfunc(s) being registered with the BPF subsystem. To do so, we define flags on a set of kfuncs as follows:: - BTF_SET8_START(bpf_task_set) + BTF_KFUNCS_START(bpf_task_set) BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE) - BTF_SET8_END(bpf_task_set) + BTF_KFUNCS_END(bpf_task_set) This set encodes the BTF ID of each kfunc listed above, and encodes the flags along with it. Ofcourse, it is also allowed to specify no flags. @@ -347,10 +347,10 @@ Once the kfunc is prepared for use, the final step to making it visible is registering it with the BPF subsystem. Registration is done per BPF program type. An example is shown below:: - BTF_SET8_START(bpf_task_set) + BTF_KFUNCS_START(bpf_task_set) BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE) - BTF_SET8_END(bpf_task_set) + BTF_KFUNCS_END(bpf_task_set) static const struct btf_kfunc_id_set bpf_task_kfunc_set = { .owner = THIS_MODULE, diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c index d9ef45fcaeab..02c441aaa217 100644 --- a/drivers/hid/bpf/hid_bpf_dispatch.c +++ b/drivers/hid/bpf/hid_bpf_dispatch.c @@ -172,9 +172,9 @@ hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t rdwr * The following set contains all functions we agree BPF programs * can use. 
*/ -BTF_SET8_START(hid_bpf_kfunc_ids) +BTF_KFUNCS_START(hid_bpf_kfunc_ids) BTF_ID_FLAGS(func, hid_bpf_get_data, KF_RET_NULL) -BTF_SET8_END(hid_bpf_kfunc_ids) +BTF_KFUNCS_END(hid_bpf_kfunc_ids) static const struct btf_kfunc_id_set hid_bpf_kfunc_set = { .owner = THIS_MODULE, @@ -440,12 +440,12 @@ static const struct btf_kfunc_id_set hid_bpf_fmodret_set = { }; /* for syscall HID-BPF */ -BTF_SET8_START(hid_bpf_syscall_kfunc_ids) +BTF_KFUNCS_START(hid_bpf_syscall_kfunc_ids) BTF_ID_FLAGS(func, hid_bpf_attach_prog) BTF_ID_FLAGS(func, hid_bpf_allocate_context, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, hid_bpf_release_context, KF_RELEASE) BTF_ID_FLAGS(func, hid_bpf_hw_request) -BTF_SET8_END(hid_bpf_syscall_kfunc_ids) +BTF_KFUNCS_END(hid_bpf_syscall_kfunc_ids) static const struct btf_kfunc_id_set hid_bpf_syscall_kfunc_set = { .owner = THIS_MODULE, diff --git a/fs/verity/measure.c b/fs/verity/measure.c index bf7a5f4cccaf..3969d54158d1 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -159,9 +159,9 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_ker __bpf_kfunc_end_defs(); -BTF_SET8_START(fsverity_set_ids) +BTF_KFUNCS_START(fsverity_set_ids) BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) -BTF_SET8_END(fsverity_set_ids) +BTF_KFUNCS_END(fsverity_set_ids) static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c8c6e6cf18e7..ef380e546952 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8124,6 +8124,14 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, { enum btf_kfunc_hook hook; + /* All kfuncs need to be tagged as such in BTF. + * WARN() for initcall registrations that do not check errors. + */ + if (!(kset->set->flags & BTF_SET8_KFUNCS)) { + WARN_ON(!kset->owner); + return -EINVAL; + } + hook = bpf_prog_type_to_kfunc_hook(prog_type); return __register_btf_kfunc_id_set(hook, kset); } diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 2e73533a3811..dad0fb1c8e87 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -424,7 +424,7 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) __bpf_kfunc_end_defs(); -BTF_SET8_START(cpumask_kfunc_btf_ids) +BTF_KFUNCS_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) @@ -450,7 +450,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) -BTF_SET8_END(cpumask_kfunc_btf_ids) +BTF_KFUNCS_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bcb951a2ecf4..4db1c658254c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2544,7 +2544,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) __bpf_kfunc_end_defs(); -BTF_SET8_START(generic_btf_ids) +BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif @@ -2573,7 +2573,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) -BTF_SET8_END(generic_btf_ids) +BTF_KFUNCS_END(generic_btf_ids) static const struct 
btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, @@ -2589,7 +2589,7 @@ BTF_ID(struct, cgroup) BTF_ID(func, bpf_cgroup_release_dtor) #endif -BTF_SET8_START(common_btf_ids) +BTF_KFUNCS_START(common_btf_ids) BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) BTF_ID_FLAGS(func, bpf_rdonly_cast) BTF_ID_FLAGS(func, bpf_rcu_read_lock) @@ -2618,7 +2618,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) -BTF_SET8_END(common_btf_ids) +BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6abd7c5df4b3..9575314f40a6 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -213,9 +213,9 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) __bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_map_iter_kfunc_ids) +BTF_KFUNCS_START(bpf_map_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) -BTF_SET8_END(bpf_map_iter_kfunc_ids) +BTF_KFUNCS_END(bpf_map_iter_kfunc_ids) static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index a8350d2d63e6..07e2284bb499 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -562,10 +562,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ -BTF_SET8_START(bpf_rstat_kfunc_ids) +BTF_KFUNCS_START(bpf_rstat_kfunc_ids) BTF_ID_FLAGS(func, cgroup_rstat_updated) BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) -BTF_SET8_END(bpf_rstat_kfunc_ids) +BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 64fdaf79d113..241ddf5e3895 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1412,14 +1412,14 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, __bpf_kfunc_end_defs(); -BTF_SET8_START(key_sig_kfunc_set) +BTF_KFUNCS_START(key_sig_kfunc_set) BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) #endif -BTF_SET8_END(key_sig_kfunc_set) +BTF_KFUNCS_END(key_sig_kfunc_set) static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { .owner = THIS_MODULE, @@ -1475,9 +1475,9 @@ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, __bpf_kfunc_end_defs(); -BTF_SET8_START(fs_kfunc_set_ids) +BTF_KFUNCS_START(fs_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_SET8_END(fs_kfunc_set_ids) +BTF_KFUNCS_END(fs_kfunc_set_ids) static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id) { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index dfd919374017..5535f9adc658 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -617,21 +617,21 @@ CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor); __bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_test_modify_return_ids) +BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) 
-BTF_SET8_END(bpf_test_modify_return_ids) +BTF_KFUNCS_END(bpf_test_modify_return_ids) static const struct btf_kfunc_id_set bpf_test_modify_return_set = { .owner = THIS_MODULE, .set = &bpf_test_modify_return_ids, }; -BTF_SET8_START(test_sk_check_kfunc_ids) +BTF_KFUNCS_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE) -BTF_SET8_END(test_sk_check_kfunc_ids) +BTF_KFUNCS_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, u32 size, u32 headroom, u32 tailroom) diff --git a/net/core/filter.c b/net/core/filter.c index 358870408a51..524adf1fa6d0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11982,21 +11982,21 @@ int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, return 0; } -BTF_SET8_START(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb) -BTF_SET8_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb) -BTF_SET8_START(bpf_kfunc_check_set_xdp) +BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) -BTF_SET8_END(bpf_kfunc_check_set_xdp) +BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) -BTF_SET8_START(bpf_kfunc_check_set_sock_addr) +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) -BTF_SET8_END(bpf_kfunc_check_set_sock_addr) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) -BTF_SET8_START(bpf_kfunc_check_set_tcp_reqsk) +BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) -BTF_SET8_END(bpf_kfunc_check_set_tcp_reqsk) +BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, @@ -12075,9 +12075,9 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) __bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_sk_iter_kfunc_ids) +BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) -BTF_SET8_END(bpf_sk_iter_kfunc_ids) +BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 4869c1c2d8f3..034fb80f3fbe 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -771,11 +771,11 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __bpf_kfunc_end_defs(); -BTF_SET8_START(xdp_metadata_kfunc_ids) +BTF_KFUNCS_START(xdp_metadata_kfunc_ids) #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC -BTF_SET8_END(xdp_metadata_kfunc_ids) +BTF_KFUNCS_END(xdp_metadata_kfunc_ids) static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 834edc18463a..7f518ea5f4ac 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -201,13 +201,13 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, } } -BTF_SET8_START(bpf_tcp_ca_check_kfunc_ids) +BTF_KFUNCS_START(bpf_tcp_ca_check_kfunc_ids) BTF_ID_FLAGS(func, tcp_reno_ssthresh) BTF_ID_FLAGS(func, tcp_reno_cong_avoid) BTF_ID_FLAGS(func, tcp_reno_undo_cwnd) BTF_ID_FLAGS(func, tcp_slow_start) BTF_ID_FLAGS(func, tcp_cong_avoid_ai) -BTF_SET8_END(bpf_tcp_ca_check_kfunc_ids) +BTF_KFUNCS_END(bpf_tcp_ca_check_kfunc_ids) static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/fou_bpf.c 
b/net/ipv4/fou_bpf.c index 4da03bf45c9b..06e5572f296f 100644 --- a/net/ipv4/fou_bpf.c +++ b/net/ipv4/fou_bpf.c @@ -100,10 +100,10 @@ __bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, __bpf_kfunc_end_defs(); -BTF_SET8_START(fou_kfunc_set) +BTF_KFUNCS_START(fou_kfunc_set) BTF_ID_FLAGS(func, bpf_skb_set_fou_encap) BTF_ID_FLAGS(func, bpf_skb_get_fou_encap) -BTF_SET8_END(fou_kfunc_set) +BTF_KFUNCS_END(fou_kfunc_set) static const struct btf_kfunc_id_set fou_bpf_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 22358032dd48..05dc2d05bc7c 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1155,7 +1155,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .set_state = bbr_set_state, }; -BTF_SET8_START(tcp_bbr_check_kfunc_ids) +BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, bbr_init) @@ -1168,7 +1168,7 @@ BTF_ID_FLAGS(func, bbr_min_tso_segs) BTF_ID_FLAGS(func, bbr_set_state) #endif #endif -BTF_SET8_END(tcp_bbr_check_kfunc_ids) +BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 0fd78ecb67e7..44869ea089e3 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -485,7 +485,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .name = "cubic", }; -BTF_SET8_START(tcp_cubic_check_kfunc_ids) +BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, cubictcp_init) @@ -496,7 +496,7 @@ BTF_ID_FLAGS(func, cubictcp_cwnd_event) BTF_ID_FLAGS(func, cubictcp_acked) #endif #endif -BTF_SET8_END(tcp_cubic_check_kfunc_ids) +BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index bb23bb5b387a..e33fbe4933e4 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -260,7 +260,7 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { .name = "dctcp-reno", }; -BTF_SET8_START(tcp_dctcp_check_kfunc_ids) +BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, dctcp_init) @@ -271,7 +271,7 @@ BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) #endif #endif -BTF_SET8_END(tcp_dctcp_check_kfunc_ids) +BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 475358ec8212..d2492d050fe6 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -467,7 +467,7 @@ __bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status) __bpf_kfunc_end_defs(); -BTF_SET8_START(nf_ct_kfunc_set) +BTF_KFUNCS_START(nf_ct_kfunc_set) BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) @@ -478,7 +478,7 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) -BTF_SET8_END(nf_ct_kfunc_set) +BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { .owner = 
THIS_MODULE, diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 6e3b2f58855f..481be15609b1 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -54,9 +54,9 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, __bpf_kfunc_end_defs(); -BTF_SET8_START(nf_nat_kfunc_set) +BTF_KFUNCS_START(nf_nat_kfunc_set) BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) -BTF_SET8_END(nf_nat_kfunc_set) +BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c index 7d5e920141e9..5ea15037ebd1 100644 --- a/net/xfrm/xfrm_interface_bpf.c +++ b/net/xfrm/xfrm_interface_bpf.c @@ -93,10 +93,10 @@ __bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bp __bpf_kfunc_end_defs(); -BTF_SET8_START(xfrm_ifc_kfunc_set) +BTF_KFUNCS_START(xfrm_ifc_kfunc_set) BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info) BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info) -BTF_SET8_END(xfrm_ifc_kfunc_set) +BTF_KFUNCS_END(xfrm_ifc_kfunc_set) static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c index 9e20d4a377f7..2248eda741f8 100644 --- a/net/xfrm/xfrm_state_bpf.c +++ b/net/xfrm/xfrm_state_bpf.c @@ -117,10 +117,10 @@ __bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x) __bpf_kfunc_end_defs(); -BTF_SET8_START(xfrm_state_kfunc_set) +BTF_KFUNCS_START(xfrm_state_kfunc_set) BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE) -BTF_SET8_END(xfrm_state_kfunc_set) +BTF_KFUNCS_END(xfrm_state_kfunc_set) static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = { .owner = THIS_MODULE, diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 6f163a0f1c94..4754c662b39f 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -343,12 +343,12 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { .write = bpf_testmod_test_write, }; -BTF_SET8_START(bpf_testmod_common_kfunc_ids) +BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_kfunc_common_test) -BTF_SET8_END(bpf_testmod_common_kfunc_ids) +BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = { .owner = THIS_MODULE, @@ -494,7 +494,7 @@ __bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused return arg; } -BTF_SET8_START(bpf_testmod_check_kfunc_ids) +BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) BTF_ID_FLAGS(func, bpf_kfunc_call_test2) @@ -520,7 +520,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) -BTF_SET8_END(bpf_testmod_check_kfunc_ids) +BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) { From 
8263b3382d8c1af0fffa27095a9f1db6f2dad899 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 31 Jan 2024 23:26:15 +0200 Subject: [PATCH 033/119] libbpf: Remove unnecessary null check in kernel_supports() After recent changes, Coverity complained about inconsistent null checks in kernel_supports() function: kernel_supports(const struct bpf_object *obj, ...) [...] // var_compare_op: Comparing obj to null implies that obj might be null if (obj && obj->gen_loader) return true; // var_deref_op: Dereferencing null pointer obj if (obj->token_fd) return feat_supported(obj->feat_cache, feat_id); [...] - The original null check was introduced by commit [0], which introduced a call `kernel_supports(NULL, ...)` in function bump_rlimit_memlock(); - This call was refactored to use `feat_supported(NULL, ...)` in commit [1]. Looking at all places where kernel_supports() is called: - There is either `obj->...` access before the call; - Or `obj` comes from `prog->obj` expression, where `prog` comes from enumeration of programs in `obj`; - Or `obj` comes from `prog->obj`, where `prog` is a parameter to one of the API functions: - bpf_program__attach_kprobe_opts; - bpf_program__attach_kprobe; - bpf_program__attach_ksyscall. Assuming correct API usage, it appears that `obj` can never be null when passed to kernel_supports(). Silence the Coverity warning by removing redundant null check. [0] e542f2c4cd16 ("libbpf: Auto-bump RLIMIT_MEMLOCK if kernel needs it for BPF") [1] d6dd1d49367a ("libbpf: Further decouple feature checking logic from bpf_object") Signed-off-by: Eduard Zingerman Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240131212615.20112-1-eddyz87@gmail.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index db65ea59a05a..f6953d7faff1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4661,7 +4661,7 @@ bpf_object__probe_loading(struct bpf_object *obj) bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) { - if (obj && obj->gen_loader) + if (obj->gen_loader) /* To generate loader program assume the latest kernel * to avoid doing extra prog_load, map_create syscalls. */ From 088a464ed53feeab9632c6748b9f25354639e2bd Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Tue, 30 Jan 2024 19:37:59 -0800 Subject: [PATCH 034/119] bpf, docs: Clarify which legacy packet instructions existed As discussed on the BPF IETF mailing list (see link), this patch updates the "Legacy BPF Packet access instructions" section to clarify which instructions are deprecated (vs which were never defined and so are not deprecated). 
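The diff below spells out that encoding (class BPF_LD; size BPF_W, BPF_H, or BPF_B; mode BPF_ABS or BPF_IND). As a concrete illustration, here is a hedged sketch of one such legacy instruction, assuming only the standard opcode macros and struct bpf_insn layout from include/uapi/linux/bpf.h:

	#include <linux/bpf.h>

	/* Sketch: a legacy "load word from absolute packet offset 4"
	 * instruction. The loaded value implicitly lands in R0, one of
	 * the irregularities that led to these instructions being
	 * deprecated.
	 */
	struct bpf_insn legacy_ld_abs_w = {
		.code    = BPF_LD | BPF_W | BPF_ABS,
		.dst_reg = 0,	/* unused: destination is implicitly R0 */
		.src_reg = 0,
		.off     = 0,
		.imm     = 4,	/* absolute packet offset */
	};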
Signed-off-by: Dave Thaler Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: David Vernet Link: https://mailarchive.ietf.org/arch/msg/bpf/5LnnKm093cGpOmDI9TnLQLBXyys Link: https://lore.kernel.org/bpf/20240131033759.3634-1-dthaler1968@gmail.com --- Documentation/bpf/standardization/instruction-set.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index fceacca46299..dcbc9193c66f 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -630,7 +630,9 @@ Legacy BPF Packet access instructions ------------------------------------- BPF previously introduced special instructions for access to packet data that were -carried over from classic BPF. However, these instructions are +carried over from classic BPF. These instructions used an instruction +class of BPF_LD, a size modifier of BPF_W, BPF_H, or BPF_B, and a +mode modifier of BPF_ABS or BPF_IND. However, these instructions are deprecated and should no longer be used. All legacy packet access instructions belong to the "legacy" conformance group instead of the "basic" conformance group. From 69065aa11ca680d76a6c6bc088aa0f0abe24afdb Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 30 Jan 2024 12:46:58 +0000 Subject: [PATCH 035/119] riscv, bpf: Enable inline bpf_kptr_xchg() for RV64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RV64 JIT supports 64-bit BPF_XCHG atomic instructions. At the same time, the underlying implementations of xchg() and atomic64_xchg() on RV64 are both raw_xchg(), which supports 64-bit operands. Therefore, inlining bpf_kptr_xchg() will have equivalent semantics. Let's inline it for better performance.
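For context, here is a minimal sketch of the BPF-program-side pattern that benefits from the inlining; the map, type, and program names are illustrative and not taken from this series:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	/* Illustrative only: a map value holding a referenced task kptr. */
	struct map_value {
		struct task_struct __kptr *ptr;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, struct map_value);
	} tasks SEC(".maps");

	void bpf_task_release(struct task_struct *p) __ksym;

	SEC("tp_btf/task_newtask")
	int clear_slot(void *ctx)
	{
		int key = 0;
		struct map_value *v = bpf_map_lookup_elem(&tasks, &key);
		struct task_struct *old;

		if (!v)
			return 0;
		/* With bpf_jit_supports_ptr_xchg() returning true, the
		 * verifier inlines this helper call into a single 64-bit
		 * BPF_XCHG instruction instead of emitting a call.
		 */
		old = bpf_kptr_xchg(&v->ptr, NULL);
		if (old)
			bpf_task_release(old);
		return 0;
	}

	char _license[] SEC("license") = "GPL";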
Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240130124659.670321-2-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit_comp64.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index fda6b4f6a4c1..869e4282a2c4 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1806,3 +1806,8 @@ bool bpf_jit_supports_kfunc_call(void) { return true; } + +bool bpf_jit_supports_ptr_xchg(void) +{ + return true; +} From 994ff2f7973982af286608da10c295383650fc28 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 30 Jan 2024 12:46:59 +0000 Subject: [PATCH 036/119] selftests/bpf: Enable inline bpf_kptr_xchg() test for RV64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable the inline bpf_kptr_xchg() test for RV64; the test has passed, as shown below: Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240130124659.670321-3-pulehui@huaweicloud.com --- tools/testing/selftests/bpf/prog_tests/kptr_xchg_inline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/kptr_xchg_inline.c b/tools/testing/selftests/bpf/prog_tests/kptr_xchg_inline.c index 15144943e88b..7def158da9eb 100644 --- a/tools/testing/selftests/bpf/prog_tests/kptr_xchg_inline.c +++ b/tools/testing/selftests/bpf/prog_tests/kptr_xchg_inline.c @@ -13,7 +13,8 @@ void test_kptr_xchg_inline(void) unsigned int cnt; int err; -#if !(defined(__x86_64__) || defined(__aarch64__)) +#if !(defined(__x86_64__) || defined(__aarch64__) || \ + (defined(__riscv) && __riscv_xlen == 64)) test__skip(); return; #endif From 1581e5118e485e82cfb5d04d636a79aaefb6f266 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Thu, 1 Feb 2024 10:43:40 +0000 Subject: [PATCH 037/119] bpf: Minor clean-up to sleepable_lsm_hooks BTF set There's already one main CONFIG_SECURITY_NETWORK ifdef block within the sleepable_lsm_hooks BTF set. Consolidate the duplicated ifdef block, as there's no need for a second one and everything it guards should remain in one place in this specific context.
Signed-off-by: Matt Bobrowski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/Zbt1smz43GDMbVU3@google.com --- kernel/bpf/bpf_lsm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 63b4dc495125..68240c3c6e7d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -282,10 +282,6 @@ BTF_ID(func, bpf_lsm_file_lock) BTF_ID(func, bpf_lsm_file_open) BTF_ID(func, bpf_lsm_file_receive) -#ifdef CONFIG_SECURITY_NETWORK -BTF_ID(func, bpf_lsm_inet_conn_established) -#endif /* CONFIG_SECURITY_NETWORK */ - BTF_ID(func, bpf_lsm_inode_create) BTF_ID(func, bpf_lsm_inode_free_security) BTF_ID(func, bpf_lsm_inode_getattr) @@ -336,6 +332,8 @@ BTF_ID(func, bpf_lsm_sb_umount) BTF_ID(func, bpf_lsm_settime) #ifdef CONFIG_SECURITY_NETWORK +BTF_ID(func, bpf_lsm_inet_conn_established) + BTF_ID(func, bpf_lsm_socket_accept) BTF_ID(func, bpf_lsm_socket_bind) BTF_ID(func, bpf_lsm_socket_connect) From 9fa5e1a180aa639fb156a16e453ab820b7e7860b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 1 Feb 2024 09:20:23 -0800 Subject: [PATCH 038/119] libbpf: Call memfd_create() syscall directly Some versions of Android do not provide a memfd_create() wrapper in their libc implementation, leading to build failures ([0]). On the other hand, memfd_create() is available as a syscall on quite old kernels (3.17+, while the bpf() syscall itself is available since 3.18+), so it is OK to assume the syscall is available and call into it with the syscall() helper, avoiding Android-specific workarounds. Validated in libbpf-bootstrap's CI ([1]). [0] https://github.com/libbpf/libbpf-bootstrap/actions/runs/7701003207/job/20986080319#step:5:83 [1] https://github.com/libbpf/libbpf-bootstrap/actions/runs/7715988887/job/21031767212?pr=253 Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240201172027.604869-2-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f6953d7faff1..6932f2c4ddfd 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1525,11 +1525,20 @@ static Elf64_Sym *find_elf_var_sym(const struct bpf_object *obj, const char *nam return ERR_PTR(-ENOENT); } +/* Some versions of Android don't provide memfd_create() in their libc + * implementation, so avoid complications and just go straight to Linux + * syscall. + */ +static int sys_memfd_create(const char *name, unsigned flags) +{ + return syscall(__NR_memfd_create, name, flags); +} + static int create_placeholder_fd(void) { int fd; - fd = ensure_good_fd(memfd_create("libbpf-placeholder-fd", MFD_CLOEXEC)); + fd = ensure_good_fd(sys_memfd_create("libbpf-placeholder-fd", MFD_CLOEXEC)); if (fd < 0) return -errno; return fd; From 93ee1eb85e28d1e35bb059c1f5965d65d5fc83c2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 1 Feb 2024 09:20:24 -0800 Subject: [PATCH 039/119] libbpf: Add missing LIBBPF_API annotation to libbpf_set_memlock_rlim API The LIBBPF_API annotation is missing on the libbpf_set_memlock_rlim API, so add it to make this API callable from libbpf's shared library version.
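As background, LIBBPF_API is libbpf's symbol-export macro (defined in libbpf_common.h); libbpf builds its shared library with -fvisibility=hidden, so an unannotated declaration is not exported. A rough sketch of the mechanism (some_internal_helper is a made-up name for illustration):

	/* Approximation; see libbpf_common.h for the real definition. */
	#ifndef LIBBPF_API
	#define LIBBPF_API __attribute__((visibility("default")))
	#endif

	/* Exported from libbpf.so */
	LIBBPF_API int libbpf_set_memlock_rlim(size_t memlock_bytes);

	/* Hidden under -fvisibility=hidden, not callable by library users */
	int some_internal_helper(void);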
Fixes: e542f2c4cd16 ("libbpf: Auto-bump RLIMIT_MEMLOCK if kernel needs it for BPF") Fixes: ab9a5a05dc48 ("libbpf: fix up few libbpf.map problems") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240201172027.604869-3-andrii@kernel.org --- tools/lib/bpf/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 1441f642c563..f866e98b2436 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -35,7 +35,7 @@ extern "C" { #endif -int libbpf_set_memlock_rlim(size_t memlock_bytes); +LIBBPF_API int libbpf_set_memlock_rlim(size_t memlock_bytes); struct bpf_map_create_opts { size_t sz; /* size of this struct for forward/backward compatibility */ From c81a8ab196b5083d5109a51585fcc24fa2055a77 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 1 Feb 2024 09:20:25 -0800 Subject: [PATCH 040/119] libbpf: Add btf__new_split() API that was declared but not implemented Seems like the original commit adding split BTF support intended to add the btf__new_split() API, and even declared it in libbpf.map, but never added the (trivial) implementation. Fix this. Fixes: ba451366bf44 ("libbpf: Implement basic split BTF support") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240201172027.604869-4-andrii@kernel.org --- tools/lib/bpf/btf.c | 5 +++++ tools/lib/bpf/libbpf.map | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 95db88b36cf3..845034d15420 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1079,6 +1079,11 @@ struct btf *btf__new(const void *data, __u32 size) return libbpf_ptr(btf_new(data, size, NULL)); } +struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf) +{ + return libbpf_ptr(btf_new(data, size, base_btf)); +} + static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) { diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index d9e1f57534fa..386964f572a8 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -245,7 +245,6 @@ LIBBPF_0.3.0 { btf__parse_raw_split; btf__parse_split; btf__new_empty_split; - btf__new_split; ring_buffer__epoll_fd; } LIBBPF_0.2.0; @@ -411,5 +410,7 @@ LIBBPF_1.3.0 { } LIBBPF_1.2.0; LIBBPF_1.4.0 { + global: bpf_token_create; + btf__new_split; } LIBBPF_1.3.0; From b9551da8cf3ade01a50316df8a618fd945723ee0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 1 Feb 2024 09:20:26 -0800 Subject: [PATCH 041/119] libbpf: Add missed btf_ext__raw_data() API Another API that was declared in libbpf.map but whose actual implementation was missing. btf_ext__get_raw_data() was intended as a discouraged alias to the consistently named btf_ext__raw_data(), so make this an actuality.
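Callers should use the consistently named accessor; a small illustrative helper (the file-dumping use case is hypothetical, not from the patch) might look like:

	#include <stdio.h>
	#include <bpf/btf.h>

	/* Dump raw .BTF.ext bytes; btf_ext__get_raw_data() now resolves
	 * to the same function through the alias and remains only for
	 * backwards compatibility.
	 */
	static int dump_btf_ext(const struct btf_ext *ext, FILE *f)
	{
		__u32 sz;
		const void *data = btf_ext__raw_data(ext, &sz);

		if (!data)
			return -1;
		return fwrite(data, 1, sz, f) == sz ? 0 : -1;
	}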
Fixes: 20eccf29e297 ("libbpf: hide and discourage inconsistently named getters") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240201172027.604869-5-andrii@kernel.org --- tools/lib/bpf/btf.c | 6 +++++- tools/lib/bpf/libbpf.map | 2 +- tools/lib/bpf/linker.c | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 845034d15420..a17b4c9c4213 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -3050,12 +3050,16 @@ done: return btf_ext; } -const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size) +const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size) { *size = btf_ext->data_size; return btf_ext->data; } +__attribute__((alias("btf_ext__raw_data"))) +const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); + + struct btf_dedup; static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 386964f572a8..86804fd90dd1 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -325,7 +325,6 @@ LIBBPF_0.7.0 { bpf_xdp_detach; bpf_xdp_query; bpf_xdp_query_id; - btf_ext__raw_data; libbpf_probe_bpf_helper; libbpf_probe_bpf_map_type; libbpf_probe_bpf_prog_type; @@ -413,4 +412,5 @@ LIBBPF_1.4.0 { global: bpf_token_create; btf__new_split; + btf_ext__raw_data; } LIBBPF_1.3.0; diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 16bca56002ab..0d4be829551b 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -2732,7 +2732,7 @@ static int finalize_btf(struct bpf_linker *linker) /* Emit .BTF.ext section */ if (linker->btf_ext) { - raw_data = btf_ext__get_raw_data(linker->btf_ext, &raw_sz); + raw_data = btf_ext__raw_data(linker->btf_ext, &raw_sz); if (!raw_data) return -ENOMEM; From 943b043aeecce9accb6d367af47791c633e95e4d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 1 Feb 2024 09:20:27 -0800 Subject: [PATCH 042/119] selftests/bpf: Fix bench runner SIGSEGV Some benchmarks don't have either "consumer" or "producer" sides. For example, trig-tp and other BPF triggering benchmarks don't have consumers, as they only do "producing" by calling into a syscall or predefined uprobes. As such, it's valid for some benchmarks to have zero consumers or producers, so allow specifying `-c0` explicitly. This exposes another problem: if a benchmark doesn't support the consumer or producer side, its consumer_thread/producer_thread callback will be NULL, but the benchmark runner will attempt to use those NULL callbacks to create threads anyway. So instead of crashing with SIGSEGV on a misconfigured benchmark, detect the condition and report an error.
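With this change, a triggering benchmark can be run with consumers disabled explicitly, for example `./bench -p1 -c0 trig-tp` (invocation shown for illustration only; flag defaults vary per benchmark), and a benchmark that lacks the requested side now fails with a clear diagnostic instead of crashing on a NULL callback.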
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240201172027.604869-6-andrii@kernel.org --- tools/testing/selftests/bpf/bench.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 73ce11b0547d..1724d50ba942 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -323,14 +323,14 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) break; case 'p': env.producer_cnt = strtol(arg, NULL, 10); - if (env.producer_cnt <= 0) { + if (env.producer_cnt < 0) { fprintf(stderr, "Invalid producer count: %s\n", arg); argp_usage(state); } break; case 'c': env.consumer_cnt = strtol(arg, NULL, 10); - if (env.consumer_cnt <= 0) { + if (env.consumer_cnt < 0) { fprintf(stderr, "Invalid consumer count: %s\n", arg); argp_usage(state); } @@ -607,6 +607,10 @@ static void setup_benchmark(void) bench->setup(); for (i = 0; i < env.consumer_cnt; i++) { + if (!bench->consumer_thread) { + fprintf(stderr, "benchmark doesn't support consumers!\n"); + exit(1); + } err = pthread_create(&state.consumers[i], NULL, bench->consumer_thread, (void *)(long)i); if (err) { @@ -626,6 +630,10 @@ static void setup_benchmark(void) env.prod_cpus.next_cpu = env.cons_cpus.next_cpu; for (i = 0; i < env.producer_cnt; i++) { + if (!bench->producer_thread) { + fprintf(stderr, "benchmark doesn't support producers!\n"); + exit(1); + } err = pthread_create(&state.producers[i], NULL, bench->producer_thread, (void *)(long)i); if (err) { From e67ddd9b1cff7872d43ead73a1403c4e532003d9 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Sat, 27 Jan 2024 19:52:32 +0200 Subject: [PATCH 043/119] bpf: Track spilled unbounded scalars Support the pattern where an unbounded scalar is spilled to the stack, then boundary checks are performed on the src register, after which the stack frame slot is refilled into a register. Before this commit, the verifier didn't treat the src register and the stack slot as related if the src register was an unbounded scalar. The register state wasn't copied, the id wasn't preserved, and the stack slot was marked as STACK_MISC. Subsequent boundary checks on the src register wouldn't result in updating the boundaries of the spilled variable on the stack. After this commit, the verifier will preserve the bond between src and dst even if src is unbounded, which permits to do boundary checks on src and refill dst later, still remembering its boundaries. Such a pattern is sometimes generated by clang when compiling complex long functions. One test is adjusted to reflect that now unbounded scalars are tracked. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240127175237.526726-2-maxtram95@gmail.com --- kernel/bpf/verifier.c | 16 +--------------- .../selftests/bpf/progs/verifier_spill_fill.c | 6 +++--- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd4d780e5400..28f62f24da7e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4380,20 +4380,6 @@ static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) return subreg32 ? 
tnum_subreg(reg->var_off).value : reg->var_off.value; } -static bool __is_scalar_unbounded(struct bpf_reg_state *reg) -{ - return tnum_is_unknown(reg->var_off) && - reg->smin_value == S64_MIN && reg->smax_value == S64_MAX && - reg->umin_value == 0 && reg->umax_value == U64_MAX && - reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX && - reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX; -} - -static bool register_is_bounded(struct bpf_reg_state *reg) -{ - return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg); -} - static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -4504,7 +4490,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; mark_stack_slot_scratched(env, spi); - if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) { + if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { bool reg_value_fits; reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size; diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 7013a9694163..317806451762 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -452,9 +452,9 @@ l0_%=: r1 >>= 16; \ SEC("raw_tp") __log_level(2) __success -__msg("fp-8=0m??mmmm") -__msg("fp-16=00mm??mm") -__msg("fp-24=00mm???m") +__msg("fp-8=0m??scalar()") +__msg("fp-16=00mm??scalar()") +__msg("fp-24=00mm???scalar()") __naked void spill_subregs_preserve_stack_zero(void) { asm volatile ( From 6be503cec6c9bccd64f72c03697011d2e2b96fc3 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Sat, 27 Jan 2024 19:52:33 +0200 Subject: [PATCH 044/119] selftests/bpf: Test tracking spilled unbounded scalars The previous commit added tracking for unbounded scalars on spill. Add the test case to check the new functionality. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240127175237.526726-3-maxtram95@gmail.com --- .../selftests/bpf/progs/verifier_spill_fill.c | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 317806451762..f9803005e1c0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -940,4 +940,31 @@ l0_%=: r0 = 0; \ : __clobber_all); } +SEC("xdp") +__description("spill unbounded reg, then range check src") +__success __retval(0) +__naked void spill_unbounded(void) +{ + asm volatile (" \ + /* Produce an unbounded scalar. */ \ + call %[bpf_get_prandom_u32]; \ + /* Spill r0 to stack. */ \ + *(u64*)(r10 - 8) = r0; \ + /* Boundary check on r0. */ \ + if r0 > 16 goto l0_%=; \ + /* Fill r0 from stack. */ \ + r0 = *(u64*)(r10 - 8); \ + /* Boundary check on r0 with predetermined result. */\ + if r0 <= 16 goto l0_%=; \ + /* Dead branch: the verifier should prune it. Do an invalid memory\ + * access if the verifier follows it. 
\ + */ \ + r0 = *(u64*)(r9 + 0); \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From c1e6148cb4f83cec841db1f066e8db4a86c1f118 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Sat, 27 Jan 2024 19:52:34 +0200 Subject: [PATCH 045/119] bpf: Preserve boundaries and track scalars on narrowing fill When the width of a fill is smaller than the width of the preceding spill, the information about scalar boundaries can still be preserved, as long as it's coerced to the right width (done by coerce_reg_to_size). Even further, if the actual value fits into the fill width, the ID can be preserved as well for further tracking of equal scalars. Implement the above improvements, which makes narrowing fills behave the same as narrowing spills and MOVs between registers. Two tests are adjusted to accommodate for endianness differences and to take into account that it's now allowed to do a narrowing fill from the least significant bits. reg_bounds_sync is added to coerce_reg_to_size to correctly adjust umin/umax boundaries after the var_off truncation, for example, a 64-bit value 0xXXXXXXXX00000000, when read as a 32-bit, gets umin = 0, umax = 0xFFFFFFFF, var_off = (0x0; 0xffffffff00000000), which needs to be synced down to umax = 0, otherwise reg_bounds_sanity_check doesn't pass. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240127175237.526726-4-maxtram95@gmail.com --- include/linux/bpf_verifier.h | 9 +++++++ kernel/bpf/verifier.c | 15 ++++++++--- .../selftests/bpf/progs/verifier_spill_fill.c | 26 ++++++++++++++----- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0dcde339dc7e..84365e6dd85d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -919,6 +919,15 @@ static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) env->scratched_stack_slots = ~0ULL; } +static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_size) +{ +#ifdef __BIG_ENDIAN + off -= spill_size - fill_size; +#endif + + return !(off % BPF_REG_SIZE); +} + const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 28f62f24da7e..82af971926ac 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4778,7 +4778,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, if (dst_regno < 0) return 0; - if (!(off % BPF_REG_SIZE) && size == spill_size) { + if (size <= spill_size && + bpf_stack_narrow_access_ok(off, size, spill_size)) { /* The earlier check_reg_arg() has decided the * subreg_def for this insn. Save it first. */ @@ -4786,6 +4787,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; + + /* Break the relation on a narrowing fill. + * coerce_reg_to_size will adjust the boundaries. + */ + if (get_reg_width(reg) > size * BITS_PER_BYTE) + state->regs[dst_regno].id = 0; } else { int spill_cnt = 0, zero_cnt = 0; @@ -6061,10 +6068,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) * values are also truncated so we push 64-bit bounds into * 32-bit bounds. 
Above were truncated < 32-bits already. */ - if (size < 4) { + if (size < 4) __mark_reg32_unbounded(reg); - reg_bounds_sync(reg); - } + + reg_bounds_sync(reg); } static void set_sext64_default_val(struct bpf_reg_state *reg, int size) diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index f9803005e1c0..3e5d063ea7e8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -217,7 +217,7 @@ __naked void uninit_u32_from_the_stack(void) SEC("tc") __description("Spill a u32 const scalar. Refill as u16. Offset to skb->data") -__failure __msg("invalid access to packet") +__success __retval(0) __naked void u16_offset_to_skb_data(void) { asm volatile (" \ @@ -225,13 +225,19 @@ __naked void u16_offset_to_skb_data(void) r3 = *(u32*)(r1 + %[__sk_buff_data_end]); \ w4 = 20; \ *(u32*)(r10 - 8) = r4; \ - r4 = *(u16*)(r10 - 8); \ + " +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r4 = *(u16*)(r10 - 8);" +#else + "r4 = *(u16*)(r10 - 6);" +#endif + " \ r0 = r2; \ - /* r0 += r4 R0=pkt R2=pkt R3=pkt_end R4=umax=65535 */\ + /* r0 += r4 R0=pkt R2=pkt R3=pkt_end R4=20 */\ r0 += r4; \ - /* if (r0 > r3) R0=pkt,umax=65535 R2=pkt R3=pkt_end R4=umax=65535 */\ + /* if (r0 > r3) R0=pkt,off=20 R2=pkt R3=pkt_end R4=20 */\ if r0 > r3 goto l0_%=; \ - /* r0 = *(u32 *)r2 R0=pkt,umax=65535 R2=pkt R3=pkt_end R4=20 */\ + /* r0 = *(u32 *)r2 R0=pkt,off=20 R2=pkt R3=pkt_end R4=20 */\ r0 = *(u32*)(r2 + 0); \ l0_%=: r0 = 0; \ exit; \ @@ -268,7 +274,7 @@ l0_%=: r0 = 0; \ } SEC("tc") -__description("Spill a u32 const scalar. Refill as u16 from fp-6. Offset to skb->data") +__description("Spill a u32 const scalar. Refill as u16 from MSB. Offset to skb->data") __failure __msg("invalid access to packet") __naked void _6_offset_to_skb_data(void) { @@ -277,7 +283,13 @@ __naked void _6_offset_to_skb_data(void) r3 = *(u32*)(r1 + %[__sk_buff_data_end]); \ w4 = 20; \ *(u32*)(r10 - 8) = r4; \ - r4 = *(u16*)(r10 - 6); \ + " +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r4 = *(u16*)(r10 - 6);" +#else + "r4 = *(u16*)(r10 - 8);" +#endif + " \ r0 = r2; \ /* r0 += r4 R0=pkt R2=pkt R3=pkt_end R4=umax=65535 */\ r0 += r4; \ From 067313a85c6f213932518f12f628810f0092492b Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Sat, 27 Jan 2024 19:52:35 +0200 Subject: [PATCH 046/119] selftests/bpf: Add test cases for narrowing fill The previous commit allowed to preserve boundaries and track IDs of scalars on narrowing fills. Add test cases for that pattern. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240127175237.526726-5-maxtram95@gmail.com --- .../selftests/bpf/progs/verifier_spill_fill.c | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 3e5d063ea7e8..7f3b1319bd99 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -979,4 +979,115 @@ l0_%=: r0 = 0; \ : __clobber_all); } +SEC("xdp") +__description("32-bit fill after 64-bit spill") +__success __retval(0) +__naked void fill_32bit_after_spill_64bit(void) +{ + asm volatile(" \ + /* Randomize the upper 32 bits. */ \ + call %[bpf_get_prandom_u32]; \ + r0 <<= 32; \ + /* 64-bit spill r0 to stack. 
*/ \ + *(u64*)(r10 - 8) = r0; \ + /* 32-bit fill r0 from stack. */ \ + " +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(u32*)(r10 - 8);" +#else + "r0 = *(u32*)(r10 - 4);" +#endif + " \ + /* Boundary check on r0 with predetermined result. */\ + if r0 == 0 goto l0_%=; \ + /* Dead branch: the verifier should prune it. Do an invalid memory\ + * access if the verifier follows it. \ + */ \ + r0 = *(u64*)(r9 + 0); \ +l0_%=: exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("xdp") +__description("32-bit fill after 64-bit spill of 32-bit value should preserve ID") +__success __retval(0) +__naked void fill_32bit_after_spill_64bit_preserve_id(void) +{ + asm volatile (" \ + /* Randomize the lower 32 bits. */ \ + call %[bpf_get_prandom_u32]; \ + w0 &= 0xffffffff; \ + /* 64-bit spill r0 to stack - should assign an ID. */\ + *(u64*)(r10 - 8) = r0; \ + /* 32-bit fill r1 from stack - should preserve the ID. */\ + " +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r1 = *(u32*)(r10 - 8);" +#else + "r1 = *(u32*)(r10 - 4);" +#endif + " \ + /* Compare r1 with another register to trigger find_equal_scalars. */\ + r2 = 0; \ + if r1 != r2 goto l0_%=; \ + /* The result of this comparison is predefined. */\ + if r0 == r2 goto l0_%=; \ + /* Dead branch: the verifier should prune it. Do an invalid memory\ + * access if the verifier follows it. \ + */ \ + r0 = *(u64*)(r9 + 0); \ + exit; \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("xdp") +__description("32-bit fill after 64-bit spill should clear ID") +__failure __msg("math between ctx pointer and 4294967295 is not allowed") +__naked void fill_32bit_after_spill_64bit_clear_id(void) +{ + asm volatile (" \ + r6 = r1; \ + /* Roll one bit to force the verifier to track both branches. */\ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x8; \ + /* Put a large number into r1. */ \ + r1 = 0xffffffff; \ + r1 <<= 32; \ + r1 += r0; \ + /* 64-bit spill r1 to stack - should assign an ID. */\ + *(u64*)(r10 - 8) = r1; \ + /* 32-bit fill r2 from stack - should clear the ID. */\ + " +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r2 = *(u32*)(r10 - 8);" +#else + "r2 = *(u32*)(r10 - 4);" +#endif + " \ + /* Compare r2 with another register to trigger find_equal_scalars.\ + * Having one random bit is important here, otherwise the verifier cuts\ + * the corners. If the ID was mistakenly preserved on fill, this would\ + * cause the verifier to think that r1 is also equal to zero in one of\ + * the branches, and equal to eight on the other branch.\ + */ \ + r3 = 0; \ + if r2 != r3 goto l0_%=; \ +l0_%=: r1 >>= 32; \ + /* The verifier shouldn't propagate r2's range to r1, so it should\ + * still remember r1 = 0xffffffff and reject the below.\ + */ \ + r6 += r1; \ + r0 = *(u32*)(r6 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From 6efbde200bf3cf2dbf6e7181893fed13a79c789b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 27 Jan 2024 19:52:36 +0200 Subject: [PATCH 047/119] bpf: Handle scalar spill vs all MISC in stacksafe() When check_stack_read_fixed_off() reads value from an spi all stack slots of which are set to STACK_{MISC,INVALID}, the destination register is set to unbound SCALAR_VALUE. Exploit this fact by allowing stacksafe() to use a fake unbound scalar register to compare 'mmmm mmmm' stack value in old state vs spilled 64-bit scalar in current state and vice versa. 
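(In the verifier's stack-slot notation, 'mmmm mmmm' means all eight bytes of the slot are STACK_MISC.) Condensing the stacksafe() hunk from the diff below, the comparison flow becomes:

	/* If one side is a 64-bit spilled scalar and the other is
	 * all-MISC, compare via a fake unbound scalar register instead
	 * of failing on the slot_type mismatch; regsafe() then also
	 * compares scalar ids.
	 */
	old_reg = scalar_reg_for_stack(env, &old->stack[spi]);
	cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]);
	if (old_reg && cur_reg) {
		if (!regsafe(env, old_reg, cur_reg, idmap, exact))
			return false;
		i += BPF_REG_SIZE - 1;
		continue;
	}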
Veristat results after this patch show some gains: ./veristat -C -e file,prog,states -f 'states_pct>10' not-opt after File Program States (DIFF) ----------------------- --------------------- --------------- bpf_overlay.o tail_rev_nodeport_lb4 -45 (-15.85%) bpf_xdp.o tail_lb_ipv4 -541 (-19.57%) pyperf100.bpf.o on_event -680 (-10.42%) pyperf180.bpf.o on_event -2164 (-19.62%) pyperf600.bpf.o on_event -9799 (-24.84%) strobemeta.bpf.o on_event -9157 (-65.28%) xdp_synproxy_kern.bpf.o syncookie_tc -54 (-19.29%) xdp_synproxy_kern.bpf.o syncookie_xdp -74 (-24.50%) Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240127175237.526726-6-maxtram95@gmail.com --- kernel/bpf/verifier.c | 72 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82af971926ac..a0b8e400b3df 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1155,6 +1155,12 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack) stack->spilled_ptr.type == SCALAR_VALUE; } +static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack) +{ + return stack->slot_type[0] == STACK_SPILL && + stack->spilled_ptr.type == SCALAR_VALUE; +} + /* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which * case they are equivalent, or it's STACK_ZERO, in which case we preserve * more precise STACK_ZERO. @@ -2264,8 +2270,7 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) } /* Mark a register as having a completely unknown (scalar) value. */ -static void __mark_reg_unknown(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { /* * Clear type, off, and union(map_ptr, range) and @@ -2277,10 +2282,20 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->ref_obj_id = 0; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = !env->bpf_capable; + reg->precise = false; __mark_reg_unbounded(reg); } +/* Mark a register as having a completely unknown (scalar) value, + * initialize .precise as true when not bpf capable. 
+ */ +static void __mark_reg_unknown(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + __mark_reg_unknown_imprecise(reg); + reg->precise = !env->bpf_capable; +} + static void mark_reg_unknown(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { @@ -16486,6 +16501,43 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } } +static struct bpf_reg_state unbound_reg; + +static __init int unbound_reg_init(void) +{ + __mark_reg_unknown_imprecise(&unbound_reg); + unbound_reg.live |= REG_LIVE_READ; + return 0; +} +late_initcall(unbound_reg_init); + +static bool is_stack_all_misc(struct bpf_verifier_env *env, + struct bpf_stack_state *stack) +{ + u32 i; + + for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) { + if ((stack->slot_type[i] == STACK_MISC) || + (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack)) + continue; + return false; + } + + return true; +} + +static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env, + struct bpf_stack_state *stack) +{ + if (is_spilled_scalar_reg64(stack)) + return &stack->spilled_ptr; + + if (is_stack_all_misc(env, stack)) + return &unbound_reg; + + return NULL; +} + static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact) { @@ -16524,6 +16576,20 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, if (i >= cur->allocated_stack) return false; + /* 64-bit scalar spill vs all slots MISC and vice versa. + * Load from all slots MISC produces unbound scalar. + * Construct a fake register for such stack and call + * regsafe() to ensure scalar ids are compared. + */ + old_reg = scalar_reg_for_stack(env, &old->stack[spi]); + cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]); + if (old_reg && cur_reg) { + if (!regsafe(env, old_reg, cur_reg, idmap, exact)) + return false; + i += BPF_REG_SIZE - 1; + continue; + } + /* if old state was safe with misc data in the stack * it will be safe with zero-initialized stack. * The opposite is not true From 73a28d9d000e8d20b4b3c516b74ee92afe3ae4be Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 27 Jan 2024 19:52:37 +0200 Subject: [PATCH 048/119] selftests/bpf: States pruning checks for scalar vs STACK_MISC Check that stacksafe() compares spilled scalars with STACK_MISC. The following combinations are explored: - old spill of imprecise scalar is equivalent to cur STACK_{MISC,INVALID} (plus error in unpriv mode); - old spill of precise scalar is not equivalent to cur STACK_MISC; - old STACK_MISC is equivalent to cur scalar; - old STACK_MISC is not equivalent to cur non-scalar. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240127175237.526726-7-maxtram95@gmail.com --- .../selftests/bpf/progs/verifier_spill_fill.c | 154 ++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 7f3b1319bd99..85e48069c9e6 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -1090,4 +1090,158 @@ l0_%=: r1 >>= 32; \ : __clobber_all); } +/* stacksafe(): check if stack spill of an imprecise scalar in old state + * is considered equivalent to STACK_{MISC,INVALID} in cur state. 
+ */ +SEC("socket") +__success __log_level(2) +__msg("8: (79) r1 = *(u64 *)(r10 -8)") +__msg("8: safe") +__msg("processed 11 insns") +/* STACK_INVALID should prevent verifier in unpriv mode from + * considering states equivalent and force an error on second + * verification path (entry - label 1 - label 2). + */ +__failure_unpriv +__msg_unpriv("8: (79) r1 = *(u64 *)(r10 -8)") +__msg_unpriv("9: (95) exit") +__msg_unpriv("8: (79) r1 = *(u64 *)(r10 -8)") +__msg_unpriv("invalid read from stack off -8+2 size 8") +__flag(BPF_F_TEST_STATE_FREQ) +__naked void old_imprecise_scalar_vs_cur_stack_misc(void) +{ + asm volatile( + /* get a random value for branching */ + "call %[bpf_ktime_get_ns];" + "if r0 == 0 goto 1f;" + /* conjure scalar at fp-8 */ + "r0 = 42;" + "*(u64*)(r10 - 8) = r0;" + "goto 2f;" +"1:" + /* conjure STACK_{MISC,INVALID} at fp-8 */ + "call %[bpf_ktime_get_ns];" + "*(u16*)(r10 - 8) = r0;" + "*(u16*)(r10 - 4) = r0;" +"2:" + /* read fp-8, should be considered safe on second visit */ + "r1 = *(u64*)(r10 - 8);" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +/* stacksafe(): check that stack spill of a precise scalar in old state + * is not considered equivalent to STACK_MISC in cur state. + */ +SEC("socket") +__success __log_level(2) +/* verifier should visit 'if r1 == 0x2a ...' two times: + * - once for path entry - label 2; + * - once for path entry - label 1 - label 2. + */ +__msg("if r1 == 0x2a goto pc+0") +__msg("if r1 == 0x2a goto pc+0") +__msg("processed 15 insns") +__flag(BPF_F_TEST_STATE_FREQ) +__naked void old_precise_scalar_vs_cur_stack_misc(void) +{ + asm volatile( + /* get a random value for branching */ + "call %[bpf_ktime_get_ns];" + "if r0 == 0 goto 1f;" + /* conjure scalar at fp-8 */ + "r0 = 42;" + "*(u64*)(r10 - 8) = r0;" + "goto 2f;" +"1:" + /* conjure STACK_MISC at fp-8 */ + "call %[bpf_ktime_get_ns];" + "*(u64*)(r10 - 8) = r0;" + "*(u32*)(r10 - 4) = r0;" +"2:" + /* read fp-8, should not be considered safe on second visit */ + "r1 = *(u64*)(r10 - 8);" + /* use r1 in precise context */ + "if r1 == 42 goto +0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +/* stacksafe(): check if STACK_MISC in old state is considered + * equivalent to stack spill of a scalar in cur state. + */ +SEC("socket") +__success __log_level(2) +__msg("8: (79) r0 = *(u64 *)(r10 -8)") +__msg("8: safe") +__msg("processed 11 insns") +__flag(BPF_F_TEST_STATE_FREQ) +__naked void old_stack_misc_vs_cur_scalar(void) +{ + asm volatile( + /* get a random value for branching */ + "call %[bpf_ktime_get_ns];" + "if r0 == 0 goto 1f;" + /* conjure STACK_{MISC,INVALID} at fp-8 */ + "call %[bpf_ktime_get_ns];" + "*(u16*)(r10 - 8) = r0;" + "*(u16*)(r10 - 4) = r0;" + "goto 2f;" +"1:" + /* conjure scalar at fp-8 */ + "r0 = 42;" + "*(u64*)(r10 - 8) = r0;" +"2:" + /* read fp-8, should be considered safe on second visit */ + "r0 = *(u64*)(r10 - 8);" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +/* stacksafe(): check that STACK_MISC in old state is not considered + * equivalent to stack spill of a non-scalar in cur state. + */ +SEC("socket") +__success __log_level(2) +/* verifier should process exit instructions twice: + * - once for path entry - label 2; + * - once for path entry - label 1 - label 2. 
+ */ +__msg("r1 = *(u64 *)(r10 -8)") +__msg("exit") +__msg("r1 = *(u64 *)(r10 -8)") +__msg("exit") +__msg("processed 11 insns") +__flag(BPF_F_TEST_STATE_FREQ) +__naked void old_stack_misc_vs_cur_ctx_ptr(void) +{ + asm volatile( + /* remember context pointer in r9 */ + "r9 = r1;" + /* get a random value for branching */ + "call %[bpf_ktime_get_ns];" + "if r0 == 0 goto 1f;" + /* conjure STACK_MISC at fp-8 */ + "call %[bpf_ktime_get_ns];" + "*(u64*)(r10 - 8) = r0;" + "*(u32*)(r10 - 4) = r0;" + "goto 2f;" +"1:" + /* conjure context pointer in fp-8 */ + "*(u64*)(r10 - 8) = r9;" +"2:" + /* read fp-8, should not be considered safe on second visit */ + "r1 = *(u64*)(r10 - 8);" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From a68b50f47bec8bd6a33b07b7e1562db2553981a7 Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Fri, 2 Feb 2024 17:55:58 +0800 Subject: [PATCH 049/119] selftests/bpf: trace_helpers.c: do not use poisoned type After commit c698eaebdf47 ("selftests/bpf: trace_helpers.c: Optimize kallsyms cache") trace_helpers.c now includes libbpf_internal.h, and thus can no longer use the u32 type (among others) since they are poison in libbpf_internal.h. Replace u32 with __u32 to fix the following error when building trace_helpers.c on powerpc: error: attempt to use poisoned "u32" Fixes: c698eaebdf47 ("selftests/bpf: trace_helpers.c: Optimize kallsyms cache") Signed-off-by: Shung-Hsi Yu Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240202095559.12900-1-shung-hsi.yu@suse.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/trace_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 4faa898ff7fc..27fd7ed3e4b0 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -271,7 +271,7 @@ ssize_t get_uprobe_offset(const void *addr) * addi r2,r2,XXXX */ { - const u32 *insn = (const u32 *)(uintptr_t)addr; + const __u32 *insn = (const __u32 *)(uintptr_t)addr; if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || ((*insn & OP_RT_RA_MASK) == LIS_R2)) && From 8f13c34087d3eb64329529b8517e5a6251653176 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 2 Feb 2024 11:05:27 -0800 Subject: [PATCH 050/119] bpf: handle trusted PTR_TO_BTF_ID_OR_NULL in argument check logic Add PTR_TRUSTED | PTR_MAYBE_NULL modifiers for PTR_TO_BTF_ID to check_reg_type() to support passing trusted nullable PTR_TO_BTF_ID registers into global functions accepting `__arg_trusted __arg_nullable` arguments. This hasn't been caught earlier because tests were either passing known non-NULL PTR_TO_BTF_ID registers or known NULL (SCALAR) registers. When utilizing this functionality in complicated real-world BPF application that passes around PTR_TO_BTF_ID_OR_NULL, it became apparent that verifier rejects valid case because check_reg_type() doesn't handle this case explicitly. Existing check_reg_type() logic is already anticipating this combination, so we just need to explicitly list this combo in the switch statement. 
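A minimal sketch of the shape this change accepts (names are illustrative; assumes vmlinux.h, bpf_helpers.h, and the selftests' bpf_experimental.h for the __arg_* tags — the selftests extended in the next patch exercise the same idea):

	struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
	void bpf_task_release(struct task_struct *p) __ksym;

	/* Global subprog taking a trusted, possibly-NULL task pointer. */
	__weak int probe_task(struct task_struct *task __arg_trusted __arg_nullable)
	{
		/* Arg is tracked as trusted_ptr_or_null_task_struct, so it
		 * must be NULL-checked before dereferencing.
		 */
		return task ? task->pid : -1;
	}

	SEC("tp_btf/task_newtask")
	int pass_maybe_null(void *ctx)
	{
		/* Acquired pointer is trusted but maybe NULL:
		 * PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL.
		 */
		struct task_struct *t = bpf_task_acquire(bpf_get_current_task_btf());
		int res = probe_task(t);

		if (t)
			bpf_task_release(t);
		return res;
	}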
Fixes: e2b3c4ff5d18 ("bpf: add __arg_trusted global func arg tag")
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20240202190529.2374377-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a0b8e400b3df..64fa188d00ad 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8242,6 +8242,7 @@ found:
 	switch ((int)reg->type) {
 	case PTR_TO_BTF_ID:
 	case PTR_TO_BTF_ID | PTR_TRUSTED:
+	case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
 	case PTR_TO_BTF_ID | MEM_RCU:
 	case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
 	case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:

From e2e70535dd76c6f17bdc9009ffca3d26cfd35ea4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Fri, 2 Feb 2024 11:05:28 -0800
Subject: [PATCH 051/119] selftests/bpf: add more cases for __arg_trusted
 __arg_nullable args

Add an extra layer of global functions to ensure that passing around
(trusted) PTR_TO_BTF_ID_OR_NULL registers works as expected.

We also extend the trusted_task_arg_nullable subtest to check three
possible valid arguments: known NULL, known non-NULL, and maybe NULL
cases.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20240202190529.2374377-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 .../bpf/progs/verifier_global_ptr_args.c      | 32 +++++++++++++++++--
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
index 484d6262363f..4ab0ef18d7eb 100644
--- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
+++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
@@ -19,15 +19,41 @@ __weak int subprog_trusted_task_nullable(struct task_struct *task __arg_trusted
 	return task->pid + task->tgid;
 }

-SEC("?kprobe")
+__weak int subprog_trusted_task_nullable_extra_layer(struct task_struct *task __arg_trusted __arg_nullable)
+{
+	return subprog_trusted_task_nullable(task) + subprog_trusted_task_nullable(NULL);
+}
+
+SEC("?tp_btf/task_newtask")
 __success __log_level(2)
 __msg("Validating subprog_trusted_task_nullable() func#1...")
 __msg(": R1=trusted_ptr_or_null_task_struct(")
 int trusted_task_arg_nullable(void *ctx)
 {
-	struct task_struct *t = bpf_get_current_task_btf();
+	struct task_struct *t1 = bpf_get_current_task_btf();
+	struct task_struct *t2 = bpf_task_acquire(t1);
+	int res = 0;

-	return subprog_trusted_task_nullable(t) + subprog_trusted_task_nullable(NULL);
+	/* known NULL */
+	res += subprog_trusted_task_nullable(NULL);
+
+	/* known non-NULL */
+	res += subprog_trusted_task_nullable(t1);
+	res += subprog_trusted_task_nullable_extra_layer(t1);
+
+	/* unknown if NULL or not */
+	res += subprog_trusted_task_nullable(t2);
+	res += subprog_trusted_task_nullable_extra_layer(t2);
+
+	if (t2) {
+		/* known non-NULL after explicit NULL check, just in case */
+		res += subprog_trusted_task_nullable(t2);
+		res += subprog_trusted_task_nullable_extra_layer(t2);
+
+		bpf_task_release(t2);
+	}
+
+	return res;
 }

 __weak int subprog_trusted_task_nonnull(struct task_struct *task __arg_trusted)

From 1eb986746a67952df86eb2c50a36450ef103d01b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Fri, 2 Feb 2024 11:05:29 -0800
Subject: [PATCH 052/119] bpf: don't emit warnings intended for global subprogs
 for static subprogs

When btf_prepare_func_args() was generalized to handle both static and
global subprogs, a few warnings/errors that are meant
only for global subprog cases started to be emitted for static
subprogs, where they are sort of expected and irrelevant. Stop polluting
verifier logs with irrelevant scary-looking messages.

Fixes: e26080d0da87 ("bpf: prepare btf_prepare_func_args() for handling static subprogs")
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20240202190529.2374377-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/btf.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ef380e546952..f7725cb6e564 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7122,6 +7122,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	args = (const struct btf_param *)(t + 1);
 	nargs = btf_type_vlen(t);
 	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+		if (!is_global)
+			return -EINVAL;
 		bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
 			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
@@ -7131,6 +7133,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
 	if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
+		if (!is_global)
+			return -EINVAL;
 		bpf_log(log,
 			"Global function %s() doesn't return scalar. Only those are supported.\n",
 			tname);
@@ -7251,6 +7255,8 @@ skip_pointer:
 			sub->args[i].arg_type = ARG_ANYTHING;
 			continue;
 		}
+		if (!is_global)
+			return -EINVAL;
 		bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
 			i, btf_type_str(t), tname);
 		return -EINVAL;

From df9705eaa0bad034dad0f73386ff82f5c4dd7e24 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Fri, 2 Feb 2024 21:51:19 -0800
Subject: [PATCH 053/119] bpf: Remove an unnecessary check.

The "i" here is always equal to "btf_type_vlen(t)" since the
"for_each_member()" loop never breaks.
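To spell out the invariant, here is a reduced illustration (plain C,
not the kernel code; n stands in for btf_type_vlen(t)):

  unsigned int i, n = 4;

  for (i = 0; i < n; i++)
  	;			/* the loop body contains no break */

  /* here i == n holds unconditionally, so "if (i == n)" is dead code */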
Signed-off-by: Kui-Feng Lee
Acked-by: Yonghong Song
Link: https://lore.kernel.org/r/20240203055119.2235598-1-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau
---
 kernel/bpf/bpf_struct_ops.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 0decd862dfe0..f98f580de77a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -189,20 +189,17 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 		}
 	}

-	if (i == btf_type_vlen(t)) {
-		if (st_ops->init(btf)) {
-			pr_warn("Error in init bpf_struct_ops %s\n",
-				st_ops->name);
-			return -EINVAL;
-		} else {
-			st_ops_desc->type_id = type_id;
-			st_ops_desc->type = t;
-			st_ops_desc->value_id = value_id;
-			st_ops_desc->value_type = btf_type_by_id(btf,
-								 value_id);
-		}
+	if (st_ops->init(btf)) {
+		pr_warn("Error in init bpf_struct_ops %s\n",
+			st_ops->name);
+		return -EINVAL;
 	}

+	st_ops_desc->type_id = type_id;
+	st_ops_desc->type = t;
+	st_ops_desc->value_id = value_id;
+	st_ops_desc->value_type = btf_type_by_id(btf, value_id);
+
 	return 0;
 }

From 7e428638bd784fd9e8944bfbf11513520e141b91 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Sun, 4 Feb 2024 11:44:52 -0800
Subject: [PATCH 054/119] selftests/bpf: Fix flaky test ptr_untrusted

Somehow recently I frequently hit the following test failure with
either ./test_progs or ./test_progs-cpuv4:

  serial_test_ptr_untrusted:PASS:skel_open 0 nsec
  serial_test_ptr_untrusted:PASS:lsm_attach 0 nsec
  serial_test_ptr_untrusted:PASS:raw_tp_attach 0 nsec
  serial_test_ptr_untrusted:FAIL:cmp_tp_name unexpected cmp_tp_name: actual -115 != expected 0
  #182     ptr_untrusted:FAIL

Further investigation found the failure is due to
bpf_probe_read_user_str(), where reading the user-level string
attr->raw_tracepoint.name does not succeed, most likely because the
string itself is still on disk and not yet populated into memory.

One solution is to do a printf() call on the string before doing the
bpf syscall, which would force raw_tracepoint.name into memory. But a
more robust solution is to use bpf_copy_from_user(), which is usable in
sleepable programs and can tolerate page faults, so the fix here takes
the latter approach.

Signed-off-by: Yonghong Song
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20240204194452.2785936-1-yonghong.song@linux.dev
---
 tools/testing/selftests/bpf/progs/test_ptr_untrusted.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c b/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c
index 4bdd65b5aa2d..2fdc44e76624 100644
--- a/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c
+++ b/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c
@@ -6,13 +6,13 @@

 char tp_name[128];

-SEC("lsm/bpf")
+SEC("lsm.s/bpf")
 int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size)
 {
 	switch (cmd) {
 	case BPF_RAW_TRACEPOINT_OPEN:
-		bpf_probe_read_user_str(tp_name, sizeof(tp_name) - 1,
-					(void *)attr->raw_tracepoint.name);
+		bpf_copy_from_user(tp_name, sizeof(tp_name) - 1,
+				   (void *)attr->raw_tracepoint.name);
 		break;
 	default:
 		break;

From 169e650069647325496e5a65408ff301874c8e01 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Sat, 3 Feb 2024 22:12:04 -0800
Subject: [PATCH 055/119] selftests/bpf: Suppress warning message of an unused
 variable.

"r" is used to receive the return value of test_2 in bpf_testmod.c, but
it is not actually used.
So, we remove "r" and change the return type to "void". Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401300557.z5vzn8FM-lkp@intel.com/ Signed-off-by: Kui-Feng Lee Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240204061204.1864529-1-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 6 ++---- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h | 2 +- tools/testing/selftests/bpf/progs/struct_ops_module.c | 3 +-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 4754c662b39f..a06daebc75c9 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -554,9 +554,8 @@ static const struct bpf_verifier_ops bpf_testmod_verifier_ops = { static int bpf_dummy_reg(void *kdata) { struct bpf_testmod_ops *ops = kdata; - int r; - r = ops->test_2(4, 3); + ops->test_2(4, 3); return 0; } @@ -570,9 +569,8 @@ static int bpf_testmod_test_1(void) return 0; } -static int bpf_testmod_test_2(int a, int b) +static void bpf_testmod_test_2(int a, int b) { - return 0; } static struct bpf_testmod_ops __bpf_testmod_ops = { diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index ca5435751c79..537beca42896 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -30,7 +30,7 @@ struct bpf_iter_testmod_seq { struct bpf_testmod_ops { int (*test_1)(void); - int (*test_2)(int a, int b); + void (*test_2)(int a, int b); }; #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index e44ac55195ca..b78746b3cef3 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -16,10 +16,9 @@ int BPF_PROG(test_1) } SEC("struct_ops/test_2") -int BPF_PROG(test_2, int a, int b) +void BPF_PROG(test_2, int a, int b) { test_2_result = a + b; - return a + b; } SEC(".struct_ops.link") From e7f31873176a345d72ca77c7b4da48493ccd9efd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 4 Feb 2024 21:29:14 -0800 Subject: [PATCH 056/119] selftests/bpf: Fix flaky selftest lwt_redirect/lwt_reroute Recently, when running './test_progs -j', I occasionally hit the following errors: test_lwt_redirect:PASS:pthread_create 0 nsec test_lwt_redirect_run:FAIL:netns_create unexpected error: 256 (errno 0) #142/2 lwt_redirect/lwt_redirect_normal_nomac:FAIL #142 lwt_redirect:FAIL test_lwt_reroute:PASS:pthread_create 0 nsec test_lwt_reroute_run:FAIL:netns_create unexpected error: 256 (errno 0) test_lwt_reroute:PASS:pthread_join 0 nsec #143/2 lwt_reroute/lwt_reroute_qdisc_dropped:FAIL #143 lwt_reroute:FAIL The netns_create() definition looks like below: #define NETNS "ns_lwt" static inline int netns_create(void) { return system("ip netns add " NETNS); } One possibility is that both lwt_redirect and lwt_reroute create netns with the same name "ns_lwt" which may cause conflict. I tried the following example: $ sudo ip netns add abc $ echo $? 0 $ sudo ip netns add abc Cannot create namespace file "/var/run/netns/abc": File exists $ echo $? 1 $ The return code for above netns_create() is 256. 
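For reference, here is a small userspace sketch (not part of the patch)
showing why a child exit code of 1 surfaces as 256: system() returns
the raw wait status, in which the exit code sits in the high byte:

  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/wait.h>

  int main(void)
  {
  	int ret = system("exit 1");	/* child exits with code 1 */

  	printf("raw status: %d\n", ret);	/* prints 256, i.e. 1 << 8 */
  	if (WIFEXITED(ret))
  		printf("exit code: %d\n", WEXITSTATUS(ret));	/* prints 1 */
  	return 0;
  }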
The internet search suggests that the return value for 'ip netns add ns_lwt' is 1, which matches the above 'sudo ip netns add abc' example. This patch tried to use different netns names for two tests to avoid 'ip netns add ' failure. I ran './test_progs -j' 10 times and all succeeded with lwt_redirect/lwt_reroute tests. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Tested-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240205052914.1742687-1-yonghong.song@linux.dev --- tools/testing/selftests/bpf/prog_tests/lwt_helpers.h | 2 -- tools/testing/selftests/bpf/prog_tests/lwt_redirect.c | 1 + tools/testing/selftests/bpf/prog_tests/lwt_reroute.c | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_helpers.h b/tools/testing/selftests/bpf/prog_tests/lwt_helpers.h index e9190574e79f..fb1eb8c67361 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/lwt_helpers.h @@ -27,8 +27,6 @@ } \ }) -#define NETNS "ns_lwt" - static inline int netns_create(void) { return system("ip netns add " NETNS); diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c index b5b9e74b1044..835a1d756c16 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c @@ -54,6 +54,7 @@ #include #include +#define NETNS "ns_lwt_redirect" #include "lwt_helpers.h" #include "test_progs.h" #include "network_helpers.h" diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c index 5610bc76928d..03825d2b45a8 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c @@ -48,6 +48,7 @@ * For case 2, force UDP packets to overflow fq limit. As long as kernel * is not crashed, it is considered successful. */ +#define NETNS "ns_lwt_reroute" #include "lwt_helpers.h" #include "network_helpers.h" #include From 2d9a925d0fbf0dae99af148adaf4f5cadf1be5e0 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Fri, 2 Feb 2024 14:11:10 -0800 Subject: [PATCH 057/119] bpf, docs: Expand set of initial conformance groups This patch attempts to update the ISA specification according to the latest mailing list discussion about conformance groups, in a way that is intended to be consistent with IANA registry processes and IETF 118 WG meeting discussion. It does the following: * Split basic into base32 and base64 for 32-bit vs 64-bit base instructions * Split division/multiplication/modulo instructions out of base groups * Split atomic instructions out of base groups There may be additional changes as discussion continues, but there seems to be consensus on the principles above. 
v1->v2: fixed typo pointed out by David Vernet v2->v3: Moved multiplication to same groups as division/modulo Signed-off-by: Dave Thaler Acked-by: David Vernet Link: https://lore.kernel.org/r/20240202221110.3872-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/standardization/instruction-set.rst | 48 ++++++++++++++----- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index dcbc9193c66f..1c4258f1ce93 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -102,7 +102,7 @@ Conformance groups An implementation does not need to support all instructions specified in this document (e.g., deprecated instructions). Instead, a number of conformance -groups are specified. An implementation must support the "basic" conformance +groups are specified. An implementation must support the base32 conformance group and may support additional conformance groups, where supporting a conformance group means it must support all instructions in that conformance group. @@ -112,12 +112,21 @@ that executes instructions, and tools as such compilers that generate instructions for the runtime. Thus, capability discovery in terms of conformance groups might be done manually by users or automatically by tools. -Each conformance group has a short ASCII label (e.g., "basic") that +Each conformance group has a short ASCII label (e.g., "base32") that corresponds to a set of instructions that are mandatory. That is, each instruction has one or more conformance groups of which it is a member. -The "basic" conformance group includes all instructions defined in this -specification unless otherwise noted. +This document defines the following conformance groups: +* base32: includes all instructions defined in this + specification unless otherwise noted. +* base64: includes base32, plus instructions explicitly noted + as being in the base64 conformance group. +* atomic32: includes 32-bit atomic operation instructions (see `Atomic operations`_). +* atomic64: includes atomic32, plus 64-bit atomic operation instructions. +* divmul32: includes 32-bit division, multiplication, and modulo instructions. +* divmul64: includes divmul32, plus 64-bit division, multiplication, + and modulo instructions. +* legacy: deprecated packet access instructions. Instruction encoding ==================== @@ -234,7 +243,8 @@ Arithmetic instructions ----------------------- ``BPF_ALU`` uses 32-bit wide operands while ``BPF_ALU64`` uses 64-bit wide operands for -otherwise identical operations. +otherwise identical operations. ``BPF_ALU64`` instructions belong to the +base64 conformance group unless noted otherwise. The 'code' field encodes the operation as below, where 'src' and 'dst' refer to the values of the source and destination registers, respectively. @@ -288,6 +298,10 @@ where '(u32)' indicates that the upper 32 bits are zeroed. Note that most instructions have instruction offset of 0. Only three instructions (``BPF_SDIV``, ``BPF_SMOD``, ``BPF_MOVSX``) have a non-zero offset. +Division, multiplication, and modulo operations for ``BPF_ALU`` are part +of the "divmul32" conformance group, and division, multiplication, and +modulo operations for ``BPF_ALU64`` are part of the "divmul64" conformance +group. The division and modulo operations support both unsigned and signed flavors. 
For unsigned operations (``BPF_DIV`` and ``BPF_MOD``), for ``BPF_ALU``, @@ -344,7 +358,9 @@ BPF_ALU64 Reserved 0x00 do byte swap unconditionally ========= ========= ===== ================================================= The 'imm' field encodes the width of the swap operations. The following widths -are supported: 16, 32 and 64. +are supported: 16, 32 and 64. Width 64 operations belong to the base64 +conformance group and other swap operations belong to the base32 +conformance group. Examples: @@ -369,8 +385,10 @@ Examples: Jump instructions ----------------- -``BPF_JMP32`` uses 32-bit wide operands while ``BPF_JMP`` uses 64-bit wide operands for -otherwise identical operations. +``BPF_JMP32`` uses 32-bit wide operands and indicates the base32 +conformance group, while ``BPF_JMP`` uses 64-bit wide operands for +otherwise identical operations, and indicates the base64 conformance +group unless otherwise specified. The 'code' field encodes the operation as below: ======== ===== === =============================== ============================================= @@ -419,6 +437,9 @@ specified by the 'imm' field. A > 16-bit conditional jump may be converted to a < 16-bit conditional jump plus a 32-bit unconditional jump. +All ``BPF_CALL`` and ``BPF_JA`` instructions belong to the +base32 conformance group. + Helper functions ~~~~~~~~~~~~~~~~ @@ -476,6 +497,8 @@ The size modifier is one of: BPF_DW 0x18 double word (8 bytes) ============= ===== ===================== +Instructions using ``BPF_DW`` belong to the base64 conformance group. + Regular load and store operations --------------------------------- @@ -520,8 +543,10 @@ by other BPF programs or means outside of this specification. All atomic operations supported by BPF are encoded as store operations that use the ``BPF_ATOMIC`` mode modifier as follows: -* ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations -* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations +* ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations, which are + part of the "atomic32" conformance group. +* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations, which are + part of the "atomic64" conformance group. * 8-bit and 16-bit wide atomic operations are not supported. The 'imm' field is used to encode the actual atomic operation. @@ -634,5 +659,4 @@ carried over from classic BPF. These instructions used an instruction class of BPF_LD, a size modifier of BPF_W, BPF_H, or BPF_B, and a mode modifier of BPF_ABS or BPF_IND. However, these instructions are deprecated and should no longer be used. All legacy packet access -instructions belong to the "legacy" conformance group instead of the "basic" -conformance group. +instructions belong to the "legacy" conformance group. From a44b1334aadd82203f661adb9adb41e53ad0e8d1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Feb 2024 22:23:48 +0000 Subject: [PATCH 058/119] bpf: Allow calling static subprogs while holding a bpf_spin_lock Currently, calling any helpers, kfuncs, or subprogs except the graph data structure (lists, rbtrees) API kfuncs while holding a bpf_spin_lock is not allowed. One of the original motivations of this decision was to force the BPF programmer's hand into keeping the bpf_spin_lock critical section small, and to ensure the execution time of the program does not increase due to lock waiting times. In addition to this, some of the helpers and kfuncs may be unsafe to call while holding a bpf_spin_lock. 
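As a sketch of the kind of program affected by this restriction
(hypothetical types and fragment, not code from this series):

  struct elem {
  	struct bpf_spin_lock lock;
  	int cnt;
  };

  static __noinline int peek_cnt(struct elem *e)
  {
  	return e->cnt;		/* the verifier explores this body */
  }

  /* ... with e pointing at a map value containing the lock: */
  bpf_spin_lock(&e->lock);
  cnt = peek_cnt(e);		/* static subprog call in the critical section */
  bpf_spin_unlock(&e->lock);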
However, when it comes to subprog calls, at least for static subprogs,
the verifier is able to explore their instructions during verification.
Therefore, it is similar in effect to having the same code inlined into
the critical section. Hence, not allowing static subprog calls in the
bpf_spin_lock critical section is mostly an annoyance that needs to be
worked around, without providing any tangible benefit.

Unlike static subprog calls, global subprog calls are not safe to
permit within the critical section, as the verifier does not explore
them during verification; therefore, whether the same lock will be
taken again, or unlocked, cannot be ascertained.

Therefore, allow calling static subprogs within a bpf_spin_lock
critical section, and only reject it in case the subprog linkage is
global.

Acked-by: Yonghong Song
Acked-by: David Vernet
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20240204222349.938118-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c                                 | 11 ++++++++---
 .../testing/selftests/bpf/progs/verifier_spin_lock.c  |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 64fa188d00ad..7d38b2343ad4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9493,6 +9493,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	if (subprog_is_global(env, subprog)) {
 		const char *sub_name = subprog_name(env, subprog);

+		/* Only global subprogs cannot be called with a lock held. */
+		if (env->cur_state->active_lock.ptr) {
+			verbose(env, "global function calls are not allowed while holding a lock,\n"
+				     "use static function instead\n");
+			return -EINVAL;
+		}
+
 		if (err) {
 			verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
 				subprog, sub_name);
@@ -17644,7 +17651,6 @@ static int do_check(struct bpf_verifier_env *env)
 				if (env->cur_state->active_lock.ptr) {
 					if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
-					    (insn->src_reg == BPF_PSEUDO_CALL) ||
 					    (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
 					    (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
 						verbose(env, "function calls are not allowed while holding a lock\n");
@@ -17692,8 +17698,7 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 process_bpf_exit_full:
-				if (env->cur_state->active_lock.ptr &&
-				    !in_rbtree_lock_required_cb(env)) {
+				if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) {
 					verbose(env, "bpf_spin_unlock is missing\n");
 					return -EINVAL;
 				}

diff --git a/tools/testing/selftests/bpf/progs/verifier_spin_lock.c b/tools/testing/selftests/bpf/progs/verifier_spin_lock.c
index 9c1aa69650f8..fb316c080c84 100644
--- a/tools/testing/selftests/bpf/progs/verifier_spin_lock.c
+++ b/tools/testing/selftests/bpf/progs/verifier_spin_lock.c
@@ -330,7 +330,7 @@ l1_%=:	r7 = r0;

 SEC("cgroup/skb")
 __description("spin_lock: test10 lock in subprog without unlock")
-__failure __msg("unlock is missing")
+__success __failure_unpriv __msg_unpriv("")
 __naked void lock_in_subprog_without_unlock(void)
 {

From e8699c4ff85baedcf40f33db816cc487cee39397 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Sun, 4 Feb 2024 22:23:49 +0000
Subject: [PATCH 059/119] selftests/bpf: Add test for static subprog call in
 lock cs

Add selftests for static subprog calls within bpf_spin_lock critical
section, and ensure we still reject global subprog calls.
Also test the case where a subprog call will unlock the caller's held lock, or the caller will unlock a lock taken by a subprog call, ensuring correct transfer of lock state across frames on exit. Acked-by: Yonghong Song Acked-by: David Vernet Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240204222349.938118-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/spin_lock.c | 2 + .../selftests/bpf/progs/test_spin_lock.c | 65 +++++++++++++++++++ .../selftests/bpf/progs/test_spin_lock_fail.c | 44 +++++++++++++ 3 files changed, 111 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index 18d451be57c8..2b0068742ef9 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -48,6 +48,8 @@ static struct { { "lock_id_mismatch_innermapval_kptr", "bpf_spin_unlock of different lock" }, { "lock_id_mismatch_innermapval_global", "bpf_spin_unlock of different lock" }, { "lock_id_mismatch_innermapval_mapval", "bpf_spin_unlock of different lock" }, + { "lock_global_subprog_call1", "global function calls are not allowed while holding a lock" }, + { "lock_global_subprog_call2", "global function calls are not allowed while holding a lock" }, }; static int match_regex(const char *pattern, const char *string) diff --git a/tools/testing/selftests/bpf/progs/test_spin_lock.c b/tools/testing/selftests/bpf/progs/test_spin_lock.c index b2440a0ff422..d8d77bdffd3d 100644 --- a/tools/testing/selftests/bpf/progs/test_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/test_spin_lock.c @@ -101,4 +101,69 @@ int bpf_spin_lock_test(struct __sk_buff *skb) err: return err; } + +struct bpf_spin_lock lockA __hidden SEC(".data.A"); + +__noinline +static int static_subprog(struct __sk_buff *ctx) +{ + volatile int ret = 0; + + if (ctx->protocol) + return ret; + return ret + ctx->len; +} + +__noinline +static int static_subprog_lock(struct __sk_buff *ctx) +{ + volatile int ret = 0; + + ret = static_subprog(ctx); + bpf_spin_lock(&lockA); + return ret + ctx->len; +} + +__noinline +static int static_subprog_unlock(struct __sk_buff *ctx) +{ + volatile int ret = 0; + + ret = static_subprog(ctx); + bpf_spin_unlock(&lockA); + return ret + ctx->len; +} + +SEC("tc") +int lock_static_subprog_call(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + if (ctx->mark == 42) + ret = static_subprog(ctx); + bpf_spin_unlock(&lockA); + return ret; +} + +SEC("tc") +int lock_static_subprog_lock(struct __sk_buff *ctx) +{ + int ret = 0; + + ret = static_subprog_lock(ctx); + bpf_spin_unlock(&lockA); + return ret; +} + +SEC("tc") +int lock_static_subprog_unlock(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + ret = static_subprog_unlock(ctx); + return ret; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c index 86cd183ef6dc..43f40c4fe241 100644 --- a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c +++ b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c @@ -201,4 +201,48 @@ CHECK(innermapval_mapval, &iv->lock, &v->lock); #undef CHECK +__noinline +int global_subprog(struct __sk_buff *ctx) +{ + volatile int ret = 0; + + if (ctx->protocol) + ret += ctx->protocol; + return ret + ctx->mark; +} + +__noinline +static int static_subprog_call_global(struct __sk_buff *ctx) +{ + volatile int ret = 
0; + + if (ctx->protocol) + return ret; + return ret + ctx->len + global_subprog(ctx); +} + +SEC("?tc") +int lock_global_subprog_call1(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + if (ctx->mark == 42) + ret = global_subprog(ctx); + bpf_spin_unlock(&lockA); + return ret; +} + +SEC("?tc") +int lock_global_subprog_call2(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + if (ctx->mark == 42) + ret = static_subprog_call_global(ctx); + bpf_spin_unlock(&lockA); + return ret; +} + char _license[] SEC("license") = "GPL"; From 6fceea0fa59f6786a2847a4cae409117624e8b58 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 5 Feb 2024 05:56:45 +0000 Subject: [PATCH 060/119] bpf: Transfer RCU lock state between subprog calls Allow transferring an imbalanced RCU lock state between subprog calls during verification. This allows patterns where a subprog call returns with an RCU lock held, or a subprog call releases an RCU lock held by the caller. Currently, the verifier would end up complaining if the RCU lock is not released when processing an exit from a subprog, which is non-ideal if its execution is supposed to be enclosed in an RCU read section of the caller. Instead, simply only check whether we are processing exit for frame#0 and do not complain on an active RCU lock otherwise. We only need to update the check when processing BPF_EXIT insn, as copy_verifier_state is already set up to do the right thing. Suggested-by: David Vernet Tested-by: Yafang Shao Acked-by: Yonghong Song Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: David Vernet Link: https://lore.kernel.org/r/20240205055646.1112186-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d38b2343ad4..ddaf09db1175 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17703,8 +17703,7 @@ process_bpf_exit_full: return -EINVAL; } - if (env->cur_state->active_rcu_lock && - !in_rbtree_lock_required_cb(env)) { + if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) { verbose(env, "bpf_rcu_read_unlock is missing\n"); return -EINVAL; } From 8be6a0147af314fd60db9da2158cd737dc6394a7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 5 Feb 2024 05:56:46 +0000 Subject: [PATCH 061/119] selftests/bpf: Add tests for RCU lock transfer between subprogs Add selftests covering the following cases: - A static or global subprog called from within a RCU read section works - A static subprog taking an RCU read lock which is released in caller works - A static subprog releasing the caller's RCU read lock works Global subprogs that leave the lock in an imbalanced state will not work, as they are verified separately, so ensure those cases fail as well. 
Acked-by: Yonghong Song Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: David Vernet Link: https://lore.kernel.org/r/20240205055646.1112186-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/rcu_read_lock.c | 6 + .../selftests/bpf/progs/rcu_read_lock.c | 120 ++++++++++++++++++ 2 files changed, 126 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c index 3f1f58d3a729..a1f7e7378a64 100644 --- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c @@ -29,6 +29,10 @@ static void test_success(void) bpf_program__set_autoload(skel->progs.non_sleepable_1, true); bpf_program__set_autoload(skel->progs.non_sleepable_2, true); bpf_program__set_autoload(skel->progs.task_trusted_non_rcuptr, true); + bpf_program__set_autoload(skel->progs.rcu_read_lock_subprog, true); + bpf_program__set_autoload(skel->progs.rcu_read_lock_global_subprog, true); + bpf_program__set_autoload(skel->progs.rcu_read_lock_subprog_lock, true); + bpf_program__set_autoload(skel->progs.rcu_read_lock_subprog_unlock, true); err = rcu_read_lock__load(skel); if (!ASSERT_OK(err, "skel_load")) goto out; @@ -75,6 +79,8 @@ static const char * const inproper_region_tests[] = { "inproper_sleepable_helper", "inproper_sleepable_kfunc", "nested_rcu_region", + "rcu_read_lock_global_subprog_lock", + "rcu_read_lock_global_subprog_unlock", }; static void test_inproper_region(void) diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index 14fb01437fb8..ab3a532b7dd6 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -319,3 +319,123 @@ int cross_rcu_region(void *ctx) bpf_rcu_read_unlock(); return 0; } + +__noinline +static int static_subprog(void *ctx) +{ + volatile int ret = 0; + + if (bpf_get_prandom_u32()) + return ret + 42; + return ret + bpf_get_prandom_u32(); +} + +__noinline +int global_subprog(u64 a) +{ + volatile int ret = a; + + return ret + static_subprog(NULL); +} + +__noinline +static int static_subprog_lock(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + if (bpf_get_prandom_u32()) + return ret + 42; + return ret + bpf_get_prandom_u32(); +} + +__noinline +int global_subprog_lock(u64 a) +{ + volatile int ret = a; + + return ret + static_subprog_lock(NULL); +} + +__noinline +static int static_subprog_unlock(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_unlock(); + if (bpf_get_prandom_u32()) + return ret + 42; + return ret + bpf_get_prandom_u32(); +} + +__noinline +int global_subprog_unlock(u64 a) +{ + volatile int ret = a; + + return ret + static_subprog_unlock(NULL); +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_subprog(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + if (bpf_get_prandom_u32()) + ret += static_subprog(ctx); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_global_subprog(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + if (bpf_get_prandom_u32()) + ret += global_subprog(ret); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_subprog_lock(void *ctx) +{ + volatile int ret = 0; + + ret += static_subprog_lock(ctx); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int 
rcu_read_lock_global_subprog_lock(void *ctx) +{ + volatile int ret = 0; + + ret += global_subprog_lock(ret); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_subprog_unlock(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + ret += static_subprog_unlock(ctx); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_global_subprog_unlock(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + ret += global_subprog_unlock(ret); + return 0; +} From 2863d665ea41282379f108e4da6c8a2366ba66db Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 5 Feb 2024 13:35:50 +0100 Subject: [PATCH 062/119] xsk: support redirect to any socket bound to the same umem Add support for directing a packet to any socket bound to the same umem. This makes it possible to use the XDP program to select what socket the packet should be received on. The user can populate the XSKMAP with various sockets and as long as they share the same umem, the XDP program can pick any one of them. Suggested-by: Yuval El-Hanany Signed-off-by: Magnus Karlsson Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240205123553.22180-2-magnus.karlsson@gmail.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1eadfac03cc4..a339e9a1b557 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -313,10 +313,13 @@ static bool xsk_is_bound(struct xdp_sock *xs) static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { + struct net_device *dev = xdp->rxq->dev; + u32 qid = xdp->rxq->queue_index; + if (!xsk_is_bound(xs)) return -ENXIO; - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + if (!dev->_rx[qid].pool || xs->umem != dev->_rx[qid].pool->umem) return -EINVAL; if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { From 968595a93669b6b4f6d1fcf80cf2d97956b6868f Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 5 Feb 2024 13:35:51 +0100 Subject: [PATCH 063/119] xsk: document ability to redirect to any socket bound to the same umem Document the ability to redirect to any socket bound to the same umem. Signed-off-by: Magnus Karlsson Link: https://lore.kernel.org/r/20240205123553.22180-3-magnus.karlsson@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/networking/af_xdp.rst | 31 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index dceeb0d763aa..72da7057e4cf 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -329,23 +329,24 @@ XDP_SHARED_UMEM option and provide the initial socket's fd in the sxdp_shared_umem_fd field as you registered the UMEM on that socket. These two sockets will now share one and the same UMEM. -There is no need to supply an XDP program like the one in the previous -case where sockets were bound to the same queue id and -device. Instead, use the NIC's packet steering capabilities to steer -the packets to the right queue. In the previous example, there is only -one queue shared among sockets, so the NIC cannot do this steering. It -can only steer between queues. +In this case, it is possible to use the NIC's packet steering +capabilities to steer the packets to the right queue. 
This is not +possible in the previous example as there is only one queue shared +among sockets, so the NIC cannot do this steering as it can only steer +between queues. -In libbpf, you need to use the xsk_socket__create_shared() API as it -takes a reference to a FILL ring and a COMPLETION ring that will be -created for you and bound to the shared UMEM. You can use this -function for all the sockets you create, or you can use it for the -second and following ones and use xsk_socket__create() for the first -one. Both methods yield the same result. +In libxdp (or libbpf prior to version 1.0), you need to use the +xsk_socket__create_shared() API as it takes a reference to a FILL ring +and a COMPLETION ring that will be created for you and bound to the +shared UMEM. You can use this function for all the sockets you create, +or you can use it for the second and following ones and use +xsk_socket__create() for the first one. Both methods yield the same +result. Note that a UMEM can be shared between sockets on the same queue id and device, as well as between queues on the same device and between -devices at the same time. +devices at the same time. It is also possible to redirect to any +socket as long as it is bound to the same umem with XDP_SHARED_UMEM. XDP_USE_NEED_WAKEUP bind flag ----------------------------- @@ -822,6 +823,10 @@ A: The short answer is no, that is not supported at the moment. The switch, or other distribution mechanism, in your NIC to direct traffic to the correct queue id and socket. + Note that if you are using the XDP_SHARED_UMEM option, it is + possible to switch traffic between any socket bound to the same + umem. + Q: My packets are sometimes corrupted. What is wrong? A: Care has to be taken not to feed the same buffer in the UMEM into From d7bc416aa5cc183691287e8f0b1d5b182a7ce9c3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 5 Feb 2024 16:22:43 -0800 Subject: [PATCH 064/119] libbpf: fix return value for PERF_EVENT __arg_ctx type fix up check If PERF_EVENT program has __arg_ctx argument with matching architecture-specific pt_regs/user_pt_regs/user_regs_struct pointer type, libbpf should still perform type rewrite for old kernels, but not emit the warning. Fix copy/paste from kernel code where 0 is meant to signify "no error" condition. For libbpf we need to return "true" to proceed with type rewrite (which for PERF_EVENT program will be a canonical `struct bpf_perf_event_data *` type). 
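The bug class, reduced to its essence (illustrative snippet, not the
libbpf code; MATCHING_TYPE is a made-up constant):

  #include <stdbool.h>

  #define MATCHING_TYPE 1

  static bool need_fixup(int prog_type)
  {
  	if (prog_type == MATCHING_TYPE)
  		return 0;	/* copy/paste from int-returning kernel code:
  				 * 0 meant "no error", but as a bool it is
  				 * false, so the caller skips the rewrite */
  	return false;		/* genuinely no fixup needed */
  }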
Fixes: 9eea8fafe33e ("libbpf: fix __arg_ctx type enforcement for perf_event programs")
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20240206002243.1439450-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 tools/lib/bpf/libbpf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6932f2c4ddfd..01f407591a92 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -6404,13 +6404,13 @@ static bool need_func_arg_type_fixup(const struct btf *btf, const struct bpf_pro
 	case BPF_PROG_TYPE_PERF_EVENT:
 		if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) &&
 		    btf_is_struct(t) && strcmp(tname, "pt_regs") == 0)
-			return 0;
+			return true;
 		if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) &&
 		    btf_is_struct(t) && strcmp(tname, "user_pt_regs") == 0)
-			return 0;
+			return true;
 		if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) &&
 		    btf_is_struct(t) && strcmp(tname, "user_regs_struct") == 0)
-			return 0;
+			return true;
 		break;
 	case BPF_PROG_TYPE_RAW_TRACEPOINT:
 	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:

From c7dcb6c9aa85fa310251dad7e233eb955a5235ed Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 5 Feb 2024 16:40:08 -0800
Subject: [PATCH 065/119] selftests/bpf: mark dynptr kfuncs __weak to make them
 optional on old kernels

Mark dynptr kfuncs as __weak to allow
verifier_global_subprogs/arg_ctx_{perf,kprobe,raw_tp} subtests to be
loadable on old kernels.

Because the bpf_dynptr_from_xdp() kfunc is used from the arg_tag_dynptr
BPF program in progs/verifier_global_subprogs.c *and* is not marked as
__weak, loading any subtest from verifier_global_subprogs fails on old
kernels that don't have the bpf_dynptr_from_xdp() kfunc defined. Even if
the arg_tag_dynptr program itself is not loaded, libbpf bails out on the
non-weak reference to bpf_dynptr_from_xdp (which can't be resolved), and
that reference is shared across all programs in
progs/verifier_global_subprogs.c.

So mark all dynptr-related kfuncs as __weak to unblock libbpf CI ([0]).
In the upcoming "kfunc in vmlinux.h" work we should make sure that
kfuncs are always declared __weak as well.
[0] https://github.com/libbpf/libbpf/actions/runs/7792673215/job/21251250831?pr=776#step:4:7961 Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240206004008.1541513-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_kfuncs.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 0cd4111f954c..14ebe7d9e1a3 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -9,7 +9,7 @@ struct bpf_sock_addr_kern; * Error code */ extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, - struct bpf_dynptr *ptr__uninit) __ksym; + struct bpf_dynptr *ptr__uninit) __ksym __weak; /* Description * Initializes an xdp-type dynptr @@ -17,7 +17,7 @@ extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, * Error code */ extern int bpf_dynptr_from_xdp(struct xdp_md *xdp, __u64 flags, - struct bpf_dynptr *ptr__uninit) __ksym; + struct bpf_dynptr *ptr__uninit) __ksym __weak; /* Description * Obtain a read-only pointer to the dynptr's data @@ -26,7 +26,7 @@ extern int bpf_dynptr_from_xdp(struct xdp_md *xdp, __u64 flags, * buffer if unable to obtain a direct pointer */ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym; + void *buffer, __u32 buffer__szk) __ksym __weak; /* Description * Obtain a read-write pointer to the dynptr's data @@ -35,13 +35,13 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, * buffer if unable to obtain a direct pointer */ extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym; + void *buffer, __u32 buffer__szk) __ksym __weak; -extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym; -extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym; -extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym; -extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym; -extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym; +extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym __weak; +extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak; +extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak; +extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; +extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym __weak; /* Description * Modify the address of a AF_UNIX sockaddr. 
From 563918a0e3afd97bcfb680b72c52ec080c82aea6 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Mon, 5 Feb 2024 20:51:46 -0800 Subject: [PATCH 066/119] bpf, docs: Fix typos in instructions-set.rst * "imm32" should just be "imm" * Add blank line to fix formatting error reported by Stephen Rothwell [0] [0]: https://lore.kernel.org/bpf/20240206153301.4ead0bad@canb.auug.org.au/T/#u Signed-off-by: Dave Thaler Acked-by: David Vernet Link: https://lore.kernel.org/r/20240206045146.4965-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/standardization/instruction-set.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 1c4258f1ce93..bdfe0cd0e499 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -117,6 +117,7 @@ corresponds to a set of instructions that are mandatory. That is, each instruction has one or more conformance groups of which it is a member. This document defines the following conformance groups: + * base32: includes all instructions defined in this specification unless otherwise noted. * base64: includes base32, plus instructions explicitly noted @@ -289,11 +290,11 @@ where '(u32)' indicates that the upper 32 bits are zeroed. ``BPF_XOR | BPF_K | BPF_ALU`` means:: - dst = (u32) dst ^ (u32) imm32 + dst = (u32) dst ^ (u32) imm ``BPF_XOR | BPF_K | BPF_ALU64`` means:: - dst = dst ^ imm32 + dst = dst ^ imm Note that most instructions have instruction offset of 0. Only three instructions (``BPF_SDIV``, ``BPF_SMOD``, ``BPF_MOVSX``) have a non-zero offset. @@ -511,7 +512,7 @@ instructions that transfer data between a register and memory. ``BPF_MEM | | BPF_ST`` means:: - *(size *) (dst + offset) = imm32 + *(size *) (dst + offset) = imm ``BPF_MEM | | BPF_LDX`` means:: From c27aa462aa78ff157fdda222af242e4571803d4a Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 6 Feb 2024 11:23:30 +0100 Subject: [PATCH 067/119] bpf: Use -Wno-address-of-packed-member in some selftests [Differences from V2: - Remove conditionals in the source files pragmas, as the pragma is supported by both GCC and clang.] Both GCC and clang implement the -Wno-address-of-packed-member warning, which is enabled by -Wall, that warns about taking the address of a packed struct field when it can lead to an "unaligned" address. This triggers the following errors (-Werror) when building three particular BPF selftests with GCC: progs/test_cls_redirect.c 986 | if (ipv4_is_fragment((void *)&encap->ip)) { progs/test_cls_redirect_dynptr.c 410 | pkt_ipv4_checksum((void *)&encap_gre->ip); progs/test_cls_redirect.c 521 | pkt_ipv4_checksum((void *)&encap_gre->ip); progs/test_tc_tunnel.c 232 | set_ipv4_csum((void *)&h_outer.ip); These warnings do not signal any real problem in the tests as far as I can see. This patch adds pragmas to these test files that inhibit the -Waddress-of-packed-member warning. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. 
Marchesi
Signed-off-by: Andrii Nakryiko
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20240206102330.7113-1-jose.marchesi@oracle.com
---
 tools/testing/selftests/bpf/progs/test_cls_redirect.c        | 2 ++
 tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c | 2 ++
 tools/testing/selftests/bpf/progs/test_tc_tunnel.c           | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
index 66b304982245..bfc9179259d5 100644
--- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
@@ -22,6 +22,8 @@

 #include "test_cls_redirect.h"

+#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
+
 #ifdef SUBPROGS
 #define INLINING __noinline
 #else

diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c
index f41c81212ee9..da54c09e9a15 100644
--- a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c
@@ -23,6 +23,8 @@
 #include "test_cls_redirect.h"
 #include "bpf_kfuncs.h"

+#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
+
 #define offsetofend(TYPE, MEMBER) \
 	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
index e6e678aa9874..d8d7ab5e8e30 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
@@ -20,6 +20,8 @@
 #include
 #include

+#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
+
 static const int cfg_port = 8000;
 static const int cfg_udp_src = 20000;

From 92a871ab9fa59a74d013bc04f321026a057618e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?=
Date: Tue, 6 Feb 2024 13:59:22 +0100
Subject: [PATCH 068/119] libbpf: Use OPTS_SET() macro in bpf_xdp_query()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the feature_flags and xdp_zc_max_segs fields were added to the
libbpf bpf_xdp_query_opts, the code writing them did not use the
OPTS_SET() macro. This causes libbpf to write to those fields
unconditionally, which means that programs compiled against an older
version of libbpf (with a smaller size of the bpf_xdp_query_opts
struct) will have their stack corrupted by libbpf writing out of
bounds.

The patch adding the feature_flags field has an early bail out if the
feature_flags field is not part of the opts struct (via the OPTS_HAS
macro), but the patch adding xdp_zc_max_segs does not. For consistency,
this fix just changes the assignments to both fields to use the
OPTS_SET() macro.
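A reduced sketch of the sizing scheme behind those macros (hypothetical
struct and macro names; the real definitions live inside libbpf):

  #include <stddef.h>

  struct query_opts {
  	size_t sz;			/* caller sets sizeof(its struct) */
  	unsigned int feature_flags;
  	unsigned int xdp_zc_max_segs;	/* newer field, absent in old callers */
  };

  /* does the caller's (possibly older) struct contain field f? */
  #define HAS_FIELD(opts, f) \
  	((opts)->sz >= offsetof(struct query_opts, f) + sizeof((opts)->f))

  /* write f only when the caller's struct is big enough to hold it */
  #define SET_FIELD(opts, f, v) \
  	do { if (HAS_FIELD(opts, f)) (opts)->f = (v); } while (0)

An unconditional assignment to xdp_zc_max_segs would land past the end
of an old caller's struct, which is exactly the out-of-bounds stack
write described above.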
Fixes: 13ce2daa259a ("xsk: add new netlink attribute dedicated for ZC max frags")
Signed-off-by: Toke Høiland-Jørgensen
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20240206125922.1992815-1-toke@redhat.com
---
 tools/lib/bpf/netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index 090bcf6e3b3d..68a2def17175 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -496,8 +496,8 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts)
 	if (err)
 		return libbpf_err(err);

-	opts->feature_flags = md.flags;
-	opts->xdp_zc_max_segs = md.xdp_zc_max_segs;
+	OPTS_SET(opts, feature_flags, md.flags);
+	OPTS_SET(opts, xdp_zc_max_segs, md.xdp_zc_max_segs);

 skip_feature_flags:
 	return 0;

From 9707ac4fe2f5bac6406d2403f8b8a64d7b3d8e43 Mon Sep 17 00:00:00 2001
From: Viktor Malik
Date: Tue, 6 Feb 2024 13:46:09 +0100
Subject: [PATCH 069/119] tools/resolve_btfids: Refactor set sorting with types
 from btf_ids.h

Instead of using magic offsets to access BTF ID set data, leverage
types from btf_ids.h (btf_id_set and btf_id_set8) which define the
actual layout of the data. Thanks to this change, set sorting should
also continue working if the layout changes.

This requires syncing the definition of 'struct btf_id_set8' from
include/linux/btf_ids.h to tools/include/linux/btf_ids.h. We don't
sync the rest of the file at the moment, because that would also
require syncing multiple dependent headers and we don't need any other
defs from btf_ids.h.

Signed-off-by: Viktor Malik
Signed-off-by: Andrii Nakryiko
Acked-by: Daniel Xu
Link: https://lore.kernel.org/bpf/ff7f062ddf6a00815fda3087957c4ce667f50532.1707223196.git.vmalik@redhat.com
---
 tools/bpf/resolve_btfids/main.c | 35 ++++++++++++++++++++-------------
 tools/include/linux/btf_ids.h   |  9 +++++++++
 2 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 27a23196d58e..32634f00abba 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -70,6 +70,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -78,7 +79,7 @@
 #include

 #define BTF_IDS_SECTION	".BTF_ids"
-#define BTF_ID		"__BTF_ID__"
+#define BTF_ID_PREFIX	"__BTF_ID__"

 #define BTF_STRUCT "struct"
 #define BTF_UNION  "union"
@@ -161,7 +162,7 @@ static int eprintf(int level, int var, const char *fmt, ...)
static bool is_btf_id(const char *name) { - return name && !strncmp(name, BTF_ID, sizeof(BTF_ID) - 1); + return name && !strncmp(name, BTF_ID_PREFIX, sizeof(BTF_ID_PREFIX) - 1); } static struct btf_id *btf_id__find(struct rb_root *root, const char *name) @@ -441,7 +442,7 @@ static int symbols_collect(struct object *obj) * __BTF_ID__TYPE__vfs_truncate__0 * prefix = ^ */ - prefix = name + sizeof(BTF_ID) - 1; + prefix = name + sizeof(BTF_ID_PREFIX) - 1; /* struct */ if (!strncmp(prefix, BTF_STRUCT, sizeof(BTF_STRUCT) - 1)) { @@ -649,19 +650,18 @@ static int cmp_id(const void *pa, const void *pb) static int sets_patch(struct object *obj) { Elf_Data *data = obj->efile.idlist; - int *ptr = data->d_buf; struct rb_node *next; next = rb_first(&obj->sets); while (next) { - unsigned long addr, idx; + struct btf_id_set8 *set8; + struct btf_id_set *set; + unsigned long addr, off; struct btf_id *id; - int *base; - int cnt; id = rb_entry(next, struct btf_id, rb_node); addr = id->addr[0]; - idx = addr - obj->efile.idlist_addr; + off = addr - obj->efile.idlist_addr; /* sets are unique */ if (id->addr_cnt != 1) { @@ -670,14 +670,21 @@ static int sets_patch(struct object *obj) return -1; } - idx = idx / sizeof(int); - base = &ptr[idx] + (id->is_set8 ? 2 : 1); - cnt = ptr[idx]; + if (id->is_set) { + set = data->d_buf + off; + qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id); + } else { + set8 = data->d_buf + off; + /* + * Make sure id is at the beginning of the pairs + * struct, otherwise the below qsort would not work. + */ + BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); + qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); + } pr_debug("sorting addr %5lu: cnt %6d [%s]\n", - (idx + 1) * sizeof(int), cnt, id->name); - - qsort(base, cnt, id->is_set8 ? sizeof(uint64_t) : sizeof(int), cmp_id); + off, id->is_set ? set->cnt : set8->cnt, id->name); next = rb_next(next); } diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 2f882d5cb30f..72535f00572f 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -8,6 +8,15 @@ struct btf_id_set { u32 ids[]; }; +struct btf_id_set8 { + u32 cnt; + u32 flags; + struct { + u32 id; + u32 flags; + } pairs[]; +}; + #ifdef CONFIG_DEBUG_INFO_BTF #include /* for __PASTE */ From 903fad4394666bc23975c93fb58f137ce64b5192 Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Tue, 6 Feb 2024 13:46:10 +0100 Subject: [PATCH 070/119] tools/resolve_btfids: Fix cross-compilation to non-host endianness The .BTF_ids section is pre-filled with zeroed BTF ID entries during the build and afterwards patched by resolve_btfids with correct values. Since resolve_btfids always writes in host-native endianness, it relies on libelf to do the translation when the target ELF is cross-compiled to a different endianness (this was introduced in commit 61e8aeda9398 ("bpf: Fix libelf endian handling in resolv_btfids")). Unfortunately, the translation will corrupt the flags fields of SET8 entries because these were written during vmlinux compilation and are in the correct endianness already. This will lead to numerous selftests failures such as: $ sudo ./test_verifier 502 502 #502/p sleepable fentry accept FAIL Failed to load prog 'Invalid argument'! 
bpf_fentry_test1 is not sleepable verification time 34 usec stack depth 0 processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 Summary: 0 PASSED, 0 SKIPPED, 1 FAILED Since it's not possible to instruct libelf to translate just certain values, let's manually bswap the flags (both global and entry flags) in resolve_btfids when needed, so that libelf then translates everything correctly. Fixes: ef2c6f370a63 ("tools/resolve_btfids: Add support for 8-byte BTF sets") Signed-off-by: Viktor Malik Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/7b6bff690919555574ce0f13d2a5996cacf7bf69.1707223196.git.vmalik@redhat.com --- tools/bpf/resolve_btfids/main.c | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 32634f00abba..d9520cb826b3 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -90,6 +90,14 @@ #define ADDR_CNT 100 +#if __BYTE_ORDER == __LITTLE_ENDIAN +# define ELFDATANATIVE ELFDATA2LSB +#elif __BYTE_ORDER == __BIG_ENDIAN +# define ELFDATANATIVE ELFDATA2MSB +#else +# error "Unknown machine endianness!" +#endif + struct btf_id { struct rb_node rb_node; char *name; @@ -117,6 +125,7 @@ struct object { int idlist_shndx; size_t strtabidx; unsigned long idlist_addr; + int encoding; } efile; struct rb_root sets; @@ -320,6 +329,7 @@ static int elf_collect(struct object *obj) { Elf_Scn *scn = NULL; size_t shdrstrndx; + GElf_Ehdr ehdr; int idx = 0; Elf *elf; int fd; @@ -351,6 +361,13 @@ static int elf_collect(struct object *obj) return -1; } + if (gelf_getehdr(obj->efile.elf, &ehdr) == NULL) { + pr_err("FAILED cannot get ELF header: %s\n", + elf_errmsg(-1)); + return -1; + } + obj->efile.encoding = ehdr.e_ident[EI_DATA]; + /* * Scan all the elf sections and look for save data * from .BTF_ids section and symbols. @@ -681,6 +698,24 @@ static int sets_patch(struct object *obj) */ BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); + + /* + * When ELF endianness does not match endianness of the + * host, libelf will do the translation when updating + * the ELF. This, however, corrupts SET8 flags which are + * already in the target endianness. So, let's bswap + * them to the host endianness and libelf will then + * correctly translate everything. + */ + if (obj->efile.encoding != ELFDATANATIVE) { + int i; + + set8->flags = bswap_32(set8->flags); + for (i = 0; i < set8->cnt; i++) { + set8->pairs[i].flags = + bswap_32(set8->pairs[i].flags); + } + } } pr_debug("sorting addr %5lu: cnt %6d [%s]\n", From a2bff65cfca93f0fe4c5996f55ce8f413e85e4fe Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 6 Feb 2024 16:14:14 +0800 Subject: [PATCH 071/119] selftests/bpf: Fix error checking for cpumask_success__load() We should verify the return value of cpumask_success__load(). 
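For illustration, the corrected check follows the usual skeleton load
pattern (a sketch only; the actual change is in the hunk below):

	err = cpumask_success__load(skel);
	if (!ASSERT_OK(err, "cpumask_success__load"))
		goto cleanup;

Asserting on the skeleton pointer after __load(), as the old code did,
can never fail: __load() reports errors through its return value and
leaves the pointer untouched.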
Signed-off-by: Yafang Shao Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240206081416.26242-4-laoar.shao@gmail.com --- tools/testing/selftests/bpf/prog_tests/cpumask.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask.c b/tools/testing/selftests/bpf/prog_tests/cpumask.c index c2e886399e3c..ecf89df78109 100644 --- a/tools/testing/selftests/bpf/prog_tests/cpumask.c +++ b/tools/testing/selftests/bpf/prog_tests/cpumask.c @@ -27,7 +27,7 @@ static void verify_success(const char *prog_name) struct bpf_program *prog; struct bpf_link *link = NULL; pid_t child_pid; - int status; + int status, err; skel = cpumask_success__open(); if (!ASSERT_OK_PTR(skel, "cpumask_success__open")) @@ -36,8 +36,8 @@ static void verify_success(const char *prog_name) skel->bss->pid = getpid(); skel->bss->nr_cpus = libbpf_num_possible_cpus(); - cpumask_success__load(skel); - if (!ASSERT_OK_PTR(skel, "cpumask_success__load")) + err = cpumask_success__load(skel); + if (!ASSERT_OK(err, "cpumask_success__load")) goto cleanup; prog = bpf_object__find_program_by_name(skel->obj, prog_name); From ba6a6abb3bfa8377bcf386a11077c0533909f9e8 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 6 Feb 2024 16:14:15 +0800 Subject: [PATCH 072/119] selftests/bpf: Mark cpumask kfunc declarations as __weak After the series "Annotate kfuncs in .BTF_ids section"[0], kfuncs can be generated from bpftool. Let's mark the existing cpumask kfunc declarations __weak so they don't conflict with definitions that will eventually come from vmlinux.h. [0]. https://lore.kernel.org/all/cover.1706491398.git.dxu@dxuuu.xyz Suggested-by: Andrii Nakryiko Signed-off-by: Yafang Shao Signed-off-by: Andrii Nakryiko Acked-by: Daniel Xu Link: https://lore.kernel.org/bpf/20240206081416.26242-5-laoar.shao@gmail.com --- .../selftests/bpf/progs/cpumask_common.h | 55 ++++++++++--------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index 0cd4aebb97cf..c705d8112a35 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -23,41 +23,42 @@ struct array_map { __uint(max_entries, 1); } __cpumask_map SEC(".maps"); -struct bpf_cpumask *bpf_cpumask_create(void) __ksym; -void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; -struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; -u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; -u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +struct bpf_cpumask *bpf_cpumask_create(void) __ksym __weak; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym __weak; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym __weak; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym __weak; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym __weak; u32 bpf_cpumask_first_and(const struct cpumask *src1, - const struct cpumask *src2) __ksym; -void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; -bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -void 
bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; -void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; + const struct cpumask *src2) __ksym __weak; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym __weak; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym __weak; +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym __weak; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym __weak; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym __weak; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym __weak; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym __weak; bool bpf_cpumask_and(struct bpf_cpumask *cpumask, const struct cpumask *src1, - const struct cpumask *src2) __ksym; + const struct cpumask *src2) __ksym __weak; void bpf_cpumask_or(struct bpf_cpumask *cpumask, const struct cpumask *src1, - const struct cpumask *src2) __ksym; + const struct cpumask *src2) __ksym __weak; void bpf_cpumask_xor(struct bpf_cpumask *cpumask, const struct cpumask *src1, - const struct cpumask *src2) __ksym; -bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; -bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; -bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; -bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; -bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; -void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; -u32 bpf_cpumask_any_distribute(const struct cpumask *src) __ksym; -u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym; -u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; + const struct cpumask *src2) __ksym __weak; +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym __weak; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym __weak; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym __weak; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym __weak; +bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym __weak; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym __weak; +u32 bpf_cpumask_any_distribute(const struct cpumask *src) __ksym __weak; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym __weak; +u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym __weak; -void bpf_rcu_read_lock(void) __ksym; -void bpf_rcu_read_unlock(void) __ksym; +void bpf_rcu_read_lock(void) __ksym __weak; +void bpf_rcu_read_unlock(void) __ksym __weak; static inline const struct cpumask *cast(struct bpf_cpumask *cpumask) { From e55dad12abe42383b68ba88212eb3d0fba0e9820 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 4 Feb 2024 16:56:34 +0900 Subject: [PATCH 073/119] bpf: Merge two CONFIG_BPF entries 'config BPF' exists in both init/Kconfig and kernel/bpf/Kconfig. Commit b24abcff918a ("bpf, kconfig: Add consolidated menu entry for bpf with core options") added the second one to kernel/bpf/Kconfig instead of moving the existing one. Merge them together. 
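For reference, the consolidated entry in kernel/bpf/Kconfig is then
expected to read as follows (assembled from the two hunks below):

	# BPF interpreter that, for example, classic socket filters depend on.
	config BPF
		bool
		select CRYPTO_LIB_SHA1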
Signed-off-by: Masahiro Yamada
Signed-off-by: Andrii Nakryiko
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20240204075634.32969-1-masahiroy@kernel.org
---
 init/Kconfig       | 5 -----
 kernel/bpf/Kconfig | 1 +
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 8d4e836e1b6b..46ccad83a664 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1457,11 +1457,6 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
 config HAVE_PCSPKR_PLATFORM
 	bool
 
-# interpreter that classic socket filters depend on
-config BPF
-	bool
-	select CRYPTO_LIB_SHA1
-
 menuconfig EXPERT
 	bool "Configure standard kernel features (expert users)"
 	# Unhide debug options, to make the on-by-default options visible
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 6a906ff93006..bc25f5098a25 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -3,6 +3,7 @@
 # BPF interpreter that, for example, classic socket filters depend on.
 config BPF
 	bool
+	select CRYPTO_LIB_SHA1
 
 # Used by archs to tell that they support BPF JIT compiler plus which
 # flavour. Only one of the two can be selected for a specific arch since

From b9a395f0f7af66fe8224450481b99d4f83b57207 Mon Sep 17 00:00:00 2001
From: Geliang Tang
Date: Thu, 8 Feb 2024 14:24:21 +0800
Subject: [PATCH 074/119] bpf, btf: Fix return value of register_btf_id_dtor_kfuncs

As with __register_btf_kfunc_id_set(), to let modules with a stripped
btf section be loaded, this patch also changes the return value of
register_btf_id_dtor_kfuncs() from -ENOENT to 0 when btf is NULL.

Signed-off-by: Geliang Tang
Link: https://lore.kernel.org/r/eab65586d7fb0e72f2707d3747c7d4a5d60c823f.1707373307.git.tanggeliang@kylinos.cn
Signed-off-by: Martin KaFai Lau
---
 kernel/bpf/btf.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f7725cb6e564..16eb937eca46 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -8219,10 +8219,8 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
 		pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n");
 		return -ENOENT;
 	}
-	if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
-		pr_err("missing module BTF, cannot register dtor kfuncs\n");
-		return -ENOENT;
-	}
+	if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+		pr_warn("missing module BTF, cannot register dtor kfuncs\n");
 		return 0;
 	}
 	if (IS_ERR(btf))

From 9e60b0e02550aaf5f2301e49353641a5e3701674 Mon Sep 17 00:00:00 2001
From: Geliang Tang
Date: Thu, 8 Feb 2024 14:24:22 +0800
Subject: [PATCH 075/119] bpf, btf: Add check_btf_kconfigs helper

This patch extracts the duplicate error-path code, run when
btf_get_module_btf() returns NULL, from the functions
__register_btf_kfunc_id_set() and register_btf_id_dtor_kfuncs() into a
new helper named check_btf_kconfigs(), which checks
CONFIG_DEBUG_INFO_BTF and CONFIG_DEBUG_INFO_BTF_MODULES.
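Both call sites then reduce to the same shape (a sketch of the pattern
only; the full change is in the diff below):

	btf = btf_get_module_btf(owner);
	if (!btf)
		return check_btf_kconfigs(owner, "dtor kfuncs");
	if (IS_ERR(btf))
		return PTR_ERR(btf);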
Signed-off-by: Geliang Tang
Acked-by: Jiri Olsa
Link: https://lore.kernel.org/r/fa5537fc55f1e4d0bfd686598c81b7ab9dbd82b7.1707373307.git.tanggeliang@kylinos.cn
Signed-off-by: Martin KaFai Lau
---
 kernel/bpf/btf.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 16eb937eca46..61e51cf0a995 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7738,6 +7738,17 @@ static struct btf *btf_get_module_btf(const struct module *module)
 	return btf;
 }
 
+static int check_btf_kconfigs(const struct module *module, const char *feature)
+{
+	if (!module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+		pr_err("missing vmlinux BTF, cannot register %s\n", feature);
+		return -ENOENT;
+	}
+	if (module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+		pr_warn("missing module BTF, cannot register %s\n", feature);
+	return 0;
+}
+
 BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
 {
 	struct btf *btf = NULL;
@@ -8098,15 +8109,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
 	int ret, i;
 
 	btf = btf_get_module_btf(kset->owner);
-	if (!btf) {
-		if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
-			pr_err("missing vmlinux BTF, cannot register kfuncs\n");
-			return -ENOENT;
-		}
-		if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
-			pr_warn("missing module BTF, cannot register kfuncs\n");
-		return 0;
-	}
+	if (!btf)
+		return check_btf_kconfigs(kset->owner, "kfunc");
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
@@ -8214,15 +8218,8 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
 	int ret;
 
 	btf = btf_get_module_btf(owner);
-	if (!btf) {
-		if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
-			pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n");
-			return -ENOENT;
-		}
-		if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
-			pr_warn("missing module BTF, cannot register dtor kfuncs\n");
-		return 0;
-	}
+	if (!btf)
+		return check_btf_kconfigs(owner, "dtor kfuncs");
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
From 947e56f82fd783a1ec1c9359b20b5699d09cae14 Mon Sep 17 00:00:00 2001
From: Geliang Tang
Date: Thu, 8 Feb 2024 14:24:23 +0800
Subject: [PATCH 076/119] bpf, btf: Check btf for register_bpf_struct_ops

Similar to the handling in the functions __register_btf_kfunc_id_set()
and register_btf_id_dtor_kfuncs(), this patch uses the newly added
helper check_btf_kconfigs() to handle modules with their btf section
stripped.
While at it, the patch also adds the IS_ERR() check missed in commit
f6be98d19985 ("bpf, net: switch to dynamic registration").

Fixes: f6be98d19985 ("bpf, net: switch to dynamic registration")
Signed-off-by: Geliang Tang
Link: https://lore.kernel.org/r/69082b9835463fe36f9e354bddf2d0a97df39c2b.1707373307.git.tanggeliang@kylinos.cn
Signed-off-by: Martin KaFai Lau
---
 kernel/bpf/btf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 61e51cf0a995..8e06d29961f1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -8882,7 +8882,9 @@ int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops)
 
 	btf = btf_get_module_btf(st_ops->owner);
 	if (!btf)
-		return -EINVAL;
+		return check_btf_kconfigs(st_ops->owner, "struct_ops");
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
 
 	log = kzalloc(sizeof(*log), GFP_KERNEL | __GFP_NOWARN);
 	if (!log) {

From 68bc61c26cacf152baf905786b5949769700f40d Mon Sep 17 00:00:00 2001
From: Marco Elver
Date: Wed, 7 Feb 2024 13:26:17 +0100
Subject: [PATCH 077/119] bpf: Allow compiler to inline most of bpf_local_storage_lookup()

In various performance profiles of kernels with BPF programs attached,
bpf_local_storage_lookup() appears as a significant portion of CPU
cycles spent. To enable the compiler to generate more optimal code,
turn bpf_local_storage_lookup() into a static inline function, where
only the cache insertion code path is outlined.

Notably, outlining cache insertion helps avoid bloating callers by
duplicating setting up calls to raw_spin_{lock,unlock}_irqsave() (on
architectures which do not inline spin_lock/unlock, such as x86), which
would cause the compiler to produce worse code by deciding to outline
otherwise inlinable functions.

The call overhead is neutral, because we make 2 calls either way:
either calling raw_spin_lock_irqsave() and raw_spin_unlock_irqrestore();
or calling __bpf_local_storage_insert_cache(), which calls
raw_spin_lock_irqsave(), followed by a tail-call to
raw_spin_unlock_irqrestore() where the compiler can perform TCO and (in
optimized uninstrumented builds) turn it into a plain jump. The call to
__bpf_local_storage_insert_cache() can be elided entirely if
cacheit_lockit is a false constant expression.
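Reduced to a self-contained sketch (hypothetical names, not the kernel
code; the real change is in the hunks below), the resulting shape is:

	struct elem { int key; struct elem *next; };
	struct store { struct elem *cache, *head; };

	/* Outlined: the only path that needs the lock/unlock calls,
	 * so their call setup is not duplicated into every caller.
	 */
	void insert_cache(struct store *s, struct elem *e);

	/* Inlined into every caller: a cache hit makes no call at
	 * all; a miss makes at most one call, to insert_cache().
	 */
	static inline struct elem *lookup(struct store *s, int key,
					  bool cacheit_lockit)
	{
		struct elem *e = s->cache;		/* fast path */

		if (e && e->key == key)
			return e;
		for (e = s->head; e; e = e->next)	/* slow path */
			if (e->key == key)
				break;
		if (e && cacheit_lockit)
			insert_cache(s, e);		/* outlined call */
		return e;
	}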
Based on results from './benchs/run_bench_local_storage.sh' (21 trials, reboot between each trial; x86 defconfig + BPF, clang 16) this produces improvements in throughput and latency in the majority of cases, with an average (geomean) improvement of 8%: +---- Hashmap Control -------------------- | | + num keys: 10 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 14.789 M ops/s | 14.745 M ops/s ( ~ ) | +- hits latency | 67.679 ns/op | 67.879 ns/op ( ~ ) | +- important_hits throughput | 14.789 M ops/s | 14.745 M ops/s ( ~ ) | | + num keys: 1000 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 12.233 M ops/s | 12.170 M ops/s ( ~ ) | +- hits latency | 81.754 ns/op | 82.185 ns/op ( ~ ) | +- important_hits throughput | 12.233 M ops/s | 12.170 M ops/s ( ~ ) | | + num keys: 10000 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 7.220 M ops/s | 7.204 M ops/s ( ~ ) | +- hits latency | 138.522 ns/op | 138.842 ns/op ( ~ ) | +- important_hits throughput | 7.220 M ops/s | 7.204 M ops/s ( ~ ) | | + num keys: 100000 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 5.061 M ops/s | 5.165 M ops/s (+2.1%) | +- hits latency | 198.483 ns/op | 194.270 ns/op (-2.1%) | +- important_hits throughput | 5.061 M ops/s | 5.165 M ops/s (+2.1%) | | + num keys: 4194304 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 2.864 M ops/s | 2.882 M ops/s ( ~ ) | +- hits latency | 365.220 ns/op | 361.418 ns/op (-1.0%) | +- important_hits throughput | 2.864 M ops/s | 2.882 M ops/s ( ~ ) | +---- Local Storage ---------------------- | | + num_maps: 1 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 33.005 M ops/s | 39.068 M ops/s (+18.4%) | +- hits latency | 30.300 ns/op | 25.598 ns/op (-15.5%) | +- important_hits throughput | 33.005 M ops/s | 39.068 M ops/s (+18.4%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 37.151 M ops/s | 44.926 M ops/s (+20.9%) | +- hits latency | 26.919 ns/op | 22.259 ns/op (-17.3%) | +- important_hits throughput | 37.151 M ops/s | 44.926 M ops/s (+20.9%) | | + num_maps: 10 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 32.288 M ops/s | 38.099 M ops/s (+18.0%) | +- hits latency | 30.972 ns/op | 26.248 ns/op (-15.3%) | +- important_hits throughput | 3.229 M ops/s | 3.810 M ops/s (+18.0%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 34.473 M ops/s | 41.145 M ops/s (+19.4%) | +- hits latency | 29.010 ns/op | 24.307 ns/op (-16.2%) | +- important_hits throughput | 12.312 M ops/s | 14.695 M ops/s (+19.4%) | | + num_maps: 16 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 32.524 M ops/s | 38.341 M ops/s (+17.9%) | +- hits latency | 30.748 ns/op | 26.083 ns/op (-15.2%) | +- important_hits throughput | 2.033 M ops/s | 2.396 M ops/s (+17.9%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 34.575 M ops/s | 41.338 M ops/s (+19.6%) | +- hits latency | 28.925 ns/op | 24.193 
ns/op (-16.4%) | +- important_hits throughput | 11.001 M ops/s | 13.153 M ops/s (+19.6%) | | + num_maps: 17 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 28.861 M ops/s | 32.756 M ops/s (+13.5%) | +- hits latency | 34.649 ns/op | 30.530 ns/op (-11.9%) | +- important_hits throughput | 1.700 M ops/s | 1.929 M ops/s (+13.5%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 31.529 M ops/s | 36.110 M ops/s (+14.5%) | +- hits latency | 31.719 ns/op | 27.697 ns/op (-12.7%) | +- important_hits throughput | 9.598 M ops/s | 10.993 M ops/s (+14.5%) | | + num_maps: 24 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 18.602 M ops/s | 19.937 M ops/s (+7.2%) | +- hits latency | 53.767 ns/op | 50.166 ns/op (-6.7%) | +- important_hits throughput | 0.776 M ops/s | 0.831 M ops/s (+7.2%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 21.718 M ops/s | 23.332 M ops/s (+7.4%) | +- hits latency | 46.047 ns/op | 42.865 ns/op (-6.9%) | +- important_hits throughput | 6.110 M ops/s | 6.564 M ops/s (+7.4%) | | + num_maps: 32 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 14.118 M ops/s | 14.626 M ops/s (+3.6%) | +- hits latency | 70.856 ns/op | 68.381 ns/op (-3.5%) | +- important_hits throughput | 0.442 M ops/s | 0.458 M ops/s (+3.6%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 17.111 M ops/s | 17.906 M ops/s (+4.6%) | +- hits latency | 58.451 ns/op | 55.865 ns/op (-4.4%) | +- important_hits throughput | 4.776 M ops/s | 4.998 M ops/s (+4.6%) | | + num_maps: 100 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 5.281 M ops/s | 5.528 M ops/s (+4.7%) | +- hits latency | 192.398 ns/op | 183.059 ns/op (-4.9%) | +- important_hits throughput | 0.053 M ops/s | 0.055 M ops/s (+4.9%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 6.265 M ops/s | 6.498 M ops/s (+3.7%) | +- hits latency | 161.436 ns/op | 152.877 ns/op (-5.3%) | +- important_hits throughput | 1.636 M ops/s | 1.697 M ops/s (+3.7%) | | + num_maps: 1000 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 0.355 M ops/s | 0.354 M ops/s ( ~ ) | +- hits latency | 2826.538 ns/op | 2827.139 ns/op ( ~ ) | +- important_hits throughput | 0.000 M ops/s | 0.000 M ops/s ( ~ ) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 0.404 M ops/s | 0.403 M ops/s ( ~ ) | +- hits latency | 2481.190 ns/op | 2487.555 ns/op ( ~ ) | +- important_hits throughput | 0.102 M ops/s | 0.101 M ops/s ( ~ ) The on_lookup test in {cgrp,task}_ls_recursion.c is removed because the bpf_local_storage_lookup is no longer traceable and adding tracepoint will make the compiler generate worse code: https://lore.kernel.org/bpf/ZcJmok64Xqv6l4ZS@elver.google.com/ Signed-off-by: Marco Elver Cc: Martin KaFai Lau Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240207122626.3508658-1-elver@google.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf_local_storage.h | 30 ++++++++++- 
kernel/bpf/bpf_local_storage.c | 52 +++++-------------- .../bpf/prog_tests/task_local_storage.c | 6 --- .../selftests/bpf/progs/cgrp_ls_recursion.c | 26 ---------- .../selftests/bpf/progs/task_ls_recursion.c | 17 ------ 5 files changed, 41 insertions(+), 90 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 173ec7f43ed1..dcddb0aef7d8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -129,10 +129,36 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, bool bpf_ma); -struct bpf_local_storage_data * +void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); +/* If cacheit_lockit is false, this lookup function is lockless */ +static inline struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, - bool cacheit_lockit); + bool cacheit_lockit) +{ + struct bpf_local_storage_data *sdata; + struct bpf_local_storage_elem *selem; + + /* Fast path (cache hit) */ + sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx], + bpf_rcu_lock_held()); + if (sdata && rcu_access_pointer(sdata->smap) == smap) + return sdata; + + /* Slow path (cache miss) */ + hlist_for_each_entry_rcu(selem, &local_storage->list, snode, + rcu_read_lock_trace_held()) + if (rcu_access_pointer(SDATA(selem)->smap) == smap) + break; + + if (!selem) + return NULL; + if (cacheit_lockit) + __bpf_local_storage_insert_cache(local_storage, smap, selem); + return SDATA(selem); +} void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 146824cc9689..bdea1a459153 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -414,47 +414,21 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) bpf_selem_unlink_storage(selem, reuse_now); } -/* If cacheit_lockit is false, this lookup function is lockless */ -struct bpf_local_storage_data * -bpf_local_storage_lookup(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *smap, - bool cacheit_lockit) +void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) { - struct bpf_local_storage_data *sdata; - struct bpf_local_storage_elem *selem; + unsigned long flags; - /* Fast path (cache hit) */ - sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx], - bpf_rcu_lock_held()); - if (sdata && rcu_access_pointer(sdata->smap) == smap) - return sdata; - - /* Slow path (cache miss) */ - hlist_for_each_entry_rcu(selem, &local_storage->list, snode, - rcu_read_lock_trace_held()) - if (rcu_access_pointer(SDATA(selem)->smap) == smap) - break; - - if (!selem) - return NULL; - - sdata = SDATA(selem); - if (cacheit_lockit) { - unsigned long flags; - - /* spinlock is needed to avoid racing with the - * parallel delete. Otherwise, publishing an already - * deleted sdata to the cache will become a use-after-free - * problem in the next bpf_local_storage_lookup(). 
- */ - raw_spin_lock_irqsave(&local_storage->lock, flags); - if (selem_linked_to_storage(selem)) - rcu_assign_pointer(local_storage->cache[smap->cache_idx], - sdata); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - } - - return sdata; + /* spinlock is needed to avoid racing with the + * parallel delete. Otherwise, publishing an already + * deleted sdata to the cache will become a use-after-free + * problem in the next bpf_local_storage_lookup(). + */ + raw_spin_lock_irqsave(&local_storage->lock, flags); + if (selem_linked_to_storage(selem)) + rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c index ea8537c54413..c33c05161a9e 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -117,12 +117,6 @@ static void test_recursion(void) ASSERT_OK(err, "lookup map_b"); ASSERT_EQ(value, 100, "map_b value"); - prog_fd = bpf_program__fd(skel->progs.on_lookup); - memset(&info, 0, sizeof(info)); - err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len); - ASSERT_OK(err, "get prog info"); - ASSERT_GT(info.recursion_misses, 0, "on_lookup prog recursion"); - prog_fd = bpf_program__fd(skel->progs.on_update); memset(&info, 0, sizeof(info)); err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len); diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_recursion.c b/tools/testing/selftests/bpf/progs/cgrp_ls_recursion.c index 610c2427fd93..3500e4b69ebe 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_ls_recursion.c +++ b/tools/testing/selftests/bpf/progs/cgrp_ls_recursion.c @@ -27,32 +27,6 @@ bool is_cgroup1 = 0; struct cgroup *bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) __ksym; void bpf_cgroup_release(struct cgroup *cgrp) __ksym; -static void __on_lookup(struct cgroup *cgrp) -{ - bpf_cgrp_storage_delete(&map_a, cgrp); - bpf_cgrp_storage_delete(&map_b, cgrp); -} - -SEC("fentry/bpf_local_storage_lookup") -int BPF_PROG(on_lookup) -{ - struct task_struct *task = bpf_get_current_task_btf(); - struct cgroup *cgrp; - - if (is_cgroup1) { - cgrp = bpf_task_get_cgroup1(task, target_hid); - if (!cgrp) - return 0; - - __on_lookup(cgrp); - bpf_cgroup_release(cgrp); - return 0; - } - - __on_lookup(task->cgroups->dfl_cgrp); - return 0; -} - static void __on_update(struct cgroup *cgrp) { long *ptr; diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c index 4542dc683b44..f1853c38aada 100644 --- a/tools/testing/selftests/bpf/progs/task_ls_recursion.c +++ b/tools/testing/selftests/bpf/progs/task_ls_recursion.c @@ -27,23 +27,6 @@ struct { __type(value, long); } map_b SEC(".maps"); -SEC("fentry/bpf_local_storage_lookup") -int BPF_PROG(on_lookup) -{ - struct task_struct *task = bpf_get_current_task_btf(); - - if (!test_pid || task->pid != test_pid) - return 0; - - /* The bpf_task_storage_delete will call - * bpf_local_storage_lookup. The prog->active will - * stop the recursion. 
- */
-	bpf_task_storage_delete(&map_a, task);
-	bpf_task_storage_delete(&map_b, task);
-	return 0;
-}
-
 SEC("fentry/bpf_local_storage_update")
 int BPF_PROG(on_update)
 {

From 5b268d1ebcdceacf992dfda8f9031d56005a274e Mon Sep 17 00:00:00 2001
From: Daniel Xu
Date: Sun, 4 Feb 2024 14:06:34 -0700
Subject: [PATCH 078/119] bpf: Have bpf_rdonly_cast() take a const pointer

Since 20d59ee55172 ("libbpf: add bpf_core_cast() macro"), libbpf is now
exporting a const arg version of bpf_rdonly_cast(). This causes the
following conflicting type error when generating kfunc prototypes from
BTF:

In file included from skeleton/pid_iter.bpf.c:5:
/home/dxu/dev/linux/tools/bpf/bpftool/bootstrap/libbpf/include/bpf/bpf_core_read.h:297:14: error: conflicting types for 'bpf_rdonly_cast'
extern void *bpf_rdonly_cast(const void *obj__ign, __u32 btf_id__k) __ksym __weak;
             ^
./vmlinux.h:135625:14: note: previous declaration is here
extern void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k) __weak __ksym;

This is because the kernel defines bpf_rdonly_cast() with a non-const
arg. Since a const arg is more permissive and thus backwards compatible,
we change the kernel definition as well to avoid conflicting type
errors.

Signed-off-by: Daniel Xu
Signed-off-by: Andrii Nakryiko
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/dfd3823f11ffd2d4c838e961d61ec9ae8a646773.1707080349.git.dxu@dxuuu.xyz
---
 kernel/bpf/helpers.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 4db1c658254c..3503949b4c1b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2484,9 +2484,9 @@ __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 	return obj;
 }
 
-__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k)
 {
-	return obj__ign;
+	return (void *)obj__ign;
 }
 
 __bpf_kfunc void bpf_rcu_read_lock(void)

From 178c54666f9c4d2f49f2ea661d0c11b52f0ed190 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Tue, 6 Feb 2024 23:01:02 -0800
Subject: [PATCH 079/119] bpf: Mark bpf_spin_{lock,unlock}() helpers with notrace correctly

Currently, tracing is not supposed to allow bpf_spin_{lock,unlock}()
helper calls. This is to prevent deadlock for the following cases:
  - there is a prog (prog-A) calling bpf_spin_{lock,unlock}().
  - there is a tracing program (prog-B), e.g., fentry, attached
    to bpf_spin_lock() and/or bpf_spin_unlock().
  - prog-B calls bpf_spin_{lock,unlock}().
For such a case, when prog-A calls bpf_spin_{lock,unlock}(), a deadlock
will happen.

The related source code in kernel/bpf/helpers.c is below:
  notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
  notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
notrace is supposed to prevent fentry prog from attaching to
bpf_spin_{lock,unlock}(). But actually this is not the case and fentry
prog can successfully attach to bpf_spin_lock(). Siddharth Chintamaneni
reported the issue in [1]. The following is the macro definition for
the above BPF_CALL_1:

#define BPF_CALL_x(x, name, ...)
\ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) #define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) The notrace attribute is actually applied to the static always_inline function ____bpf_spin_{lock,unlock}(). The actual callback function bpf_spin_{lock,unlock}() is not marked with notrace, hence allowing fentry prog to attach to two helpers, and this may cause the above mentioned deadlock. Siddharth Chintamaneni actually has a reproducer in [2]. To fix the issue, a new macro NOTRACE_BPF_CALL_1 is introduced which will add notrace attribute to the original function instead of the hidden always_inline function and this fixed the problem. [1] https://lore.kernel.org/bpf/CAE5sdEigPnoGrzN8WU7Tx-h-iFuMZgW06qp0KHWtpvoXxf1OAQ@mail.gmail.com/ [2] https://lore.kernel.org/bpf/CAE5sdEg6yUc_Jz50AnUXEEUh6O73yQ1Z6NV2srJnef0ZrQkZew@mail.gmail.com/ Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240207070102.335167-1-yonghong.song@linux.dev --- include/linux/filter.h | 21 ++++++++++++--------- kernel/bpf/helpers.c | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index fee070b9826e..36cc29a2934c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -547,24 +547,27 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ u64, __ur_3, u64, __ur_4, u64, __ur_5) -#define BPF_CALL_x(x, name, ...) \ +#define BPF_CALL_x(x, attr, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ - u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ - u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ + attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ + attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) -#define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) -#define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) -#define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) -#define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) -#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) -#define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) +#define __NOATTR +#define BPF_CALL_0(name, ...) BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_1(name, ...) BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_2(name, ...) BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_3(name, ...) BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_4(name, ...) BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_5(name, ...) 
BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__)
+
+#define NOTRACE_BPF_CALL_1(name, ...)	BPF_CALL_x(1, notrace, name, __VA_ARGS__)
 
 #define bpf_ctx_range(TYPE, MEMBER)						\
 	offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3503949b4c1b..93edf730d288 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -334,7 +334,7 @@ static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
 	__this_cpu_write(irqsave_flags, flags);
 }
 
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
 {
 	__bpf_spin_lock_irqsave(lock);
 	return 0;
@@ -357,7 +357,7 @@ static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
 	local_irq_restore(flags);
 }
 
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
 {
 	__bpf_spin_unlock_irqrestore(lock);
 	return 0;

From fc1c9e40da37905f87c73711a1ecc57f52c1fe1c Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Tue, 6 Feb 2024 23:01:07 -0800
Subject: [PATCH 080/119] selftests/bpf: Ensure fentry prog cannot attach to bpf_spin_{lock,unlock}()

Add two tests to ensure fentry programs cannot attach to
bpf_spin_{lock,unlock}() helpers. The tracing_failure.c files can be
used in the future for other tracing failure cases.

Signed-off-by: Yonghong Song
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20240207070107.335341-1-yonghong.song@linux.dev
---
 .../bpf/prog_tests/tracing_failure.c          | 37 +++++++++++++++++++
 .../selftests/bpf/progs/tracing_failure.c     | 20 ++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_failure.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_failure.c

diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
new file mode 100644
index 000000000000..a222df765bc3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include
+#include "tracing_failure.skel.h"
+
+static void test_bpf_spin_lock(bool is_spin_lock)
+{
+	struct tracing_failure *skel;
+	int err;
+
+	skel = tracing_failure__open();
+	if (!ASSERT_OK_PTR(skel, "tracing_failure__open"))
+		return;
+
+	if (is_spin_lock)
+		bpf_program__set_autoload(skel->progs.test_spin_lock, true);
+	else
+		bpf_program__set_autoload(skel->progs.test_spin_unlock, true);
+
+	err = tracing_failure__load(skel);
+	if (!ASSERT_OK(err, "tracing_failure__load"))
+		goto out;
+
+	err = tracing_failure__attach(skel);
+	ASSERT_ERR(err, "tracing_failure__attach");
+
+out:
+	tracing_failure__destroy(skel);
+}
+
+void test_tracing_failure(void)
+{
+	if (test__start_subtest("bpf_spin_lock"))
+		test_bpf_spin_lock(true);
+	if (test__start_subtest("bpf_spin_unlock"))
+		test_bpf_spin_lock(false);
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_failure.c b/tools/testing/selftests/bpf/progs/tracing_failure.c
new file mode 100644
index 000000000000..d41665d2ec8c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_failure.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
*/
+
+#include "vmlinux.h"
+#include
+#include
+
+char _license[] SEC("license") = "GPL";
+
+SEC("?fentry/bpf_spin_lock")
+int BPF_PROG(test_spin_lock, struct bpf_spin_lock *lock)
+{
+	return 0;
+}
+
+SEC("?fentry/bpf_spin_unlock")
+int BPF_PROG(test_spin_unlock, struct bpf_spin_lock *lock)
+{
+	return 0;
+}

From 52dbd67dff5d050e99301100e2cac578eef9b2e9 Mon Sep 17 00:00:00 2001
From: "Jose E. Marchesi"
Date: Thu, 8 Feb 2024 21:36:12 +0100
Subject: [PATCH 081/119] bpf: Abstract loop unrolling pragmas in BPF selftests

[Changes from V1:
- Avoid conflict by rebasing with latest master.]

Some BPF tests use loop unrolling compiler pragmas that are
clang-specific and not supported by GCC. These pragmas, along with
their GCC equivalents, are:

  #pragma clang loop unroll_count(N)
  #pragma GCC unroll N

  #pragma clang loop unroll(full)
  #pragma GCC unroll 65534

  #pragma clang loop unroll(disable)
  #pragma GCC unroll 1

  #pragma unroll [aka #pragma clang loop unroll(enable)]
  There is no GCC equivalent to this pragma. It enables unrolling on
  loops that the compiler would not ordinarily unroll even with
  -O2|-funroll-loops, but it is not equivalent to full unrolling
  either.

This patch adds a new header progs/bpf_compiler.h that defines the
following macros, which correspond to each pair of compiler-specific
pragmas above:

  __pragma_loop_unroll_count(N)
  __pragma_loop_unroll_full
  __pragma_loop_no_unroll
  __pragma_loop_unroll

The selftests using loop unrolling pragmas are then changed to include
the header and use these macros in place of the explicit pragmas.

Tested in bpf-next master. No regressions.

Signed-off-by: Jose E. Marchesi
Signed-off-by: Andrii Nakryiko
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20240208203612.29611-1-jose.marchesi@oracle.com
---
 .../selftests/bpf/progs/bpf_compiler.h        | 33 +++++++++++++++++++
 tools/testing/selftests/bpf/progs/iters.c     |  5 +--
 tools/testing/selftests/bpf/progs/loop4.c     |  4 ++-
 .../selftests/bpf/progs/profiler.inc.h        | 17 +++++-----
 tools/testing/selftests/bpf/progs/pyperf.h    |  7 ++--
 .../testing/selftests/bpf/progs/strobemeta.h  | 18 +++++-----
 .../selftests/bpf/progs/test_cls_redirect.c   |  5 +--
 .../selftests/bpf/progs/test_lwt_seg6local.c  |  6 ++--
 .../selftests/bpf/progs/test_seg6_loop.c      |  4 ++-
 .../selftests/bpf/progs/test_skb_ctx.c        |  4 ++-
 .../selftests/bpf/progs/test_sysctl_loop1.c   |  6 ++--
 .../selftests/bpf/progs/test_sysctl_loop2.c   |  6 ++--
 .../selftests/bpf/progs/test_sysctl_prog.c    |  6 ++--
 .../selftests/bpf/progs/test_tc_tunnel.c      |  3 +-
 tools/testing/selftests/bpf/progs/test_xdp.c  |  3 +-
 .../selftests/bpf/progs/test_xdp_loop.c       |  3 +-
 .../selftests/bpf/progs/test_xdp_noinline.c   |  5 +--
 .../selftests/bpf/progs/xdp_synproxy_kern.c   |  6 ++--
 .../testing/selftests/bpf/progs/xdping_kern.c |  3 +-
 19 files changed, 102 insertions(+), 42 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_compiler.h

diff --git a/tools/testing/selftests/bpf/progs/bpf_compiler.h b/tools/testing/selftests/bpf/progs/bpf_compiler.h
new file mode 100644
index 000000000000..a7c343dc82e6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_compiler.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_COMPILER_H__
+#define __BPF_COMPILER_H__
+
+#define DO_PRAGMA_(X) _Pragma(#X)
+
+#if __clang__
+#define __pragma_loop_unroll DO_PRAGMA_(clang loop unroll(enable))
+#else
+/* In GCC, -funroll-loops, which is enabled with -O2, should have the
+   same impact as the loop-unroll-enable pragma above.
*/ +#define __pragma_loop_unroll +#endif + +#if __clang__ +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(clang loop unroll_count(N)) +#else +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(GCC unroll N) +#endif + +#if __clang__ +#define __pragma_loop_unroll_full DO_PRAGMA_(clang loop unroll(full)) +#else +#define __pragma_loop_unroll_full DO_PRAGMA_(GCC unroll 65534) +#endif + +#if __clang__ +#define __pragma_loop_no_unroll DO_PRAGMA_(clang loop unroll(disable)) +#else +#define __pragma_loop_no_unroll DO_PRAGMA_(GCC unroll 1) +#endif + +#endif diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 225f02dd66d0..3db416606f2f 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -5,6 +5,7 @@ #include #include #include "bpf_misc.h" +#include "bpf_compiler.h" #define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0])) @@ -183,7 +184,7 @@ int iter_pragma_unroll_loop(const void *ctx) MY_PID_GUARD(); bpf_iter_num_new(&it, 0, 2); -#pragma nounroll + __pragma_loop_no_unroll for (i = 0; i < 3; i++) { v = bpf_iter_num_next(&it); bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1); @@ -238,7 +239,7 @@ int iter_multiple_sequential_loops(const void *ctx) bpf_iter_num_destroy(&it); bpf_iter_num_new(&it, 0, 2); -#pragma nounroll + __pragma_loop_no_unroll for (i = 0; i < 3; i++) { v = bpf_iter_num_next(&it); bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1); diff --git a/tools/testing/selftests/bpf/progs/loop4.c b/tools/testing/selftests/bpf/progs/loop4.c index b35337926d66..0de0357f57cc 100644 --- a/tools/testing/selftests/bpf/progs/loop4.c +++ b/tools/testing/selftests/bpf/progs/loop4.c @@ -3,6 +3,8 @@ #include #include +#include "bpf_compiler.h" + char _license[] SEC("license") = "GPL"; SEC("socket") @@ -10,7 +12,7 @@ int combinations(volatile struct __sk_buff* skb) { int ret = 0, i; -#pragma nounroll + __pragma_loop_no_unroll for (i = 0; i < 20; i++) if (skb->len) ret |= 1 << i; diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index de3b6e4e4d0a..6957d9f2805e 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -8,6 +8,7 @@ #include "profiler.h" #include "err.h" #include "bpf_experimental.h" +#include "bpf_compiler.h" #ifndef NULL #define NULL 0 @@ -169,7 +170,7 @@ static INLINE int get_var_spid_index(struct var_kill_data_arr_t* arr_struct, int spid) { #ifdef UNROLL -#pragma unroll + __pragma_loop_unroll #endif for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) if (arr_struct->array[i].meta.pid == spid) @@ -185,7 +186,7 @@ static INLINE void populate_ancestors(struct task_struct* task, ancestors_data->num_ancestors = 0; #ifdef UNROLL -#pragma unroll + __pragma_loop_unroll #endif for (num_ancestors = 0; num_ancestors < MAX_ANCESTORS; num_ancestors++) { parent = BPF_CORE_READ(parent, real_parent); @@ -212,7 +213,7 @@ static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node, size_t filepart_length; #ifdef UNROLL -#pragma unroll + __pragma_loop_unroll #endif for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { filepart_length = @@ -261,7 +262,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, int cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id___local, pids_cgrp_id___local); #ifdef UNROLL -#pragma unroll + __pragma_loop_unroll #endif for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct 
cgroup_subsys_state* subsys = @@ -402,7 +403,7 @@ static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig) if (kill_data == NULL) return 0; #ifdef UNROLL -#pragma unroll + __pragma_loop_unroll #endif for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) if (arr_struct->array[i].meta.pid == 0) { @@ -482,7 +483,7 @@ read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload) struct dentry* parent_dentry; #ifdef UNROLL -#pragma unroll + __pragma_loop_unroll #endif for (int i = 0; i < MAX_PATH_DEPTH; i++) { filepart_length = @@ -508,7 +509,7 @@ is_ancestor_in_allowed_inodes(struct dentry* filp_dentry) { struct dentry* parent_dentry; #ifdef UNROLL -#pragma unroll + __pragma_loop_unroll #endif for (int i = 0; i < MAX_PATH_DEPTH; i++) { u64 dir_ino = BPF_CORE_READ(filp_dentry, d_inode, i_ino); @@ -629,7 +630,7 @@ int raw_tracepoint__sched_process_exit(void* ctx) struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); #ifdef UNROLL -#pragma unroll + __pragma_loop_unroll #endif for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) { struct var_kill_data_t* past_kill_data = &arr_struct->array[i]; diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h index 026d573ce179..86484f07e1d1 100644 --- a/tools/testing/selftests/bpf/progs/pyperf.h +++ b/tools/testing/selftests/bpf/progs/pyperf.h @@ -8,6 +8,7 @@ #include #include #include "bpf_misc.h" +#include "bpf_compiler.h" #define FUNCTION_NAME_LEN 64 #define FILE_NAME_LEN 128 @@ -298,11 +299,11 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx) #if defined(USE_ITER) /* no for loop, no unrolling */ #elif defined(NO_UNROLL) -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll #elif defined(UNROLL_COUNT) -#pragma clang loop unroll_count(UNROLL_COUNT) + __pragma_loop_unroll_count(UNROLL_COUNT) #else -#pragma clang loop unroll(full) + __pragma_loop_unroll_full #endif /* NO_UNROLL */ /* Unwind python stack */ #ifdef USE_ITER diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index 40df2cc26eaf..f74459eead26 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -10,6 +10,8 @@ #include #include +#include "bpf_compiler.h" + typedef uint32_t pid_t; struct task_struct {}; @@ -419,9 +421,9 @@ static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg, } #ifdef NO_UNROLL -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll #else -#pragma unroll + __pragma_loop_unroll #endif for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { if (i >= map.cnt) @@ -560,25 +562,25 @@ static void *read_strobe_meta(struct task_struct *task, payload_off = sizeof(data->payload); #else #ifdef NO_UNROLL -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll #else -#pragma unroll + __pragma_loop_unroll #endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_INTS; ++i) { read_int_var(cfg, i, tls_base, &value, data); } #ifdef NO_UNROLL -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll #else -#pragma unroll + __pragma_loop_unroll #endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_STRS; ++i) { payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off); } #ifdef NO_UNROLL -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll #else -#pragma unroll + __pragma_loop_unroll #endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_MAPS; ++i) { payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off); 
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index bfc9179259d5..683c8aaa63da 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -20,6 +20,7 @@ #include #include +#include "bpf_compiler.h" #include "test_cls_redirect.h" #pragma GCC diagnostic ignored "-Waddress-of-packed-member" @@ -269,7 +270,7 @@ static INLINING void pkt_ipv4_checksum(struct iphdr *iph) uint32_t acc = 0; uint16_t *ipw = (uint16_t *)iph; -#pragma clang loop unroll(full) + __pragma_loop_unroll_full for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) { acc += ipw[i]; } @@ -296,7 +297,7 @@ bool pkt_skip_ipv6_extension_headers(buf_t *pkt, }; *is_fragment = false; -#pragma clang loop unroll(full) + __pragma_loop_unroll_full for (int i = 0; i < 6; i++) { switch (exthdr.next) { case IPPROTO_FRAGMENT: diff --git a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c index 48ff2b2ad5e7..fed66f36adb6 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c @@ -6,6 +6,8 @@ #include #include +#include "bpf_compiler.h" + /* Packet parsing state machine helpers. */ #define cursor_advance(_cursor, _len) \ ({ void *_tmp = _cursor; _cursor += _len; _tmp; }) @@ -131,7 +133,7 @@ int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh, *pad_off = 0; // we can only go as far as ~10 TLVs due to the BPF max stack size - #pragma clang loop unroll(full) + __pragma_loop_unroll_full for (int i = 0; i < 10; i++) { struct sr6_tlv_t tlv; @@ -302,7 +304,7 @@ int __encap_srh(struct __sk_buff *skb) seg = (struct ip6_addr_t *)((char *)srh + sizeof(*srh)); - #pragma clang loop unroll(full) + __pragma_loop_unroll_full for (unsigned long long lo = 0; lo < 4; lo++) { seg->lo = bpf_cpu_to_be64(4 - lo); seg->hi = bpf_cpu_to_be64(hi); diff --git a/tools/testing/selftests/bpf/progs/test_seg6_loop.c b/tools/testing/selftests/bpf/progs/test_seg6_loop.c index a7278f064368..5059050f74f6 100644 --- a/tools/testing/selftests/bpf/progs/test_seg6_loop.c +++ b/tools/testing/selftests/bpf/progs/test_seg6_loop.c @@ -6,6 +6,8 @@ #include #include +#include "bpf_compiler.h" + /* Packet parsing state machine helpers. */ #define cursor_advance(_cursor, _len) \ ({ void *_tmp = _cursor; _cursor += _len; _tmp; }) @@ -134,7 +136,7 @@ static __always_inline int is_valid_tlv_boundary(struct __sk_buff *skb, // we can only go as far as ~10 TLVs due to the BPF max stack size // workaround: define induction variable "i" as "long" instead // of "int" to prevent alu32 sub-register spilling. 
- #pragma clang loop unroll(disable) + __pragma_loop_no_unroll for (long i = 0; i < 100; i++) { struct sr6_tlv_t tlv; diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c index c482110cfc95..a724a70c6700 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c +++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c @@ -3,12 +3,14 @@ #include #include +#include "bpf_compiler.h" + char _license[] SEC("license") = "GPL"; SEC("tc") int process(struct __sk_buff *skb) { - #pragma clang loop unroll(full) + __pragma_loop_unroll_full for (int i = 0; i < 5; i++) { if (skb->cb[i] != i + 1) return 1; diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c index 553a282d816a..7f74077d6622 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c @@ -9,6 +9,8 @@ #include +#include "bpf_compiler.h" + #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif @@ -30,7 +32,7 @@ static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx) if (ret < 0 || ret != sizeof(tcp_mem_name) - 1) return 0; -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll for (i = 0; i < sizeof(tcp_mem_name); ++i) if (name[i] != tcp_mem_name[i]) return 0; @@ -59,7 +61,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx) if (ret < 0 || ret >= MAX_VALUE_STR_LEN) return 0; -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) { ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0, tcp_mem + i); diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c index 2b64bc563a12..68a75436e8af 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c @@ -9,6 +9,8 @@ #include +#include "bpf_compiler.h" + #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif @@ -30,7 +32,7 @@ static __attribute__((noinline)) int is_tcp_mem(struct bpf_sysctl *ctx) if (ret < 0 || ret != sizeof(tcp_mem_name) - 1) return 0; -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll for (i = 0; i < sizeof(tcp_mem_name); ++i) if (name[i] != tcp_mem_name[i]) return 0; @@ -57,7 +59,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx) if (ret < 0 || ret >= MAX_VALUE_STR_LEN) return 0; -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) { ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0, tcp_mem + i); diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c index 5489823c83fc..efc3c61f7852 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c @@ -9,6 +9,8 @@ #include +#include "bpf_compiler.h" + /* Max supported length of a string with unsigned long in base 10 (pow2 - 1). 
*/ #define MAX_ULONG_STR_LEN 0xF @@ -31,7 +33,7 @@ static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx) if (ret < 0 || ret != sizeof(tcp_mem_name) - 1) return 0; -#pragma clang loop unroll(full) + __pragma_loop_unroll_full for (i = 0; i < sizeof(tcp_mem_name); ++i) if (name[i] != tcp_mem_name[i]) return 0; @@ -57,7 +59,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx) if (ret < 0 || ret >= MAX_VALUE_STR_LEN) return 0; -#pragma clang loop unroll(full) + __pragma_loop_unroll_full for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) { ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0, tcp_mem + i); diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index d8d7ab5e8e30..404124a93892 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -19,6 +19,7 @@ #include #include +#include "bpf_compiler.h" #pragma GCC diagnostic ignored "-Waddress-of-packed-member" @@ -83,7 +84,7 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = 0; -#pragma clang loop unroll(full) + __pragma_loop_unroll_full for (i = 0, csum = 0; i < sizeof(*iph) >> 1; i++) csum += *iph16++; diff --git a/tools/testing/selftests/bpf/progs/test_xdp.c b/tools/testing/selftests/bpf/progs/test_xdp.c index d7a9a74b7245..8caf58be5818 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp.c +++ b/tools/testing/selftests/bpf/progs/test_xdp.c @@ -19,6 +19,7 @@ #include #include #include "test_iptunnel_common.h" +#include "bpf_compiler.h" struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); @@ -137,7 +138,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp) iph->ttl = 8; next_iph = (__u16 *)iph; -#pragma clang loop unroll(full) + __pragma_loop_unroll_full for (i = 0; i < sizeof(*iph) >> 1; i++) csum += *next_iph++; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_loop.c b/tools/testing/selftests/bpf/progs/test_xdp_loop.c index c98fb44156f0..93267a68825b 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_loop.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_loop.c @@ -15,6 +15,7 @@ #include #include #include "test_iptunnel_common.h" +#include "bpf_compiler.h" struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); @@ -133,7 +134,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp) iph->ttl = 8; next_iph = (__u16 *)iph; -#pragma clang loop unroll(disable) + __pragma_loop_no_unroll for (i = 0; i < sizeof(*iph) >> 1; i++) csum += *next_iph++; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index 42c8f6ded0e4..5c7e4758a0ca 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -15,6 +15,7 @@ #include #include #include +#include "bpf_compiler.h" static __always_inline __u32 rol32(__u32 word, unsigned int shift) { @@ -362,7 +363,7 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, iph->ttl = 4; next_iph_u16 = (__u16 *) iph; -#pragma clang loop unroll(full) + __pragma_loop_unroll_full for (int i = 0; i < sizeof(struct iphdr) >> 1; i++) csum += *next_iph_u16++; iph->check = ~((csum & 0xffff) + (csum >> 16)); @@ -409,7 +410,7 @@ int send_icmp_reply(void *data, void *data_end) iph->saddr = tmp_addr; iph->check = 0; next_iph_u16 = (__u16 *) iph; -#pragma clang loop unroll(full) + __pragma_loop_unroll_full for (int i = 0; i < sizeof(struct iphdr) >> 1; i++) csum += *next_iph_u16++; iph->check = ~((csum & 0xffff) + (csum 
>> 16)); diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c index 518329c666e9..7ea9785738b5 100644 --- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -7,6 +7,8 @@ #include #include +#include "bpf_compiler.h" + #define TC_ACT_OK 0 #define TC_ACT_SHOT 2 @@ -151,11 +153,11 @@ static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, __u64 sum = csum; int i; -#pragma unroll + __pragma_loop_unroll for (i = 0; i < 4; i++) sum += (__u32)saddr->in6_u.u6_addr32[i]; -#pragma unroll + __pragma_loop_unroll for (i = 0; i < 4; i++) sum += (__u32)daddr->in6_u.u6_addr32[i]; diff --git a/tools/testing/selftests/bpf/progs/xdping_kern.c b/tools/testing/selftests/bpf/progs/xdping_kern.c index 54cf1765118b..44e2b0ef23ae 100644 --- a/tools/testing/selftests/bpf/progs/xdping_kern.c +++ b/tools/testing/selftests/bpf/progs/xdping_kern.c @@ -15,6 +15,7 @@ #include #include +#include "bpf_compiler.h" #include "xdping.h" struct { @@ -116,7 +117,7 @@ int xdping_client(struct xdp_md *ctx) return XDP_PASS; if (pinginfo->start) { -#pragma clang loop unroll(full) + __pragma_loop_unroll_full for (i = 0; i < XDPING_MAX_COUNT; i++) { if (pinginfo->times[i] == 0) break; From 12bbcf8e840f40b82b02981e96e0a5fbb0703ea9 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 13 Feb 2024 17:35:43 +0000 Subject: [PATCH 082/119] libbpf: Add support to GCC in CORE macro definitions Due to internal differences between LLVM and GCC the current implementation for the CO-RE macros does not fit GCC parser, as it will optimize those expressions even before those would be accessible by the BPF backend. As examples, the following would be optimized out with the original definitions: - As enums are converted to their integer representation during parsing, the IR would not know how to distinguish an integer constant from an actual enum value. - Types need to be kept as temporary variables, as the existing type casts of the 0 address (as expanded for LLVM), are optimized away by the GCC C parser, never really reaching GCCs IR. Although, the macros appear to add extra complexity, the expanded code is removed from the compilation flow very early in the compilation process, not really affecting the quality of the generated assembly. Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240213173543.1397708-1-cupertino.miranda@oracle.com --- tools/lib/bpf/bpf_core_read.h | 45 +++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 0d3e88bd7d5f..1ce738d91685 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -46,7 +46,7 @@ enum bpf_enum_value_kind { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ bpf_probe_read_kernel( \ - (void *)dst, \ + (void *)dst, \ __CORE_RELO(src, fld, BYTE_SIZE), \ (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) #else @@ -145,8 +145,29 @@ enum bpf_enum_value_kind { } \ }) +/* Differentiator between compilers builtin implementations. This is a + * requirement due to the compiler parsing differences where GCC optimizes + * early in parsing those constructs of type pointers to the builtin specific + * type, resulting in not being possible to collect the required type + * information in the builtin expansion. 
+ */ +#ifdef __clang__ +#define ___bpf_typeof(type) ((typeof(type) *) 0) +#else +#define ___bpf_typeof1(type, NR) ({ \ + extern typeof(type) *___concat(bpf_type_tmp_, NR); \ + ___concat(bpf_type_tmp_, NR); \ +}) +#define ___bpf_typeof(type) ___bpf_typeof1(type, __COUNTER__) +#endif + +#ifdef __clang__ #define ___bpf_field_ref1(field) (field) -#define ___bpf_field_ref2(type, field) (((typeof(type) *)0)->field) +#define ___bpf_field_ref2(type, field) (___bpf_typeof(type)->field) +#else +#define ___bpf_field_ref1(field) (&(field)) +#define ___bpf_field_ref2(type, field) (&(___bpf_typeof(type)->field)) +#endif #define ___bpf_field_ref(args...) \ ___bpf_apply(___bpf_field_ref, ___bpf_narg(args))(args) @@ -196,7 +217,7 @@ enum bpf_enum_value_kind { * BTF. Always succeeds. */ #define bpf_core_type_id_local(type) \ - __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_LOCAL) + __builtin_btf_type_id(*___bpf_typeof(type), BPF_TYPE_ID_LOCAL) /* * Convenience macro to get BTF type ID of a target kernel's type that matches @@ -206,7 +227,7 @@ enum bpf_enum_value_kind { * - 0, if no matching type was found in a target kernel BTF. */ #define bpf_core_type_id_kernel(type) \ - __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_TARGET) + __builtin_btf_type_id(*___bpf_typeof(type), BPF_TYPE_ID_TARGET) /* * Convenience macro to check that provided named type @@ -216,7 +237,7 @@ enum bpf_enum_value_kind { * 0, if no matching type is found. */ #define bpf_core_type_exists(type) \ - __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_EXISTS) + __builtin_preserve_type_info(*___bpf_typeof(type), BPF_TYPE_EXISTS) /* * Convenience macro to check that provided named type @@ -226,7 +247,7 @@ enum bpf_enum_value_kind { * 0, if the type does not match any in the target kernel */ #define bpf_core_type_matches(type) \ - __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_MATCHES) + __builtin_preserve_type_info(*___bpf_typeof(type), BPF_TYPE_MATCHES) /* * Convenience macro to get the byte size of a provided named type @@ -236,7 +257,7 @@ enum bpf_enum_value_kind { * 0, if no matching type is found. */ #define bpf_core_type_size(type) \ - __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_SIZE) + __builtin_preserve_type_info(*___bpf_typeof(type), BPF_TYPE_SIZE) /* * Convenience macro to check that provided enumerator value is defined in @@ -246,8 +267,13 @@ enum bpf_enum_value_kind { * kernel's BTF; * 0, if no matching enum and/or enum value within that enum is found. */ +#ifdef __clang__ #define bpf_core_enum_value_exists(enum_type, enum_value) \ __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_EXISTS) +#else +#define bpf_core_enum_value_exists(enum_type, enum_value) \ + __builtin_preserve_enum_value(___bpf_typeof(enum_type), enum_value, BPF_ENUMVAL_EXISTS) +#endif /* * Convenience macro to get the integer value of an enumerator value in @@ -257,8 +283,13 @@ enum bpf_enum_value_kind { * present in target kernel's BTF; * 0, if no matching enum and/or enum value within that enum is found. 
*/ +#ifdef __clang__ #define bpf_core_enum_value(enum_type, enum_value) \ __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_VALUE) +#else +#define bpf_core_enum_value(enum_type, enum_value) \ + __builtin_preserve_enum_value(___bpf_typeof(enum_type), enum_value, BPF_ENUMVAL_VALUE) +#endif /* * bpf_core_read() abstracts away bpf_probe_read_kernel() call and captures From dc8543b597c282643a433e9a8af0459ed3046908 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Thu, 8 Feb 2024 14:14:49 -0800 Subject: [PATCH 083/119] bpf, docs: Update ISA document title * Use "Instruction Set Architecture (ISA)" instead of "Instruction Set Specification" * Remove version number As previously discussed on the mailing list at https://mailarchive.ietf.org/arch/msg/bpf/SEpn3OL9TabNRn-4rDX9A6XVbjM/ Signed-off-by: Dave Thaler Signed-off-by: Daniel Borkmann Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20240208221449.12274-1-dthaler1968@gmail.com --- Documentation/bpf/standardization/instruction-set.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index bdfe0cd0e499..868d9f6177e3 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -1,11 +1,11 @@ .. contents:: .. sectnum:: -======================================= -BPF Instruction Set Specification, v1.0 -======================================= +====================================== +BPF Instruction Set Architecture (ISA) +====================================== -This document specifies version 1.0 of the BPF instruction set. +This document specifies the BPF instruction set architecture (ISA). Documentation conventions ========================= From 77c0208e199ccb0986fb3612f2409c8cdcb036ad Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Thu, 8 Feb 2024 18:37:47 -0800 Subject: [PATCH 084/119] bpf: add btf pointer to struct bpf_ctx_arg_aux. Enable the providers to use types defined in a module instead of in the kernel (btf_vmlinux). Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240209023750.1153905-2-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 1 + kernel/bpf/btf.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1ebbee1d648e..3b7836f0a83e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1415,6 +1415,7 @@ struct bpf_jit_poke_descriptor { struct bpf_ctx_arg_aux { u32 offset; enum bpf_reg_type reg_type; + struct btf *btf; u32 btf_id; }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8e06d29961f1..cf100b5573ca 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6266,7 +6266,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } info->reg_type = ctx_arg_info->reg_type; - info->btf = btf_vmlinux; + info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; return true; } From 6115a0aeef01aef152ad7738393aad11422bfb82 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Thu, 8 Feb 2024 18:37:48 -0800 Subject: [PATCH 085/119] bpf: Move __kfunc_param_match_suffix() to btf.c. Move __kfunc_param_match_suffix() to btf.c and rename it as btf_param_match_suffix(). It can be reused by bpf_struct_ops later. 
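For illustration only, the suffix-matching convention this helper implements
can be sketched as a stand-alone C function (a simplified user-space
rendition, not the kernel code itself):

  #include <stdbool.h>
  #include <string.h>

  /* Return true if param_name ends with suffix; e.g. a kfunc argument
   * named "buf__sz" matches the "__sz" suffix.
   */
  static bool param_match_suffix(const char *param_name, const char *suffix)
  {
  	size_t len = strlen(param_name), suffix_len = strlen(suffix);

  	/* require at least one character before the suffix */
  	if (len <= suffix_len)
  		return false;
  	return strcmp(param_name + len - suffix_len, suffix) == 0;
  }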
Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240209023750.1153905-3-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/btf.h | 4 ++++ kernel/bpf/btf.c | 18 ++++++++++++++++++ kernel/bpf/verifier.c | 38 ++++++++++---------------------------- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 1ee8977b8c95..df76a14c64f6 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -495,6 +495,10 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); } +bool btf_param_match_suffix(const struct btf *btf, + const struct btf_param *arg, + const char *suffix); + struct bpf_verifier_log; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cf100b5573ca..447da964f217 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8904,3 +8904,21 @@ errout: } EXPORT_SYMBOL_GPL(__register_bpf_struct_ops); #endif + +bool btf_param_match_suffix(const struct btf *btf, + const struct btf_param *arg, + const char *suffix) +{ + int suffix_len = strlen(suffix), len; + const char *param_name; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len <= suffix_len) + return false; + param_name += len - suffix_len; + return !strncmp(param_name, suffix, suffix_len); +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ddaf09db1175..c92d6af7d975 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10682,24 +10682,6 @@ static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RCU_PROTECTED; } -static bool __kfunc_param_match_suffix(const struct btf *btf, - const struct btf_param *arg, - const char *suffix) -{ - int suffix_len = strlen(suffix), len; - const char *param_name; - - /* In the future, this can be ported to use BTF tagging */ - param_name = btf_name_by_offset(btf, arg->name_off); - if (str_is_empty(param_name)) - return false; - len = strlen(param_name); - if (len < suffix_len) - return false; - param_name += len - suffix_len; - return !strncmp(param_name, suffix, suffix_len); -} - static bool is_kfunc_arg_mem_size(const struct btf *btf, const struct btf_param *arg, const struct bpf_reg_state *reg) @@ -10710,7 +10692,7 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) return false; - return __kfunc_param_match_suffix(btf, arg, "__sz"); + return btf_param_match_suffix(btf, arg, "__sz"); } static bool is_kfunc_arg_const_mem_size(const struct btf *btf, @@ -10723,47 +10705,47 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf, if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) return false; - return __kfunc_param_match_suffix(btf, arg, "__szk"); + return btf_param_match_suffix(btf, arg, "__szk"); } static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__opt"); + return btf_param_match_suffix(btf, arg, "__opt"); } static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__k"); + return btf_param_match_suffix(btf, arg, "__k"); } static bool is_kfunc_arg_ignore(const struct btf *btf, const struct 
btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__ign");
+	return btf_param_match_suffix(btf, arg, "__ign");
 }

 static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__alloc");
+	return btf_param_match_suffix(btf, arg, "__alloc");
 }

 static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__uninit");
+	return btf_param_match_suffix(btf, arg, "__uninit");
 }

 static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
+	return btf_param_match_suffix(btf, arg, "__refcounted_kptr");
 }

 static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__nullable");
+	return btf_param_match_suffix(btf, arg, "__nullable");
 }

 static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__str");
+	return btf_param_match_suffix(btf, arg, "__str");
 }

 static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,

From 1611603537a4b88cec7993f32b70c03113801a46 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Thu, 8 Feb 2024 18:37:49 -0800
Subject: [PATCH 086/119] bpf: Create argument information for nullable arguments.

Collect argument information from the type information of stub functions
to mark arguments of BPF struct_ops programs with PTR_MAYBE_NULL if they
are nullable. A nullable argument is annotated by suffixing "__nullable"
to the argument name of the stub function.

For nullable arguments, this patch sets a struct bpf_ctx_arg_aux to label
their reg_type with PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL. This
makes the verifier check the programs and ensure that they properly check
the pointer. The programs should check if the pointer is null before
accessing the pointed memory.

The implementer of a struct_ops type should annotate the arguments that
can be null. The implementer should define a stub function (empty) as a
placeholder for each defined operator. The name of a stub function should
be in the pattern "<struct_ops name>__<member name>". For example, for
test_maybe_null of struct bpf_testmod_ops, its stub function name should
be "bpf_testmod_ops__test_maybe_null". You mark an argument nullable by
suffixing the argument name with "__nullable" in the stub function.

Since we already have stub functions for kCFI, we just reuse these stub
functions with the naming convention mentioned earlier. Stub functions
following the naming convention are only required if there are nullable
arguments to annotate. For functions with no nullable arguments, stub
functions are not necessary for the purpose of this patch.

This patch will prepare a list of struct bpf_ctx_arg_aux, aka arg_info,
for each member field of a struct_ops type. "arg_info" will be assigned
to "prog->aux->ctx_arg_info" of BPF struct_ops programs in
check_struct_ops_btf_id() so that it can be used by btf_ctx_access()
later to set reg_type properly for the verifier.
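As a concrete illustration of this convention (hypothetical struct_ops
type and member names, shown only as a sketch):

  /* A struct_ops type with one operator taking a possibly-NULL pointer */
  struct foo_ops {
  	int (*bar)(int cnt, struct task_struct *task);
  };

  /* Stub for foo_ops::bar. The name follows the
   * "<struct_ops name>__<member name>" pattern, and the "__nullable"
   * suffix marks the argument as possibly NULL, so BPF programs
   * implementing foo_ops::bar must NULL-check it before use.
   */
  static int foo_ops__bar(int cnt, struct task_struct *task__nullable)
  {
  	return 0;
  }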
Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240209023750.1153905-4-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 21 ++++ include/linux/btf.h | 2 + kernel/bpf/bpf_struct_ops.c | 213 ++++++++++++++++++++++++++++++++++-- kernel/bpf/btf.c | 27 +++++ kernel/bpf/verifier.c | 6 + 5 files changed, 257 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b7836f0a83e..c7aa99b44dbd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1709,6 +1709,19 @@ struct bpf_struct_ops { struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; }; +/* Every member of a struct_ops type has an instance even a member is not + * an operator (function pointer). The "info" field will be assigned to + * prog->aux->ctx_arg_info of BPF struct_ops programs to provide the + * argument information required by the verifier to verify the program. + * + * btf_ctx_access() will lookup prog->aux->ctx_arg_info to find the + * corresponding entry for an given argument. + */ +struct bpf_struct_ops_arg_info { + struct bpf_ctx_arg_aux *info; + u32 cnt; +}; + struct bpf_struct_ops_desc { struct bpf_struct_ops *st_ops; @@ -1716,6 +1729,9 @@ struct bpf_struct_ops_desc { const struct btf_type *value_type; u32 type_id; u32 value_id; + + /* Collection of argument information for each member */ + struct bpf_struct_ops_arg_info *arg_info; }; enum bpf_struct_ops_state { @@ -1790,6 +1806,7 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log); void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map); +void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc); #else #define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; }) static inline bool bpf_try_module_get(const void *data, struct module *owner) @@ -1814,6 +1831,10 @@ static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struc { } +static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc) +{ +} + #endif #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) diff --git a/include/linux/btf.h b/include/linux/btf.h index df76a14c64f6..cb96f6263638 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -498,6 +498,8 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) bool btf_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix); +int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, + u32 arg_no); struct bpf_verifier_log; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index f98f580de77a..0d7be97a2411 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -116,17 +116,183 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id, return true; } +#define MAYBE_NULL_SUFFIX "__nullable" +#define MAX_STUB_NAME 128 + +/* Return the type info of a stub function, if it exists. + * + * The name of a stub function is made up of the name of the struct_ops and + * the name of the function pointer member, separated by "__". For example, + * if the struct_ops type is named "foo_ops" and the function pointer + * member is named "bar", the stub function name would be "foo_ops__bar". 
+ */ +static const struct btf_type * +find_stub_func_proto(const struct btf *btf, const char *st_op_name, + const char *member_name) +{ + char stub_func_name[MAX_STUB_NAME]; + const struct btf_type *func_type; + s32 btf_id; + int cp; + + cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s", + st_op_name, member_name); + if (cp >= MAX_STUB_NAME) { + pr_warn("Stub function name too long\n"); + return NULL; + } + btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC); + if (btf_id < 0) + return NULL; + func_type = btf_type_by_id(btf, btf_id); + if (!func_type) + return NULL; + + return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */ +} + +/* Prepare argument info for every nullable argument of a member of a + * struct_ops type. + * + * Initialize a struct bpf_struct_ops_arg_info according to type info of + * the arguments of a stub function. (Check kCFI for more information about + * stub functions.) + * + * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info + * to provide an array of struct bpf_ctx_arg_aux, which in turn provides + * the information that used by the verifier to check the arguments of the + * BPF struct_ops program assigned to the member. Here, we only care about + * the arguments that are marked as __nullable. + * + * The array of struct bpf_ctx_arg_aux is eventually assigned to + * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the + * verifier. (See check_struct_ops_btf_id()) + * + * arg_info->info will be the list of struct bpf_ctx_arg_aux if success. If + * fails, it will be kept untouched. + */ +static int prepare_arg_info(struct btf *btf, + const char *st_ops_name, + const char *member_name, + const struct btf_type *func_proto, + struct bpf_struct_ops_arg_info *arg_info) +{ + const struct btf_type *stub_func_proto, *pointed_type; + const struct btf_param *stub_args, *args; + struct bpf_ctx_arg_aux *info, *info_buf; + u32 nargs, arg_no, info_cnt = 0; + u32 arg_btf_id; + int offset; + + stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name); + if (!stub_func_proto) + return 0; + + /* Check if the number of arguments of the stub function is the same + * as the number of arguments of the function pointer. + */ + nargs = btf_type_vlen(func_proto); + if (nargs != btf_type_vlen(stub_func_proto)) { + pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n", + st_ops_name, member_name, member_name, st_ops_name); + return -EINVAL; + } + + if (!nargs) + return 0; + + args = btf_params(func_proto); + stub_args = btf_params(stub_func_proto); + + info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL); + if (!info_buf) + return -ENOMEM; + + /* Prepare info for every nullable argument */ + info = info_buf; + for (arg_no = 0; arg_no < nargs; arg_no++) { + /* Skip arguments that is not suffixed with + * "__nullable". 
+ */ + if (!btf_param_match_suffix(btf, &stub_args[arg_no], + MAYBE_NULL_SUFFIX)) + continue; + + /* Should be a pointer to struct */ + pointed_type = btf_type_resolve_ptr(btf, + args[arg_no].type, + &arg_btf_id); + if (!pointed_type || + !btf_type_is_struct(pointed_type)) { + pr_warn("stub function %s__%s has %s tagging to an unsupported type\n", + st_ops_name, member_name, MAYBE_NULL_SUFFIX); + goto err_out; + } + + offset = btf_ctx_arg_offset(btf, func_proto, arg_no); + if (offset < 0) { + pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n", + st_ops_name, member_name, arg_no); + goto err_out; + } + + if (args[arg_no].type != stub_args[arg_no].type) { + pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n", + arg_no, st_ops_name, member_name); + goto err_out; + } + + /* Fill the information of the new argument */ + info->reg_type = + PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; + info->btf_id = arg_btf_id; + info->btf = btf; + info->offset = offset; + + info++; + info_cnt++; + } + + if (info_cnt) { + arg_info->info = info_buf; + arg_info->cnt = info_cnt; + } else { + kfree(info_buf); + } + + return 0; + +err_out: + kfree(info_buf); + + return -EINVAL; +} + +/* Clean up the arg_info in a struct bpf_struct_ops_desc. */ +void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc) +{ + struct bpf_struct_ops_arg_info *arg_info; + int i; + + arg_info = st_ops_desc->arg_info; + for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++) + kfree(arg_info[i].info); + + kfree(arg_info); +} + int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log) { struct bpf_struct_ops *st_ops = st_ops_desc->st_ops; + struct bpf_struct_ops_arg_info *arg_info; const struct btf_member *member; const struct btf_type *t; s32 type_id, value_id; char value_name[128]; const char *mname; - int i; + int i, err; if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= sizeof(value_name)) { @@ -160,6 +326,17 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, if (!is_valid_value_type(btf, value_id, t, value_name)) return -EINVAL; + arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info), + GFP_KERNEL); + if (!arg_info) + return -ENOMEM; + + st_ops_desc->arg_info = arg_info; + st_ops_desc->type = t; + st_ops_desc->type_id = type_id; + st_ops_desc->value_id = value_id; + st_ops_desc->value_type = btf_type_by_id(btf, value_id); + for_each_member(i, t, member) { const struct btf_type *func_proto; @@ -167,40 +344,52 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, if (!*mname) { pr_warn("anon member in struct %s is not supported\n", st_ops->name); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto errout; } if (__btf_member_bitfield_size(t, member)) { pr_warn("bit field member %s in struct %s is not supported\n", mname, st_ops->name); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto errout; } func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); - if (func_proto && - btf_distill_func_proto(log, btf, + if (!func_proto) + continue; + + if (btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[i])) { pr_warn("Error in parsing func ptr %s in struct %s\n", mname, st_ops->name); - return -EINVAL; + err = -EINVAL; + goto errout; } + + err = prepare_arg_info(btf, st_ops->name, mname, + func_proto, + arg_info + i); + if (err) + goto errout; } if (st_ops->init(btf)) { pr_warn("Error in init bpf_struct_ops %s\n", st_ops->name); - return 
-EINVAL;
+		err = -EINVAL;
+		goto errout;
 	}

-	st_ops_desc->type_id = type_id;
-	st_ops_desc->type = t;
-	st_ops_desc->value_id = value_id;
-	st_ops_desc->value_type = btf_type_by_id(btf, value_id);
-
 	return 0;
+
+errout:
+	bpf_struct_ops_desc_release(st_ops_desc);
+
+	return err;
 }

 static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 447da964f217..efd9bc274be0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1699,6 +1699,13 @@ static void btf_free_struct_meta_tab(struct btf *btf)
 static void btf_free_struct_ops_tab(struct btf *btf)
 {
 	struct btf_struct_ops_tab *tab = btf->struct_ops_tab;
+	u32 i;
+
+	if (!tab)
+		return;
+
+	for (i = 0; i < tab->cnt; i++)
+		bpf_struct_ops_desc_release(&tab->ops[i]);

 	kfree(tab);
 	btf->struct_ops_tab = NULL;
@@ -6130,6 +6137,26 @@ static bool prog_args_trusted(const struct bpf_prog *prog)
 	}
 }

+int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
+		       u32 arg_no)
+{
+	const struct btf_param *args;
+	const struct btf_type *t;
+	int off = 0, i;
+	u32 sz;
+
+	args = btf_params(func_proto);
+	for (i = 0; i < arg_no; i++) {
+		t = btf_type_by_id(btf, args[i].type);
+		t = btf_resolve_size(btf, t, &sz);
+		if (IS_ERR(t))
+			return PTR_ERR(t);
+		off += roundup(sz, 8);
+	}
+
+	return off;
+}
+
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c92d6af7d975..72ca27f49616 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20419,6 +20419,12 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 		}
 	}

+	/* btf_ctx_access() used this to provide argument type info */
+	prog->aux->ctx_arg_info =
+		st_ops_desc->arg_info[member_idx].info;
+	prog->aux->ctx_arg_info_size =
+		st_ops_desc->arg_info[member_idx].cnt;
+
 	prog->aux->attach_func_proto = func_proto;
 	prog->aux->attach_func_name = mname;
 	env->ops = st_ops->verifier_ops;

From 00f239eccf461a6403b3c16e767d04f3954cae98 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Thu, 8 Feb 2024 18:37:50 -0800
Subject: [PATCH 087/119] selftests/bpf: Test PTR_MAYBE_NULL arguments of struct_ops operators.

Test if the verifier verifies nullable pointer arguments correctly for
BPF struct_ops programs.

"test_maybe_null" in struct bpf_testmod_ops is the operator defined for
the test cases here.

A BPF program must check a nullable pointer argument for NULL before
accessing the memory it points to; otherwise, the verifier should reject
the program. The test here includes two parts: the programs checking
pointers properly and the programs not checking pointers beforehand. The
test checks that the verifier accepts the programs checking properly and
rejects the programs not checking at all.
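The essence of the two parts boils down to these access patterns inside
the struct_ops program (a condensed sketch; the complete programs are in
the diff below):

  /* accepted: the nullable argument is checked before the access */
  if (task)
  	tgid = task->tgid;

  /* rejected: unconditional dereference of a nullable argument */
  tgid = task->tgid;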
Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240209023750.1153905-5-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 13 +++++- .../selftests/bpf/bpf_testmod/bpf_testmod.h | 4 ++ .../prog_tests/test_struct_ops_maybe_null.c | 46 +++++++++++++++++++ .../bpf/progs/struct_ops_maybe_null.c | 29 ++++++++++++ .../bpf/progs/struct_ops_maybe_null_fail.c | 24 ++++++++++ 5 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_maybe_null.c create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_maybe_null.c create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_maybe_null_fail.c diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index a06daebc75c9..66787e99ba1b 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -555,7 +555,11 @@ static int bpf_dummy_reg(void *kdata) { struct bpf_testmod_ops *ops = kdata; - ops->test_2(4, 3); + /* Some test cases (ex. struct_ops_maybe_null) may not have test_2 + * initialized, so we need to check for NULL. + */ + if (ops->test_2) + ops->test_2(4, 3); return 0; } @@ -573,9 +577,16 @@ static void bpf_testmod_test_2(int a, int b) { } +static int bpf_testmod_ops__test_maybe_null(int dummy, + struct task_struct *task__nullable) +{ + return 0; +} + static struct bpf_testmod_ops __bpf_testmod_ops = { .test_1 = bpf_testmod_test_1, .test_2 = bpf_testmod_test_2, + .test_maybe_null = bpf_testmod_ops__test_maybe_null, }; struct bpf_struct_ops bpf_bpf_testmod_ops = { diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index 537beca42896..c3b0cf788f9f 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -5,6 +5,8 @@ #include +struct task_struct; + struct bpf_testmod_test_read_ctx { char *buf; loff_t off; @@ -31,6 +33,8 @@ struct bpf_iter_testmod_seq { struct bpf_testmod_ops { int (*test_1)(void); void (*test_2)(int a, int b); + /* Used to test nullable arguments. */ + int (*test_maybe_null)(int dummy, struct task_struct *task); }; #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_maybe_null.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_maybe_null.c new file mode 100644 index 000000000000..01dc2613c8a5 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_maybe_null.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include + +#include "struct_ops_maybe_null.skel.h" +#include "struct_ops_maybe_null_fail.skel.h" + +/* Test that the verifier accepts a program that access a nullable pointer + * with a proper check. + */ +static void maybe_null(void) +{ + struct struct_ops_maybe_null *skel; + + skel = struct_ops_maybe_null__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_and_load")) + return; + + struct_ops_maybe_null__destroy(skel); +} + +/* Test that the verifier rejects a program that access a nullable pointer + * without a check beforehand. 
+ */
+static void maybe_null_fail(void)
+{
+	struct struct_ops_maybe_null_fail *skel;
+
+	skel = struct_ops_maybe_null_fail__open_and_load();
+	if (ASSERT_ERR_PTR(skel, "struct_ops_module_fail__open_and_load"))
+		return;
+
+	struct_ops_maybe_null_fail__destroy(skel);
+}
+
+void test_struct_ops_maybe_null(void)
+{
+	/* The verifier verifies the programs at load time, so testing both
+	 * programs in the same compile-unit is complicated. We run them in
+	 * separate objects to simplify the testing.
+	 */
+	if (test__start_subtest("maybe_null"))
+		maybe_null();
+	if (test__start_subtest("maybe_null_fail"))
+		maybe_null_fail();
+}
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_maybe_null.c b/tools/testing/selftests/bpf/progs/struct_ops_maybe_null.c
new file mode 100644
index 000000000000..b450f72e744a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_maybe_null.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include
+#include
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+pid_t tgid = 0;
+
+/* This is a test BPF program that uses struct_ops to access an argument
+ * that may be NULL. This is a test for the verifier to ensure that it can
+ * rip PTR_MAYBE_NULL correctly.
+ */
+SEC("struct_ops/test_maybe_null")
+int BPF_PROG(test_maybe_null, int dummy,
+	     struct task_struct *task)
+{
+	if (task)
+		tgid = task->tgid;
+
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops testmod_1 = {
+	.test_maybe_null = (void *)test_maybe_null,
+};
+
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_maybe_null_fail.c b/tools/testing/selftests/bpf/progs/struct_ops_maybe_null_fail.c
new file mode 100644
index 000000000000..6283099ec383
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_maybe_null_fail.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include
+#include
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+pid_t tgid = 0;
+
+SEC("struct_ops/test_maybe_null_struct_ptr")
+int BPF_PROG(test_maybe_null_struct_ptr, int dummy,
+	     struct task_struct *task)
+{
+	tgid = task->tgid;
+
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops testmod_struct_ptr = {
+	.test_maybe_null = (void *)test_maybe_null_struct_ptr,
+};
+
From 32e18e7688c6847b0c9db073aafb00639ecf576c Mon Sep 17 00:00:00 2001
From: Oliver Crumrine
Date: Fri, 9 Feb 2024 14:41:22 -0500
Subject: [PATCH 088/119] bpf: remove check in __cgroup_bpf_run_filter_skb

Originally, this patch removed a redundant check in
BPF_CGROUP_RUN_PROG_INET_EGRESS, as the check was already being done in
the function it called, __cgroup_bpf_run_filter_skb. For v2, it was
recommended that I remove the check from __cgroup_bpf_run_filter_skb, and
add the checks to the other macro that calls that function,
BPF_CGROUP_RUN_PROG_INET_INGRESS.

To sum it up, checking that the socket exists and that it is a full
socket is now part of both macros BPF_CGROUP_RUN_PROG_INET_EGRESS and
BPF_CGROUP_RUN_PROG_INET_INGRESS, and it is no longer part of the
function they call, __cgroup_bpf_run_filter_skb.

v3->v4: Fixed weird merge conflict.
v2->v3: Sent to bpf-next instead of generic patch
v1->v2: Addressed feedback about where check should be removed.
Signed-off-by: Oliver Crumrine Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/7lv62yiyvmj5a7eozv2iznglpkydkdfancgmbhiptrgvgan5sy@3fl3onchgdz3 Signed-off-by: Martin KaFai Lau --- include/linux/bpf-cgroup.h | 3 ++- kernel/bpf/cgroup.c | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a789266feac3..fb3c3e7181e6 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -196,7 +196,8 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && \ - cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS)) \ + cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS) && sk && \ + sk_fullsock(sk)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ CGROUP_INET_INGRESS); \ \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 98e0e3835b28..5a568bbbeaeb 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1364,9 +1364,6 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct cgroup *cgrp; int ret; - if (!sk || !sk_fullsock(sk)) - return 0; - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; From fb5b86cfd4ef21ea18966718f6bf6c8f1b9df12e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Feb 2024 15:32:18 -0800 Subject: [PATCH 089/119] bpf: simplify btf_get_prog_ctx_type() into btf_is_prog_ctx_type() Return result of btf_get_prog_ctx_type() is never used and callers only check NULL vs non-NULL case to determine if given type matches expected PTR_TO_CTX type. So rename function to `btf_is_prog_ctx_type()` and return a simple true/false. We'll use this simpler interface to handle kprobe program type's special typedef case in the next patch. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240212233221.2575350-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 17 ++++++++--------- kernel/bpf/btf.c | 27 +++++++++++++-------------- kernel/bpf/verifier.c | 2 +- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index cb96f6263638..f9e56fd12a9f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -531,10 +531,9 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); -const struct btf_type * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type, - int arg); +bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg); int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); @@ -574,12 +573,12 @@ static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf { return NULL; } -static inline const struct btf_member * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type, - int arg) +static inline bool +btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) { - return NULL; + return false; } static inline int get_kern_ctx_btf_id(struct 
bpf_verifier_log *log, enum bpf_prog_type prog_type) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index efd9bc274be0..405f95722905 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5694,10 +5694,9 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) return ctx_type->type; } -const struct btf_type * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type, - int arg) +bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) { const struct btf_type *ctx_type; const char *tname, *ctx_tname; @@ -5711,26 +5710,26 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ - return NULL; + return false; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { bpf_log(log, "arg#%d struct doesn't have a name\n", arg); - return NULL; + return false; } ctx_type = find_canonical_prog_ctx_type(prog_type); if (!ctx_type) { bpf_log(log, "btf_vmlinux is malformed\n"); /* should not happen */ - return NULL; + return false; } again: ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!ctx_tname) { /* should not happen */ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); - return NULL; + return false; } /* only compare that prog's ctx type name is the same as * kernel expects. No need to compare field by field. @@ -5740,20 +5739,20 @@ again: * { // no fields of skb are ever used } */ if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) - return ctx_type; + return true; if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) - return ctx_type; + return true; if (strcmp(ctx_tname, tname)) { /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again */ if (!btf_type_is_modifier(ctx_type)) - return NULL; + return false; while (btf_type_is_modifier(ctx_type)) ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); goto again; } - return ctx_type; + return true; } /* forward declarations for arch-specific underlying types of @@ -5905,7 +5904,7 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, enum bpf_prog_type prog_type, int arg) { - if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg)) + if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg)) return -ENOENT; return find_kern_ctx_type_id(prog_type); } @@ -7211,7 +7210,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) if (!btf_type_is_ptr(t)) goto skip_pointer; - if ((tags & ARG_TAG_CTX) || btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) { if (tags & ~ARG_TAG_CTX) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 72ca27f49616..aa192dc735a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11015,7 +11015,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, * type to our caller. When a set of conditions hold in the BTF type of * arguments, we resolve it to a known kfunc_ptr_arg_type. 
*/ - if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) + if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) From 824c58fb1090ae5e502284400682e30841280a87 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Feb 2024 15:32:19 -0800 Subject: [PATCH 090/119] bpf: handle bpf_user_pt_regs_t typedef explicitly for PTR_TO_CTX global arg Expected canonical argument type for global function arguments representing PTR_TO_CTX is `bpf_user_pt_regs_t *ctx`. This currently works on s390x by accident because kernel resolves such typedef to underlying struct (which is anonymous on s390x), and erroneously accepting it as expected context type. We are fixing this problem next, which would break s390x arch, so we need to handle `bpf_user_pt_regs_t` case explicitly for KPROBE programs. Fixes: 91cc1a99740e ("bpf: Annotate context types") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240212233221.2575350-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 405f95722905..26dc0876e426 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5702,6 +5702,21 @@ bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const char *tname, *ctx_tname; t = btf_type_by_id(btf, t->type); + + /* KPROBE programs allow bpf_user_pt_regs_t typedef, which we need to + * check before we skip all the typedef below. + */ + if (prog_type == BPF_PROG_TYPE_KPROBE) { + while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) + t = btf_type_by_id(btf, t->type); + + if (btf_type_is_typedef(t)) { + tname = btf_name_by_offset(btf, t->name_off); + if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) + return true; + } + } + while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_struct(t)) { From 879bbe7aa4afa80acf72a1cad7f52416ea78c52d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Feb 2024 15:32:20 -0800 Subject: [PATCH 091/119] bpf: don't infer PTR_TO_CTX for programs with unnamed context type For program types that don't have named context type name (e.g., BPF iterator programs or tracepoint programs), ctx_tname will be a non-NULL empty string. For such programs it shouldn't be possible to have PTR_TO_CTX argument for global subprogs based on type name alone. arg:ctx tag is the only way to have PTR_TO_CTX passed into global subprog for such program types. Fix this loophole, which currently would assume PTR_TO_CTX whenever user uses a pointer to anonymous struct as an argument to their global subprogs. This happens in practice with the following (quite common, in practice) approach: typedef struct { /* anonymous */ int x; } my_type_t; int my_subprog(my_type_t *arg) { ... } User's intent is to have PTR_TO_MEM argument for `arg`, but verifier will complain about expecting PTR_TO_CTX. This fix also closes unintended s390x-specific KPROBE handling of PTR_TO_CTX case. Selftest change is necessary to accommodate this. 
Fixes: 91cc1a99740e ("bpf: Annotate context types") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240212233221.2575350-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 3 +++ .../bpf/progs/test_global_func_ctx_args.c | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 26dc0876e426..6ff0bd1a91d5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5746,6 +5746,9 @@ again: bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); return false; } + /* program types without named context types work only with arg:ctx tag */ + if (ctx_tname[0] == '\0') + return false; /* only compare that prog's ctx type name is the same as * kernel expects. No need to compare field by field. * It's ok for bpf prog to do: diff --git a/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c b/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c index 9a06e5eb1fbe..143c8a4852bf 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c +++ b/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c @@ -26,6 +26,23 @@ int kprobe_typedef_ctx(void *ctx) return kprobe_typedef_ctx_subprog(ctx); } +/* s390x defines: + * + * typedef user_pt_regs bpf_user_pt_regs_t; + * typedef struct { ... } user_pt_regs; + * + * And so "canonical" underlying struct type is anonymous. + * So on s390x only valid ways to have PTR_TO_CTX argument in global subprogs + * are: + * - bpf_user_pt_regs_t *ctx (typedef); + * - struct bpf_user_pt_regs_t *ctx (backwards compatible struct hack); + * - void *ctx __arg_ctx (arg:ctx tag) + * + * Other architectures also allow using underlying struct types (e.g., + * `struct pt_regs *ctx` for x86-64) + */ +#ifndef bpf_target_s390 + #define pt_regs_struct_t typeof(*(__PT_REGS_CAST((struct pt_regs *)NULL))) __weak int kprobe_struct_ctx_subprog(pt_regs_struct_t *ctx) @@ -40,6 +57,8 @@ int kprobe_resolved_ctx(void *ctx) return kprobe_struct_ctx_subprog(ctx); } +#endif + /* this is current hack to make this work on old kernels */ struct bpf_user_pt_regs_t {}; From 63d5a33fb4ec2a4ed6907c8ac144b6f10f6dba47 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Feb 2024 15:32:21 -0800 Subject: [PATCH 092/119] selftests/bpf: add anonymous user struct as global subprog arg test Add tests validating that kernel handles pointer to anonymous struct argument as PTR_TO_MEM case, not as PTR_TO_CTX case. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240212233221.2575350-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_global_subprogs.c | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index 67dddd941891..baff5ffe9405 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -115,6 +115,35 @@ int arg_tag_nullable_ptr_fail(void *ctx) return subprog_nullable_ptr_bad(&x); } +typedef struct { + int x; +} user_struct_t; + +__noinline __weak int subprog_user_anon_mem(user_struct_t *t) +{ + return t ? 
t->x : 0;
+}
+
+SEC("?tracepoint")
+__failure __log_level(2)
+__msg("invalid bpf_context access")
+__msg("Caller passes invalid args into func#1 ('subprog_user_anon_mem')")
+int anon_user_mem_invalid(void *ctx)
+{
+	/* can't pass PTR_TO_CTX as user memory */
+	return subprog_user_anon_mem(ctx);
+}
+
+SEC("?tracepoint")
+__success __log_level(2)
+__msg("Func#1 ('subprog_user_anon_mem') is safe for any args that match its prototype")
+int anon_user_mem_valid(void *ctx)
+{
+	user_struct_t t = { .x = 42 };
+
+	return subprog_user_anon_mem(&t);
+}
+
 __noinline __weak int subprog_nonnull_ptr_good(int *p1 __arg_nonnull, int *p2 __arg_nonnull)
 {
 	return (*p1) * (*p2); /* good, no need for NULL checks */

From 7cc13adbd057f1905564ec2a254883d7fd407deb Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 12 Feb 2024 15:59:44 -0800
Subject: [PATCH 093/119] bpf: emit source code file name and line number in verifier log

As BPF applications grow in size and complexity and are separated into
multiple .bpf.c files that are statically linked together, it becomes
harder and harder to match verifier's BPF assembly level output to
original C code. While often annotated C source code is unique enough to
be able to identify the file it belongs to, quite often this is actually
problematic as parts of source code can be quite generic.

Long story short, it is very useful to see source code file name and line
number information along with the original C code. Verifier already knows
this information, we just need to output it.

This patch extends verifier log with file name and line number
information, emitted next to original (presumably C) source code,
annotating BPF assembly output, like so:

  ; <original C code> @ <file name>.bpf.c:<line number>

If file name has directory names in it, they are stripped away. This
should be fine in practice as file names tend to be pretty unique with C
code anyways, and keeping log size smaller is always good.

In practice this might look something like below, where some code is
coming from application files, while others are from libbpf's usdt.bpf.h
header file:

 ; if (STROBEMETA_READ( @ strobemeta_probe.bpf.c:534
5592: (79) r1 = *(u64 *)(r10 -56)       ; R1_w=mem_or_null(id=1589,sz=7680) R10=fp0
5593: (7b) *(u64 *)(r10 -56) = r1       ; R1_w=mem_or_null(id=1589,sz=7680) R10=fp0
5594: (79) r3 = *(u64 *)(r10 -8)        ; R3_w=scalar() R10=fp0 fp-8=mmmmmmmm

...

170: (71) r1 = *(u8 *)(r8 +15)          ; frame1: R1_w=scalar(...) R8_w=map_value(map=__bpf_usdt_spec,ks=4,vs=208)
171: (67) r1 <<= 56                     ; frame1: R1_w=scalar(...)
172: (c7) r1 s>>= 56                    ; frame1: R1_w=scalar(smin=smin32=-128,smax=smax32=127)
; val <<= arg_spec->arg_bitshift; @ usdt.bpf.h:183
173: (67) r1 <<= 32                     ; frame1: R1_w=scalar(...)
174: (77) r1 >>= 32                     ; frame1: R1_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
175: (79) r2 = *(u64 *)(r10 -8)         ; frame1: R2_w=scalar() R10=fp0 fp-8=mmmmmmmm
176: (6f) r2 <<= r1                     ; frame1: R1_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R2_w=scalar()
177: (7b) *(u64 *)(r10 -8) = r2         ; frame1: R2_w=scalar(id=61) R10=fp0 fp-8_w=scalar(id=61)
; if (arg_spec->arg_signed) @ usdt.bpf.h:184
178: (bf) r3 = r2                       ; frame1: R2_w=scalar(id=61) R3_w=scalar(id=61)
179: (7f) r3 >>= r1                     ; frame1: R1_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R3_w=scalar()
; if (arg_spec->arg_signed) @ usdt.bpf.h:184
180: (71) r4 = *(u8 *)(r8 +14)
181: safe

log_fixup tests needed a minor adjustment as verifier log output
increased a bit and that test is quite sensitive to such changes.
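To actually see these annotations, the program has to be loaded with
log_level >= 1. One way to do that is via libbpf's low-level loader
options (a sketch; insns/insn_cnt are assumed to hold the program's
instructions, and error handling is elided):

  char log_buf[64 * 1024];
  LIBBPF_OPTS(bpf_prog_load_opts, opts,
  	.log_level = 1,
  	.log_buf = log_buf,
  	.log_size = sizeof(log_buf),
  );

  int prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "my_prog", "GPL",
  			      insns, insn_cnt, &opts);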
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20240212235944.2816107-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/log.c | 15 ++++++++++++---
 .../testing/selftests/bpf/prog_tests/log_fixup.c | 4 ++--
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 594a234f122b..cc789efc7f43 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include

 #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)

@@ -362,6 +363,8 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
 				  const char *prefix_fmt, ...)
 {
 	const struct bpf_line_info *linfo;
+	const struct btf *btf;
+	const char *s, *fname;

 	if (!bpf_verifier_log_needed(&env->log))
 		return;
@@ -378,9 +381,15 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
 		va_end(args);
 	}

-	verbose(env, "%s\n",
-		ltrim(btf_name_by_offset(env->prog->aux->btf,
-					 linfo->line_off)));
+	btf = env->prog->aux->btf;
+	s = ltrim(btf_name_by_offset(btf, linfo->line_off));
+	verbose(env, "%s", s); /* source code line */
+
+	s = btf_name_by_offset(btf, linfo->file_name_off);
+	/* leave only file name */
+	fname = strrchr(s, '/');
+	fname = fname ? fname + 1 : s;
+	verbose(env, " @ %s:%u\n", fname, BPF_LINE_INFO_LINE_NUM(linfo->line_col));

 	env->prev_linfo = linfo;
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/log_fixup.c b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
index 7a3fa2ff567b..90a98e23be61 100644
--- a/tools/testing/selftests/bpf/prog_tests/log_fixup.c
+++ b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
@@ -169,9 +169,9 @@ void test_log_fixup(void)
 	if (test__start_subtest("bad_core_relo_trunc_none"))
 		bad_core_relo(0, TRUNC_NONE /* full buf */);
 	if (test__start_subtest("bad_core_relo_trunc_partial"))
-		bad_core_relo(280, TRUNC_PARTIAL /* truncate original log a bit */);
+		bad_core_relo(300, TRUNC_PARTIAL /* truncate original log a bit */);
 	if (test__start_subtest("bad_core_relo_trunc_full"))
-		bad_core_relo(220, TRUNC_FULL /* truncate also libbpf's message patch */);
+		bad_core_relo(240, TRUNC_FULL /* truncate also libbpf's message patch */);
 	if (test__start_subtest("bad_core_relo_subprog"))
 		bad_core_relo_subprog();
 	if (test__start_subtest("missing_map"))

From 1159d27852207e8efb8d6ef2dae5aaa87ec4e225 Mon Sep 17 00:00:00 2001
From: Matt Bobrowski
Date: Wed, 14 Feb 2024 09:14:23 +0000
Subject: [PATCH 094/119] libbpf: Make remark about zero-initializing bpf_*_info structs

In some situations, if you fail to zero-initialize the
bpf_{prog,map,btf,link}_info structs supplied to the set of LIBBPF
helpers bpf_{prog,map,btf,link}_get_info_by_fd(), you can expect the
helper to return an error. This can possibly leave people in a situation
where they're scratching their heads for an unnecessary amount of time.
Make an explicit remark about the requirement of zero-initializing the
supplied bpf_{prog,map,btf,link}_info structs for the respective LIBBPF
helpers.

Internally, LIBBPF helpers bpf_{prog,map,btf,link}_get_info_by_fd() call
into bpf_obj_get_info_by_fd() where the bpf(2) BPF_OBJ_GET_INFO_BY_FD
command is used. This specific command is effectively backed by
restrictions enforced by the bpf_check_uarg_tail_zero() helper. This
function ensures that if the supplied bpf_{prog,map,btf,link}_info
structs are larger than what the kernel can handle, trailing bits are
zeroed.
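In practice this means callers are expected to zero-initialize the info
struct before the call, along the lines of (a sketch using the helper
signature documented below; error handling elided):

  struct bpf_prog_info info = {};	/* the zero-initialization in question */
  __u32 info_len = sizeof(info);

  if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len))
  	/* handle error */;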
This can be a problem when compiling against UAPI headers that don't necessarily match the sizes of the same underlying types known to the kernel. Signed-off-by: Matt Bobrowski Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/ZcyEb8x4VbhieWsL@google.com --- tools/lib/bpf/bpf.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index f866e98b2436..ab2570d28aec 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -500,7 +500,10 @@ LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); * program corresponding to *prog_fd*. * * Populates up to *info_len* bytes of *info* and updates *info_len* with the - * actual number of bytes written to *info*. + * actual number of bytes written to *info*. Note that *info* should be + * zero-initialized or initialized as expected by the requested *info* + * type. Failing to (zero-)initialize *info* under certain circumstances can + * result in this helper returning an error. * * @param prog_fd BPF program file descriptor * @param info pointer to **struct bpf_prog_info** that will be populated with @@ -517,7 +520,10 @@ LIBBPF_API int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, * map corresponding to *map_fd*. * * Populates up to *info_len* bytes of *info* and updates *info_len* with the - * actual number of bytes written to *info*. + * actual number of bytes written to *info*. Note that *info* should be + * zero-initialized or initialized as expected by the requested *info* + * type. Failing to (zero-)initialize *info* under certain circumstances can + * result in this helper returning an error. * * @param map_fd BPF map file descriptor * @param info pointer to **struct bpf_map_info** that will be populated with @@ -530,11 +536,14 @@ LIBBPF_API int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, LIBBPF_API int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len); /** - * @brief **bpf_btf_get_info_by_fd()** obtains information about the + * @brief **bpf_btf_get_info_by_fd()** obtains information about the * BTF object corresponding to *btf_fd*. * * Populates up to *info_len* bytes of *info* and updates *info_len* with the - * actual number of bytes written to *info*. + * actual number of bytes written to *info*. Note that *info* should be + * zero-initialized or initialized as expected by the requested *info* + * type. Failing to (zero-)initialize *info* under certain circumstances can + * result in this helper returning an error. * * @param btf_fd BTF object file descriptor * @param info pointer to **struct bpf_btf_info** that will be populated with @@ -551,7 +560,10 @@ LIBBPF_API int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u * link corresponding to *link_fd*. * * Populates up to *info_len* bytes of *info* and updates *info_len* with the - * actual number of bytes written to *info*. + * actual number of bytes written to *info*. Note that *info* should be + * zero-initialized or initialized as expected by the requested *info* + * type. Failing to (zero-)initialize *info* under certain circumstances can + * result in this helper returning an error. 
* * @param link_fd BPF link file descriptor * @param info pointer to **struct bpf_link_info** that will be populated with From a4561f5afef8a8ff25a2cfd46d587f65869494f2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 13 Feb 2024 16:23:11 -0800 Subject: [PATCH 095/119] bpf: Use O(log(N)) binary search to find line info record Real-world BPF applications keep growing in size. Medium-sized production application can easily have 50K+ verified instructions, and its line info section in .BTF.ext has more than 3K entries. When verifier emits log with log_level>=1, it annotates assembly code with matched original C source code. Currently it uses linear search over line info records to find a match. As complexity of BPF applications grows, this O(K * N) approach scales poorly. So, let's instead of linear O(N) search for line info record use faster equivalent O(log(N)) binary search algorithm. It's not a plain binary search, as we don't look for exact match. It's an upper bound search variant, looking for rightmost line info record that starts at or before given insn_off. Some unscientific measurements were done before and after this change. They were done in VM and fluctuate a bit, but overall the speed up is undeniable. BASELINE ======== File Program Duration (us) Insns -------------------------------- ---------------- ------------- ------ katran.bpf.o balancer_ingress 2497130 343552 pyperf600.bpf.linked3.o on_event 12389611 627288 strobelight_pyperf_libbpf.o on_py_event 387399 52445 -------------------------------- ---------------- ------------- ------ BINARY SEARCH ============= File Program Duration (us) Insns -------------------------------- ---------------- ------------- ------ katran.bpf.o balancer_ingress 2339312 343552 pyperf600.bpf.linked3.o on_event 5602203 627288 strobelight_pyperf_libbpf.o on_py_event 294761 52445 -------------------------------- ---------------- ------------- ------ While Katran's speed up is pretty modest (about 105ms, or 6%), for production pyperf BPF program (on_py_event) it's much greater already, going from 387ms down to 295ms (23% improvement). Looking at BPF selftests's biggest pyperf example, we can see even more dramatic improvement, shaving more than 50% of time, going from 12.3s down to 5.6s. Different amount of improvement is the function of overall amount of BPF assembly instructions in .bpf.o files (which contributes to how much line info records there will be and thus, on average, how much time linear search will take), among other things: $ llvm-objdump -d katran.bpf.o | wc -l 3863 $ llvm-objdump -d strobelight_pyperf_libbpf.o | wc -l 6997 $ llvm-objdump -d pyperf600.bpf.linked3.o | wc -l 87854 Granted, this only applies to debugging cases (e.g., using veristat, or failing verification in production), but seems worth doing to improve overall developer experience anyways. 
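The upper-bound variant described above, sketched over a plain array (this mirrors the kernel change in the diff below):

	/* rightmost i such that off[i] <= insn_off; assumes off[0] == 0 */
	int l = 0, r = n - 1, m;

	while (l < r) {
		m = l + (r - l + 1) / 2;	/* break ties to the right */
		if (off[m] <= insn_off)
			l = m;			/* invariant: off[l] <= insn_off */
		else
			r = m - 1;
	}
	/* off[l] is the record covering insn_off */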
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240214002311.2197116-1-andrii@kernel.org --- kernel/bpf/log.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index cc789efc7f43..995e3b82e17b 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -334,7 +334,8 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off) { const struct bpf_line_info *linfo; const struct bpf_prog *prog; - u32 i, nr_linfo; + u32 nr_linfo; + int l, r, m; prog = env->prog; nr_linfo = prog->aux->nr_linfo; @@ -343,11 +344,30 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off) return NULL; linfo = prog->aux->linfo; - for (i = 1; i < nr_linfo; i++) - if (insn_off < linfo[i].insn_off) - break; + /* Loop invariant: linfo[l].insn_off <= insns_off. + * linfo[0].insn_off == 0 which always satisfies above condition. + * Binary search is searching for rightmost linfo entry that satisfies + * the above invariant, giving us the desired record that covers given + * instruction offset. + */ + l = 0; + r = nr_linfo - 1; + while (l < r) { + /* (r - l + 1) / 2 means we break a tie to the right, so if: + * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off, + * then m=2, we see that linfo[m].insn_off > insn_off, and so + * r becomes 1 and we exit the loop with correct l==1. + * If the tie was broken to the left, m=1 would end us up in + * an endless loop where l and m stay at 1 and r stays at 2. + */ + m = l + (r - l + 1) / 2; + if (linfo[m].insn_off <= insn_off) + l = m; + else + r = m - 1; + } - return &linfo[i - 1]; + return &linfo[l]; } static const char *ltrim(const char *s) From 57354f5fdee8017783b5cc2e53b263641b6862e9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 14 Feb 2024 09:41:00 -0800 Subject: [PATCH 096/119] bpf: improve duplicate source code line detection Verifier log avoids printing the same source code line multiple times when a consecutive block of BPF assembly instructions are covered by the same original (C) source code line. This greatly improves verifier log legibility. Unfortunately, this check is imperfect and in production applications it quite often happens that verifier log will have multiple duplicated source lines emitted, for no apparently good reason. 
E.g., this is excerpt from a real-world BPF application (with register states omitted for clarity): BEFORE ====== ; for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { @ strobemeta_probe.bpf.c:394 5369: (07) r8 += 2 ; 5370: (07) r7 += 16 ; ; for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { @ strobemeta_probe.bpf.c:394 5371: (07) r9 += 1 ; 5372: (79) r4 = *(u64 *)(r10 -32) ; ; for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { @ strobemeta_probe.bpf.c:394 5373: (55) if r9 != 0xf goto pc+2 ; if (i >= map->cnt) @ strobemeta_probe.bpf.c:396 5376: (79) r1 = *(u64 *)(r10 -40) ; 5377: (79) r1 = *(u64 *)(r1 +8) ; ; if (i >= map->cnt) @ strobemeta_probe.bpf.c:396 5378: (dd) if r1 s<= r9 goto pc-5 ; ; descr->key_lens[i] = 0; @ strobemeta_probe.bpf.c:398 5379: (b4) w1 = 0 ; 5380: (6b) *(u16 *)(r8 -30) = r1 ; ; task, data, off, STROBE_MAX_STR_LEN, map->entries[i].key); @ strobemeta_probe.bpf.c:400 5381: (79) r3 = *(u64 *)(r7 -8) ; 5382: (7b) *(u64 *)(r10 -24) = r6 ; ; task, data, off, STROBE_MAX_STR_LEN, map->entries[i].key); @ strobemeta_probe.bpf.c:400 5383: (bc) w6 = w6 ; ; barrier_var(payload_off); @ strobemeta_probe.bpf.c:280 5384: (bf) r2 = r6 ; 5385: (bf) r1 = r4 ; As can be seen, line 394 is emitted thrice, 396 is emitted twice, and line 400 is duplicated as well. Note that there are no intermingling other lines of source code in between these duplicates, so the issue is not compiler reordering assembly instruction such that multiple original source code lines are in effect. It becomes more obvious what's going on if we look at *full* original line info information (using btfdump for this, [0]): #2764: line: insn #5363 --> 394:3 @ ./././strobemeta_probe.bpf.c for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { #2765: line: insn #5373 --> 394:21 @ ./././strobemeta_probe.bpf.c for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { #2766: line: insn #5375 --> 394:47 @ ./././strobemeta_probe.bpf.c for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { #2767: line: insn #5377 --> 394:3 @ ./././strobemeta_probe.bpf.c for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { #2768: line: insn #5378 --> 414:10 @ ./././strobemeta_probe.bpf.c return off; We can see that there are four line info records covering instructions #5363 through #5377 (instruction indices are shifted due to subprog instruction being appended to main program), all of them are pointing to the same C source code line #394. But each of them points to a different part of that line, which is denoted by differing column numbers (3, 21, 47, 3). But verifier log doesn't distinguish between parts of the same source code line and doesn't emit this column number information, so for end user it's just a repetitive visual noise. So let's improve the detection of repeated source code line and avoid this. 
With the changes in this patch, we get this output for the same piece of BPF program log: AFTER ===== ; for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { @ strobemeta_probe.bpf.c:394 5369: (07) r8 += 2 ; 5370: (07) r7 += 16 ; 5371: (07) r9 += 1 ; 5372: (79) r4 = *(u64 *)(r10 -32) ; 5373: (55) if r9 != 0xf goto pc+2 ; if (i >= map->cnt) @ strobemeta_probe.bpf.c:396 5376: (79) r1 = *(u64 *)(r10 -40) ; 5377: (79) r1 = *(u64 *)(r1 +8) ; 5378: (dd) if r1 s<= r9 goto pc-5 ; ; descr->key_lens[i] = 0; @ strobemeta_probe.bpf.c:398 5379: (b4) w1 = 0 ; 5380: (6b) *(u16 *)(r8 -30) = r1 ; ; task, data, off, STROBE_MAX_STR_LEN, map->entries[i].key); @ strobemeta_probe.bpf.c:400 5381: (79) r3 = *(u64 *)(r7 -8) ; 5382: (7b) *(u64 *)(r10 -24) = r6 ; 5383: (bc) w6 = w6 ; ; barrier_var(payload_off); @ strobemeta_probe.bpf.c:280 5384: (bf) r2 = r6 ; 5385: (bf) r1 = r4 ; All the duplication is gone and the log is cleaner and less distracting. [0] https://github.com/anakryiko/btfdump Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240214174100.2847419-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 995e3b82e17b..63c34e7b0715 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -382,15 +382,28 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, u32 insn_off, const char *prefix_fmt, ...) { - const struct bpf_line_info *linfo; + const struct bpf_line_info *linfo, *prev_linfo; const struct btf *btf; const char *s, *fname; if (!bpf_verifier_log_needed(&env->log)) return; + prev_linfo = env->prev_linfo; linfo = find_linfo(env, insn_off); - if (!linfo || linfo == env->prev_linfo) + if (!linfo || linfo == prev_linfo) + return; + + /* It often happens that two separate linfo records point to the same + * source code line, but have differing column numbers. Given verifier + * log doesn't emit column information, from user perspective we just + * end up emitting the same source code line twice unnecessarily. + * So instead check that previous and current linfo record point to + * the same file (file_name_offs match) and the same line number, and + * avoid emitting duplicated source code line in such case. + */ + if (prev_linfo && linfo->file_name_off == prev_linfo->file_name_off && + BPF_LINE_INFO_LINE_NUM(linfo->line_col) == BPF_LINE_INFO_LINE_NUM(prev_linfo->line_col)) return; if (prefix_fmt) { From 682158ab532a5bd24399fec25b65fec561f0f6e9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 14 Feb 2024 15:29:51 -0800 Subject: [PATCH 097/119] bpf: Fix test verif_scale_strobemeta_subprogs failure due to llvm19 With latest llvm19, I hit the following selftest failures with $ ./test_progs -j libbpf: prog 'on_event': BPF program load failed: Permission denied libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- combined stack size of 4 calls is 544. Too large verification time 1344153 usec stack depth 24+440+0+32 processed 51008 insns (limit 1000000) max_states_per_insn 19 total_states 1467 peak_states 303 mark_read 146 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -13 libbpf: failed to load object 'strobemeta_subprogs.bpf.o' scale_test:FAIL:expect_success unexpected error: -13 (errno 13) #498 verif_scale_strobemeta_subprogs:FAIL The verifier complains too big of the combined stack size (544 bytes) which exceeds the maximum stack limit 512. This is a regression from llvm19 ([1]). 
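For reference when following the arithmetic below, the rounding rule this patch introduces (see round_up_stack_depth() in the diff) can be summarized as a sketch:

	/* per-subprog stack depth rounding, sketch */
	depth = jit_requested ? round_up(depth, 16)		    /* JIT */
			      : round_up(max_t(u32, depth, 1), 32); /* interpreter */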
In the above error log, the original stack depth is 24+440+0+32. To satisfy interpreter's need, in verifier the stack depth is adjusted to 32+448+32+32=544 which exceeds 512, hence the error. The same adjusted stack size is also used for jit case. But the jitted codes could use smaller stack size. $ egrep -r stack_depth | grep round_up arm64/net/bpf_jit_comp.c: ctx->stack_size = round_up(prog->aux->stack_depth, 16); loongarch/net/bpf_jit.c: bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); powerpc/net/bpf_jit_comp.c: cgctx.stack_size = round_up(fp->aux->stack_depth, 16); riscv/net/bpf_jit_comp32.c: round_up(ctx->prog->aux->stack_depth, STACK_ALIGN); riscv/net/bpf_jit_comp64.c: bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); s390/net/bpf_jit_comp.c: u32 stack_depth = round_up(fp->aux->stack_depth, 8); sparc/net/bpf_jit_comp_64.c: stack_needed += round_up(stack_depth, 16); x86/net/bpf_jit_comp.c: EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); x86/net/bpf_jit_comp.c: int tcc_off = -4 - round_up(stack_depth, 8); x86/net/bpf_jit_comp.c: round_up(stack_depth, 8)); x86/net/bpf_jit_comp.c: int tcc_off = -4 - round_up(stack_depth, 8); x86/net/bpf_jit_comp.c: EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); In the above, STACK_ALIGN in riscv/net/bpf_jit_comp32.c is defined as 16. So stack is aligned in either 8 or 16, x86/s390 having 8-byte stack alignment and the rest having 16-byte alignment. This patch calculates total stack depth based on 16-byte alignment if jit is requested. For the above failing case, the new stack size will be 32+448+0+32=512 and no verification failure. llvm19 regression will be discussed separately in llvm upstream. The verifier change caused three test failures as these tests compared messages with stack size. More specifically, - test_global_funcs/global_func1: fail with interpreter mode and success with jit mode. Adjusted stack sizes so both jit and interpreter modes will fail. - async_stack_depth/{pseudo_call_check, async_call_root_check}: since jit and interpreter will calculate different stack sizes, the failure msg is adjusted to omit those specific stack size numbers. [1] https://lore.kernel.org/bpf/32bde0f0-1881-46c9-931a-673be566c61d@linux.dev/ Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240214232951.4113094-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 18 +++++++++++++----- .../selftests/bpf/progs/async_stack_depth.c | 4 ++-- .../selftests/bpf/progs/test_global_func1.c | 8 ++++++-- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aa192dc735a9..011d54a1dc53 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5812,6 +5812,17 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) +{ + if (env->prog->jit_requested) + return round_up(stack_depth, 16); + + /* round up to 32-bytes, since this is granularity + * of interpreter stack size + */ + return round_up(max_t(u32, stack_depth, 1), 32); +} + /* starting from main bpf function walk all instructions of the function * and recursively walk all callees that given function can call. * Ignore jump and exit insns. 
@@ -5855,10 +5866,7 @@ process_func: depth); return -EACCES; } - /* round up to 32-bytes, since this is granularity - * of interpreter stack size - */ - depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); + depth += round_up_stack_depth(env, subprog[idx].stack_depth); if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", frame + 1, depth); @@ -5952,7 +5960,7 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); + depth -= round_up_stack_depth(env, subprog[idx].stack_depth); frame--; i = ret_insn[frame]; idx = ret_prog[frame]; diff --git a/tools/testing/selftests/bpf/progs/async_stack_depth.c b/tools/testing/selftests/bpf/progs/async_stack_depth.c index 3517c0e01206..36734683acbd 100644 --- a/tools/testing/selftests/bpf/progs/async_stack_depth.c +++ b/tools/testing/selftests/bpf/progs/async_stack_depth.c @@ -30,7 +30,7 @@ static int bad_timer_cb(void *map, int *key, struct bpf_timer *timer) } SEC("tc") -__failure __msg("combined stack size of 2 calls is 576. Too large") +__failure __msg("combined stack size of 2 calls is") int pseudo_call_check(struct __sk_buff *ctx) { struct hmap_elem *elem; @@ -45,7 +45,7 @@ int pseudo_call_check(struct __sk_buff *ctx) } SEC("tc") -__failure __msg("combined stack size of 2 calls is 608. Too large") +__failure __msg("combined stack size of 2 calls is") int async_call_root_check(struct __sk_buff *ctx) { struct hmap_elem *elem; diff --git a/tools/testing/selftests/bpf/progs/test_global_func1.c b/tools/testing/selftests/bpf/progs/test_global_func1.c index 17a9f59bf5f3..fc69ff18880d 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func1.c +++ b/tools/testing/selftests/bpf/progs/test_global_func1.c @@ -5,7 +5,7 @@ #include #include "bpf_misc.h" -#define MAX_STACK (512 - 3 * 32 + 8) +#define MAX_STACK 260 static __attribute__ ((noinline)) int f0(int var, struct __sk_buff *skb) @@ -30,6 +30,10 @@ int f3(int, struct __sk_buff *skb, int); __attribute__ ((noinline)) int f2(int val, struct __sk_buff *skb) { + volatile char buf[MAX_STACK] = {}; + + __sink(buf[MAX_STACK - 1]); + return f1(skb) + f3(val, skb, 1); } @@ -44,7 +48,7 @@ int f3(int val, struct __sk_buff *skb, int var) } SEC("tc") -__failure __msg("combined stack size of 4 calls is 544") +__failure __msg("combined stack size of 3 calls is") int global_func1(struct __sk_buff *skb) { return f0(1, skb) + f1(skb) + f2(2, skb) + f3(3, skb, 4); From 7648f0c91eaa3598add9e91991a5483b29da32ee Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 16 Feb 2024 09:42:45 -0300 Subject: [PATCH 098/119] selftests/bpf: Remove empty TEST_CUSTOM_PROGS Commit f04a32b2c5b5 ("selftests/bpf: Do not use sign-file as testcase") removed the TEST_CUSTOM_PROGS assignment, and removed it from being used on TEST_GEN_FILES. Remove two leftovers from that cleanup. Found by inspection. Signed-off-by: Marcos Paulo de Souza Signed-off-by: Daniel Borkmann Cc: Alexey Gladkov Link: https://lore.kernel.org/bpf/20240216-bpf-selftests-custom-progs-v1-1-f7cf281a1fda@suse.com --- tools/testing/selftests/bpf/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a38a3001527c..dbb8c5f94f34 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -197,8 +197,7 @@ endif # NOTE: Semicolon at the end is critical to override lib.mk's default static # rule for binaries. 
$(notdir $(TEST_GEN_PROGS) \ - $(TEST_GEN_PROGS_EXTENDED) \ - $(TEST_CUSTOM_PROGS)): %: $(OUTPUT)/% ; + $(TEST_GEN_PROGS_EXTENDED)): %: $(OUTPUT)/% ; # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -752,7 +751,7 @@ $(OUTPUT)/uprobe_multi: uprobe_multi.c $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ -EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ +EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature bpftool \ $(addprefix $(OUTPUT)/,*.o *.skel.h *.lskel.h *.subskel.h \ From 01dbd7d8720a0cc97dad5e70ec674dacdc66cf3c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Feb 2024 15:11:02 -0800 Subject: [PATCH 099/119] selftests/bpf: Remove intermediate test files. The test of linking process creates several intermediate files. Remove them once the build is over. This reduces the number of files in selftests/bpf/ directory from ~4400 to ~2600. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240220231102.49090-1-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dbb8c5f94f34..9be69ff701ba 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -536,6 +536,7 @@ $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$@ $(Q)$$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$(@:.skel.h=.subskel.h) + $(Q)rm -f $$(<:.o=.linked1.o) $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) @@ -544,6 +545,7 @@ $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) $(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ + $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.bpf.o)) @@ -554,6 +556,7 @@ $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) $(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ $(Q)$$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h) + $(Q)rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) endif # ensure we set up tests.h header generation rule just once From a3c70a3cf11eb4b6409afc2cce1a3747e1dfe96f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Feb 2024 15:50:01 -0800 Subject: [PATCH 100/119] bpf: Shrink size of struct bpf_map/bpf_array. Back in 2018 the commit be95a845cc44 ("bpf: avoid false sharing of map refcount with max_entries") added ____cacheline_aligned to "struct bpf_map" to make sure that fields like refcnt don't share a cache line with max_entries that is used to bounds check map access. 
That was done to make Spectre-style attacks harder. The main mitigation is done via code similar to array_index_nospec(), of course. This was an additional precaution. It increased the size of "struct bpf_map" a little, but its effect on all other maps (like array) is significant, since "struct bpf_map" is typically the first member in other map types. Undo this ____cacheline_aligned tag. Instead move the freeze_mutex field around, so that refcnt and max_entries are still in different cache lines. The main effect is seen in sizeof(struct bpf_array), which reduces from 320 to 248 bytes. BEFORE: struct bpf_map { const struct bpf_map_ops * ops; /* 0 8 */ ... char name[16]; /* 96 16 */ /* XXX 16 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ ... /* size: 256, cachelines: 4, members: 30 */ /* sum members: 232, holes: 1, sum holes: 16 */ /* padding: 8 */ /* paddings: 1, sum paddings: 2 */ } __attribute__((__aligned__(64))); struct bpf_array { struct bpf_map map; /* 0 256 */ ... /* size: 320, cachelines: 5, members: 5 */ /* padding: 48 */ /* paddings: 1, sum paddings: 8 */ } __attribute__((__aligned__(64))); AFTER: struct bpf_map { /* size: 232, cachelines: 4, members: 30 */ /* paddings: 1, sum paddings: 2 */ /* last cacheline: 40 bytes */ }; struct bpf_array { /* size: 248, cachelines: 4, members: 5 */ /* last cacheline: 56 bytes */ }; Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240220235001.57411-1-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c7aa99b44dbd..814dc913a968 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -251,10 +251,7 @@ struct bpf_list_node_kern { } __attribute__((aligned(8))); struct bpf_map { - /* The first two cachelines with read-mostly members of which some - * are also accessed in fast-path (e.g. ops, max_entries). - */ - const struct bpf_map_ops *ops ____cacheline_aligned; + const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY void *security; @@ -276,17 +273,14 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - /* The 3rd and 4th cacheline with misc members to avoid false sharing - * particularly with refcounting.
- */ - atomic64_t refcnt ____cacheline_aligned; + struct mutex freeze_mutex; + atomic64_t refcnt; atomic64_t usercnt; /* rcu is used before freeing and work is only used during freeing */ union { struct work_struct work; struct rcu_head rcu; }; - struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program * that is going to use this map or by the first program which FD is From b546b57526953be2981113171ed586c4c50b1b0a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Feb 2024 17:03:00 +0200 Subject: [PATCH 101/119] selftests/bpf: update tcp_custom_syncookie to use scalar packet offset This commit updates tcp_custom_syncookie.c:tcp_parse_option() to use an explicit packet offset (ctx->off) for packet access instead of an ever-moving pointer (ctx->ptr); this reduces verification complexity: - the tcp_parse_option() is passed as a callback to bpf_loop(); - suppose a checkpoint is created each time at function entry; - the ctx->ptr is tracked by verifier as PTR_TO_PACKET; - the ctx->ptr is incremented in tcp_parse_option(), thus the umax_value field tracked for it is incremented as well; - on each subsequent iteration of tcp_parse_option(), the checkpoint from a previous iteration can't be reused for state pruning, because PTR_TO_PACKET registers are considered equivalent only if old->umax_value >= cur->umax_value; - on the other hand, the ctx->off is a SCALAR, subject to widen_imprecise_scalars(); - its exact bounds are eventually forgotten and it is tracked as an unknown scalar at entry to tcp_parse_option(); - hence checkpoints created at the start of the function eventually converge. The change is similar to the one applied in [0] to xdp_synproxy_kern.c. Comparing before and after with veristat yields the following results: File Insns (A) Insns (B) Insns (DIFF) ------------------------------- --------- --------- ----------------- test_tcp_custom_syncookie.bpf.o 466657 12423 -454234 (-97.34%) [0] commit 977bc146d4eb ("selftests/bpf: track tcp payload offset as scalar in xdp_synproxy") Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240222150300.14909-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/test_tcp_custom_syncookie.c | 81 ++++++++++++------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c index a5501b29979a..c8e4553648bf 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c @@ -10,6 +10,8 @@ #include "test_siphash.h" #include "test_tcp_custom_syncookie.h" +#define MAX_PACKET_OFF 0xffff + /* Hash is calculated for each client and split into ISN and TS.
* * MSB LSB @@ -52,16 +54,15 @@ static siphash_key_t test_key_siphash = { struct tcp_syncookie { struct __sk_buff *skb; + void *data; void *data_end; struct ethhdr *eth; struct iphdr *ipv4; struct ipv6hdr *ipv6; struct tcphdr *tcp; - union { - char *ptr; - __be32 *ptr32; - }; + __be32 *ptr32; struct bpf_tcp_req_attrs attrs; + u32 off; u32 cookie; u64 first; }; @@ -70,6 +71,7 @@ bool handled_syn, handled_ack; static int tcp_load_headers(struct tcp_syncookie *ctx) { + ctx->data = (void *)(long)ctx->skb->data; ctx->data_end = (void *)(long)ctx->skb->data_end; ctx->eth = (struct ethhdr *)(long)ctx->skb->data; @@ -134,6 +136,7 @@ static int tcp_reload_headers(struct tcp_syncookie *ctx) if (bpf_skb_change_tail(ctx->skb, data_len + 60 - ctx->tcp->doff * 4, 0)) goto err; + ctx->data = (void *)(long)ctx->skb->data; ctx->data_end = (void *)(long)ctx->skb->data_end; ctx->eth = (struct ethhdr *)(long)ctx->skb->data; if (ctx->ipv4) { @@ -195,47 +198,68 @@ err: return -1; } +static __always_inline void *next(struct tcp_syncookie *ctx, __u32 sz) +{ + __u64 off = ctx->off; + __u8 *data; + + /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */ + if (off > MAX_PACKET_OFF - sz) + return NULL; + + data = ctx->data + off; + barrier_var(data); + if (data + sz >= ctx->data_end) + return NULL; + + ctx->off += sz; + return data; +} + static int tcp_parse_option(__u32 index, struct tcp_syncookie *ctx) { - char opcode, opsize; + __u8 *opcode, *opsize, *wscale; + __u32 *tsval, *tsecr; + __u16 *mss; + __u32 off; - if (ctx->ptr + 1 > ctx->data_end) + off = ctx->off; + opcode = next(ctx, 1); + if (!opcode) goto stop; - opcode = *ctx->ptr++; - - if (opcode == TCPOPT_EOL) + if (*opcode == TCPOPT_EOL) goto stop; - if (opcode == TCPOPT_NOP) + if (*opcode == TCPOPT_NOP) goto next; - if (ctx->ptr + 1 > ctx->data_end) + opsize = next(ctx, 1); + if (!opsize) goto stop; - opsize = *ctx->ptr++; - - if (opsize < 2) + if (*opsize < 2) goto stop; - switch (opcode) { + switch (*opcode) { case TCPOPT_MSS: - if (opsize == TCPOLEN_MSS && ctx->tcp->syn && - ctx->ptr + (TCPOLEN_MSS - 2) < ctx->data_end) - ctx->attrs.mss = get_unaligned_be16(ctx->ptr); + mss = next(ctx, 2); + if (*opsize == TCPOLEN_MSS && ctx->tcp->syn && mss) + ctx->attrs.mss = get_unaligned_be16(mss); break; case TCPOPT_WINDOW: - if (opsize == TCPOLEN_WINDOW && ctx->tcp->syn && - ctx->ptr + (TCPOLEN_WINDOW - 2) < ctx->data_end) { + wscale = next(ctx, 1); + if (*opsize == TCPOLEN_WINDOW && ctx->tcp->syn && wscale) { ctx->attrs.wscale_ok = 1; - ctx->attrs.snd_wscale = *ctx->ptr; + ctx->attrs.snd_wscale = *wscale; } break; case TCPOPT_TIMESTAMP: - if (opsize == TCPOLEN_TIMESTAMP && - ctx->ptr + (TCPOLEN_TIMESTAMP - 2) < ctx->data_end) { - ctx->attrs.rcv_tsval = get_unaligned_be32(ctx->ptr); - ctx->attrs.rcv_tsecr = get_unaligned_be32(ctx->ptr + 4); + tsval = next(ctx, 4); + tsecr = next(ctx, 4); + if (*opsize == TCPOLEN_TIMESTAMP && tsval && tsecr) { + ctx->attrs.rcv_tsval = get_unaligned_be32(tsval); + ctx->attrs.rcv_tsecr = get_unaligned_be32(tsecr); if (ctx->tcp->syn && ctx->attrs.rcv_tsecr) ctx->attrs.tstamp_ok = 0; @@ -244,13 +268,12 @@ static int tcp_parse_option(__u32 index, struct tcp_syncookie *ctx) } break; case TCPOPT_SACK_PERM: - if (opsize == TCPOLEN_SACK_PERM && ctx->tcp->syn && - ctx->ptr + (TCPOLEN_SACK_PERM - 2) < ctx->data_end) + if (*opsize == TCPOLEN_SACK_PERM && ctx->tcp->syn) ctx->attrs.sack_ok = 1; break; } - ctx->ptr += opsize - 2; + ctx->off = off + *opsize; next: return 0; stop: @@ -259,7 +282,7 @@ stop: static void 
tcp_parse_options(struct tcp_syncookie *ctx) { - ctx->ptr = (char *)(ctx->tcp + 1); + ctx->off = (__u8 *)(ctx->tcp + 1) - (__u8 *)ctx->data, bpf_loop(40, tcp_parse_option, ctx, 0); } From c1bb68f6b2f6be5297c5fbad5caebf67d0dd3034 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Wed, 21 Feb 2024 09:35:35 -0800 Subject: [PATCH 102/119] bpf, docs: Fix typos in instruction-set.rst * "BPF ADD" should be "BPF_ADD". * "src" should be "src_reg" in several places. The latter is the field name in the instruction. The former refers to the value of the register, or the immediate. * Add '' around field names in one sentence, for consistency with the rest of the document. Signed-off-by: Dave Thaler Acked-by: David Vernet Link: https://lore.kernel.org/r/20240221173535.16601-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/standardization/instruction-set.rst | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 868d9f6177e3..45cffe94c752 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -178,8 +178,8 @@ Unused fields shall be cleared to zero. As discussed below in `64-bit immediate instructions`_, a 64-bit immediate instruction uses two 32-bit immediate values that are constructed as follows. The 64 bits following the basic instruction contain a pseudo instruction -using the same format but with opcode, dst_reg, src_reg, and offset all set to zero, -and imm containing the high 32 bits of the immediate value. +using the same format but with 'opcode', 'dst_reg', 'src_reg', and 'offset' all +set to zero, and imm containing the high 32 bits of the immediate value. This is depicted in the following figure:: @@ -392,27 +392,27 @@ otherwise identical operations, and indicates the base64 conformance group unless otherwise specified. 
The 'code' field encodes the operation as below: -======== ===== === =============================== ============================================= -code value src description notes -======== ===== === =============================== ============================================= -BPF_JA 0x0 0x0 PC += offset BPF_JMP | BPF_K only -BPF_JA 0x0 0x0 PC += imm BPF_JMP32 | BPF_K only -BPF_JEQ 0x1 any PC += offset if dst == src -BPF_JGT 0x2 any PC += offset if dst > src unsigned -BPF_JGE 0x3 any PC += offset if dst >= src unsigned -BPF_JSET 0x4 any PC += offset if dst & src -BPF_JNE 0x5 any PC += offset if dst != src -BPF_JSGT 0x6 any PC += offset if dst > src signed -BPF_JSGE 0x7 any PC += offset if dst >= src signed -BPF_CALL 0x8 0x0 call helper function by address BPF_JMP | BPF_K only, see `Helper functions`_ -BPF_CALL 0x8 0x1 call PC += imm BPF_JMP | BPF_K only, see `Program-local functions`_ -BPF_CALL 0x8 0x2 call helper function by BTF ID BPF_JMP | BPF_K only, see `Helper functions`_ -BPF_EXIT 0x9 0x0 return BPF_JMP | BPF_K only -BPF_JLT 0xa any PC += offset if dst < src unsigned -BPF_JLE 0xb any PC += offset if dst <= src unsigned -BPF_JSLT 0xc any PC += offset if dst < src signed -BPF_JSLE 0xd any PC += offset if dst <= src signed -======== ===== === =============================== ============================================= +======== ===== ======= =============================== ============================================= +code value src_reg description notes +======== ===== ======= =============================== ============================================= +BPF_JA 0x0 0x0 PC += offset BPF_JMP | BPF_K only +BPF_JA 0x0 0x0 PC += imm BPF_JMP32 | BPF_K only +BPF_JEQ 0x1 any PC += offset if dst == src +BPF_JGT 0x2 any PC += offset if dst > src unsigned +BPF_JGE 0x3 any PC += offset if dst >= src unsigned +BPF_JSET 0x4 any PC += offset if dst & src +BPF_JNE 0x5 any PC += offset if dst != src +BPF_JSGT 0x6 any PC += offset if dst > src signed +BPF_JSGE 0x7 any PC += offset if dst >= src signed +BPF_CALL 0x8 0x0 call helper function by address BPF_JMP | BPF_K only, see `Helper functions`_ +BPF_CALL 0x8 0x1 call PC += imm BPF_JMP | BPF_K only, see `Program-local functions`_ +BPF_CALL 0x8 0x2 call helper function by BTF ID BPF_JMP | BPF_K only, see `Helper functions`_ +BPF_EXIT 0x9 0x0 return BPF_JMP | BPF_K only +BPF_JLT 0xa any PC += offset if dst < src unsigned +BPF_JLE 0xb any PC += offset if dst <= src unsigned +BPF_JSLT 0xc any PC += offset if dst < src signed +BPF_JSLE 0xd any PC += offset if dst <= src signed +======== ===== ======= =============================== ============================================= The BPF program needs to store the return value into register R0 before doing a ``BPF_EXIT``. @@ -568,7 +568,7 @@ BPF_XOR 0xa0 atomic xor *(u32 *)(dst + offset) += src -``BPF_ATOMIC | BPF_DW | BPF_STX`` with 'imm' = BPF ADD means:: +``BPF_ATOMIC | BPF_DW | BPF_STX`` with 'imm' = BPF_ADD means:: *(u64 *)(dst + offset) += src @@ -601,24 +601,24 @@ and loaded back to ``R0``. ----------------------------- Instructions with the ``BPF_IMM`` 'mode' modifier use the wide instruction -encoding defined in `Instruction encoding`_, and use the 'src' field of the +encoding defined in `Instruction encoding`_, and use the 'src_reg' field of the basic instruction to hold an opcode subtype. 
The following table defines a set of ``BPF_IMM | BPF_DW | BPF_LD`` instructions -with opcode subtypes in the 'src' field, using new terms such as "map" +with opcode subtypes in the 'src_reg' field, using new terms such as "map" defined further below: -========================= ====== === ========================================= =========== ============== -opcode construction opcode src pseudocode imm type dst type -========================= ====== === ========================================= =========== ============== -BPF_IMM | BPF_DW | BPF_LD 0x18 0x0 dst = (next_imm << 32) | imm integer integer -BPF_IMM | BPF_DW | BPF_LD 0x18 0x1 dst = map_by_fd(imm) map fd map -BPF_IMM | BPF_DW | BPF_LD 0x18 0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data pointer -BPF_IMM | BPF_DW | BPF_LD 0x18 0x3 dst = var_addr(imm) variable id data pointer -BPF_IMM | BPF_DW | BPF_LD 0x18 0x4 dst = code_addr(imm) integer code pointer -BPF_IMM | BPF_DW | BPF_LD 0x18 0x5 dst = map_by_idx(imm) map index map -BPF_IMM | BPF_DW | BPF_LD 0x18 0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data pointer -========================= ====== === ========================================= =========== ============== +========================= ====== ======= ========================================= =========== ============== +opcode construction opcode src_reg pseudocode imm type dst type +========================= ====== ======= ========================================= =========== ============== +BPF_IMM | BPF_DW | BPF_LD 0x18 0x0 dst = (next_imm << 32) | imm integer integer +BPF_IMM | BPF_DW | BPF_LD 0x18 0x1 dst = map_by_fd(imm) map fd map +BPF_IMM | BPF_DW | BPF_LD 0x18 0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data pointer +BPF_IMM | BPF_DW | BPF_LD 0x18 0x3 dst = var_addr(imm) variable id data pointer +BPF_IMM | BPF_DW | BPF_LD 0x18 0x4 dst = code_addr(imm) integer code pointer +BPF_IMM | BPF_DW | BPF_LD 0x18 0x5 dst = map_by_idx(imm) map index map +BPF_IMM | BPF_DW | BPF_LD 0x18 0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data pointer +========================= ====== ======= ========================================= =========== ============== where From 89ee838130f470afcd02b30ca868f236a3f3b1d2 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Wed, 21 Feb 2024 09:54:19 -0800 Subject: [PATCH 103/119] bpf, docs: specify which BPF_ABS and BPF_IND fields were zero Specifying which fields were unused allows IANA to only list as deprecated instructions that were actually used, leaving the rest as unassigned and possibly available for future use for something else. Signed-off-by: Dave Thaler Acked-by: David Vernet Link: https://lore.kernel.org/r/20240221175419.16843-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/standardization/instruction-set.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 45cffe94c752..f3269d6dd5e0 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -658,6 +658,7 @@ Legacy BPF Packet access instructions BPF previously introduced special instructions for access to packet data that were carried over from classic BPF. These instructions used an instruction class of BPF_LD, a size modifier of BPF_W, BPF_H, or BPF_B, and a -mode modifier of BPF_ABS or BPF_IND. 
However, these instructions are -deprecated and should no longer be used. All legacy packet access -instructions belong to the "legacy" conformance group. +mode modifier of BPF_ABS or BPF_IND. The 'dst_reg' and 'offset' fields were +set to zero, and 'src_reg' was set to zero for BPF_ABS. However, these +instructions are deprecated and should no longer be used. All legacy packet +access instructions belong to the "legacy" conformance group. From 58fd62e0aa50fdd20bc41a01e787001f3af8a925 Mon Sep 17 00:00:00 2001 From: Martin Kelly Date: Wed, 21 Feb 2024 13:18:38 -0800 Subject: [PATCH 104/119] bpf: Clarify batch lookup/lookup_and_delete semantics The batch lookup and lookup_and_delete APIs have two parameters, in_batch and out_batch, to facilitate iterative lookup/lookup_and_deletion operations for supported maps. Except NULL for in_batch at the start of these two batch operations, both parameters need to point to memory equal or larger than the respective map key size, except for various hashmaps (hash, percpu_hash, lru_hash, lru_percpu_hash) where the in_batch/out_batch memory size should be at least 4 bytes. Document these semantics to clarify the API. Signed-off-by: Martin Kelly Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240221211838.1241578-1-martin.kelly@crowdstrike.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 6 +++++- tools/include/uapi/linux/bpf.h | 6 +++++- tools/lib/bpf/bpf.h | 17 ++++++++++++----- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d96708380e52..d2e6c5fcec01 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -617,7 +617,11 @@ union bpf_iter_link_info { * to NULL to begin the batched operation. After each subsequent * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant * *out_batch* as the *in_batch* for the next operation to - * continue iteration from the current point. + * continue iteration from the current point. Both *in_batch* and + * *out_batch* must point to memory large enough to hold a key, + * except for maps of type **BPF_MAP_TYPE_{HASH, PERCPU_HASH, + * LRU_HASH, LRU_PERCPU_HASH}**, for which batch parameters + * must be at least 4 bytes wide regardless of key size. * * The *keys* and *values* are output parameters which must point * to memory large enough to hold *count* items based on the key diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d96708380e52..d2e6c5fcec01 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -617,7 +617,11 @@ union bpf_iter_link_info { * to NULL to begin the batched operation. After each subsequent * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant * *out_batch* as the *in_batch* for the next operation to - * continue iteration from the current point. + * continue iteration from the current point. Both *in_batch* and + * *out_batch* must point to memory large enough to hold a key, + * except for maps of type **BPF_MAP_TYPE_{HASH, PERCPU_HASH, + * LRU_HASH, LRU_PERCPU_HASH}**, for which batch parameters + * must be at least 4 bytes wide regardless of key size. 
* * The *keys* and *values* are output parameters which must point * to memory large enough to hold *count* items based on the key diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index ab2570d28aec..df0db2f0cdb7 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -190,10 +190,14 @@ LIBBPF_API int bpf_map_delete_batch(int fd, const void *keys, /** * @brief **bpf_map_lookup_batch()** allows for batch lookup of BPF map elements. * - * The parameter *in_batch* is the address of the first element in the batch to read. - * *out_batch* is an output parameter that should be passed as *in_batch* to subsequent - * calls to **bpf_map_lookup_batch()**. NULL can be passed for *in_batch* to indicate - * that the batched lookup starts from the beginning of the map. + * The parameter *in_batch* is the address of the first element in the batch to + * read. *out_batch* is an output parameter that should be passed as *in_batch* + * to subsequent calls to **bpf_map_lookup_batch()**. NULL can be passed for + * *in_batch* to indicate that the batched lookup starts from the beginning of + * the map. Both *in_batch* and *out_batch* must point to memory large enough to + * hold a single key, except for maps of type **BPF_MAP_TYPE_{HASH, PERCPU_HASH, + * LRU_HASH, LRU_PERCPU_HASH}**, for which the memory size must be at + * least 4 bytes wide regardless of key size. * * The *keys* and *values* are output parameters which must point to memory large enough to * hold *count* items based on the key and value size of the map *map_fd*. The *keys* @@ -226,7 +230,10 @@ LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, * * @param fd BPF map file descriptor * @param in_batch address of the first element in batch to read, can pass NULL to - * get address of the first element in *out_batch* + * get address of the first element in *out_batch*. If not NULL, must be large + * enough to hold a key. For **BPF_MAP_TYPE_{HASH, PERCPU_HASH, LRU_HASH, + * LRU_PERCPU_HASH}**, the memory size must be at least 4 bytes wide regardless + * of key size. * @param out_batch output parameter that should be passed to next call as *in_batch* * @param keys pointer to an array of *count* keys * @param values pointer to an array large enough for *count* values From 3e0008336ae3153fb89b1a15bb877ddd38680fe6 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 21 Feb 2024 18:11:04 -0800 Subject: [PATCH 105/119] bpf: Check cfi_stubs before registering a struct_ops type. Recently, st_ops->cfi_stubs was introduced. However, the upcoming new struct_ops support (e.g. sched_ext) is not aware of this and does not provide its own cfi_stubs. The kernel ends up NULL dereferencing the st_ops->cfi_stubs. Considering struct_ops supports kernel module now, this NULL check is necessary. This patch is to reject struct_ops registration that does not provide a cfi_stubs. 
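As an illustration, a struct_ops implementor is now expected to provide something along these lines (all names here are hypothetical and the layout is abridged; the test module added in the next patch is the real example):

	static struct my_ops cfi_stubs_impl = {
		.fn = my_fn_stub,		/* stub with fn's prototype */
	};

	static struct bpf_struct_ops my_st_ops = {
		.verifier_ops	= &my_verifier_ops,
		.init		= my_init,
		.reg		= my_reg,
		.unreg		= my_unreg,
		.cfi_stubs	= &cfi_stubs_impl,	/* must not be NULL */
		.name		= "my_ops",
		.owner		= THIS_MODULE,
	};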
Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240222021105.1180475-2-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/bpf_struct_ops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 0d7be97a2411..a6019087b467 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -302,6 +302,11 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, } sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); + if (!st_ops->cfi_stubs) { + pr_warn("struct_ops for %s has no cfi_stubs\n", st_ops->name); + return -EINVAL; + } + type_id = btf_find_by_name_kind(btf, st_ops->name, BTF_KIND_STRUCT); if (type_id < 0) { From e9bbda13a7b876451285ab15fb600b809e5e2290 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 21 Feb 2024 18:11:05 -0800 Subject: [PATCH 106/119] selftests/bpf: Test case for lacking CFI stub functions. Ensure struct_ops rejects the registration of struct_ops types without proper CFI stub functions. bpf_test_no_cfi.ko is a module that attempts to register a struct_ops type called "bpf_test_no_cfi_ops" with cfi_stubs of NULL and non-NULL value. The NULL one should fail, and the non-NULL one should succeed. The module can only be loaded successfully if these registrations yield the expected results. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240222021105.1180475-3-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 10 ++- .../selftests/bpf/bpf_test_no_cfi/Makefile | 19 +++++ .../bpf/bpf_test_no_cfi/bpf_test_no_cfi.c | 84 +++++++++++++++++++ .../bpf/prog_tests/test_struct_ops_no_cfi.c | 35 ++++++++ tools/testing/selftests/bpf/testing_helpers.c | 4 +- tools/testing/selftests/bpf/testing_helpers.h | 2 + 6 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/bpf_test_no_cfi/Makefile create mode 100644 tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_no_cfi.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9be69ff701ba..84cb5500e8ef 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -132,7 +132,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \ - xdp_features + xdp_features bpf_test_no_cfi.ko TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi @@ -254,6 +254,12 @@ $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_testmo $(Q)$(MAKE) $(submake_extras) RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) -C bpf_testmod $(Q)cp bpf_testmod/bpf_testmod.ko $@ +$(OUTPUT)/bpf_test_no_cfi.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_test_no_cfi/Makefile bpf_test_no_cfi/*.[ch]) + $(call msg,MOD,,$@) + $(Q)$(RM) bpf_test_no_cfi/bpf_test_no_cfi.ko # force re-compilation + $(Q)$(MAKE) $(submake_extras) RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) -C bpf_test_no_cfi + $(Q)cp bpf_test_no_cfi/bpf_test_no_cfi.ko $@ + DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool ifneq ($(CROSS_COMPILE),) CROSS_BPFTOOL := $(SCRATCH_DIR)/sbin/bpftool @@ -631,6 +637,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ flow_dissector_load.h \ ip_check_defrag_frags.h 
TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ + $(OUTPUT)/bpf_test_no_cfi.ko \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ $(OUTPUT)/sign-file \ @@ -759,6 +766,7 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ feature bpftool \ $(addprefix $(OUTPUT)/,*.o *.skel.h *.lskel.h *.subskel.h \ no_alu32 cpuv4 bpf_gcc bpf_testmod.ko \ + bpf_test_no_cfi.ko \ liburandom_read.so) .PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/bpf_test_no_cfi/Makefile b/tools/testing/selftests/bpf/bpf_test_no_cfi/Makefile new file mode 100644 index 000000000000..ed5143b79edf --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_test_no_cfi/Makefile @@ -0,0 +1,19 @@ +BPF_TEST_NO_CFI_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) +KDIR ?= $(abspath $(BPF_TEST_NO_CFI_DIR)/../../../../..) + +ifeq ($(V),1) +Q = +else +Q = @ +endif + +MODULES = bpf_test_no_cfi.ko + +obj-m += bpf_test_no_cfi.o + +all: + +$(Q)make -C $(KDIR) M=$(BPF_TEST_NO_CFI_DIR) modules + +clean: + +$(Q)make -C $(KDIR) M=$(BPF_TEST_NO_CFI_DIR) clean + diff --git a/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c b/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c new file mode 100644 index 000000000000..b1dd889d5d7d --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include + +struct bpf_test_no_cfi_ops { + void (*fn_1)(void); + void (*fn_2)(void); +}; + +static int dummy_init(struct btf *btf) +{ + return 0; +} + +static int dummy_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static int dummy_reg(void *kdata) +{ + return 0; +} + +static void dummy_unreg(void *kdata) +{ +} + +static const struct bpf_verifier_ops dummy_verifier_ops; + +static void bpf_test_no_cfi_ops__fn_1(void) +{ +} + +static void bpf_test_no_cfi_ops__fn_2(void) +{ +} + +static struct bpf_test_no_cfi_ops __test_no_cif_ops = { + .fn_1 = bpf_test_no_cfi_ops__fn_1, + .fn_2 = bpf_test_no_cfi_ops__fn_2, +}; + +static struct bpf_struct_ops test_no_cif_ops = { + .verifier_ops = &dummy_verifier_ops, + .init = dummy_init, + .init_member = dummy_init_member, + .reg = dummy_reg, + .unreg = dummy_unreg, + .name = "bpf_test_no_cfi_ops", + .owner = THIS_MODULE, +}; + +static int bpf_test_no_cfi_init(void) +{ + int ret; + + ret = register_bpf_struct_ops(&test_no_cif_ops, + bpf_test_no_cfi_ops); + if (!ret) + return -EINVAL; + + test_no_cif_ops.cfi_stubs = &__test_no_cif_ops; + ret = register_bpf_struct_ops(&test_no_cif_ops, + bpf_test_no_cfi_ops); + return ret; +} + +static void bpf_test_no_cfi_exit(void) +{ +} + +module_init(bpf_test_no_cfi_init); +module_exit(bpf_test_no_cfi_exit); + +MODULE_AUTHOR("Kuifeng Lee"); +MODULE_DESCRIPTION("BPF no cfi_stubs test module"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_no_cfi.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_no_cfi.c new file mode 100644 index 000000000000..106ea447965a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_no_cfi.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include + +static void load_bpf_test_no_cfi(void) +{ + int fd; + int err; + + fd = open("bpf_test_no_cfi.ko", O_RDONLY); + if (!ASSERT_GE(fd, 0, "open")) + return; + + /* The module will try to register a struct_ops type without + * cfi_stubs and with cfi_stubs. + * + * The one without cfi_stub should fail. The module will be loaded + * successfully only if the result of the registration is as + * expected, or it fails. + */ + err = finit_module(fd, "", 0); + close(fd); + if (!ASSERT_OK(err, "finit_module")) + return; + + err = delete_module("bpf_test_no_cfi", 0); + ASSERT_OK(err, "delete_module"); +} + +void test_struct_ops_no_cfi(void) +{ + if (test__start_subtest("load_bpf_test_no_cfi")) + load_bpf_test_no_cfi(); +} diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index a59e56d804ee..28b6646662af 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -356,12 +356,12 @@ __u64 read_perf_max_sample_freq(void) return sample_freq; } -static int finit_module(int fd, const char *param_values, int flags) +int finit_module(int fd, const char *param_values, int flags) { return syscall(__NR_finit_module, fd, param_values, flags); } -static int delete_module(const char *name, int flags) +int delete_module(const char *name, int flags) { return syscall(__NR_delete_module, name, flags); } diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h index d14de81727e6..d55f6ab12433 100644 --- a/tools/testing/selftests/bpf/testing_helpers.h +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -36,6 +36,8 @@ __u64 read_perf_max_sample_freq(void); int load_bpf_testmod(bool verbose); int unload_bpf_testmod(bool verbose); int kern_sync_rcu(void); +int finit_module(int fd, const char *param_values, int flags); +int delete_module(const char *name, int flags); static inline __u64 get_time_ns(void) { From 55bad79e33aeb670317290158a4b2ff71cdc8380 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 21 Feb 2024 17:25:17 +0100 Subject: [PATCH 107/119] bpf: allow more maps in sleepable bpf programs These 2 maps types are required for HID-BPF when a user wants to do IO with a device from a sleepable tracing point. Allowing BPF_MAP_TYPE_QUEUE (and therefore BPF_MAP_TYPE_STACK) allows for a BPF program to prepare from an IRQ the list of HID commands to send back to the device and then these commands can be retrieved from the sleepable trace point. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240221-hid-bpf-sleepable-v3-1-1fb378ca6301@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 011d54a1dc53..88e9d2e4c29f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18022,6 +18022,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: case BPF_MAP_TYPE_CGRP_STORAGE: + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: break; default: verbose(env, From dfe6625df48ec54c6dc9b86d361f26962d09de88 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 21 Feb 2024 17:25:18 +0100 Subject: [PATCH 108/119] bpf: introduce in_sleepable() helper No code change, but it'll allow to have only one place to change everything when we add in_sleepable in cur_state. 
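In other words, a follow-up that adds a per-state flag only needs to touch this one helper, roughly (the cur_state field is hypothetical at this point):

	static bool in_sleepable(struct bpf_verifier_env *env)
	{
		/* future: additionally honor a per-state flag, e.g.
		 * env->cur_state->in_sleepable, in this single place
		 */
		return env->prog->aux->sleepable;
	}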
Signed-off-by: Benjamin Tissoires
Link: https://lore.kernel.org/r/20240221-hid-bpf-sleepable-v3-2-1fb378ca6301@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 88e9d2e4c29f..b787a6d67696 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5255,6 +5255,11 @@ bad_type:
 	return -EINVAL;
 }
 
+static bool in_sleepable(struct bpf_verifier_env *env)
+{
+	return env->prog->aux->sleepable;
+}
+
 /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
  * can dereference RCU protected pointers and result is PTR_TRUSTED.
  */
@@ -5262,7 +5267,7 @@ static bool in_rcu_cs(struct bpf_verifier_env *env)
 {
 	return env->cur_state->active_rcu_lock ||
 	       env->cur_state->active_lock.ptr ||
-	       !env->prog->aux->sleepable;
+	       !in_sleepable(env);
 }
 
 /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
@@ -10164,7 +10169,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		return -EINVAL;
 	}
 
-	if (!env->prog->aux->sleepable && fn->might_sleep) {
+	if (!in_sleepable(env) && fn->might_sleep) {
 		verbose(env, "helper call might sleep in a non-sleepable prog\n");
 		return -EINVAL;
 	}
@@ -10194,7 +10199,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 		}
 
-		if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+		if (in_sleepable(env) && is_storage_get_function(func_id))
 			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
 	}
 
@@ -11535,7 +11540,7 @@ static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
 			return true;
 		fallthrough;
 	default:
-		return env->prog->aux->sleepable;
+		return in_sleepable(env);
 	}
 }
 
@@ -12056,7 +12061,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	}
 
 	sleepable = is_kfunc_sleepable(&meta);
-	if (sleepable && !env->prog->aux->sleepable) {
+	if (sleepable && !in_sleepable(env)) {
 		verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
 		return -EACCES;
 	}
@@ -19669,7 +19674,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		}
 
 		if (is_storage_get_function(insn->imm)) {
-			if (!env->prog->aux->sleepable ||
+			if (!in_sleepable(env) ||
 			    env->insn_aux_data[i + delta].storage_get_func_atomic)
 				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
 			else

From 2ab256e93249f5ac1da665861aa0f03fb4208d9c Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires
Date: Wed, 21 Feb 2024 17:25:19 +0100
Subject: [PATCH 109/119] bpf: add is_async_callback_calling_insn() helper

Currently we have a special case for BPF_FUNC_timer_set_callback. Let's
introduce a helper that we can extend for the kfunc that will come in a
later patch.

Signed-off-by: Benjamin Tissoires
Link: https://lore.kernel.org/r/20240221-hid-bpf-sleepable-v3-3-1fb378ca6301@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b787a6d67696..1c34b91b9583 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -528,6 +528,11 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
 	       (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
 }
 
+static bool is_async_callback_calling_insn(struct bpf_insn *insn)
+{
+	return bpf_helper_call(insn) &&
is_async_callback_calling_function(insn->imm); +} + static bool is_storage_get_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_storage_get || @@ -9445,9 +9450,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins return -EFAULT; } - if (insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == 0 && - insn->imm == BPF_FUNC_timer_set_callback) { + if (is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; /* there is no real recursion here. timer callbacks are async */ @@ -15588,7 +15591,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) + if (is_async_callback_calling_insn(insn)) /* Mark this call insn as a prune point to trigger * is_state_visited() check before call itself is * processed by __check_func_call(). Otherwise new From e74cb1b422131615a0fe3bedd4ab2e38b7442d10 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 1 Feb 2024 12:52:24 +0000 Subject: [PATCH 110/119] arm64: stacktrace: Implement arch_bpf_stack_walk() for the BPF JIT This will be used by bpf_throw() to unwind till the program marked as exception boundary and run the callback with the stack of the main program. This is required for supporting BPF exceptions on ARM64. Signed-off-by: Puranjay Mohan Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240201125225.72796-2-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- arch/arm64/kernel/stacktrace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 7f88028a00c0..66cffc5fc0be 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -266,6 +267,31 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs); } +struct bpf_unwind_consume_entry_data { + bool (*consume_entry)(void *cookie, u64 ip, u64 sp, u64 fp); + void *cookie; +}; + +static bool +arch_bpf_unwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + struct bpf_unwind_consume_entry_data *data = cookie; + + return data->consume_entry(data->cookie, state->common.pc, 0, + state->common.fp); +} + +noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u64 ip, u64 sp, + u64 fp), void *cookie) +{ + struct bpf_unwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); +} + static bool dump_backtrace_entry(void *arg, unsigned long where) { char *loglvl = arg; From 22fc0e80aeb5c0c1377e6c02d7248f8fbf5df7fc Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 1 Feb 2024 12:52:25 +0000 Subject: [PATCH 111/119] bpf, arm64: support exceptions The prologue generation code has been modified to make the callback program use the stack of the program marked as exception boundary where callee-saved registers are already pushed. As the bpf_throw function never returns, if it clobbers any callee-saved registers, they would remain clobbered. So, the prologue of the exception-boundary program is modified to push R23 and R24 as well, which the callback will then recover in its epilogue. 
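For context, a minimal sketch of the kind of program that takes this
path; the section name, threshold, and cookie value are illustrative
only, and bpf_throw() is the exceptions kfunc declared as below in
BPF C:

	extern void bpf_throw(u64 cookie) __ksym;

	SEC("tc")
	int boundary_prog(struct __sk_buff *ctx)
	{
		if (ctx->len > 1024)	/* arbitrary condition */
			bpf_throw(0);	/* never returns; unwinds to the
					 * exception callback */
		return 0;
	}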
The Procedure Call Standard for the Arm 64-bit Architecture[1] states that registers r19 to r28 should be saved by the callee. BPF programs on ARM64 already save all callee-saved registers except r23 and r24. This patch adds an instruction in prologue of the program to save these two registers and another instruction in the epilogue to recover them. These extra instructions are only added if bpf_throw() is used. Otherwise the emitted prologue/epilogue remains unchanged. [1] https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240201125225.72796-3-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 85 +++++++++++++++----- tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 - 2 files changed, 67 insertions(+), 19 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index cfd5434de483..20720ec346b8 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -285,7 +285,8 @@ static bool is_lsi_offset(int offset, int scale) /* Tail call offset to jump into */ #define PROLOGUE_OFFSET (BTI_INSNS + 2 + PAC_INSNS + 8) -static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) +static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, + bool is_exception_cb) { const struct bpf_prog *prog = ctx->prog; const bool is_main_prog = !bpf_is_subprog(prog); @@ -333,19 +334,34 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) emit(A64_MOV(1, A64_R(9), A64_LR), ctx); emit(A64_NOP, ctx); - /* Sign lr */ - if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) - emit(A64_PACIASP, ctx); + if (!is_exception_cb) { + /* Sign lr */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) + emit(A64_PACIASP, ctx); + /* Save FP and LR registers to stay align with ARM64 AAPCS */ + emit(A64_PUSH(A64_FP, A64_LR, A64_SP), ctx); + emit(A64_MOV(1, A64_FP, A64_SP), ctx); - /* Save FP and LR registers to stay align with ARM64 AAPCS */ - emit(A64_PUSH(A64_FP, A64_LR, A64_SP), ctx); - emit(A64_MOV(1, A64_FP, A64_SP), ctx); - - /* Save callee-saved registers */ - emit(A64_PUSH(r6, r7, A64_SP), ctx); - emit(A64_PUSH(r8, r9, A64_SP), ctx); - emit(A64_PUSH(fp, tcc, A64_SP), ctx); - emit(A64_PUSH(fpb, A64_R(28), A64_SP), ctx); + /* Save callee-saved registers */ + emit(A64_PUSH(r6, r7, A64_SP), ctx); + emit(A64_PUSH(r8, r9, A64_SP), ctx); + emit(A64_PUSH(fp, tcc, A64_SP), ctx); + emit(A64_PUSH(fpb, A64_R(28), A64_SP), ctx); + } else { + /* + * Exception callback receives FP of Main Program as third + * parameter + */ + emit(A64_MOV(1, A64_FP, A64_R(2)), ctx); + /* + * Main Program already pushed the frame record and the + * callee-saved registers. The exception callback will not push + * anything and re-use the main program's stack. + * + * 10 registers are on the stack + */ + emit(A64_SUB_I(1, A64_SP, A64_FP, 80), ctx); + } /* Set up BPF prog stack base register */ emit(A64_MOV(1, fp, A64_SP), ctx); @@ -365,6 +381,20 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) emit_bti(A64_BTI_J, ctx); } + /* + * Program acting as exception boundary should save all ARM64 + * Callee-saved registers as the exception callback needs to recover + * all ARM64 Callee-saved registers in its epilogue. 
+ */ + if (prog->aux->exception_boundary) { + /* + * As we are pushing two more registers, BPF_FP should be moved + * 16 bytes + */ + emit(A64_SUB_I(1, fp, fp, 16), ctx); + emit(A64_PUSH(A64_R(23), A64_R(24), A64_SP), ctx); + } + emit(A64_SUB_I(1, fpb, fp, ctx->fpb_offset), ctx); /* Stack must be multiples of 16B */ @@ -653,7 +683,7 @@ static void build_plt(struct jit_ctx *ctx) plt->target = (u64)&dummy_tramp; } -static void build_epilogue(struct jit_ctx *ctx) +static void build_epilogue(struct jit_ctx *ctx, bool is_exception_cb) { const u8 r0 = bpf2a64[BPF_REG_0]; const u8 r6 = bpf2a64[BPF_REG_6]; @@ -666,6 +696,15 @@ static void build_epilogue(struct jit_ctx *ctx) /* We're done with BPF stack */ emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + /* + * Program acting as exception boundary pushes R23 and R24 in addition + * to BPF callee-saved registers. Exception callback uses the boundary + * program's stack frame, so recover these extra registers in the above + * two cases. + */ + if (ctx->prog->aux->exception_boundary || is_exception_cb) + emit(A64_POP(A64_R(23), A64_R(24), A64_SP), ctx); + /* Restore x27 and x28 */ emit(A64_POP(fpb, A64_R(28), A64_SP), ctx); /* Restore fs (x25) and x26 */ @@ -1575,7 +1614,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * BPF line info needs ctx->offset[i] to be the offset of * instruction[i] in jited image, so build prologue first. */ - if (build_prologue(&ctx, was_classic)) { + if (build_prologue(&ctx, was_classic, prog->aux->exception_cb)) { prog = orig_prog; goto out_off; } @@ -1586,7 +1625,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.epilogue_offset = ctx.idx; - build_epilogue(&ctx); + build_epilogue(&ctx, prog->aux->exception_cb); build_plt(&ctx); extable_align = __alignof__(struct exception_table_entry); @@ -1614,7 +1653,7 @@ skip_init_ctx: ctx.idx = 0; ctx.exentry_idx = 0; - build_prologue(&ctx, was_classic); + build_prologue(&ctx, was_classic, prog->aux->exception_cb); if (build_body(&ctx, extra_pass)) { bpf_jit_binary_free(header); @@ -1622,7 +1661,7 @@ skip_init_ctx: goto out_off; } - build_epilogue(&ctx); + build_epilogue(&ctx, prog->aux->exception_cb); build_plt(&ctx); /* 3. Extra pass to validate JITed code. */ @@ -2310,3 +2349,13 @@ bool bpf_jit_supports_ptr_xchg(void) { return true; } + +bool bpf_jit_supports_exceptions(void) +{ + /* We unwind through both kernel frames starting from within bpf_throw + * call and BPF frames. Therefore we require FP unwinder to be enabled + * to walk kernel frames and reach BPF frames in the stack trace. + * ARM64 kernel is aways compiled with CONFIG_FRAME_POINTER=y + */ + return true; +} diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index 5c2cc7e8c5d0..0445ac38bc07 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -1,6 +1,5 @@ bpf_cookie/multi_kprobe_attach_api # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3 bpf_cookie/multi_kprobe_link_api # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3 -exceptions # JIT does not support calling kfunc bpf_throw: -524 fexit_sleep # The test never returns. The remaining tests cannot start. 
kprobe_multi_bench_attach # needs CONFIG_FPROBE kprobe_multi_test # needs CONFIG_FPROBE From 451c3cab9a65e656c3b3d106831fc02d56b8c34a Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 28 Feb 2024 14:18:23 +0000 Subject: [PATCH 112/119] arm64: patching: implement text_poke API The text_poke API is used to implement functions like memcpy() and memset() for instruction memory (RO+X). The implementation is similar to the x86 version. This will be used by the BPF JIT to write and modify BPF programs. There could be more users of this in the future. Signed-off-by: Puranjay Mohan Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240228141824.119877-2-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/patching.h | 2 + arch/arm64/kernel/patching.c | 75 +++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/patching.h index 68908b82b168..587bdb91ab7a 100644 --- a/arch/arm64/include/asm/patching.h +++ b/arch/arm64/include/asm/patching.h @@ -8,6 +8,8 @@ int aarch64_insn_read(void *addr, u32 *insnp); int aarch64_insn_write(void *addr, u32 insn); int aarch64_insn_write_literal_u64(void *addr, u64 val); +void *aarch64_insn_set(void *dst, u32 insn, size_t len); +void *aarch64_insn_copy(void *dst, void *src, size_t len); int aarch64_insn_patch_text_nosync(void *addr, u32 insn); int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index b4835f6d594b..255534930368 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -105,6 +105,81 @@ noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val) return ret; } +typedef void text_poke_f(void *dst, void *src, size_t patched, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, void *src, size_t len) +{ + unsigned long flags; + size_t patched = 0; + size_t size; + void *waddr; + void *ptr; + + raw_spin_lock_irqsave(&patch_lock, flags); + + while (patched < len) { + ptr = addr + patched; + size = min_t(size_t, PAGE_SIZE - offset_in_page(ptr), + len - patched); + + waddr = patch_map(ptr, FIX_TEXT_POKE0); + func(waddr, src, patched, size); + patch_unmap(FIX_TEXT_POKE0); + + patched += size; + } + raw_spin_unlock_irqrestore(&patch_lock, flags); + + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); + + return addr; +} + +static void text_poke_memcpy(void *dst, void *src, size_t patched, size_t len) +{ + copy_to_kernel_nofault(dst, src + patched, len); +} + +static void text_poke_memset(void *dst, void *src, size_t patched, size_t len) +{ + u32 c = *(u32 *)src; + + memset32(dst, c, len / 4); +} + +/** + * aarch64_insn_copy - Copy instructions into (an unused part of) RX memory + * @dst: address to modify + * @src: source of the copy + * @len: length to copy + * + * Useful for JITs to dump new code blocks into unused regions of RX memory. + */ +noinstr void *aarch64_insn_copy(void *dst, void *src, size_t len) +{ + /* A64 instructions must be word aligned */ + if ((uintptr_t)dst & 0x3) + return NULL; + + return __text_poke(text_poke_memcpy, dst, src, len); +} + +/** + * aarch64_insn_set - memset for RX memory regions. + * @dst: address to modify + * @insn: value to set + * @len: length of memory region. + * + * Useful for JITs to fill regions of RX memory with illegal instructions. 
+ */ +noinstr void *aarch64_insn_set(void *dst, u32 insn, size_t len) +{ + if ((uintptr_t)dst & 0x3) + return NULL; + + return __text_poke(text_poke_memset, dst, &insn, len); +} + int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) { u32 *tp = addr; From 1dad391daef129e01e28206b8d586608ff026548 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 28 Feb 2024 14:18:24 +0000 Subject: [PATCH 113/119] bpf, arm64: use bpf_prog_pack for memory management Use bpf_jit_binary_pack_alloc for memory management of JIT binaries in ARM64 BPF JIT. The bpf_jit_binary_pack_alloc creates a pair of RW and RX buffers. The JIT writes the program into the RW buffer. When the JIT is done, the program is copied to the final RX buffer with bpf_jit_binary_pack_finalize. Implement bpf_arch_text_copy() and bpf_arch_text_invalidate() for ARM64 JIT as these functions are required by bpf_jit_binary_pack allocator. Signed-off-by: Puranjay Mohan Acked-by: Song Liu Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240228141824.119877-3-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 139 ++++++++++++++++++++++++++++------ 1 file changed, 115 insertions(+), 24 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 20720ec346b8..5afc7a525eca 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -76,6 +76,7 @@ struct jit_ctx { int *offset; int exentry_idx; __le32 *image; + __le32 *ro_image; u32 stack_size; int fpb_offset; }; @@ -205,6 +206,14 @@ static void jit_fill_hole(void *area, unsigned int size) *ptr++ = cpu_to_le32(AARCH64_BREAK_FAULT); } +int bpf_arch_text_invalidate(void *dst, size_t len) +{ + if (!aarch64_insn_set(dst, AARCH64_BREAK_FAULT, len)) + return -EINVAL; + + return 0; +} + static inline int epilogue_offset(const struct jit_ctx *ctx) { int to = ctx->epilogue_offset; @@ -746,7 +755,8 @@ static int add_exception_handler(const struct bpf_insn *insn, struct jit_ctx *ctx, int dst_reg) { - off_t offset; + off_t ins_offset; + off_t fixup_offset; unsigned long pc; struct exception_table_entry *ex; @@ -763,12 +773,17 @@ static int add_exception_handler(const struct bpf_insn *insn, return -EINVAL; ex = &ctx->prog->aux->extable[ctx->exentry_idx]; - pc = (unsigned long)&ctx->image[ctx->idx - 1]; + pc = (unsigned long)&ctx->ro_image[ctx->idx - 1]; - offset = pc - (long)&ex->insn; - if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) + /* + * This is the relative offset of the instruction that may fault from + * the exception table itself. This will be written to the exception + * table and if this instruction faults, the destination register will + * be set to '0' and the execution will jump to the next instruction. + */ + ins_offset = pc - (long)&ex->insn; + if (WARN_ON_ONCE(ins_offset >= 0 || ins_offset < INT_MIN)) return -ERANGE; - ex->insn = offset; /* * Since the extable follows the program, the fixup offset is always @@ -777,12 +792,25 @@ static int add_exception_handler(const struct bpf_insn *insn, * bits. We don't need to worry about buildtime or runtime sort * modifying the upper bits because the table is already sorted, and * isn't part of the main exception table. + * + * The fixup_offset is set to the next instruction from the instruction + * that may fault. The execution will jump to this after handling the + * fault. 
*/ - offset = (long)&ex->fixup - (pc + AARCH64_INSN_SIZE); - if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, offset)) + fixup_offset = (long)&ex->fixup - (pc + AARCH64_INSN_SIZE); + if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset)) return -ERANGE; - ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, offset) | + /* + * The offsets above have been calculated using the RO buffer but we + * need to use the R/W buffer for writes. + * switch ex to rw buffer for writing. + */ + ex = (void *)ctx->image + ((void *)ex - (void *)ctx->ro_image); + + ex->insn = ins_offset; + + ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) | FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); ex->type = EX_TYPE_BPF; @@ -1550,7 +1578,8 @@ static inline void bpf_flush_icache(void *start, void *end) struct arm64_jit_data { struct bpf_binary_header *header; - u8 *image; + u8 *ro_image; + struct bpf_binary_header *ro_header; struct jit_ctx ctx; }; @@ -1559,12 +1588,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) int image_size, prog_size, extable_size, extable_align, extable_offset; struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header; + struct bpf_binary_header *ro_header; struct arm64_jit_data *jit_data; bool was_classic = bpf_prog_was_classic(prog); bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; u8 *image_ptr; + u8 *ro_image_ptr; if (!prog->jit_requested) return orig_prog; @@ -1591,8 +1622,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } if (jit_data->ctx.offset) { ctx = jit_data->ctx; - image_ptr = jit_data->image; + ro_image_ptr = jit_data->ro_image; + ro_header = jit_data->ro_header; header = jit_data->header; + image_ptr = (void *)header + ((void *)ro_image_ptr + - (void *)ro_header); extra_pass = true; prog_size = sizeof(u32) * ctx.idx; goto skip_init_ctx; @@ -1637,18 +1671,27 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* also allocate space for plt target */ extable_offset = round_up(prog_size + PLT_TARGET_SIZE, extable_align); image_size = extable_offset + extable_size; - header = bpf_jit_binary_alloc(image_size, &image_ptr, - sizeof(u32), jit_fill_hole); - if (header == NULL) { + ro_header = bpf_jit_binary_pack_alloc(image_size, &ro_image_ptr, + sizeof(u32), &header, &image_ptr, + jit_fill_hole); + if (!ro_header) { prog = orig_prog; goto out_off; } /* 2. Now, the actual pass. */ + /* + * Use the image(RW) for writing the JITed instructions. But also save + * the ro_image(RX) for calculating the offsets in the image. The RW + * image will be later copied to the RX image from where the program + * will run. The bpf_jit_binary_pack_finalize() will do this copy in the + * final step. + */ ctx.image = (__le32 *)image_ptr; + ctx.ro_image = (__le32 *)ro_image_ptr; if (extable_size) - prog->aux->extable = (void *)image_ptr + extable_offset; + prog->aux->extable = (void *)ro_image_ptr + extable_offset; skip_init_ctx: ctx.idx = 0; ctx.exentry_idx = 0; @@ -1656,9 +1699,8 @@ skip_init_ctx: build_prologue(&ctx, was_classic, prog->aux->exception_cb); if (build_body(&ctx, extra_pass)) { - bpf_jit_binary_free(header); prog = orig_prog; - goto out_off; + goto out_free_hdr; } build_epilogue(&ctx, prog->aux->exception_cb); @@ -1666,34 +1708,44 @@ skip_init_ctx: /* 3. Extra pass to validate JITed code. */ if (validate_ctx(&ctx)) { - bpf_jit_binary_free(header); prog = orig_prog; - goto out_off; + goto out_free_hdr; } /* And we're done. 
*/ if (bpf_jit_enable > 1) bpf_jit_dump(prog->len, prog_size, 2, ctx.image); - bpf_flush_icache(header, ctx.image + ctx.idx); - if (!prog->is_func || extra_pass) { if (extra_pass && ctx.idx != jit_data->ctx.idx) { pr_err_once("multi-func JIT bug %d != %d\n", ctx.idx, jit_data->ctx.idx); - bpf_jit_binary_free(header); prog->bpf_func = NULL; prog->jited = 0; prog->jited_len = 0; + goto out_free_hdr; + } + if (WARN_ON(bpf_jit_binary_pack_finalize(prog, ro_header, + header))) { + /* ro_header has been freed */ + ro_header = NULL; + prog = orig_prog; goto out_off; } - bpf_jit_binary_lock_ro(header); + /* + * The instructions have now been copied to the ROX region from + * where they will execute. Now the data cache has to be cleaned to + * the PoU and the I-cache has to be invalidated for the VAs. + */ + bpf_flush_icache(ro_header, ctx.ro_image + ctx.idx); } else { jit_data->ctx = ctx; - jit_data->image = image_ptr; + jit_data->ro_image = ro_image_ptr; jit_data->header = header; + jit_data->ro_header = ro_header; } - prog->bpf_func = (void *)ctx.image; + + prog->bpf_func = (void *)ctx.ro_image; prog->jited = 1; prog->jited_len = prog_size; @@ -1714,6 +1766,14 @@ out: bpf_jit_prog_release_other(prog, prog == orig_prog ? tmp : orig_prog); return prog; + +out_free_hdr: + if (header) { + bpf_arch_text_copy(&ro_header->size, &header->size, + sizeof(header->size)); + bpf_jit_binary_pack_free(ro_header, header); + } + goto out_off; } bool bpf_jit_supports_kfunc_call(void) @@ -1721,6 +1781,13 @@ bool bpf_jit_supports_kfunc_call(void) return true; } +void *bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + if (!aarch64_insn_copy(dst, src, len)) + return ERR_PTR(-EINVAL); + return dst; +} + u64 bpf_jit_alloc_exec_limit(void) { return VMALLOC_END - VMALLOC_START; @@ -2359,3 +2426,27 @@ bool bpf_jit_supports_exceptions(void) */ return true; } + +void bpf_jit_free(struct bpf_prog *prog) +{ + if (prog->jited) { + struct arm64_jit_data *jit_data = prog->aux->jit_data; + struct bpf_binary_header *hdr; + + /* + * If we fail the final pass of JIT (from jit_subprogs), + * the program may not be finalized yet. Call finalize here + * before freeing it. + */ + if (jit_data) { + bpf_arch_text_copy(&jit_data->ro_header->size, &jit_data->header->size, + sizeof(jit_data->header->size)); + kfree(jit_data); + } + hdr = bpf_jit_binary_pack_hdr(prog); + bpf_jit_binary_pack_free(hdr, NULL); + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog)); + } + + bpf_prog_unlock_free(prog); +} From 896880ff30866f386ebed14ab81ce1ad3710cfc4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 22 Feb 2024 07:56:15 -0800 Subject: [PATCH 114/119] bpf: Replace bpf_lpm_trie_key 0-length array with flexible array Replace deprecated 0-length array in struct bpf_lpm_trie_key with flexible array. 
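In sketch form, the shape of the change (the complete definitions
appear in the UAPI diff below):

	/* Before: deprecated 0-length trailing array */
	struct bpf_lpm_trie_key {
		__u32	prefixlen;
		__u8	data[0];
	};

	/* After: a true flexible array member, plus a header-only type
	 * for keys whose trailing member is not a u8 array.
	 */
	struct bpf_lpm_trie_key_u8 {
		union {
			struct bpf_lpm_trie_key_hdr	hdr;
			__u32				prefixlen;
		};
		__u8	data[];
	};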
Found with GCC 13:

../kernel/bpf/lpm_trie.c:207:51: warning: array subscript i is outside array bounds of 'const __u8[0]' {aka 'const unsigned char[]'} [-Warray-bounds=]
  207 |                       *(__be16 *)&key->data[i]);
      |                                  ^~~~~~~~~~~~~
../include/uapi/linux/swab.h:102:54: note: in definition of macro '__swab16'
  102 | #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
      |                                                      ^
../include/linux/byteorder/generic.h:97:21: note: in expansion of macro '__be16_to_cpu'
   97 | #define be16_to_cpu __be16_to_cpu
      |                     ^~~~~~~~~~~~~
../kernel/bpf/lpm_trie.c:206:28: note: in expansion of macro 'be16_to_cpu'
  206 |                 u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
      |                            ^~~~~~~~~~~
In file included from ../include/linux/bpf.h:7:
../include/uapi/linux/bpf.h:82:17: note: while referencing 'data'
   82 |         __u8    data[0];        /* Arbitrary size */
      |                 ^~~~

And found at run-time under CONFIG_FORTIFY_SOURCE:

  UBSAN: array-index-out-of-bounds in kernel/bpf/lpm_trie.c:218:49
  index 0 is out of range for type '__u8 [*]'

Changing struct bpf_lpm_trie_key is difficult since it has been used by
userspace. For example, in Cilium:

	struct egress_gw_policy_key {
		struct bpf_lpm_trie_key lpm_key;
		__u32 saddr;
		__u32 daddr;
	};

While direct references to the "data" member haven't been found, there
are static initializers that include the final member. For example,
the "{}" here:

	struct egress_gw_policy_key in_key = {
		.lpm_key = { 32 + 24, {} },
		.saddr = CLIENT_IP,
		.daddr = EXTERNAL_SVC_IP & 0Xffffff,
	};

To avoid the build time and run time warnings seen with a 0-sized
trailing array for struct bpf_lpm_trie_key, introduce a new struct that
correctly uses a flexible array for the trailing bytes, struct
bpf_lpm_trie_key_u8. As part of this, include the "header" portion
(which is just the "prefixlen" member), so it can be used by anything
building a bpf_lpm_trie_key that has trailing members that aren't a u8
flexible array (like the self-test[1]); this header-only struct is
named struct bpf_lpm_trie_key_hdr.

Unfortunately, C++ refuses to parse the __struct_group() helper, so it
is not possible to define struct bpf_lpm_trie_key_hdr directly in
struct bpf_lpm_trie_key_u8; the union must be open-coded instead.

Adjust the kernel code to use struct bpf_lpm_trie_key_u8 throughout,
and the selftest to use struct bpf_lpm_trie_key_hdr. Add a comment to
the UAPI header directing folks to the two new options.

Reported-by: Mark Rutland
Signed-off-by: Kees Cook
Signed-off-by: Daniel Borkmann
Acked-by: Gustavo A. R. Silva
Closes: https://paste.debian.net/hidden/ca500597/
Link: https://lore.kernel.org/all/202206281009.4332AA33@keescook/ [1]
Link: https://lore.kernel.org/bpf/20240222155612.it.533-kees@kernel.org
---
 Documentation/bpf/map_lpm_trie.rst            |  2 +-
 include/uapi/linux/bpf.h                      | 19 +++++++++++++++++-
 kernel/bpf/lpm_trie.c                         | 20 +++++++++----------
 samples/bpf/map_perf_test_user.c              |  2 +-
 samples/bpf/xdp_router_ipv4_user.c            |  2 +-
 tools/include/uapi/linux/bpf.h                | 19 +++++++++++++++++-
 .../selftests/bpf/progs/map_ptr_kern.c        |  2 +-
 tools/testing/selftests/bpf/test_lpm_map.c    | 18 ++++++++---------
 8 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/Documentation/bpf/map_lpm_trie.rst b/Documentation/bpf/map_lpm_trie.rst
index 74d64a30f500..f9cd579496c9 100644
--- a/Documentation/bpf/map_lpm_trie.rst
+++ b/Documentation/bpf/map_lpm_trie.rst
@@ -17,7 +17,7 @@ significant byte.
 
 LPM tries may be created with a maximum prefix length that is a multiple
 of 8, in the range from 8 to 2048.
The key used for lookup and update -operations is a ``struct bpf_lpm_trie_key``, extended by +operations is a ``struct bpf_lpm_trie_key_u8``, extended by ``max_prefixlen/8`` bytes. - For IPv4 addresses the data length is 4 bytes diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d2e6c5fcec01..a241f407c234 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -77,12 +77,29 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; -/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +/* Deprecated: use struct bpf_lpm_trie_key_u8 (when the "data" member is needed for + * byte access) or struct bpf_lpm_trie_key_hdr (when using an alternative type for + * the trailing flexible array member) instead. + */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ __u8 data[0]; /* Arbitrary size */ }; +/* Header for bpf_lpm_trie_key structs */ +struct bpf_lpm_trie_key_hdr { + __u32 prefixlen; +}; + +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry, with trailing byte array. */ +struct bpf_lpm_trie_key_u8 { + union { + struct bpf_lpm_trie_key_hdr hdr; + __u32 prefixlen; + }; + __u8 data[]; /* Arbitrary size */ +}; + struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ __u32 attach_type; /* program attach type (enum bpf_attach_type) */ diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b32be680da6c..050fe1ebf0f7 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -164,13 +164,13 @@ static inline int extract_bit(const u8 *data, size_t index) */ static size_t longest_prefix_match(const struct lpm_trie *trie, const struct lpm_trie_node *node, - const struct bpf_lpm_trie_key *key) + const struct bpf_lpm_trie_key_u8 *key) { u32 limit = min(node->prefixlen, key->prefixlen); u32 prefixlen = 0, i = 0; BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32)); - BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32)); + BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32)); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT) @@ -229,7 +229,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *found = NULL; - struct bpf_lpm_trie_key *key = _key; + struct bpf_lpm_trie_key_u8 *key = _key; if (key->prefixlen > trie->max_prefixlen) return NULL; @@ -309,7 +309,7 @@ static long trie_update_elem(struct bpf_map *map, struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; struct lpm_trie_node __rcu **slot; - struct bpf_lpm_trie_key *key = _key; + struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; unsigned int next_bit; size_t matchlen = 0; @@ -437,7 +437,7 @@ out: static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct bpf_lpm_trie_key *key = _key; + struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; unsigned long irq_flags; @@ -536,7 +536,7 @@ out: sizeof(struct lpm_trie_node)) #define LPM_VAL_SIZE_MIN 1 -#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key_u8) + (X)) #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) @@ -565,7 +565,7 @@ static struct bpf_map 
*trie_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&trie->map, attr); trie->data_size = attr->key_size - - offsetof(struct bpf_lpm_trie_key, data); + offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; spin_lock_init(&trie->lock); @@ -616,7 +616,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key; struct lpm_trie_node **node_stack = NULL; int err = 0, stack_ptr = -1; unsigned int next_bit; @@ -703,7 +703,7 @@ find_leftmost: } do_copy: next_key->prefixlen = next_node->prefixlen; - memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data), next_node->data, trie->data_size); free_stack: kfree(node_stack); @@ -715,7 +715,7 @@ static int trie_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - /* Keys must have struct bpf_lpm_trie_key embedded. */ + /* Keys must have struct bpf_lpm_trie_key_u8 embedded. */ return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? -EINVAL : 0; } diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index d2fbcf963cdf..07ff471ed6ae 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -370,7 +370,7 @@ static void run_perf_test(int tasks) static void fill_lpm_trie(void) { - struct bpf_lpm_trie_key *key; + struct bpf_lpm_trie_key_u8 *key; unsigned long value = 0; unsigned int i; int r; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 9d41db09c480..266fdd0b025d 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -91,7 +91,7 @@ static int recv_msg(struct sockaddr_nl sock_addr, int sock) static void read_route(struct nlmsghdr *nh, int nll) { char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24]; - struct bpf_lpm_trie_key *prefix_key; + struct bpf_lpm_trie_key_u8 *prefix_key; struct rtattr *rt_attr; struct rtmsg *rt_msg; int rtm_family; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d2e6c5fcec01..a241f407c234 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -77,12 +77,29 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; -/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +/* Deprecated: use struct bpf_lpm_trie_key_u8 (when the "data" member is needed for + * byte access) or struct bpf_lpm_trie_key_hdr (when using an alternative type for + * the trailing flexible array member) instead. + */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ __u8 data[0]; /* Arbitrary size */ }; +/* Header for bpf_lpm_trie_key structs */ +struct bpf_lpm_trie_key_hdr { + __u32 prefixlen; +}; + +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry, with trailing byte array. 
*/ +struct bpf_lpm_trie_key_u8 { + union { + struct bpf_lpm_trie_key_hdr hdr; + __u32 prefixlen; + }; + __u8 data[]; /* Arbitrary size */ +}; + struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ __u32 attach_type; /* program attach type (enum bpf_attach_type) */ diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index 3325da17ec81..efaf622c28dd 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -316,7 +316,7 @@ struct lpm_trie { } __attribute__((preserve_access_index)); struct lpm_key { - struct bpf_lpm_trie_key trie_key; + struct bpf_lpm_trie_key_hdr trie_key; __u32 data; }; diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index c028d621c744..d98c72dc563e 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -211,7 +211,7 @@ static void test_lpm_map(int keysize) volatile size_t n_matches, n_matches_after_delete; size_t i, j, n_nodes, n_lookups; struct tlpm_node *t, *list = NULL; - struct bpf_lpm_trie_key *key; + struct bpf_lpm_trie_key_u8 *key; uint8_t *data, *value; int r, map; @@ -331,8 +331,8 @@ static void test_lpm_map(int keysize) static void test_lpm_ipaddr(void) { LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); - struct bpf_lpm_trie_key *key_ipv4; - struct bpf_lpm_trie_key *key_ipv6; + struct bpf_lpm_trie_key_u8 *key_ipv4; + struct bpf_lpm_trie_key_u8 *key_ipv6; size_t key_size_ipv4; size_t key_size_ipv6; int map_fd_ipv4; @@ -423,7 +423,7 @@ static void test_lpm_ipaddr(void) static void test_lpm_delete(void) { LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); - struct bpf_lpm_trie_key *key; + struct bpf_lpm_trie_key_u8 *key; size_t key_size; int map_fd; __u64 value; @@ -532,7 +532,7 @@ static void test_lpm_delete(void) static void test_lpm_get_next_key(void) { LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); - struct bpf_lpm_trie_key *key_p, *next_key_p; + struct bpf_lpm_trie_key_u8 *key_p, *next_key_p; size_t key_size; __u32 value = 0; int map_fd; @@ -693,9 +693,9 @@ static void *lpm_test_command(void *arg) { int i, j, ret, iter, key_size; struct lpm_mt_test_info *info = arg; - struct bpf_lpm_trie_key *key_p; + struct bpf_lpm_trie_key_u8 *key_p; - key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32); + key_size = sizeof(*key_p) + sizeof(__u32); key_p = alloca(key_size); for (iter = 0; iter < info->iter; iter++) for (i = 0; i < MAX_TEST_KEYS; i++) { @@ -717,7 +717,7 @@ static void *lpm_test_command(void *arg) ret = bpf_map_lookup_elem(info->map_fd, key_p, &value); assert(ret == 0 || errno == ENOENT); } else { - struct bpf_lpm_trie_key *next_key_p = alloca(key_size); + struct bpf_lpm_trie_key_u8 *next_key_p = alloca(key_size); ret = bpf_map_get_next_key(info->map_fd, key_p, next_key_p); assert(ret == 0 || errno == ENOENT || errno == ENOMEM); } @@ -752,7 +752,7 @@ static void test_lpm_multi_thread(void) /* create a trie */ value_size = sizeof(__u32); - key_size = sizeof(struct bpf_lpm_trie_key) + value_size; + key_size = sizeof(struct bpf_lpm_trie_key_hdr) + value_size; map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size, value_size, 100, &opts); /* create 4 threads to test update, delete, lookup and get_next_key */ From 3644d285462a60c80ac225d508fcfe705640d2b4 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 28 Feb 2024 22:45:19 -0800 
Subject: [PATCH 115/119] libbpf: Set btf_value_type_id of struct bpf_map for
 struct_ops.

For a struct_ops map, btf_value_type_id is the type ID of its struct
type. This value is required by bpftool to generate a skeleton that
includes pointers to shadow types. The code generator gets the type ID
from bpf_map__btf_value_type_id() in order to get the type information
of the struct type of a map.

Signed-off-by: Kui-Feng Lee
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20240229064523.2091270-2-thinker.li@gmail.com
---
 tools/lib/bpf/libbpf.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 01f407591a92..c759628f4250 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1229,6 +1229,7 @@ static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name,
 		map->name = strdup(var_name);
 		if (!map->name)
 			return -ENOMEM;
+		map->btf_value_type_id = type_id;
 
 		map->def.type = BPF_MAP_TYPE_STRUCT_OPS;
 		map->def.key_size = sizeof(int);
@@ -4857,6 +4858,10 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
 		create_attr.btf_value_type_id = 0;
 		map->btf_key_type_id = 0;
 		map->btf_value_type_id = 0;
+		break;
+	case BPF_MAP_TYPE_STRUCT_OPS:
+		create_attr.btf_value_type_id = 0;
+		break;
 	default:
 		break;
 	}

From 69e4a9d2b3f5adf5af4feeab0a9f505da971265a Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Wed, 28 Feb 2024 22:45:20 -0800
Subject: [PATCH 116/119] libbpf: Convert st_ops->data to shadow type.

Convert st_ops->data to the shadow type of the struct_ops map. The
shadow type of a struct_ops type is a variant of the original struct
type, providing a way to access/change the values in the maps of the
struct_ops type.

bpf_map__initial_value() will return st_ops->data for struct_ops types.
The skeleton is going to use it as the pointer to the shadow type of
the original struct type.

One of the main differences between the original struct type and the
shadow type is that all function pointers of the shadow type are
converted to pointers to struct bpf_program. Users can replace these
bpf_program pointers with other BPF programs. The st_ops->progs[] will
be updated before updating the value of a map to reflect the changes
made by users.
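In sketch form, the access pattern this enables (the map variable here
is illustrative and error handling is elided):

	size_t sz;
	/* For a struct_ops map this now returns st_ops->data, the shadow
	 * copy of the map value; a skeleton exposes this pointer as the
	 * generated shadow type, whose function-pointer members are
	 * struct bpf_program pointers the user may reassign before the
	 * map value is written to the kernel.
	 */
	void *shadow = bpf_map__initial_value(map, &sz);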
Signed-off-by: Kui-Feng Lee
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20240229064523.2091270-3-thinker.li@gmail.com
---
 tools/lib/bpf/libbpf.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index c759628f4250..6c2979f1b471 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1014,6 +1014,19 @@ static bool bpf_map__is_struct_ops(const struct bpf_map *map)
 	return map->def.type == BPF_MAP_TYPE_STRUCT_OPS;
 }
 
+static bool is_valid_st_ops_program(struct bpf_object *obj,
+				    const struct bpf_program *prog)
+{
+	int i;
+
+	for (i = 0; i < obj->nr_programs; i++) {
+		if (&obj->programs[i] == prog)
+			return prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+	}
+
+	return false;
+}
+
 /* Init the map's fields that depend on kern_btf */
 static int bpf_map__init_kern_struct_ops(struct bpf_map *map)
 {
@@ -1102,9 +1115,16 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map)
 		if (btf_is_ptr(mtype)) {
 			struct bpf_program *prog;
 
-			prog = st_ops->progs[i];
+			/* Update the value from the shadow type */
+			prog = *(void **)mdata;
+			st_ops->progs[i] = prog;
 			if (!prog)
 				continue;
+			if (!is_valid_st_ops_program(obj, prog)) {
+				pr_warn("struct_ops init_kern %s: member %s is not a struct_ops program\n",
+					map->name, mname);
+				return -ENOTSUP;
+			}
 
 			kern_mtype = skip_mods_and_typedefs(kern_btf,
 							    kern_mtype->type,
@@ -9310,7 +9330,9 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj,
 	return NULL;
 }
 
-/* Collect the reloc from ELF and populate the st_ops->progs[] */
+/* Collect the reloc from ELF, populate the st_ops->progs[], and update
+ * st_ops->data for shadow type.
+ */
 static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
 					    Elf64_Shdr *shdr, Elf_Data *data)
 {
@@ -9424,6 +9446,14 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
 		}
 
 		st_ops->progs[member_idx] = prog;
+
+		/* st_ops->data will be exposed to users, being returned by
+		 * bpf_map__initial_value() as a pointer to the shadow
+		 * type. All function pointers in the original struct type
+		 * should be converted to a pointer to struct bpf_program
+		 * in the shadow type.
+		 */
+		*((struct bpf_program **)(st_ops->data + moff)) = prog;
 	}
 
 	return 0;
@@ -9882,6 +9912,12 @@ int bpf_map__set_initial_value(struct bpf_map *map,
 
 void *bpf_map__initial_value(struct bpf_map *map, size_t *psize)
 {
+	if (bpf_map__is_struct_ops(map)) {
+		if (psize)
+			*psize = map->def.value_size;
+		return map->st_ops->data;
+	}
+
 	if (!map->mmaped)
 		return NULL;
 	*psize = map->def.value_size;

From a7b0fa352eafef95bd0d736ca94965d3f884ad18 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Wed, 28 Feb 2024 22:45:21 -0800
Subject: [PATCH 117/119] bpftool: Generate shadow variables for struct_ops
 maps.

Declare and define a pointer of the shadow type for each struct_ops
map.

The code generator will create an anonymous struct type as the shadow
type for each struct_ops map. The shadow type is translated from the
original struct type of the map. The user of the skeleton uses these
pointers to access the values of struct_ops maps.

However, shadow types only support certain types of fields, including
scalar types and function pointers. Any fields of unsupported types
are translated into an array of characters to occupy the space of the
original field. Function pointers are translated into pointers to
struct bpf_program.
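For illustration, a generated shadow struct might look roughly like
this; the names are hypothetical and __unsupported_2 stands in for a
field of a type the generator cannot express (this mirrors the
documentation example added in the next patch):

	struct example__testmod_map__bpf_testmod_ops {
		struct bpf_program *test_1;	/* function pointer */
		struct bpf_program *test_2;	/* function pointer */
		char __unsupported_2[8];	/* e.g. a nested struct */
		int data;			/* scalar, emitted as-is */
	} *testmod_map;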
Additionally, padding fields are generated to occupy the space between
two consecutive fields.

The pointers to the shadow types of struct_ops maps are initialized
when *__open_opts() in a skeleton is called. For a map called FOO, the
user can access it through the pointer at skel->struct_ops.FOO.

Signed-off-by: Kui-Feng Lee
Signed-off-by: Andrii Nakryiko
Reviewed-by: Quentin Monnet
Link: https://lore.kernel.org/bpf/20240229064523.2091270-4-thinker.li@gmail.com
---
 tools/bpf/bpftool/gen.c | 237 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 236 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index a9334c57e859..1f579eacd9d4 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -55,6 +55,20 @@ static bool str_has_suffix(const char *str, const char *suffix)
 	return true;
 }
 
+static const struct btf_type *
+resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id)
+{
+	const struct btf_type *t;
+
+	t = skip_mods_and_typedefs(btf, id, NULL);
+	if (!btf_is_ptr(t))
+		return NULL;
+
+	t = skip_mods_and_typedefs(btf, t->type, res_id);
+
+	return btf_is_func_proto(t) ? t : NULL;
+}
+
 static void get_obj_name(char *name, const char *file)
 {
 	char file_copy[PATH_MAX];
@@ -909,6 +923,207 @@ codegen_progs_skeleton(struct bpf_object *obj, size_t prog_cnt, bool populate_li
 	}
 }
 
+static int walk_st_ops_shadow_vars(struct btf *btf, const char *ident,
+				   const struct btf_type *map_type, __u32 map_type_id)
+{
+	LIBBPF_OPTS(btf_dump_emit_type_decl_opts, opts, .indent_level = 3);
+	const struct btf_type *member_type;
+	__u32 offset, next_offset = 0;
+	const struct btf_member *m;
+	struct btf_dump *d = NULL;
+	const char *member_name;
+	__u32 member_type_id;
+	int i, err = 0, n;
+	int size;
+
+	d = btf_dump__new(btf, codegen_btf_dump_printf, NULL, NULL);
+	if (!d)
+		return -errno;
+
+	n = btf_vlen(map_type);
+	for (i = 0, m = btf_members(map_type); i < n; i++, m++) {
+		member_type = skip_mods_and_typedefs(btf, m->type, &member_type_id);
+		member_name = btf__name_by_offset(btf, m->name_off);
+
+		offset = m->offset / 8;
+		if (next_offset < offset)
+			printf("\t\t\tchar __padding_%d[%d];\n", i, offset - next_offset);
+
+		switch (btf_kind(member_type)) {
+		case BTF_KIND_INT:
+		case BTF_KIND_FLOAT:
+		case BTF_KIND_ENUM:
+		case BTF_KIND_ENUM64:
+			/* scalar type */
+			printf("\t\t\t");
+			opts.field_name = member_name;
+			err = btf_dump__emit_type_decl(d, member_type_id, &opts);
+			if (err) {
+				p_err("Failed to emit type declaration for %s: %d", member_name, err);
+				goto out;
+			}
+			printf(";\n");
+
+			size = btf__resolve_size(btf, member_type_id);
+			if (size < 0) {
+				p_err("Failed to resolve size of %s: %d\n", member_name, size);
+				err = size;
+				goto out;
+			}
+
+			next_offset = offset + size;
+			break;
+
+		case BTF_KIND_PTR:
+			if (resolve_func_ptr(btf, m->type, NULL)) {
+				/* Function pointer */
+				printf("\t\t\tstruct bpf_program *%s;\n", member_name);
+
+				next_offset = offset + sizeof(void *);
+				break;
+			}
+			/* All pointer types are unsupported except for
+			 * function pointers.
+			 */
+			fallthrough;
+
+		default:
+			/* Unsupported types
+			 *
+			 * Types other than scalar types and function
+			 * pointers are currently not supported in order to
+			 * prevent conflicts in the generated code caused
+			 * by multiple definitions. For instance, if the
+			 * struct type FOO is used in a struct_ops map,
+			 * bpftool has to generate definitions for FOO,
+			 * which may result in conflicts if FOO is defined
+			 * in different skeleton files.
+ */ + size = btf__resolve_size(btf, member_type_id); + if (size < 0) { + p_err("Failed to resolve size of %s: %d\n", member_name, size); + err = size; + goto out; + } + printf("\t\t\tchar __unsupported_%d[%d];\n", i, size); + + next_offset = offset + size; + break; + } + } + + /* Cannot fail since it must be a struct type */ + size = btf__resolve_size(btf, map_type_id); + if (next_offset < (__u32)size) + printf("\t\t\tchar __padding_end[%d];\n", size - next_offset); + +out: + btf_dump__free(d); + + return err; +} + +/* Generate the pointer of the shadow type for a struct_ops map. + * + * This function adds a pointer of the shadow type for a struct_ops map. + * The members of a struct_ops map can be exported through a pointer to a + * shadow type. The user can access these members through the pointer. + * + * A shadow type includes not all members, only members of some types. + * They are scalar types and function pointers. The function pointers are + * translated to the pointer of the struct bpf_program. The scalar types + * are translated to the original type without any modifiers. + * + * Unsupported types will be translated to a char array to occupy the same + * space as the original field, being renamed as __unsupported_*. The user + * should treat these fields as opaque data. + */ +static int gen_st_ops_shadow_type(const char *obj_name, struct btf *btf, const char *ident, + const struct bpf_map *map) +{ + const struct btf_type *map_type; + const char *type_name; + __u32 map_type_id; + int err; + + map_type_id = bpf_map__btf_value_type_id(map); + if (map_type_id == 0) + return -EINVAL; + map_type = btf__type_by_id(btf, map_type_id); + if (!map_type) + return -EINVAL; + + type_name = btf__name_by_offset(btf, map_type->name_off); + + printf("\t\tstruct %s__%s__%s {\n", obj_name, ident, type_name); + + err = walk_st_ops_shadow_vars(btf, ident, map_type, map_type_id); + if (err) + return err; + + printf("\t\t} *%s;\n", ident); + + return 0; +} + +static int gen_st_ops_shadow(const char *obj_name, struct btf *btf, struct bpf_object *obj) +{ + int err, st_ops_cnt = 0; + struct bpf_map *map; + char ident[256]; + + if (!btf) + return 0; + + /* Generate the pointers to shadow types of + * struct_ops maps. + */ + bpf_object__for_each_map(map, obj) { + if (bpf_map__type(map) != BPF_MAP_TYPE_STRUCT_OPS) + continue; + if (!get_map_ident(map, ident, sizeof(ident))) + continue; + + if (st_ops_cnt == 0) /* first struct_ops map */ + printf("\tstruct {\n"); + st_ops_cnt++; + + err = gen_st_ops_shadow_type(obj_name, btf, ident, map); + if (err) + return err; + } + + if (st_ops_cnt) + printf("\t} struct_ops;\n"); + + return 0; +} + +/* Generate the code to initialize the pointers of shadow types. */ +static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj) +{ + struct bpf_map *map; + char ident[256]; + + if (!btf) + return; + + /* Initialize the pointers to_ops shadow types of + * struct_ops maps. 
+ */ + bpf_object__for_each_map(map, obj) { + if (bpf_map__type(map) != BPF_MAP_TYPE_STRUCT_OPS) + continue; + if (!get_map_ident(map, ident, sizeof(ident))) + continue; + codegen("\ + \n\ + obj->struct_ops.%1$s = bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ + \n\ + ", ident); + } +} + static int do_skeleton(int argc, char **argv) { char header_guard[MAX_OBJ_NAME_LEN + sizeof("__SKEL_H__")]; @@ -1052,6 +1267,11 @@ static int do_skeleton(int argc, char **argv) printf("\t} maps;\n"); } + btf = bpf_object__btf(obj); + err = gen_st_ops_shadow(obj_name, btf, obj); + if (err) + goto out; + if (prog_cnt) { printf("\tstruct {\n"); bpf_object__for_each_program(prog, obj) { @@ -1075,7 +1295,6 @@ static int do_skeleton(int argc, char **argv) printf("\t} links;\n"); } - btf = bpf_object__btf(obj); if (btf) { err = codegen_datasecs(obj, obj_name); if (err) @@ -1133,6 +1352,12 @@ static int do_skeleton(int argc, char **argv) if (err) \n\ goto err_out; \n\ \n\ + ", obj_name); + + gen_st_ops_shadow_init(btf, obj); + + codegen("\ + \n\ return obj; \n\ err_out: \n\ %1$s__destroy(obj); \n\ @@ -1442,6 +1667,10 @@ static int do_subskeleton(int argc, char **argv) printf("\t} maps;\n"); } + err = gen_st_ops_shadow(obj_name, btf, obj); + if (err) + goto out; + if (prog_cnt) { printf("\tstruct {\n"); bpf_object__for_each_program(prog, obj) { @@ -1553,6 +1782,12 @@ static int do_subskeleton(int argc, char **argv) if (err) \n\ goto err; \n\ \n\ + "); + + gen_st_ops_shadow_init(btf, obj); + + codegen("\ + \n\ return obj; \n\ err: \n\ %1$s__destroy(obj); \n\ From f2e81192e07e87897ff1296c96775eceea8f582a Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 28 Feb 2024 22:45:22 -0800 Subject: [PATCH 118/119] bpftool: Add an example for struct_ops map and shadow type. The example in bpftool-gen.8 explains how to use the pointer of the shadow type to change the value of a field of a struct_ops map. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240229064523.2091270-5-thinker.li@gmail.com --- .../bpf/bpftool/Documentation/bpftool-gen.rst | 58 +++++++++++++++++-- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 5006e724d1bc..5e60825818dd 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -257,18 +257,48 @@ EXAMPLES return 0; } -This is example BPF application with two BPF programs and a mix of BPF maps -and global variables. Source code is split across two source code files. +**$ cat example3.bpf.c** + +:: + + #include + #include + #include + /* This header file is provided by the bpf_testmod module. */ + #include "bpf_testmod.h" + + int test_2_result = 0; + + /* bpf_Testmod.ko calls this function, passing a "4" + * and testmod_map->data. + */ + SEC("struct_ops/test_2") + void BPF_PROG(test_2, int a, int b) + { + test_2_result = a + b; + } + + SEC(".struct_ops") + struct bpf_testmod_ops testmod_map = { + .test_2 = (void *)test_2, + .data = 0x1, + }; + +This is example BPF application with three BPF programs and a mix of BPF +maps and global variables. Source code is split across three source code +files. 
**$ clang --target=bpf -g example1.bpf.c -o example1.bpf.o**
 
 **$ clang --target=bpf -g example2.bpf.c -o example2.bpf.o**
 
-**$ bpftool gen object example.bpf.o example1.bpf.o example2.bpf.o**
+**$ clang --target=bpf -g example3.bpf.c -o example3.bpf.o**
 
-This set of commands compiles *example1.bpf.c* and *example2.bpf.c*
-individually and then statically links respective object files into the final
-BPF ELF object file *example.bpf.o*.
+**$ bpftool gen object example.bpf.o example1.bpf.o example2.bpf.o example3.bpf.o**
+
+This set of commands compiles *example1.bpf.c*, *example2.bpf.c* and
+*example3.bpf.c* individually and then statically links respective object
+files into the final BPF ELF object file *example.bpf.o*.
 
 **$ bpftool gen skeleton example.bpf.o name example | tee example.skel.h**
 
@@ -291,7 +321,15 @@ BPF ELF object file *example.bpf.o*.
 	        struct bpf_map *data;
 	        struct bpf_map *bss;
 	        struct bpf_map *my_map;
+	        struct bpf_map *testmod_map;
 	    } maps;
+	    struct {
+	        struct example__testmod_map__bpf_testmod_ops {
+	            const struct bpf_program *test_1;
+	            const struct bpf_program *test_2;
+	            int data;
+	        } *testmod_map;
+	    } struct_ops;
 	    struct {
 	        struct bpf_program *handle_sys_enter;
 	        struct bpf_program *handle_sys_exit;
@@ -304,6 +342,7 @@ BPF ELF object file *example.bpf.o*.
 	        struct {
 	            int x;
 	        } data;
+	        int test_2_result;
 	    } *bss;
 	    struct example__data {
 	        _Bool global_flag;
@@ -342,10 +381,16 @@ BPF ELF object file *example.bpf.o*.
 
 	        skel->rodata->param1 = 128;
 
+	        /* Change the value through the pointer of shadow type */
+	        skel->struct_ops.testmod_map->data = 13;
+
 	        err = example__load(skel);
 	        if (err)
 	            goto cleanup;
 
+	        /* The result of the function test_2() */
+	        printf("test_2_result: %d\n", skel->bss->test_2_result);
+
 	        err = example__attach(skel);
 	        if (err)
 	            goto cleanup;
@@ -372,6 +417,7 @@ BPF ELF object file *example.bpf.o*.
 
 	::
 
+	    test_2_result: 17
 	    my_map name: my_map
 	    sys_enter prog FD: 8
 	    my_static_var: 7

From 0623e73317940d052216fb6eef4efd55a0a7f602 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Wed, 28 Feb 2024 22:45:23 -0800
Subject: [PATCH 119/119] selftests/bpf: Test if shadow types work correctly.

Change the values of fields, including scalar types and function
pointers, and check that the struct_ops map works as expected.

The test changes the field "test_2" of "testmod_1" from the pointer to
test_2() to a pointer to test_3(), and the field "data" to 13. The
functions test_2() and test_3() both compute a new value for
"test_2_result", but in different ways. By checking the value of
"test_2_result", the test ensures that the struct_ops map works as
expected with changes made through shadow types.
Signed-off-by: Kui-Feng Lee
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20240229064523.2091270-6-thinker.li@gmail.com
---
 .../selftests/bpf/bpf_testmod/bpf_testmod.c   | 11 ++++++++++-
 .../selftests/bpf/bpf_testmod/bpf_testmod.h   |  8 ++++++++
 .../bpf/prog_tests/test_struct_ops_module.c   | 19 +++++++++++++++----
 .../selftests/bpf/progs/struct_ops_module.c   |  8 ++++++++
 4 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 66787e99ba1b..098ddd067224 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -539,6 +539,15 @@ static int bpf_testmod_ops_init_member(const struct btf_type *t,
 				       const struct btf_member *member,
 				       void *kdata, const void *udata)
 {
+	if (member->offset == offsetof(struct bpf_testmod_ops, data) * 8) {
+		/* For data fields, this function has to copy the value and
+		 * return 1 to indicate that the data has been handled by
+		 * the struct_ops type, or the verifier will reject the map
+		 * if the value of the data field is not zero.
+		 */
+		((struct bpf_testmod_ops *)kdata)->data = ((struct bpf_testmod_ops *)udata)->data;
+		return 1;
+	}
 	return 0;
 }

@@ -559,7 +568,7 @@ static int bpf_dummy_reg(void *kdata)
 	 * initialized, so we need to check for NULL.
 	 */
 	if (ops->test_2)
-		ops->test_2(4, 3);
+		ops->test_2(4, ops->data);

 	return 0;
 }

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
index c3b0cf788f9f..971458acfac3 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
@@ -35,6 +35,14 @@ struct bpf_testmod_ops {
 	void (*test_2)(int a, int b);
 	/* Used to test nullable arguments. */
 	int (*test_maybe_null)(int dummy, struct task_struct *task);
+
+	/* The following fields are used to test shadow copies. */
+	char onebyte;
+	struct {
+		int a;
+		int b;
+	} unsupported;
+	int data;
 };

 #endif /* _BPF_TESTMOD_H */

diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c
index 8d833f0c7580..7d6facf46ebb 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c
@@ -32,17 +32,23 @@ cleanup:

 static void test_struct_ops_load(void)
 {
-	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts);
 	struct struct_ops_module *skel;
 	struct bpf_map_info info = {};
 	struct bpf_link *link;
 	int err;
 	u32 len;

-	skel = struct_ops_module__open_opts(&opts);
+	skel = struct_ops_module__open();
 	if (!ASSERT_OK_PTR(skel, "struct_ops_module_open"))
 		return;

+	skel->struct_ops.testmod_1->data = 13;
+	skel->struct_ops.testmod_1->test_2 = skel->progs.test_3;
+	/* Since test_2() is not being used, it should be disabled from
+	 * auto-loading, or it will fail to load.
+ */ + bpf_program__set_autoload(skel->progs.test_2, false); + err = struct_ops_module__load(skel); if (!ASSERT_OK(err, "struct_ops_module_load")) goto cleanup; @@ -56,8 +62,13 @@ static void test_struct_ops_load(void) link = bpf_map__attach_struct_ops(skel->maps.testmod_1); ASSERT_OK_PTR(link, "attach_test_mod_1"); - /* test_2() will be called from bpf_dummy_reg() in bpf_testmod.c */ - ASSERT_EQ(skel->bss->test_2_result, 7, "test_2_result"); + /* test_3() will be called from bpf_dummy_reg() in bpf_testmod.c + * + * In bpf_testmod.c it will pass 4 and 13 (the value of data) to + * .test_2. So, the value of test_2_result should be 20 (4 + 13 + + * 3). + */ + ASSERT_EQ(skel->bss->test_2_result, 20, "check_shadow_variables"); bpf_link__destroy(link); diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index b78746b3cef3..25952fa09348 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -21,9 +21,17 @@ void BPF_PROG(test_2, int a, int b) test_2_result = a + b; } +SEC("struct_ops/test_3") +int BPF_PROG(test_3, int a, int b) +{ + test_2_result = a + b + 3; + return a + b + 3; +} + SEC(".struct_ops.link") struct bpf_testmod_ops testmod_1 = { .test_1 = (void *)test_1, .test_2 = (void *)test_2, + .data = 0x1, };
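For orientation, and following the naming pattern shown in the
bpftool-gen.rst example earlier in this series, the skeleton generated
from struct_ops_module.c would be expected to expose a shadow struct
roughly like the sketch below. This is an illustration of the expected
bpftool gen output, not verbatim generated code: the inner struct tag
struct_ops_module__testmod_1__bpf_testmod_ops is inferred from the
documented <obj>__<map>__<type> convention, and the exact member set
depends on which fields of struct bpf_testmod_ops bpftool can
represent (the nested "unsupported" struct is presumably skipped):

	struct {
		/* Shadow type for the testmod_1 struct_ops map; one
		 * const struct bpf_program * member per function
		 * pointer, plus plain scalar members such as data.
		 */
		struct struct_ops_module__testmod_1__bpf_testmod_ops {
			const struct bpf_program *test_1;
			const struct bpf_program *test_2;
			int data;
		} *testmod_1;
	} struct_ops;

This shadow struct is what makes the pre-load assignments in the test
above, such as skel->struct_ops.testmod_1->data = 13, possible.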