diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 1030119294d0..01cc21f17f7b 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -211,27 +211,40 @@ over a rather long period of time, but improvements are always welcome! of the system, especially to real-time workloads running on the rest of the system. -7. As of v4.20, a given kernel implements only one RCU flavor, - which is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y. - If the updater uses call_rcu() or synchronize_rcu(), - then the corresponding readers may use rcu_read_lock() and - rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), - or any pair of primitives that disables and re-enables preemption, - for example, rcu_read_lock_sched() and rcu_read_unlock_sched(). - If the updater uses synchronize_srcu() or call_srcu(), - then the corresponding readers must use srcu_read_lock() and - srcu_read_unlock(), and with the same srcu_struct. The rules for - the expedited primitives are the same as for their non-expedited - counterparts. Mixing things up will result in confusion and - broken kernels, and has even resulted in an exploitable security - issue. +7. As of v4.20, a given kernel implements only one RCU flavor, which + is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y. + If the updater uses call_rcu() or synchronize_rcu(), then + the corresponding readers may use: (1) rcu_read_lock() and + rcu_read_unlock(), (2) any pair of primitives that disables + and re-enables softirq, for example, rcu_read_lock_bh() and + rcu_read_unlock_bh(), or (3) any pair of primitives that disables + and re-enables preemption, for example, rcu_read_lock_sched() and + rcu_read_unlock_sched(). If the updater uses synchronize_srcu() + or call_srcu(), then the corresponding readers must use + srcu_read_lock() and srcu_read_unlock(), and with the same + srcu_struct. The rules for the expedited RCU grace-period-wait + primitives are the same as for their non-expedited counterparts. - One exception to this rule: rcu_read_lock() and rcu_read_unlock() - may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh() - in cases where local bottom halves are already known to be - disabled, for example, in irq or softirq context. Commenting - such cases is a must, of course! And the jury is still out on - whether the increased speed is worth it. + If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(), + then the readers must refrain from executing voluntary + context switches, that is, from blocking. If the updater uses + call_rcu_tasks_trace() or synchronize_rcu_tasks_trace(), then + the corresponding readers must use rcu_read_lock_trace() and + rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude() + or synchronize_rcu_tasks_rude(), then the corresponding readers + must use anything that disables interrupts. + + Mixing things up will result in confusion and broken kernels, and + has even resulted in an exploitable security issue. Therefore, + when using non-obvious pairs of primitives, commenting is + of course a must. One example of non-obvious pairing is + the XDP feature in networking, which calls BPF programs from + network-driver NAPI (softirq) context. BPF relies heavily on RCU + protection for its data structures, but because the BPF program + invocation happens entirely within a single local_bh_disable() + section in a NAPI poll cycle, this usage is safe. 
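Purely as an illustration of this pairing rule, a minimal sketch follows; the structure and function names are hypothetical and not taken from this patch, but the unrcu_pointer()/rcu_dereference_check() pattern matches what the map changes later in this series use. The reader relies on a BH-disabled region rather than rcu_read_lock(), which is the NAPI/XDP situation described here.

.. code-block:: c

   #include <linux/rcupdate.h>
   #include <linux/bottom_half.h>
   #include <linux/slab.h>

   /* Hypothetical example -- names are not from this patch. */
   struct cfg {
           int val;
           struct rcu_head rcu;
   };

   static struct cfg __rcu *active_cfg;

   /* Reader: any BH-disabled region is a valid read-side critical section. */
   static int cfg_read(void)
   {
           struct cfg *c;
           int val;

           local_bh_disable();
           c = rcu_dereference_check(active_cfg, rcu_read_lock_bh_held());
           val = c ? c->val : -1;
           local_bh_enable();
           return val;
   }

   /* Updater: the grace period waited for by kfree_rcu()/call_rcu()
    * covers rcu_read_lock() readers as well as BH-disabled readers
    * such as the one above.
    */
   static void cfg_update(struct cfg *newcfg)
   {
           struct cfg *old;

           old = unrcu_pointer(xchg(&active_cfg, RCU_INITIALIZER(newcfg)));
           if (old)
                   kfree_rcu(old, rcu);
   }

When CONFIG_PROVE_RCU is enabled, the rcu_read_lock_bh_held() condition keeps lockdep satisfied for both kinds of readers.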
The reason + that this usage is safe is that readers can use anything that + disables BH when updaters use call_rcu() or synchronize_rcu(). 8. Although synchronize_rcu() is slower than is call_rcu(), it usually results in simpler code. So, unless update performance is diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 93e8cf12a6d4..baea6c2abba5 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -12,6 +12,19 @@ BPF instruction-set. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. +libbpf +====== + +Libbpf is a userspace library for loading and interacting with bpf programs. + +.. toctree:: + :maxdepth: 1 + + libbpf/libbpf + libbpf/libbpf_api + libbpf/libbpf_build + libbpf/libbpf_naming_convention + BPF Type Format (BTF) ===================== diff --git a/Documentation/bpf/libbpf/libbpf.rst b/Documentation/bpf/libbpf/libbpf.rst new file mode 100644 index 000000000000..1b1e61d5ead1 --- /dev/null +++ b/Documentation/bpf/libbpf/libbpf.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +libbpf +====== + +This is documentation for libbpf, a userspace library for loading and +interacting with bpf programs. + +All general BPF questions, including kernel functionality, libbpf APIs and +their application, should be sent to bpf@vger.kernel.org mailing list. +You can `subscribe `_ to the +mailing list search its `archive `_. +Please search the archive before asking new questions. It very well might +be that this was already addressed or answered before. diff --git a/Documentation/bpf/libbpf/libbpf_api.rst b/Documentation/bpf/libbpf/libbpf_api.rst new file mode 100644 index 000000000000..f07eecd054da --- /dev/null +++ b/Documentation/bpf/libbpf/libbpf_api.rst @@ -0,0 +1,27 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +API +=== + +This documentation is autogenerated from header files in libbpf, tools/lib/bpf + +.. kernel-doc:: tools/lib/bpf/libbpf.h + :internal: + +.. kernel-doc:: tools/lib/bpf/bpf.h + :internal: + +.. kernel-doc:: tools/lib/bpf/btf.h + :internal: + +.. kernel-doc:: tools/lib/bpf/xsk.h + :internal: + +.. kernel-doc:: tools/lib/bpf/bpf_tracing.h + :internal: + +.. kernel-doc:: tools/lib/bpf/bpf_core_read.h + :internal: + +.. kernel-doc:: tools/lib/bpf/bpf_endian.h + :internal: \ No newline at end of file diff --git a/Documentation/bpf/libbpf/libbpf_build.rst b/Documentation/bpf/libbpf/libbpf_build.rst new file mode 100644 index 000000000000..8e8c23e8093d --- /dev/null +++ b/Documentation/bpf/libbpf/libbpf_build.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +Building libbpf +=============== + +libelf and zlib are internal dependencies of libbpf and thus are required to link +against and must be installed on the system for applications to work. +pkg-config is used by default to find libelf, and the program called +can be overridden with PKG_CONFIG. + +If using pkg-config at build time is not desired, it can be disabled by +setting NO_PKG_CONFIG=1 when calling make. + +To build both static libbpf.a and shared libbpf.so: + +.. code-block:: bash + + $ cd src + $ make + +To build only static libbpf.a library in directory build/ and install them +together with libbpf headers in a staging directory root/: + +.. 
code-block:: bash + + $ cd src + $ mkdir build root + $ BUILD_STATIC_ONLY=y OBJDIR=build DESTDIR=root make install + +To build both static libbpf.a and shared libbpf.so against a custom libelf +dependency installed in /build/root/ and install them together with libbpf +headers in a build directory /build/root/: + +.. code-block:: bash + + $ cd src + $ PKG_CONFIG_PATH=/build/root/lib64/pkgconfig DESTDIR=/build/root make \ No newline at end of file diff --git a/tools/lib/bpf/README.rst b/Documentation/bpf/libbpf/libbpf_naming_convention.rst similarity index 90% rename from tools/lib/bpf/README.rst rename to Documentation/bpf/libbpf/libbpf_naming_convention.rst index 8928f7787f2d..3de1d51e41da 100644 --- a/tools/lib/bpf/README.rst +++ b/Documentation/bpf/libbpf/libbpf_naming_convention.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) -libbpf API naming convention -============================ +API naming convention +===================== libbpf API provides access to a few logically separated groups of functions and types. Every group has its own naming convention @@ -10,14 +10,14 @@ new function or type is added to keep libbpf API clean and consistent. All types and functions provided by libbpf API should have one of the following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``, -``perf_buffer_``. +``btf_dump_``, ``ring_buffer_``, ``perf_buffer_``. System call wrappers -------------------- System call wrappers are simple wrappers for commands supported by sys_bpf system call. These wrappers should go to ``bpf.h`` header file -and map one-on-one to corresponding commands. +and map one to one to corresponding commands. For example ``bpf_map_lookup_elem`` wraps ``BPF_MAP_LOOKUP_ELEM`` command of sys_bpf, ``bpf_prog_attach`` wraps ``BPF_PROG_ATTACH``, etc. @@ -49,10 +49,6 @@ object, ``bpf_object``, double underscore and ``open`` that defines the purpose of the function to open ELF file and create ``bpf_object`` from it. -Another example: ``bpf_program__load`` is named for corresponding -object, ``bpf_program``, that is separated from other part of the name -by double underscore. - All objects and corresponding functions other than BTF related should go to ``libbpf.h``. BTF types and functions should go to ``btf.h``. @@ -72,11 +68,7 @@ of both low-level ring access functions and high-level configuration functions. These can be mixed and matched. Note that these functions are not reentrant for performance reasons. -Please take a look at Documentation/networking/af_xdp.rst in the Linux -kernel source tree on how to use XDP sockets and for some common -mistakes in case you do not get any traffic up to user space. - -libbpf ABI +ABI ========== libbpf can be both linked statically or used as DSO. To avoid possible @@ -116,7 +108,8 @@ This bump in ABI version is at most once per kernel development cycle. For example, if current state of ``libbpf.map`` is: -.. code-block:: +.. code-block:: c + LIBBPF_0.0.1 { global: bpf_func_a; @@ -128,7 +121,8 @@ For example, if current state of ``libbpf.map`` is: , and a new symbol ``bpf_func_c`` is being introduced, then ``libbpf.map`` should be changed like this: -.. code-block:: +.. code-block:: c + LIBBPF_0.0.1 { global: bpf_func_a; @@ -148,7 +142,7 @@ Format of version script and ways to handle ABI changes, including incompatible ones, described in details in [1]. 
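To make the naming convention described above concrete, here is a hedged, minimal application sketch; the object file name is hypothetical and error handling is abbreviated, but every call shown is part of the documented ``bpf_object__`` API:

.. code-block:: c

   #include <bpf/libbpf.h>

   int load_demo(void)
   {
           struct bpf_object *obj;
           int err;

           /* object name, double underscore, then the action */
           obj = bpf_object__open("prog.bpf.o");   /* hypothetical file */
           if (libbpf_get_error(obj))
                   return -1;

           err = bpf_object__load(obj);
           if (err) {
                   bpf_object__close(obj);
                   return err;
           }

           /* ... find maps/programs, attach, and so on ... */

           bpf_object__close(obj);
           return 0;
   }

Each step reads as object, double underscore, action, which is the pattern that keeps the libbpf API surface consistent.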
Stand-alone build -================= +------------------- Under https://github.com/libbpf/libbpf there is a (semi-)automated mirror of the mainline's version of libbpf for a stand-alone build. @@ -157,12 +151,12 @@ However, all changes to libbpf's code base must be upstreamed through the mainline kernel tree. License -======= +------------------- libbpf is dual-licensed under LGPL 2.1 and BSD 2-Clause. Links -===== +------------------- [1] https://www.akkadia.org/drepper/dsohowto.pdf (Chapter 3. Maintaining APIs and ABIs). diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index 2ccc5644cc98..42576880aa4a 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -290,19 +290,19 @@ round-robin example of distributing packets is shown below: #define MAX_SOCKS 16 struct { - __uint(type, BPF_MAP_TYPE_XSKMAP); - __uint(max_entries, MAX_SOCKS); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); + __uint(type, BPF_MAP_TYPE_XSKMAP); + __uint(max_entries, MAX_SOCKS); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); } xsks_map SEC(".maps"); static unsigned int rr; SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) { - rr = (rr + 1) & (MAX_SOCKS - 1); + rr = (rr + 1) & (MAX_SOCKS - 1); - return bpf_redirect_map(&xsks_map, rr, XDP_DROP); + return bpf_redirect_map(&xsks_map, rr, XDP_DROP); } Note, that since there is only a single set of FILL and COMPLETION @@ -379,7 +379,7 @@ would look like this for the TX path: .. code-block:: c if (xsk_ring_prod__needs_wakeup(&my_tx_ring)) - sendto(xsk_socket__fd(xsk_handle), NULL, 0, MSG_DONTWAIT, NULL, 0); + sendto(xsk_socket__fd(xsk_handle), NULL, 0, MSG_DONTWAIT, NULL, 0); I.e., only use the syscall if the flag is set. @@ -442,9 +442,9 @@ purposes. The supported statistics are shown below: .. code-block:: c struct xdp_statistics { - __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ - __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ - __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ }; XDP_OPTIONS getsockopt @@ -483,15 +483,15 @@ like this: .. 
code-block:: c // struct xdp_rxtx_ring { - // __u32 *producer; - // __u32 *consumer; - // struct xdp_desc *desc; + // __u32 *producer; + // __u32 *consumer; + // struct xdp_desc *desc; // }; // struct xdp_umem_ring { - // __u32 *producer; - // __u32 *consumer; - // __u64 *desc; + // __u32 *producer; + // __u32 *consumer; + // __u64 *desc; // }; // typedef struct xdp_rxtx_ring RING; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2a2e290fa5d8..e835164189f1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -31,7 +31,7 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) } #define EMIT(bytes, len) \ - do { prog = emit_code(prog, bytes, len); cnt += len; } while (0) + do { prog = emit_code(prog, bytes, len); } while (0) #define EMIT1(b1) EMIT(b1, 1) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) @@ -239,7 +239,6 @@ struct jit_context { static void push_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; - int cnt = 0; if (callee_regs_used[0]) EMIT1(0x53); /* push rbx */ @@ -255,7 +254,6 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used) static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; - int cnt = 0; if (callee_regs_used[3]) EMIT2(0x41, 0x5F); /* pop r15 */ @@ -277,13 +275,12 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog) { u8 *prog = *pprog; - int cnt = X86_PATCH_SIZE; /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ - memcpy(prog, x86_nops[5], cnt); - prog += cnt; + memcpy(prog, x86_nops[5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; if (!ebpf_from_cbpf) { if (tail_call_reachable && !is_subprog) EMIT2(0x31, 0xC0); /* xor eax, eax */ @@ -303,7 +300,6 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) { u8 *prog = *pprog; - int cnt = 0; s64 offset; offset = func - (ip + X86_PATCH_SIZE); @@ -423,7 +419,6 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, int off1 = 42; int off2 = 31; int off3 = 9; - int cnt = 0; /* count the additional bytes used for popping callee regs from stack * that need to be taken into account for each of the offsets that @@ -513,7 +508,6 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, int pop_bytes = 0; int off1 = 20; int poke_off; - int cnt = 0; /* count the additional bytes used for popping callee regs to stack * that need to be taken into account for jump offset that is used for @@ -615,7 +609,6 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate, { u8 *prog = *pprog; u8 b1, b2, b3; - int cnt = 0; /* * Optimization: if imm32 is positive, use 'mov %eax, imm32' @@ -655,7 +648,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, const u32 imm32_hi, const u32 imm32_lo) { u8 *prog = *pprog; - int cnt = 0; if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { /* @@ -678,7 +670,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) { u8 *prog = *pprog; - int cnt = 0; if (is64) { /* mov dst, src */ @@ -697,7 +688,6 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) { u8 *prog = *pprog; - int cnt = 0; if (is_imm8(off)) { /* 1-byte signed displacement. 
@@ -720,7 +710,6 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) { u8 *prog = *pprog; - int cnt = 0; if (is64) EMIT1(add_2mod(0x48, dst_reg, src_reg)); @@ -733,7 +722,6 @@ static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { u8 *prog = *pprog; - int cnt = 0; switch (size) { case BPF_B: @@ -764,7 +752,6 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { u8 *prog = *pprog; - int cnt = 0; switch (size) { case BPF_B: @@ -799,7 +786,6 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { u8 *prog = *pprog; - int cnt = 0; EMIT1(0xF0); /* lock prefix */ @@ -869,10 +855,10 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, } } -static int emit_nops(u8 **pprog, int len) +static void emit_nops(u8 **pprog, int len) { u8 *prog = *pprog; - int i, noplen, cnt = 0; + int i, noplen; while (len > 0) { noplen = len; @@ -886,8 +872,6 @@ static int emit_nops(u8 **pprog, int len) } *pprog = prog; - - return cnt; } #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) @@ -902,7 +886,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i, cnt = 0, excnt = 0; + int i, excnt = 0; int ilen, proglen = 0; u8 *prog = temp; int err; @@ -1297,7 +1281,7 @@ st: if (is_imm8(insn->off)) emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { struct exception_table_entry *ex; - u8 *_insn = image + proglen; + u8 *_insn = image + proglen + (start_of_ldx - temp); s64 delta; /* populate jmp_offset for JMP above */ @@ -1576,7 +1560,7 @@ st: if (is_imm8(insn->off)) nops); return -EFAULT; } - cnt += emit_nops(&prog, nops); + emit_nops(&prog, nops); } EMIT2(jmp_cond, jmp_offset); } else if (is_simm32(jmp_offset)) { @@ -1622,7 +1606,7 @@ st: if (is_imm8(insn->off)) nops); return -EFAULT; } - cnt += emit_nops(&prog, nops); + emit_nops(&prog, nops); } break; } @@ -1647,7 +1631,7 @@ st: if (is_imm8(insn->off)) nops); return -EFAULT; } - cnt += emit_nops(&prog, INSN_SZ_DIFF - 2); + emit_nops(&prog, INSN_SZ_DIFF - 2); } EMIT2(0xEB, jmp_offset); } else if (is_simm32(jmp_offset)) { @@ -1754,7 +1738,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, { u8 *prog = *pprog; u8 *jmp_insn; - int cnt = 0; /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1822,7 +1805,6 @@ static void emit_align(u8 **pprog, u32 align) static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) { u8 *prog = *pprog; - int cnt = 0; s64 offset; offset = func - (ip + 2 + 4); @@ -1854,7 +1836,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, u8 **branches) { u8 *prog = *pprog; - int i, cnt = 0; + int i; /* The first fmod_ret program will receive a garbage return value. * Set this to 0 to avoid confusing the program. 
@@ -1950,7 +1932,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_progs *tprogs, void *orig_call) { - int ret, i, cnt = 0, nr_args = m->nr_args; + int ret, i, nr_args = m->nr_args; int stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; @@ -2095,8 +2077,6 @@ static int emit_fallback_jump(u8 **pprog) */ err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog); #else - int cnt = 0; - EMIT2(0xFF, 0xE2); /* jmp rdx */ #endif *pprog = prog; @@ -2106,7 +2086,7 @@ static int emit_fallback_jump(u8 **pprog) static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) { u8 *jg_reloc, *prog = *pprog; - int pivot, err, jg_bytes = 1, cnt = 0; + int pivot, err, jg_bytes = 1; s64 jg_offset; if (a == b) { diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 3fe3edd80876..afae0afe3f81 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -326,7 +326,8 @@ int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) } if (attr->query.prog_cnt != 0 && prog_ids && cnt) - ret = bpf_prog_array_copy_to_user(progs, prog_ids, cnt); + ret = bpf_prog_array_copy_to_user(progs, prog_ids, + attr->query.prog_cnt); unlock: mutex_unlock(&ir_raw_handler_lock); diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index edaf37823c50..0e43000614ab 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -384,7 +384,6 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) struct xdp_frame *xdpf; u64 *xdp_stat; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_bpf_prog); if (!xdp_prog) @@ -441,8 +440,6 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) ena_increase_stat(xdp_stat, 1, &rx_ring->syncp); out: - rcu_read_unlock(); - return verdict; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index ec9564e584e0..bee6e091a997 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -138,9 +138,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, xdp_prepare_buff(&xdp, *data_ptr - offset, offset, *len, false); orig_data = xdp.data; - rcu_read_lock(); act = bpf_prog_run_xdp(xdp_prog, &xdp); - rcu_read_unlock(); tx_avail = bnxt_tx_avail(bp, txr); /* If the tx ring is not full, we must not update the rx producer yet diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index c33b4e837515..e2b290135fd9 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -555,9 +555,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp_prepare_buff(&xdp, hard_start, data - hard_start, len, false); orig_data = xdp.data; - rcu_read_lock(); action = bpf_prog_run_xdp(prog, &xdp); - rcu_read_unlock(); len = xdp.data_end - xdp.data; /* Check if XDP program has changed headers */ diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 177c020bf34a..e6826561cf11 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2558,13 +2558,9 @@ static u32 dpaa_run_xdp(struct dpaa_priv 
*priv, struct qm_fd *fd, void *vaddr, u32 xdp_act; int err; - rcu_read_lock(); - xdp_prog = READ_ONCE(priv->xdp_prog); - if (!xdp_prog) { - rcu_read_unlock(); + if (!xdp_prog) return XDP_PASS; - } xdp_init_buff(&xdp, DPAA_BP_RAW_SIZE - DPAA_TX_PRIV_DATA_SIZE, &dpaa_fq->xdp_rxq); @@ -2638,8 +2634,6 @@ static u32 dpaa_run_xdp(struct dpaa_priv *priv, struct qm_fd *fd, void *vaddr, break; } - rcu_read_unlock(); - return xdp_act; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 8433aa730c42..973352393bd4 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -352,8 +352,6 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, u32 xdp_act = XDP_PASS; int err, offset; - rcu_read_lock(); - xdp_prog = READ_ONCE(ch->xdp.prog); if (!xdp_prog) goto out; @@ -414,7 +412,6 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, ch->xdp.res |= xdp_act; out: - rcu_read_unlock(); return xdp_act; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index b883ab809df3..38eb8151ee9a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2298,7 +2298,6 @@ static int i40e_run_xdp(struct i40e_ring *rx_ring, struct xdp_buff *xdp) struct bpf_prog *xdp_prog; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (!xdp_prog) @@ -2334,7 +2333,6 @@ static int i40e_run_xdp(struct i40e_ring *rx_ring, struct xdp_buff *xdp) break; } xdp_out: - rcu_read_unlock(); return result; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 68f177a86403..e7e778ca074c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -153,7 +153,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) struct bpf_prog *xdp_prog; u32 act; - rcu_read_lock(); /* NB! xdp_prog will always be !NULL, due to the fact that * this path is enabled by setting an XDP program. 
*/ @@ -164,7 +163,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); if (err) goto out_failure; - rcu_read_unlock(); return I40E_XDP_REDIR; } @@ -188,7 +186,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) result = I40E_XDP_CONSUMED; break; } - rcu_read_unlock(); return result; } diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index a63d5916ebb0..6ee8e0032d52 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1140,15 +1140,11 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); #endif - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); - if (!xdp_prog) { - rcu_read_unlock(); + if (!xdp_prog) goto construct_skb; - } xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog); - rcu_read_unlock(); if (!xdp_res) goto construct_skb; if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 52acbe325db3..5a9f61deeb38 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -466,7 +466,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp) struct ice_ring *xdp_ring; u32 act; - rcu_read_lock(); /* ZC patch is enabled only when XDP program is set, * so here it can not be NULL */ @@ -478,7 +477,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp) err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); if (err) goto out_failure; - rcu_read_unlock(); return ICE_XDP_REDIR; } @@ -503,7 +501,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp) break; } - rcu_read_unlock(); return result; } diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 5db303d64d14..7e6435dc7e80 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8381,7 +8381,6 @@ static struct sk_buff *igb_run_xdp(struct igb_adapter *adapter, struct bpf_prog *xdp_prog; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (!xdp_prog) @@ -8416,7 +8415,6 @@ static struct sk_buff *igb_run_xdp(struct igb_adapter *adapter, break; } xdp_out: - rcu_read_unlock(); return ERR_PTR(-result); } diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 3f6b6d4543a8..95323095094d 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2240,18 +2240,15 @@ static struct sk_buff *igc_xdp_run_prog(struct igc_adapter *adapter, struct bpf_prog *prog; int res; - rcu_read_lock(); - prog = READ_ONCE(adapter->xdp_prog); if (!prog) { res = IGC_XDP_PASS; - goto unlock; + goto out; } res = __igc_xdp_run_prog(adapter, prog, xdp); -unlock: - rcu_read_unlock(); +out: return ERR_PTR(-res); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2ac5b82676f3..ffff69efd78a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2199,7 +2199,6 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, struct xdp_frame *xdpf; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (!xdp_prog) @@ -2237,7 +2236,6 @@ static struct sk_buff *ixgbe_run_xdp(struct 
ixgbe_adapter *adapter, break; } xdp_out: - rcu_read_unlock(); return ERR_PTR(-result); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index f72d2978263b..96dd1a4f956a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -100,7 +100,6 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, struct xdp_frame *xdpf; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); act = bpf_prog_run_xdp(xdp_prog, xdp); @@ -108,7 +107,6 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); if (err) goto out_failure; - rcu_read_unlock(); return IXGBE_XDP_REDIR; } @@ -134,7 +132,6 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, result = IXGBE_XDP_CONSUMED; break; } - rcu_read_unlock(); return result; } diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index dc56931fc1dc..c714e1ecd308 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1054,7 +1054,6 @@ static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, struct bpf_prog *xdp_prog; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (!xdp_prog) @@ -1082,7 +1081,6 @@ static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, break; } xdp_out: - rcu_read_unlock(); return ERR_PTR(-result); } diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 88a755034c39..361bc4fbe20b 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2369,7 +2369,6 @@ static int mvneta_rx_swbm(struct napi_struct *napi, /* Get number of received packets */ rx_todo = mvneta_rxq_busy_desc_num_get(pp, rxq); - rcu_read_lock(); xdp_prog = READ_ONCE(pp->xdp_prog); /* Fairness NAPI loop */ @@ -2447,7 +2446,6 @@ static int mvneta_rx_swbm(struct napi_struct *napi, xdp_buf.data_hard_start = NULL; sinfo.nr_frags = 0; } - rcu_read_unlock(); if (xdp_buf.data_hard_start) mvneta_xdp_put_buff(pp, rxq, &xdp_buf, &sinfo, -1); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 3135220a8942..3229bafa2a2c 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3877,8 +3877,6 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, int rx_done = 0; u32 xdp_ret = 0; - rcu_read_lock(); - xdp_prog = READ_ONCE(port->xdp_prog); /* Get number of received packets and clamp the to-do */ @@ -4024,8 +4022,6 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, mvpp2_bm_pool_put(port, pool, dma_addr, phys_addr); } - rcu_read_unlock(); - if (xdp_ret & MVPP2_XDP_REDIR) xdp_do_flush_map(); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index cea62b8f554c..442991d91c15 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -679,9 +679,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud ring = priv->rx_ring[cq_ring]; - /* Protect accesses to: ring->xdp_prog, priv->mac_hash list */ - rcu_read_lock(); - xdp_prog = rcu_dereference(ring->xdp_prog); + xdp_prog = rcu_dereference_bh(ring->xdp_prog); xdp_init_buff(&xdp, 
priv->frag_info[0].frag_stride, &ring->xdp_rxq); doorbell_pending = false; @@ -744,7 +742,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud /* Drop the packet, since HW loopback-ed it */ mac_hash = ethh->h_source[MLX4_EN_MAC_HASH_IDX]; bucket = &priv->mac_hash[mac_hash]; - hlist_for_each_entry_rcu(entry, bucket, hlist) { + hlist_for_each_entry_rcu_bh(entry, bucket, hlist) { if (ether_addr_equal_64bits(entry->mac, ethh->h_source)) goto next; @@ -899,8 +897,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud break; } - rcu_read_unlock(); - if (likely(polled)) { if (doorbell_pending) { priv->tx_cq[TX_XDP][cq_ring]->xdp_busy = true; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index eeb30680b4dc..5dfa4799c34f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1819,7 +1819,6 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) struct xdp_buff xdp; int idx; - rcu_read_lock(); xdp_prog = READ_ONCE(dp->xdp_prog); true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz; xdp_init_buff(&xdp, PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM, @@ -2036,7 +2035,6 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) if (!nfp_net_xdp_complete(tx_ring)) pkts_polled = budget; } - rcu_read_unlock(); return pkts_polled; } diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 8e150dd4f899..065e9004598e 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1089,13 +1089,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp_prepare_buff(&xdp, page_address(bd->data), *data_offset, *len, false); - /* Queues always have a full reset currently, so for the time - * being until there's atomic program replace just mark read - * side for map helpers. - */ - rcu_read_lock(); act = bpf_prog_run_xdp(prog, &xdp); - rcu_read_unlock(); /* Recalculate, as XDP might have changed the headers */ *data_offset = xdp.data - xdp.data_hard_start; diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 17b8119c48e5..606750938b89 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -260,18 +260,14 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, s16 offset; int err; - rcu_read_lock(); - xdp_prog = rcu_dereference(efx->xdp_prog); - if (!xdp_prog) { - rcu_read_unlock(); + xdp_prog = rcu_dereference_bh(efx->xdp_prog); + if (!xdp_prog) return true; - } rx_queue = efx_channel_get_rx_queue(channel); if (unlikely(channel->rx_pkt_n_frags > 1)) { /* We can't do XDP on fragmented packets - drop. 
*/ - rcu_read_unlock(); efx_free_rx_buffers(rx_queue, rx_buf, channel->rx_pkt_n_frags); if (net_ratelimit()) @@ -296,7 +292,6 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, rx_buf->len, false); xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); - rcu_read_unlock(); offset = (u8 *)xdp.data - *ehp; diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index dfc85cc68173..20d148c019d8 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -958,7 +958,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) xdp_init_buff(&xdp, PAGE_SIZE, &dring->xdp_rxq); - rcu_read_lock(); xdp_prog = READ_ONCE(priv->xdp_prog); dma_dir = page_pool_get_dma_dir(dring->page_pool); @@ -1069,8 +1068,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) } netsec_finalize_xdp_rx(priv, xdp_act, xdp_xmit); - rcu_read_unlock(); - return done; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 16820873b01d..219535ab2c0c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4651,7 +4651,6 @@ static int stmmac_xdp_xmit_back(struct stmmac_priv *priv, return res; } -/* This function assumes rcu_read_lock() is held by the caller. */ static int __stmmac_xdp_run_prog(struct stmmac_priv *priv, struct bpf_prog *prog, struct xdp_buff *xdp) @@ -4693,17 +4692,14 @@ static struct sk_buff *stmmac_xdp_run_prog(struct stmmac_priv *priv, struct bpf_prog *prog; int res; - rcu_read_lock(); - prog = READ_ONCE(priv->xdp_prog); if (!prog) { res = STMMAC_XDP_PASS; - goto unlock; + goto out; } res = __stmmac_xdp_run_prog(priv, prog, xdp); -unlock: - rcu_read_unlock(); +out: return ERR_PTR(-res); } @@ -4973,10 +4969,8 @@ static int stmmac_rx_zc(struct stmmac_priv *priv, int limit, u32 queue) buf->xdp->data_end = buf->xdp->data + buf1_len; xsk_buff_dma_sync_for_cpu(buf->xdp, rx_q->xsk_pool); - rcu_read_lock(); prog = READ_ONCE(priv->xdp_prog); res = __stmmac_xdp_run_prog(priv, prog, buf->xdp); - rcu_read_unlock(); switch (res) { case STMMAC_XDP_PASS: diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 5862f0a4a975..ecc2a6b7e28f 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1328,13 +1328,9 @@ int cpsw_run_xdp(struct cpsw_priv *priv, int ch, struct xdp_buff *xdp, struct bpf_prog *prog; u32 act; - rcu_read_lock(); - prog = READ_ONCE(priv->xdp_prog); - if (!prog) { - ret = CPSW_XDP_PASS; - goto out; - } + if (!prog) + return CPSW_XDP_PASS; act = bpf_prog_run_xdp(prog, xdp); /* XDP prog might have changed packet data and boundaries */ @@ -1378,10 +1374,8 @@ int cpsw_run_xdp(struct cpsw_priv *priv, int ch, struct xdp_buff *xdp, ndev->stats.rx_bytes += *len; ndev->stats.rx_packets++; out: - rcu_read_unlock(); return ret; drop: - rcu_read_unlock(); page_pool_recycle_direct(cpsw->page_pool[ch], page); return ret; } diff --git a/include/linux/filter.h b/include/linux/filter.h index 688856e0b28a..472f97074da0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -763,11 +763,9 @@ DECLARE_BPF_DISPATCHER(xdp) static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { - /* Caller needs to hold rcu_read_lock() (!), otherwise program - * can be released while still running, or map elements could be - * freed early while still having concurrent 
users. XDP fastpath - * already takes rcu_read_lock() when fetching the program, so - * it's not necessary here anymore. + /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus + * under local_bh_disable(), which provides the needed RCU protection + * for accessing map entries. */ return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9455476c5ba2..d7895b81264e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -363,6 +363,20 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ +/** + * unrcu_pointer - mark a pointer as not being RCU protected + * @p: pointer needing to lose its __rcu property + * + * Converts @p from an __rcu pointer to a __kernel pointer. + * This allows an __rcu pointer to be used with xchg() and friends. + */ +#define unrcu_pointer(p) \ +({ \ + typeof(*p) *_________p1 = (typeof(*p) *__force)(p); \ + rcu_check_sparse(p, __rcu); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ +}) + #define __rcu_access_pointer(p, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9c0722c6d7ac..fff069d2ed1b 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -37,7 +37,7 @@ struct xdp_umem { struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ - struct xdp_sock *xsk_map[]; + struct xdp_sock __rcu *xsk_map[]; }; struct xdp_sock { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a1a0c4e791c6..480e936c54d0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -74,7 +74,7 @@ struct bpf_cpu_map_entry { struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ - struct bpf_cpu_map_entry **cpu_map; + struct bpf_cpu_map_entry __rcu **cpu_map; }; static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); @@ -469,7 +469,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, { struct bpf_cpu_map_entry *old_rcpu; - old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); + old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); @@ -551,7 +551,7 @@ static void cpu_map_free(struct bpf_map *map) for (i = 0; i < cmap->map.max_entries; i++) { struct bpf_cpu_map_entry *rcpu; - rcpu = READ_ONCE(cmap->cpu_map[i]); + rcpu = rcu_dereference_raw(cmap->cpu_map[i]); if (!rcpu) continue; @@ -562,6 +562,10 @@ static void cpu_map_free(struct bpf_map *map) kfree(cmap); } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. 
+ */ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); @@ -570,7 +574,8 @@ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - rcpu = READ_ONCE(cmap->cpu_map[key]); + rcpu = rcu_dereference_check(cmap->cpu_map[key], + rcu_read_lock_bh_held()); return rcpu; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2a75e6c2d27d..2f6bd75cd682 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -73,7 +73,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; - struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ + struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -226,7 +226,7 @@ static void dev_map_free(struct bpf_map *map) for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; - dev = dtab->netdev_map[i]; + dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; @@ -259,6 +259,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. + */ static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); @@ -410,15 +414,9 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } -/* __dev_flush is called from xdp_do_flush() which _must_ be signaled - * from the driver before returning from its napi->poll() routine. The poll() - * routine is called either from busy_poll context or net_rx_action signaled - * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the flush list - * is empty before completing to ensure all flush operations have completed. - * When drivers update the bpf program they may need to ensure any flush ops - * are also complete. Using synchronize_rcu or call_rcu will suffice for this - * because both wait for napi context to exit. +/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the + * driver before returning from its napi->poll() routine. See the comment above + * xdp_do_flush() in filter.c. */ void __dev_flush(void) { @@ -433,9 +431,9 @@ void __dev_flush(void) } } -/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or - * update happens in parallel here a dev_put won't happen until after reading - * the ifindex. +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -445,12 +443,14 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - obj = READ_ONCE(dtab->netdev_map[key]); + obj = rcu_dereference_check(dtab->netdev_map[key], + rcu_read_lock_bh_held()); return obj; } -/* Runs under RCU-read-side, plus in softirq under NAPI protection. - * Thus, safe percpu variable access. +/* Runs in NAPI, i.e., softirq under local_bh_disable(). 
Thus, safe percpu + * variable access, and map elements stick around. See comment above + * xdp_do_flush() in filter.c. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) @@ -735,14 +735,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - /* Use call_rcu() here to ensure any rcu critical sections have - * completed as well as any flush operations because call_rcu - * will wait for preempt-disable region to complete, NAPI in this - * context. And additionally, the driver tear down ensures all - * soft irqs are complete before removing the net device in the - * case of dev_put equals zero. - */ - old_dev = xchg(&dtab->netdev_map[k], NULL); + old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; @@ -851,7 +844,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, * Remembering the driver side flush operation will happen before the * net device is removed. */ - old_dev = xchg(&dtab->netdev_map[i], dev); + old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); @@ -1031,10 +1024,10 @@ static int dev_map_notification(struct notifier_block *notifier, for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; - dev = READ_ONCE(dtab->netdev_map[i]); + dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; - odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); + odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) call_rcu(&dev->rcu, __dev_map_entry_free); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 6f6681b07364..72c58cc516a3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -596,7 +596,8 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -989,7 +990,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -1082,7 +1084,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -1148,7 +1151,8 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -1202,7 +1206,8 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); 
key_size = map->key_size; @@ -1276,7 +1281,8 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -1311,7 +1317,8 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a2f1f15ce432..62cf00383910 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -29,7 +29,7 @@ */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -45,7 +45,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -62,7 +62,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); return map->ops->map_delete_elem(map, key); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1b7b8a6f34ee..423549d2c52e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -232,7 +232,8 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) /* Start walking the trie from the root node ... */ - for (node = rcu_dereference(trie->root); node;) { + for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); + node;) { unsigned int next_bit; size_t matchlen; @@ -264,7 +265,8 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) * traverse down. 
*/ next_bit = extract_bit(key->data, node->prefixlen); - node = rcu_dereference(node->child[next_bit]); + node = rcu_dereference_check(node->child[next_bit], + rcu_read_lock_bh_held()); } if (!found) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 84b3b35fc0d0..9e0c10c6892a 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) @@ -105,6 +106,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, VM_ALLOC | VM_USERMAP, PAGE_KERNEL); if (rb) { + kmemleak_not_leak(pages); rb->pages = pages; rb->nr_pages = nr_pages; return rb; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7a52bc172841..64bd2d84367f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1017,6 +1017,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c index 05e1cfc1e5cd..291a92546246 100644 --- a/net/bpfilter/main.c +++ b/net/bpfilter/main.c @@ -57,7 +57,7 @@ int main(void) { debug_f = fopen("/dev/kmsg", "w"); setvbuf(debug_f, 0, _IOLBF, 0); - fprintf(debug_f, "Started bpfilter\n"); + fprintf(debug_f, "<5>Started bpfilter\n"); loop(); fclose(debug_f); return 0; diff --git a/net/core/filter.c b/net/core/filter.c index 0b13d8157a8f..d22895caa164 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3235,15 +3235,12 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) return ret; } -static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) +static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; - ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -3255,21 +3252,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* SKB_GSO_TCPV4 needs to be changed into - * SKB_GSO_TCPV6. - */ + /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } - - /* Due to IPv6 header, MSS needs to be downgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_decrease_gso_size(shinfo, len_diff); - - /* Header must be checked, and gso_segs recomputed. 
*/ - shinfo->gso_type |= SKB_GSO_DODGY; - shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IPV6); @@ -3278,15 +3265,12 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) return 0; } -static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) +static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; - ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -3298,21 +3282,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* SKB_GSO_TCPV6 needs to be changed into - * SKB_GSO_TCPV4. - */ + /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } - - /* Due to IPv4 header, MSS can be upgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_increase_gso_size(shinfo, len_diff); - - /* Header must be checked, and gso_segs recomputed. */ - shinfo->gso_type |= SKB_GSO_DODGY; - shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IP); @@ -3321,17 +3295,17 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) return 0; } -static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto, u64 flags) +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) - return bpf_skb_proto_4_to_6(skb, flags); + return bpf_skb_proto_4_to_6(skb); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) - return bpf_skb_proto_6_to_4(skb, flags); + return bpf_skb_proto_6_to_4(skb); return -ENOTSUPP; } @@ -3341,7 +3315,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, { int ret; - if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO))) + if (unlikely(flags)) return -EINVAL; /* General idea is that this helper does the basic groundwork @@ -3361,7 +3335,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ - ret = bpf_skb_proto_xlat(skb, proto, flags); + ret = bpf_skb_proto_xlat(skb, proto); bpf_compute_data_pointers(skb); return ret; } @@ -3923,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; +/* XDP_REDIRECT works by a three-step process, implemented in the functions + * below: + * + * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target + * of the redirect and store it (along with some other metadata) in a per-CPU + * struct bpf_redirect_info. + * + * 2. When the program returns the XDP_REDIRECT return code, the driver will + * call xdp_do_redirect() which will use the information in struct + * bpf_redirect_info to actually enqueue the frame into a map type-specific + * bulk queue structure. + * + * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), + * which will flush all the different bulk queues, thus completing the + * redirect. + * + * Pointers to the map entries will be kept around for this whole sequence of + * steps, protected by RCU. 
However, there is no top-level rcu_read_lock() in + * the core code; instead, the RCU protection relies on everything happening + * inside a single NAPI poll sequence, which means it's between a pair of calls + * to local_bh_disable()/local_bh_enable(). + * + * The map entries are marked as __rcu and the map code makes sure to + * dereference those pointers with rcu_dereference_check() in a way that works + * for both sections that hold an rcu_read_lock() and sections that are + * called from NAPI without a separate rcu_read_lock(). The code below does not + * use RCU annotations, but relies on those in the map code. + */ void xdp_do_flush(void) { __dev_flush(); diff --git a/net/core/xdp.c b/net/core/xdp.c index 725d20f1b100..cc92ccb38432 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -113,8 +113,13 @@ static void mem_allocator_disconnect(void *allocator) void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; + int type = xdp_rxq->mem.type; int id = xdp_rxq->mem.id; + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { WARN(1, "Missing register, driver bug"); return; } @@ -123,7 +128,7 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { + if (type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); page_pool_destroy(xa->page_pool); @@ -144,10 +149,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; - - /* Reset mem info to defaults */ - xdp_rxq->mem.id = 0; - xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index e48e980c3b93..e409a0005717 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -43,7 +43,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, tcf_lastuse_update(&prog->tcf_tm); bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); - rcu_read_lock(); filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); @@ -56,7 +55,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, } if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); - rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. * Similarly as in cls_bpf, if filter_res == -1 we use the diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6e3e63db0e01..fa739efa59f4 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -85,8 +85,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_bpf_prog *prog; int ret = -1; - /* Needed here for accessing maps.
*/ - rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { int filter_res; @@ -131,7 +129,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, break; } - rcu_read_unlock(); return ret; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cd62d4ba87a9..996da915f520 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -749,7 +749,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs) } static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, - struct xdp_sock ***map_entry) + struct xdp_sock __rcu ***map_entry) { struct xsk_map *map = NULL; struct xsk_map_node *node; @@ -785,7 +785,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs) * might be updates to the map between * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). */ - struct xdp_sock **map_entry = NULL; + struct xdp_sock __rcu **map_entry = NULL; struct xsk_map *map; while ((map = xsk_get_map_list_entry(xs, &map_entry))) { diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index edcf249ad1f1..a4bc4749faac 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -31,7 +31,7 @@ struct xdp_mmap_offsets_v1 { struct xsk_map_node { struct list_head node; struct xsk_map *map; - struct xdp_sock **map_entry; + struct xdp_sock __rcu **map_entry; }; static inline struct xdp_sock *xdp_sk(struct sock *sk) @@ -40,7 +40,7 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry); + struct xdp_sock __rcu **map_entry); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id); diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 9df75ea4a567..2e48d0e094d9 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -12,7 +12,7 @@ #include "xsk.h" static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { struct xsk_map_node *node; @@ -42,7 +42,7 @@ static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) } static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { struct xsk_map_node *n, *tmp; @@ -124,6 +124,10 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. 
+ */ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) { struct xsk_map *m = container_of(map, struct xsk_map, map); @@ -131,12 +135,11 @@ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - return READ_ONCE(m->xsk_map[key]); + return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held()); } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { - WARN_ON_ONCE(!rcu_read_lock_held()); return __xsk_map_lookup_elem(map, *(u32 *)key); } @@ -149,7 +152,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; + struct xdp_sock __rcu **map_entry; + struct xdp_sock *xs, *old_xs; u32 i = *(u32 *)key, fd = *(u32 *)value; struct xsk_map_node *node; struct socket *sock; @@ -179,7 +183,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, } spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); + old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock)); if (old_xs == xs) { err = 0; goto out; @@ -191,7 +195,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, goto out; } xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); + rcu_assign_pointer(*map_entry, xs); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); @@ -208,7 +212,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, static int xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; + struct xdp_sock __rcu **map_entry; + struct xdp_sock *old_xs; int k = *(u32 *)key; if (k >= map->max_entries) @@ -216,7 +221,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) spin_lock_bh(&m->lock); map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); + old_xs = unrcu_pointer(xchg(map_entry, NULL)); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); @@ -231,11 +236,11 @@ static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); + if (rcu_access_pointer(*map_entry) == xs) { + rcu_assign_pointer(*map_entry, NULL); xsk_map_sock_delete(xs, map_entry); } spin_unlock_bh(&map->lock); diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 41d705c3a1f7..93854e135134 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -130,7 +130,7 @@ int main(int argc, char **argv) if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) xdp_flags |= XDP_FLAGS_DRV_MODE; - if (optind == argc) { + if (optind + 2 != argc) { printf("usage: %s _IN _OUT\n", argv[0]); return 1; } @@ -213,5 +213,5 @@ int main(int argc, char **argv) poll_stats(2, ifindex_out); out: - return 0; + return ret; } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 48c0ade05ab1..1e04ce724240 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4000,6 +4000,10 @@ bpf_object__probe_loading(struct bpf_object *obj) attr.license = "GPL"; ret = bpf_load_program_xattr(&attr, NULL, 0); + if (ret < 0) { + attr.prog_type = BPF_PROG_TYPE_TRACEPOINT; + ret = bpf_load_program_xattr(&attr, 
NULL, 0); + } if (ret < 0) { ret = errno; cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index cf9381f03b16..39f25e09b51e 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -154,7 +154,7 @@ static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq, return ret; } -static int libbpf_netlink_send_recv(struct nlmsghdr *nh, +static int libbpf_netlink_send_recv(struct libbpf_nla_req *req, __dump_nlmsg_t parse_msg, libbpf_dump_nlmsg_t parse_attr, void *cookie) @@ -166,15 +166,15 @@ static int libbpf_netlink_send_recv(struct nlmsghdr *nh, if (sock < 0) return sock; - nh->nlmsg_pid = 0; - nh->nlmsg_seq = time(NULL); + req->nh.nlmsg_pid = 0; + req->nh.nlmsg_seq = time(NULL); - if (send(sock, nh, nh->nlmsg_len, 0) < 0) { + if (send(sock, req, req->nh.nlmsg_len, 0) < 0) { ret = -errno; goto out; } - ret = libbpf_netlink_recv(sock, nl_pid, nh->nlmsg_seq, + ret = libbpf_netlink_recv(sock, nl_pid, req->nh.nlmsg_seq, parse_msg, parse_attr, cookie); out: libbpf_netlink_close(sock); @@ -186,11 +186,7 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, { struct nlattr *nla; int ret; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; + struct libbpf_nla_req req; memset(&req, 0, sizeof(req)); req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); @@ -199,27 +195,26 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, req.ifinfo.ifi_family = AF_UNSPEC; req.ifinfo.ifi_index = ifindex; - nla = nlattr_begin_nested(&req.nh, sizeof(req), IFLA_XDP); + nla = nlattr_begin_nested(&req, IFLA_XDP); if (!nla) return -EMSGSIZE; - ret = nlattr_add(&req.nh, sizeof(req), IFLA_XDP_FD, &fd, sizeof(fd)); + ret = nlattr_add(&req, IFLA_XDP_FD, &fd, sizeof(fd)); if (ret < 0) return ret; if (flags) { - ret = nlattr_add(&req.nh, sizeof(req), IFLA_XDP_FLAGS, &flags, - sizeof(flags)); + ret = nlattr_add(&req, IFLA_XDP_FLAGS, &flags, sizeof(flags)); if (ret < 0) return ret; } if (flags & XDP_FLAGS_REPLACE) { - ret = nlattr_add(&req.nh, sizeof(req), IFLA_XDP_EXPECTED_FD, - &old_fd, sizeof(old_fd)); + ret = nlattr_add(&req, IFLA_XDP_EXPECTED_FD, &old_fd, + sizeof(old_fd)); if (ret < 0) return ret; } - nlattr_end_nested(&req.nh, nla); + nlattr_end_nested(&req, nla); - return libbpf_netlink_send_recv(&req.nh, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); } int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, @@ -314,14 +309,11 @@ int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, struct xdp_id_md xdp_id = {}; __u32 mask; int ret; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifm; - } req = { - .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), - .nh.nlmsg_type = RTM_GETLINK, - .nh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, - .ifm.ifi_family = AF_PACKET, + struct libbpf_nla_req req = { + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nh.nlmsg_type = RTM_GETLINK, + .nh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .ifinfo.ifi_family = AF_PACKET, }; if (flags & ~XDP_FLAGS_MASK || !info_size) @@ -336,7 +328,7 @@ int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, xdp_id.ifindex = ifindex; xdp_id.flags = flags; - ret = libbpf_netlink_send_recv(&req.nh, __dump_link_nlmsg, + ret = libbpf_netlink_send_recv(&req, __dump_link_nlmsg, get_xdp_info, &xdp_id); if (!ret) { size_t sz = min(info_size, sizeof(xdp_id.info)); @@ -376,15 +368,14 @@ int bpf_get_link_xdp_id(int ifindex, __u32 
*prog_id, __u32 flags) return libbpf_err(ret); } -typedef int (*qdisc_config_t)(struct nlmsghdr *nh, struct tcmsg *t, - size_t maxsz); +typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); -static int clsact_config(struct nlmsghdr *nh, struct tcmsg *t, size_t maxsz) +static int clsact_config(struct libbpf_nla_req *req) { - t->tcm_parent = TC_H_CLSACT; - t->tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); + req->tc.tcm_parent = TC_H_CLSACT; + req->tc.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); - return nlattr_add(nh, maxsz, TCA_KIND, "clsact", sizeof("clsact")); + return nlattr_add(req, TCA_KIND, "clsact", sizeof("clsact")); } static int attach_point_to_config(struct bpf_tc_hook *hook, @@ -431,11 +422,7 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) { qdisc_config_t config; int ret; - struct { - struct nlmsghdr nh; - struct tcmsg tc; - char buf[256]; - } req; + struct libbpf_nla_req req; ret = attach_point_to_config(hook, &config); if (ret < 0) @@ -448,11 +435,11 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) req.tc.tcm_family = AF_UNSPEC; req.tc.tcm_ifindex = OPTS_GET(hook, ifindex, 0); - ret = config(&req.nh, &req.tc, sizeof(req)); + ret = config(&req); if (ret < 0) return ret; - return libbpf_netlink_send_recv(&req.nh, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); } static int tc_qdisc_create_excl(struct bpf_tc_hook *hook) @@ -537,14 +524,14 @@ static int get_tc_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, struct nlattr *tb[TCA_MAX + 1]; libbpf_nla_parse(tb, TCA_MAX, - (struct nlattr *)((char *)tc + NLMSG_ALIGN(sizeof(*tc))), + (struct nlattr *)((void *)tc + NLMSG_ALIGN(sizeof(*tc))), NLMSG_PAYLOAD(nh, sizeof(*tc)), NULL); if (!tb[TCA_KIND]) return NL_CONT; return __get_tc_info(cookie, tc, tb, nh->nlmsg_flags & NLM_F_ECHO); } -static int tc_add_fd_and_name(struct nlmsghdr *nh, size_t maxsz, int fd) +static int tc_add_fd_and_name(struct libbpf_nla_req *req, int fd) { struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); @@ -555,7 +542,7 @@ static int tc_add_fd_and_name(struct nlmsghdr *nh, size_t maxsz, int fd) if (ret < 0) return ret; - ret = nlattr_add(nh, maxsz, TCA_BPF_FD, &fd, sizeof(fd)); + ret = nlattr_add(req, TCA_BPF_FD, &fd, sizeof(fd)); if (ret < 0) return ret; len = snprintf(name, sizeof(name), "%s:[%u]", info.name, info.id); @@ -563,7 +550,7 @@ static int tc_add_fd_and_name(struct nlmsghdr *nh, size_t maxsz, int fd) return -errno; if (len >= sizeof(name)) return -ENAMETOOLONG; - return nlattr_add(nh, maxsz, TCA_BPF_NAME, name, len + 1); + return nlattr_add(req, TCA_BPF_NAME, name, len + 1); } int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) @@ -571,12 +558,8 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) __u32 protocol, bpf_flags, handle, priority, parent, prog_id, flags; int ret, ifindex, attach_point, prog_fd; struct bpf_cb_ctx info = {}; + struct libbpf_nla_req req; struct nlattr *nla; - struct { - struct nlmsghdr nh; - struct tcmsg tc; - char buf[256]; - } req; if (!hook || !opts || !OPTS_VALID(hook, bpf_tc_hook) || @@ -618,25 +601,24 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) return libbpf_err(ret); req.tc.tcm_parent = parent; - ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf")); + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); if (ret < 0) return libbpf_err(ret); - nla = nlattr_begin_nested(&req.nh, sizeof(req), TCA_OPTIONS); + nla = 
nlattr_begin_nested(&req, TCA_OPTIONS); if (!nla) return libbpf_err(-EMSGSIZE); - ret = tc_add_fd_and_name(&req.nh, sizeof(req), prog_fd); + ret = tc_add_fd_and_name(&req, prog_fd); if (ret < 0) return libbpf_err(ret); bpf_flags = TCA_BPF_FLAG_ACT_DIRECT; - ret = nlattr_add(&req.nh, sizeof(req), TCA_BPF_FLAGS, &bpf_flags, - sizeof(bpf_flags)); + ret = nlattr_add(&req, TCA_BPF_FLAGS, &bpf_flags, sizeof(bpf_flags)); if (ret < 0) return libbpf_err(ret); - nlattr_end_nested(&req.nh, nla); + nlattr_end_nested(&req, nla); info.opts = opts; - ret = libbpf_netlink_send_recv(&req.nh, get_tc_info, NULL, &info); + ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); if (ret < 0) return libbpf_err(ret); if (!info.processed) @@ -650,11 +632,7 @@ static int __bpf_tc_detach(const struct bpf_tc_hook *hook, { __u32 protocol = 0, handle, priority, parent, prog_id, flags; int ret, ifindex, attach_point, prog_fd; - struct { - struct nlmsghdr nh; - struct tcmsg tc; - char buf[256]; - } req; + struct libbpf_nla_req req; if (!hook || !OPTS_VALID(hook, bpf_tc_hook) || @@ -701,13 +679,12 @@ static int __bpf_tc_detach(const struct bpf_tc_hook *hook, req.tc.tcm_parent = parent; if (!flush) { - ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, - "bpf", sizeof("bpf")); + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); if (ret < 0) return ret; } - return libbpf_netlink_send_recv(&req.nh, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); } int bpf_tc_detach(const struct bpf_tc_hook *hook, @@ -727,11 +704,7 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) __u32 protocol, handle, priority, parent, prog_id, flags; int ret, ifindex, attach_point, prog_fd; struct bpf_cb_ctx info = {}; - struct { - struct nlmsghdr nh; - struct tcmsg tc; - char buf[256]; - } req; + struct libbpf_nla_req req; if (!hook || !opts || !OPTS_VALID(hook, bpf_tc_hook) || @@ -770,13 +743,13 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) return libbpf_err(ret); req.tc.tcm_parent = parent; - ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf")); + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); if (ret < 0) return libbpf_err(ret); info.opts = opts; - ret = libbpf_netlink_send_recv(&req.nh, get_tc_info, NULL, &info); + ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); if (ret < 0) return libbpf_err(ret); if (!info.processed) diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c index b607fa9852b1..f57e77a6e40f 100644 --- a/tools/lib/bpf/nlattr.c +++ b/tools/lib/bpf/nlattr.c @@ -27,7 +27,7 @@ static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) int totlen = NLA_ALIGN(nla->nla_len); *remaining -= totlen; - return (struct nlattr *) ((char *) nla + totlen); + return (struct nlattr *)((void *)nla + totlen); } static int nla_ok(const struct nlattr *nla, int remaining) diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h index 3c780ab6d022..4d15ae2ff812 100644 --- a/tools/lib/bpf/nlattr.h +++ b/tools/lib/bpf/nlattr.h @@ -13,6 +13,7 @@ #include #include #include +#include /* avoid multiple definition of netlink features */ #define __LINUX_NETLINK_H @@ -52,6 +53,15 @@ struct libbpf_nla_policy { uint16_t maxlen; }; +struct libbpf_nla_req { + struct nlmsghdr nh; + union { + struct ifinfomsg ifinfo; + struct tcmsg tc; + }; + char buf[128]; +}; + /** * @ingroup attr * Iterate over a stream of attributes @@ -71,7 +81,7 @@ struct libbpf_nla_policy { */ static inline void *libbpf_nla_data(const 
struct nlattr *nla) { - return (char *) nla + NLA_HDRLEN; + return (void *)nla + NLA_HDRLEN; } static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla) @@ -108,47 +118,47 @@ int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh); static inline struct nlattr *nla_data(struct nlattr *nla) { - return (struct nlattr *)((char *)nla + NLA_HDRLEN); + return (struct nlattr *)((void *)nla + NLA_HDRLEN); } -static inline struct nlattr *nh_tail(struct nlmsghdr *nh) +static inline struct nlattr *req_tail(struct libbpf_nla_req *req) { - return (struct nlattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len)); + return (struct nlattr *)((void *)req + NLMSG_ALIGN(req->nh.nlmsg_len)); } -static inline int nlattr_add(struct nlmsghdr *nh, size_t maxsz, int type, +static inline int nlattr_add(struct libbpf_nla_req *req, int type, const void *data, int len) { struct nlattr *nla; - if (NLMSG_ALIGN(nh->nlmsg_len) + NLA_ALIGN(NLA_HDRLEN + len) > maxsz) + if (NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(NLA_HDRLEN + len) > sizeof(*req)) return -EMSGSIZE; if (!!data != !!len) return -EINVAL; - nla = nh_tail(nh); + nla = req_tail(req); nla->nla_type = type; nla->nla_len = NLA_HDRLEN + len; if (data) memcpy(nla_data(nla), data, len); - nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + NLA_ALIGN(nla->nla_len); + req->nh.nlmsg_len = NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(nla->nla_len); return 0; } -static inline struct nlattr *nlattr_begin_nested(struct nlmsghdr *nh, - size_t maxsz, int type) +static inline struct nlattr *nlattr_begin_nested(struct libbpf_nla_req *req, int type) { struct nlattr *tail; - tail = nh_tail(nh); - if (nlattr_add(nh, maxsz, type | NLA_F_NESTED, NULL, 0)) + tail = req_tail(req); + if (nlattr_add(req, type | NLA_F_NESTED, NULL, 0)) return NULL; return tail; } -static inline void nlattr_end_nested(struct nlmsghdr *nh, struct nlattr *tail) +static inline void nlattr_end_nested(struct libbpf_nla_req *req, + struct nlattr *tail) { - tail->nla_len = (char *)nh_tail(nh) - (char *)tail; + tail->nla_len = (void *)req_tail(req) - (void *)tail; } #endif /* __LIBBPF_NLATTR_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index a01788090c31..4706cee84360 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -100,7 +100,7 @@ void test_ringbuf(void) if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) goto cleanup; - rb_fd = bpf_map__fd(skel->maps.ringbuf); + rb_fd = skel->maps.ringbuf.map_fd; /* good read/write cons_pos */ mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rb_fd, 0); ASSERT_OK_PTR(mmap_ptr, "rw_cons_pos");
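
A note on the bpf_skb_change_proto() change above: since the helper no longer accepts BPF_F_ADJ_ROOM_FIXED_GSO, callers must now pass a zero flags argument, and they remain responsible for writing the translated L3 header themselves. The fragment below is a hypothetical TC classifier that illustrates only the calling convention; it is a sketch, not code from this series, and the actual header rewrite is deliberately elided.

/* Hypothetical SEC("classifier") program: shows the updated calling
 * convention of bpf_skb_change_proto() (flags must be 0). A real program
 * must also fill in the new IPv6 header, e.g. via bpf_skb_store_bytes();
 * that part is omitted here.
 */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("classifier")
int xlat_4_to_6(struct __sk_buff *skb)
{
	if (skb->protocol != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	/* Makes room for the larger IPv6 header and flips skb->protocol;
	 * any non-zero flags value is now rejected with -EINVAL.
	 */
	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
		return TC_ACT_SHOT;

	/* ... build the new IPv6 header here ... */

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";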
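
The locking convention that the XDP and map changes above converge on can be summarized with a small, self-contained sketch. Everything below (struct my_map, my_map_lookup(), my_map_update()) is hypothetical and exists only to illustrate the pattern: readers may run either under rcu_read_lock() (syscall path) or with only local_bh_disable() held (NAPI/XDP path), so lookups use rcu_dereference_check() with rcu_read_lock_bh_held(), while updaters publish under the map lock with rcu_assign_pointer() and defer freeing to a grace period.

/* Hedged sketch of the RCU pattern used for XDP map entries; the map
 * structure and helpers are made up for illustration.
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct my_entry {
	struct rcu_head rcu;
	u32 val;
};

struct my_map {
	spinlock_t lock;
	struct my_entry __rcu *slots[64];
};

/* Lookup: may be called under rcu_read_lock() (from a syscall) or with
 * only local_bh_disable() held (from XDP inside NAPI). The
 * rcu_read_lock_bh_held() condition makes lockdep accept both.
 */
static struct my_entry *my_map_lookup(struct my_map *m, u32 key)
{
	if (key >= ARRAY_SIZE(m->slots))
		return NULL;
	return rcu_dereference_check(m->slots[key], rcu_read_lock_bh_held());
}

/* Update: publish the new entry with rcu_assign_pointer() under the map
 * lock, and free the old one only after a grace period.
 */
static int my_map_update(struct my_map *m, u32 key, struct my_entry *new)
{
	struct my_entry *old;

	if (key >= ARRAY_SIZE(m->slots))
		return -EINVAL;

	spin_lock_bh(&m->lock);
	old = rcu_dereference_protected(m->slots[key],
					lockdep_is_held(&m->lock));
	rcu_assign_pointer(m->slots[key], new);
	spin_unlock_bh(&m->lock);

	if (old)
		kfree_rcu(old, rcu);
	return 0;
}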
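
The libbpf netlink rework (struct libbpf_nla_req) is purely internal; the public TC API is unchanged. As a usage sketch, the attach path exercised by the refactored code might be driven from user space roughly as follows. attach_tc_ingress(), ifindex and prog_fd are placeholder names, and error handling is reduced to early returns.

/* Userspace sketch: attach an already-loaded program at TC ingress.
 * "ifindex" and "prog_fd" are assumed to come from the caller.
 */
#include <errno.h>
#include <bpf/libbpf.h>

static int attach_tc_ingress(int ifindex, int prog_fd)
{
	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook,
			    .ifindex = ifindex,
			    .attach_point = BPF_TC_INGRESS);
	DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts,
			    .handle = 1,
			    .priority = 1,
			    .prog_fd = prog_fd);
	int err;

	/* Creates the clsact qdisc; an already-existing one is fine. */
	err = bpf_tc_hook_create(&hook);
	if (err && err != -EEXIST)
		return err;

	/* Builds and sends the netlink request through the fixed-size
	 * struct libbpf_nla_req introduced above.
	 */
	return bpf_tc_attach(&hook, &opts);
}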