Merge branch 'bpf-prevent-userspace-memory-access'

Puranjay Mohan says:

====================
bpf: prevent userspace memory access

V5: https://lore.kernel.org/bpf/20240324185356.59111-1-puranjay12@gmail.com/
Changes in V6:
- Disable the verifier's instrumentation on x86-64 and update the JIT to
  take care of the vsyscall page in addition to userspace addresses.
- Update bpf_testmod to test for vsyscall addresses.

V4: https://lore.kernel.org/bpf/20240321124640.8870-1-puranjay12@gmail.com/
Changes in V5:
- Use TASK_SIZE_MAX + PAGE_SIZE and VSYSCALL_ADDR as the userspace boundaries
  in the x86-64 JIT.
- Added Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>

V3: https://lore.kernel.org/bpf/20240321120842.78983-1-puranjay12@gmail.com/
Changes in V4:
- Disable this feature on architectures that don't define
  CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE.
- By doing the above, we don't need anything explicitly for s390x.

V2: https://lore.kernel.org/bpf/20240321101058.68530-1-puranjay12@gmail.com/
Changes in V3:
- Return 0 from bpf_arch_uaddress_limit() in the disabled case because it
  returns u64.
- Modify the check in the verifier to not do instrumentation when
  uaddress_limit is 0.

V1: https://lore.kernel.org/bpf/20240320105436.4781-1-puranjay12@gmail.com/
Changes in V2:
- Disable this feature on s390x.

With BPF_PROBE_MEM, BPF allows dereferencing an untrusted pointer. To
thwart invalid memory accesses, the JITs add an exception table entry for
each such access. But if src_reg + offset is a userspace address, the BPF
program might read that memory if the user has mapped it. (A minimal example
of a program that ends up with such loads is sketched just before the diffs
below.)

The x86-64 JIT already instruments BPF_PROBE_MEM based loads with checks to
skip loads from userspace addresses, but it doesn't check for the vsyscall
page, which falls in the kernel address space but is considered a userspace
page. The second patch in this series fixes the x86-64 JIT to also skip
loads from the vsyscall page (a conceptual sketch of the combined check
follows the x86-64 JIT hunks below). The last patch updates bpf_testmod so
this address can be checked as part of the selftests.

Other architectures don't have the complexity of the vsyscall address and
just need to skip loads from userspace. To make this more scalable and
robust, the verifier is updated in the first patch to instrument
BPF_PROBE_MEM so that loads from userspace addresses are skipped (the
emitted sequence is sketched after the verifier hunk below).
====================

Link: https://lore.kernel.org/r/20240424100210.11982-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Commit a86538a2ef by Alexei Starovoitov, 2024-04-26 09:45:19 -07:00
5 changed files with 74 additions and 32 deletions
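
As a quick illustration before the diffs, here is a minimal sketch (not part
of this series; the attach point and program name are only illustrative) of
the kind of tracing program that ends up with BPF_PROBE_MEM loads: loads
through BTF-typed kernel pointers are converted by the verifier into
BPF_PROBE_MEM, which is exactly where the checks below apply.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("fentry/do_nanosleep")
    int BPF_PROG(probe_mem_example)
    {
            struct task_struct *task = bpf_get_current_task_btf();

            /* Each dereference below is emitted as a BPF_PROBE_MEM load: the
             * JIT backs it with an exception table entry and, with this
             * series, the load is skipped (dst is zeroed) if the address
             * turns out to be a user or vsyscall address.
             */
            return task->real_parent->tgid;
    }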

@@ -1807,36 +1807,41 @@ populate_extable:
 			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
 			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
 				/* Conservatively check that src_reg + insn->off is a kernel address:
-				 *   src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
-				 * src_reg is used as scratch for src_reg += insn->off and restored
-				 * after emit_ldx if necessary
+				 *   src_reg + insn->off > TASK_SIZE_MAX + PAGE_SIZE
+				 *   and
+				 *   src_reg + insn->off < VSYSCALL_ADDR
 				 */
-				u64 limit = TASK_SIZE_MAX + PAGE_SIZE;
+				u64 limit = TASK_SIZE_MAX + PAGE_SIZE - VSYSCALL_ADDR;
 				u8 *end_of_jmp;
 				/* At end of these emitted checks, insn->off will have been added
 				 * to src_reg, so no need to do relative load with insn->off offset
 				 */
 				insn_off = 0;
+				/* movabsq r10, VSYSCALL_ADDR */
+				emit_mov_imm64(&prog, BPF_REG_AX, (long)VSYSCALL_ADDR >> 32,
+					       (u32)(long)VSYSCALL_ADDR);
-				/* movabsq r11, limit */
-				EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
-				EMIT((u32)limit, 4);
-				EMIT(limit >> 32, 4);
+				/* mov src_reg, r11 */
+				EMIT_mov(AUX_REG, src_reg);
 				if (insn->off) {
-					/* add src_reg, insn->off */
-					maybe_emit_1mod(&prog, src_reg, true);
-					EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off);
+					/* add r11, insn->off */
+					maybe_emit_1mod(&prog, AUX_REG, true);
+					EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
 				}
-				/* cmp src_reg, r11 */
-				maybe_emit_mod(&prog, src_reg, AUX_REG, true);
-				EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+				/* sub r11, r10 */
+				maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true);
+				EMIT2(0x29, add_2reg(0xC0, AUX_REG, BPF_REG_AX));
-				/* if unsigned '>=', goto load */
-				EMIT2(X86_JAE, 0);
+				/* movabsq r10, limit */
+				emit_mov_imm64(&prog, BPF_REG_AX, (long)limit >> 32,
+					       (u32)(long)limit);
+				/* cmp r10, r11 */
+				maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true);
+				EMIT2(0x39, add_2reg(0xC0, AUX_REG, BPF_REG_AX));
+				/* if unsigned '>', goto load */
+				EMIT2(X86_JA, 0);
 				end_of_jmp = prog;
 				/* xor dst_reg, dst_reg */
@@ -1862,18 +1867,6 @@ populate_extable:
 				/* populate jmp_offset for JMP above */
 				start_of_ldx[-1] = prog - start_of_ldx;
-				if (insn->off && src_reg != dst_reg) {
-					/* sub src_reg, insn->off
-					 * Restore src_reg after "add src_reg, insn->off" in prev
-					 * if statement. But if src_reg == dst_reg, emit_ldx
-					 * above already clobbered src_reg, so no need to restore.
-					 * If add src_reg, insn->off was unnecessary, no need to
-					 * restore either.
-					 */
-					maybe_emit_1mod(&prog, src_reg, true);
-					EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off);
-				}
 				if (!bpf_prog->aux->extable)
 					break;
@@ -3473,3 +3466,9 @@ bool bpf_jit_supports_ptr_xchg(void)
 {
 	return true;
 }
+
+/* x86-64 JIT emits its own code to filter user addresses so return 0 here */
+u64 bpf_arch_uaddress_limit(void)
+{
+	return 0;
+}
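
The JIT hunks above fold the two bounds into a single unsigned comparison by
rebasing the address at VSYSCALL_ADDR. Conceptually, the emitted check boils
down to the following C sketch (same symbols as the kernel code; all
arithmetic is unsigned and wraps modulo 2^64):

    u64 addr = src_reg + insn_off;

    /* Load only from kernel addresses below the vsyscall page, i.e.
     *   TASK_SIZE_MAX + PAGE_SIZE < addr  &&  addr < VSYSCALL_ADDR.
     * Rebasing at VSYSCALL_ADDR turns both rejected ranges (userspace, and
     * the vsyscall page up to the top of the address space) into one
     * contiguous low interval, so a single comparison suffices.
     */
    if (addr - VSYSCALL_ADDR > TASK_SIZE_MAX + PAGE_SIZE - VSYSCALL_ADDR)
            dst_reg = *(u64 *)addr;   /* kernel address: real load, backed by the extable */
    else
            dst_reg = 0;              /* user or vsyscall address: load skipped, dst zeroed */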

@@ -963,6 +963,7 @@ bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
+u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);

@@ -2942,6 +2942,15 @@ bool __weak bpf_jit_supports_arena(void)
 	return false;
 }
 
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
+	return TASK_SIZE;
+#else
+	return 0;
+#endif
+}
+
 /* Return TRUE if the JIT backend satisfies the following two conditions:
  * 1) JIT backend supports atomic_xchg() on pointer-sized words.
  * 2) Under the specific arch, the implementation of xchg() is the same

@@ -19675,6 +19675,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
+		/* Make it impossible to de-reference a userspace address */
+		if (BPF_CLASS(insn->code) == BPF_LDX &&
+		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+			struct bpf_insn *patch = &insn_buf[0];
+			u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+			if (!uaddress_limit)
+				goto next_insn;
+
+			*patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+			if (insn->off)
+				*patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+			*patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+			*patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+			*patch++ = *insn;
+			*patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+			*patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+			cnt = patch - insn_buf;
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto next_insn;
+		}
+
 		/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
 		if (BPF_CLASS(insn->code) == BPF_LD &&
 		    (BPF_MODE(insn->code) == BPF_ABS ||
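
Read as C, the sequence patched in above around each BPF_PROBE_MEM /
BPF_PROBE_MEMSX load does roughly the following (a sketch, not kernel code;
insn_off stands for insn->off, and BPF_REG_AX is the verifier's scratch
register; the branch corresponds to the BPF_JLE/BPF_JA pair):

    u64 ax = src_reg + insn_off;                  /* BPF_MOV64_REG + optional BPF_ADD */

    if ((ax >> 32) <= (uaddress_limit >> 32))     /* BPF_RSH by 32 + BPF_JLE */
            dst_reg = 0;                          /* user address: load skipped, reads back as zero */
    else
            dst_reg = *(u64 *)(src_reg + insn_off);   /* the original PROBE_MEM load, at its original size */

The sequence is only emitted when bpf_arch_uaddress_limit() returns non-zero;
on x86-64 it returns 0 (see the JIT hunk above) because the JIT already emits
its own, more precise check.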

@@ -205,6 +205,9 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg)
 	case 5: return (void *)~(1ull << 30);	/* trigger extable */
 	case 6: return &f;			/* valid addr */
 	case 7: return (void *)((long)&f | 1);	/* kernel tricks */
+#ifdef CONFIG_X86_64
+	case 8: return (void *)VSYSCALL_ADDR;	/* vsyscall page address */
+#endif
 	default: return NULL;
 	}
 }