Merge branch 'bpf-to-bpf-function-calls'

Alexei Starovoitov says:

====================
First of all huge thank you to Daniel, John, Jakub, Edward and others who
reviewed multiple iterations of this patch set over the last many months
and to Dave and others who gave critical feedback during netconf/netdev.

The patch is solid enough and we thought through numerous corner cases,
but it's not the end. More followups with code reorg and features to follow.

TLDR: Allow arbitrary function calls from bpf function to another bpf function.

Since the beginning of bpf all bpf programs were represented as a single function
and program authors were forced to use always_inline for all functions
in their C code. That was causing llvm to unnecessarily inflate the code size
and forcing developers to move code to header files with little code reuse.

With a bit of additional complexity teach verifier to recognize
arbitrary function calls from one bpf function to another as long as
all of the functions are presented to the verifier as a single bpf program.
Extended program layout:
..
r1 = ..    // arg1
r2 = ..    // arg2
call pc+1  // function call pc-relative
exit
.. = r1    // access arg1
.. = r2    // access arg2
..
call pc+20 // second level of function call
...

It allows for better optimized code and finally makes it possible to introduce
the core bpf libraries that can be reused in different projects,
since programs are no longer limited by single elf file.
With function calls bpf can be compiled into multiple .o files.

This patch is the first step. It detects programs that contain
multiple functions and checks that calls between them are valid.
It splits the sequence of bpf instructions (one program) into a set
of bpf functions that call each other. Calls to only known
functions are allowed. Since all functions are presented to
the verifier at once conceptually it is 'static linking'.

Future plans:
- introduce BPF_PROG_TYPE_LIBRARY and allow a set of bpf functions
  to be loaded into the kernel that can be later linked to other
  programs with concrete program types. Aka 'dynamic linking'.

- introduce function pointer type and indirect calls to allow
  bpf functions to call other dynamically loaded bpf functions while
  the caller bpf function is already executing. Aka 'runtime linking'.
  This will be more generic and more flexible alternative
  to bpf_tail_calls.

FAQ:
Q: Do the interpreter and JIT changes mean that a new instruction is introduced?
A: No. The call instruction technically stays the same. Now it can call
   both kernel helpers and other bpf functions.
   Calling convention stays the same as well.
   From the uapi point of view the call insn got a new 'relocation' BPF_PSEUDO_CALL
   similar to BPF_PSEUDO_MAP_FD 'relocation' of bpf_ldimm64 insn.

Q: What had to change on LLVM side?
A: Trivial LLVM patch to allow calls was applied to upcoming 6.0 release:
   https://reviews.llvm.org/rL318614
   with a few bugfixes as well.
   Make sure to build the latest llvm to have bpf_call support.

More details in the patches.
====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
This commit is contained in:
Daniel Borkmann 2017-12-17 20:34:37 +01:00
commit ef9fde06a2
23 changed files with 4381 additions and 272 deletions

View file

@ -1824,7 +1824,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
/* If BPF JIT was not enabled then we must fall back to
* the interpreter.
*/
if (!bpf_jit_enable)
if (!prog->jit_requested)
return orig_prog;
/* If constant blinding was enabled and we failed during blinding

View file

@ -99,6 +99,20 @@ static inline void emit_a64_mov_i64(const int reg, const u64 val,
}
}
/* Load the 64-bit constant @val into @reg with a fixed-length sequence:
 * one MOVZ for bits 0..15 followed by a MOVK for each of the three
 * remaining 16-bit chunks (shifts 16, 32 and 48). Four instructions are
 * always emitted, whatever the value of @val is.
 */
static inline void emit_addr_mov_i64(const int reg, const u64 val,
				     struct jit_ctx *ctx)
{
	int lsl;

	emit(A64_MOVZ(1, reg, val & 0xffff, 0), ctx);
	for (lsl = 16; lsl <= 48; lsl += 16)
		emit(A64_MOVK(1, reg, (val >> lsl) & 0xffff, lsl), ctx);
}
static inline void emit_a64_mov_i(const int is64, const int reg,
const s32 val, struct jit_ctx *ctx)
{
@ -603,7 +617,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
const u8 r0 = bpf2a64[BPF_REG_0];
const u64 func = (u64)__bpf_call_base + imm;
emit_a64_mov_i64(tmp, func, ctx);
if (ctx->prog->is_func)
emit_addr_mov_i64(tmp, func, ctx);
else
emit_a64_mov_i64(tmp, func, ctx);
emit(A64_BLR(tmp), ctx);
emit(A64_MOV(1, r0, A64_R(0)), ctx);
break;
@ -835,16 +852,24 @@ static inline void bpf_flush_icache(void *start, void *end)
flush_icache_range((unsigned long)start, (unsigned long)end);
}
/* JIT state parked in prog->aux->jit_data by bpf_int_jit_compile() when a
 * bpf function (prog->is_func) has been through its first pass, and read
 * back on the following call so that the extra pass can skip context
 * initialisation and re-emit into the same image.
 */
struct arm64_jit_data {
/* binary header of the image produced by the first pass */
struct bpf_binary_header *header;
/* start of the JITed image (saved from image_ptr) */
u8 *image;
/* copy of the whole JIT context; a non-NULL ctx.offset marks saved state */
struct jit_ctx ctx;
};
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_prog *tmp, *orig_prog = prog;
struct bpf_binary_header *header;
struct arm64_jit_data *jit_data;
bool tmp_blinded = false;
bool extra_pass = false;
struct jit_ctx ctx;
int image_size;
u8 *image_ptr;
if (!bpf_jit_enable)
if (!prog->jit_requested)
return orig_prog;
tmp = bpf_jit_blind_constants(prog);
@ -858,13 +883,29 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
prog = tmp;
}
jit_data = prog->aux->jit_data;
if (!jit_data) {
jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
if (!jit_data) {
prog = orig_prog;
goto out;
}
prog->aux->jit_data = jit_data;
}
if (jit_data->ctx.offset) {
ctx = jit_data->ctx;
image_ptr = jit_data->image;
header = jit_data->header;
extra_pass = true;
goto skip_init_ctx;
}
memset(&ctx, 0, sizeof(ctx));
ctx.prog = prog;
ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
if (ctx.offset == NULL) {
prog = orig_prog;
goto out;
goto out_off;
}
/* 1. Initial fake pass to compute ctx->idx. */
@ -895,6 +936,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
/* 2. Now, the actual pass. */
ctx.image = (__le32 *)image_ptr;
skip_init_ctx:
ctx.idx = 0;
build_prologue(&ctx);
@ -920,13 +962,31 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
bpf_flush_icache(header, ctx.image + ctx.idx);
bpf_jit_binary_lock_ro(header);
if (!prog->is_func || extra_pass) {
if (extra_pass && ctx.idx != jit_data->ctx.idx) {
pr_err_once("multi-func JIT bug %d != %d\n",
ctx.idx, jit_data->ctx.idx);
bpf_jit_binary_free(header);
prog->bpf_func = NULL;
prog->jited = 0;
goto out_off;
}
bpf_jit_binary_lock_ro(header);
} else {
jit_data->ctx = ctx;
jit_data->image = image_ptr;
jit_data->header = header;
}
prog->bpf_func = (void *)ctx.image;
prog->jited = 1;
prog->jited_len = image_size;
if (!prog->is_func || extra_pass) {
out_off:
kfree(ctx.offset);
kfree(ctx.offset);
kfree(jit_data);
prog->aux->jit_data = NULL;
}
out:
if (tmp_blinded)
bpf_jit_prog_release_other(prog, prog == orig_prog ?

View file

@ -1869,7 +1869,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
unsigned int image_size;
u8 *image_ptr;
if (!bpf_jit_enable || !cpu_has_mips64r2)
if (!prog->jit_requested || !cpu_has_mips64r2)
return prog;
tmp = bpf_jit_blind_constants(prog);

View file

@ -993,7 +993,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
struct bpf_prog *tmp_fp;
bool bpf_blinded = false;
if (!bpf_jit_enable)
if (!fp->jit_requested)
return org_fp;
tmp_fp = bpf_jit_blind_constants(org_fp);

View file

@ -1300,7 +1300,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
struct bpf_jit jit;
int pass;
if (!bpf_jit_enable)
if (!fp->jit_requested)
return orig_fp;
tmp = bpf_jit_blind_constants(fp);

View file

@ -1517,7 +1517,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
u8 *image_ptr;
int pass;
if (!bpf_jit_enable)
if (!prog->jit_requested)
return orig_prog;
tmp = bpf_jit_blind_constants(prog);

View file

@ -1109,19 +1109,29 @@ xadd: if (is_imm8(insn->off))
return proglen;
}
/* JIT state parked in prog->aux->jit_data by bpf_int_jit_compile() when a
 * bpf function (prog->is_func) has been through its first pass, and read
 * back on the following call so that the extra pass can skip the addrs[]
 * set-up and continue from the previous result.
 */
struct x64_jit_data {
/* binary header of the image produced by the first pass */
struct bpf_binary_header *header;
/* saved addrs[] array; non-NULL means saved state is present */
int *addrs;
/* start of the JITed image */
u8 *image;
/* program length from the first pass; restored into oldproglen */
int proglen;
/* copy of the JIT context */
struct jit_context ctx;
};
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
struct x64_jit_data *jit_data;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
bool tmp_blinded = false;
bool extra_pass = false;
u8 *image = NULL;
int *addrs;
int pass;
int i;
if (!bpf_jit_enable)
if (!prog->jit_requested)
return orig_prog;
tmp = bpf_jit_blind_constants(prog);
@ -1135,10 +1145,28 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
prog = tmp;
}
jit_data = prog->aux->jit_data;
if (!jit_data) {
jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
if (!jit_data) {
prog = orig_prog;
goto out;
}
prog->aux->jit_data = jit_data;
}
addrs = jit_data->addrs;
if (addrs) {
ctx = jit_data->ctx;
oldproglen = jit_data->proglen;
image = jit_data->image;
header = jit_data->header;
extra_pass = true;
goto skip_init_addrs;
}
addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
if (!addrs) {
prog = orig_prog;
goto out;
goto out_addrs;
}
/* Before first pass, make a rough estimation of addrs[]
@ -1149,6 +1177,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
addrs[i] = proglen;
}
ctx.cleanup_addr = proglen;
skip_init_addrs:
/* JITed image shrinks with every pass and the loop iterates
* until the image stops shrinking. Very large bpf programs
@ -1189,7 +1218,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (image) {
bpf_flush_icache(header, image + proglen);
bpf_jit_binary_lock_ro(header);
if (!prog->is_func || extra_pass) {
bpf_jit_binary_lock_ro(header);
} else {
jit_data->addrs = addrs;
jit_data->ctx = ctx;
jit_data->proglen = proglen;
jit_data->image = image;
jit_data->header = header;
}
prog->bpf_func = (void *)image;
prog->jited = 1;
prog->jited_len = proglen;
@ -1197,8 +1234,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
prog = orig_prog;
}
if (!prog->is_func || extra_pass) {
out_addrs:
kfree(addrs);
kfree(addrs);
kfree(jit_data);
prog->aux->jit_data = NULL;
}
out:
if (tmp_blinded)
bpf_jit_prog_release_other(prog, prog == orig_prog ?

View file

@ -200,6 +200,9 @@ struct bpf_prog_aux {
u32 max_ctx_offset;
u32 stack_depth;
u32 id;
u32 func_cnt;
struct bpf_prog **func;
void *jit_data; /* JIT specific data. arch dependent */
struct latch_tree_node ksym_tnode;
struct list_head ksym_lnode;
const struct bpf_prog_ops *ops;
@ -402,6 +405,7 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
/* verify correctness of eBPF program */
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
/* Map specifics */
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);

View file

@ -76,6 +76,14 @@ struct bpf_reg_state {
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
/* Inside the callee two registers can be both PTR_TO_STACK like
* R1=fp-8 and R2=fp-8, but one of them points to this function stack
* while another to the caller's stack. To differentiate them 'frameno'
* is used which is an index in bpf_verifier_state->frame[] array
* pointing to bpf_func_state.
* This field must be second to last, for states_equal() reasons.
*/
u32 frameno;
/* This field must be last, for states_equal() reasons. */
enum bpf_reg_liveness live;
};
@ -83,7 +91,8 @@ struct bpf_reg_state {
enum bpf_stack_slot_type {
STACK_INVALID, /* nothing was stored in this stack slot */
STACK_SPILL, /* register spilled into stack */
STACK_MISC /* BPF program wrote some data into this slot */
STACK_MISC, /* BPF program wrote some data into this slot */
STACK_ZERO, /* BPF program wrote constant zero */
};
#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
@ -96,13 +105,34 @@ struct bpf_stack_state {
/* state of the program:
* type of all registers and stack info
*/
struct bpf_verifier_state {
struct bpf_func_state {
struct bpf_reg_state regs[MAX_BPF_REG];
struct bpf_verifier_state *parent;
/* index of call instruction that called into this func */
int callsite;
/* stack frame number of this function state from pov of
* enclosing bpf_verifier_state.
* 0 = main function, 1 = first callee.
*/
u32 frameno;
/* subprog number == index within subprog_stack_depth
* zero == main subprog
*/
u32 subprogno;
/* should be second to last. See copy_func_state() */
int allocated_stack;
struct bpf_stack_state *stack;
};
#define MAX_CALL_FRAMES 8
/* Verifier state for one point in the program: a stack of per-function
 * states, one entry per active call frame.
 */
struct bpf_verifier_state {
/* call stack tracking */
/* indexed by frame number; at most MAX_CALL_FRAMES entries */
struct bpf_func_state *frame[MAX_CALL_FRAMES];
/* NOTE(review): presumably the state this one was derived from - the
 * users of this field are outside this view, confirm before relying on it
 */
struct bpf_verifier_state *parent;
/* index into frame[] of the function currently being verified;
 * cur_regs() reads frame[curframe]->regs
 */
u32 curframe;
};
/* linked list of verifier states used to prune search */
struct bpf_verifier_state_list {
struct bpf_verifier_state state;
@ -113,6 +143,7 @@ struct bpf_insn_aux_data {
union {
enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */
s32 call_imm; /* saved imm field of call insn */
};
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
bool seen; /* this insn was processed by the verifier */
@ -141,6 +172,8 @@ struct bpf_ext_analyzer_ops {
int insn_idx, int prev_insn_idx);
};
#define BPF_MAX_SUBPROGS 256
/* single container for all structs
* one verifier_env per bpf_check() call
*/
@ -159,13 +192,17 @@ struct bpf_verifier_env {
bool allow_ptr_leaks;
bool seen_direct_write;
struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
struct bpf_verifer_log log;
u32 subprog_starts[BPF_MAX_SUBPROGS];
u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
u32 subprog_cnt;
};
static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
{
return env->cur_state->regs;
struct bpf_verifier_state *cur = env->cur_state;
return cur->frame[cur->curframe]->regs;
}
#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)

View file

@ -58,6 +58,9 @@ struct bpf_prog_aux;
/* unused opcode to mark special call to bpf_tail_call() helper */
#define BPF_TAIL_CALL 0xf0
/* unused opcode to mark call to interpreter with arguments */
#define BPF_CALL_ARGS 0xe0
/* As per nm, we expose JITed images as text (code) section for
* kallsyms. That way, tools like perf can find it to match
* addresses.
@ -455,10 +458,13 @@ struct bpf_binary_header {
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
locked:1, /* Program image locked? */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1; /* Do we override a kprobe? */
enum bpf_prog_type type; /* Type of BPF program */
u32 len; /* Number of filter blocks */
@ -710,6 +716,9 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
#define __bpf_call_base_args \
((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
__bpf_call_base)
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
@ -798,7 +807,7 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
return fp->jited && bpf_jit_is_ebpf();
}
static inline bool bpf_jit_blinding_enabled(void)
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
/* These are the prerequisites, should someone ever have the
* idea to call blinding outside of them, we make sure to
@ -806,7 +815,7 @@ static inline bool bpf_jit_blinding_enabled(void)
*/
if (!bpf_jit_is_ebpf())
return false;
if (!bpf_jit_enable)
if (!prog->jit_requested)
return false;
if (!bpf_jit_harden)
return false;

View file

@ -197,8 +197,14 @@ enum bpf_attach_type {
*/
#define BPF_F_STRICT_ALIGNMENT (1U << 0)
/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
#define BPF_PSEUDO_MAP_FD 1
/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
* offset to another bpf function
*/
#define BPF_PSEUDO_CALL 1
/* flags for BPF_MAP_UPDATE_ELEM command */
#define BPF_ANY 0 /* create new element or update existing */
#define BPF_NOEXIST 1 /* create new element if it didn't exist */

View file

@ -94,6 +94,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
@ -217,30 +218,40 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
{
return BPF_CLASS(insn->code) == BPF_JMP &&
/* Call and Exit are both special jumps with no
* target inside the BPF instruction image.
*/
BPF_OP(insn->code) != BPF_CALL &&
BPF_OP(insn->code) != BPF_EXIT;
}
static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
{
struct bpf_insn *insn = prog->insnsi;
u32 i, insn_cnt = prog->len;
bool pseudo_call;
u8 code;
int off;
for (i = 0; i < insn_cnt; i++, insn++) {
if (!bpf_is_jmp_and_has_target(insn))
code = insn->code;
if (BPF_CLASS(code) != BPF_JMP)
continue;
if (BPF_OP(code) == BPF_EXIT)
continue;
if (BPF_OP(code) == BPF_CALL) {
if (insn->src_reg == BPF_PSEUDO_CALL)
pseudo_call = true;
else
continue;
} else {
pseudo_call = false;
}
off = pseudo_call ? insn->imm : insn->off;
/* Adjust offset of jmps if we cross boundaries. */
if (i < pos && i + insn->off + 1 > pos)
insn->off += delta;
else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
insn->off -= delta;
if (i < pos && i + off + 1 > pos)
off += delta;
else if (i > pos + delta && i + off + 1 <= pos + delta)
off -= delta;
if (pseudo_call)
insn->imm = off;
else
insn->off = off;
}
}
@ -711,7 +722,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
struct bpf_insn *insn;
int i, rewritten;
if (!bpf_jit_blinding_enabled())
if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
return prog;
clone = bpf_prog_clone_create(prog, GFP_USER);
@ -753,6 +764,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
i += insn_delta;
}
clone->blinded = 1;
return clone;
}
#endif /* CONFIG_BPF_JIT */
@ -774,8 +786,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
*
* Decode and execute eBPF instructions.
*/
static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
u64 *stack)
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
u64 tmp;
static const void *jumptable[256] = {
@ -835,6 +846,7 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
@ -1025,6 +1037,13 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
BPF_R4, BPF_R5);
CONT;
JMP_CALL_ARGS:
BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
BPF_R3, BPF_R4,
BPF_R5,
insn + insn->off + 1);
CONT;
JMP_TAIL_CALL: {
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
@ -1297,6 +1316,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
return ___bpf_prog_run(regs, insn, stack); \
}
/* Name of the argument-passing interpreter variant generated for a given
 * stack size, e.g. __bpf_prog_run_args32.
 */
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
/* Define one interpreter entry point that receives the five call arguments
 * r1..r5 directly, plus the first instruction to execute. The generated
 * function declares a stack of stack_size bytes and a register array as
 * locals, points FP one past the end of that stack, copies r1..r5 into
 * BPF_R1..BPF_R5 and returns the u64 result of ___bpf_prog_run().
 * NOTE(review): FP and BPF_Rn are defined outside this view; presumably
 * they are accessors into regs[] - confirm.
 */
#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
u64 regs[MAX_BPF_REG]; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
BPF_R1 = r1; \
BPF_R2 = r2; \
BPF_R3 = r3; \
BPF_R4 = r4; \
BPF_R5 = r5; \
return ___bpf_prog_run(regs, insn, stack); \
}
#define EVAL1(FN, X) FN(X)
#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
@ -1308,6 +1344,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);
#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
static unsigned int (*interpreters[])(const void *ctx,
@ -1316,6 +1356,24 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
/* Rewrite a bpf-to-bpf call instruction so the interpreter can execute it.
 *
 * @insn:        call instruction to patch in place
 * @stack_depth: stack size in bytes needed by the callee; 0 is treated as 1
 *
 * The original pc-relative target is moved from ->imm to ->off (truncated
 * to s16), ->imm is replaced by the distance from __bpf_call_base_args to
 * the interpreters_args[] entry whose stack size is @stack_depth rounded up
 * to a multiple of 32, and the opcode becomes BPF_JMP | BPF_CALL_ARGS.
 */
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
	u32 depth = max_t(u32, stack_depth, 1);
	u32 slot = round_up(depth, 32) / 32 - 1;

	insn->off = (s16) insn->imm;
	insn->imm = interpreters_args[slot] - __bpf_call_base_args;
	insn->code = BPF_JMP | BPF_CALL_ARGS;
}
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
@ -1572,11 +1630,19 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
int i;
aux = container_of(work, struct bpf_prog_aux, work);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_offload_destroy(aux->prog);
bpf_jit_free(aux->prog);
for (i = 0; i < aux->func_cnt; i++)
bpf_jit_free(aux->func[i]);
if (aux->func_cnt) {
kfree(aux->func);
bpf_prog_unlock_free(aux->prog);
} else {
bpf_jit_free(aux->prog);
}
}
/* Free internal BPF program */

View file

@ -189,8 +189,12 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
verbose(env, "(%02x) call %s#%d\n", insn->code,
func_id_name(insn->imm), insn->imm);
if (insn->src_reg == BPF_PSEUDO_CALL)
verbose(env, "(%02x) call pc%+d\n", insn->code,
insn->imm);
else
verbose(env, "(%02x) call %s#%d\n", insn->code,
func_id_name(insn->imm), insn->imm);
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(env, "(%02x) goto pc%+d\n",
insn->code, insn->off);

View file

@ -1194,7 +1194,8 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_used_maps;
/* eBPF program is ready to be JITed */
prog = bpf_prog_select_runtime(prog, &err);
if (!prog->bpf_func)
prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;

File diff suppressed because it is too large Load diff

View file

@ -197,8 +197,14 @@ enum bpf_attach_type {
*/
#define BPF_F_STRICT_ALIGNMENT (1U << 0)
/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
#define BPF_PSEUDO_MAP_FD 1
/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
* offset to another bpf function
*/
#define BPF_PSEUDO_CALL 1
/* flags for BPF_MAP_UPDATE_ELEM command */
#define BPF_ANY 0 /* create new element or update existing */
#define BPF_NOEXIST 1 /* create new element if it didn't exist */

View file

@ -40,7 +40,7 @@ int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name,
__u32 map_flags);
/* Recommend log buffer size */
#define BPF_LOG_BUF_SIZE 65536
#define BPF_LOG_BUF_SIZE (256 * 1024)
int bpf_load_program_name(enum bpf_prog_type type, const char *name,
const struct bpf_insn *insns,
size_t insns_cnt, const char *license,

View file

@ -174,12 +174,19 @@ struct bpf_program {
char *name;
char *section_name;
struct bpf_insn *insns;
size_t insns_cnt;
size_t insns_cnt, main_prog_cnt;
enum bpf_prog_type type;
struct {
struct reloc_desc {
enum {
RELO_LD64,
RELO_CALL,
} type;
int insn_idx;
int map_idx;
union {
int map_idx;
int text_off;
};
} *reloc_desc;
int nr_reloc;
@ -234,6 +241,7 @@ struct bpf_object {
} *reloc;
int nr_reloc;
int maps_shndx;
int text_shndx;
} efile;
/*
* All loaded bpf_object is linked in a list, which is
@ -375,9 +383,13 @@ bpf_object__init_prog_names(struct bpf_object *obj)
size_t pi, si;
for (pi = 0; pi < obj->nr_programs; pi++) {
char *name = NULL;
const char *name = NULL;
prog = &obj->programs[pi];
if (prog->idx == obj->efile.text_shndx) {
name = ".text";
goto skip_search;
}
for (si = 0; si < symbols->d_size / sizeof(GElf_Sym) && !name;
si++) {
@ -405,7 +417,7 @@ bpf_object__init_prog_names(struct bpf_object *obj)
prog->section_name);
return -EINVAL;
}
skip_search:
prog->name = strdup(name);
if (!prog->name) {
pr_warning("failed to allocate memory for prog sym %s\n",
@ -795,6 +807,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
} else if ((sh.sh_type == SHT_PROGBITS) &&
(sh.sh_flags & SHF_EXECINSTR) &&
(data->d_size > 0)) {
if (strcmp(name, ".text") == 0)
obj->efile.text_shndx = idx;
err = bpf_object__add_program(obj, data->d_buf,
data->d_size, name, idx);
if (err) {
@ -856,11 +870,14 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx)
}
static int
bpf_program__collect_reloc(struct bpf_program *prog,
size_t nr_maps, GElf_Shdr *shdr,
Elf_Data *data, Elf_Data *symbols,
int maps_shndx, struct bpf_map *maps)
bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr,
Elf_Data *data, struct bpf_object *obj)
{
Elf_Data *symbols = obj->efile.symbols;
int text_shndx = obj->efile.text_shndx;
int maps_shndx = obj->efile.maps_shndx;
struct bpf_map *maps = obj->maps;
size_t nr_maps = obj->nr_maps;
int i, nrels;
pr_debug("collecting relocating info for: '%s'\n",
@ -893,8 +910,10 @@ bpf_program__collect_reloc(struct bpf_program *prog,
GELF_R_SYM(rel.r_info));
return -LIBBPF_ERRNO__FORMAT;
}
pr_debug("relo for %ld value %ld name %d\n",
rel.r_info >> 32, sym.st_value, sym.st_name);
if (sym.st_shndx != maps_shndx) {
if (sym.st_shndx != maps_shndx && sym.st_shndx != text_shndx) {
pr_warning("Program '%s' contains non-map related relo data pointing to section %u\n",
prog->section_name, sym.st_shndx);
return -LIBBPF_ERRNO__RELOC;
@ -903,6 +922,17 @@ bpf_program__collect_reloc(struct bpf_program *prog,
insn_idx = rel.r_offset / sizeof(struct bpf_insn);
pr_debug("relocation: insn_idx=%u\n", insn_idx);
if (insns[insn_idx].code == (BPF_JMP | BPF_CALL)) {
if (insns[insn_idx].src_reg != BPF_PSEUDO_CALL) {
pr_warning("incorrect bpf_call opcode\n");
return -LIBBPF_ERRNO__RELOC;
}
prog->reloc_desc[i].type = RELO_CALL;
prog->reloc_desc[i].insn_idx = insn_idx;
prog->reloc_desc[i].text_off = sym.st_value;
continue;
}
if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
pr_warning("bpf: relocation: invalid relo for insns[%d].code 0x%x\n",
insn_idx, insns[insn_idx].code);
@ -924,6 +954,7 @@ bpf_program__collect_reloc(struct bpf_program *prog,
return -LIBBPF_ERRNO__RELOC;
}
prog->reloc_desc[i].type = RELO_LD64;
prog->reloc_desc[i].insn_idx = insn_idx;
prog->reloc_desc[i].map_idx = map_idx;
}
@ -962,28 +993,77 @@ bpf_object__create_maps(struct bpf_object *obj)
return 0;
}
/*
 * Apply one RELO_CALL relocation to @prog.
 *
 * On the first call relocation of a program the instructions of the
 * object's .text section are appended to prog->insns, and the original
 * instruction count is remembered in prog->main_prog_cnt. The call
 * instruction's imm is then adjusted by the distance from the call site
 * to the start of the appended .text copy.
 *
 * Returns 0 on success, -LIBBPF_ERRNO__RELOC for a relocation that is not
 * a call, lives in .text itself, has no .text to point at or indexes
 * outside the program, and -ENOMEM when the instruction array cannot grow.
 */
static int
bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj,
			struct reloc_desc *relo)
{
	struct bpf_insn *insn, *new_insn;
	struct bpf_program *text;
	size_t new_cnt;

	if (relo->type != RELO_CALL)
		return -LIBBPF_ERRNO__RELOC;

	if (prog->idx == obj->efile.text_shndx) {
		pr_warning("relo in .text insn %d into off %d\n",
			   relo->insn_idx, relo->text_off);
		return -LIBBPF_ERRNO__RELOC;
	}

	if (prog->main_prog_cnt == 0) {
		text = bpf_object__find_prog_by_idx(obj, obj->efile.text_shndx);
		if (!text) {
			pr_warning("no .text section found yet relo into text exist\n");
			return -LIBBPF_ERRNO__RELOC;
		}
		new_cnt = prog->insns_cnt + text->insns_cnt;
		/* keep prog->insns valid if realloc() fails */
		new_insn = realloc(prog->insns, new_cnt * sizeof(*insn));
		if (!new_insn) {
			pr_warning("oom in prog realloc\n");
			return -ENOMEM;
		}
		memcpy(new_insn + prog->insns_cnt, text->insns,
		       text->insns_cnt * sizeof(*insn));
		prog->insns = new_insn;
		prog->main_prog_cnt = prog->insns_cnt;
		prog->insns_cnt = new_cnt;
		/*
		 * 'text' is only looked up on this path, so the message has
		 * to be printed here; on later relocations of the same
		 * program the pointer would be uninitialised.
		 */
		pr_debug("added %zd insn from %s to prog %s\n",
			 text->insns_cnt, text->section_name,
			 prog->section_name);
	}

	/* the call site must be one of the program's own instructions */
	if (relo->insn_idx < 0 ||
	    relo->insn_idx >= (int)prog->main_prog_cnt) {
		pr_warning("relocation out of range: '%s'\n",
			   prog->section_name);
		return -LIBBPF_ERRNO__RELOC;
	}

	insn = &prog->insns[relo->insn_idx];
	insn->imm += prog->main_prog_cnt - relo->insn_idx;
	return 0;
}
static int
bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj)
{
int i;
int i, err;
if (!prog || !prog->reloc_desc)
return 0;
for (i = 0; i < prog->nr_reloc; i++) {
int insn_idx, map_idx;
struct bpf_insn *insns = prog->insns;
if (prog->reloc_desc[i].type == RELO_LD64) {
struct bpf_insn *insns = prog->insns;
int insn_idx, map_idx;
insn_idx = prog->reloc_desc[i].insn_idx;
map_idx = prog->reloc_desc[i].map_idx;
insn_idx = prog->reloc_desc[i].insn_idx;
map_idx = prog->reloc_desc[i].map_idx;
if (insn_idx >= (int)prog->insns_cnt) {
pr_warning("relocation out of range: '%s'\n",
prog->section_name);
return -LIBBPF_ERRNO__RELOC;
if (insn_idx >= (int)prog->insns_cnt) {
pr_warning("relocation out of range: '%s'\n",
prog->section_name);
return -LIBBPF_ERRNO__RELOC;
}
insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
insns[insn_idx].imm = obj->maps[map_idx].fd;
} else {
err = bpf_program__reloc_text(prog, obj,
&prog->reloc_desc[i]);
if (err)
return err;
}
insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
insns[insn_idx].imm = obj->maps[map_idx].fd;
}
zfree(&prog->reloc_desc);
@ -1026,7 +1106,6 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
Elf_Data *data = obj->efile.reloc[i].data;
int idx = shdr->sh_info;
struct bpf_program *prog;
size_t nr_maps = obj->nr_maps;
if (shdr->sh_type != SHT_REL) {
pr_warning("internal error at %d\n", __LINE__);
@ -1040,11 +1119,9 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
return -LIBBPF_ERRNO__RELOC;
}
err = bpf_program__collect_reloc(prog, nr_maps,
err = bpf_program__collect_reloc(prog,
shdr, data,
obj->efile.symbols,
obj->efile.maps_shndx,
obj->maps);
obj);
if (err)
return err;
}
@ -1197,6 +1274,8 @@ bpf_object__load_progs(struct bpf_object *obj)
int err;
for (i = 0; i < obj->nr_programs; i++) {
if (obj->programs[i].idx == obj->efile.text_shndx)
continue;
err = bpf_program__load(&obj->programs[i],
obj->license,
obj->kern_version);
@ -1859,7 +1938,7 @@ long libbpf_get_error(const void *ptr)
int bpf_prog_load(const char *file, enum bpf_prog_type type,
struct bpf_object **pobj, int *prog_fd)
{
struct bpf_program *prog;
struct bpf_program *prog, *first_prog = NULL;
struct bpf_object *obj;
int err;
@ -1867,25 +1946,30 @@ int bpf_prog_load(const char *file, enum bpf_prog_type type,
if (IS_ERR(obj))
return -ENOENT;
prog = bpf_program__next(NULL, obj);
if (!prog) {
bpf_object__for_each_program(prog, obj) {
/*
* If type is not specified, try to guess it based on
* section name.
*/
if (type == BPF_PROG_TYPE_UNSPEC) {
type = bpf_program__guess_type(prog);
if (type == BPF_PROG_TYPE_UNSPEC) {
bpf_object__close(obj);
return -EINVAL;
}
}
bpf_program__set_type(prog, type);
if (prog->idx != obj->efile.text_shndx && !first_prog)
first_prog = prog;
}
if (!first_prog) {
pr_warning("object file doesn't contain bpf program\n");
bpf_object__close(obj);
return -ENOENT;
}
/*
* If type is not specified, try to guess it based on
* section name.
*/
if (type == BPF_PROG_TYPE_UNSPEC) {
type = bpf_program__guess_type(prog);
if (type == BPF_PROG_TYPE_UNSPEC) {
bpf_object__close(obj);
return -EINVAL;
}
}
bpf_program__set_type(prog, type);
err = bpf_object__load(obj);
if (err) {
bpf_object__close(obj);
@ -1893,6 +1977,6 @@ int bpf_prog_load(const char *file, enum bpf_prog_type type,
}
*pobj = obj;
*prog_fd = bpf_program__fd(prog);
*prog_fd = bpf_program__fd(first_prog);
return 0;
}

View file

@ -17,7 +17,8 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \
sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o
sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
test_l4lb_noinline.o test_xdp_noinline.o
TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
test_offload.py
@ -49,8 +50,13 @@ else
CPU ?= generic
endif
CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
-Wno-compare-distinct-pointer-types
$(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline
$(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline
%.o: %.c
$(CLANG) -I. -I./include/uapi -I../../../include/uapi \
-Wno-compare-distinct-pointer-types \
$(CLANG) $(CLANG_FLAGS) \
-O2 -target bpf -emit-llvm -c $< -o - | \
$(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@

View file

@ -0,0 +1,473 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2017 Facebook
#include <stddef.h>
#include <stdbool.h>
#include <string.h>
#include <linux/pkt_cls.h>
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include "bpf_helpers.h"
#include "test_iptunnel_common.h"
#include "bpf_endian.h"
/* Integer emitted into the "version" section of the object via SEC(). */
int _version SEC("version") = 1;
/* Rotate the 32-bit value @word left by @shift bit positions. */
static __u32 rol32(__u32 word, unsigned int shift)
{
	/* Masking the negated count keeps the right-shift amount within 0..31. */
	unsigned int back = (-shift) & 31;
	__u32 upper = word << shift;
	__u32 lower = word >> back;

	return upper | lower;
}
/* Copy of jhash from the kernel sources, kept here to make sure llvm
 * can compile it into a valid sequence of bpf instructions.
 *
 * __jhash_mix() mixes the three 32-bit lanes a, b and c in place using
 * subtract / xor-with-rotate / add rounds. All three arguments must be
 * modifiable lvalues; each one is evaluated several times.
 */
#define __jhash_mix(a, b, c) \
{ \
a -= c; a ^= rol32(c, 4); c += b; \
b -= a; b ^= rol32(a, 6); a += c; \
c -= b; c ^= rol32(b, 8); b += a; \
a -= c; a ^= rol32(c, 16); c += b; \
b -= a; b ^= rol32(a, 19); a += c; \
c -= b; c ^= rol32(b, 4); b += a; \
}
/* Final scrambling of the three 32-bit lanes a, b and c, done in place with
 * xor / subtract-rotate rounds. Callers in this file read the result from c.
 * All three arguments must be modifiable lvalues.
 */
#define __jhash_final(a, b, c) \
{ \
c ^= b; c -= rol32(b, 14); \
a ^= c; a -= rol32(c, 11); \
b ^= a; b -= rol32(a, 25); \
c ^= b; c -= rol32(b, 16); \
a ^= c; a -= rol32(c, 4); \
b ^= a; b -= rol32(a, 14); \
c ^= b; c -= rol32(b, 24); \
}
/* Constant added into the initial value of every hash lane. */
#define JHASH_INITVAL 0xdeadbeef
/* Short alias used by the hash helpers below. */
typedef unsigned int u32;
/*
 * Hash @length bytes starting at @key, seeded with @initval, and return
 * the 32-bit result.
 *
 * While more than 12 bytes remain, three u32 words are added into the
 * lanes and mixed with __jhash_mix(). The remaining bytes (at most 12) are
 * folded in byte by byte in the switch and finished with __jhash_final().
 * If @length is 0 at the switch, the final mix is skipped.
 */
static u32 jhash(const void *key, u32 length, u32 initval)
{
u32 a, b, c;
const unsigned char *k = key;
/* All three lanes start from the same seed value. */
a = b = c = JHASH_INITVAL + length + initval;
while (length > 12) {
a += *(u32 *)(k);
b += *(u32 *)(k + 4);
c += *(u32 *)(k + 8);
__jhash_mix(a, b, c);
length -= 12;
k += 12;
}
/* Every case deliberately falls through into the cases below it. */
switch (length) {
case 12: c += (u32)k[11]<<24;
case 11: c += (u32)k[10]<<16;
case 10: c += (u32)k[9]<<8;
case 9: c += k[8];
case 8: b += (u32)k[7]<<24;
case 7: b += (u32)k[6]<<16;
case 6: b += (u32)k[5]<<8;
case 5: b += k[4];
case 4: a += (u32)k[3]<<24;
case 3: a += (u32)k[2]<<16;
case 2: a += (u32)k[1]<<8;
case 1: a += k[0];
__jhash_final(a, b, c);
case 0: /* Nothing left to add */
break;
}
return c;
}
/*
 * Hash three 32-bit words: add @initval to each of them, run the final
 * mix and return the resulting third lane.
 */
static u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
	u32 x = a + initval;
	u32 y = b + initval;
	u32 z = c + initval;

	__jhash_final(x, y, z);
	return z;
}
/*
 * Hash two 32-bit words. The third lane is zero and the seed is
 * @initval + JHASH_INITVAL + (2 << 2).
 */
static u32 jhash_2words(u32 a, u32 b, u32 initval)
{
	u32 seed = initval + JHASH_INITVAL + (2 << 2);

	return __jhash_nwords(a, b, 0, seed);
}
/* Mask (0xFF3F) applied to iph->frag_off; a non-zero result rejects the packet. */
#define PCKT_FRAGMENTED 65343
/* Offset advance past an IPv4 header; only used after checking ihl == 5. */
#define IPV4_HDR_LEN_NO_OPT 20
/* Extra offset added after an ICMP / ICMPv6 header has been parsed. */
#define IPV4_PLUS_ICMP_HDR 28
#define IPV6_PLUS_ICMP_HDR 48
/* Number of ch_rings slots per VIP; also the modulus applied to the hash. */
#define RING_SIZE 2
/* Map capacities. */
#define MAX_VIPS 12
#define MAX_REALS 5
#define CTL_MAP_SIZE 16
#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
/* Tested against real_definition.flags. */
#define F_IPV6 (1 << 0)
/* Tested against vip_meta.flags. */
#define F_HASH_NO_SRC_PORT (1 << 0)
/* Bits kept in packet_description.flags. */
#define F_ICMP (1 << 0)
#define F_SYN_SET (1 << 1)
/*
 * Addresses, ports, protocol and flags extracted from one packet.
 * The IPv4 and IPv6 address forms share storage through the unions.
 */
struct packet_description {
union {
__be32 src;
__be32 srcv6[4];
};
union {
__be32 dst;
__be32 dstv6[4];
};
/* Both ports as one 32-bit word or as two 16-bit halves. */
union {
__u32 ports;
__u16 port16[2];
};
__u8 proto;
/* F_ICMP / F_SYN_SET bits. */
__u8 flags;
};
/* Value type of ctl_array; the three members alias the same storage. */
struct ctl_value {
union {
__u64 value;
__u32 ifindex;
__u8 mac[6];
};
};
/* Value type of vip_map: per-VIP flag bits and the VIP's index. */
struct vip_meta {
__u32 flags;
__u32 vip_num;
};
/* Value type of the reals map: a destination address (v4 or v6) and flags. */
struct real_definition {
union {
__be32 dst;
__be32 dstv6[4];
};
/* F_IPV6 is tested on this field. */
__u8 flags;
};
/* Value type of the stats map: byte and packet counters. */
struct vip_stats {
__u64 bytes;
__u64 pkts;
};
/* Local view of the link-layer header at the start of the packet. */
struct eth_hdr {
unsigned char eth_dest[ETH_ALEN];
unsigned char eth_source[ETH_ALEN];
unsigned short eth_proto;
};
/*
 * Hash map from struct vip to struct vip_meta, MAX_VIPS entries.
 * struct vip is not defined in this file (presumably it comes from
 * test_iptunnel_common.h; confirm).
 */
struct bpf_map_def SEC("maps") vip_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct vip),
.value_size = sizeof(struct vip_meta),
.max_entries = MAX_VIPS,
};
/*
 * Array map of __u32 values with CH_RINGS_SIZE entries. get_packet_dst()
 * uses the looked-up value as the key into the reals map.
 */
struct bpf_map_def SEC("maps") ch_rings = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = CH_RINGS_SIZE,
};
/* Array map of struct real_definition with MAX_REALS entries. */
struct bpf_map_def SEC("maps") reals = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct real_definition),
.max_entries = MAX_REALS,
};
/* Per-CPU array of struct vip_stats; process_packet() indexes it by vip_num. */
struct bpf_map_def SEC("maps") stats = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct vip_stats),
.max_entries = MAX_VIPS,
};
/*
 * Array map of struct ctl_value with CTL_MAP_SIZE entries. process_packet()
 * reads index 1 or 2 to obtain the ifindex passed to bpf_redirect().
 */
struct bpf_map_def SEC("maps") ctl_array = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct ctl_value),
.max_entries = CTL_MAP_SIZE,
};
/*
 * Hash the source address and the ports of @pckt.
 * For IPv6 the 16-byte source address is first reduced with jhash().
 */
static __u32 get_packet_hash(struct packet_description *pckt,
			     bool ipv6)
{
	__u32 addr_word;

	if (!ipv6)
		return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE);

	addr_word = jhash(pckt->srcv6, 16, MAX_VIPS);
	return jhash_2words(addr_word, pckt->ports, CH_RINGS_SIZE);
}
/*
 * Pick the real server for @pckt and store it in *@real.
 *
 * Returns false when the hash is not one of the two values the test
 * expects, or when either map lookup fails; returns true otherwise.
 */
static bool get_packet_dst(struct real_definition **real,
			   struct packet_description *pckt,
			   struct vip_meta *vip_info,
			   bool is_ipv6)
{
	__u32 hash = get_packet_hash(pckt, is_ipv6);
	__u32 slot = RING_SIZE * vip_info->vip_num + hash % RING_SIZE;
	__u32 *ring_entry;

	/* Only the jhash of the test's ipv4 or ipv6 packet is accepted. */
	if (hash != 0x358459b7 && hash != 0x2f4bc6bb)
		return false;

	ring_entry = bpf_map_lookup_elem(&ch_rings, &slot);
	if (!ring_entry)
		return false;

	/* The ring entry is the index into the reals array. */
	slot = *ring_entry;
	*real = bpf_map_lookup_elem(&reals, &slot);
	return *real ? true : false;
}
/*
 * Parse the ICMPv6 header at @data + @off and the IPv6 header following it.
 *
 * Returns TC_ACT_SHOT when a header does not fit before @data_end and
 * TC_ACT_OK when the type is not ICMPV6_PKT_TOOBIG. Otherwise it fills
 * @pckt from the embedded header and returns TC_ACT_UNSPEC.
 */
static int parse_icmpv6(void *data, void *data_end, __u64 off,
struct packet_description *pckt)
{
struct icmp6hdr *icmp_hdr;
struct ipv6hdr *ip6h;
icmp_hdr = data + off;
if (icmp_hdr + 1 > data_end)
return TC_ACT_SHOT;
if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
return TC_ACT_OK;
off += sizeof(struct icmp6hdr);
ip6h = data + off;
if (ip6h + 1 > data_end)
return TC_ACT_SHOT;
pckt->proto = ip6h->nexthdr;
pckt->flags |= F_ICMP;
/* Source and destination are swapped relative to the embedded header. */
memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16);
return TC_ACT_UNSPEC;
}
/*
 * Parse the ICMP header at @data + @off and the IPv4 header following it.
 *
 * Returns TC_ACT_SHOT when a header does not fit before @data_end or the
 * embedded header has ihl != 5, and TC_ACT_OK when the message is not
 * ICMP_DEST_UNREACH / ICMP_FRAG_NEEDED. Otherwise it fills @pckt from the
 * embedded header and returns TC_ACT_UNSPEC.
 */
static int parse_icmp(void *data, void *data_end, __u64 off,
struct packet_description *pckt)
{
struct icmphdr *icmp_hdr;
struct iphdr *iph;
icmp_hdr = data + off;
if (icmp_hdr + 1 > data_end)
return TC_ACT_SHOT;
if (icmp_hdr->type != ICMP_DEST_UNREACH ||
icmp_hdr->code != ICMP_FRAG_NEEDED)
return TC_ACT_OK;
off += sizeof(struct icmphdr);
iph = data + off;
if (iph + 1 > data_end)
return TC_ACT_SHOT;
if (iph->ihl != 5)
return TC_ACT_SHOT;
pckt->proto = iph->protocol;
pckt->flags |= F_ICMP;
/* Source and destination are swapped relative to the embedded header. */
pckt->src = iph->daddr;
pckt->dst = iph->saddr;
return TC_ACT_UNSPEC;
}
/*
 * Read the UDP ports at @data + @off into @pckt.
 * Returns false when the header does not fit before @data_end.
 */
static bool parse_udp(void *data, __u64 off, void *data_end,
		      struct packet_description *pckt)
{
	struct udphdr *hdr = data + off;

	if (hdr + 1 > data_end)
		return false;

	if (pckt->flags & F_ICMP) {
		/* Header embedded in an ICMP message: ports are stored swapped. */
		pckt->port16[0] = hdr->dest;
		pckt->port16[1] = hdr->source;
	} else {
		pckt->port16[0] = hdr->source;
		pckt->port16[1] = hdr->dest;
	}
	return true;
}
/*
 * Read the TCP ports at @data + @off into @pckt and record F_SYN_SET when
 * the syn bit is set. Returns false when the header does not fit before
 * @data_end.
 */
static bool parse_tcp(void *data, __u64 off, void *data_end,
		      struct packet_description *pckt)
{
	struct tcphdr *hdr = data + off;

	if (hdr + 1 > data_end)
		return false;

	if (hdr->syn)
		pckt->flags |= F_SYN_SET;

	if (pckt->flags & F_ICMP) {
		/* Header embedded in an ICMP message: ports are stored swapped. */
		pckt->port16[0] = hdr->dest;
		pckt->port16[1] = hdr->source;
	} else {
		pckt->port16[0] = hdr->source;
		pckt->port16[1] = hdr->dest;
	}
	return true;
}
/*
 * Parse one packet, choose a real server for it and redirect it.
 *
 * @data / @data_end delimit the packet, @off is the offset of the L3
 * header, @is_ipv6 selects the IPv6 or IPv4 parsing path and @skb is the
 * program context.
 *
 * Returns TC_ACT_SHOT on every parse or lookup failure, the non-negative
 * verdict of an ICMP helper when it produced one, and otherwise the result
 * of bpf_redirect().
 */
static int process_packet(void *data, __u64 off, void *data_end,
bool is_ipv6, struct __sk_buff *skb)
{
void *pkt_start = (void *)(long)skb->data;
struct packet_description pckt = {};
struct eth_hdr *eth = pkt_start;
struct bpf_tunnel_key tkey = {};
struct vip_stats *data_stats;
struct real_definition *dst;
struct vip_meta *vip_info;
struct ctl_value *cval;
/* Indices into ctl_array for the v4 and v6 output interfaces. */
__u32 v4_intf_pos = 1;
__u32 v6_intf_pos = 2;
struct ipv6hdr *ip6h;
struct vip vip = {};
struct iphdr *iph;
int tun_flag = 0;
__u16 pkt_bytes;
__u64 iph_len;
__u32 ifindex;
__u8 protocol;
__u32 vip_num;
int action;
tkey.tunnel_ttl = 64;
/* L3 header: bounds check, record protocol and length, reject fragments. */
if (is_ipv6) {
ip6h = data + off;
if (ip6h + 1 > data_end)
return TC_ACT_SHOT;
iph_len = sizeof(struct ipv6hdr);
protocol = ip6h->nexthdr;
pckt.proto = protocol;
pkt_bytes = bpf_ntohs(ip6h->payload_len);
off += iph_len;
if (protocol == IPPROTO_FRAGMENT) {
return TC_ACT_SHOT;
} else if (protocol == IPPROTO_ICMPV6) {
action = parse_icmpv6(data, data_end, off, &pckt);
/* A non-negative result is a final verdict; negative means continue. */
if (action >= 0)
return action;
off += IPV6_PLUS_ICMP_HDR;
} else {
memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16);
memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
}
} else {
iph = data + off;
if (iph + 1 > data_end)
return TC_ACT_SHOT;
/* Headers with options are not handled. */
if (iph->ihl != 5)
return TC_ACT_SHOT;
protocol = iph->protocol;
pckt.proto = protocol;
pkt_bytes = bpf_ntohs(iph->tot_len);
off += IPV4_HDR_LEN_NO_OPT;
if (iph->frag_off & PCKT_FRAGMENTED)
return TC_ACT_SHOT;
if (protocol == IPPROTO_ICMP) {
action = parse_icmp(data, data_end, off, &pckt);
/* A non-negative result is a final verdict; negative means continue. */
if (action >= 0)
return action;
off += IPV4_PLUS_ICMP_HDR;
} else {
pckt.src = iph->saddr;
pckt.dst = iph->daddr;
}
}
/* The ICMP helpers may have replaced pckt.proto with the embedded protocol. */
protocol = pckt.proto;
if (protocol == IPPROTO_TCP) {
if (!parse_tcp(data, off, data_end, &pckt))
return TC_ACT_SHOT;
} else if (protocol == IPPROTO_UDP) {
if (!parse_udp(data, off, data_end, &pckt))
return TC_ACT_SHOT;
} else {
return TC_ACT_SHOT;
}
/* Build the VIP key from destination address, destination port, protocol. */
if (is_ipv6)
memcpy(vip.daddr.v6, pckt.dstv6, 16);
else
vip.daddr.v4 = pckt.dst;
vip.dport = pckt.port16[1];
vip.protocol = pckt.proto;
vip_info = bpf_map_lookup_elem(&vip_map, &vip);
if (!vip_info) {
/* Retry with destination port 0. */
vip.dport = 0;
vip_info = bpf_map_lookup_elem(&vip_map, &vip);
if (!vip_info)
return TC_ACT_SHOT;
pckt.port16[1] = 0;
}
if (vip_info->flags & F_HASH_NO_SRC_PORT)
pckt.port16[0] = 0;
if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6))
return TC_ACT_SHOT;
/* Fetch the output interface and fill the tunnel key for the chosen real. */
if (dst->flags & F_IPV6) {
cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos);
if (!cval)
return TC_ACT_SHOT;
ifindex = cval->ifindex;
memcpy(tkey.remote_ipv6, dst->dstv6, 16);
tun_flag = BPF_F_TUNINFO_IPV6;
} else {
cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos);
if (!cval)
return TC_ACT_SHOT;
ifindex = cval->ifindex;
tkey.remote_ipv4 = dst->dst;
}
/* Account the packet against this VIP's counters. */
vip_num = vip_info->vip_num;
data_stats = bpf_map_lookup_elem(&stats, &vip_num);
if (!data_stats)
return TC_ACT_SHOT;
data_stats->pkts++;
data_stats->bytes += pkt_bytes;
bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag);
/* Store the chosen address in the first four bytes of the packet. */
*(u32 *)eth->eth_dest = tkey.remote_ipv4;
return bpf_redirect(ifindex, 0);
}
/*
 * Program entry point: check that the link-layer header fits, then hand
 * IPv4 and IPv6 packets to process_packet(). Anything else gets
 * TC_ACT_SHOT.
 */
SEC("l4lb-demo")
int balancer_ingress(struct __sk_buff *ctx)
{
	void *pkt_end = (void *)(long)ctx->data_end;
	void *pkt = (void *)(long)ctx->data;
	struct eth_hdr *eth = pkt;
	__u32 l3_off = sizeof(struct eth_hdr);
	__u32 proto;

	if (pkt + l3_off > pkt_end)
		return TC_ACT_SHOT;

	proto = eth->eth_proto;
	if (proto == bpf_htons(ETH_P_IPV6))
		return process_packet(pkt, l3_off, pkt_end, true, ctx);
	if (proto == bpf_htons(ETH_P_IP))
		return process_packet(pkt, l3_off, pkt_end, false, ctx);
	return TC_ACT_SHOT;
}
/* License string emitted into the "license" section of the object. */
char _license[] SEC("license") = "GPL";

View file

@ -169,10 +169,9 @@ static void test_xdp(void)
#define NUM_ITER 100000
#define VIP_NUM 5
static void test_l4lb(void)
static void test_l4lb(const char *file)
{
unsigned int nr_cpus = bpf_num_possible_cpus();
const char *file = "./test_l4lb.o";
struct vip key = {.protocol = 6};
struct vip_meta {
__u32 flags;
@ -249,6 +248,95 @@ static void test_l4lb(void)
bpf_object__close(obj);
}
/* Run the l4lb test against the inlined object and then the noinline one. */
static void test_l4lb_all(void)
{
	static const char * const objects[] = {
		"./test_l4lb.o",
		"./test_l4lb_noinline.o",
	};
	unsigned int idx;

	for (idx = 0; idx < sizeof(objects) / sizeof(objects[0]); idx++)
		test_l4lb(objects[idx]);
}
/*
 * Load ./test_xdp_noinline.o as an XDP program, seed its vip_map, ch_rings
 * and reals maps, run it NUM_ITER times on an IPv4 and an IPv6 test packet
 * and check the return value, output size, the magic word written into the
 * packet and the per-CPU counters accumulated in the "stats" map.
 *
 * Failures are reported through error_cnt; the object is always closed.
 */
static void test_xdp_noinline(void)
{
	const char *file = "./test_xdp_noinline.o";
	unsigned int nr_cpus = bpf_num_possible_cpus();
	struct vip key = {.protocol = 6};
	struct vip_meta {
		__u32 flags;
		__u32 vip_num;
	} value = {.vip_num = VIP_NUM};
	__u32 stats_key = VIP_NUM;
	struct vip_stats {
		__u64 bytes;
		__u64 pkts;
	} stats[nr_cpus];
	struct real_definition {
		union {
			__be32 dst;
			__be32 dstv6[4];
		};
		__u8 flags;
	} real_def = {.dst = MAGIC_VAL};
	__u32 ch_key = 11, real_num = 3;
	__u32 duration, retval, size;
	int err, prog_fd, map_fd;
	__u64 bytes = 0, pkts = 0;
	struct bpf_object *obj;
	unsigned int i;
	char buf[128];
	u32 *magic = (u32 *)buf;

	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
	if (err) {
		error_cnt++;
		return;
	}

	/* Seed the maps the program consults for the test packets. */
	map_fd = bpf_find_map(__func__, obj, "vip_map");
	if (map_fd < 0)
		goto out;
	bpf_map_update_elem(map_fd, &key, &value, 0);

	map_fd = bpf_find_map(__func__, obj, "ch_rings");
	if (map_fd < 0)
		goto out;
	bpf_map_update_elem(map_fd, &ch_key, &real_num, 0);

	map_fd = bpf_find_map(__func__, obj, "reals");
	if (map_fd < 0)
		goto out;
	bpf_map_update_elem(map_fd, &real_num, &real_def, 0);

	err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4),
				buf, &size, &retval, &duration);
	CHECK(err || errno || retval != 1 || size != 54 ||
	      *magic != MAGIC_VAL, "ipv4",
	      "err %d errno %d retval %d size %d magic %x\n",
	      err, errno, retval, size, *magic);

	err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6),
				buf, &size, &retval, &duration);
	CHECK(err || errno || retval != 1 || size != 74 ||
	      *magic != MAGIC_VAL, "ipv6",
	      "err %d errno %d retval %d size %d magic %x\n",
	      err, errno, retval, size, *magic);

	map_fd = bpf_find_map(__func__, obj, "stats");
	if (map_fd < 0)
		goto out;

	/*
	 * stats[] is an uninitialised variable-length array, so it must not
	 * be summed unless the lookup actually filled it in.
	 */
	err = bpf_map_lookup_elem(map_fd, &stats_key, stats);
	if (err) {
		error_cnt++;
		printf("test_xdp_noinline:FAIL:stats lookup err %d errno %d\n",
		       err, errno);
		goto out;
	}

	/* The map is per-CPU: add up every CPU's counters. */
	for (i = 0; i < nr_cpus; i++) {
		bytes += stats[i].bytes;
		pkts += stats[i].pkts;
	}
	if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) {
		error_cnt++;
		printf("test_xdp_noinline:FAIL:stats %llu %llu\n",
		       (unsigned long long)bytes, (unsigned long long)pkts);
	}
out:
	bpf_object__close(obj);
}
static void test_tcp_estats(void)
{
const char *file = "./test_tcp_estats.o";
@ -757,7 +845,8 @@ int main(void)
test_pkt_access();
test_xdp();
test_l4lb();
test_l4lb_all();
test_xdp_noinline();
test_tcp_estats();
test_bpf_obj_id();
test_pkt_md_access();

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,833 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2017 Facebook
#include <stddef.h>
#include <stdbool.h>
#include <string.h>
#include <linux/pkt_cls.h>
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include "bpf_helpers.h"
/*
 * printf-like wrapper around bpf_trace_printk(): copies the format string
 * into a local char array and passes that array and its size, followed by
 * the variadic arguments.
 */
#define bpf_printk(fmt, ...) \
({ \
char ____fmt[] = fmt; \
bpf_trace_printk(____fmt, sizeof(____fmt), \
##__VA_ARGS__); \
})
/* Rotate the 32-bit value @word left by @shift bit positions. */
static __u32 rol32(__u32 word, unsigned int shift)
{
	/* Masking the negated count keeps the right-shift amount within 0..31. */
	unsigned int back = (-shift) & 31;
	__u32 upper = word << shift;
	__u32 lower = word >> back;

	return upper | lower;
}
/* Copy of jhash from the kernel sources, kept here to make sure llvm
 * can compile it into a valid sequence of bpf instructions.
 *
 * __jhash_mix() mixes the three 32-bit lanes a, b and c in place using
 * subtract / xor-with-rotate / add rounds. All three arguments must be
 * modifiable lvalues; each one is evaluated several times.
 */
#define __jhash_mix(a, b, c) \
{ \
a -= c; a ^= rol32(c, 4); c += b; \
b -= a; b ^= rol32(a, 6); a += c; \
c -= b; c ^= rol32(b, 8); b += a; \
a -= c; a ^= rol32(c, 16); c += b; \
b -= a; b ^= rol32(a, 19); a += c; \
c -= b; c ^= rol32(b, 4); b += a; \
}
/* Final scrambling of the three 32-bit lanes a, b and c, done in place with
 * xor / subtract-rotate rounds. Callers in this file read the result from c.
 * All three arguments must be modifiable lvalues.
 */
#define __jhash_final(a, b, c) \
{ \
c ^= b; c -= rol32(b, 14); \
a ^= c; a -= rol32(c, 11); \
b ^= a; b -= rol32(a, 25); \
c ^= b; c -= rol32(b, 16); \
a ^= c; a -= rol32(c, 4); \
b ^= a; b -= rol32(a, 14); \
c ^= b; c -= rol32(b, 24); \
}
/* Constant added into the initial value of every hash lane. */
#define JHASH_INITVAL 0xdeadbeef
/* Short alias used by the hash helpers and packet writers below. */
typedef unsigned int u32;
/*
 * Hash @length bytes starting at @key, seeded with @initval, and return
 * the 32-bit result. Marked noinline so it stays a separate function.
 *
 * While more than 12 bytes remain, three u32 words are added into the
 * lanes and mixed with __jhash_mix(). The remaining bytes (at most 12) are
 * folded in byte by byte in the switch and finished with __jhash_final().
 * If @length is 0 at the switch, the final mix is skipped.
 */
static __attribute__ ((noinline))
u32 jhash(const void *key, u32 length, u32 initval)
{
u32 a, b, c;
const unsigned char *k = key;
/* All three lanes start from the same seed value. */
a = b = c = JHASH_INITVAL + length + initval;
while (length > 12) {
a += *(u32 *)(k);
b += *(u32 *)(k + 4);
c += *(u32 *)(k + 8);
__jhash_mix(a, b, c);
length -= 12;
k += 12;
}
/* Every case deliberately falls through into the cases below it. */
switch (length) {
case 12: c += (u32)k[11]<<24;
case 11: c += (u32)k[10]<<16;
case 10: c += (u32)k[9]<<8;
case 9: c += k[8];
case 8: b += (u32)k[7]<<24;
case 7: b += (u32)k[6]<<16;
case 6: b += (u32)k[5]<<8;
case 5: b += k[4];
case 4: a += (u32)k[3]<<24;
case 3: a += (u32)k[2]<<16;
case 2: a += (u32)k[1]<<8;
case 1: a += k[0];
__jhash_final(a, b, c);
case 0: /* Nothing left to add */
break;
}
return c;
}
/*
 * Hash three 32-bit words: add @initval to each of them, run the final
 * mix and return the resulting third lane.
 */
static __attribute__ ((noinline))
u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
	u32 x = a + initval;
	u32 y = b + initval;
	u32 z = c + initval;

	__jhash_final(x, y, z);
	return z;
}
/*
 * Hash two 32-bit words. The third lane is zero and the seed is
 * @initval + JHASH_INITVAL + (2 << 2).
 */
static __attribute__ ((noinline))
u32 jhash_2words(u32 a, u32 b, u32 initval)
{
	u32 seed = initval + JHASH_INITVAL + (2 << 2);

	return __jhash_nwords(a, b, 0, seed);
}
/*
 * Flow identity: addresses, ports and protocol. Used as the key of
 * lru_cache. The IPv4 and IPv6 address forms share storage through the
 * unions.
 */
struct flow_key {
union {
__be32 src;
__be32 srcv6[4];
};
union {
__be32 dst;
__be32 dstv6[4];
};
/* Both ports as one 32-bit word or as two 16-bit halves. */
union {
__u32 ports;
__u16 port16[2];
};
__u8 proto;
};
/*
 * Parsed packet state: the flow key plus flag bits. The parsers set
 * bit 0 for a header embedded in an ICMP message and bit 1 for TCP syn.
 */
struct packet_description {
struct flow_key flow;
__u8 flags;
};
/* Value type of ctl_array; the three members alias the same storage. */
struct ctl_value {
union {
__u64 value;
__u32 ifindex;
__u8 mac[6];
};
};
/* Key type of vip_map: VIP address (v4 or v6), port, family and protocol. */
struct vip_definition {
union {
__be32 vip;
__be32 vipv6[4];
};
__u16 port;
/* Not written by any code in this file. */
__u16 family;
__u8 proto;
};
/* Value type of vip_map: per-VIP flag bits and the VIP's index. */
struct vip_meta {
__u32 flags;
__u32 vip_num;
};
/*
 * Value type of lru_cache: index into the reals map plus a timestamp taken
 * from bpf_ktime_get_ns(), which the code only maintains for UDP flows.
 */
struct real_pos_lru {
__u32 pos;
__u64 atime;
};
/* Value type of the reals map: a destination address (v4 or v6) and flags. */
struct real_definition {
union {
__be32 dst;
__be32 dstv6[4];
};
/* Bit 0 selects encap_v6() over encap_v4() in process_packet(). */
__u8 flags;
};
/* Value type of the stats map: two generic counters; meaning depends on the slot. */
struct lb_stats {
__u64 v2;
__u64 v1;
};
/* Hash map from struct vip_definition to struct vip_meta, 512 entries. */
struct bpf_map_def __attribute__ ((section("maps"), used)) vip_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct vip_definition),
.value_size = sizeof(struct vip_meta),
.max_entries = 512,
.map_flags = 0,
};
/*
 * LRU hash map from struct flow_key to struct real_pos_lru, 300 entries.
 * NOTE(review): map_flags 1U << 1 is presumably BPF_F_NO_COMMON_LRU; confirm
 * against the UAPI header.
 */
struct bpf_map_def __attribute__ ((section("maps"), used)) lru_cache = {
.type = BPF_MAP_TYPE_LRU_HASH,
.key_size = sizeof(struct flow_key),
.value_size = sizeof(struct real_pos_lru),
.max_entries = 300,
.map_flags = 1U << 1,
};
/*
 * Array map of __u32 values with 12 * 655 entries. get_packet_dst() uses
 * the looked-up value as the key into the reals map.
 */
struct bpf_map_def __attribute__ ((section("maps"), used)) ch_rings = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = 12 * 655,
.map_flags = 0,
};
/* Array map of struct real_definition with 40 entries. */
struct bpf_map_def __attribute__ ((section("maps"), used)) reals = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct real_definition),
.max_entries = 40,
.map_flags = 0,
};
/*
 * Per-CPU array of struct lb_stats with 515 entries. The code indexes it
 * by vip_num and also uses the fixed slots 512, 513 and 514.
 */
struct bpf_map_def __attribute__ ((section("maps"), used)) stats = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct lb_stats),
.max_entries = 515,
.map_flags = 0,
};
/*
 * Array map of struct ctl_value with 16 entries. process_packet() reads
 * index 0 and passes the entry to the encap helpers, which copy its mac.
 */
struct bpf_map_def __attribute__ ((section("maps"), used)) ctl_array = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct ctl_value),
.max_entries = 16,
.map_flags = 0,
};
/* Local view of the link-layer header at the start of the packet. */
struct eth_hdr {
unsigned char eth_dest[6];
unsigned char eth_source[6];
unsigned short eth_proto;
};
/*
 * Offset of the L4 header from the start of the packet: the link-layer
 * header plus one L3 header, plus an ICMP header and a second L3 header
 * when the L4 header is embedded in an ICMP message.
 */
static inline __u64 calc_offset(bool is_ipv6, bool is_icmp)
{
	__u64 l3_len = is_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr);
	__u64 icmp_len = is_ipv6 ? sizeof(struct icmp6hdr)
				 : sizeof(struct icmphdr);
	__u64 off = sizeof(struct eth_hdr) + l3_len;

	if (is_icmp)
		off += icmp_len + l3_len;
	return off;
}
/*
 * Read the UDP ports into @pckt. The header position is derived from
 * @is_ipv6 and from bit 0 of pckt->flags. Returns 0 when the header does
 * not fit before @data_end, 1 otherwise.
 */
static __attribute__ ((noinline))
bool parse_udp(void *data, void *data_end,
	       bool is_ipv6, struct packet_description *pckt)
{
	bool via_icmp = (pckt->flags & (1 << 0)) != 0;
	struct udphdr *hdr = data + calc_offset(is_ipv6, via_icmp);

	if (hdr + 1 > data_end)
		return false;

	if (via_icmp) {
		/* Header embedded in an ICMP message: ports are stored swapped. */
		pckt->flow.port16[0] = hdr->dest;
		pckt->flow.port16[1] = hdr->source;
	} else {
		pckt->flow.port16[0] = hdr->source;
		pckt->flow.port16[1] = hdr->dest;
	}
	return true;
}
/*
 * Read the TCP ports into @pckt and set bit 1 of pckt->flags when the syn
 * bit is set. The header position is derived from @is_ipv6 and from bit 0
 * of pckt->flags. Returns 0 when the header does not fit before @data_end,
 * 1 otherwise.
 */
static __attribute__ ((noinline))
bool parse_tcp(void *data, void *data_end,
	       bool is_ipv6, struct packet_description *pckt)
{
	bool via_icmp = (pckt->flags & (1 << 0)) != 0;
	struct tcphdr *hdr = data + calc_offset(is_ipv6, via_icmp);

	if (hdr + 1 > data_end)
		return false;

	if (hdr->syn)
		pckt->flags |= (1 << 1);

	if (via_icmp) {
		/* Header embedded in an ICMP message: ports are stored swapped. */
		pckt->flow.port16[0] = hdr->dest;
		pckt->flow.port16[1] = hdr->source;
	} else {
		pckt->flow.port16[0] = hdr->source;
		pckt->flow.port16[1] = hdr->dest;
	}
	return true;
}
/*
 * Grow the packet head by sizeof(struct ipv6hdr) and write a new
 * link-layer header followed by an outer IPv6 header addressed to
 * @dst->dstv6.
 *
 * Returns 0 when bpf_xdp_adjust_head() fails or the re-read packet bounds
 * do not cover the headers being written, 1 otherwise.
 */
static __attribute__ ((noinline))
bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval,
struct packet_description *pckt,
struct real_definition *dst, __u32 pkt_bytes)
{
struct eth_hdr *new_eth;
struct eth_hdr *old_eth;
struct ipv6hdr *ip6h;
__u32 ip_suffix;
void *data_end;
void *data;
/* A negative delta makes room in front of the current packet start. */
if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
return 0;
/* Packet pointers must be re-read from the context after the adjust. */
data = (void *)(long)xdp->data;
data_end = (void *)(long)xdp->data_end;
new_eth = data;
ip6h = data + sizeof(struct eth_hdr);
old_eth = data + sizeof(struct ipv6hdr);
if (new_eth + 1 > data_end ||
old_eth + 1 > data_end || ip6h + 1 > data_end)
return 0;
memcpy(new_eth->eth_dest, cval->mac, 6);
memcpy(new_eth->eth_source, old_eth->eth_dest, 6);
/* Same value balancer_ingress() matches to select the IPv6 path. */
new_eth->eth_proto = 56710;
ip6h->version = 6;
ip6h->priority = 0;
memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
ip6h->nexthdr = IPPROTO_IPV6;
ip_suffix = pckt->flow.srcv6[3] ^ pckt->flow.port16[0];
ip6h->payload_len =
__builtin_bswap16(pkt_bytes + sizeof(struct ipv6hdr));
ip6h->hop_limit = 4;
/* Fixed source prefix; the last word is derived from the flow. */
ip6h->saddr.in6_u.u6_addr32[0] = 1;
ip6h->saddr.in6_u.u6_addr32[1] = 2;
ip6h->saddr.in6_u.u6_addr32[2] = 3;
ip6h->saddr.in6_u.u6_addr32[3] = ip_suffix;
memcpy(ip6h->daddr.in6_u.u6_addr32, dst->dstv6, 16);
return 1;
}
/*
 * Grow the packet head by sizeof(struct iphdr), write a new link-layer
 * header and an outer IPv4 header with its checksum, then shrink the head
 * back by the same amount.
 *
 * Returns 0 when either bpf_xdp_adjust_head() call fails or the re-read
 * packet bounds do not cover the headers being written, 1 otherwise.
 */
static __attribute__ ((noinline))
bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval,
struct packet_description *pckt,
struct real_definition *dst, __u32 pkt_bytes)
{
__u32 ip_suffix = __builtin_bswap16(pckt->flow.port16[0]);
struct eth_hdr *new_eth;
struct eth_hdr *old_eth;
__u16 *next_iph_u16;
struct iphdr *iph;
__u32 csum = 0;
void *data_end;
void *data;
ip_suffix <<= 15;
ip_suffix ^= pckt->flow.src;
/* A negative delta makes room in front of the current packet start. */
if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
return 0;
/* Packet pointers must be re-read from the context after the adjust. */
data = (void *)(long)xdp->data;
data_end = (void *)(long)xdp->data_end;
new_eth = data;
iph = data + sizeof(struct eth_hdr);
old_eth = data + sizeof(struct iphdr);
if (new_eth + 1 > data_end ||
old_eth + 1 > data_end || iph + 1 > data_end)
return 0;
memcpy(new_eth->eth_dest, cval->mac, 6);
memcpy(new_eth->eth_source, old_eth->eth_dest, 6);
/* Same value balancer_ingress() matches to select the IPv4 path. */
new_eth->eth_proto = 8;
iph->version = 4;
iph->ihl = 5;
iph->frag_off = 0;
iph->protocol = IPPROTO_IPIP;
iph->check = 0;
iph->tos = 1;
iph->tot_len = __builtin_bswap16(pkt_bytes + sizeof(struct iphdr));
/* don't update iph->daddr, since it will overwrite old eth_proto
 * and multiple iterations of bpf_prog_run() will fail
 */
iph->saddr = ((0xFFFF0000 & ip_suffix) | 4268) ^ dst->dst;
iph->ttl = 4;
/* Sum the header as 16-bit words and fold the carry once. */
next_iph_u16 = (__u16 *) iph;
#pragma clang loop unroll(full)
for (int i = 0; i < sizeof(struct iphdr) >> 1; i++)
csum += *next_iph_u16++;
iph->check = ~((csum & 0xffff) + (csum >> 16));
/* Move the head forward again by the amount added above. */
if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct iphdr)))
return 0;
return 1;
}
/*
 * Strip an outer IPv6 header: copy the link-layer addresses
 * sizeof(struct ipv6hdr) bytes further into the packet, set eth_proto
 * according to @inner_v4, move the head forward and refresh *@data and
 * *@data_end from the context.
 *
 * Returns 0 when bpf_xdp_adjust_head() fails, 1 otherwise.
 * NOTE(review): no bounds check is performed before writing through
 * new_eth, and no caller is visible in this file; confirm before reuse.
 */
static __attribute__ ((noinline))
bool decap_v6(struct xdp_md *xdp, void **data, void **data_end, bool inner_v4)
{
struct eth_hdr *new_eth;
struct eth_hdr *old_eth;
old_eth = *data;
new_eth = *data + sizeof(struct ipv6hdr);
memcpy(new_eth->eth_source, old_eth->eth_source, 6);
memcpy(new_eth->eth_dest, old_eth->eth_dest, 6);
if (inner_v4)
new_eth->eth_proto = 8;
else
new_eth->eth_proto = 56710;
if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct ipv6hdr)))
return 0;
*data = (void *)(long)xdp->data;
*data_end = (void *)(long)xdp->data_end;
return 1;
}
/*
 * Strip an outer IPv4 header: copy the link-layer addresses
 * sizeof(struct iphdr) bytes further into the packet, set eth_proto to 8,
 * move the head forward and refresh *@data and *@data_end from the context.
 *
 * Returns 0 when bpf_xdp_adjust_head() fails, 1 otherwise.
 * NOTE(review): no bounds check is performed before writing through
 * new_eth, and no caller is visible in this file; confirm before reuse.
 */
static __attribute__ ((noinline))
bool decap_v4(struct xdp_md *xdp, void **data, void **data_end)
{
struct eth_hdr *new_eth;
struct eth_hdr *old_eth;
old_eth = *data;
new_eth = *data + sizeof(struct iphdr);
memcpy(new_eth->eth_source, old_eth->eth_source, 6);
memcpy(new_eth->eth_dest, old_eth->eth_dest, 6);
new_eth->eth_proto = 8;
if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct iphdr)))
return 0;
*data = (void *)(long)xdp->data;
*data_end = (void *)(long)xdp->data_end;
return 1;
}
/*
 * Exchange the source and destination link-layer addresses of the packet
 * at @data and return XDP_TX. @data_end is not used.
 */
static __attribute__ ((noinline))
int swap_mac_and_send(void *data, void *data_end)
{
	struct eth_hdr *hdr = data;
	unsigned char saved_src[6];

	memcpy(saved_src, hdr->eth_source, 6);
	memcpy(hdr->eth_source, hdr->eth_dest, 6);
	memcpy(hdr->eth_dest, saved_src, 6);
	return XDP_TX;
}
/*
 * Rewrite the IPv4 ICMP packet at @data in place into a reply: set the
 * ICMP type to 0, adjust the ICMP checksum, swap the IP addresses,
 * recompute the IP header checksum and finally swap the MAC addresses.
 *
 * Returns XDP_DROP when the three headers do not fit before @data_end,
 * otherwise the value returned by swap_mac_and_send().
 */
static __attribute__ ((noinline))
int send_icmp_reply(void *data, void *data_end)
{
	struct icmphdr *icmp_hdr;
	__u16 *next_iph_u16;
	__u32 tmp_addr = 0;
	struct iphdr *iph;
	__u32 csum = 0;
	__u64 off = 0;

	if (data + sizeof(struct eth_hdr)
	     + sizeof(struct iphdr) + sizeof(struct icmphdr) > data_end)
		return XDP_DROP;
	off += sizeof(struct eth_hdr);
	iph = data + off;
	off += sizeof(struct iphdr);
	icmp_hdr = data + off;
	icmp_hdr->type = 0;
	icmp_hdr->checksum += 0x0007;
	iph->ttl = 4;
	/* Send the packet back to where it came from. */
	tmp_addr = iph->daddr;
	iph->daddr = iph->saddr;
	iph->saddr = tmp_addr;
	/* The checksum field must be zero while the header is summed. */
	iph->check = 0;
	next_iph_u16 = (__u16 *) iph;
#pragma clang loop unroll(full)
	for (int i = 0; i < sizeof(struct iphdr) >> 1; i++)
		csum += *next_iph_u16++;
	iph->check = ~((csum & 0xffff) + (csum >> 16));
	return swap_mac_and_send(data, data_end);
}
/*
 * Rewrite the ICMPv6 packet at @data in place into a reply: set the type
 * to 129, adjust the checksum, swap the IPv6 addresses and finally swap
 * the MAC addresses.
 *
 * Returns XDP_DROP when the three headers do not fit before @data_end,
 * otherwise the value returned by swap_mac_and_send().
 */
static __attribute__ ((noinline))
int send_icmp6_reply(void *data, void *data_end)
{
	struct icmp6hdr *icmp6;
	struct ipv6hdr *ip6h;
	__be32 saved_src[4];
	__u64 pos = sizeof(struct eth_hdr);

	if (data + sizeof(struct eth_hdr)
	     + sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) > data_end)
		return XDP_DROP;

	ip6h = data + pos;
	pos += sizeof(struct ipv6hdr);
	icmp6 = data + pos;

	icmp6->icmp6_type = 129;
	icmp6->icmp6_cksum -= 0x0001;
	ip6h->hop_limit = 4;

	/* Send the packet back to where it came from. */
	memcpy(saved_src, ip6h->saddr.in6_u.u6_addr32, 16);
	memcpy(ip6h->saddr.in6_u.u6_addr32, ip6h->daddr.in6_u.u6_addr32, 16);
	memcpy(ip6h->daddr.in6_u.u6_addr32, saved_src, 16);

	return swap_mac_and_send(data, data_end);
}
/*
 * Parse the ICMPv6 header at @data + @off.
 *
 * Returns XDP_DROP when a header does not fit before @data_end, the result
 * of send_icmp6_reply() for type 128, and XDP_PASS for any type other than
 * 3. For type 3 it fills @pckt from the IPv6 header that follows and
 * returns -1 so the caller continues processing.
 * NOTE(review): the type values are bare literals; confirm that 3 is the
 * intended ICMPv6 type.
 */
static __attribute__ ((noinline))
int parse_icmpv6(void *data, void *data_end, __u64 off,
struct packet_description *pckt)
{
struct icmp6hdr *icmp_hdr;
struct ipv6hdr *ip6h;
icmp_hdr = data + off;
if (icmp_hdr + 1 > data_end)
return XDP_DROP;
if (icmp_hdr->icmp6_type == 128)
return send_icmp6_reply(data, data_end);
if (icmp_hdr->icmp6_type != 3)
return XDP_PASS;
off += sizeof(struct icmp6hdr);
ip6h = data + off;
if (ip6h + 1 > data_end)
return XDP_DROP;
pckt->flow.proto = ip6h->nexthdr;
/* Bit 0 marks the flow as taken from an ICMP-embedded header. */
pckt->flags |= (1 << 0);
/* Source and destination are swapped relative to the embedded header. */
memcpy(pckt->flow.srcv6, ip6h->daddr.in6_u.u6_addr32, 16);
memcpy(pckt->flow.dstv6, ip6h->saddr.in6_u.u6_addr32, 16);
return -1;
}
/*
 * Parse the ICMP header at @data + @off.
 *
 * Returns XDP_DROP when a header does not fit before @data_end or the
 * embedded IPv4 header has ihl != 5, the result of send_icmp_reply() for
 * type 8, and XDP_PASS unless type is 3 and code is 4. In that last case
 * it fills @pckt from the embedded IPv4 header and returns -1 so the
 * caller continues processing.
 */
static __attribute__ ((noinline))
int parse_icmp(void *data, void *data_end, __u64 off,
struct packet_description *pckt)
{
struct icmphdr *icmp_hdr;
struct iphdr *iph;
icmp_hdr = data + off;
if (icmp_hdr + 1 > data_end)
return XDP_DROP;
if (icmp_hdr->type == 8)
return send_icmp_reply(data, data_end);
if ((icmp_hdr->type != 3) || (icmp_hdr->code != 4))
return XDP_PASS;
off += sizeof(struct icmphdr);
iph = data + off;
if (iph + 1 > data_end)
return XDP_DROP;
if (iph->ihl != 5)
return XDP_DROP;
pckt->flow.proto = iph->protocol;
/* Bit 0 marks the flow as taken from an ICMP-embedded header. */
pckt->flags |= (1 << 0);
/* Source and destination are swapped relative to the embedded header. */
pckt->flow.src = iph->daddr;
pckt->flow.dst = iph->saddr;
return -1;
}
/*
 * Hash the source address and the ports of @pckt. With @hash_16bytes set,
 * the 16-byte source address is first reduced with jhash().
 */
static __attribute__ ((noinline))
__u32 get_packet_hash(struct packet_description *pckt,
		      bool hash_16bytes)
{
	__u32 addr_word;

	if (!hash_16bytes)
		return jhash_2words(pckt->flow.src, pckt->flow.ports, 24);

	addr_word = jhash(pckt->flow.srcv6, 16, 12);
	return jhash_2words(addr_word, pckt->flow.ports, 24);
}
/*
 * Pick the real server for @pckt via ch_rings / reals, store it in *@real
 * and optionally record the choice in @lru_map.
 *
 * Returns 0 when the hash is not one of the two values the test expects or
 * when a ch_rings / reals lookup fails; returns 1 otherwise.
 *
 * vip_info->flags bits used here:
 *   bit 1 - skip the rate accounting and the LRU update
 *   bit 2 - always hash the 16-byte source address
 *   bit 3 - copy port16[1] into port16[0] and zero the source address
 */
__attribute__ ((noinline))
static bool get_packet_dst(struct real_definition **real,
struct packet_description *pckt,
struct vip_meta *vip_info,
bool is_ipv6, void *lru_map)
{
struct real_pos_lru new_dst_lru = { };
bool hash_16bytes = is_ipv6;
__u32 *real_pos, hash, key;
__u64 cur_time;
if (vip_info->flags & (1 << 2))
hash_16bytes = 1;
if (vip_info->flags & (1 << 3)) {
pckt->flow.port16[0] = pckt->flow.port16[1];
memset(pckt->flow.srcv6, 0, 16);
}
hash = get_packet_hash(pckt, hash_16bytes);
if (hash != 0x358459b7 /* jhash of ipv4 packet */ &&
hash != 0x2f4bc6bb /* jhash of ipv6 packet */)
return 0;
key = 2 * vip_info->vip_num + hash % 2;
real_pos = bpf_map_lookup_elem(&ch_rings, &key);
if (!real_pos)
return 0;
/* The ring entry is the index into the reals array. */
key = *real_pos;
*real = bpf_map_lookup_elem(&reals, &key);
if (!(*real))
return 0;
if (!(vip_info->flags & (1 << 1))) {
/* Slot 514 of the stats map holds the rate window state. */
__u32 conn_rate_key = 512 + 2;
struct lb_stats *conn_rate_stats =
bpf_map_lookup_elem(&stats, &conn_rate_key);
if (!conn_rate_stats)
return 1;
cur_time = bpf_ktime_get_ns();
/* v2 is the window start time, v1 the count within the window. */
if ((cur_time - conn_rate_stats->v2) >> 32 > 0xffFFFF) {
conn_rate_stats->v1 = 1;
conn_rate_stats->v2 = cur_time;
} else {
conn_rate_stats->v1 += 1;
/* True unless the counter wrapped to 0, so this path normally
 * returns before the LRU update below.
 */
if (conn_rate_stats->v1 >= 1)
return 1;
}
/* atime is only recorded for UDP flows; it stays 0 otherwise. */
if (pckt->flow.proto == IPPROTO_UDP)
new_dst_lru.atime = cur_time;
new_dst_lru.pos = key;
bpf_map_update_elem(lru_map, &pckt->flow, &new_dst_lru, 0);
}
return 1;
}
/*
 * Look up the flow of @pckt in @lru_map and, on a hit, store the matching
 * entry of the reals map in *@real. *@real is left untouched on a miss or
 * when a UDP entry is older than 300000 ns; a fresh UDP entry gets its
 * atime refreshed.
 */
__attribute__ ((noinline))
static void connection_table_lookup(struct real_definition **real,
				    struct packet_description *pckt,
				    void *lru_map)
{
	struct real_pos_lru *entry;
	__u32 real_idx;

	entry = bpf_map_lookup_elem(lru_map, &pckt->flow);
	if (!entry)
		return;

	if (pckt->flow.proto == IPPROTO_UDP) {
		__u64 now = bpf_ktime_get_ns();

		if (now - entry->atime > 300000)
			return;
		entry->atime = now;
	}

	real_idx = entry->pos;
	*real = bpf_map_lookup_elem(&reals, &real_idx);
}
/* don't believe your eyes!
 * below function has 6 arguments whereas bpf and llvm allow maximum of 5
 * but since it's _static_ llvm can optimize one argument away
 *
 * Parse the IPv6 header at @data + @off: store the next-header value in
 * *@protocol and pckt->flow.proto, and the byte-swapped payload length in
 * *@pkt_bytes.
 *
 * Returns XDP_DROP when the header does not fit before @data_end or the
 * next header is 45, the non-negative verdict of parse_icmpv6() when the
 * next header is 59 and that helper produced one, and -1 when the caller
 * should continue with L4 parsing.
 * NOTE(review): IPPROTO_FRAGMENT is 44 and IPPROTO_ICMPV6 is 58 in the UAPI
 * headers; confirm that the literals 45 and 59 are intentional.
 */
__attribute__ ((noinline))
static int process_l3_headers_v6(struct packet_description *pckt,
__u8 *protocol, __u64 off,
__u16 *pkt_bytes, void *data,
void *data_end)
{
struct ipv6hdr *ip6h;
__u64 iph_len;
int action;
ip6h = data + off;
if (ip6h + 1 > data_end)
return XDP_DROP;
iph_len = sizeof(struct ipv6hdr);
*protocol = ip6h->nexthdr;
pckt->flow.proto = *protocol;
*pkt_bytes = __builtin_bswap16(ip6h->payload_len);
off += iph_len;
if (*protocol == 45) {
return XDP_DROP;
} else if (*protocol == 59) {
action = parse_icmpv6(data, data_end, off, pckt);
/* A non-negative result is a final verdict; negative means continue. */
if (action >= 0)
return action;
} else {
memcpy(pckt->flow.srcv6, ip6h->saddr.in6_u.u6_addr32, 16);
memcpy(pckt->flow.dstv6, ip6h->daddr.in6_u.u6_addr32, 16);
}
return -1;
}
/*
 * Parse the IPv4 header at @data + @off: store the protocol in *@protocol
 * and pckt->flow.proto, and the byte-swapped total length in *@pkt_bytes.
 *
 * Returns XDP_DROP when the header does not fit before @data_end, has
 * ihl != 5 or has any of the frag_off bits in mask 65343 set; the
 * non-negative verdict of parse_icmp() for ICMP packets when that helper
 * produced one; and -1 when the caller should continue with L4 parsing.
 */
__attribute__ ((noinline))
static int process_l3_headers_v4(struct packet_description *pckt,
				 __u8 *protocol, __u64 off,
				 __u16 *pkt_bytes, void *data,
				 void *data_end)
{
	struct iphdr *iph;
	int action;

	iph = data + off;
	if (iph + 1 > data_end)
		return XDP_DROP;
	/* Headers with options are not handled. */
	if (iph->ihl != 5)
		return XDP_DROP;
	*protocol = iph->protocol;
	pckt->flow.proto = *protocol;
	*pkt_bytes = __builtin_bswap16(iph->tot_len);
	/* Fixed 20-byte header, guaranteed by the ihl check above. */
	off += 20;
	if (iph->frag_off & 65343)
		return XDP_DROP;
	if (*protocol == IPPROTO_ICMP) {
		action = parse_icmp(data, data_end, off, pckt);
		/* A non-negative result is a final verdict; negative means continue. */
		if (action >= 0)
			return action;
	} else {
		pckt->flow.src = iph->saddr;
		pckt->flow.dst = iph->daddr;
	}
	return -1;
}
/*
 * Parse one packet, choose a real server for it, encapsulate it and stamp
 * the chosen address into the first four bytes of the packet.
 *
 * @data / @data_end delimit the packet, @off is the offset of the L3
 * header, @is_ipv6 selects the parsing path and @xdp is the program
 * context.
 *
 * Returns the non-negative verdict of the L3 helpers when they produced
 * one, XDP_TX for protocols other than TCP and UDP, XDP_PASS when no VIP
 * matches, and XDP_DROP in every other case, including the fully
 * successful path.
 */
__attribute__ ((noinline))
static int process_packet(void *data, __u64 off, void *data_end,
			  bool is_ipv6, struct xdp_md *xdp)
{
	struct real_definition *dst = NULL;
	struct packet_description pckt = { };
	struct vip_definition vip = { };
	struct lb_stats *data_stats;
	void *lru_map = &lru_cache;
	struct vip_meta *vip_info;
	/* Fixed slots of the stats map used below. */
	__u32 lru_stats_key = 513;
	__u32 mac_addr_pos = 0;
	__u32 stats_key = 512;
	struct ctl_value *cval;
	__u16 pkt_bytes;
	__u8 protocol;
	__u32 vip_num;
	int action;

	if (is_ipv6)
		action = process_l3_headers_v6(&pckt, &protocol, off,
					       &pkt_bytes, data, data_end);
	else
		action = process_l3_headers_v4(&pckt, &protocol, off,
					       &pkt_bytes, data, data_end);
	/* A non-negative result is a final verdict; negative means continue. */
	if (action >= 0)
		return action;

	/* The ICMP helpers may have replaced flow.proto with the embedded one. */
	protocol = pckt.flow.proto;
	if (protocol == IPPROTO_TCP) {
		if (!parse_tcp(data, data_end, is_ipv6, &pckt))
			return XDP_DROP;
	} else if (protocol == IPPROTO_UDP) {
		if (!parse_udp(data, data_end, is_ipv6, &pckt))
			return XDP_DROP;
	} else {
		return XDP_TX;
	}

	/* Build the VIP key from destination address, destination port, protocol. */
	if (is_ipv6)
		memcpy(vip.vipv6, pckt.flow.dstv6, 16);
	else
		vip.vip = pckt.flow.dst;
	vip.port = pckt.flow.port16[1];
	vip.proto = pckt.flow.proto;
	vip_info = bpf_map_lookup_elem(&vip_map, &vip);
	if (!vip_info) {
		/* Retry with port 0. */
		vip.port = 0;
		vip_info = bpf_map_lookup_elem(&vip_map, &vip);
		if (!vip_info)
			return XDP_PASS;
		if (!(vip_info->flags & (1 << 4)))
			pckt.flow.port16[1] = 0;
	}

	if (data_end - data > 1400)
		return XDP_DROP;

	data_stats = bpf_map_lookup_elem(&stats, &stats_key);
	if (!data_stats)
		return XDP_DROP;
	data_stats->v1 += 1;

	/* dst is still NULL at this point, so this branch is always taken. */
	if (!dst) {
		if (vip_info->flags & (1 << 0))
			pckt.flow.port16[0] = 0;
		/* Consult the LRU unless this is a syn or the VIP disables it. */
		if (!(pckt.flags & (1 << 1)) && !(vip_info->flags & (1 << 1)))
			connection_table_lookup(&dst, &pckt, lru_map);
		if (dst)
			goto out;
		if (pckt.flow.proto == IPPROTO_TCP) {
			struct lb_stats *lru_stats =
				bpf_map_lookup_elem(&stats, &lru_stats_key);

			if (!lru_stats)
				return XDP_DROP;
			/* v1 counts syn packets, v2 the remaining LRU misses. */
			if (pckt.flags & (1 << 1))
				lru_stats->v1 += 1;
			else
				lru_stats->v2 += 1;
		}
		if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6, lru_map))
			return XDP_DROP;
		data_stats->v2 += 1;
	}
out:
	cval = bpf_map_lookup_elem(&ctl_array, &mac_addr_pos);
	if (!cval)
		return XDP_DROP;
	if (dst->flags & (1 << 0)) {
		if (!encap_v6(xdp, cval, &pckt, dst, pkt_bytes))
			return XDP_DROP;
	} else {
		if (!encap_v4(xdp, cval, &pckt, dst, pkt_bytes))
			return XDP_DROP;
	}

	/* Account the packet against this VIP's counters. */
	vip_num = vip_info->vip_num;
	data_stats = bpf_map_lookup_elem(&stats, &vip_num);
	if (!data_stats)
		return XDP_DROP;
	data_stats->v1 += 1;
	data_stats->v2 += pkt_bytes;

	/* The encap helpers moved the head, so the bounds must be re-read. */
	data = (void *)(long)xdp->data;
	data_end = (void *)(long)xdp->data_end;
	if (data + 4 > data_end)
		return XDP_DROP;
	*(u32 *)data = dst->dst;
	return XDP_DROP;
}
/*
 * Program entry point: check that the link-layer header fits, then hand
 * the packet to process_packet() with is_ipv6 chosen from eth_proto
 * (8 selects the IPv4 path, 56710 the IPv6 path). Anything else gets
 * XDP_DROP.
 */
__attribute__ ((section("xdp-test"), used))
int balancer_ingress(struct xdp_md *ctx)
{
	void *pkt = (void *)(long)ctx->data;
	void *pkt_end = (void *)(long)ctx->data_end;
	struct eth_hdr *eth = pkt;
	__u32 l3_off = sizeof(struct eth_hdr);
	__u32 proto;

	if (pkt + l3_off > pkt_end)
		return XDP_DROP;

	proto = eth->eth_proto;
	if (proto == 56710)
		return process_packet(pkt, l3_off, pkt_end, 1, ctx);
	if (proto == 8)
		return process_packet(pkt, l3_off, pkt_end, 0, ctx);
	return XDP_DROP;
}
char _license[] __attribute__ ((section("license"), used)) = "GPL";
int _version __attribute__ ((section("version"), used)) = 1;