Merge branch 'bpf-metadata-direct-access'

Daniel Borkmann says:

====================
BPF metadata for direct access

This work enables generic transfer of metadata from XDP into skb,
meaning the packet has a flexible and programmable room for
metadata, which can later be used by BPF to set various skb
members when passing up the stack. For details, please see the
second patch. Support has been implemented and tested with two
drivers, and should be straightforward to add to other drivers
that already properly support head adjustment.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller 2017-09-26 13:36:45 -07:00
commit 390e96ec8e
29 changed files with 760 additions and 105 deletions
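As a rough illustration of the flow described above (a minimal sketch, not code from this series; the complete, tested version is the test_xdp_meta.c selftest added at the end of this commit): an XDP program reserves metadata room in front of the packet with bpf_xdp_adjust_meta() and fills it in; after XDP_PASS the driver transfers the area to the skb via skb_metadata_set(), and a tc/BPF program can read it back through __sk_buff.data_meta.

#include <linux/bpf.h>
#include "bpf_helpers.h" /* SEC() and the bpf_xdp_adjust_meta stub added below */

SEC("xdp")
int xdp_store_meta(struct xdp_md *ctx)
{
	void *data, *data_meta;
	__u32 *meta;

	/* A negative delta grows the metadata area, here by 4 bytes. */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(__u32)))
		return XDP_PASS;

	/* Reload pointers; the helper invalidates prior packet pointers. */
	data      = (void *)(unsigned long)ctx->data;
	data_meta = (void *)(unsigned long)ctx->data_meta;

	meta = data_meta;
	if ((void *)(meta + 1) > data) /* bounds check for the verifier */
		return XDP_PASS;

	*meta = 0xcafe; /* hypothetical mark consumed by a tc/BPF program */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";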

View file

@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
xdp.data_hard_start = *data_ptr - offset;
xdp.data = *data_ptr;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = *data_ptr + *len;
orig_data = xdp.data;
mapping = rx_buf->mapping - bp->rx_dma_offset;

View file

@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
xdp.data_hard_start = page_address(page);
xdp.data = (void *)cpu_addr;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
orig_data = xdp.data;

View file

@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
if (!skb) {
xdp.data = page_address(rx_buffer->page) +
rx_buffer->page_offset;
xdp_set_data_meta_invalid(&xdp);
xdp.data_hard_start = xdp.data -
i40e_rx_offset(rx_ring);
xdp.data_end = xdp.data + size;

View file

@ -2133,6 +2133,21 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
#if L1_CACHE_BYTES < 128
prefetch(xdp->data + L1_CACHE_BYTES);
#endif
/* Note, we get here by enabling legacy-rx via:
*
* ethtool --set-priv-flags <dev> legacy-rx on
*
* In this mode, we currently get 0 extra XDP headroom as
* opposed to having legacy-rx off, where we process XDP
* packets going to stack via ixgbe_build_skb(). The latter
* provides us currently with 192 bytes of headroom.
*
* For ixgbe_construct_skb() mode it means that the
* xdp->data_meta will always point to xdp->data, since
* the helper cannot expand the head. Should this ever
* change in the future with legacy-rx on, then let's also
* add xdp->data_meta handling here.
*/
/* allocate a skb to store the frags */
skb = napi_alloc_skb(&rx_ring->q_vector->napi, IXGBE_RX_HDR_SIZE);
@ -2165,6 +2180,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
struct xdp_buff *xdp,
union ixgbe_adv_rx_desc *rx_desc)
{
unsigned int metasize = xdp->data - xdp->data_meta;
#if (PAGE_SIZE < 8192)
unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
#else
@ -2174,10 +2190,14 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
#endif
struct sk_buff *skb;
/* prefetch first cache line of first page */
prefetch(xdp->data);
/* Prefetch first cache line of first page. If xdp->data_meta
* is unused, it points exactly to xdp->data; otherwise we
* likely have a consumer accessing the first few bytes of
* meta data before the actual data.
*/
prefetch(xdp->data_meta);
#if L1_CACHE_BYTES < 128
prefetch(xdp->data + L1_CACHE_BYTES);
prefetch(xdp->data_meta + L1_CACHE_BYTES);
#endif
/* build an skb around the page buffer */
@ -2188,6 +2208,8 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
/* update pointers within the skb to store the data */
skb_reserve(skb, xdp->data - xdp->data_hard_start);
__skb_put(skb, xdp->data_end - xdp->data);
if (metasize)
skb_metadata_set(skb, metasize);
/* record DMA address if this is the start of a chain of buffers */
if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
@ -2326,6 +2348,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
if (!skb) {
xdp.data = page_address(rx_buffer->page) +
rx_buffer->page_offset;
xdp.data_meta = xdp.data;
xdp.data_hard_start = xdp.data -
ixgbe_rx_offset(rx_ring);
xdp.data_end = xdp.data + size;

View file

@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
xdp.data_hard_start = va - frags[0].page_offset;
xdp.data = va;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + length;
orig_data = xdp.data;

View file

@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
return false;
xdp.data = va + *rx_headroom;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + *len;
xdp.data_hard_start = va;

View file

@ -1574,26 +1574,6 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
return true;
}
static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start,
unsigned int *off, unsigned int *len)
{
struct xdp_buff xdp;
void *orig_data;
int ret;
xdp.data_hard_start = hard_start;
xdp.data = data + *off;
xdp.data_end = data + *off + *len;
orig_data = xdp.data;
ret = bpf_prog_run_xdp(prog, &xdp);
*len -= xdp.data - orig_data;
*off += xdp.data - orig_data;
return ret;
}
/**
* nfp_net_rx() - receive up to @budget packets on @rx_ring
* @rx_ring: RX ring to receive from
@ -1629,6 +1609,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
struct nfp_meta_parsed meta;
struct net_device *netdev;
dma_addr_t new_dma_addr;
u32 meta_len_xdp = 0;
void *new_frag;
idx = D_IDX(rx_ring, rx_ring->rd_p);
@ -1707,16 +1688,24 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
if (xdp_prog && !(rxd->rxd.flags & PCIE_DESC_RX_BPF &&
dp->bpf_offload_xdp) && !meta.portid) {
void *orig_data = rxbuf->frag + pkt_off;
unsigned int dma_off;
void *hard_start;
struct xdp_buff xdp;
int act;
hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
xdp.data_hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
xdp.data = orig_data;
xdp.data_meta = orig_data;
xdp.data_end = orig_data + pkt_len;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
pkt_len -= xdp.data - orig_data;
pkt_off += xdp.data - orig_data;
act = nfp_net_run_xdp(xdp_prog, rxbuf->frag, hard_start,
&pkt_off, &pkt_len);
switch (act) {
case XDP_PASS:
meta_len_xdp = xdp.data - xdp.data_meta;
break;
case XDP_TX:
dma_off = pkt_off - NFP_NET_RX_BUF_HEADROOM;
@ -1784,6 +1773,8 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
if (rxd->rxd.flags & PCIE_DESC_RX_VLAN)
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
le16_to_cpu(rxd->rxd.vlan));
if (meta_len_xdp)
skb_metadata_set(skb, meta_len_xdp);
napi_gro_receive(&rx_ring->r_vec->napi, skb);
}

View file

@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
xdp.data_hard_start = page_address(bd->data);
xdp.data = xdp.data_hard_start + *data_offset;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + *len;
/* Queues always have a full reset currently, so for the time

View file

@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
xdp.data_hard_start = buf;
xdp.data = buf + pad;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);

View file

@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
xdp.data = xdp.data_hard_start + xdp_headroom;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
data = page_address(xdp_page) + offset;
xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
xdp.data = data + vi->hdr_len;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + (len - vi->hdr_len);
act = bpf_prog_run_xdp(xdp_prog, &xdp);

View file

@ -137,6 +137,7 @@ enum bpf_reg_type {
PTR_TO_MAP_VALUE, /* reg points to map element value */
PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
PTR_TO_STACK, /* reg == frame_pointer + offset */
PTR_TO_PACKET_META, /* skb->data - meta_len */
PTR_TO_PACKET, /* reg points to skb->data */
PTR_TO_PACKET_END, /* skb->data + headlen */
};

View file

@ -487,24 +487,30 @@ struct sk_filter {
struct bpf_skb_data_end {
struct qdisc_skb_cb qdisc_cb;
void *data_meta;
void *data_end;
};
struct xdp_buff {
void *data;
void *data_end;
void *data_meta;
void *data_hard_start;
};
/* compute the linear packet data range [data, data_end) which
* will be accessed by cls_bpf, act_bpf and lwt programs
/* Compute the linear packet data range [data, data_end) which
* will be accessed by various program types (cls_bpf, act_bpf,
* lwt, ...). Subsystems allowing direct data access must (!)
* ensure that cb[] area can be written to when BPF program is
* invoked (otherwise cb[] save/restore is necessary).
*/
static inline void bpf_compute_data_end(struct sk_buff *skb)
static inline void bpf_compute_data_pointers(struct sk_buff *skb)
{
struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
cb->data_end = skb->data + skb_headlen(skb);
cb->data_meta = skb->data - skb_metadata_len(skb);
cb->data_end = skb->data + skb_headlen(skb);
}
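Callers that do not own the cb[] area pair this with a save/restore of the BPF-visible cb[]; a minimal sketch of such a call site, matching what run_lwt_bpf() further below does:

bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb); /* saves/restores cb[] */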
static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@ -725,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
struct bpf_prog *prog);
void xdp_do_flush_map(void);
/* Drivers not supporting XDP metadata can use this helper, which
* rejects any room expansion for metadata as a result.
*/
static __always_inline void
xdp_set_data_meta_invalid(struct xdp_buff *xdp)
{
xdp->data_meta = xdp->data + 1;
}
static __always_inline bool
xdp_data_meta_unsupported(const struct xdp_buff *xdp)
{
return unlikely(xdp->data_meta > xdp->data);
}
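Taken together, a driver RX path supporting metadata sets up the xdp_buff as in the ixgbe and nfp hunks above, while one without support marks the field invalid; a condensed sketch with hypothetical variable names:

struct xdp_buff xdp;

xdp.data_hard_start = hard_start;
xdp.data = hard_start + headroom;
xdp.data_meta = xdp.data;        /* metadata room supported ...      */
/* xdp_set_data_meta_invalid(&xdp); ... or rejected via the helper */
xdp.data_end = xdp.data + len;
act = bpf_prog_run_xdp(prog, &xdp);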
void bpf_warn_invalid_xdp_action(u32 act);
void bpf_warn_invalid_xdp_redirect(u32 ifindex);
struct sock *do_sk_redirect_map(void);

View file

@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
* the end of the header data, ie. at skb->end.
*/
struct skb_shared_info {
unsigned short _unused;
unsigned char nr_frags;
__u8 __unused;
__u8 meta_len;
__u8 nr_frags;
__u8 tx_flags;
unsigned short gso_size;
/* Warning: this field is not always filled in (UFO)! */
@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void)
return 0;
}
static inline u8 skb_metadata_len(const struct sk_buff *skb)
{
return skb_shinfo(skb)->meta_len;
}
static inline void *skb_metadata_end(const struct sk_buff *skb)
{
return skb_mac_header(skb);
}
static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
const struct sk_buff *skb_b,
u8 meta_len)
{
const void *a = skb_metadata_end(skb_a);
const void *b = skb_metadata_end(skb_b);
/* Using a more efficient variant than a plain call to memcmp(). */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
u64 diffs = 0;
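/* meta_len is a multiple of 4 and at most 32 here (enforced by
 * bpf_xdp_adjust_meta()), so the cases below deliberately fall
 * through: e.g. meta_len == 32 XORs four 64-bit words, while
 * meta_len == 28 XORs three 64-bit words plus a 32-bit tail.
 */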
switch (meta_len) {
#define __it(x, op) (x -= sizeof(u##op))
#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
case 32: diffs |= __it_diff(a, b, 64);
case 24: diffs |= __it_diff(a, b, 64);
case 16: diffs |= __it_diff(a, b, 64);
case 8: diffs |= __it_diff(a, b, 64);
break;
case 28: diffs |= __it_diff(a, b, 64);
case 20: diffs |= __it_diff(a, b, 64);
case 12: diffs |= __it_diff(a, b, 64);
case 4: diffs |= __it_diff(a, b, 32);
break;
}
return diffs;
#else
return memcmp(a - meta_len, b - meta_len, meta_len);
#endif
}
static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
const struct sk_buff *skb_b)
{
u8 len_a = skb_metadata_len(skb_a);
u8 len_b = skb_metadata_len(skb_b);
if (!(len_a | len_b))
return false;
return len_a != len_b ?
true : __skb_metadata_differs(skb_a, skb_b, len_a);
}
static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
{
skb_shinfo(skb)->meta_len = meta_len;
}
static inline void skb_metadata_clear(struct sk_buff *skb)
{
skb_metadata_set(skb, 0);
}
struct sk_buff *skb_clone_sk(struct sk_buff *skb);
#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING

View file

@ -582,6 +582,12 @@ union bpf_attr {
* @map: pointer to sockmap to update
* @key: key to insert/update sock in map
* @flags: same flags as map update elem
*
* int bpf_xdp_adjust_meta(xdp_md, delta)
* Adjust the xdp_md.data_meta by delta
* @xdp_md: pointer to xdp_md
* @delta: A positive/negative integer to be added to xdp_md.data_meta
* Return: 0 on success or negative on error
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@ -638,6 +644,7 @@ union bpf_attr {
FN(redirect_map), \
FN(sk_redirect_map), \
FN(sock_map_update), \
FN(xdp_adjust_meta),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@ -715,7 +722,7 @@ struct __sk_buff {
__u32 data_end;
__u32 napi_id;
/* accessed by BPF_PROG_TYPE_sk_skb types */
/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
__u32 family;
__u32 remote_ip4; /* Stored in network byte order */
__u32 local_ip4; /* Stored in network byte order */
@ -723,6 +730,9 @@ struct __sk_buff {
__u32 local_ip6[4]; /* Stored in network byte order */
__u32 remote_port; /* Stored in network byte order */
__u32 local_port; /* stored in host byte order */
/* ... here. */
__u32 data_meta;
};
struct bpf_tunnel_key {
@ -783,6 +793,7 @@ enum xdp_action {
struct xdp_md {
__u32 data;
__u32 data_end;
__u32 data_meta;
};
enum sk_action {

View file

@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
skb_orphan(skb);
skb->sk = psock->sock;
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
* any socket yet.
*/
skb->sk = psock->sock;
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
rcu_read_unlock();

View file

@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
va_end(args);
}
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
return type == PTR_TO_PACKET ||
type == PTR_TO_PACKET_META;
}
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
@ -187,6 +193,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
[PTR_TO_STACK] = "fp",
[PTR_TO_PACKET] = "pkt",
[PTR_TO_PACKET_META] = "pkt_meta",
[PTR_TO_PACKET_END] = "pkt_end",
};
@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state)
verbose("(id=%d", reg->id);
if (t != SCALAR_VALUE)
verbose(",off=%d", reg->off);
if (t == PTR_TO_PACKET)
if (type_is_pkt_pointer(t))
verbose(",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
t == PTR_TO_MAP_VALUE ||
@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
__mark_reg_known_zero(regs + regno);
}
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
{
return type_is_pkt_pointer(reg->type);
}
static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
{
return reg_is_pkt_pointer(reg) ||
reg->type == PTR_TO_PACKET_END;
}
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
enum bpf_reg_type which)
{
/* The register can already have a range from prior markings.
* This is fine as long as it hasn't been advanced from its
* origin.
*/
return reg->type == which &&
reg->id == 0 &&
reg->off == 0 &&
tnum_equals_const(reg->var_off, 0);
}
/* Attempts to improve min/max values based on var_off information */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
case PTR_TO_PACKET_END:
case CONST_PTR_TO_MAP:
return true;
@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
switch (reg->type) {
case PTR_TO_PACKET:
/* special case, because of NET_IP_ALIGN */
case PTR_TO_PACKET_META:
/* Special case, because of NET_IP_ALIGN. Given metadata sits
* right in front, treat it the very same way.
*/
return check_pkt_ptr_alignment(reg, off, size, strict);
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
* PTR_TO_PACKET[_END]. In the latter case, we know
* the offset is zero.
* PTR_TO_PACKET[_META,_END]. In the latter
* case, we know the offset is zero.
*/
if (reg_type == SCALAR_VALUE)
mark_reg_unknown(state->regs, value_regno);
@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
err = check_stack_read(state, off, size, value_regno);
}
} else if (reg->type == PTR_TO_PACKET) {
} else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
switch (reg->type) {
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size);
case PTR_TO_MAP_VALUE:
return check_map_access(env, regno, reg->off, access_size);
@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
if (type == PTR_TO_PACKET &&
if (type_is_pkt_pointer(type) &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
verbose("helper access to the packet is not allowed\n");
return -EACCES;
@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
if (type != PTR_TO_PACKET && type != expected_type)
if (!type_is_pkt_pointer(type) &&
type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (register_is_null(*reg))
/* final test in check_stack_boundary() */;
else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
else if (!type_is_pkt_pointer(type) &&
type != PTR_TO_MAP_VALUE &&
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->key\n");
return -EACCES;
}
if (type == PTR_TO_PACKET)
if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
meta->map_ptr->key_size);
else
@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->value\n");
return -EACCES;
}
if (type == PTR_TO_PACKET)
if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
meta->map_ptr->value_size);
else
@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
* so turn them into unknown SCALAR_VALUE.
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
*/
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
int i;
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET ||
regs[i].type == PTR_TO_PACKET_END)
if (reg_is_pkt_pointer_any(&regs[i]))
mark_reg_unknown(regs, i);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type != PTR_TO_PACKET &&
reg->type != PTR_TO_PACKET_END)
continue;
__mark_reg_unknown(reg);
if (reg_is_pkt_pointer_any(reg))
__mark_reg_unknown(reg);
}
}
@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
if (ptr_reg->type == PTR_TO_PACKET) {
if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
dst_reg->range = 0;
@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
if (ptr_reg->type == PTR_TO_PACKET) {
if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
if (smin_val < 0)
@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
static void find_good_pkt_pointers(struct bpf_verifier_state *state,
struct bpf_reg_state *dst_reg)
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type)
{
struct bpf_reg_state *regs = state->regs, *reg;
int i;
@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
if (regs[i].type == type && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
if (reg->type == type && reg->id == dst_reg->id)
reg->range = max_t(u16, reg->range, dst_reg->off);
}
}
@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
find_good_pkt_pointers(this_branch, dst_reg);
find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
find_good_pkt_pointers(other_branch, dst_reg);
find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
PTR_TO_PACKET);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
PTR_TO_PACKET);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET_META &&
reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
dst_reg->type == PTR_TO_PACKET_META &&
reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
regs[insn->src_reg].type == PTR_TO_PACKET_META) {
find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
PTR_TO_PACKET_META);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
regs[insn->src_reg].type == PTR_TO_PACKET_META) {
find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
PTR_TO_PACKET_META);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
/* Check our ids match any regs they're supposed to */
return check_ids(rold->id, rcur->id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
if (rcur->type != PTR_TO_PACKET)
if (rcur->type != rold->type)
return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are

View file

@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
if (is_l2)
__skb_push(skb, ETH_HLEN);
if (is_direct_pkt_access)
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
retval = bpf_test_run(prog, skb, repeat, &duration);
if (!is_l2)
__skb_push(skb, ETH_HLEN);
@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
xdp.data_hard_start = data;
xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN;
xdp.data_meta = xdp.data;
xdp.data_end = xdp.data + size;
retval = bpf_test_run(prog, &xdp, repeat, &duration);

View file

@ -3864,8 +3864,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct bpf_prog *xdp_prog)
{
u32 metalen, act = XDP_DROP;
struct xdp_buff xdp;
u32 act = XDP_DROP;
void *orig_data;
int hlen, off;
u32 mac_len;
@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
if (skb_cloned(skb))
return XDP_PASS;
if (skb_linearize(skb))
goto do_drop;
/* XDP packets must be linear and must have sufficient headroom
* of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
* native XDP provides, thus we need to do it here as well.
*/
if (skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
int troom = skb->tail + skb->data_len - skb->end;
/* In case we have to go down the path and also linearize,
* then let's do the pskb_expand_head() work just once here.
*/
if (pskb_expand_head(skb,
hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
goto do_drop;
if (troom > 0 && __skb_linearize(skb))
goto do_drop;
}
/* The XDP program wants to see the packet starting at the MAC
* header.
@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
mac_len = skb->data - skb_mac_header(skb);
hlen = skb_headlen(skb) + mac_len;
xdp.data = skb->data - mac_len;
xdp.data_meta = xdp.data;
xdp.data_end = xdp.data + hlen;
xdp.data_hard_start = skb->data - skb_headroom(skb);
orig_data = xdp.data;
@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
case XDP_REDIRECT:
case XDP_TX:
__skb_push(skb, mac_len);
/* fall through */
case XDP_PASS:
break;
case XDP_PASS:
metalen = xdp.data - xdp.data_meta;
if (metalen)
skb_metadata_set(skb, metalen);
break;
default:
bpf_warn_invalid_xdp_action(act);
/* fall through */
@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= p->vlan_tci ^ skb->vlan_tci;
diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_metadata_differs(p, skb);
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
skb_mac_header(skb));

View file

@ -1402,7 +1402,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
{
int err = __bpf_try_make_writable(skb, write_len);
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
return err;
}
@ -1962,7 +1962,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
bpf_pull_mac_rcsum(skb);
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
return ret;
}
@ -1984,7 +1984,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
ret = skb_vlan_pop(skb);
bpf_pull_mac_rcsum(skb);
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
return ret;
}
@ -2178,7 +2178,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
* need to be verified first.
*/
ret = bpf_skb_proto_xlat(skb, proto);
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
return ret;
}
@ -2303,7 +2303,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
bpf_skb_net_grow(skb, len_diff_abs);
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
return ret;
}
@ -2394,7 +2394,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
skb_gso_reset(skb);
}
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
return ret;
}
@ -2434,7 +2434,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
skb_reset_mac_header(skb);
}
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
return 0;
}
@ -2447,14 +2447,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
.arg3_type = ARG_ANYTHING,
};
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
return xdp_data_meta_unsupported(xdp) ? 0 :
xdp->data - xdp->data_meta;
}
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
unsigned long metalen = xdp_get_metalen(xdp);
void *data_start = xdp->data_hard_start + metalen;
void *data = xdp->data + offset;
if (unlikely(data < xdp->data_hard_start ||
if (unlikely(data < data_start ||
data > xdp->data_end - ETH_HLEN))
return -EINVAL;
if (metalen)
memmove(xdp->data_meta + offset,
xdp->data_meta, metalen);
xdp->data_meta += offset;
xdp->data = data;
return 0;
@ -2468,6 +2480,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
void *meta = xdp->data_meta + offset;
unsigned long metalen = xdp->data - meta;
if (xdp_data_meta_unsupported(xdp))
return -ENOTSUPP;
if (unlikely(meta < xdp->data_hard_start ||
meta > xdp->data))
return -EINVAL;
if (unlikely((metalen & (sizeof(__u32) - 1)) ||
(metalen > 32)))
return -EACCES;
xdp->data_meta = meta;
return 0;
}
static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.func = bpf_xdp_adjust_meta,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
};
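A sketch of how the checks above surface to a BPF program, each call assumed to start from a fresh xdp_buff (hypothetical deltas; -EINVAL triggers instead once headroom is exhausted):

bpf_xdp_adjust_meta(ctx, -4);  /* 0: 4-byte, 4-aligned meta area      */
bpf_xdp_adjust_meta(ctx, -6);  /* -EACCES: length not a multiple of 4 */
bpf_xdp_adjust_meta(ctx, -36); /* -EACCES: length exceeds 32 bytes    */
/* -ENOTSUPP if the driver marked data_meta invalid */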
static int __bpf_tx_xdp(struct net_device *dev,
struct bpf_map *map,
struct xdp_buff *xdp,
@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_clone_redirect ||
func == bpf_l3_csum_replace ||
func == bpf_l4_csum_replace ||
func == bpf_xdp_adjust_head)
func == bpf_xdp_adjust_head ||
func == bpf_xdp_adjust_meta)
return true;
return false;
@ -3288,6 +3328,8 @@ xdp_func_proto(enum bpf_func_id func_id)
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_xdp_adjust_head:
return &bpf_xdp_adjust_head_proto;
case BPF_FUNC_xdp_adjust_meta:
return &bpf_xdp_adjust_meta_proto;
case BPF_FUNC_redirect:
return &bpf_xdp_redirect_proto;
case BPF_FUNC_redirect_map:
@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, data_end):
if (size != size_default)
return false;
@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, data_end):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
case bpf_ctx_range(struct __sk_buff, data_meta):
return false;
}
@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data):
info->reg_type = PTR_TO_PACKET;
break;
case bpf_ctx_range(struct __sk_buff, data_meta):
info->reg_type = PTR_TO_PACKET_META;
break;
case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END;
break;
@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size,
case offsetof(struct xdp_md, data):
info->reg_type = PTR_TO_PACKET;
break;
case offsetof(struct xdp_md, data_meta):
info->reg_type = PTR_TO_PACKET_META;
break;
case offsetof(struct xdp_md, data_end):
info->reg_type = PTR_TO_PACKET_END;
break;
@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta):
return false;
}
if (type == BPF_WRITE) {
switch (off) {
case bpf_ctx_range(struct __sk_buff, mark):
@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size,
}
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
return false;
case bpf_ctx_range(struct __sk_buff, data):
info->reg_type = PTR_TO_PACKET;
break;
@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
offsetof(struct sk_buff, data));
break;
case offsetof(struct __sk_buff, data_meta):
off = si->off;
off -= offsetof(struct __sk_buff, data_meta);
off += offsetof(struct sk_buff, cb);
off += offsetof(struct bpf_skb_data_end, data_meta);
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
si->src_reg, off);
break;
case offsetof(struct __sk_buff, data_end):
off = si->off;
off -= offsetof(struct __sk_buff, data_end);
@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
si->dst_reg, si->src_reg,
offsetof(struct xdp_buff, data));
break;
case offsetof(struct xdp_md, data_meta):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
si->dst_reg, si->src_reg,
offsetof(struct xdp_buff, data_meta));
break;
case offsetof(struct xdp_md, data_end):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
si->dst_reg, si->src_reg,

View file

@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
*/
preempt_disable();
rcu_read_lock();
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
rcu_read_unlock();

View file

@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
skb_metadata_clear(skb);
/* It is not generally safe to change skb->truesize.
* For the moment, we really care about the rx path, or
* when skb is orphaned (not attached to a socket).

View file

@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
filter = rcu_dereference(prog->filter);
if (at_ingress) {
__skb_push(skb, skb->mac_len);
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
filter_res = BPF_PROG_RUN(filter, skb);
__skb_pull(skb, skb->mac_len);
} else {
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
filter_res = BPF_PROG_RUN(filter, skb);
}
rcu_read_unlock();

View file

@ -99,11 +99,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
} else if (at_ingress) {
/* It is safe to push/pull even if skb_shared() */
__skb_push(skb, skb->mac_len);
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
filter_res = BPF_PROG_RUN(prog->filter, skb);
__skb_pull(skb, skb->mac_len);
} else {
bpf_compute_data_end(skb);
bpf_compute_data_pointers(skb);
filter_res = BPF_PROG_RUN(prog->filter, skb);
}

View file

@ -143,12 +143,6 @@ enum bpf_attach_type {
#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
enum bpf_sockmap_flags {
BPF_SOCKMAP_UNSPEC,
BPF_SOCKMAP_STRPARSER,
__MAX_BPF_SOCKMAP_FLAG
};
/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
* to the given target_fd cgroup the descendent cgroup will be able to
* override effective bpf program that was inherited from this cgroup
@ -368,9 +362,20 @@ union bpf_attr {
* int bpf_redirect(ifindex, flags)
* redirect to another netdev
* @ifindex: ifindex of the net device
* @flags: bit 0 - if set, redirect to ingress instead of egress
* other bits - reserved
* Return: TC_ACT_REDIRECT
* @flags:
* cls_bpf:
* bit 0 - if set, redirect to ingress instead of egress
* other bits - reserved
* xdp_bpf:
* all bits - reserved
* Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error
* xdp_bpf: XDP_REDIRECT on success or XDP_ABORTED on error
* int bpf_redirect_map(map, key, flags)
* redirect to endpoint in map
* @map: pointer to dev map
* @key: index in map to lookup
* @flags: --
* Return: XDP_REDIRECT on success or XDP_ABORTED on error
*
* u32 bpf_get_route_realm(skb)
* retrieve a dst's tclassid
@ -577,6 +582,12 @@ union bpf_attr {
* @map: pointer to sockmap to update
* @key: key to insert/update sock in map
* @flags: same flags as map update elem
*
* int bpf_xdp_adjust_meta(xdp_md, delta)
* Adjust the xdp_md.data_meta by delta
* @xdp_md: pointer to xdp_md
* @delta: A positive/negative integer to be added to xdp_md.data_meta
* Return: 0 on success or negative on error
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@ -632,7 +643,8 @@ union bpf_attr {
FN(skb_adjust_room), \
FN(redirect_map), \
FN(sk_redirect_map), \
FN(sock_map_update),
FN(sock_map_update), \
FN(xdp_adjust_meta),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@ -710,7 +722,7 @@ struct __sk_buff {
__u32 data_end;
__u32 napi_id;
/* accessed by BPF_PROG_TYPE_sk_skb types */
/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
__u32 family;
__u32 remote_ip4; /* Stored in network byte order */
__u32 local_ip4; /* Stored in network byte order */
@ -718,6 +730,9 @@ struct __sk_buff {
__u32 local_ip6[4]; /* Stored in network byte order */
__u32 remote_port; /* Stored in network byte order */
__u32 local_port; /* stored in host byte order */
/* ... here. */
__u32 data_meta;
};
struct bpf_tunnel_key {
@ -753,20 +768,23 @@ struct bpf_sock {
__u32 family;
__u32 type;
__u32 protocol;
__u32 mark;
__u32 priority;
};
#define XDP_PACKET_HEADROOM 256
/* User return codes for XDP prog type.
* A valid XDP program must return one of these defined values. All other
* return codes are reserved for future use. Unknown return codes will result
* in packet drop.
* return codes are reserved for future use. Unknown return codes will
* result in packet drops and a warning via bpf_warn_invalid_xdp_action().
*/
enum xdp_action {
XDP_ABORTED = 0,
XDP_DROP,
XDP_PASS,
XDP_TX,
XDP_REDIRECT,
};
/* user accessible metadata for XDP packet hook
@ -775,6 +793,7 @@ enum xdp_action {
struct xdp_md {
__u32 data;
__u32 data_end;
__u32 data_meta;
};
enum sk_action {

View file

@ -15,9 +15,10 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
test_align
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o sockmap_parse_prog.o sockmap_verdict_prog.o
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \
sockmap_verdict_prog.o
TEST_PROGS := test_kmod.sh test_xdp_redirect.sh
TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh
include ../lib.mk
@ -34,8 +35,20 @@ $(BPFOBJ): force
$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/
CLANG ?= clang
LLC ?= llc
PROBE := $(shell llc -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
# Let newer LLVM versions transparently probe the kernel for availability
# of full BPF instruction set.
ifeq ($(PROBE),)
CPU ?= probe
else
CPU ?= generic
endif
%.o: %.c
$(CLANG) -I. -I./include/uapi -I../../../include/uapi \
-Wno-compare-distinct-pointer-types \
-O2 -target bpf -c $< -o $@
-Wno-compare-distinct-pointer-types \
-O2 -target bpf -emit-llvm -c $< -o - | \
$(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@

View file

@ -62,6 +62,8 @@ static unsigned long long (*bpf_get_prandom_u32)(void) =
(void *) BPF_FUNC_get_prandom_u32;
static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
(void *) BPF_FUNC_xdp_adjust_head;
static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) =
(void *) BPF_FUNC_xdp_adjust_meta;
static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
int optlen) =
(void *) BPF_FUNC_setsockopt;

View file

@ -6645,6 +6645,253 @@ static struct bpf_test tests[] = {
.errstr = "BPF_END uses reserved fields",
.result = REJECT,
},
{
"meta access, test1",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test2",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 8),
BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = REJECT,
.errstr = "invalid access to packet, off=-8",
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test3",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data_end)),
BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test4",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data_end)),
BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_MOV64_REG(BPF_REG_0, BPF_REG_4),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test5",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_4, 3),
BPF_MOV64_IMM(BPF_REG_2, -8),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_xdp_adjust_meta),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = REJECT,
.errstr = "R3 !read_ok",
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test6",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test7",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test8",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF),
BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test9",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 1),
BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test10",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
offsetof(struct xdp_md, data_end)),
BPF_MOV64_IMM(BPF_REG_5, 42),
BPF_MOV64_IMM(BPF_REG_6, 24),
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_5),
BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_5, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test11",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_MOV64_IMM(BPF_REG_5, 42),
BPF_MOV64_IMM(BPF_REG_6, 24),
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_5),
BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_3, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_5, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_XDP,
},
{
"meta access, test12",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
offsetof(struct xdp_md, data_meta)),
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct xdp_md, data)),
BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
offsetof(struct xdp_md, data_end)),
BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16),
BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_4, 5),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0),
BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16),
BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_3, 1),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_XDP,
},
};
static int probe_filter_length(const struct bpf_insn *fp)

View file

@ -0,0 +1,53 @@
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"
#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
#define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem
SEC("t")
int ing_cls(struct __sk_buff *ctx)
{
__u8 *data, *data_meta, *data_end;
__u32 diff = 0;
data_meta = ctx_ptr(ctx, data_meta);
data_end = ctx_ptr(ctx, data_end);
data = ctx_ptr(ctx, data);
if (data + ETH_ALEN > data_end ||
data_meta + round_up(ETH_ALEN, 4) > data)
return TC_ACT_SHOT;
diff |= ((__u32 *)data_meta)[0] ^ ((__u32 *)data)[0];
diff |= ((__u16 *)data_meta)[2] ^ ((__u16 *)data)[2];
return diff ? TC_ACT_SHOT : TC_ACT_OK;
}
SEC("x")
int ing_xdp(struct xdp_md *ctx)
{
__u8 *data, *data_meta, *data_end;
int ret;
ret = bpf_xdp_adjust_meta(ctx, -round_up(ETH_ALEN, 4));
if (ret < 0)
return XDP_DROP;
data_meta = ctx_ptr(ctx, data_meta);
data_end = ctx_ptr(ctx, data_end);
data = ctx_ptr(ctx, data);
if (data + ETH_ALEN > data_end ||
data_meta + round_up(ETH_ALEN, 4) > data)
return XDP_DROP;
__builtin_memcpy(data_meta, data, ETH_ALEN);
return XDP_PASS;
}
char _license[] SEC("license") = "GPL";

View file

@ -0,0 +1,51 @@
#!/bin/sh
cleanup()
{
if [ "$?" = "0" ]; then
echo "selftests: test_xdp_meta [PASS]";
else
echo "selftests: test_xdp_meta [FAILED]";
fi
set +e
ip netns del ns1 2> /dev/null
ip netns del ns2 2> /dev/null
}
ip link set dev lo xdp off 2>/dev/null > /dev/null
if [ $? -ne 0 ]; then
echo "selftests: [SKIP] Could not run test without the ip xdp support"
exit 0
fi
set -e
ip netns add ns1
ip netns add ns2
trap cleanup 0 2 3 6 9
ip link add veth1 type veth peer name veth2
ip link set veth1 netns ns1
ip link set veth2 netns ns2
ip netns exec ns1 ip addr add 10.1.1.11/24 dev veth1
ip netns exec ns2 ip addr add 10.1.1.22/24 dev veth2
ip netns exec ns1 tc qdisc add dev veth1 clsact
ip netns exec ns2 tc qdisc add dev veth2 clsact
ip netns exec ns1 tc filter add dev veth1 ingress bpf da obj test_xdp_meta.o sec t
ip netns exec ns2 tc filter add dev veth2 ingress bpf da obj test_xdp_meta.o sec t
ip netns exec ns1 ip link set dev veth1 xdp obj test_xdp_meta.o sec x
ip netns exec ns2 ip link set dev veth2 xdp obj test_xdp_meta.o sec x
ip netns exec ns1 ip link set dev veth1 up
ip netns exec ns2 ip link set dev veth2 up
ip netns exec ns1 ping -c 1 10.1.1.22
ip netns exec ns2 ping -c 1 10.1.1.11
exit 0