diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9b624566b82d..475d6f28ca67 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -421,8 +421,7 @@ struct nft_set { unsigned char *udata; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:13, - bound:1, + u16 flags:14, genmask:2; u8 klen; u8 dlen; @@ -1348,12 +1347,15 @@ struct nft_trans_rule { struct nft_trans_set { struct nft_set *set; u32 set_id; + bool bound; }; #define nft_trans_set(trans) \ (((struct nft_trans_set *)trans->data)->set) #define nft_trans_set_id(trans) \ (((struct nft_trans_set *)trans->data)->set_id) +#define nft_trans_set_bound(trans) \ + (((struct nft_trans_set *)trans->data)->bound) struct nft_trans_chain { bool update; @@ -1384,12 +1386,15 @@ struct nft_trans_table { struct nft_trans_elem { struct nft_set *set; struct nft_set_elem elem; + bool bound; }; #define nft_trans_elem_set(trans) \ (((struct nft_trans_elem *)trans->data)->set) #define nft_trans_elem(trans) \ (((struct nft_trans_elem *)trans->data)->elem) +#define nft_trans_elem_set_bound(trans) \ + (((struct nft_trans_elem *)trans->data)->bound) struct nft_trans_obj { struct nft_object *obj; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a542761e90d1..81a8ef42b88d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -453,13 +453,12 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); * table location, we assume id gets exposed to userspace. * * Following nf_conn items do not change throughout lifetime - * of the nf_conn after it has been committed to main hash table: + * of the nf_conn: * * 1. nf_conn address - * 2. nf_conn->ext address - * 3. nf_conn->master address (normally NULL) - * 4. tuple - * 5. the associated net namespace + * 2. nf_conn->master address (normally NULL) + * 3. the associated net namespace + * 4. the original direction tuple */ u32 nf_ct_get_id(const struct nf_conn *ct) { @@ -469,9 +468,10 @@ u32 nf_ct_get_id(const struct nf_conn *ct) net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); a = (unsigned long)ct; - b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct)); - c = (unsigned long)ct->ext; - d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash), + b = (unsigned long)ct->master; + c = (unsigned long)nf_ct_net(ct); + d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), &ct_id_seed); #ifdef CONFIG_64BIT return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index e3d797252a98..80a8f9ae4c93 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -111,15 +111,16 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) -static void flow_offload_fixup_ct_state(struct nf_conn *ct) +static inline __s32 nf_flow_timeout_delta(unsigned int timeout) +{ + return (__s32)(timeout - (u32)jiffies); +} + +static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; + int l4num = nf_ct_protonum(ct); unsigned int timeout; - int l4num; - - l4num = nf_ct_protonum(ct); - if (l4num == IPPROTO_TCP) - flow_offload_fixup_tcp(&ct->proto.tcp); l4proto = nf_ct_l4proto_find(l4num); if (!l4proto) @@ -132,7 +133,20 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct) else return; - ct->timeout = nfct_time_stamp + timeout; + if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) + ct->timeout = nfct_time_stamp + timeout; +} + +static void flow_offload_fixup_ct_state(struct nf_conn *ct) +{ + if (nf_ct_protonum(ct) == IPPROTO_TCP) + flow_offload_fixup_tcp(&ct->proto.tcp); +} + +static void flow_offload_fixup_ct(struct nf_conn *ct) +{ + flow_offload_fixup_ct_state(ct); + flow_offload_fixup_ct_timeout(ct); } void flow_offload_free(struct flow_offload *flow) @@ -208,6 +222,11 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) } EXPORT_SYMBOL_GPL(flow_offload_add); +static inline bool nf_flow_has_expired(const struct flow_offload *flow) +{ + return nf_flow_timeout_delta(flow->timeout) <= 0; +} + static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { @@ -223,6 +242,11 @@ static void flow_offload_del(struct nf_flowtable *flow_table, e = container_of(flow, struct flow_offload_entry, flow); clear_bit(IPS_OFFLOAD_BIT, &e->ct->status); + if (nf_flow_has_expired(flow)) + flow_offload_fixup_ct(e->ct); + else if (flow->flags & FLOW_OFFLOAD_TEARDOWN) + flow_offload_fixup_ct_timeout(e->ct); + flow_offload_free(flow); } @@ -298,11 +322,6 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } -static inline bool nf_flow_has_expired(const struct flow_offload *flow) -{ - return (__s32)(flow->timeout - (u32)jiffies) <= 0; -} - static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index cdfc33517e85..d68c801dd614 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -214,6 +214,25 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } +static int nf_flow_offload_dst_check(struct dst_entry *dst) +{ + if (unlikely(dst_xfrm(dst))) + return dst_check(dst, 0) ? 0 : -1; + + return 0; +} + +static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, + const struct nf_hook_state *state, + struct dst_entry *dst) +{ + skb_orphan(skb); + skb_dst_set_noref(skb, dst); + skb->tstamp = 0; + dst_output(state->net, state->sk, skb); + return NF_STOLEN; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -254,6 +273,11 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) return NF_ACCEPT; + if (nf_flow_offload_dst_check(&rt->dst)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } + if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0) return NF_DROP; @@ -261,6 +285,13 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, iph = ip_hdr(skb); ip_decrease_ttl(iph); + if (unlikely(dst_xfrm(&rt->dst))) { + memset(skb->cb, 0, sizeof(struct inet_skb_parm)); + IPCB(skb)->iif = skb->dev->ifindex; + IPCB(skb)->flags = IPSKB_FORWARDED; + return nf_flow_xmit_xfrm(skb, state, &rt->dst); + } + skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); skb_dst_set_noref(skb, &rt->dst); @@ -467,6 +498,11 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, sizeof(*ip6h))) return NF_ACCEPT; + if (nf_flow_offload_dst_check(&rt->dst)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } + if (skb_try_make_writable(skb, sizeof(*ip6h))) return NF_DROP; @@ -477,6 +513,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, ip6h = ipv6_hdr(skb); ip6h->hop_limit--; + if (unlikely(dst_xfrm(&rt->dst))) { + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + IP6CB(skb)->iif = skb->dev->ifindex; + IP6CB(skb)->flags = IP6SKB_FORWARDED; + return nf_flow_xmit_xfrm(skb, state, &rt->dst); + } + skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); skb_dst_set_noref(skb, &rt->dst); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 605a7cfe7ca7..88abbddf8967 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -138,9 +138,14 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) return; list_for_each_entry_reverse(trans, &net->nft.commit_list, list) { - if (trans->msg_type == NFT_MSG_NEWSET && - nft_trans_set(trans) == set) { - set->bound = true; + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (nft_trans_set(trans) == set) + nft_trans_set_bound(trans) = true; + break; + case NFT_MSG_NEWSETELEM: + if (nft_trans_elem_set(trans) == set) + nft_trans_elem_set_bound(trans) = true; break; } } @@ -6906,7 +6911,7 @@ static int __nf_tables_abort(struct net *net) break; case NFT_MSG_NEWSET: trans->ctx.table->use--; - if (nft_trans_set(trans)->bound) { + if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); break; } @@ -6918,7 +6923,7 @@ static int __nf_tables_abort(struct net *net) nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: - if (nft_trans_elem_set(trans)->bound) { + if (nft_trans_elem_set_bound(trans)) { nft_trans_destroy(trans); break; } diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index aa5f571d4361..060a4ed46d5e 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -72,11 +72,11 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; + struct tcphdr _tcph, *tcph = NULL; enum ip_conntrack_info ctinfo; struct nf_flow_route route; struct flow_offload *flow; enum ip_conntrack_dir dir; - bool is_tcp = false; struct nf_conn *ct; int ret; @@ -89,7 +89,10 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: - is_tcp = true; + tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, + sizeof(_tcph), &_tcph); + if (unlikely(!tcph || tcph->fin || tcph->rst)) + goto out; break; case IPPROTO_UDP: break; @@ -115,7 +118,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (!flow) goto err_flow_alloc; - if (is_tcp) { + if (tcph) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index fe52488a6f72..16571ac1dab4 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -321,4 +321,52 @@ else ip netns exec nsr1 nft list ruleset fi +KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1) +KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1) +SPI1=$RANDOM +SPI2=$RANDOM + +if [ $SPI1 -eq $SPI2 ]; then + SPI2=$((SPI2+1)) +fi + +do_esp() { + local ns=$1 + local me=$2 + local remote=$3 + local lnet=$4 + local rnet=$5 + local spi_out=$6 + local spi_in=$7 + + ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet + ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet + + # to encrypt packets as they go out (includes forwarded packets that need encapsulation) + ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow + # to fwd decrypted packets after esp processing: + ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow + +} + +do_esp nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 + +do_esp nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 + +ip netns exec nsr1 nft delete table ip nat + +# restore default routes +ip -net ns2 route del 192.168.10.1 via 10.0.2.1 +ip -net ns2 route add default via 10.0.2.1 +ip -net ns2 route add default via dead:2::1 + +test_tcp_forwarding ns1 ns2 +if [ $? -eq 0 ] ;then + echo "PASS: ipsec tunnel mode for ns1/ns2" +else + echo "FAIL: ipsec tunnel mode for ns1/ns2" + ip netns exec nsr1 nft list ruleset 1>&2 + ip netns exec nsr1 cat /proc/net/xfrm_stat 1>&2 +fi + exit $ret