From b69d540da7db84e836cea77fbd56a518aafa1f2f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:31 +0100 Subject: [PATCH 01/17] selftests: add xfrm policy test script add a script that adds a ipsec tunnel between two network namespaces plus following policies: .0/24 -> ipsec tunnel .240/28 -> bypass .253/32 -> ipsec tunnel Then check that .254 bypasses tunnel (match /28 exception), and .2 (match /24) and .253 (match direct policy) pass through the tunnel. Abuses iptables to check if ping did resolve an ipsec policy or not. Also adds a bunch of 'block' rules that are not supposed to match. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- tools/testing/selftests/net/Makefile | 3 +- tools/testing/selftests/net/xfrm_policy.sh | 276 +++++++++++++++++++++ 2 files changed, 278 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/xfrm_policy.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index eec359895feb..2ab3e3437a92 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -4,7 +4,8 @@ CFLAGS = -Wall -Wl,--no-as-needed -O2 -g CFLAGS += -I../../../../usr/include/ -TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh +TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \ + rtnetlink.sh xfrm_policy.sh TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh TEST_PROGS += udpgro_bench.sh udpgro.sh diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh new file mode 100755 index 000000000000..6ca63a6e50e9 --- /dev/null +++ b/tools/testing/selftests/net/xfrm_policy.sh @@ -0,0 +1,276 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Check xfrm policy resolution. Topology: +# +# 1.2 1.1 3.1 3.10 2.1 2.2 +# eth1 eth1 veth0 veth0 eth1 eth1 +# ns1 ---- ns3 ----- ns4 ---- ns2 +# +# ns3 and ns4 are connected via ipsec tunnel. +# pings from ns1 to ns2 (and vice versa) are supposed to work like this: +# ns1: ping 10.0.2.2: passes via ipsec tunnel. +# ns2: ping 10.0.1.2: passes via ipsec tunnel. + +# ns1: ping 10.0.1.253: passes via ipsec tunnel (direct policy) +# ns2: ping 10.0.2.253: passes via ipsec tunnel (direct policy) +# +# ns1: ping 10.0.2.254: does NOT pass via ipsec tunnel (exception) +# ns2: ping 10.0.1.254: does NOT pass via ipsec tunnel (exception) + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +KEY_SHA=0xdeadbeef1234567890abcdefabcdefabcdefabcd +KEY_AES=0x0123456789abcdef0123456789012345 +SPI1=0x1 +SPI2=0x2 + +do_esp() { + local ns=$1 + local me=$2 + local remote=$3 + local lnet=$4 + local rnet=$5 + local spi_out=$6 + local spi_in=$7 + + ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet + ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet + + # to encrypt packets as they go out (includes forwarded packets that need encapsulation) + ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 100 action allow + # to fwd decrypted packets after esp processing: + ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 100 action allow +} + +do_exception() { + local ns=$1 + local me=$2 + local remote=$3 + local encryptip=$4 + local plain=$5 + + # network $plain passes without tunnel + ip -net $ns xfrm policy add dst $plain dir out priority 10 action allow + + # direct policy for $encryptip, use tunnel, higher prio takes precedence + ip -net $ns xfrm policy add dst $encryptip dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow +} + +# policies that are not supposed to match any packets generated in this test. +do_dummies4() { + local ns=$1 + + for i in $(seq 10 16);do + # dummy policy with wildcard src/dst. + echo netns exec $ns ip xfrm policy add src 0.0.0.0/0 dst 10.$i.99.0/30 dir out action block + echo netns exec $ns ip xfrm policy add src 10.$i.99.0/30 dst 0.0.0.0/0 dir out action block + for j in $(seq 32 64);do + echo netns exec $ns ip xfrm policy add src 10.$i.1.0/30 dst 10.$i.$j.0/30 dir out action block + # silly, as it encompasses the one above too, but its allowed: + echo netns exec $ns ip xfrm policy add src 10.$i.1.0/29 dst 10.$i.$j.0/29 dir out action block + # and yet again, even more broad one. + echo netns exec $ns ip xfrm policy add src 10.$i.1.0/24 dst 10.$i.$j.0/24 dir out action block + echo netns exec $ns ip xfrm policy add src 10.$i.$j.0/24 dst 10.$i.1.0/24 dir fwd action block + done + done | ip -batch /dev/stdin +} + +do_dummies6() { + local ns=$1 + + for i in $(seq 10 16);do + for j in $(seq 32 64);do + echo netns exec $ns ip xfrm policy add src dead:$i::/64 dst dead:$i:$j::/64 dir out action block + echo netns exec $ns ip xfrm policy add src dead:$i:$j::/64 dst dead:$i::/24 dir fwd action block + done + done | ip -batch /dev/stdin +} + +check_ipt_policy_count() +{ + ns=$1 + + ip netns exec $ns iptables-save -c |grep policy | ( read c rest + ip netns exec $ns iptables -Z + if [ x"$c" = x'[0:0]' ]; then + exit 0 + elif [ x"$c" = x ]; then + echo "ERROR: No counters" + ret=1 + exit 111 + else + exit 1 + fi + ) +} + +check_xfrm() { + # 0: iptables -m policy rule count == 0 + # 1: iptables -m policy rule count != 0 + rval=$1 + ip=$2 + ret=0 + + ip netns exec ns1 ping -q -c 1 10.0.2.$ip > /dev/null + + check_ipt_policy_count ns3 + if [ $? -ne $rval ] ; then + ret=1 + fi + check_ipt_policy_count ns4 + if [ $? -ne $rval ] ; then + ret=1 + fi + + ip netns exec ns2 ping -q -c 1 10.0.1.$ip > /dev/null + + check_ipt_policy_count ns3 + if [ $? -ne $rval ] ; then + ret=1 + fi + check_ipt_policy_count ns4 + if [ $? -ne $rval ] ; then + ret=1 + fi + + return $ret +} + +#check for needed privileges +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +ip -Version 2>/dev/null >/dev/null +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without the ip tool" + exit $ksft_skip +fi + +# needed to check if policy lookup got valid ipsec result +iptables --version 2>/dev/null >/dev/null +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iptables tool" + exit $ksft_skip +fi + +for i in 1 2 3 4; do + ip netns add ns$i + ip -net ns$i link set lo up +done + +DEV=veth0 +ip link add $DEV netns ns1 type veth peer name eth1 netns ns3 +ip link add $DEV netns ns2 type veth peer name eth1 netns ns4 + +ip link add $DEV netns ns3 type veth peer name veth0 netns ns4 + +DEV=veth0 +for i in 1 2; do + ip -net ns$i link set $DEV up + ip -net ns$i addr add 10.0.$i.2/24 dev $DEV + ip -net ns$i addr add dead:$i::2/64 dev $DEV + + ip -net ns$i addr add 10.0.$i.253 dev $DEV + ip -net ns$i addr add 10.0.$i.254 dev $DEV + ip -net ns$i addr add dead:$i::fd dev $DEV + ip -net ns$i addr add dead:$i::fe dev $DEV +done + +for i in 3 4; do +ip -net ns$i link set eth1 up +ip -net ns$i link set veth0 up +done + +ip -net ns1 route add default via 10.0.1.1 +ip -net ns2 route add default via 10.0.2.1 + +ip -net ns3 addr add 10.0.1.1/24 dev eth1 +ip -net ns3 addr add 10.0.3.1/24 dev veth0 +ip -net ns3 addr add 2001:1::1/64 dev eth1 +ip -net ns3 addr add 2001:3::1/64 dev veth0 + +ip -net ns3 route add default via 10.0.3.10 + +ip -net ns4 addr add 10.0.2.1/24 dev eth1 +ip -net ns4 addr add 10.0.3.10/24 dev veth0 +ip -net ns4 addr add 2001:2::1/64 dev eth1 +ip -net ns4 addr add 2001:3::10/64 dev veth0 +ip -net ns4 route add default via 10.0.3.1 + +for j in 4 6; do + for i in 3 4;do + ip netns exec ns$i sysctl net.ipv$j.conf.eth1.forwarding=1 > /dev/null + ip netns exec ns$i sysctl net.ipv$j.conf.veth0.forwarding=1 > /dev/null + done +done + +# abuse iptables rule counter to check if ping matches a policy +ip netns exec ns3 iptables -p icmp -A FORWARD -m policy --dir out --pol ipsec +ip netns exec ns4 iptables -p icmp -A FORWARD -m policy --dir out --pol ipsec +if [ $? -ne 0 ];then + echo "SKIP: Could not insert iptables rule" + for i in 1 2 3 4;do ip netns del ns$i;done + exit $ksft_skip +fi + +# localip remoteip localnet remotenet +do_esp ns3 10.0.3.1 10.0.3.10 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 +do_esp ns3 dead:3::1 dead:3::10 dead:1::/64 dead:2::/64 $SPI1 $SPI2 +do_esp ns4 10.0.3.10 10.0.3.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 +do_esp ns4 dead:3::10 dead:3::1 dead:2::/64 dead:1::/64 $SPI2 $SPI1 + +do_dummies4 ns3 +do_dummies6 ns4 + +# ping to .254 should use ipsec, exception is not installed. +check_xfrm 1 254 +if [ $? -ne 0 ]; then + echo "FAIL: expected ping to .254 to use ipsec tunnel" + ret=1 +else + echo "PASS: policy before exception matches" +fi + +# installs exceptions +# localip remoteip encryptdst plaindst +do_exception ns3 10.0.3.1 10.0.3.10 10.0.2.253 10.0.2.240/28 +do_exception ns4 10.0.3.10 10.0.3.1 10.0.1.253 10.0.1.240/28 + +do_exception ns3 dead:3::1 dead:3::10 dead:2::fd dead:2:f0::/96 +do_exception ns4 dead:3::10 dead:3::1 dead:1::fd dead:1:f0::/96 + +# ping to .254 should now be excluded from the tunnel +check_xfrm 0 254 +if [ $? -ne 0 ]; then + echo "FAIL: expected ping to .254 to fail" + ret=1 +else + echo "PASS: ping to .254 bypassed ipsec tunnel" +fi + +# ping to .253 should use use ipsec due to direct policy exception. +check_xfrm 1 253 +if [ $? -ne 0 ]; then + echo "FAIL: expected ping to .253 to use ipsec tunnel" + ret=1 +else + echo "PASS: direct policy matches" +fi + +# ping to .2 should use ipsec. +check_xfrm 1 2 +if [ $? -ne 0 ]; then + echo "FAIL: expected ping to .2 to use ipsec tunnel" + ret=1 +else + echo "PASS: policy matches" +fi + +for i in 1 2 3 4;do ip netns del ns$i;done + +exit $ret From ceb159e30ad22efa0981d544e6166003515aa896 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:32 +0100 Subject: [PATCH 02/17] xfrm: security: iterate all, not inexact lists currently all non-socket policies are either hashed in the dst table, or placed on the 'inexact list'. When flushing, we first walk the table, then the (per-direction) inexact lists. When we try and get rid of the inexact lists to having "n" inexact lists (e.g. per-af inexact lists, or sorted into a tree), this walk would become more complicated. Simplify this: walk the 'all' list and skip socket policies during traversal so we don't need to handle exact and inexact policies separately anymore. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 93 ++++++++++++------------------------------ 1 file changed, 26 insertions(+), 67 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 119a427d9b2b..39d0db2a50d9 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -892,36 +892,19 @@ EXPORT_SYMBOL(xfrm_policy_byid); static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { - int dir, err = 0; + struct xfrm_policy *pol; + int err = 0; - for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { - struct xfrm_policy *pol; - int i; + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead || + xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || + pol->type != type) + continue; - hlist_for_each_entry(pol, - &net->xfrm.policy_inexact[dir], bydst) { - if (pol->type != type) - continue; - err = security_xfrm_policy_delete(pol->security); - if (err) { - xfrm_audit_policy_delete(pol, 0, task_valid); - return err; - } - } - for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { - hlist_for_each_entry(pol, - net->xfrm.policy_bydst[dir].table + i, - bydst) { - if (pol->type != type) - continue; - err = security_xfrm_policy_delete( - pol->security); - if (err) { - xfrm_audit_policy_delete(pol, 0, - task_valid); - return err; - } - } + err = security_xfrm_policy_delete(pol->security); + if (err) { + xfrm_audit_policy_delete(pol, 0, task_valid); + return err; } } return err; @@ -937,6 +920,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; + struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); @@ -944,46 +928,21 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (err) goto out; - for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { - struct xfrm_policy *pol; - int i; - - again1: - hlist_for_each_entry(pol, - &net->xfrm.policy_inexact[dir], bydst) { - if (pol->type != type) - continue; - __xfrm_policy_unlink(pol, dir); - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - cnt++; - - xfrm_audit_policy_delete(pol, 1, task_valid); - - xfrm_policy_kill(pol); - - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - goto again1; - } - - for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { - again2: - hlist_for_each_entry(pol, - net->xfrm.policy_bydst[dir].table + i, - bydst) { - if (pol->type != type) - continue; - __xfrm_policy_unlink(pol, dir); - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - cnt++; - - xfrm_audit_policy_delete(pol, 1, task_valid); - xfrm_policy_kill(pol); - - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - goto again2; - } - } +again: + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + dir = xfrm_policy_id2dir(pol->index); + if (pol->walk.dead || + dir >= XFRM_POLICY_MAX || + pol->type != type) + continue; + __xfrm_policy_unlink(pol, dir); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + cnt++; + xfrm_audit_policy_delete(pol, 1, task_valid); + xfrm_policy_kill(pol); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + goto again; } if (!cnt) err = -ESRCH; From a927d6af53eec08661628e3992d74736e848a743 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:33 +0100 Subject: [PATCH 03/17] xfrm: policy: split list insertion into a helper ... so we can reuse this later without code duplication when we add policy to a second inexact list. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 43 ++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 39d0db2a50d9..20d6815be0d7 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -740,18 +740,12 @@ static bool xfrm_policy_mark_match(struct xfrm_policy *policy, return false; } -int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) +static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, + struct xfrm_policy *policy, + bool excl) { - struct net *net = xp_net(policy); - struct xfrm_policy *pol; - struct xfrm_policy *delpol; - struct hlist_head *chain; - struct hlist_node *newpos; + struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL; - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); - delpol = NULL; - newpos = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && pol->if_id == policy->if_id && @@ -759,24 +753,41 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_policy_mark_match(policy, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { - if (excl) { - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - return -EEXIST; - } + if (excl) + return ERR_PTR(-EEXIST); delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { - newpos = &pol->bydst; + newpos = pol; continue; } if (delpol) break; } if (newpos) - hlist_add_behind_rcu(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else hlist_add_head_rcu(&policy->bydst, chain); + + return delpol; +} + +int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) +{ + struct net *net = xp_net(policy); + struct xfrm_policy *delpol; + struct hlist_head *chain; + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); + delpol = xfrm_policy_insert_list(chain, policy, excl); + + if (IS_ERR(delpol)) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return PTR_ERR(delpol); + } + __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ From cc1bb845adc9b3a005cbb67fd18c69af1c3aec94 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:34 +0100 Subject: [PATCH 04/17] xfrm: policy: return NULL when inexact search needed currently policy_hash_bysel() returns the hash bucket list (for exact policies), or the inexact list (when policy uses a prefix). Searching this inexact list is slow, so it might be better to pre-sort inexact lists into a tree or another data structure for faster searching. However, due to 'any' policies, that need to be searched in any case, doing so will require that 'inexact' policies need to be handled specially to decide the best search strategy. So change hash_bysel() and return NULL if the policy can't be handled via the policy hash table. Right now, we simply use the inexact list when this happens, but future patch can then implement a different strategy. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 20d6815be0d7..b00c265f6be3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -365,7 +365,7 @@ static struct hlist_head *policy_hash_bysel(struct net *net, hash = __sel_hash(sel, family, hmask, dbits, sbits); if (hash == hmask + 1) - return &net->xfrm.policy_inexact[dir]; + return NULL; return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; @@ -625,6 +625,8 @@ static void xfrm_hash_rebuild(struct work_struct *work) chain = policy_hash_bysel(net, &policy->selector, policy->family, xfrm_policy_id2dir(policy->index)); + if (!chain) + chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) newpos = &pol->bydst; @@ -781,7 +783,12 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); - delpol = xfrm_policy_insert_list(chain, policy, excl); + if (chain) { + delpol = xfrm_policy_insert_list(chain, policy, excl); + } else { + chain = &net->xfrm.policy_inexact[dir]; + delpol = xfrm_policy_insert_list(chain, policy, excl); + } if (IS_ERR(delpol)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -829,6 +836,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); + if (!chain) + chain = &net->xfrm.policy_inexact[dir]; ret = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && From 24969facd704a5f0dd8e08da86bf32a9ce972bee Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:35 +0100 Subject: [PATCH 05/17] xfrm: policy: store inexact policies in an rhashtable Switch packet-path lookups for inexact policies to rhashtable. In this initial version, we now no longer need to search policies with non-matching address family and type. Next patch will add the if_id as well so lookups from the xfrm interface driver only need to search inexact policies for that device. Future patches will augment the hlist in each rhash bucket with a tree and pre-sort policies according to daddr/prefix. A single rhashtable is used. In order to avoid a full rhashtable walk on netns exit, the bins get placed on a pernet list, i.e. we add almost no cost for network namespaces that had no xfrm policies. The inexact lists are kept in place, and policies are added to both the per-rhash-inexact list and a pernet one. The latter is needed for the control plane to handle migrate -- these requests do not consider the if_id, so if we'd remove the inexact_list now we would have to search all hash buckets and then figure out which matching policy candidate is the most recent one -- this appears a bit harder than just keeping the 'old' inexact list for this purpose. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 + include/net/xfrm.h | 1 + net/xfrm/xfrm_policy.c | 350 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 335 insertions(+), 18 deletions(-) diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 9991e5ef52cc..59f45b1e9dac 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ struct netns_xfrm { unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; struct xfrm_policy_hthresh policy_hthresh; + struct list_head inexact_bins; struct sock *nlsk; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0eb390c205af..870fa9b27f7e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -596,6 +596,7 @@ struct xfrm_policy { u16 family; struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; + struct hlist_node bydst_inexact_list; struct rcu_head rcu; }; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b00c265f6be3..5c7e7399323f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,22 @@ struct xfrm_flo { u8 flags; }; +struct xfrm_pol_inexact_key { + possible_net_t net; + u16 family; + u8 dir, type; +}; + +struct xfrm_pol_inexact_bin { + struct xfrm_pol_inexact_key k; + struct rhash_head head; + struct hlist_head hhead; + + /* slow path below */ + struct list_head inexact_bins; + struct rcu_head rcu; +}; + static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; @@ -55,6 +72,9 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] static struct kmem_cache *xfrm_dst_cache __ro_after_init; static __read_mostly seqcount_t xfrm_policy_hash_generation; +static struct rhashtable xfrm_policy_inexact_table; +static const struct rhashtable_params xfrm_pol_inexact_params; + static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); @@ -64,6 +84,18 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir); + +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup_rcu(struct net *net, + u8 type, u16 family, u8 dir); +static struct xfrm_policy * +xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, + bool excl); +static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, + struct xfrm_policy *policy); + static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); @@ -269,6 +301,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); + INIT_HLIST_NODE(&policy->bydst_inexact_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); @@ -563,6 +596,107 @@ static void xfrm_hash_resize(struct work_struct *work) mutex_unlock(&hash_resize_mutex); } +static void xfrm_hash_reset_inexact_table(struct net *net) +{ + struct xfrm_pol_inexact_bin *b; + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins) + INIT_HLIST_HEAD(&b->hhead); +} + +/* Make sure *pol can be inserted into fastbin. + * Useful to check that later insert requests will be sucessful + * (provided xfrm_policy_lock is held throughout). + */ +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) +{ + struct xfrm_pol_inexact_bin *bin, *prev; + struct xfrm_pol_inexact_key k = { + .family = pol->family, + .type = pol->type, + .dir = dir, + }; + struct net *net = xp_net(pol); + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + write_pnet(&k.net, net); + bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k, + xfrm_pol_inexact_params); + if (bin) + return bin; + + bin = kzalloc(sizeof(*bin), GFP_ATOMIC); + if (!bin) + return NULL; + + bin->k = k; + INIT_HLIST_HEAD(&bin->hhead); + + prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, + &bin->k, &bin->head, + xfrm_pol_inexact_params); + if (!prev) { + list_add(&bin->inexact_bins, &net->xfrm.inexact_bins); + return bin; + } + + kfree(bin); + + return IS_ERR(prev) ? NULL : prev; +} + +static void xfrm_policy_inexact_delete_bin(struct net *net, + struct xfrm_pol_inexact_bin *b) +{ + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + if (!hlist_empty(&b->hhead)) + return; + + if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, + xfrm_pol_inexact_params) == 0) { + list_del(&b->inexact_bins); + kfree_rcu(b, rcu); + } +} + +static void __xfrm_policy_inexact_flush(struct net *net) +{ + struct xfrm_pol_inexact_bin *bin; + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + list_for_each_entry(bin, &net->xfrm.inexact_bins, inexact_bins) + xfrm_policy_inexact_delete_bin(net, bin); +} + +static struct xfrm_policy * +xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) +{ + struct xfrm_pol_inexact_bin *bin; + struct xfrm_policy *delpol; + struct hlist_head *chain; + struct net *net; + + bin = xfrm_policy_inexact_alloc_bin(policy, dir); + if (!bin) + return ERR_PTR(-ENOMEM); + + delpol = xfrm_policy_insert_list(&bin->hhead, policy, excl); + if (delpol && excl) + return ERR_PTR(-EEXIST); + + net = xp_net(policy); + chain = &net->xfrm.policy_inexact[dir]; + xfrm_policy_insert_inexact_list(chain, policy); + + return delpol; +} + static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, @@ -592,7 +726,45 @@ static void xfrm_hash_rebuild(struct work_struct *work) spin_lock_bh(&net->xfrm.xfrm_policy_lock); + /* make sure that we can insert the indirect policies again before + * we start with destructive action. + */ + list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { + u8 dbits, sbits; + + dir = xfrm_policy_id2dir(policy->index); + if (policy->walk.dead || dir >= XFRM_POLICY_MAX) + continue; + + if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { + if (policy->family == AF_INET) { + dbits = rbits4; + sbits = lbits4; + } else { + dbits = rbits6; + sbits = lbits6; + } + } else { + if (policy->family == AF_INET) { + dbits = lbits4; + sbits = rbits4; + } else { + dbits = lbits6; + sbits = rbits6; + } + } + + if (policy->selector.prefixlen_d < dbits || + policy->selector.prefixlen_s < sbits) + continue; + + if (!xfrm_policy_inexact_alloc_bin(policy, dir)) + goto out_unlock; + } + /* reset the bydst and inexact table in all directions */ + xfrm_hash_reset_inexact_table(net); + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); hmask = net->xfrm.policy_bydst[dir].hmask; @@ -625,8 +797,13 @@ static void xfrm_hash_rebuild(struct work_struct *work) chain = policy_hash_bysel(net, &policy->selector, policy->family, xfrm_policy_id2dir(policy->index)); - if (!chain) - chain = &net->xfrm.policy_inexact[dir]; + if (!chain) { + void *p = xfrm_policy_inexact_insert(policy, dir, 0); + + WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p)); + continue; + } + hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) newpos = &pol->bydst; @@ -639,6 +816,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) hlist_add_head_rcu(&policy->bydst, chain); } +out_unlock: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); @@ -742,6 +920,84 @@ static bool xfrm_policy_mark_match(struct xfrm_policy *policy, return false; } +static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) +{ + const struct xfrm_pol_inexact_key *k = data; + u32 a = k->type << 24 | k->dir << 16 | k->family; + + return jhash_2words(a, net_hash_mix(read_pnet(&k->net)), seed); +} + +static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) +{ + const struct xfrm_pol_inexact_bin *b = data; + + return xfrm_pol_bin_key(&b->k, 0, seed); +} + +static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xfrm_pol_inexact_key *key = arg->key; + const struct xfrm_pol_inexact_bin *b = ptr; + int ret; + + if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net))) + return -1; + + ret = b->k.dir ^ key->dir; + if (ret) + return ret; + + ret = b->k.type ^ key->type; + if (ret) + return ret; + + ret = b->k.family ^ key->family; + if (ret) + return ret; + + return 0; +} + +static const struct rhashtable_params xfrm_pol_inexact_params = { + .head_offset = offsetof(struct xfrm_pol_inexact_bin, head), + .hashfn = xfrm_pol_bin_key, + .obj_hashfn = xfrm_pol_bin_obj, + .obj_cmpfn = xfrm_pol_bin_cmp, + .automatic_shrinking = true, +}; + +static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, + struct xfrm_policy *policy) +{ + struct xfrm_policy *pol, *delpol = NULL; + struct hlist_node *newpos = NULL; + + hlist_for_each_entry(pol, chain, bydst_inexact_list) { + if (pol->type == policy->type && + pol->if_id == policy->if_id && + !selector_cmp(&pol->selector, &policy->selector) && + xfrm_policy_mark_match(policy, pol) && + xfrm_sec_ctx_match(pol->security, policy->security) && + !WARN_ON(delpol)) { + delpol = pol; + if (policy->priority > pol->priority) + continue; + } else if (policy->priority >= pol->priority) { + newpos = &pol->bydst_inexact_list; + continue; + } + if (delpol) + break; + } + + if (newpos) + hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); + else + hlist_add_head_rcu(&policy->bydst_inexact_list, chain); +} + static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl) @@ -767,6 +1023,7 @@ static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, if (delpol) break; } + if (newpos) hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else @@ -783,12 +1040,10 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); - if (chain) { + if (chain) delpol = xfrm_policy_insert_list(chain, policy, excl); - } else { - chain = &net->xfrm.policy_inexact[dir]; - delpol = xfrm_policy_insert_list(chain, policy, excl); - } + else + delpol = xfrm_policy_inexact_insert(policy, dir, excl); if (IS_ERR(delpol)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -830,14 +1085,24 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, struct xfrm_sec_ctx *ctx, int delete, int *err) { - struct xfrm_policy *pol, *ret; + struct xfrm_pol_inexact_bin *bin = NULL; + struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); - if (!chain) - chain = &net->xfrm.policy_inexact[dir]; + if (!chain) { + bin = xfrm_policy_inexact_lookup(net, type, + sel->family, dir); + if (!bin) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return NULL; + } + + chain = &bin->hhead; + } + ret = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && @@ -854,6 +1119,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, return pol; } __xfrm_policy_unlink(pol, dir); + xfrm_policy_inexact_delete_bin(net, bin); } ret = pol; break; @@ -964,7 +1230,9 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } - if (!cnt) + if (cnt) + __xfrm_policy_inexact_flush(net); + else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -1063,21 +1331,50 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, if (match) ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, dir); - return ret; } +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir) +{ + struct xfrm_pol_inexact_key k = { + .family = family, + .type = type, + .dir = dir, + }; + + write_pnet(&k.net, net); + + return rhashtable_lookup(&xfrm_policy_inexact_table, &k, + xfrm_pol_inexact_params); +} + +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir) +{ + struct xfrm_pol_inexact_bin *bin; + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + rcu_read_lock(); + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir); + rcu_read_unlock(); + + return bin; +} + static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { - int err; - struct xfrm_policy *pol, *ret; const xfrm_address_t *daddr, *saddr; + struct xfrm_pol_inexact_bin *bin; + struct xfrm_policy *pol, *ret; struct hlist_head *chain; unsigned int sequence; u32 priority; + int err; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); @@ -1108,7 +1405,10 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, break; } } - chain = &net->xfrm.policy_inexact[dir]; + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir); + if (!bin) + goto skip_inexact; + chain = &bin->hhead; hlist_for_each_entry_rcu(pol, chain, bydst) { if ((pol->priority >= priority) && ret) break; @@ -1127,6 +1427,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } } +skip_inexact: if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) goto retry; @@ -1218,6 +1519,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); + hlist_del_init(&pol->bydst_inexact_list); hlist_del(&pol->byidx); } @@ -2795,13 +3097,17 @@ static void xfrm_statistics_fini(struct net *net) static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; - int dir; + int dir, err; - if (net_eq(net, &init_net)) + if (net_eq(net, &init_net)) { xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", sizeof(struct xfrm_dst), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + err = rhashtable_init(&xfrm_policy_inexact_table, + &xfrm_pol_inexact_params); + BUG_ON(err); + } hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); @@ -2836,6 +3142,7 @@ static int __net_init xfrm_policy_init(struct net *net) seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); + INIT_LIST_HEAD(&net->xfrm.inexact_bins); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; @@ -2854,6 +3161,7 @@ static int __net_init xfrm_policy_init(struct net *net) static void xfrm_policy_fini(struct net *net) { + struct xfrm_pol_inexact_bin *bin, *tmp; unsigned int sz; int dir; @@ -2879,6 +3187,12 @@ static void xfrm_policy_fini(struct net *net) sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); + + list_for_each_entry_safe(bin, tmp, &net->xfrm.inexact_bins, + inexact_bins) { + WARN_ON(!hlist_empty(&bin->hhead)); + xfrm_policy_inexact_delete_bin(net, bin); + } } static int __net_init xfrm_net_init(struct net *net) @@ -3044,7 +3358,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * } } chain = &net->xfrm.policy_inexact[dir]; - hlist_for_each_entry(pol, chain, bydst) { + hlist_for_each_entry(pol, chain, bydst_inexact_list) { if ((pol->priority >= priority) && ret) break; From b5fe22e2337d47cd68bb7d8e4103a628808c4d5e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:36 +0100 Subject: [PATCH 06/17] xfrm: policy: consider if_id when hashing inexact policy This avoids searches of polices that cannot match in the first place due to different interface id by placing them in different bins. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5c7e7399323f..dda27fd7b8a4 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -48,6 +48,7 @@ struct xfrm_flo { struct xfrm_pol_inexact_key { possible_net_t net; + u32 if_id; u16 family; u8 dir, type; }; @@ -85,11 +86,12 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); static struct xfrm_pol_inexact_bin * -xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir); +xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, + u32 if_id); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, - u8 type, u16 family, u8 dir); + u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_policy * xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl); @@ -618,6 +620,7 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) .family = pol->family, .type = pol->type, .dir = dir, + .if_id = pol->if_id, }; struct net *net = xp_net(pol); @@ -925,7 +928,8 @@ static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) const struct xfrm_pol_inexact_key *k = data; u32 a = k->type << 24 | k->dir << 16 | k->family; - return jhash_2words(a, net_hash_mix(read_pnet(&k->net)), seed); + return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)), + seed); } static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) @@ -957,7 +961,7 @@ static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, if (ret) return ret; - return 0; + return b->k.if_id ^ key->if_id; } static const struct rhashtable_params xfrm_pol_inexact_params = { @@ -1094,7 +1098,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, chain = policy_hash_bysel(net, sel, sel->family, dir); if (!chain) { bin = xfrm_policy_inexact_lookup(net, type, - sel->family, dir); + sel->family, dir, if_id); if (!bin) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; @@ -1335,12 +1339,14 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, } static struct xfrm_pol_inexact_bin * -xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir) +xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, + u8 dir, u32 if_id) { struct xfrm_pol_inexact_key k = { .family = family, .type = type, .dir = dir, + .if_id = if_id, }; write_pnet(&k.net, net); @@ -1350,14 +1356,15 @@ xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir) } static struct xfrm_pol_inexact_bin * -xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir) +xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, + u8 dir, u32 if_id) { struct xfrm_pol_inexact_bin *bin; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); rcu_read_lock(); - bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir); + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); rcu_read_unlock(); return bin; @@ -1405,7 +1412,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, break; } } - bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir); + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); if (!bin) goto skip_inexact; chain = &bin->hhead; From 6be3b0db6db82cf056a72cc18042048edd27f8ee Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:37 +0100 Subject: [PATCH 07/17] xfrm: policy: add inexact policy search tree infrastructure At this time inexact policies are all searched in-order until the first match is found. After removal of the flow cache, this resolution has to be performed for every packetm resulting in major slowdown when number of inexact policies is high. This adds infrastructure to later sort inexact policies into a tree. This only introduces a single class: any:any. Next patch will add a search tree to pre-sort policies that have a fixed daddr/prefixlen, so in this patch the any:any class will still be used for all policies. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + net/xfrm/xfrm_policy.c | 303 +++++++++++++++++++++++++++++++++-------- 2 files changed, 249 insertions(+), 55 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 870fa9b27f7e..9df6dca17155 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -577,6 +577,7 @@ struct xfrm_policy { /* This lock only affects elements except for entry. */ rwlock_t lock; refcount_t refcnt; + u32 pos; struct timer_list timer; atomic_t genid; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index dda27fd7b8a4..4eb12e9b40c2 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -46,6 +46,9 @@ struct xfrm_flo { u8 flags; }; +/* prefixes smaller than this are stored in lists, not trees. */ +#define INEXACT_PREFIXLEN_IPV4 16 +#define INEXACT_PREFIXLEN_IPV6 48 struct xfrm_pol_inexact_key { possible_net_t net; u32 if_id; @@ -56,6 +59,7 @@ struct xfrm_pol_inexact_key { struct xfrm_pol_inexact_bin { struct xfrm_pol_inexact_key k; struct rhash_head head; + /* list containing '*:*' policies */ struct hlist_head hhead; /* slow path below */ @@ -63,6 +67,16 @@ struct xfrm_pol_inexact_bin { struct rcu_head rcu; }; +enum xfrm_pol_inexact_candidate_type { + XFRM_POL_CAND_ANY, + + XFRM_POL_CAND_MAX, +}; + +struct xfrm_pol_inexact_candidates { + struct hlist_head *res[XFRM_POL_CAND_MAX]; +}; + static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; @@ -98,6 +112,12 @@ xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, struct xfrm_policy *policy); +static bool +xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, + struct xfrm_pol_inexact_bin *b, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr); + static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); @@ -652,13 +672,48 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) return IS_ERR(prev) ? NULL : prev; } -static void xfrm_policy_inexact_delete_bin(struct net *net, - struct xfrm_pol_inexact_bin *b) +static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr, + int family, u8 prefixlen) { - lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + if (xfrm_addr_any(addr, family)) + return true; - if (!hlist_empty(&b->hhead)) + if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6) + return true; + + if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4) + return true; + + return false; +} + +static bool +xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy) +{ + const xfrm_address_t *addr; + bool saddr_any, daddr_any; + u8 prefixlen; + + addr = &policy->selector.saddr; + prefixlen = policy->selector.prefixlen_s; + + saddr_any = xfrm_pol_inexact_addr_use_any_list(addr, + policy->family, + prefixlen); + addr = &policy->selector.daddr; + prefixlen = policy->selector.prefixlen_d; + daddr_any = xfrm_pol_inexact_addr_use_any_list(addr, + policy->family, + prefixlen); + return saddr_any && daddr_any; +} + +static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit) +{ + if (!hlist_empty(&b->hhead)) { + WARN_ON_ONCE(net_exit); return; + } if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, xfrm_pol_inexact_params) == 0) { @@ -667,14 +722,23 @@ static void xfrm_policy_inexact_delete_bin(struct net *net, } } +static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b) +{ + struct net *net = read_pnet(&b->k.net); + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + __xfrm_policy_inexact_prune_bin(b, false); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); +} + static void __xfrm_policy_inexact_flush(struct net *net) { - struct xfrm_pol_inexact_bin *bin; + struct xfrm_pol_inexact_bin *bin, *t; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); - list_for_each_entry(bin, &net->xfrm.inexact_bins, inexact_bins) - xfrm_policy_inexact_delete_bin(net, bin); + list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins) + __xfrm_policy_inexact_prune_bin(bin, false); } static struct xfrm_policy * @@ -689,14 +753,28 @@ xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) if (!bin) return ERR_PTR(-ENOMEM); - delpol = xfrm_policy_insert_list(&bin->hhead, policy, excl); - if (delpol && excl) - return ERR_PTR(-EEXIST); - net = xp_net(policy); + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + if (xfrm_policy_inexact_insert_use_any_list(policy)) { + chain = &bin->hhead; + goto insert_to_list; + } + + chain = &bin->hhead; +insert_to_list: + delpol = xfrm_policy_insert_list(chain, policy, excl); + if (delpol && excl) { + __xfrm_policy_inexact_prune_bin(bin, false); + return ERR_PTR(-EEXIST); + } + chain = &net->xfrm.policy_inexact[dir]; xfrm_policy_insert_inexact_list(chain, policy); + if (delpol) + __xfrm_policy_inexact_prune_bin(bin, false); + return delpol; } @@ -733,6 +811,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) * we start with destructive action. */ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { + struct xfrm_pol_inexact_bin *bin; u8 dbits, sbits; dir = xfrm_policy_id2dir(policy->index); @@ -761,7 +840,8 @@ static void xfrm_hash_rebuild(struct work_struct *work) policy->selector.prefixlen_s < sbits) continue; - if (!xfrm_policy_inexact_alloc_bin(policy, dir)) + bin = xfrm_policy_inexact_alloc_bin(policy, dir); + if (!bin) goto out_unlock; } @@ -820,6 +900,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) } out_unlock: + __xfrm_policy_inexact_flush(net); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); @@ -977,6 +1058,7 @@ static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, { struct xfrm_policy *pol, *delpol = NULL; struct hlist_node *newpos = NULL; + int i = 0; hlist_for_each_entry(pol, chain, bydst_inexact_list) { if (pol->type == policy->type && @@ -1000,6 +1082,11 @@ static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); else hlist_add_head_rcu(&policy->bydst_inexact_list, chain); + + hlist_for_each_entry(pol, chain, bydst_inexact_list) { + pol->pos = i; + i++; + } } static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, @@ -1083,6 +1170,29 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); +static struct xfrm_policy * +__xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id, + u8 type, int dir, + struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx) +{ + struct xfrm_policy *pol; + + if (!chain) + return NULL; + + hlist_for_each_entry(pol, chain, bydst) { + if (pol->type == type && + pol->if_id == if_id && + (mark & pol->mark.m) == pol->mark.v && + !selector_cmp(sel, &pol->selector) && + xfrm_sec_ctx_match(ctx, pol->security)) + return pol; + } + + return NULL; +} + struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, @@ -1097,6 +1207,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); if (!chain) { + struct xfrm_pol_inexact_candidates cand; + int i; + bin = xfrm_policy_inexact_lookup(net, type, sel->family, dir, if_id); if (!bin) { @@ -1104,35 +1217,46 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, return NULL; } - chain = &bin->hhead; + if (!xfrm_policy_find_inexact_candidates(&cand, bin, + &sel->saddr, + &sel->daddr)) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return NULL; + } + + pol = NULL; + for (i = 0; i < ARRAY_SIZE(cand.res); i++) { + struct xfrm_policy *tmp; + + tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark, + if_id, type, dir, + sel, ctx); + if (tmp && pol && tmp->pos < pol->pos) + pol = tmp; + } + } else { + pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir, + sel, ctx); } - ret = NULL; - hlist_for_each_entry(pol, chain, bydst) { - if (pol->type == type && - pol->if_id == if_id && - (mark & pol->mark.m) == pol->mark.v && - !selector_cmp(sel, &pol->selector) && - xfrm_sec_ctx_match(ctx, pol->security)) { - xfrm_pol_hold(pol); - if (delete) { - *err = security_xfrm_policy_delete( - pol->security); - if (*err) { - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - return pol; - } - __xfrm_policy_unlink(pol, dir); - xfrm_policy_inexact_delete_bin(net, bin); + if (pol) { + xfrm_pol_hold(pol); + if (delete) { + *err = security_xfrm_policy_delete(pol->security); + if (*err) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return pol; } - ret = pol; - break; + __xfrm_policy_unlink(pol, dir); } + ret = pol; } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); + if (bin && delete) + xfrm_policy_inexact_prune_bin(bin); return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); @@ -1338,6 +1462,20 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, return ret; } +static bool +xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, + struct xfrm_pol_inexact_bin *b, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) +{ + if (!b) + return false; + + memset(cand, 0, sizeof(*cand)); + cand->res[XFRM_POL_CAND_ANY] = &b->hhead; + return true; +} + static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) @@ -1370,11 +1508,76 @@ xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, return bin; } +static struct xfrm_policy * +__xfrm_policy_eval_candidates(struct hlist_head *chain, + struct xfrm_policy *prefer, + const struct flowi *fl, + u8 type, u16 family, int dir, u32 if_id) +{ + u32 priority = prefer ? prefer->priority : ~0u; + struct xfrm_policy *pol; + + if (!chain) + return NULL; + + hlist_for_each_entry_rcu(pol, chain, bydst) { + int err; + + if (pol->priority > priority) + break; + + err = xfrm_policy_match(pol, fl, type, family, dir, if_id); + if (err) { + if (err != -ESRCH) + return ERR_PTR(err); + + continue; + } + + if (prefer) { + /* matches. Is it older than *prefer? */ + if (pol->priority == priority && + prefer->pos < pol->pos) + return prefer; + } + + return pol; + } + + return NULL; +} + +static struct xfrm_policy * +xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, + struct xfrm_policy *prefer, + const struct flowi *fl, + u8 type, u16 family, int dir, u32 if_id) +{ + struct xfrm_policy *tmp; + int i; + + for (i = 0; i < ARRAY_SIZE(cand->res); i++) { + tmp = __xfrm_policy_eval_candidates(cand->res[i], + prefer, + fl, type, family, dir, + if_id); + if (!tmp) + continue; + + if (IS_ERR(tmp)) + return tmp; + prefer = tmp; + } + + return prefer; +} + static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { + struct xfrm_pol_inexact_candidates cand; const xfrm_address_t *daddr, *saddr; struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *pol, *ret; @@ -1413,25 +1616,16 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } } bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); - if (!bin) + if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, + daddr)) goto skip_inexact; - chain = &bin->hhead; - hlist_for_each_entry_rcu(pol, chain, bydst) { - if ((pol->priority >= priority) && ret) - break; - err = xfrm_policy_match(pol, fl, type, family, dir, if_id); - if (err) { - if (err == -ESRCH) - continue; - else { - ret = ERR_PTR(err); - goto fail; - } - } else { - ret = pol; - break; - } + pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, + family, dir, if_id); + if (pol) { + ret = pol; + if (IS_ERR(pol)) + goto fail; } skip_inexact: @@ -3168,7 +3362,7 @@ static int __net_init xfrm_policy_init(struct net *net) static void xfrm_policy_fini(struct net *net) { - struct xfrm_pol_inexact_bin *bin, *tmp; + struct xfrm_pol_inexact_bin *b, *t; unsigned int sz; int dir; @@ -3195,11 +3389,10 @@ static void xfrm_policy_fini(struct net *net) WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); - list_for_each_entry_safe(bin, tmp, &net->xfrm.inexact_bins, - inexact_bins) { - WARN_ON(!hlist_empty(&bin->hhead)); - xfrm_policy_inexact_delete_bin(net, bin); - } + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins) + __xfrm_policy_inexact_prune_bin(b, true); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static int __net_init xfrm_net_init(struct net *net) From 9cf545ebd591da673bb6b6c88150212ad83567a9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:38 +0100 Subject: [PATCH 08/17] xfrm: policy: store inexact policies in a tree ordered by destination address This adds inexact lists per destination network, stored in a search tree. Inexact lookups now return two 'candidate lists', the 'any' policies ('any' destionations), and a list of policies that share same daddr/prefix. Next patch will add a second search tree for 'saddr:any' policies so we can avoid placing those on the 'any:any' list too. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + net/xfrm/xfrm_policy.c | 333 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 328 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9df6dca17155..fa4b3c877fcf 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -590,6 +590,7 @@ struct xfrm_policy { struct xfrm_lifetime_cur curlft; struct xfrm_policy_walk_entry walk; struct xfrm_policy_queue polq; + bool bydst_reinsert; u8 type; u8 action; u8 flags; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4eb12e9b40c2..81447d5d02e6 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -49,6 +49,38 @@ struct xfrm_flo { /* prefixes smaller than this are stored in lists, not trees. */ #define INEXACT_PREFIXLEN_IPV4 16 #define INEXACT_PREFIXLEN_IPV6 48 + +struct xfrm_pol_inexact_node { + struct rb_node node; + union { + xfrm_address_t addr; + struct rcu_head rcu; + }; + u8 prefixlen; + + /* the policies matching this node, can be empty list */ + struct hlist_head hhead; +}; + +/* xfrm inexact policy search tree: + * xfrm_pol_inexact_bin = hash(dir,type,family,if_id); + * | + * +---- root_d: sorted by daddr:prefix + * | | + * | xfrm_pol_inexact_node + * | | + * | +- coarse policies and all any:daddr policies + * | + * +---- coarse policies and all any:any policies + * + * Lookups return two candidate lists: + * 1. any:any list from top-level xfrm_pol_inexact_bin + * 2. any:daddr list from daddr tree + * + * This result set then needs to be searched for the policy with + * the lowest priority. If two results have same prio, youngest one wins. + */ + struct xfrm_pol_inexact_key { possible_net_t net; u32 if_id; @@ -62,12 +94,17 @@ struct xfrm_pol_inexact_bin { /* list containing '*:*' policies */ struct hlist_head hhead; + seqcount_t count; + /* tree sorted by daddr/prefix */ + struct rb_root root_d; + /* slow path below */ struct list_head inexact_bins; struct rcu_head rcu; }; enum xfrm_pol_inexact_candidate_type { + XFRM_POL_CAND_DADDR, XFRM_POL_CAND_ANY, XFRM_POL_CAND_MAX, @@ -658,6 +695,8 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) bin->k = k; INIT_HLIST_HEAD(&bin->hhead); + bin->root_d = RB_ROOT; + seqcount_init(&bin->count); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, &bin->k, &bin->head, @@ -708,9 +747,211 @@ xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy) return saddr_any && daddr_any; } +static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node, + const xfrm_address_t *addr, u8 prefixlen) +{ + node->addr = *addr; + node->prefixlen = prefixlen; +} + +static struct xfrm_pol_inexact_node * +xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen) +{ + struct xfrm_pol_inexact_node *node; + + node = kzalloc(sizeof(*node), GFP_ATOMIC); + if (node) + xfrm_pol_inexact_node_init(node, addr, prefixlen); + + return node; +} + +static int xfrm_policy_addr_delta(const xfrm_address_t *a, + const xfrm_address_t *b, + u8 prefixlen, u16 family) +{ + unsigned int pdw, pbi; + int delta = 0; + + switch (family) { + case AF_INET: + if (sizeof(long) == 4 && prefixlen == 0) + return ntohl(a->a4) - ntohl(b->a4); + return (ntohl(a->a4) & ((~0UL << (32 - prefixlen)))) - + (ntohl(b->a4) & ((~0UL << (32 - prefixlen)))); + case AF_INET6: + pdw = prefixlen >> 5; + pbi = prefixlen & 0x1f; + + if (pdw) { + delta = memcmp(a->a6, b->a6, pdw << 2); + if (delta) + return delta; + } + if (pbi) { + u32 mask = ~0u << (32 - pbi); + + delta = (ntohl(a->a6[pdw]) & mask) - + (ntohl(b->a6[pdw]) & mask); + } + break; + default: + break; + } + + return delta; +} + +static void xfrm_policy_inexact_list_reinsert(struct net *net, + struct xfrm_pol_inexact_node *n, + u16 family) +{ + struct hlist_node *newpos = NULL; + struct xfrm_policy *policy, *p; + + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + if (!policy->bydst_reinsert) + continue; + + WARN_ON_ONCE(policy->family != family); + + policy->bydst_reinsert = false; + hlist_for_each_entry(p, &n->hhead, bydst) { + if (policy->priority >= p->priority) + newpos = &p->bydst; + else + break; + } + + if (newpos) + hlist_add_behind(&policy->bydst, newpos); + else + hlist_add_head(&policy->bydst, &n->hhead); + } +} + +/* merge nodes v and n */ +static void xfrm_policy_inexact_node_merge(struct net *net, + struct xfrm_pol_inexact_node *v, + struct xfrm_pol_inexact_node *n, + u16 family) +{ + struct xfrm_policy *tmp; + + hlist_for_each_entry(tmp, &v->hhead, bydst) + tmp->bydst_reinsert = true; + hlist_for_each_entry(tmp, &n->hhead, bydst) + tmp->bydst_reinsert = true; + + INIT_HLIST_HEAD(&n->hhead); + xfrm_policy_inexact_list_reinsert(net, n, family); +} + +static struct xfrm_pol_inexact_node * +xfrm_policy_inexact_insert_node(struct net *net, + struct rb_root *root, + xfrm_address_t *addr, + u16 family, u8 prefixlen, u8 dir) +{ + struct xfrm_pol_inexact_node *cached = NULL; + struct rb_node **p, *parent = NULL; + struct xfrm_pol_inexact_node *node; + + p = &root->rb_node; + while (*p) { + int delta; + + parent = *p; + node = rb_entry(*p, struct xfrm_pol_inexact_node, node); + + delta = xfrm_policy_addr_delta(addr, &node->addr, + node->prefixlen, + family); + if (delta == 0 && prefixlen >= node->prefixlen) { + WARN_ON_ONCE(cached); /* ipsec policies got lost */ + return node; + } + + if (delta < 0) + p = &parent->rb_left; + else + p = &parent->rb_right; + + if (prefixlen < node->prefixlen) { + delta = xfrm_policy_addr_delta(addr, &node->addr, + prefixlen, + family); + if (delta) + continue; + + /* This node is a subnet of the new prefix. It needs + * to be removed and re-inserted with the smaller + * prefix and all nodes that are now also covered + * by the reduced prefixlen. + */ + rb_erase(&node->node, root); + + if (!cached) { + xfrm_pol_inexact_node_init(node, addr, + prefixlen); + cached = node; + } else { + /* This node also falls within the new + * prefixlen. Merge the to-be-reinserted + * node and this one. + */ + xfrm_policy_inexact_node_merge(net, node, + cached, family); + kfree_rcu(node, rcu); + } + + /* restart */ + p = &root->rb_node; + parent = NULL; + } + } + + node = cached; + if (!node) { + node = xfrm_pol_inexact_node_alloc(addr, prefixlen); + if (!node) + return NULL; + } + + rb_link_node_rcu(&node->node, parent, p); + rb_insert_color(&node->node, root); + + return node; +} + +static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm) +{ + struct xfrm_pol_inexact_node *node; + struct rb_node *rn = rb_first(r); + + while (rn) { + node = rb_entry(rn, struct xfrm_pol_inexact_node, node); + + rn = rb_next(rn); + + if (!hlist_empty(&node->hhead)) { + WARN_ON_ONCE(rm); + continue; + } + + rb_erase(&node->node, r); + kfree_rcu(node, rcu); + } +} + static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit) { - if (!hlist_empty(&b->hhead)) { + write_seqcount_begin(&b->count); + xfrm_policy_inexact_gc_tree(&b->root_d, net_exit); + write_seqcount_end(&b->count); + + if (!RB_EMPTY_ROOT(&b->root_d) || + !hlist_empty(&b->hhead)) { WARN_ON_ONCE(net_exit); return; } @@ -741,6 +982,37 @@ static void __xfrm_policy_inexact_flush(struct net *net) __xfrm_policy_inexact_prune_bin(bin, false); } +static struct hlist_head * +xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, + struct xfrm_policy *policy, u8 dir) +{ + struct xfrm_pol_inexact_node *n; + struct net *net; + + net = xp_net(policy); + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + if (xfrm_policy_inexact_insert_use_any_list(policy)) + return &bin->hhead; + + if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, + policy->family, + policy->selector.prefixlen_d)) + return &bin->hhead; + + /* daddr is fixed */ + write_seqcount_begin(&bin->count); + n = xfrm_policy_inexact_insert_node(net, + &bin->root_d, + &policy->selector.daddr, + policy->family, + policy->selector.prefixlen_d, dir); + write_seqcount_end(&bin->count); + if (!n) + return NULL; + return &n->hhead; +} + static struct xfrm_policy * xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) { @@ -756,13 +1028,12 @@ xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) net = xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); - if (xfrm_policy_inexact_insert_use_any_list(policy)) { - chain = &bin->hhead; - goto insert_to_list; + chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir); + if (!chain) { + __xfrm_policy_inexact_prune_bin(bin, false); + return ERR_PTR(-ENOMEM); } - chain = &bin->hhead; -insert_to_list: delpol = xfrm_policy_insert_list(chain, policy, excl); if (delpol && excl) { __xfrm_policy_inexact_prune_bin(bin, false); @@ -843,6 +1114,9 @@ static void xfrm_hash_rebuild(struct work_struct *work) bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) goto out_unlock; + + if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir)) + goto out_unlock; } /* reset the bydst and inexact table in all directions */ @@ -1462,17 +1736,64 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, return ret; } +static struct xfrm_pol_inexact_node * +xfrm_policy_lookup_inexact_addr(const struct rb_root *r, + seqcount_t *count, + const xfrm_address_t *addr, u16 family) +{ + const struct rb_node *parent; + int seq; + +again: + seq = read_seqcount_begin(count); + + parent = rcu_dereference_raw(r->rb_node); + while (parent) { + struct xfrm_pol_inexact_node *node; + int delta; + + node = rb_entry(parent, struct xfrm_pol_inexact_node, node); + + delta = xfrm_policy_addr_delta(addr, &node->addr, + node->prefixlen, family); + if (delta < 0) { + parent = rcu_dereference_raw(parent->rb_left); + continue; + } else if (delta > 0) { + parent = rcu_dereference_raw(parent->rb_right); + continue; + } + + return node; + } + + if (read_seqcount_retry(count, seq)) + goto again; + + return NULL; +} + static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { + struct xfrm_pol_inexact_node *n; + u16 family; + if (!b) return false; + family = b->k.family; memset(cand, 0, sizeof(*cand)); cand->res[XFRM_POL_CAND_ANY] = &b->hhead; + + n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr, + family); + if (n) + cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; + return true; } From e901cbc29316fb06423de5dfbc5afb78d4efda53 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:39 +0100 Subject: [PATCH 09/17] xfrm: policy: check reinserted policies match their node validate the re-inserted policies match the lookup node. Policies that fail this test won't be returned in the candidate set. This is enabled by default for now, it should not cause noticeable reinsert slow down. Such reinserts are needed when we have to merge an existing node (e.g. for 10.0.0.0/28 because a overlapping subnet was added (e.g. 10.0.0.0/24), so whenever this happens existing policies have to be placed on the list of the new node. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 81447d5d02e6..57e28dcd7c53 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -806,10 +806,16 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, u16 family) { + unsigned int matched_s, matched_d; struct hlist_node *newpos = NULL; struct xfrm_policy *policy, *p; + matched_s = 0; + matched_d = 0; + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + bool matches_s, matches_d; + if (!policy->bydst_reinsert) continue; @@ -827,6 +833,32 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net, hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, &n->hhead); + + /* paranoia checks follow. + * Check that the reinserted policy matches at least + * saddr or daddr for current node prefix. + * + * Matching both is fine, matching saddr in one policy + * (but not daddr) and then matching only daddr in another + * is a bug. + */ + matches_s = xfrm_policy_addr_delta(&policy->selector.saddr, + &n->addr, + n->prefixlen, + family) == 0; + matches_d = xfrm_policy_addr_delta(&policy->selector.daddr, + &n->addr, + n->prefixlen, + family) == 0; + if (matches_s && matches_d) + continue; + + WARN_ON_ONCE(!matches_s && !matches_d); + if (matches_s) + matched_s++; + if (matches_d) + matched_d++; + WARN_ON_ONCE(matched_s && matched_d); } } From 64a09a7bfedee6ab97273d653dfac28e82635b61 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:40 +0100 Subject: [PATCH 10/17] xfrm: policy: store inexact policies in a tree ordered by source address This adds the 'saddr:any' search class. It contains all policies that have a fixed saddr/prefixlen, but 'any' destination. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 46 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 57e28dcd7c53..38e33326c856 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -71,11 +71,20 @@ struct xfrm_pol_inexact_node { * | | * | +- coarse policies and all any:daddr policies * | + * +---- root_s: sorted by saddr:prefix + * | | + * | xfrm_pol_inexact_node + * | | + * | + root: unused + * | | + * | + hhead: saddr:any policies + * | * +---- coarse policies and all any:any policies * - * Lookups return two candidate lists: + * Lookups return three candidate lists: * 1. any:any list from top-level xfrm_pol_inexact_bin * 2. any:daddr list from daddr tree + * 2. saddr:any list from saddr tree * * This result set then needs to be searched for the policy with * the lowest priority. If two results have same prio, youngest one wins. @@ -98,12 +107,16 @@ struct xfrm_pol_inexact_bin { /* tree sorted by daddr/prefix */ struct rb_root root_d; + /* tree sorted by saddr/prefix */ + struct rb_root root_s; + /* slow path below */ struct list_head inexact_bins; struct rcu_head rcu; }; enum xfrm_pol_inexact_candidate_type { + XFRM_POL_CAND_SADDR, XFRM_POL_CAND_DADDR, XFRM_POL_CAND_ANY, @@ -696,6 +709,7 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) bin->k = k; INIT_HLIST_HEAD(&bin->hhead); bin->root_d = RB_ROOT; + bin->root_s = RB_ROOT; seqcount_init(&bin->count); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, @@ -980,9 +994,10 @@ static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool { write_seqcount_begin(&b->count); xfrm_policy_inexact_gc_tree(&b->root_d, net_exit); + xfrm_policy_inexact_gc_tree(&b->root_s, net_exit); write_seqcount_end(&b->count); - if (!RB_EMPTY_ROOT(&b->root_d) || + if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) || !hlist_empty(&b->hhead)) { WARN_ON_ONCE(net_exit); return; @@ -1027,10 +1042,28 @@ xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, if (xfrm_policy_inexact_insert_use_any_list(policy)) return &bin->hhead; + /* saddr is wildcard */ + if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s)) + return &bin->hhead; + if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, policy->family, - policy->selector.prefixlen_d)) - return &bin->hhead; + policy->selector.prefixlen_d)) { + write_seqcount_begin(&bin->count); + n = xfrm_policy_inexact_insert_node(net, + &bin->root_s, + &policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s, + dir); + write_seqcount_end(&bin->count); + if (!n) + return NULL; + + return &n->hhead; + } /* daddr is fixed */ write_seqcount_begin(&bin->count); @@ -1826,6 +1859,11 @@ xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, if (n) cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; + n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr, + family); + if (n) + cand->res[XFRM_POL_CAND_SADDR] = &n->hhead; + return true; } From 6ac098b2a9d3088781f1c2b7138cf38e817a3da7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Nov 2018 23:00:41 +0100 Subject: [PATCH 11/17] xfrm: policy: add 2nd-level saddr trees for inexact policies This adds the fourth and final search class, containing policies where both saddr and daddr have prefix lengths (i.e., not wildcards). Inexact policies now end up in one of the following four search classes: 1. "Any:Any" list, containing policies where both saddr and daddr are wildcards or have very coarse prefixes, e.g. 10.0.0.0/8 and the like. 2. "saddr:any" list, containing policies with a fixed saddr/prefixlen, but without destination restrictions. These lists are stored in rbtree nodes; each node contains those policies matching saddr/prefixlen. 3. "Any:daddr" list. Similar to 2), except for policies where only the destinations are specified. 4. "saddr:daddr" lists, containing only those policies that match the given source/destination network. The root of the saddr/daddr nodes gets stored in the nodes of the 'daddr' tree. This diagram illustrates the list classes, and their placement in the lookup hierarchy: xfrm_pol_inexact_bin = hash(dir,type,family,if_id); | +---- root_d: sorted by daddr:prefix | | | xfrm_pol_inexact_node | | | +- root: sorted by saddr/prefix | | | | | xfrm_pol_inexact_node | | | | | + root: unused | | | | | + hhead: saddr:daddr policies | | | +- coarse policies and all any:daddr policies | +---- root_s: sorted by saddr:prefix | | | xfrm_pol_inexact_node | | | + root: unused | | | + hhead: saddr:any policies | +---- coarse policies and all any:any policies lookup for an inexact policy returns pointers to the four relevant list classes, after which each of the lists needs to be searched for the policy with the higher priority. This will only speed up lookups in case we have many policies and a sizeable portion of these have disjunct saddr/daddr addresses. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 118 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 108 insertions(+), 10 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 38e33326c856..bd80b8a4322f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -58,6 +58,8 @@ struct xfrm_pol_inexact_node { }; u8 prefixlen; + struct rb_root root; + /* the policies matching this node, can be empty list */ struct hlist_head hhead; }; @@ -69,6 +71,14 @@ struct xfrm_pol_inexact_node { * | | * | xfrm_pol_inexact_node * | | + * | +- root: sorted by saddr/prefix + * | | | + * | | xfrm_pol_inexact_node + * | | | + * | | + root: unused + * | | | + * | | + hhead: saddr:daddr policies + * | | * | +- coarse policies and all any:daddr policies * | * +---- root_s: sorted by saddr:prefix @@ -81,10 +91,11 @@ struct xfrm_pol_inexact_node { * | * +---- coarse policies and all any:any policies * - * Lookups return three candidate lists: + * Lookups return four candidate lists: * 1. any:any list from top-level xfrm_pol_inexact_bin * 2. any:daddr list from daddr tree - * 2. saddr:any list from saddr tree + * 3. saddr:daddr list from 2nd level daddr tree + * 4. saddr:any list from saddr tree * * This result set then needs to be searched for the policy with * the lowest priority. If two results have same prio, youngest one wins. @@ -116,6 +127,7 @@ struct xfrm_pol_inexact_bin { }; enum xfrm_pol_inexact_candidate_type { + XFRM_POL_CAND_BOTH, XFRM_POL_CAND_SADDR, XFRM_POL_CAND_DADDR, XFRM_POL_CAND_ANY, @@ -876,13 +888,82 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net, } } +static void xfrm_policy_inexact_node_reinsert(struct net *net, + struct xfrm_pol_inexact_node *n, + struct rb_root *new, + u16 family) +{ + struct rb_node **p, *parent = NULL; + struct xfrm_pol_inexact_node *node; + + /* we should not have another subtree here */ + WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); + + p = &new->rb_node; + while (*p) { + u8 prefixlen; + int delta; + + parent = *p; + node = rb_entry(*p, struct xfrm_pol_inexact_node, node); + + prefixlen = min(node->prefixlen, n->prefixlen); + + delta = xfrm_policy_addr_delta(&n->addr, &node->addr, + prefixlen, family); + if (delta < 0) { + p = &parent->rb_left; + } else if (delta > 0) { + p = &parent->rb_right; + } else { + struct xfrm_policy *tmp; + + hlist_for_each_entry(tmp, &node->hhead, bydst) + tmp->bydst_reinsert = true; + hlist_for_each_entry(tmp, &n->hhead, bydst) + tmp->bydst_reinsert = true; + + INIT_HLIST_HEAD(&node->hhead); + xfrm_policy_inexact_list_reinsert(net, node, family); + + if (node->prefixlen == n->prefixlen) { + kfree_rcu(n, rcu); + return; + } + + rb_erase(*p, new); + kfree_rcu(n, rcu); + n = node; + n->prefixlen = prefixlen; + *p = new->rb_node; + parent = NULL; + } + } + + rb_link_node_rcu(&n->node, parent, p); + rb_insert_color(&n->node, new); +} + /* merge nodes v and n */ static void xfrm_policy_inexact_node_merge(struct net *net, struct xfrm_pol_inexact_node *v, struct xfrm_pol_inexact_node *n, u16 family) { + struct xfrm_pol_inexact_node *node; struct xfrm_policy *tmp; + struct rb_node *rnode; + + /* To-be-merged node v has a subtree. + * + * Dismantle it and insert its nodes to n->root. + */ + while ((rnode = rb_first(&v->root)) != NULL) { + node = rb_entry(rnode, struct xfrm_pol_inexact_node, node); + rb_erase(&node->node, &v->root); + xfrm_policy_inexact_node_reinsert(net, node, &n->root, + family); + } hlist_for_each_entry(tmp, &v->hhead, bydst) tmp->bydst_reinsert = true; @@ -978,9 +1059,10 @@ static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm) while (rn) { node = rb_entry(rn, struct xfrm_pol_inexact_node, node); + xfrm_policy_inexact_gc_tree(&node->root, rm); rn = rb_next(rn); - if (!hlist_empty(&node->hhead)) { + if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) { WARN_ON_ONCE(rm); continue; } @@ -1042,12 +1124,6 @@ xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, if (xfrm_policy_inexact_insert_use_any_list(policy)) return &bin->hhead; - /* saddr is wildcard */ - if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, - policy->family, - policy->selector.prefixlen_s)) - return &bin->hhead; - if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, policy->family, policy->selector.prefixlen_d)) { @@ -1075,6 +1151,23 @@ xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, write_seqcount_end(&bin->count); if (!n) return NULL; + + /* saddr is wildcard */ + if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s)) + return &n->hhead; + + write_seqcount_begin(&bin->count); + n = xfrm_policy_inexact_insert_node(net, + &n->root, + &policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s, dir); + write_seqcount_end(&bin->count); + if (!n) + return NULL; + return &n->hhead; } @@ -1856,8 +1949,13 @@ xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr, family); - if (n) + if (n) { cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; + n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr, + family); + if (n) + cand->res[XFRM_POL_CAND_BOTH] = &n->hhead; + } n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr, family); From 4fc735d934ef8afb2f12d19293e3418d14ba6ba2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 13 Nov 2018 14:23:28 +0000 Subject: [PATCH 12/17] af_key: fix indentation on declaration statement There is an indentation issue before the declaration of xfrm_ctx. Remove spaces and replace with a tab. Signed-off-by: Colin Ian King Signed-off-by: Steffen Klassert --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index 9d61266526e7..655c787f9d54 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2020,7 +2020,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp) { - struct xfrm_sec_ctx *xfrm_ctx = xp->security; + struct xfrm_sec_ctx *xfrm_ctx = xp->security; if (xfrm_ctx) { int len = sizeof(struct sadb_x_sec_ctx); From 7759d6a837edf3e2cc627d3bd7167c6fec637d64 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 13 Nov 2018 14:28:42 +0000 Subject: [PATCH 13/17] xfrm: policy: add missing indentation There is a missing indentation before the goto statement. Add it. Signed-off-by: Colin Ian King Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index bd80b8a4322f..e8b8e722da62 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2782,7 +2782,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) pq->timeout = pq->timeout << 1; if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout)) xfrm_pol_hold(pol); - goto out; + goto out; } dst_release(dst); From 39aa6928d462d0f4fd809ff8109f98f24843b28b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 15 Nov 2018 02:51:57 +0100 Subject: [PATCH 14/17] xfrm: policy: fix netlink/pf_key policy lookups Colin Ian King says: Static analysis with CoverityScan found a potential issue [..] It seems that pointer pol is set to NULL and then a check to see if it is non-null is used to set pol to tmp; howeverm this check is always going to be false because pol is always NULL. Fix this and update test script to catch this. Updated script only: ./xfrm_policy.sh ; echo $? RTNETLINK answers: No such file or directory FAIL: ip -net ns3 xfrm policy get src 10.0.1.0/24 dst 10.0.2.0/24 dir out RTNETLINK answers: No such file or directory [..] PASS: policy before exception matches PASS: ping to .254 bypassed ipsec tunnel PASS: direct policy matches PASS: policy matches 1 Fixes: 6be3b0db6db ("xfrm: policy: add inexact policy search tree infrastructure") Reported-by: Colin Ian King Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 5 ++- tools/testing/selftests/net/xfrm_policy.sh | 38 ++++++++++++++++++---- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e8b8e722da62..ddc3335dd552 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1663,7 +1663,10 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark, if_id, type, dir, sel, ctx); - if (tmp && pol && tmp->pos < pol->pos) + if (!tmp) + continue; + + if (!pol || tmp->pos < pol->pos) pol = tmp; } } else { diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh index 6ca63a6e50e9..8db35b99457c 100755 --- a/tools/testing/selftests/net/xfrm_policy.sh +++ b/tools/testing/selftests/net/xfrm_policy.sh @@ -21,6 +21,7 @@ # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 ret=0 +policy_checks_ok=1 KEY_SHA=0xdeadbeef1234567890abcdefabcdefabcdefabcd KEY_AES=0x0123456789abcdef0123456789012345 @@ -45,6 +46,26 @@ do_esp() { ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 100 action allow } +do_esp_policy_get_check() { + local ns=$1 + local lnet=$2 + local rnet=$3 + + ip -net $ns xfrm policy get src $lnet dst $rnet dir out > /dev/null + if [ $? -ne 0 ] && [ $policy_checks_ok -eq 1 ] ;then + policy_checks_ok=0 + echo "FAIL: ip -net $ns xfrm policy get src $lnet dst $rnet dir out" + ret=1 + fi + + ip -net $ns xfrm policy get src $rnet dst $lnet dir fwd > /dev/null + if [ $? -ne 0 ] && [ $policy_checks_ok -eq 1 ] ;then + policy_checks_ok=0 + echo "FAIL: ip -net $ns xfrm policy get src $rnet dst $lnet dir fwd" + ret=1 + fi +} + do_exception() { local ns=$1 local me=$2 @@ -112,31 +133,31 @@ check_xfrm() { # 1: iptables -m policy rule count != 0 rval=$1 ip=$2 - ret=0 + lret=0 ip netns exec ns1 ping -q -c 1 10.0.2.$ip > /dev/null check_ipt_policy_count ns3 if [ $? -ne $rval ] ; then - ret=1 + lret=1 fi check_ipt_policy_count ns4 if [ $? -ne $rval ] ; then - ret=1 + lret=1 fi ip netns exec ns2 ping -q -c 1 10.0.1.$ip > /dev/null check_ipt_policy_count ns3 if [ $? -ne $rval ] ; then - ret=1 + lret=1 fi check_ipt_policy_count ns4 if [ $? -ne $rval ] ; then - ret=1 + lret=1 fi - return $ret + return $lret } #check for needed privileges @@ -227,6 +248,11 @@ do_esp ns4 dead:3::10 dead:3::1 dead:2::/64 dead:1::/64 $SPI2 $SPI1 do_dummies4 ns3 do_dummies6 ns4 +do_esp_policy_get_check ns3 10.0.1.0/24 10.0.2.0/24 +do_esp_policy_get_check ns4 10.0.2.0/24 10.0.1.0/24 +do_esp_policy_get_check ns3 dead:1::/64 dead:2::/64 +do_esp_policy_get_check ns4 dead:2::/64 dead:1::/64 + # ping to .254 should use ipsec, exception is not installed. check_xfrm 1 254 if [ $? -ne 0 ]; then From 3fcccec008cf095ac61018cfce0e4bb76edddb5b Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 12 Nov 2018 17:28:22 +0800 Subject: [PATCH 15/17] xfrm6: remove BUG_ON from xfrm6_dst_ifdown if loopback_idev is NULL pointer, and the following access of loopback_idev will trigger panic, which is same as BUG_ON Signed-off-by: Li RongQing Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index d35bcf92969c..769f8f78d3b8 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -262,7 +262,6 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = in6_dev_get(dev_net(dev)->loopback_dev); - BUG_ON(!loopback_idev); do { in6_dev_put(xdst->u.rt6.rt6i_idev); From 88584c30e31967db2ad03c7015a9aea3460deb2c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Nov 2018 13:28:54 +0100 Subject: [PATCH 16/17] xfrm: policy: fix policy hash rebuild Dan Carpenter reports following static checker warning: net/xfrm/xfrm_policy.c:1316 xfrm_hash_rebuild() warn: 'dir' is out of bounds '3' vs '2' | 1280 /* reset the bydst and inexact table in all directions */ | 1281 xfrm_hash_reset_inexact_table(net); | 1282 | 1283 for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { | ^^^^^^^^^^^^^^^^^^^^^ |dir == XFRM_POLICY_MAX at the end of this loop. | 1304 /* re-insert all policies by order of creation */ | 1305 list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { [..] | 1314 xfrm_policy_id2dir(policy->index)); | 1315 if (!chain) { | 1316 void *p = xfrm_policy_inexact_insert(policy, dir, 0); Fix this by updating 'dir' based on current policy. Otherwise, the inexact policies won't be found anymore during lookup, as they get hashed to a bogus bin. Reported-by: Dan Carpenter Fixes: cc1bb845adc9 ("xfrm: policy: return NULL when inexact search needed") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ddc3335dd552..be04091eb7db 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1303,15 +1303,16 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { - if (policy->walk.dead || - xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { + if (policy->walk.dead) + continue; + dir = xfrm_policy_id2dir(policy->index); + if (dir >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, - policy->family, - xfrm_policy_id2dir(policy->index)); + policy->family, dir); if (!chain) { void *p = xfrm_policy_inexact_insert(policy, dir, 0); From 77990464bb39eb0f5cd41e4f9e3d6411f2883cac Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 6 Dec 2018 17:52:28 +0000 Subject: [PATCH 17/17] xfrm: clean an indentation issue, remove a space Trivial fix to clean up indentation issue, remove an extraneous space. Signed-off-by: Colin Ian King Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fa4b3c877fcf..0a8d70d16918 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1970,7 +1970,7 @@ static inline void xfrm_dev_state_delete(struct xfrm_state *x) static inline void xfrm_dev_state_free(struct xfrm_state *x) { struct xfrm_state_offload *xso = &x->xso; - struct net_device *dev = xso->dev; + struct net_device *dev = xso->dev; if (dev && dev->xfrmdev_ops) { if (dev->xfrmdev_ops->xdo_dev_state_free)