2018-03-27 09:53:07 +00:00
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <net/net_namespace.h>
|
|
|
|
#include <net/netfilter/nf_tables.h>
|
|
|
|
#include <linux/netfilter_ipv4.h>
|
|
|
|
#include <linux/netfilter_ipv6.h>
|
|
|
|
#include <linux/netfilter_bridge.h>
|
|
|
|
#include <linux/netfilter_arp.h>
|
|
|
|
#include <net/netfilter/nf_tables_ipv4.h>
|
|
|
|
#include <net/netfilter/nf_tables_ipv6.h>
|
|
|
|
|
|
|
|
#ifdef CONFIG_NF_TABLES_IPV4
|
|
|
|
static unsigned int nft_do_chain_ipv4(void *priv,
|
|
|
|
struct sk_buff *skb,
|
|
|
|
const struct nf_hook_state *state)
|
|
|
|
{
|
|
|
|
struct nft_pktinfo pkt;
|
|
|
|
|
|
|
|
nft_set_pktinfo(&pkt, skb, state);
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_ipv4(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
|
|
|
|
return nft_do_chain(&pkt, priv);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct nft_chain_type nft_chain_filter_ipv4 = {
|
|
|
|
.name = "filter",
|
|
|
|
.type = NFT_CHAIN_T_DEFAULT,
|
|
|
|
.family = NFPROTO_IPV4,
|
|
|
|
.hook_mask = (1 << NF_INET_LOCAL_IN) |
|
|
|
|
(1 << NF_INET_LOCAL_OUT) |
|
|
|
|
(1 << NF_INET_FORWARD) |
|
|
|
|
(1 << NF_INET_PRE_ROUTING) |
|
|
|
|
(1 << NF_INET_POST_ROUTING),
|
|
|
|
.hooks = {
|
|
|
|
[NF_INET_LOCAL_IN] = nft_do_chain_ipv4,
|
|
|
|
[NF_INET_LOCAL_OUT] = nft_do_chain_ipv4,
|
|
|
|
[NF_INET_FORWARD] = nft_do_chain_ipv4,
|
|
|
|
[NF_INET_PRE_ROUTING] = nft_do_chain_ipv4,
|
|
|
|
[NF_INET_POST_ROUTING] = nft_do_chain_ipv4,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static void nft_chain_filter_ipv4_init(void)
|
|
|
|
{
|
|
|
|
nft_register_chain_type(&nft_chain_filter_ipv4);
|
|
|
|
}
|
|
|
|
static void nft_chain_filter_ipv4_fini(void)
|
|
|
|
{
|
|
|
|
nft_unregister_chain_type(&nft_chain_filter_ipv4);
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_IPV4 is not defined. */
static inline void nft_chain_filter_ipv4_init(void)
{
}
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_IPV4 is not defined. */
static inline void nft_chain_filter_ipv4_fini(void)
{
}
|
|
|
|
#endif /* CONFIG_NF_TABLES_IPV4 */
|
|
|
|
|
|
|
|
#ifdef CONFIG_NF_TABLES_ARP
|
|
|
|
static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb,
|
|
|
|
const struct nf_hook_state *state)
|
|
|
|
{
|
|
|
|
struct nft_pktinfo pkt;
|
|
|
|
|
|
|
|
nft_set_pktinfo(&pkt, skb, state);
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_unspec(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
|
|
|
|
return nft_do_chain(&pkt, priv);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct nft_chain_type nft_chain_filter_arp = {
|
|
|
|
.name = "filter",
|
|
|
|
.type = NFT_CHAIN_T_DEFAULT,
|
|
|
|
.family = NFPROTO_ARP,
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.hook_mask = (1 << NF_ARP_IN) |
|
|
|
|
(1 << NF_ARP_OUT),
|
|
|
|
.hooks = {
|
|
|
|
[NF_ARP_IN] = nft_do_chain_arp,
|
|
|
|
[NF_ARP_OUT] = nft_do_chain_arp,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static void nft_chain_filter_arp_init(void)
|
|
|
|
{
|
|
|
|
nft_register_chain_type(&nft_chain_filter_arp);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nft_chain_filter_arp_fini(void)
|
|
|
|
{
|
|
|
|
nft_unregister_chain_type(&nft_chain_filter_arp);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_ARP is not defined. */
static inline void nft_chain_filter_arp_init(void)
{
}
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_ARP is not defined. */
static inline void nft_chain_filter_arp_fini(void)
{
}
|
|
|
|
#endif /* CONFIG_NF_TABLES_ARP */
|
|
|
|
|
|
|
|
#ifdef CONFIG_NF_TABLES_IPV6
|
|
|
|
static unsigned int nft_do_chain_ipv6(void *priv,
|
|
|
|
struct sk_buff *skb,
|
|
|
|
const struct nf_hook_state *state)
|
|
|
|
{
|
|
|
|
struct nft_pktinfo pkt;
|
|
|
|
|
|
|
|
nft_set_pktinfo(&pkt, skb, state);
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_ipv6(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
|
|
|
|
return nft_do_chain(&pkt, priv);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct nft_chain_type nft_chain_filter_ipv6 = {
|
|
|
|
.name = "filter",
|
|
|
|
.type = NFT_CHAIN_T_DEFAULT,
|
|
|
|
.family = NFPROTO_IPV6,
|
|
|
|
.hook_mask = (1 << NF_INET_LOCAL_IN) |
|
|
|
|
(1 << NF_INET_LOCAL_OUT) |
|
|
|
|
(1 << NF_INET_FORWARD) |
|
|
|
|
(1 << NF_INET_PRE_ROUTING) |
|
|
|
|
(1 << NF_INET_POST_ROUTING),
|
|
|
|
.hooks = {
|
|
|
|
[NF_INET_LOCAL_IN] = nft_do_chain_ipv6,
|
|
|
|
[NF_INET_LOCAL_OUT] = nft_do_chain_ipv6,
|
|
|
|
[NF_INET_FORWARD] = nft_do_chain_ipv6,
|
|
|
|
[NF_INET_PRE_ROUTING] = nft_do_chain_ipv6,
|
|
|
|
[NF_INET_POST_ROUTING] = nft_do_chain_ipv6,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static void nft_chain_filter_ipv6_init(void)
|
|
|
|
{
|
|
|
|
nft_register_chain_type(&nft_chain_filter_ipv6);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nft_chain_filter_ipv6_fini(void)
|
|
|
|
{
|
|
|
|
nft_unregister_chain_type(&nft_chain_filter_ipv6);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_IPV6 is not defined. */
static inline void nft_chain_filter_ipv6_init(void)
{
}
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_IPV6 is not defined. */
static inline void nft_chain_filter_ipv6_fini(void)
{
}
|
|
|
|
#endif /* CONFIG_NF_TABLES_IPV6 */
|
|
|
|
|
|
|
|
#ifdef CONFIG_NF_TABLES_INET
|
|
|
|
static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
|
|
|
|
const struct nf_hook_state *state)
|
|
|
|
{
|
|
|
|
struct nft_pktinfo pkt;
|
|
|
|
|
|
|
|
nft_set_pktinfo(&pkt, skb, state);
|
|
|
|
|
|
|
|
switch (state->pf) {
|
|
|
|
case NFPROTO_IPV4:
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_ipv4(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
break;
|
|
|
|
case NFPROTO_IPV6:
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_ipv6(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nft_do_chain(&pkt, priv);
|
|
|
|
}
|
|
|
|
|
2020-10-07 23:14:48 +00:00
|
|
|
static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb,
|
|
|
|
const struct nf_hook_state *state)
|
|
|
|
{
|
|
|
|
struct nf_hook_state ingress_state = *state;
|
|
|
|
struct nft_pktinfo pkt;
|
|
|
|
|
|
|
|
switch (skb->protocol) {
|
|
|
|
case htons(ETH_P_IP):
|
|
|
|
/* Original hook is NFPROTO_NETDEV and NF_NETDEV_INGRESS. */
|
|
|
|
ingress_state.pf = NFPROTO_IPV4;
|
|
|
|
ingress_state.hook = NF_INET_INGRESS;
|
|
|
|
nft_set_pktinfo(&pkt, skb, &ingress_state);
|
|
|
|
|
2021-05-28 10:30:07 +00:00
|
|
|
if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0)
|
2020-10-07 23:14:48 +00:00
|
|
|
return NF_DROP;
|
|
|
|
break;
|
|
|
|
case htons(ETH_P_IPV6):
|
|
|
|
ingress_state.pf = NFPROTO_IPV6;
|
|
|
|
ingress_state.hook = NF_INET_INGRESS;
|
|
|
|
nft_set_pktinfo(&pkt, skb, &ingress_state);
|
|
|
|
|
2021-05-28 10:30:07 +00:00
|
|
|
if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0)
|
2020-10-07 23:14:48 +00:00
|
|
|
return NF_DROP;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return NF_ACCEPT;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nft_do_chain(&pkt, priv);
|
|
|
|
}
|
|
|
|
|
2018-03-27 09:53:07 +00:00
|
|
|
static const struct nft_chain_type nft_chain_filter_inet = {
|
|
|
|
.name = "filter",
|
|
|
|
.type = NFT_CHAIN_T_DEFAULT,
|
|
|
|
.family = NFPROTO_INET,
|
2020-10-07 23:14:48 +00:00
|
|
|
.hook_mask = (1 << NF_INET_INGRESS) |
|
|
|
|
(1 << NF_INET_LOCAL_IN) |
|
2018-03-27 09:53:07 +00:00
|
|
|
(1 << NF_INET_LOCAL_OUT) |
|
|
|
|
(1 << NF_INET_FORWARD) |
|
|
|
|
(1 << NF_INET_PRE_ROUTING) |
|
|
|
|
(1 << NF_INET_POST_ROUTING),
|
|
|
|
.hooks = {
|
2020-10-07 23:14:48 +00:00
|
|
|
[NF_INET_INGRESS] = nft_do_chain_inet_ingress,
|
2018-03-27 09:53:07 +00:00
|
|
|
[NF_INET_LOCAL_IN] = nft_do_chain_inet,
|
|
|
|
[NF_INET_LOCAL_OUT] = nft_do_chain_inet,
|
|
|
|
[NF_INET_FORWARD] = nft_do_chain_inet,
|
|
|
|
[NF_INET_PRE_ROUTING] = nft_do_chain_inet,
|
|
|
|
[NF_INET_POST_ROUTING] = nft_do_chain_inet,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static void nft_chain_filter_inet_init(void)
|
|
|
|
{
|
|
|
|
nft_register_chain_type(&nft_chain_filter_inet);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nft_chain_filter_inet_fini(void)
|
|
|
|
{
|
|
|
|
nft_unregister_chain_type(&nft_chain_filter_inet);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_INET is not defined. */
static inline void nft_chain_filter_inet_init(void)
{
}
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_INET is not defined. */
static inline void nft_chain_filter_inet_fini(void)
{
}
|
|
|
|
#endif /* CONFIG_NF_TABLES_INET */
|
|
|
|
|
2019-07-10 08:08:20 +00:00
|
|
|
#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
|
2018-03-27 09:53:07 +00:00
|
|
|
static unsigned int
|
|
|
|
nft_do_chain_bridge(void *priv,
|
|
|
|
struct sk_buff *skb,
|
|
|
|
const struct nf_hook_state *state)
|
|
|
|
{
|
|
|
|
struct nft_pktinfo pkt;
|
|
|
|
|
|
|
|
nft_set_pktinfo(&pkt, skb, state);
|
|
|
|
|
|
|
|
switch (eth_hdr(skb)->h_proto) {
|
|
|
|
case htons(ETH_P_IP):
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_ipv4_validate(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
break;
|
|
|
|
case htons(ETH_P_IPV6):
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_ipv6_validate(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
break;
|
|
|
|
default:
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_unspec(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nft_do_chain(&pkt, priv);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct nft_chain_type nft_chain_filter_bridge = {
|
|
|
|
.name = "filter",
|
|
|
|
.type = NFT_CHAIN_T_DEFAULT,
|
|
|
|
.family = NFPROTO_BRIDGE,
|
|
|
|
.hook_mask = (1 << NF_BR_PRE_ROUTING) |
|
|
|
|
(1 << NF_BR_LOCAL_IN) |
|
|
|
|
(1 << NF_BR_FORWARD) |
|
|
|
|
(1 << NF_BR_LOCAL_OUT) |
|
|
|
|
(1 << NF_BR_POST_ROUTING),
|
|
|
|
.hooks = {
|
|
|
|
[NF_BR_PRE_ROUTING] = nft_do_chain_bridge,
|
|
|
|
[NF_BR_LOCAL_IN] = nft_do_chain_bridge,
|
|
|
|
[NF_BR_FORWARD] = nft_do_chain_bridge,
|
|
|
|
[NF_BR_LOCAL_OUT] = nft_do_chain_bridge,
|
|
|
|
[NF_BR_POST_ROUTING] = nft_do_chain_bridge,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static void nft_chain_filter_bridge_init(void)
|
|
|
|
{
|
|
|
|
nft_register_chain_type(&nft_chain_filter_bridge);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nft_chain_filter_bridge_fini(void)
|
|
|
|
{
|
|
|
|
nft_unregister_chain_type(&nft_chain_filter_bridge);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_BRIDGE is not enabled. */
static inline void nft_chain_filter_bridge_init(void)
{
}
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_BRIDGE is not enabled. */
static inline void nft_chain_filter_bridge_fini(void)
{
}
|
|
|
|
#endif /* CONFIG_NF_TABLES_BRIDGE */
|
|
|
|
|
|
|
|
#ifdef CONFIG_NF_TABLES_NETDEV
|
|
|
|
static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb,
|
|
|
|
const struct nf_hook_state *state)
|
|
|
|
{
|
|
|
|
struct nft_pktinfo pkt;
|
|
|
|
|
|
|
|
nft_set_pktinfo(&pkt, skb, state);
|
|
|
|
|
|
|
|
switch (skb->protocol) {
|
|
|
|
case htons(ETH_P_IP):
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_ipv4_validate(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
break;
|
|
|
|
case htons(ETH_P_IPV6):
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_ipv6_validate(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
break;
|
|
|
|
default:
|
2021-05-28 10:30:07 +00:00
|
|
|
nft_set_pktinfo_unspec(&pkt);
|
2018-03-27 09:53:07 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nft_do_chain(&pkt, priv);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct nft_chain_type nft_chain_filter_netdev = {
|
|
|
|
.name = "filter",
|
|
|
|
.type = NFT_CHAIN_T_DEFAULT,
|
|
|
|
.family = NFPROTO_NETDEV,
|
netfilter: Introduce egress hook
Support classifying packets with netfilter on egress to satisfy user
requirements such as:
* outbound security policies for containers (Laura)
* filtering and mangling intra-node Direct Server Return (DSR) traffic
on a load balancer (Laura)
* filtering locally generated traffic coming in through AF_PACKET,
such as local ARP traffic generated for clustering purposes or DHCP
(Laura; the AF_PACKET plumbing is contained in a follow-up commit)
* L2 filtering from ingress and egress for AVB (Audio Video Bridging)
and gPTP with nftables (Pablo)
* in the future: in-kernel NAT64/NAT46 (Pablo)
The egress hook introduced herein complements the ingress hook added by
commit e687ad60af09 ("netfilter: add netfilter ingress hook after
handle_ing() under unique static key"). A patch for nftables to hook up
egress rules from user space has been submitted separately, so users may
immediately take advantage of the feature.
Alternatively or in addition to netfilter, packets can be classified
with traffic control (tc). On ingress, packets are classified first by
tc, then by netfilter. On egress, the order is reversed for symmetry.
Conceptually, tc and netfilter can be thought of as layers, with
netfilter layered above tc.
Traffic control is capable of redirecting packets to another interface
(man 8 tc-mirred). E.g., an ingress packet may be redirected from the
host namespace to a container via a veth connection:
tc ingress (host) -> tc egress (veth host) -> tc ingress (veth container)
In this case, netfilter egress classifying is not performed when leaving
the host namespace! That's because the packet is still on the tc layer.
If tc redirects the packet to a physical interface in the host namespace
such that it leaves the system, the packet is never subjected to
netfilter egress classifying. That is only logical since it hasn't
passed through netfilter ingress classifying either.
Packets can alternatively be redirected at the netfilter layer using
nft fwd. Such a packet *is* subjected to netfilter egress classifying
since it has reached the netfilter layer.
Internally, the skb->nf_skip_egress flag controls whether netfilter is
invoked on egress by __dev_queue_xmit(). Because __dev_queue_xmit() may
be called recursively by tunnel drivers such as vxlan, the flag is
reverted to false after sch_handle_egress(). This ensures that
netfilter is applied both on the overlay and underlying network.
Interaction between tc and netfilter is possible by setting and querying
skb->mark.
If netfilter egress classifying is not enabled on any interface, it is
patched out of the data path by way of a static_key and doesn't make a
performance difference that is discernible from noise:
Before: 1537 1538 1538 1537 1538 1537 Mb/sec
After: 1536 1534 1539 1539 1539 1540 Mb/sec
Before + tc accept: 1418 1418 1418 1419 1419 1418 Mb/sec
After + tc accept: 1419 1424 1418 1419 1422 1420 Mb/sec
Before + tc drop: 1620 1619 1619 1619 1620 1620 Mb/sec
After + tc drop: 1616 1624 1625 1624 1622 1619 Mb/sec
When netfilter egress classifying is enabled on at least one interface,
a minimal performance penalty is incurred for every egress packet, even
if the interface it's transmitted over doesn't have any netfilter egress
rules configured. That is caused by checking dev->nf_hooks_egress
against NULL.
Measurements were performed on a Core i7-3615QM. Commands to reproduce:
ip link add dev foo type dummy
ip link set dev foo up
modprobe pktgen
echo "add_device foo" > /proc/net/pktgen/kpktgend_3
samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i foo -n 400000000 -m "11:11:11:11:11:11" -d 1.1.1.1
Accept all traffic with tc:
tc qdisc add dev foo clsact
tc filter add dev foo egress bpf da bytecode '1,6 0 0 0,'
Drop all traffic with tc:
tc qdisc add dev foo clsact
tc filter add dev foo egress bpf da bytecode '1,6 0 0 2,'
Apply this patch when measuring packet drops to avoid errors in dmesg:
https://lore.kernel.org/netdev/a73dda33-57f4-95d8-ea51-ed483abd6a7a@iogearbox.net/
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Cc: Laura García Liébana <nevola@gmail.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2021-10-08 20:06:03 +00:00
|
|
|
.hook_mask = (1 << NF_NETDEV_INGRESS) |
|
|
|
|
(1 << NF_NETDEV_EGRESS),
|
2018-03-27 09:53:07 +00:00
|
|
|
.hooks = {
|
|
|
|
[NF_NETDEV_INGRESS] = nft_do_chain_netdev,
|
netfilter: Introduce egress hook
Support classifying packets with netfilter on egress to satisfy user
requirements such as:
* outbound security policies for containers (Laura)
* filtering and mangling intra-node Direct Server Return (DSR) traffic
on a load balancer (Laura)
* filtering locally generated traffic coming in through AF_PACKET,
such as local ARP traffic generated for clustering purposes or DHCP
(Laura; the AF_PACKET plumbing is contained in a follow-up commit)
* L2 filtering from ingress and egress for AVB (Audio Video Bridging)
and gPTP with nftables (Pablo)
* in the future: in-kernel NAT64/NAT46 (Pablo)
The egress hook introduced herein complements the ingress hook added by
commit e687ad60af09 ("netfilter: add netfilter ingress hook after
handle_ing() under unique static key"). A patch for nftables to hook up
egress rules from user space has been submitted separately, so users may
immediately take advantage of the feature.
Alternatively or in addition to netfilter, packets can be classified
with traffic control (tc). On ingress, packets are classified first by
tc, then by netfilter. On egress, the order is reversed for symmetry.
Conceptually, tc and netfilter can be thought of as layers, with
netfilter layered above tc.
Traffic control is capable of redirecting packets to another interface
(man 8 tc-mirred). E.g., an ingress packet may be redirected from the
host namespace to a container via a veth connection:
tc ingress (host) -> tc egress (veth host) -> tc ingress (veth container)
In this case, netfilter egress classifying is not performed when leaving
the host namespace! That's because the packet is still on the tc layer.
If tc redirects the packet to a physical interface in the host namespace
such that it leaves the system, the packet is never subjected to
netfilter egress classifying. That is only logical since it hasn't
passed through netfilter ingress classifying either.
Packets can alternatively be redirected at the netfilter layer using
nft fwd. Such a packet *is* subjected to netfilter egress classifying
since it has reached the netfilter layer.
Internally, the skb->nf_skip_egress flag controls whether netfilter is
invoked on egress by __dev_queue_xmit(). Because __dev_queue_xmit() may
be called recursively by tunnel drivers such as vxlan, the flag is
reverted to false after sch_handle_egress(). This ensures that
netfilter is applied both on the overlay and underlying network.
Interaction between tc and netfilter is possible by setting and querying
skb->mark.
If netfilter egress classifying is not enabled on any interface, it is
patched out of the data path by way of a static_key and doesn't make a
performance difference that is discernible from noise:
Before: 1537 1538 1538 1537 1538 1537 Mb/sec
After: 1536 1534 1539 1539 1539 1540 Mb/sec
Before + tc accept: 1418 1418 1418 1419 1419 1418 Mb/sec
After + tc accept: 1419 1424 1418 1419 1422 1420 Mb/sec
Before + tc drop: 1620 1619 1619 1619 1620 1620 Mb/sec
After + tc drop: 1616 1624 1625 1624 1622 1619 Mb/sec
When netfilter egress classifying is enabled on at least one interface,
a minimal performance penalty is incurred for every egress packet, even
if the interface it's transmitted over doesn't have any netfilter egress
rules configured. That is caused by checking dev->nf_hooks_egress
against NULL.
Measurements were performed on a Core i7-3615QM. Commands to reproduce:
ip link add dev foo type dummy
ip link set dev foo up
modprobe pktgen
echo "add_device foo" > /proc/net/pktgen/kpktgend_3
samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i foo -n 400000000 -m "11:11:11:11:11:11" -d 1.1.1.1
Accept all traffic with tc:
tc qdisc add dev foo clsact
tc filter add dev foo egress bpf da bytecode '1,6 0 0 0,'
Drop all traffic with tc:
tc qdisc add dev foo clsact
tc filter add dev foo egress bpf da bytecode '1,6 0 0 2,'
Apply this patch when measuring packet drops to avoid errors in dmesg:
https://lore.kernel.org/netdev/a73dda33-57f4-95d8-ea51-ed483abd6a7a@iogearbox.net/
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Cc: Laura García Liébana <nevola@gmail.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2021-10-08 20:06:03 +00:00
|
|
|
[NF_NETDEV_EGRESS] = nft_do_chain_netdev,
|
2018-03-27 09:53:07 +00:00
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static void nft_netdev_event(unsigned long event, struct net_device *dev,
|
|
|
|
struct nft_ctx *ctx)
|
|
|
|
{
|
|
|
|
struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
|
2019-10-16 12:30:05 +00:00
|
|
|
struct nft_hook *hook, *found = NULL;
|
|
|
|
int n = 0;
|
2018-03-27 09:53:07 +00:00
|
|
|
|
2019-10-16 12:30:05 +00:00
|
|
|
if (event != NETDEV_UNREGISTER)
|
|
|
|
return;
|
2018-03-27 09:53:07 +00:00
|
|
|
|
2019-10-16 12:30:05 +00:00
|
|
|
list_for_each_entry(hook, &basechain->hook_list, list) {
|
|
|
|
if (hook->ops.dev == dev)
|
|
|
|
found = hook;
|
|
|
|
|
|
|
|
n++;
|
2018-03-27 09:53:07 +00:00
|
|
|
}
|
2019-10-16 12:30:05 +00:00
|
|
|
if (!found)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (n > 1) {
|
|
|
|
nf_unregister_net_hook(ctx->net, &found->ops);
|
|
|
|
list_del_rcu(&found->list);
|
|
|
|
kfree_rcu(found, rcu);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-05-04 12:20:21 +00:00
|
|
|
/* UNREGISTER events are also happening on netns exit.
|
|
|
|
*
|
|
|
|
* Although nf_tables core releases all tables/chains, only this event
|
|
|
|
* handler provides guarantee that hook->ops.dev is still accessible,
|
|
|
|
* so we cannot skip exiting net namespaces.
|
|
|
|
*/
|
2019-10-16 12:30:05 +00:00
|
|
|
__nft_release_basechain(ctx);
|
2018-03-27 09:53:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int nf_tables_netdev_event(struct notifier_block *this,
|
|
|
|
unsigned long event, void *ptr)
|
|
|
|
{
|
|
|
|
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
2021-04-01 14:11:10 +00:00
|
|
|
struct nftables_pernet *nft_net;
|
2018-03-27 09:53:07 +00:00
|
|
|
struct nft_table *table;
|
|
|
|
struct nft_chain *chain, *nr;
|
|
|
|
struct nft_ctx ctx = {
|
|
|
|
.net = dev_net(dev),
|
|
|
|
};
|
|
|
|
|
|
|
|
if (event != NETDEV_UNREGISTER &&
|
|
|
|
event != NETDEV_CHANGENAME)
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
|
2021-04-22 22:17:08 +00:00
|
|
|
nft_net = nft_pernet(ctx.net);
|
2021-04-01 14:11:10 +00:00
|
|
|
mutex_lock(&nft_net->commit_mutex);
|
|
|
|
list_for_each_entry(table, &nft_net->tables, list) {
|
2018-03-27 09:53:07 +00:00
|
|
|
if (table->family != NFPROTO_NETDEV)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ctx.family = table->family;
|
|
|
|
ctx.table = table;
|
|
|
|
list_for_each_entry_safe(chain, nr, &table->chains, list) {
|
|
|
|
if (!nft_is_base_chain(chain))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ctx.chain = chain;
|
|
|
|
nft_netdev_event(event, dev, &ctx);
|
|
|
|
}
|
|
|
|
}
|
2021-04-01 14:11:10 +00:00
|
|
|
mutex_unlock(&nft_net->commit_mutex);
|
2018-03-27 09:53:07 +00:00
|
|
|
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block nf_tables_netdev_notifier = {
|
|
|
|
.notifier_call = nf_tables_netdev_event,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int nft_chain_filter_netdev_init(void)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
nft_register_chain_type(&nft_chain_filter_netdev);
|
|
|
|
|
|
|
|
err = register_netdevice_notifier(&nf_tables_netdev_notifier);
|
|
|
|
if (err)
|
|
|
|
goto err_register_netdevice_notifier;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_register_netdevice_notifier:
|
|
|
|
nft_unregister_chain_type(&nft_chain_filter_netdev);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nft_chain_filter_netdev_fini(void)
|
|
|
|
{
|
|
|
|
nft_unregister_chain_type(&nft_chain_filter_netdev);
|
|
|
|
unregister_netdevice_notifier(&nf_tables_netdev_notifier);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* Stub used when CONFIG_NF_TABLES_NETDEV is not defined; always returns 0. */
static inline int nft_chain_filter_netdev_init(void)
{
	return 0;
}
|
|
|
|
/* No-op stub used when CONFIG_NF_TABLES_NETDEV is not defined. */
static inline void nft_chain_filter_netdev_fini(void)
{
}
|
|
|
|
#endif /* CONFIG_NF_TABLES_NETDEV */
|
|
|
|
|
|
|
|
/* Register all filter chain types handled by this file.
 *
 * The netdev family goes first because it is the only initialiser that can
 * fail; the remaining families are registered only once it has succeeded.
 *
 * Returns 0 on success, or the negative value returned by
 * nft_chain_filter_netdev_init().
 */
int __init nft_chain_filter_init(void)
{
	int ret = nft_chain_filter_netdev_init();

	if (ret < 0)
		return ret;

	nft_chain_filter_ipv4_init();
	nft_chain_filter_ipv6_init();
	nft_chain_filter_arp_init();
	nft_chain_filter_inet_init();
	nft_chain_filter_bridge_init();

	return 0;
}
|
|
|
|
|
2018-08-02 19:44:40 +00:00
|
|
|
/* Tear down every filter chain type, in the reverse of the order used by
 * nft_chain_filter_init(): bridge, inet, arp, ipv6, ipv4, then netdev.
 */
void nft_chain_filter_fini(void)
{
	nft_chain_filter_bridge_fini();
	nft_chain_filter_inet_fini();
	nft_chain_filter_arp_fini();
	nft_chain_filter_ipv6_fini();
	nft_chain_filter_ipv4_fini();
	nft_chain_filter_netdev_fini();
}
|