mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-10-31 16:38:12 +00:00
2f1e85b1ae
This patch fixes an issue: * If we install tc filters with act_skbedit in the clsact hook, it doesn't work, because netdev_core_pick_tx() overwrites queue_mapping. $ tc filter ... action skbedit queue_mapping 1 And this patch is useful: * We can use FQ + EDT to implement efficient policies. Tx queues are picked by xps, the ndo_select_queue of the netdev driver, or the skb hash in netdev_core_pick_tx(). In fact, the netdev driver and skb hash are _not_ under our control. xps uses the CPUs map to select Tx queues, but we can't figure out which task_struct of a pod/container is running on this cpu in most cases. We can use clsact filters to classify one pod/container's traffic to one Tx queue. Why? In a container networking environment, there are two kinds of pod/container/net-namespace. For one kind (e.g. P1, P2), high throughput is key in these applications. But to avoid running out of network resources, the outbound traffic of these pods is limited, using or sharing dedicated Tx queues assigned an HTB/TBF/FQ Qdisc. For the other kind of pods (e.g. Pn), low latency of data access is key, and the traffic is not limited. These pods use or share other dedicated Tx queues assigned a FIFO Qdisc. This choice provides two benefits. First, contention on the HTB/FQ Qdisc lock is significantly reduced since fewer CPUs contend for the same queue. More importantly, Qdisc contention can be eliminated completely if each CPU has its own FIFO Qdisc for the second kind of pods. There must be a mechanism in place to support classifying traffic based on pods/containers to different Tx queues. Note that clsact is outside of the Qdisc, while a Qdisc can run a classifier to select a sub-queue under the lock. In general, recording the decision in the skb seems a little heavy-handed. This patch introduces a per-CPU variable, suggested by Eric. The xmit.skip_txqueue flag is first cleared in __dev_queue_xmit(). 
- A Tx Qdisc may install the skbedit action; then the xmit.skip_txqueue flag is set in qdisc->enqueue() even though the tx queue has already been selected in netdev_tx_queue_mapping() or netdev_core_pick_tx(). Clearing that flag first in __dev_queue_xmit() is useful: - It avoids picking the Tx queue with netdev_tx_queue_mapping() in the next netdev in a case such as: eth0 macvlan - eth0.3 vlan - eth0 ixgbe-phy. For example, eth0, a macvlan in a pod whose root Qdisc installs skbedit queue_mapping, sends packets to eth0.3, a vlan in the host. In __dev_queue_xmit() of eth0.3, the flag is cleared, and the tx queue is not selected according to skb->queue_mapping because there are no filters in the clsact or tx Qdisc of this netdev. The same action is taken in eth0, the ixgbe device on the host. - It avoids picking the Tx queue for the next packet. If we set xmit.skip_txqueue in the tx Qdisc (qdisc->enqueue()), the proper way to clear it is to clear it in __dev_queue_xmit() when processing the next packets. For performance reasons, a static key is used. If the user does not configure NET_EGRESS, the patch will not be compiled. +----+ +----+ +----+ | P1 | | P2 | | Pn | +----+ +----+ +----+ | | | +-----------+-----------+ | | clsact/skbedit | MQ v +-----------+-----------+ | q0 | q1 | qn v v v HTB/FQ HTB/FQ ... FIFO Cc: Jamal Hadi Salim <jhs@mojatatu.com> Cc: Cong Wang <xiyou.wangcong@gmail.com> Cc: Jiri Pirko <jiri@resnulli.us> Cc: "David S. 
Miller" <davem@davemloft.net> Cc: Jakub Kicinski <kuba@kernel.org> Cc: Jonathan Lemon <jonathan.lemon@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Alexander Lobakin <alobakin@pm.me> Cc: Paolo Abeni <pabeni@redhat.com> Cc: Talal Ahmad <talalahmad@google.com> Cc: Kevin Hao <haokexin@gmail.com> Cc: Ilias Apalodimas <ilias.apalodimas@linaro.org> Cc: Kees Cook <keescook@chromium.org> Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com> Cc: Antoine Tenart <atenart@kernel.org> Cc: Wei Wang <weiwan@google.com> Cc: Arnd Bergmann <arnd@arndb.de> Suggested-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
141 lines
4.6 KiB
C
141 lines
4.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __LINUX_RTNETLINK_H
|
|
#define __LINUX_RTNETLINK_H
|
|
|
|
|
|
#include <linux/mutex.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/wait.h>
|
|
#include <linux/refcount.h>
|
|
#include <uapi/linux/rtnetlink.h>
|
|
|
|
extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
|
|
extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
|
|
extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
|
|
u32 group, struct nlmsghdr *nlh, gfp_t flags);
|
|
extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
|
|
extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
|
|
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
|
|
u32 id, long expires, u32 error);
|
|
|
|
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
|
|
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
|
|
gfp_t flags, int *new_nsid, int new_ifindex);
|
|
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
|
|
unsigned change, u32 event,
|
|
gfp_t flags, int *new_nsid,
|
|
int new_ifindex);
|
|
void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
|
|
gfp_t flags);
|
|
|
|
|
|
/* RTNL is used as a global lock for all changes to network configuration */
|
|
extern void rtnl_lock(void);
|
|
extern void rtnl_unlock(void);
|
|
extern int rtnl_trylock(void);
|
|
extern int rtnl_is_locked(void);
|
|
extern int rtnl_lock_killable(void);
|
|
extern bool refcount_dec_and_rtnl_lock(refcount_t *r);
|
|
|
|
extern wait_queue_head_t netdev_unregistering_wq;
|
|
extern struct rw_semaphore pernet_ops_rwsem;
|
|
extern struct rw_semaphore net_rwsem;
|
|
|
|
#ifdef CONFIG_PROVE_LOCKING
|
|
extern bool lockdep_rtnl_is_held(void);
|
|
#else
|
|
static inline bool lockdep_rtnl_is_held(void)
|
|
{
|
|
return true;
|
|
}
|
|
#endif /* #ifdef CONFIG_PROVE_LOCKING */
|
|
|
|
/**
|
|
* rcu_dereference_rtnl - rcu_dereference with debug checking
|
|
* @p: The pointer to read, prior to dereferencing
|
|
*
|
|
* Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
|
|
* or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference()
|
|
*/
|
|
#define rcu_dereference_rtnl(p) \
|
|
rcu_dereference_check(p, lockdep_rtnl_is_held())
|
|
|
|
/**
|
|
* rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking
|
|
* @p: The pointer to read, prior to dereference
|
|
*
|
|
* Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh()
|
|
* or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh()
|
|
*/
|
|
#define rcu_dereference_bh_rtnl(p) \
|
|
rcu_dereference_bh_check(p, lockdep_rtnl_is_held())
|
|
|
|
/**
|
|
* rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
|
|
* @p: The pointer to read, prior to dereferencing
|
|
*
|
|
* Return the value of the specified RCU-protected pointer, but omit
|
|
* the READ_ONCE(), because caller holds RTNL.
|
|
*/
|
|
#define rtnl_dereference(p) \
|
|
rcu_dereference_protected(p, lockdep_rtnl_is_held())
|
|
|
|
static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
|
|
{
|
|
return rtnl_dereference(dev->ingress_queue);
|
|
}
|
|
|
|
static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev)
|
|
{
|
|
return rcu_dereference(dev->ingress_queue);
|
|
}
|
|
|
|
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
|
|
|
|
#ifdef CONFIG_NET_INGRESS
|
|
void net_inc_ingress_queue(void);
|
|
void net_dec_ingress_queue(void);
|
|
#endif
|
|
|
|
#ifdef CONFIG_NET_EGRESS
|
|
void net_inc_egress_queue(void);
|
|
void net_dec_egress_queue(void);
|
|
void netdev_xmit_skip_txqueue(bool skip);
|
|
#endif
|
|
|
|
void rtnetlink_init(void);
|
|
void __rtnl_unlock(void);
|
|
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail);
|
|
|
|
#define ASSERT_RTNL() \
|
|
WARN_ONCE(!rtnl_is_locked(), \
|
|
"RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__)
|
|
|
|
extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
|
|
struct netlink_callback *cb,
|
|
struct net_device *dev,
|
|
struct net_device *filter_dev,
|
|
int *idx);
|
|
extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
|
|
struct nlattr *tb[],
|
|
struct net_device *dev,
|
|
const unsigned char *addr,
|
|
u16 vid,
|
|
u16 flags);
|
|
extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
|
|
struct nlattr *tb[],
|
|
struct net_device *dev,
|
|
const unsigned char *addr,
|
|
u16 vid);
|
|
|
|
extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
|
|
struct net_device *dev, u16 mode,
|
|
u32 flags, u32 mask, int nlflags,
|
|
u32 filter_mask,
|
|
int (*vlan_fill)(struct sk_buff *skb,
|
|
struct net_device *dev,
|
|
u32 filter_mask));
|
|
|
|
extern void rtnl_offload_xstats_notify(struct net_device *dev);
|
|
|
|
#endif /* __LINUX_RTNETLINK_H */
|