net: add low latency socket poll

Adds an ndo_ll_poll method and the code that supports it.
This method can be used by low latency applications to busy-poll
Ethernet device queues directly from the socket code.
sysctl_net_ll_poll controls how many microseconds to poll.
Default is zero (disabled).
Individual protocol support will be added by subsequent patches.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eliezer Tamir 2013-06-10 11:39:50 +03:00 committed by David S. Miller
parent af12fa6e46
commit 0602129286
12 changed files with 208 additions and 2 deletions

View File

@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
it's a Per-CPU variable.
Default: 64
low_latency_poll
----------------
Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
Approximate time in us to spin waiting for packets on the device queue.
Recommended value is 50. May increase power usage.
Default: 0 (off)
rmem_default
------------

View File

@ -971,6 +971,9 @@ struct net_device_ops {
struct netpoll_info *info,
gfp_t gfp);
void (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
#ifdef CONFIG_NET_LL_RX_POLL
int (*ndo_ll_poll)(struct napi_struct *dev);
#endif
int (*ndo_set_vf_mac)(struct net_device *dev,
int queue, u8 *mac);

View File

@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t;
* @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
* @dma_cookie: a cookie to one of several possible DMA operations
* done by skb DMA functions
* @napi_id: id of the NAPI struct this skb came from
* @secmark: security marking
* @mark: Generic packet mark
* @dropcount: total number of sk_receive_queue overflows
@ -500,8 +501,11 @@ struct sk_buff {
/* 7/9 bit hole (depending on ndisc_nodetype presence) */
kmemcheck_bitfield_end(flags2);
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL
union {
unsigned int napi_id;
dma_cookie_t dma_cookie;
};
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;

148
include/net/ll_poll.h Normal file
View File

@ -0,0 +1,148 @@
/*
* Low Latency Sockets
* Copyright(c) 2013 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*
* Author: Eliezer Tamir
*
* Contact Information:
* e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
*/
/*
* For now this depends on CONFIG_X86_TSC
*/
#ifndef _LINUX_NET_LL_POLL_H
#define _LINUX_NET_LL_POLL_H
#include <linux/netdevice.h>
#include <net/ip.h>
#ifdef CONFIG_NET_LL_RX_POLL
/* forward declaration of the NAPI context type used by the helpers below */
struct napi_struct;
/*
 * Busy-poll time budget. ll_end_time() scales it into a cycle count and
 * sk_valid_ll() treats a value of zero as "polling disabled".
 */
extern unsigned long sysctl_net_ll_poll __read_mostly;
/*
 * return values from ndo_ll_poll
 *
 * sk_poll_ll() treats LL_FLUSH_FAILED as a permanent failure and stops
 * polling; a positive return value is added to the
 * LINUX_MIB_LOWLATENCYRXPACKETS counter.
 * NOTE(review): LL_FLUSH_BUSY gets no special handling in this header;
 * presumably it means the queue is currently owned by someone else -
 * confirm against the drivers that implement ndo_ll_poll.
 */
#define LL_FLUSH_FAILED -1
#define LL_FLUSH_BUSY -2
/*
 * Approximate the TSC frequency in MHz by shifting tsc_khz right by 10
 * (divide by 1024) instead of dividing by 1000;
 * we don't mind a ~2.5% imprecision
 */
#define TSC_MHZ (tsc_khz >> 10)
/*
 * Compute the cycle count at which busy polling should give up:
 * the current cycle counter plus the sysctl budget scaled by TSC_MHZ.
 * The sysctl is sampled exactly once via ACCESS_ONCE().
 */
static inline cycles_t ll_end_time(void)
{
	unsigned long budget = ACCESS_ONCE(sysctl_net_ll_poll);

	return TSC_MHZ * budget + get_cycles();
}
static inline bool sk_valid_ll(struct sock *sk)
{
return sysctl_net_ll_poll && sk->sk_napi_id &&
!need_resched() && !signal_pending(current);
}
/*
 * Report whether the polling deadline @end_time has not yet passed.
 *
 * Both the current cycle counter and the deadline are narrowed to
 * unsigned long before being compared with time_after().
 */
static inline bool can_poll_ll(cycles_t end_time)
{
	unsigned long now = (unsigned long)get_cycles();
	unsigned long deadline = (unsigned long)end_time;

	return !time_after(now, deadline);
}
/*
 * Busy-poll the device queue that last delivered data to @sk.
 *
 * Looks up the napi context recorded in sk->sk_napi_id and repeatedly
 * calls the driver's ndo_ll_poll() until one of the following happens:
 *   - the driver reports LL_FLUSH_FAILED,
 *   - the socket receive queue is no longer empty,
 *   - the time budget from ll_end_time() has run out,
 *   - @nonblock is set (in which case the driver is polled exactly once).
 *
 * Returns true if the socket receive queue holds data after polling.
 * Returns false without looking at the queue when no napi context is
 * found or the device does not implement ndo_ll_poll.
 */
static inline bool sk_poll_ll(struct sock *sk, int nonblock)
{
	cycles_t deadline = ll_end_time();
	const struct net_device_ops *dev_ops;
	struct napi_struct *napi;
	bool have_data = false;
	int polled;

	/*
	 * rcu read lock protects the napi hash lookup;
	 * the _bh variant keeps us from racing with net_rx_action
	 */
	rcu_read_lock_bh();

	napi = napi_by_id(sk->sk_napi_id);
	if (!napi)
		goto unlock;

	dev_ops = napi->dev->netdev_ops;
	if (!dev_ops->ndo_ll_poll)
		goto unlock;

	for (;;) {
		polled = dev_ops->ndo_ll_poll(napi);

		/* permanent failure, no point in retrying */
		if (polled == LL_FLUSH_FAILED)
			break;

		if (polled > 0) {
			/* local bh are disabled so it is ok to use _BH */
			NET_ADD_STATS_BH(sock_net(sk),
					 LINUX_MIB_LOWLATENCYRXPACKETS, polled);
		}

		if (!skb_queue_empty(&sk->sk_receive_queue))
			break;
		if (!can_poll_ll(deadline))
			break;
		if (nonblock)
			break;
	}

	have_data = !skb_queue_empty(&sk->sk_receive_queue);

unlock:
	rcu_read_unlock_bh();
	return have_data;
}
/*
 * Called from the NIC receive handler: stamp @skb with the id of the
 * napi context (@napi) it was received on.
 */
static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
{
	unsigned int id = napi->napi_id;

	skb->napi_id = id;
}
/*
 * Called from the protocol handler: propagate the napi id carried by
 * @skb to the socket @sk.
 */
static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
{
	unsigned int id = skb->napi_id;

	sk->sk_napi_id = id;
}
#else /* CONFIG_NET_LL_RX_POLL */
/* Stub for !CONFIG_NET_LL_RX_POLL: there is no deadline, always 0. */
static inline cycles_t ll_end_time(void)
{
	const cycles_t no_deadline = 0;

	return no_deadline;
}
/* Stub for !CONFIG_NET_LL_RX_POLL: busy polling is never allowed. */
static inline bool sk_valid_ll(struct sock *sk)
{
	(void)sk;	/* unused without low latency poll support */
	return false;
}
/* Stub for !CONFIG_NET_LL_RX_POLL: nothing is polled, always false. */
static inline bool sk_poll_ll(struct sock *sk, int nonblock)
{
	(void)sk;	/* unused without low latency poll support */
	(void)nonblock;
	return false;
}
/* Stub for !CONFIG_NET_LL_RX_POLL: the skb is left untouched. */
static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
{
	(void)skb;	/* unused without low latency poll support */
	(void)napi;
}
/* Stub for !CONFIG_NET_LL_RX_POLL: the socket is left untouched. */
static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
{
	(void)sk;	/* unused without low latency poll support */
	(void)skb;
}
/* Stub for !CONFIG_NET_LL_RX_POLL: polling may never continue. */
static inline bool can_poll_ll(cycles_t end_time)
{
	(void)end_time;	/* unused without low latency poll support */
	return false;
}
#endif /* CONFIG_NET_LL_RX_POLL */
#endif /* _LINUX_NET_LL_POLL_H */

View File

@ -229,6 +229,7 @@ struct cg_proto;
* @sk_omem_alloc: "o" is "option" or "other"
* @sk_wmem_queued: persistent queue size
* @sk_forward_alloc: space allocated forward
* @sk_napi_id: id of the last napi context to receive data for sk
* @sk_allocation: allocation mode
* @sk_sndbuf: size of send buffer in bytes
* @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
@ -324,6 +325,9 @@ struct sock {
int sk_forward_alloc;
#ifdef CONFIG_RPS
__u32 sk_rxhash;
#endif
#ifdef CONFIG_NET_LL_RX_POLL
unsigned int sk_napi_id;
#endif
atomic_t sk_drops;
int sk_rcvbuf;

View File

@ -253,6 +253,7 @@ enum
LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */
LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */
LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */
LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */
__LINUX_MIB_MAX
};

View File

@ -243,6 +243,18 @@ config NETPRIO_CGROUP
Cgroup subsystem for use in assigning processes to network priorities on
a per-interface basis
config NET_LL_RX_POLL
bool "Low Latency Receive Poll"
depends on X86_TSC
default n
---help---
Support Low Latency Receive Queue Poll.
(For network card drivers which support this option.)
When waiting for data in read or poll, call directly into the device driver
to flush packets which may be pending on the device queues into the stack.
If unsure, say N.
config BQL
boolean
depends on SYSFS

View File

@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->vlan_tci = old->vlan_tci;
skb_copy_secmark(new, old);
#ifdef CONFIG_NET_LL_RX_POLL
new->napi_id = old->napi_id;
#endif
}
/*

View File

@ -139,6 +139,8 @@
#include <net/tcp.h>
#endif
#include <net/ll_poll.h>
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_stamp = ktime_set(-1L, 0);
#ifdef CONFIG_NET_LL_RX_POLL
sk->sk_napi_id = 0;
#endif
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)

View File

@ -19,6 +19,7 @@
#include <net/ip.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <net/ll_poll.h>
static int one = 1;
@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = {
.proc_handler = flow_limit_table_len_sysctl
},
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_LL_RX_POLL
{
.procname = "low_latency_poll",
.data = &sysctl_net_ll_poll,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax
},
#endif
#endif /* CONFIG_NET */
{
.procname = "netdev_budget",

View File

@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS),
SNMP_MIB_SENTINEL
};

View File

@ -104,6 +104,12 @@
#include <linux/route.h>
#include <linux/sockios.h>
#include <linux/atalk.h>
#include <net/ll_poll.h>
#ifdef CONFIG_NET_LL_RX_POLL
/*
 * Low latency busy-poll time budget, exposed as the "low_latency_poll"
 * sysctl. It has static storage duration and no initializer, so it
 * starts out as zero, which leaves busy polling disabled.
 */
unsigned long sysctl_net_ll_poll __read_mostly;
EXPORT_SYMBOL_GPL(sysctl_net_ll_poll);
#endif
static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,