diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 3addac970680..a1e48c3c84c9 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -74,6 +74,10 @@ definitions: name: queue-type type: enum entries: [ rx, tx ] + - + name: qstats-scope + type: flags + entries: [ queue ] attribute-sets: - @@ -265,6 +269,66 @@ attribute-sets: doc: ID of the NAPI instance which services this queue. type: u32 + - + name: qstats + doc: | + Get device statistics, scoped to a device or a queue. + These statistics extend (and partially duplicate) statistics available + in struct rtnl_link_stats64. + Value of the `scope` attribute determines how statistics are + aggregated. When aggregated for the entire device the statistics + represent the total number of events since last explicit reset of + the device (i.e. not a reconfiguration like changing queue count). + When reported per-queue, however, the statistics may not add + up to the total number of events, will only be reported for currently + active objects, and will likely report the number of events since last + reconfiguration. + attributes: + - + name: ifindex + doc: ifindex of the netdevice to which stats belong. + type: u32 + checks: + min: 1 + - + name: queue-type + doc: Queue type as rx, tx, for queue-id. + type: u32 + enum: queue-type + - + name: queue-id + doc: Queue ID, if stats are scoped to a single queue instance. + type: u32 + - + name: scope + doc: | + What object type should be used to iterate over the stats. + type: uint + enum: qstats-scope + - + name: rx-packets + doc: | + Number of wire packets successfully received and passed to the stack. + For drivers supporting XDP, XDP is considered the first layer + of the stack, so packets consumed by XDP are still counted here. + type: uint + value: 8 # reserve some attr ids in case we need more metadata later + - + name: rx-bytes + doc: Successfully received bytes, see `rx-packets`. + type: uint + - + name: tx-packets + doc: | + Number of wire packets successfully sent. Packet is considered to be + successfully sent once it is in device memory (usually this means + the device has issued a DMA completion for the packet). + type: uint + - + name: tx-bytes + doc: Successfully sent bytes, see `tx-packets`. + type: uint + operations: list: - @@ -405,6 +469,26 @@ operations: attributes: - ifindex reply: *napi-get-op + - + name: qstats-get + doc: | + Get / dump fine grained statistics. Which statistics are reported + depends on the device and the driver, and whether the driver stores + software counters per-queue. + attribute-set: qstats + dump: + request: + attributes: + - scope + reply: + attributes: + - ifindex + - queue-type + - queue-id + - rx-packets + - rx-bytes + - tx-packets + - tx-bytes mcast-groups: list: diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst index 551b3cc29a41..75e017dfa825 100644 --- a/Documentation/networking/statistics.rst +++ b/Documentation/networking/statistics.rst @@ -41,6 +41,15 @@ If `-s` is specified once the detailed errors won't be shown. `ip` supports JSON formatting via the `-j` option. +Queue statistics +~~~~~~~~~~~~~~~~ + +Queue statistics are accessible via the netdev netlink family. + +Currently no widely distributed CLI exists to access those statistics. +Kernel development tools (ynl) can be used to experiment with them, +see `Documentation/userspace-api/netlink/intro-specs.rst`. + Protocol-specific statistics ---------------------------- @@ -147,6 +156,12 @@ Statistics are reported both in the responses to link information requests (`RTM_GETLINK`) and statistic requests (`RTM_GETSTATS`, when `IFLA_STATS_LINK_64` bit is set in the `.filter_mask` of the request). +netdev (netlink) +~~~~~~~~~~~~~~~~ + +`netdev` generic netlink family allows accessing page pool and per queue +statistics. + ethtool ------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 416a800d72ba..4230c7f3b959 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1955,6 +1955,7 @@ enum netdev_reg_state { * * @sysfs_rx_queue_group: Space for optional per-rx queue attributes * @rtnl_link_ops: Rtnl_link_ops + * @stat_ops: Optional ops for queue-aware statistics * * @gso_max_size: Maximum size of generic segmentation offload * @tso_max_size: Device (as in HW) limit on the max TSO request size @@ -2335,6 +2336,8 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; + const struct netdev_stat_ops *stat_ops; + /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 8b8ed4e13d74..d633347eeda5 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -4,6 +4,60 @@ #include +struct netdev_queue_stats_rx { + u64 bytes; + u64 packets; +}; + +struct netdev_queue_stats_tx { + u64 bytes; + u64 packets; +}; + +/** + * struct netdev_stat_ops - netdev ops for fine grained stats + * @get_queue_stats_rx: get stats for a given Rx queue + * @get_queue_stats_tx: get stats for a given Tx queue + * @get_base_stats: get base stats (not belonging to any live instance) + * + * Query stats for a given object. The values of the statistics are undefined + * on entry (specifically they are *not* zero-initialized). Drivers should + * assign values only to the statistics they collect. Statistics which are not + * collected must be left undefined. + * + * Queue objects are not necessarily persistent, and only currently active + * queues are queried by the per-queue callbacks. This means that per-queue + * statistics will not generally add up to the total number of events for + * the device. The @get_base_stats callback allows filling in the delta + * between events for currently live queues and overall device history. + * When the statistics for the entire device are queried, first @get_base_stats + * is issued to collect the delta, and then a series of per-queue callbacks. + * Only statistics which are set in @get_base_stats will be reported + * at the device level, meaning that unlike in queue callbacks, setting + * a statistic to zero in @get_base_stats is a legitimate thing to do. + * This is because @get_base_stats has a second function of designating which + * statistics are in fact correct for the entire device (e.g. when history + * for some of the events is not maintained, and reliable "total" cannot + * be provided). + * + * Device drivers can assume that when collecting total device stats, + * the @get_base_stats and subsequent per-queue calls are performed + * "atomically" (without releasing the rtnl_lock). + * + * Device drivers are encouraged to reset the per-queue statistics when + * number of queues change. This is because the primary use case for + * per-queue statistics is currently to detect traffic imbalance. + */ +struct netdev_stat_ops { + void (*get_queue_stats_rx)(struct net_device *dev, int idx, + struct netdev_queue_stats_rx *stats); + void (*get_queue_stats_tx)(struct net_device *dev, int idx, + struct netdev_queue_stats_tx *stats); + void (*get_base_stats)(struct net_device *dev, + struct netdev_queue_stats_rx *rx, + struct netdev_queue_stats_tx *tx); +}; + /** * DOC: Lockless queue stopping / waking helpers. * diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 93cb411adf72..639ffa04c172 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -70,6 +70,10 @@ enum netdev_queue_type { NETDEV_QUEUE_TYPE_TX, }; +enum netdev_qstats_scope { + NETDEV_QSTATS_SCOPE_QUEUE = 1, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, @@ -132,6 +136,20 @@ enum { NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) }; +enum { + NETDEV_A_QSTATS_IFINDEX = 1, + NETDEV_A_QSTATS_QUEUE_TYPE, + NETDEV_A_QSTATS_QUEUE_ID, + NETDEV_A_QSTATS_SCOPE, + NETDEV_A_QSTATS_RX_PACKETS = 8, + NETDEV_A_QSTATS_RX_BYTES, + NETDEV_A_QSTATS_TX_PACKETS, + NETDEV_A_QSTATS_TX_BYTES, + + __NETDEV_A_QSTATS_MAX, + NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) +}; + enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, @@ -144,6 +162,7 @@ enum { NETDEV_CMD_PAGE_POOL_STATS_GET, NETDEV_CMD_QUEUE_GET, NETDEV_CMD_NAPI_GET, + NETDEV_CMD_QSTATS_GET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index be7f2ebd61b2..8d8ace9ef87f 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -68,6 +68,11 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* NETDEV_CMD_QSTATS_GET - dump */ +static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { + [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -138,6 +143,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_NAPI_IFINDEX, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = NETDEV_CMD_QSTATS_GET, + .dumpit = netdev_nl_qstats_get_dumpit, + .policy = netdev_qstats_get_nl_policy, + .maxattr = NETDEV_A_QSTATS_SCOPE, + .flags = GENL_CMD_CAP_DUMP, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index a47f2bcbe4fa..4db40fd5b4a9 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -28,6 +28,8 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 918b109e0cf4..7fa75e13dc6d 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "netdev-genl-gen.h" @@ -460,6 +461,218 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return err; } +#define NETDEV_STAT_NOT_SET (~0ULL) + +static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) +{ + const u64 *add = _add; + u64 *sum = _sum; + + while (size) { + if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) + *sum += *add; + sum++; + add++; + size -= 8; + } +} + +static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) +{ + if (value == NETDEV_STAT_NOT_SET) + return 0; + return nla_put_uint(rsp, attr_id, value); +} + +static int +netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, + u32 q_type, int i, const struct genl_info *info) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + memset(&rx, 0xff, sizeof(rx)); + ops->get_queue_stats_rx(netdev, i, &rx); + if (!memchr_inv(&rx, 0xff, sizeof(rx))) + goto nla_cancel; + if (netdev_nl_stats_write_rx(rsp, &rx)) + goto nla_put_failure; + break; + case NETDEV_QUEUE_TYPE_TX: + memset(&tx, 0xff, sizeof(tx)); + ops->get_queue_stats_tx(netdev, i, &tx); + if (!memchr_inv(&tx, 0xff, sizeof(tx))) + goto nla_cancel; + if (netdev_nl_stats_write_tx(rsp, &tx)) + goto nla_put_failure; + break; + } + + genlmsg_end(rsp, hdr); + return 0; + +nla_cancel: + genlmsg_cancel(rsp, hdr); + return 0; +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + int i, err; + + if (!(netdev->flags & IFF_UP)) + return 0; + + i = ctx->rxq_idx; + while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, + i, info); + if (err) + return err; + ctx->rxq_idx = i++; + } + i = ctx->txq_idx; + while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, + i, info); + if (err) + return err; + ctx->txq_idx = i++; + } + + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + return 0; +} + +static int +netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info) +{ + struct netdev_queue_stats_rx rx_sum, rx; + struct netdev_queue_stats_tx tx_sum, tx; + const struct netdev_stat_ops *ops; + void *hdr; + int i; + + ops = netdev->stat_ops; + /* Netdev can't guarantee any complete counters */ + if (!ops->get_base_stats) + return 0; + + memset(&rx_sum, 0xff, sizeof(rx_sum)); + memset(&tx_sum, 0xff, sizeof(tx_sum)); + + ops->get_base_stats(netdev, &rx_sum, &tx_sum); + + /* The op was there, but nothing reported, don't bother */ + if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && + !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + for (i = 0; i < netdev->real_num_rx_queues; i++) { + memset(&rx, 0xff, sizeof(rx)); + if (ops->get_queue_stats_rx) + ops->get_queue_stats_rx(netdev, i, &rx); + netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx)); + } + for (i = 0; i < netdev->real_num_tx_queues; i++) { + memset(&tx, 0xff, sizeof(tx)); + if (ops->get_queue_stats_tx) + ops->get_queue_stats_tx(netdev, i, &tx); + netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx)); + } + + if (netdev_nl_stats_write_rx(rsp, &rx_sum) || + netdev_nl_stats_write_tx(rsp, &tx_sum)) + goto nla_put_failure; + + genlmsg_end(rsp, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + unsigned int scope; + int err = 0; + + scope = 0; + if (info->attrs[NETDEV_A_QSTATS_SCOPE]) + scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); + + rtnl_lock(); + for_each_netdev_dump(net, netdev, ctx->ifindex) { + if (!netdev->stat_ops) + continue; + + switch (scope) { + case 0: + err = netdev_nl_stats_by_netdev(netdev, skb, info); + break; + case NETDEV_QSTATS_SCOPE_QUEUE: + err = netdev_nl_stats_by_queue(netdev, skb, info, ctx); + break; + } + if (err < 0) + break; + } + rtnl_unlock(); + + return err; +} + static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 93cb411adf72..639ffa04c172 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -70,6 +70,10 @@ enum netdev_queue_type { NETDEV_QUEUE_TYPE_TX, }; +enum netdev_qstats_scope { + NETDEV_QSTATS_SCOPE_QUEUE = 1, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, @@ -132,6 +136,20 @@ enum { NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) }; +enum { + NETDEV_A_QSTATS_IFINDEX = 1, + NETDEV_A_QSTATS_QUEUE_TYPE, + NETDEV_A_QSTATS_QUEUE_ID, + NETDEV_A_QSTATS_SCOPE, + NETDEV_A_QSTATS_RX_PACKETS = 8, + NETDEV_A_QSTATS_RX_BYTES, + NETDEV_A_QSTATS_TX_PACKETS, + NETDEV_A_QSTATS_TX_BYTES, + + __NETDEV_A_QSTATS_MAX, + NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) +}; + enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, @@ -144,6 +162,7 @@ enum { NETDEV_CMD_PAGE_POOL_STATS_GET, NETDEV_CMD_QUEUE_GET, NETDEV_CMD_NAPI_GET, + NETDEV_CMD_QSTATS_GET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)