linux-stable/net/sched/sch_taprio.c
Vladimir Oltean af7b29b1de Revert "net/sched: taprio: make qdisc_leaf() see the per-netdev-queue pfifo child qdiscs"
taprio_attach() has this logic at the end, which should have been
removed with the blamed patch (which is now being reverted):

	/* access to the child qdiscs is not needed in offload mode */
	if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
		kfree(q->qdiscs);
		q->qdiscs = NULL;
	}

because otherwise, we make use of q->qdiscs[] even after this array was
deallocated, namely in taprio_leaf(). Therefore, whenever one would try
to attach a valid child qdisc to a fully offloaded taprio root, one
would immediately dereference a NULL pointer.

$ tc qdisc replace dev eno0 handle 8001: parent root taprio \
	num_tc 8 \
	map 0 1 2 3 4 5 6 7 \
	queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \
	max-sdu 0 0 0 0 0 200 0 0 \
	base-time 200 \
	sched-entry S 80 20000 \
	sched-entry S a0 20000 \
	sched-entry S 5f 60000 \
	flags 2
$ max_frame_size=1500
$ data_rate_kbps=20000
$ port_transmit_rate_kbps=1000000
$ idleslope=$data_rate_kbps
$ sendslope=$(($idleslope - $port_transmit_rate_kbps))
$ locredit=$(($max_frame_size * $sendslope / $port_transmit_rate_kbps))
$ hicredit=$(($max_frame_size * $idleslope / $port_transmit_rate_kbps))
$ tc qdisc replace dev eno0 parent 8001:7 cbs \
	idleslope $idleslope \
	sendslope $sendslope \
	hicredit $hicredit \
	locredit $locredit \
	offload 0

Unable to handle kernel NULL pointer dereference at virtual address 0000000000000030
pc : taprio_leaf+0x28/0x40
lr : qdisc_leaf+0x3c/0x60
Call trace:
 taprio_leaf+0x28/0x40
 tc_modify_qdisc+0xf0/0x72c
 rtnetlink_rcv_msg+0x12c/0x390
 netlink_rcv_skb+0x5c/0x130
 rtnetlink_rcv+0x1c/0x2c

The solution is not as obvious as the problem. The code which deallocates
q->qdiscs[] is in fact copied and pasted from mqprio, which also
deallocates the array in mqprio_attach() and never uses it afterwards.

Therefore, the identical cleanup logic of priv->qdiscs[] that
mqprio_destroy() has is deceptive because it will never take place at
qdisc_destroy() time, but just at raw ops->destroy() time (otherwise
said, priv->qdiscs[] do not last for the entire lifetime of the mqprio
root), but rather, this is just the twisted way in which the Qdisc API
understands error path cleanup should be done (Qdisc_ops :: destroy() is
called even when Qdisc_ops :: init() never succeeded).

Side note, in fact this is also what the comment in mqprio_init() says:

	/* pre-allocate qdisc, attachment can't fail */

Or reworded, mqprio's priv->qdiscs[] scheme is only meant to serve as
data passing between Qdisc_ops :: init() and Qdisc_ops :: attach().

[ this comment was also copied and pasted into the initial taprio
  commit, even though taprio_attach() came way later ]

The problem is that taprio also makes extensive use of the q->qdiscs[]
array in the software fast path (taprio_enqueue() and taprio_dequeue()),
but it does not keep a reference of its own on q->qdiscs[i] (you'd think
that since it creates these Qdiscs, it holds the reference, but nope,
this is not completely true).

To understand the difference between taprio_destroy() and mqprio_destroy()
one must look before commit 13511704f8 ("net: taprio offload: enforce
qdisc to netdev queue mapping"), because that just muddied the waters.

In the "original" taprio design, taprio always attached itself (the root
Qdisc) to all netdev TX queues, so that dev_qdisc_enqueue() would go
through taprio_enqueue().

It also called qdisc_refcount_inc() on itself for as many times as there
were netdev TX queues, in order to counter-balance what tc_get_qdisc()
does when destroying a Qdisc (simplified for brevity below):

	if (n->nlmsg_type == RTM_DELQDISC)
		err = qdisc_graft(dev, parent=NULL, new=NULL, q, extack);

qdisc_graft(where "new" is NULL so this deletes the Qdisc):

	for (i = 0; i < num_q; i++) {
		struct netdev_queue *dev_queue;

		dev_queue = netdev_get_tx_queue(dev, i);

		old = dev_graft_qdisc(dev_queue, new);
		if (new && i > 0)
			qdisc_refcount_inc(new);

		qdisc_put(old);
		~~~~~~~~~~~~~~
		this decrements taprio's refcount once for each TX queue
	}

	notify_and_destroy(net, skb, n, classid,
			   rtnl_dereference(dev->qdisc), new);
			   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			   and this finally decrements it to zero,
			   making qdisc_put() call qdisc_destroy()

The q->qdiscs[] created using qdisc_create_dflt() (or their
replacements, if taprio_graft() was ever to get called) were then
privately freed by taprio_destroy().

This is still what is happening after commit 13511704f8 ("net: taprio
offload: enforce qdisc to netdev queue mapping"), but only for software
mode.

In full offload mode, the per-txq "qdisc_put(old)" calls from
qdisc_graft() now deallocate the child Qdiscs rather than decrement
taprio's refcount. So when notify_and_destroy(taprio) finally calls
taprio_destroy(), the difference is that the child Qdiscs were already
deallocated.

And this is exactly why the taprio_attach() comment "access to the child
qdiscs is not needed in offload mode" is deceptive too. Not only the
q->qdiscs[] array is not needed, but it is also necessary to get rid of
it as soon as possible, because otherwise, we will also call qdisc_put()
on the child Qdiscs in qdisc_destroy() -> taprio_destroy(), and this
will cause a nasty use-after-free/refcount-saturate/whatever.

In short, the problem is that since the blamed commit, taprio_leaf()
needs q->qdiscs[] to not be freed by taprio_attach(), while qdisc_destroy()
-> taprio_destroy() does need q->qdiscs[] to be freed by taprio_attach()
for full offload. Fixing one problem triggers the other.

All of this can be solved by making taprio keep its q->qdiscs[i] with a
refcount elevated at 2 (in offloaded mode where they are attached to the
netdev TX queues), both in taprio_attach() and in taprio_graft(). The
generic qdisc_graft() would just decrement the child qdiscs' refcounts
to 1, and taprio_destroy() would give them the final coup de grace.

However the rabbit hole of changes is getting quite deep, and the
complexity increases. The blamed commit was supposed to be a bug fix in
the first place, and the bug it addressed is not so significant so as to
justify further rework in stable trees. So I'd rather just revert it.
I don't know enough about multi-queue Qdisc design to make a proper
judgement right now regarding what is/isn't idiomatic use of Qdisc
concepts in taprio. I will try to study the problem more and come with a
different solution in net-next.

Fixes: 1461d212ab ("net/sched: taprio: make qdisc_leaf() see the per-netdev-queue pfifo child qdiscs")
Reported-by: Muhammad Husaini Zulkifli <muhammad.husaini.zulkifli@intel.com>
Reported-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Link: https://lore.kernel.org/r/20221004220100.1650558-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-10-05 20:32:15 -07:00

2158 lines
54 KiB
C

// SPDX-License-Identifier: GPL-2.0
/* net/sched/sch_taprio.c Time Aware Priority Scheduler
*
* Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com>
*
*/
#include <linux/ethtool.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/time.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/sch_generic.h>
#include <net/sock.h>
#include <net/tcp.h>
static LIST_HEAD(taprio_list);
#define TAPRIO_ALL_GATES_OPEN -1
#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
#define TAPRIO_FLAGS_INVALID U32_MAX
struct sched_entry {
struct list_head list;
/* The instant that this entry "closes" and the next one
* should open, the qdisc will make some effort so that no
* packet leaves after this time.
*/
ktime_t close_time;
ktime_t next_txtime;
atomic_t budget;
int index;
u32 gate_mask;
u32 interval;
u8 command;
};
struct sched_gate_list {
struct rcu_head rcu;
struct list_head entries;
size_t num_entries;
ktime_t cycle_close_time;
s64 cycle_time;
s64 cycle_time_extension;
s64 base_time;
};
struct taprio_sched {
struct Qdisc **qdiscs;
struct Qdisc *root;
u32 flags;
enum tk_offsets tk_offset;
int clockid;
bool offloaded;
atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
* speeds it's sub-nanoseconds per byte
*/
/* Protects the update side of the RCU protected current_entry */
spinlock_t current_entry_lock;
struct sched_entry __rcu *current_entry;
struct sched_gate_list __rcu *oper_sched;
struct sched_gate_list __rcu *admin_sched;
struct hrtimer advance_timer;
struct list_head taprio_list;
u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */
u32 txtime_delay;
};
struct __tc_taprio_qopt_offload {
refcount_t users;
struct tc_taprio_qopt_offload offload;
};
static ktime_t sched_base_time(const struct sched_gate_list *sched)
{
if (!sched)
return KTIME_MAX;
return ns_to_ktime(sched->base_time);
}
static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono)
{
/* This pairs with WRITE_ONCE() in taprio_parse_clockid() */
enum tk_offsets tk_offset = READ_ONCE(q->tk_offset);
switch (tk_offset) {
case TK_OFFS_MAX:
return mono;
default:
return ktime_mono_to_any(mono, tk_offset);
}
}
static ktime_t taprio_get_time(const struct taprio_sched *q)
{
return taprio_mono_to_any(q, ktime_get());
}
static void taprio_free_sched_cb(struct rcu_head *head)
{
struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
struct sched_entry *entry, *n;
list_for_each_entry_safe(entry, n, &sched->entries, list) {
list_del(&entry->list);
kfree(entry);
}
kfree(sched);
}
static void switch_schedules(struct taprio_sched *q,
struct sched_gate_list **admin,
struct sched_gate_list **oper)
{
rcu_assign_pointer(q->oper_sched, *admin);
rcu_assign_pointer(q->admin_sched, NULL);
if (*oper)
call_rcu(&(*oper)->rcu, taprio_free_sched_cb);
*oper = *admin;
*admin = NULL;
}
/* Get how much time has been already elapsed in the current cycle. */
static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
{
ktime_t time_since_sched_start;
s32 time_elapsed;
time_since_sched_start = ktime_sub(time, sched->base_time);
div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed);
return time_elapsed;
}
static ktime_t get_interval_end_time(struct sched_gate_list *sched,
struct sched_gate_list *admin,
struct sched_entry *entry,
ktime_t intv_start)
{
s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start);
ktime_t intv_end, cycle_ext_end, cycle_end;
cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed);
intv_end = ktime_add_ns(intv_start, entry->interval);
cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension);
if (ktime_before(intv_end, cycle_end))
return intv_end;
else if (admin && admin != sched &&
ktime_after(admin->base_time, cycle_end) &&
ktime_before(admin->base_time, cycle_ext_end))
return admin->base_time;
else
return cycle_end;
}
static int length_to_duration(struct taprio_sched *q, int len)
{
return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
}
/* Returns the entry corresponding to next available interval. If
* validate_interval is set, it only validates whether the timestamp occurs
* when the gate corresponding to the skb's traffic class is open.
*/
static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb,
struct Qdisc *sch,
struct sched_gate_list *sched,
struct sched_gate_list *admin,
ktime_t time,
ktime_t *interval_start,
ktime_t *interval_end,
bool validate_interval)
{
ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time;
ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time;
struct sched_entry *entry = NULL, *entry_found = NULL;
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
bool entry_available = false;
s32 cycle_elapsed;
int tc, n;
tc = netdev_get_prio_tc_map(dev, skb->priority);
packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb));
*interval_start = 0;
*interval_end = 0;
if (!sched)
return NULL;
cycle = sched->cycle_time;
cycle_elapsed = get_cycle_time_elapsed(sched, time);
curr_intv_end = ktime_sub_ns(time, cycle_elapsed);
cycle_end = ktime_add_ns(curr_intv_end, cycle);
list_for_each_entry(entry, &sched->entries, list) {
curr_intv_start = curr_intv_end;
curr_intv_end = get_interval_end_time(sched, admin, entry,
curr_intv_start);
if (ktime_after(curr_intv_start, cycle_end))
break;
if (!(entry->gate_mask & BIT(tc)) ||
packet_transmit_time > entry->interval)
continue;
txtime = entry->next_txtime;
if (ktime_before(txtime, time) || validate_interval) {
transmit_end_time = ktime_add_ns(time, packet_transmit_time);
if ((ktime_before(curr_intv_start, time) &&
ktime_before(transmit_end_time, curr_intv_end)) ||
(ktime_after(curr_intv_start, time) && !validate_interval)) {
entry_found = entry;
*interval_start = curr_intv_start;
*interval_end = curr_intv_end;
break;
} else if (!entry_available && !validate_interval) {
/* Here, we are just trying to find out the
* first available interval in the next cycle.
*/
entry_available = true;
entry_found = entry;
*interval_start = ktime_add_ns(curr_intv_start, cycle);
*interval_end = ktime_add_ns(curr_intv_end, cycle);
}
} else if (ktime_before(txtime, earliest_txtime) &&
!entry_available) {
earliest_txtime = txtime;
entry_found = entry;
n = div_s64(ktime_sub(txtime, curr_intv_start), cycle);
*interval_start = ktime_add(curr_intv_start, n * cycle);
*interval_end = ktime_add(curr_intv_end, n * cycle);
}
}
return entry_found;
}
static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct sched_gate_list *sched, *admin;
ktime_t interval_start, interval_end;
struct sched_entry *entry;
rcu_read_lock();
sched = rcu_dereference(q->oper_sched);
admin = rcu_dereference(q->admin_sched);
entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp,
&interval_start, &interval_end, true);
rcu_read_unlock();
return entry;
}
static bool taprio_flags_valid(u32 flags)
{
/* Make sure no other flag bits are set. */
if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST |
TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
return false;
/* txtime-assist and full offload are mutually exclusive */
if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) &&
(flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
return false;
return true;
}
/* This returns the tstamp value set by TCP in terms of the set clock. */
static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
{
unsigned int offset = skb_network_offset(skb);
const struct ipv6hdr *ipv6h;
const struct iphdr *iph;
struct ipv6hdr _ipv6h;
ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
if (!ipv6h)
return 0;
if (ipv6h->version == 4) {
iph = (struct iphdr *)ipv6h;
offset += iph->ihl * 4;
/* special-case 6in4 tunnelling, as that is a common way to get
* v6 connectivity in the home
*/
if (iph->protocol == IPPROTO_IPV6) {
ipv6h = skb_header_pointer(skb, offset,
sizeof(_ipv6h), &_ipv6h);
if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
return 0;
} else if (iph->protocol != IPPROTO_TCP) {
return 0;
}
} else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) {
return 0;
}
return taprio_mono_to_any(q, skb->skb_mstamp_ns);
}
/* There are a few scenarios where we will have to modify the txtime from
* what is read from next_txtime in sched_entry. They are:
* 1. If txtime is in the past,
* a. The gate for the traffic class is currently open and packet can be
* transmitted before it closes, schedule the packet right away.
* b. If the gate corresponding to the traffic class is going to open later
* in the cycle, set the txtime of packet to the interval start.
* 2. If txtime is in the future, there are packets corresponding to the
* current traffic class waiting to be transmitted. So, the following
* possibilities exist:
* a. We can transmit the packet before the window containing the txtime
* closes.
* b. The window might close before the transmission can be completed
* successfully. So, schedule the packet in the next open window.
*/
static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
{
ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp;
struct taprio_sched *q = qdisc_priv(sch);
struct sched_gate_list *sched, *admin;
ktime_t minimum_time, now, txtime;
int len, packet_transmit_time;
struct sched_entry *entry;
bool sched_changed;
now = taprio_get_time(q);
minimum_time = ktime_add_ns(now, q->txtime_delay);
tcp_tstamp = get_tcp_tstamp(q, skb);
minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp);
rcu_read_lock();
admin = rcu_dereference(q->admin_sched);
sched = rcu_dereference(q->oper_sched);
if (admin && ktime_after(minimum_time, admin->base_time))
switch_schedules(q, &admin, &sched);
/* Until the schedule starts, all the queues are open */
if (!sched || ktime_before(minimum_time, sched->base_time)) {
txtime = minimum_time;
goto done;
}
len = qdisc_pkt_len(skb);
packet_transmit_time = length_to_duration(q, len);
do {
sched_changed = false;
entry = find_entry_to_transmit(skb, sch, sched, admin,
minimum_time,
&interval_start, &interval_end,
false);
if (!entry) {
txtime = 0;
goto done;
}
txtime = entry->next_txtime;
txtime = max_t(ktime_t, txtime, minimum_time);
txtime = max_t(ktime_t, txtime, interval_start);
if (admin && admin != sched &&
ktime_after(txtime, admin->base_time)) {
sched = admin;
sched_changed = true;
continue;
}
transmit_end_time = ktime_add(txtime, packet_transmit_time);
minimum_time = transmit_end_time;
/* Update the txtime of current entry to the next time it's
* interval starts.
*/
if (ktime_after(transmit_end_time, interval_end))
entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
} while (sched_changed || ktime_after(transmit_end_time, interval_end));
entry->next_txtime = transmit_end_time;
done:
rcu_read_unlock();
return txtime;
}
static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
struct Qdisc *child, struct sk_buff **to_free)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
int prio = skb->priority;
u8 tc;
/* sk_flags are only safe to use on full sockets. */
if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
if (!is_valid_interval(skb, sch))
return qdisc_drop(skb, sch, to_free);
} else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
skb->tstamp = get_packet_txtime(skb, sch);
if (!skb->tstamp)
return qdisc_drop(skb, sch, to_free);
}
/* Devices with full offload are expected to honor this in hardware */
tc = netdev_get_prio_tc_map(dev, prio);
if (skb->len > q->max_frm_len[tc])
return qdisc_drop(skb, sch, to_free);
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return qdisc_enqueue(skb, child, to_free);
}
/* Will not be called in the full offload case, since the TX queues are
* attached to the Qdisc created using qdisc_create_dflt()
*/
static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
struct taprio_sched *q = qdisc_priv(sch);
struct Qdisc *child;
int queue;
queue = skb_get_queue_mapping(skb);
child = q->qdiscs[queue];
if (unlikely(!child))
return qdisc_drop(skb, sch, to_free);
/* Large packets might not be transmitted when the transmission duration
* exceeds any configured interval. Therefore, segment the skb into
* smaller chunks. Drivers with full offload are expected to handle
* this in hardware.
*/
if (skb_is_gso(skb)) {
unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
netdev_features_t features = netif_skb_features(skb);
struct sk_buff *segs, *nskb;
int ret;
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs))
return qdisc_drop(skb, sch, to_free);
skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
slen += segs->len;
ret = taprio_enqueue_one(segs, sch, child, to_free);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
} else {
numsegs++;
}
}
if (numsegs > 1)
qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
consume_skb(skb);
return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
return taprio_enqueue_one(skb, sch, child, to_free);
}
/* Will not be called in the full offload case, since the TX queues are
* attached to the Qdisc created using qdisc_create_dflt()
*/
static struct sk_buff *taprio_peek(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct sched_entry *entry;
struct sk_buff *skb;
u32 gate_mask;
int i;
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
rcu_read_unlock();
if (!gate_mask)
return NULL;
for (i = 0; i < dev->num_tx_queues; i++) {
struct Qdisc *child = q->qdiscs[i];
int prio;
u8 tc;
if (unlikely(!child))
continue;
skb = child->ops->peek(child);
if (!skb)
continue;
if (TXTIME_ASSIST_IS_ENABLED(q->flags))
return skb;
prio = skb->priority;
tc = netdev_get_prio_tc_map(dev, prio);
if (!(gate_mask & BIT(tc)))
continue;
return skb;
}
return NULL;
}
static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
{
atomic_set(&entry->budget,
div64_u64((u64)entry->interval * PSEC_PER_NSEC,
atomic64_read(&q->picos_per_byte)));
}
/* Will not be called in the full offload case, since the TX queues are
* attached to the Qdisc created using qdisc_create_dflt()
*/
static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct sk_buff *skb = NULL;
struct sched_entry *entry;
u32 gate_mask;
int i;
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
/* if there's no entry, it means that the schedule didn't
* start yet, so force all gates to be open, this is in
* accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5
* "AdminGateStates"
*/
gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
if (!gate_mask)
goto done;
for (i = 0; i < dev->num_tx_queues; i++) {
struct Qdisc *child = q->qdiscs[i];
ktime_t guard;
int prio;
int len;
u8 tc;
if (unlikely(!child))
continue;
if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
skb = child->ops->dequeue(child);
if (!skb)
continue;
goto skb_found;
}
skb = child->ops->peek(child);
if (!skb)
continue;
prio = skb->priority;
tc = netdev_get_prio_tc_map(dev, prio);
if (!(gate_mask & BIT(tc))) {
skb = NULL;
continue;
}
len = qdisc_pkt_len(skb);
guard = ktime_add_ns(taprio_get_time(q),
length_to_duration(q, len));
/* In the case that there's no gate entry, there's no
* guard band ...
*/
if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
ktime_after(guard, entry->close_time)) {
skb = NULL;
continue;
}
/* ... and no budget. */
if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
atomic_sub_return(len, &entry->budget) < 0) {
skb = NULL;
continue;
}
skb = child->ops->dequeue(child);
if (unlikely(!skb))
goto done;
skb_found:
qdisc_bstats_update(sch, skb);
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
goto done;
}
done:
rcu_read_unlock();
return skb;
}
static bool should_restart_cycle(const struct sched_gate_list *oper,
const struct sched_entry *entry)
{
if (list_is_last(&entry->list, &oper->entries))
return true;
if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0)
return true;
return false;
}
static bool should_change_schedules(const struct sched_gate_list *admin,
const struct sched_gate_list *oper,
ktime_t close_time)
{
ktime_t next_base_time, extension_time;
if (!admin)
return false;
next_base_time = sched_base_time(admin);
/* This is the simple case, the close_time would fall after
* the next schedule base_time.
*/
if (ktime_compare(next_base_time, close_time) <= 0)
return true;
/* This is the cycle_time_extension case, if the close_time
* plus the amount that can be extended would fall after the
* next schedule base_time, we can extend the current schedule
* for that amount.
*/
extension_time = ktime_add_ns(close_time, oper->cycle_time_extension);
/* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
* how precisely the extension should be made. So after
* conformance testing, this logic may change.
*/
if (ktime_compare(next_base_time, extension_time) <= 0)
return true;
return false;
}
static enum hrtimer_restart advance_sched(struct hrtimer *timer)
{
struct taprio_sched *q = container_of(timer, struct taprio_sched,
advance_timer);
struct sched_gate_list *oper, *admin;
struct sched_entry *entry, *next;
struct Qdisc *sch = q->root;
ktime_t close_time;
spin_lock(&q->current_entry_lock);
entry = rcu_dereference_protected(q->current_entry,
lockdep_is_held(&q->current_entry_lock));
oper = rcu_dereference_protected(q->oper_sched,
lockdep_is_held(&q->current_entry_lock));
admin = rcu_dereference_protected(q->admin_sched,
lockdep_is_held(&q->current_entry_lock));
if (!oper)
switch_schedules(q, &admin, &oper);
/* This can happen in two cases: 1. this is the very first run
* of this function (i.e. we weren't running any schedule
* previously); 2. The previous schedule just ended. The first
* entry of all schedules are pre-calculated during the
* schedule initialization.
*/
if (unlikely(!entry || entry->close_time == oper->base_time)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
close_time = next->close_time;
goto first_run;
}
if (should_restart_cycle(oper, entry)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time,
oper->cycle_time);
} else {
next = list_next_entry(entry, list);
}
close_time = ktime_add_ns(entry->close_time, next->interval);
close_time = min_t(ktime_t, close_time, oper->cycle_close_time);
if (should_change_schedules(admin, oper, close_time)) {
/* Set things so the next time this runs, the new
* schedule runs.
*/
close_time = sched_base_time(admin);
switch_schedules(q, &admin, &oper);
}
next->close_time = close_time;
taprio_set_budget(q, next);
first_run:
rcu_assign_pointer(q->current_entry, next);
spin_unlock(&q->current_entry_lock);
hrtimer_set_expires(&q->advance_timer, close_time);
rcu_read_lock();
__netif_schedule(sch);
rcu_read_unlock();
return HRTIMER_RESTART;
}
static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
[TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 },
[TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 },
[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
[TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 },
};
static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
[TCA_TAPRIO_TC_ENTRY_INDEX] = { .type = NLA_U32 },
[TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 },
};
static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
[TCA_TAPRIO_ATTR_PRIOMAP] = {
.len = sizeof(struct tc_mqprio_qopt)
},
[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED },
[TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 },
[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED },
[TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 },
[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
[TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 },
[TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 },
[TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED },
};
static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb,
struct sched_entry *entry,
struct netlink_ext_ack *extack)
{
int min_duration = length_to_duration(q, ETH_ZLEN);
u32 interval = 0;
if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
entry->command = nla_get_u8(
tb[TCA_TAPRIO_SCHED_ENTRY_CMD]);
if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK])
entry->gate_mask = nla_get_u32(
tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]);
if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL])
interval = nla_get_u32(
tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);
/* The interval should allow at least the minimum ethernet
* frame to go out.
*/
if (interval < min_duration) {
NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
return -EINVAL;
}
entry->interval = interval;
return 0;
}
static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n,
struct sched_entry *entry, int index,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
int err;
err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
entry_policy, NULL);
if (err < 0) {
NL_SET_ERR_MSG(extack, "Could not parse nested entry");
return -EINVAL;
}
entry->index = index;
return fill_sched_entry(q, tb, entry, extack);
}
static int parse_sched_list(struct taprio_sched *q, struct nlattr *list,
struct sched_gate_list *sched,
struct netlink_ext_ack *extack)
{
struct nlattr *n;
int err, rem;
int i = 0;
if (!list)
return -EINVAL;
nla_for_each_nested(n, list, rem) {
struct sched_entry *entry;
if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) {
NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
continue;
}
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry) {
NL_SET_ERR_MSG(extack, "Not enough memory for entry");
return -ENOMEM;
}
err = parse_sched_entry(q, n, entry, i, extack);
if (err < 0) {
kfree(entry);
return err;
}
list_add_tail(&entry->list, &sched->entries);
i++;
}
sched->num_entries = i;
return i;
}
static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
struct sched_gate_list *new,
struct netlink_ext_ack *extack)
{
int err = 0;
if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) {
NL_SET_ERR_MSG(extack, "Adding a single entry is not supported");
return -ENOTSUPP;
}
if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION])
new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]);
if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME])
new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]);
if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST],
new, extack);
if (err < 0)
return err;
if (!new->cycle_time) {
struct sched_entry *entry;
ktime_t cycle = 0;
list_for_each_entry(entry, &new->entries, list)
cycle = ktime_add_ns(cycle, entry->interval);
if (!cycle) {
NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0");
return -EINVAL;
}
new->cycle_time = cycle;
}
return 0;
}
static int taprio_parse_mqprio_opt(struct net_device *dev,
struct tc_mqprio_qopt *qopt,
struct netlink_ext_ack *extack,
u32 taprio_flags)
{
int i, j;
if (!qopt && !dev->num_tc) {
NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
return -EINVAL;
}
/* If num_tc is already set, it means that the user already
* configured the mqprio part
*/
if (dev->num_tc)
return 0;
/* Verify num_tc is not out of max range */
if (qopt->num_tc > TC_MAX_QUEUE) {
NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
return -EINVAL;
}
/* taprio imposes that traffic classes map 1:n to tx queues */
if (qopt->num_tc > dev->num_tx_queues) {
NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
return -EINVAL;
}
/* Verify priority mapping uses valid tcs */
for (i = 0; i <= TC_BITMASK; i++) {
if (qopt->prio_tc_map[i] >= qopt->num_tc) {
NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
return -EINVAL;
}
}
for (i = 0; i < qopt->num_tc; i++) {
unsigned int last = qopt->offset[i] + qopt->count[i];
/* Verify the queue count is in tx range being equal to the
* real_num_tx_queues indicates the last queue is in use.
*/
if (qopt->offset[i] >= dev->num_tx_queues ||
!qopt->count[i] ||
last > dev->real_num_tx_queues) {
NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
return -EINVAL;
}
if (TXTIME_ASSIST_IS_ENABLED(taprio_flags))
continue;
/* Verify that the offset and counts do not overlap */
for (j = i + 1; j < qopt->num_tc; j++) {
if (last > qopt->offset[j]) {
NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
return -EINVAL;
}
}
}
return 0;
}
static int taprio_get_start_time(struct Qdisc *sch,
struct sched_gate_list *sched,
ktime_t *start)
{
struct taprio_sched *q = qdisc_priv(sch);
ktime_t now, base, cycle;
s64 n;
base = sched_base_time(sched);
now = taprio_get_time(q);
if (ktime_after(base, now)) {
*start = base;
return 0;
}
cycle = sched->cycle_time;
/* The qdisc is expected to have at least one sched_entry. Moreover,
* any entry must have 'interval' > 0. Thus if the cycle time is zero,
* something went really wrong. In that case, we should warn about this
* inconsistent state and return error.
*/
if (WARN_ON(!cycle))
return -EFAULT;
/* Schedule the start time for the beginning of the next
* cycle.
*/
n = div64_s64(ktime_sub_ns(now, base), cycle);
*start = ktime_add_ns(base, (n + 1) * cycle);
return 0;
}
static void setup_first_close_time(struct taprio_sched *q,
struct sched_gate_list *sched, ktime_t base)
{
struct sched_entry *first;
ktime_t cycle;
first = list_first_entry(&sched->entries,
struct sched_entry, list);
cycle = sched->cycle_time;
/* FIXME: find a better place to do this */
sched->cycle_close_time = ktime_add_ns(base, cycle);
first->close_time = ktime_add_ns(base, first->interval);
taprio_set_budget(q, first);
rcu_assign_pointer(q->current_entry, NULL);
}
static void taprio_start_sched(struct Qdisc *sch,
ktime_t start, struct sched_gate_list *new)
{
struct taprio_sched *q = qdisc_priv(sch);
ktime_t expires;
if (FULL_OFFLOAD_IS_ENABLED(q->flags))
return;
expires = hrtimer_get_expires(&q->advance_timer);
if (expires == 0)
expires = KTIME_MAX;
/* If the new schedule starts before the next expiration, we
* reprogram it to the earliest one, so we change the admin
* schedule to the operational one at the right time.
*/
start = min_t(ktime_t, start, expires);
hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
}
static void taprio_set_picos_per_byte(struct net_device *dev,
struct taprio_sched *q)
{
struct ethtool_link_ksettings ecmd;
int speed = SPEED_10;
int picos_per_byte;
int err;
err = __ethtool_get_link_ksettings(dev, &ecmd);
if (err < 0)
goto skip;
if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
speed = ecmd.base.speed;
skip:
picos_per_byte = (USEC_PER_SEC * 8) / speed;
atomic64_set(&q->picos_per_byte, picos_per_byte);
netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
dev->name, (long long)atomic64_read(&q->picos_per_byte),
ecmd.base.speed);
}
static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct taprio_sched *q;
ASSERT_RTNL();
if (event != NETDEV_UP && event != NETDEV_CHANGE)
return NOTIFY_DONE;
list_for_each_entry(q, &taprio_list, taprio_list) {
if (dev != qdisc_dev(q->root))
continue;
taprio_set_picos_per_byte(dev, q);
break;
}
return NOTIFY_DONE;
}
static void setup_txtime(struct taprio_sched *q,
struct sched_gate_list *sched, ktime_t base)
{
struct sched_entry *entry;
u32 interval = 0;
list_for_each_entry(entry, &sched->entries, list) {
entry->next_txtime = ktime_add_ns(base, interval);
interval += entry->interval;
}
}
static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries)
{
struct __tc_taprio_qopt_offload *__offload;
__offload = kzalloc(struct_size(__offload, offload.entries, num_entries),
GFP_KERNEL);
if (!__offload)
return NULL;
refcount_set(&__offload->users, 1);
return &__offload->offload;
}
struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
*offload)
{
struct __tc_taprio_qopt_offload *__offload;
__offload = container_of(offload, struct __tc_taprio_qopt_offload,
offload);
refcount_inc(&__offload->users);
return offload;
}
EXPORT_SYMBOL_GPL(taprio_offload_get);
void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
{
struct __tc_taprio_qopt_offload *__offload;
__offload = container_of(offload, struct __tc_taprio_qopt_offload,
offload);
if (!refcount_dec_and_test(&__offload->users))
return;
kfree(__offload);
}
EXPORT_SYMBOL_GPL(taprio_offload_free);
/* The function will only serve to keep the pointers to the "oper" and "admin"
* schedules valid in relation to their base times, so when calling dump() the
* users looks at the right schedules.
* When using full offload, the admin configuration is promoted to oper at the
* base_time in the PHC time domain. But because the system time is not
* necessarily in sync with that, we can't just trigger a hrtimer to call
* switch_schedules at the right hardware time.
* At the moment we call this by hand right away from taprio, but in the future
* it will be useful to create a mechanism for drivers to notify taprio of the
* offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump().
* This is left as TODO.
*/
static void taprio_offload_config_changed(struct taprio_sched *q)
{
struct sched_gate_list *oper, *admin;
oper = rtnl_dereference(q->oper_sched);
admin = rtnl_dereference(q->admin_sched);
switch_schedules(q, &admin, &oper);
}
static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask)
{
u32 i, queue_mask = 0;
for (i = 0; i < dev->num_tc; i++) {
u32 offset, count;
if (!(tc_mask & BIT(i)))
continue;
offset = dev->tc_to_txq[i].offset;
count = dev->tc_to_txq[i].count;
queue_mask |= GENMASK(offset + count - 1, offset);
}
return queue_mask;
}
static void taprio_sched_to_offload(struct net_device *dev,
struct sched_gate_list *sched,
struct tc_taprio_qopt_offload *offload)
{
struct sched_entry *entry;
int i = 0;
offload->base_time = sched->base_time;
offload->cycle_time = sched->cycle_time;
offload->cycle_time_extension = sched->cycle_time_extension;
list_for_each_entry(entry, &sched->entries, list) {
struct tc_taprio_sched_entry *e = &offload->entries[i];
e->command = entry->command;
e->interval = entry->interval;
e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask);
i++;
}
offload->num_entries = i;
}
static int taprio_enable_offload(struct net_device *dev,
struct taprio_sched *q,
struct sched_gate_list *sched,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct tc_taprio_qopt_offload *offload;
struct tc_taprio_caps caps;
int tc, err = 0;
if (!ops->ndo_setup_tc) {
NL_SET_ERR_MSG(extack,
"Device does not support taprio offload");
return -EOPNOTSUPP;
}
qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
&caps, sizeof(caps));
if (!caps.supports_queue_max_sdu) {
for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
if (q->max_sdu[tc]) {
NL_SET_ERR_MSG_MOD(extack,
"Device does not handle queueMaxSDU");
return -EOPNOTSUPP;
}
}
}
offload = taprio_offload_alloc(sched->num_entries);
if (!offload) {
NL_SET_ERR_MSG(extack,
"Not enough memory for enabling offload mode");
return -ENOMEM;
}
offload->enable = 1;
taprio_sched_to_offload(dev, sched, offload);
for (tc = 0; tc < TC_MAX_QUEUE; tc++)
offload->max_sdu[tc] = q->max_sdu[tc];
err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
if (err < 0) {
NL_SET_ERR_MSG(extack,
"Device failed to setup taprio offload");
goto done;
}
q->offloaded = true;
done:
taprio_offload_free(offload);
return err;
}
static int taprio_disable_offload(struct net_device *dev,
struct taprio_sched *q,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct tc_taprio_qopt_offload *offload;
int err;
if (!q->offloaded)
return 0;
offload = taprio_offload_alloc(0);
if (!offload) {
NL_SET_ERR_MSG(extack,
"Not enough memory to disable offload mode");
return -ENOMEM;
}
offload->enable = 0;
err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
if (err < 0) {
NL_SET_ERR_MSG(extack,
"Device failed to disable offload");
goto out;
}
q->offloaded = false;
out:
taprio_offload_free(offload);
return err;
}
/* If full offload is enabled, the only possible clockid is the net device's
* PHC. For that reason, specifying a clockid through netlink is incorrect.
* For txtime-assist, it is implicitly assumed that the device's PHC is kept
* in sync with the specified clockid via a user space daemon such as phc2sys.
* For both software taprio and txtime-assist, the clockid is used for the
* hrtimer that advances the schedule and hence mandatory.
*/
static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
struct netlink_ext_ack *extack)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
int err = -EINVAL;
if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
const struct ethtool_ops *ops = dev->ethtool_ops;
struct ethtool_ts_info info = {
.cmd = ETHTOOL_GET_TS_INFO,
.phc_index = -1,
};
if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
NL_SET_ERR_MSG(extack,
"The 'clockid' cannot be specified for full offload");
goto out;
}
if (ops && ops->get_ts_info)
err = ops->get_ts_info(dev, &info);
if (err || info.phc_index < 0) {
NL_SET_ERR_MSG(extack,
"Device does not have a PTP clock");
err = -ENOTSUPP;
goto out;
}
} else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
enum tk_offsets tk_offset;
/* We only support static clockids and we don't allow
* for it to be modified after the first init.
*/
if (clockid < 0 ||
(q->clockid != -1 && q->clockid != clockid)) {
NL_SET_ERR_MSG(extack,
"Changing the 'clockid' of a running schedule is not supported");
err = -ENOTSUPP;
goto out;
}
switch (clockid) {
case CLOCK_REALTIME:
tk_offset = TK_OFFS_REAL;
break;
case CLOCK_MONOTONIC:
tk_offset = TK_OFFS_MAX;
break;
case CLOCK_BOOTTIME:
tk_offset = TK_OFFS_BOOT;
break;
case CLOCK_TAI:
tk_offset = TK_OFFS_TAI;
break;
default:
NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
err = -EINVAL;
goto out;
}
/* This pairs with READ_ONCE() in taprio_mono_to_any */
WRITE_ONCE(q->tk_offset, tk_offset);
q->clockid = clockid;
} else {
NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
goto out;
}
/* Everything went ok, return success. */
err = 0;
out:
return err;
}
static int taprio_parse_tc_entry(struct Qdisc *sch,
struct nlattr *opt,
u32 max_sdu[TC_QOPT_MAX_QUEUE],
unsigned long *seen_tcs,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
struct net_device *dev = qdisc_dev(sch);
u32 val = 0;
int err, tc;
err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
taprio_tc_policy, extack);
if (err < 0)
return err;
if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) {
NL_SET_ERR_MSG_MOD(extack, "TC entry index missing");
return -EINVAL;
}
tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]);
if (tc >= TC_QOPT_MAX_QUEUE) {
NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range");
return -ERANGE;
}
if (*seen_tcs & BIT(tc)) {
NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry");
return -EINVAL;
}
*seen_tcs |= BIT(tc);
if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU])
val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);
if (val > dev->max_mtu) {
NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
return -ERANGE;
}
max_sdu[tc] = val;
return 0;
}
static int taprio_parse_tc_entries(struct Qdisc *sch,
struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
u32 max_sdu[TC_QOPT_MAX_QUEUE];
unsigned long seen_tcs = 0;
struct nlattr *n;
int tc, rem;
int err = 0;
for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
max_sdu[tc] = q->max_sdu[tc];
nla_for_each_nested(n, opt, rem) {
if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
continue;
err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, extack);
if (err)
goto out;
}
for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
q->max_sdu[tc] = max_sdu[tc];
if (max_sdu[tc])
q->max_frm_len[tc] = max_sdu[tc] + dev->hard_header_len;
else
q->max_frm_len[tc] = U32_MAX; /* never oversized */
}
out:
return err;
}
static int taprio_mqprio_cmp(const struct net_device *dev,
const struct tc_mqprio_qopt *mqprio)
{
int i;
if (!mqprio || mqprio->num_tc != dev->num_tc)
return -1;
for (i = 0; i < mqprio->num_tc; i++)
if (dev->tc_to_txq[i].count != mqprio->count[i] ||
dev->tc_to_txq[i].offset != mqprio->offset[i])
return -1;
for (i = 0; i <= TC_BITMASK; i++)
if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i])
return -1;
return 0;
}
/* The semantics of the 'flags' argument in relation to 'change()'
* requests, are interpreted following two rules (which are applied in
* this order): (1) an omitted 'flags' argument is interpreted as
* zero; (2) the 'flags' of a "running" taprio instance cannot be
* changed.
*/
static int taprio_new_flags(const struct nlattr *attr, u32 old,
struct netlink_ext_ack *extack)
{
u32 new = 0;
if (attr)
new = nla_get_u32(attr);
if (old != TAPRIO_FLAGS_INVALID && old != new) {
NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
return -EOPNOTSUPP;
}
if (!taprio_flags_valid(new)) {
NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
return -EINVAL;
}
return new;
}
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
struct sched_gate_list *oper, *admin, *new_admin;
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct tc_mqprio_qopt *mqprio = NULL;
unsigned long flags;
ktime_t start;
int i, err;
err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
taprio_policy, extack);
if (err < 0)
return err;
if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS],
q->flags, extack);
if (err < 0)
return err;
q->flags = err;
err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags);
if (err < 0)
return err;
err = taprio_parse_tc_entries(sch, opt, extack);
if (err)
return err;
new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
if (!new_admin) {
NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
return -ENOMEM;
}
INIT_LIST_HEAD(&new_admin->entries);
oper = rtnl_dereference(q->oper_sched);
admin = rtnl_dereference(q->admin_sched);
/* no changes - no new mqprio settings */
if (!taprio_mqprio_cmp(dev, mqprio))
mqprio = NULL;
if (mqprio && (oper || admin)) {
NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
err = -ENOTSUPP;
goto free_sched;
}
err = parse_taprio_schedule(q, tb, new_admin, extack);
if (err < 0)
goto free_sched;
if (new_admin->num_entries == 0) {
NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule");
err = -EINVAL;
goto free_sched;
}
err = taprio_parse_clockid(sch, tb, extack);
if (err < 0)
goto free_sched;
taprio_set_picos_per_byte(dev, q);
if (mqprio) {
err = netdev_set_num_tc(dev, mqprio->num_tc);
if (err)
goto free_sched;
for (i = 0; i < mqprio->num_tc; i++)
netdev_set_tc_queue(dev, i,
mqprio->count[i],
mqprio->offset[i]);
/* Always use supplied priority mappings */
for (i = 0; i <= TC_BITMASK; i++)
netdev_set_prio_tc_map(dev, i,
mqprio->prio_tc_map[i]);
}
if (FULL_OFFLOAD_IS_ENABLED(q->flags))
err = taprio_enable_offload(dev, q, new_admin, extack);
else
err = taprio_disable_offload(dev, q, extack);
if (err)
goto free_sched;
/* Protects against enqueue()/dequeue() */
spin_lock_bh(qdisc_lock(sch));
if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) {
if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
err = -EINVAL;
goto unlock;
}
q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
}
if (!TXTIME_ASSIST_IS_ENABLED(q->flags) &&
!FULL_OFFLOAD_IS_ENABLED(q->flags) &&
!hrtimer_active(&q->advance_timer)) {
hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
q->advance_timer.function = advance_sched;
}
err = taprio_get_start_time(sch, new_admin, &start);
if (err < 0) {
NL_SET_ERR_MSG(extack, "Internal error: failed get start time");
goto unlock;
}
setup_txtime(q, new_admin, start);
if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
if (!oper) {
rcu_assign_pointer(q->oper_sched, new_admin);
err = 0;
new_admin = NULL;
goto unlock;
}
rcu_assign_pointer(q->admin_sched, new_admin);
if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
} else {
setup_first_close_time(q, new_admin, start);
/* Protects against advance_sched() */
spin_lock_irqsave(&q->current_entry_lock, flags);
taprio_start_sched(sch, start, new_admin);
rcu_assign_pointer(q->admin_sched, new_admin);
if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
spin_unlock_irqrestore(&q->current_entry_lock, flags);
if (FULL_OFFLOAD_IS_ENABLED(q->flags))
taprio_offload_config_changed(q);
}
new_admin = NULL;
err = 0;
unlock:
spin_unlock_bh(qdisc_lock(sch));
free_sched:
if (new_admin)
call_rcu(&new_admin->rcu, taprio_free_sched_cb);
return err;
}
static void taprio_reset(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
int i;
hrtimer_cancel(&q->advance_timer);
if (q->qdiscs) {
for (i = 0; i < dev->num_tx_queues; i++)
if (q->qdiscs[i])
qdisc_reset(q->qdiscs[i]);
}
}
static void taprio_destroy(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct sched_gate_list *oper, *admin;
unsigned int i;
list_del(&q->taprio_list);
/* Note that taprio_reset() might not be called if an error
* happens in qdisc_create(), after taprio_init() has been called.
*/
hrtimer_cancel(&q->advance_timer);
taprio_disable_offload(dev, q, NULL);
if (q->qdiscs) {
for (i = 0; i < dev->num_tx_queues; i++)
qdisc_put(q->qdiscs[i]);
kfree(q->qdiscs);
}
q->qdiscs = NULL;
netdev_reset_tc(dev);
oper = rtnl_dereference(q->oper_sched);
admin = rtnl_dereference(q->admin_sched);
if (oper)
call_rcu(&oper->rcu, taprio_free_sched_cb);
if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
}
static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
int i;
spin_lock_init(&q->current_entry_lock);
hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
q->advance_timer.function = advance_sched;
q->root = sch;
/* We only support static clockids. Use an invalid value as default
* and get the valid one on taprio_change().
*/
q->clockid = -1;
q->flags = TAPRIO_FLAGS_INVALID;
list_add(&q->taprio_list, &taprio_list);
if (sch->parent != TC_H_ROOT) {
NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc");
return -EOPNOTSUPP;
}
if (!netif_is_multiqueue(dev)) {
NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required");
return -EOPNOTSUPP;
}
/* pre-allocate qdisc, attachment can't fail */
q->qdiscs = kcalloc(dev->num_tx_queues,
sizeof(q->qdiscs[0]),
GFP_KERNEL);
if (!q->qdiscs)
return -ENOMEM;
if (!opt)
return -EINVAL;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *dev_queue;
struct Qdisc *qdisc;
dev_queue = netdev_get_tx_queue(dev, i);
qdisc = qdisc_create_dflt(dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(i + 1)),
extack);
if (!qdisc)
return -ENOMEM;
if (i < dev->real_num_tx_queues)
qdisc_hash_add(qdisc, false);
q->qdiscs[i] = qdisc;
}
return taprio_change(sch, opt, extack);
}
static void taprio_attach(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
unsigned int ntx;
/* Attach underlying qdisc */
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
struct Qdisc *qdisc = q->qdiscs[ntx];
struct Qdisc *old;
if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
} else {
old = dev_graft_qdisc(qdisc->dev_queue, sch);
qdisc_refcount_inc(sch);
}
if (old)
qdisc_put(old);
}
/* access to the child qdiscs is not needed in offload mode */
if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
kfree(q->qdiscs);
q->qdiscs = NULL;
}
}
static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
unsigned long cl)
{
struct net_device *dev = qdisc_dev(sch);
unsigned long ntx = cl - 1;
if (ntx >= dev->num_tx_queues)
return NULL;
return netdev_get_tx_queue(dev, ntx);
}
static int taprio_graft(struct Qdisc *sch, unsigned long cl,
struct Qdisc *new, struct Qdisc **old,
struct netlink_ext_ack *extack)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
if (!dev_queue)
return -EINVAL;
if (dev->flags & IFF_UP)
dev_deactivate(dev);
if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
*old = dev_graft_qdisc(dev_queue, new);
} else {
*old = q->qdiscs[cl - 1];
q->qdiscs[cl - 1] = new;
}
if (new)
new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
if (dev->flags & IFF_UP)
dev_activate(dev);
return 0;
}
static int dump_entry(struct sk_buff *msg,
const struct sched_entry *entry)
{
struct nlattr *item;
item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY);
if (!item)
return -ENOSPC;
if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index))
goto nla_put_failure;
if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command))
goto nla_put_failure;
if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
entry->gate_mask))
goto nla_put_failure;
if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
entry->interval))
goto nla_put_failure;
return nla_nest_end(msg, item);
nla_put_failure:
nla_nest_cancel(msg, item);
return -1;
}
static int dump_schedule(struct sk_buff *msg,
const struct sched_gate_list *root)
{
struct nlattr *entry_list;
struct sched_entry *entry;
if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
root->base_time, TCA_TAPRIO_PAD))
return -1;
if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME,
root->cycle_time, TCA_TAPRIO_PAD))
return -1;
if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION,
root->cycle_time_extension, TCA_TAPRIO_PAD))
return -1;
entry_list = nla_nest_start_noflag(msg,
TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
if (!entry_list)
goto error_nest;
list_for_each_entry(entry, &root->entries, list) {
if (dump_entry(msg, entry) < 0)
goto error_nest;
}
nla_nest_end(msg, entry_list);
return 0;
error_nest:
nla_nest_cancel(msg, entry_list);
return -1;
}
static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
{
struct nlattr *n;
int tc;
for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY);
if (!n)
return -EMSGSIZE;
if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc))
goto nla_put_failure;
if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
q->max_sdu[tc]))
goto nla_put_failure;
nla_nest_end(skb, n);
}
return 0;
nla_put_failure:
nla_nest_cancel(skb, n);
return -EMSGSIZE;
}
static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct sched_gate_list *oper, *admin;
struct tc_mqprio_qopt opt = { 0 };
struct nlattr *nest, *sched_nest;
unsigned int i;
oper = rtnl_dereference(q->oper_sched);
admin = rtnl_dereference(q->admin_sched);
opt.num_tc = netdev_get_num_tc(dev);
memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
for (i = 0; i < netdev_get_num_tc(dev); i++) {
opt.count[i] = dev->tc_to_txq[i].count;
opt.offset[i] = dev->tc_to_txq[i].offset;
}
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto start_error;
if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
goto options_error;
if (!FULL_OFFLOAD_IS_ENABLED(q->flags) &&
nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
goto options_error;
if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
goto options_error;
if (q->txtime_delay &&
nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
goto options_error;
if (taprio_dump_tc_entries(q, skb))
goto options_error;
if (oper && dump_schedule(skb, oper))
goto options_error;
if (!admin)
goto done;
sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED);
if (!sched_nest)
goto options_error;
if (dump_schedule(skb, admin))
goto admin_error;
nla_nest_end(skb, sched_nest);
done:
return nla_nest_end(skb, nest);
admin_error:
nla_nest_cancel(skb, sched_nest);
options_error:
nla_nest_cancel(skb, nest);
start_error:
return -ENOSPC;
}
static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
{
struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
if (!dev_queue)
return NULL;
return dev_queue->qdisc_sleeping;
}
static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
{
unsigned int ntx = TC_H_MIN(classid);
if (!taprio_queue_get(sch, ntx))
return 0;
return ntx;
}
static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle |= TC_H_MIN(cl);
tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
return 0;
}
static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
__releases(d->lock)
__acquires(d->lock)
{
struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 ||
qdisc_qstats_copy(d, sch) < 0)
return -1;
return 0;
}
static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct net_device *dev = qdisc_dev(sch);
unsigned long ntx;
if (arg->stop)
return;
arg->count = arg->skip;
for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
if (!tc_qdisc_stats_dump(sch, ntx + 1, arg))
break;
}
}
static struct netdev_queue *taprio_select_queue(struct Qdisc *sch,
struct tcmsg *tcm)
{
return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
}
static const struct Qdisc_class_ops taprio_class_ops = {
.graft = taprio_graft,
.leaf = taprio_leaf,
.find = taprio_find,
.walk = taprio_walk,
.dump = taprio_dump_class,
.dump_stats = taprio_dump_class_stats,
.select_queue = taprio_select_queue,
};
static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
.cl_ops = &taprio_class_ops,
.id = "taprio",
.priv_size = sizeof(struct taprio_sched),
.init = taprio_init,
.change = taprio_change,
.destroy = taprio_destroy,
.reset = taprio_reset,
.attach = taprio_attach,
.peek = taprio_peek,
.dequeue = taprio_dequeue,
.enqueue = taprio_enqueue,
.dump = taprio_dump,
.owner = THIS_MODULE,
};
static struct notifier_block taprio_device_notifier = {
.notifier_call = taprio_dev_notifier,
};
static int __init taprio_module_init(void)
{
int err = register_netdevice_notifier(&taprio_device_notifier);
if (err)
return err;
return register_qdisc(&taprio_qdisc_ops);
}
static void __exit taprio_module_exit(void)
{
unregister_qdisc(&taprio_qdisc_ops);
unregister_netdevice_notifier(&taprio_device_notifier);
}
module_init(taprio_module_init);
module_exit(taprio_module_exit);
MODULE_LICENSE("GPL");