1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * net/sched/sch_api.c Packet scheduler API.
5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
29 #include <net/net_namespace.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
35 #include <trace/events/qdisc.h>
   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform some sanity
   checks and the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.
   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If the packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore
   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
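
/* Illustrative sketch (not part of the original file): how a hypothetical
 * minimal "queue"-type discipline could wire the routines described above
 * into a struct Qdisc_ops and register it. All "example_*" identifiers are
 * made up; the qdisc_* helpers are the generic FIFO helpers from
 * <net/sch_generic.h>, and sch->limit would be set by ->init().
 */
#if 0	/* example only */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			   struct sk_buff **to_free)
{
	if (likely(sch->q.qlen < READ_ONCE(sch->limit)))
		return qdisc_enqueue_tail(skb, sch);	/* NET_XMIT_SUCCESS (0) */
	return qdisc_drop(skb, sch, to_free);		/* NET_XMIT_DROP */
}

static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= qdisc_dequeue_head,	/* may return NULL: nothing to send now */
	.peek		= qdisc_peek_head,	/* like dequeue, but non-destructive */
	.reset		= qdisc_reset_queue,	/* purge buffers, keep statistics */
	.owner		= THIS_MODULE,
};

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}
module_init(example_module_init);
module_exit(example_module_exit);
#endif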
/* Protects the list of registered TC modules. It is a pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
119 /************************************************
120 * Queueing disciplines manipulation. *
121 ************************************************/
124 /* The list of all installed queueing disciplines. */
126 static struct Qdisc_ops *qdisc_base;
128 /* Register/unregister queueing discipline */
130 int register_qdisc(struct Qdisc_ops *qops)
132 struct Qdisc_ops *q, **qp;
135 write_lock(&qdisc_mod_lock);
136 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137 if (!strcmp(qops->id, q->id))
140 if (qops->enqueue == NULL)
141 qops->enqueue = noop_qdisc_ops.enqueue;
142 if (qops->peek == NULL) {
143 if (qops->dequeue == NULL)
144 qops->peek = noop_qdisc_ops.peek;
148 if (qops->dequeue == NULL)
149 qops->dequeue = noop_qdisc_ops.dequeue;
152 const struct Qdisc_class_ops *cops = qops->cl_ops;
154 if (!(cops->find && cops->walk && cops->leaf))
157 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
165 write_unlock(&qdisc_mod_lock);
172 EXPORT_SYMBOL(register_qdisc);
174 int unregister_qdisc(struct Qdisc_ops *qops)
176 struct Qdisc_ops *q, **qp;
179 write_lock(&qdisc_mod_lock);
180 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
188 write_unlock(&qdisc_mod_lock);
191 EXPORT_SYMBOL(unregister_qdisc);
193 /* Get default qdisc if not otherwise specified */
194 void qdisc_get_default(char *name, size_t len)
196 read_lock(&qdisc_mod_lock);
197 strlcpy(name, default_qdisc_ops->id, len);
198 read_unlock(&qdisc_mod_lock);
201 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
203 struct Qdisc_ops *q = NULL;
205 for (q = qdisc_base; q; q = q->next) {
206 if (!strcmp(name, q->id)) {
207 if (!try_module_get(q->owner))
216 /* Set new default qdisc to use */
217 int qdisc_set_default(const char *name)
219 const struct Qdisc_ops *ops;
221 if (!capable(CAP_NET_ADMIN))
224 write_lock(&qdisc_mod_lock);
225 ops = qdisc_lookup_default(name);
227 /* Not found, drop lock and try to load module */
228 write_unlock(&qdisc_mod_lock);
229 request_module("sch_%s", name);
230 write_lock(&qdisc_mod_lock);
232 ops = qdisc_lookup_default(name);
236 /* Set new default */
237 module_put(default_qdisc_ops->owner);
238 default_qdisc_ops = ops;
240 write_unlock(&qdisc_mod_lock);
242 return ops ? 0 : -ENOENT;
245 #ifdef CONFIG_NET_SCH_DEFAULT
246 /* Set default value from kernel config */
247 static int __init sch_default_qdisc(void)
249 return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
251 late_initcall(sch_default_qdisc);
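
/* Note (illustrative): besides the build-time default selected above, the
 * default qdisc can also be changed at runtime; e.g. writing "fq_codel" to
 * /proc/sys/net/core/default_qdisc ends up calling qdisc_set_default() via
 * the sysctl handler in net/core/sysctl_net_core.c.
 */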
/* We know the handle. Find the qdisc among all qdiscs attached to the device
 * (the root qdisc, all its children, children of children, etc.)
 * Note: the caller must hold either the RTNL lock or rcu_read_lock().
 */
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 if (!qdisc_dev(root))
264 return (root->handle == handle ? root : NULL);
266 if (!(root->flags & TCQ_F_BUILTIN) &&
267 root->handle == handle)
270 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
271 if (q->handle == handle)
277 void qdisc_hash_add(struct Qdisc *q, bool invisible)
279 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
281 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
283 q->flags |= TCQ_F_INVISIBLE;
286 EXPORT_SYMBOL(qdisc_hash_add);
288 void qdisc_hash_del(struct Qdisc *q)
290 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
292 hash_del_rcu(&q->hash);
295 EXPORT_SYMBOL(qdisc_hash_del);
297 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
303 q = qdisc_match_from_root(dev->qdisc, handle);
307 if (dev_ingress_queue(dev))
308 q = qdisc_match_from_root(
309 dev_ingress_queue(dev)->qdisc_sleeping,
315 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
317 struct netdev_queue *nq;
322 q = qdisc_match_from_root(dev->qdisc, handle);
326 nq = dev_ingress_queue_rcu(dev);
328 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
333 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
336 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
340 cl = cops->find(p, classid);
344 return cops->leaf(p, cl);
347 /* Find queueing discipline by name */
349 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
351 struct Qdisc_ops *q = NULL;
354 read_lock(&qdisc_mod_lock);
355 for (q = qdisc_base; q; q = q->next) {
356 if (nla_strcmp(kind, q->id) == 0) {
357 if (!try_module_get(q->owner))
362 read_unlock(&qdisc_mod_lock);
/* The linklayer setting was not transferred from older versions of iproute2,
 * and the rate table lookup system has been dropped from the kernel. To stay
 * backward compatible with older iproute2 tc utilities, we detect the
 * linklayer setting by checking whether the rate table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value. The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find the low and high table entries for
 * mapping this cell. If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing the two entries.
 */
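
/* Worked example (illustrative): with mpu == 0 and cell_log == 3,
 * low = roundup(0, 48) = 0 and high = roundup(1, 48) = 48, giving
 * cell_low = 0 and cell_high = (48 >> 3) - 1 = 5. Packet sizes 0..47
 * all fit in a single 48-byte ATM cell, so an ATM-aligned table has
 * rtab[0] == rtab[5], while a plain Ethernet table normally does not.
 */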
384 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
386 int low = roundup(r->mpu, 48);
387 int high = roundup(low+1, 48);
388 int cell_low = low >> r->cell_log;
389 int cell_high = (high >> r->cell_log) - 1;
391 /* rtab is too inaccurate at rates > 100Mbit/s */
392 if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
393 pr_debug("TC linklayer: Giving up ATM detection\n");
394 return TC_LINKLAYER_ETHERNET;
397 if ((cell_high > cell_low) && (cell_high < 256)
398 && (rtab[cell_low] == rtab[cell_high])) {
399 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
400 cell_low, cell_high, rtab[cell_high]);
401 return TC_LINKLAYER_ATM;
403 return TC_LINKLAYER_ETHERNET;
406 static struct qdisc_rate_table *qdisc_rtab_list;
408 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
410 struct netlink_ext_ack *extack)
412 struct qdisc_rate_table *rtab;
414 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
415 nla_len(tab) != TC_RTAB_SIZE) {
416 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
420 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
421 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
422 !memcmp(&rtab->data, nla_data(tab), 1024)) {
428 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
432 memcpy(rtab->data, nla_data(tab), 1024);
433 if (r->linklayer == TC_LINKLAYER_UNAWARE)
434 r->linklayer = __detect_linklayer(r, rtab->data);
435 rtab->next = qdisc_rtab_list;
436 qdisc_rtab_list = rtab;
438 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
442 EXPORT_SYMBOL(qdisc_get_rtab);
444 void qdisc_put_rtab(struct qdisc_rate_table *tab)
446 struct qdisc_rate_table *rtab, **rtabp;
448 if (!tab || --tab->refcnt)
451 for (rtabp = &qdisc_rtab_list;
452 (rtab = *rtabp) != NULL;
453 rtabp = &rtab->next) {
461 EXPORT_SYMBOL(qdisc_put_rtab);
463 static LIST_HEAD(qdisc_stab_list);
465 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
466 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
467 [TCA_STAB_DATA] = { .type = NLA_BINARY },
470 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
471 struct netlink_ext_ack *extack)
473 struct nlattr *tb[TCA_STAB_MAX + 1];
474 struct qdisc_size_table *stab;
475 struct tc_sizespec *s;
476 unsigned int tsize = 0;
480 err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
484 if (!tb[TCA_STAB_BASE]) {
485 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
486 return ERR_PTR(-EINVAL);
489 s = nla_data(tb[TCA_STAB_BASE]);
492 if (!tb[TCA_STAB_DATA]) {
493 NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
494 return ERR_PTR(-EINVAL);
496 tab = nla_data(tb[TCA_STAB_DATA]);
497 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
500 if (tsize != s->tsize || (!tab && tsize > 0)) {
501 NL_SET_ERR_MSG(extack, "Invalid size of size table");
502 return ERR_PTR(-EINVAL);
505 list_for_each_entry(stab, &qdisc_stab_list, list) {
506 if (memcmp(&stab->szopts, s, sizeof(*s)))
508 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
514 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
516 return ERR_PTR(-ENOMEM);
521 memcpy(stab->data, tab, tsize * sizeof(u16));
523 list_add_tail(&stab->list, &qdisc_stab_list);
528 void qdisc_put_stab(struct qdisc_size_table *tab)
533 if (--tab->refcnt == 0) {
534 list_del(&tab->list);
538 EXPORT_SYMBOL(qdisc_put_stab);
540 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
544 nest = nla_nest_start_noflag(skb, TCA_STAB);
546 goto nla_put_failure;
547 if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
548 goto nla_put_failure;
549 nla_nest_end(skb, nest);
557 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
558 const struct qdisc_size_table *stab)
562 pkt_len = skb->len + stab->szopts.overhead;
563 if (unlikely(!stab->szopts.tsize))
566 slot = pkt_len + stab->szopts.cell_align;
567 if (unlikely(slot < 0))
570 slot >>= stab->szopts.cell_log;
571 if (likely(slot < stab->szopts.tsize))
572 pkt_len = stab->data[slot];
574 pkt_len = stab->data[stab->szopts.tsize - 1] *
575 (slot / stab->szopts.tsize) +
576 stab->data[slot % stab->szopts.tsize];
578 pkt_len <<= stab->szopts.size_log;
580 if (unlikely(pkt_len < 1))
582 qdisc_skb_cb(skb)->pkt_len = pkt_len;
584 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
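
/* Worked example (illustrative, hypothetical size table): with overhead = 0,
 * cell_align = 0, cell_log = 6 and size_log = 0, each table slot covers
 * 64 bytes of packet length. A 1400-byte skb gives slot = 1400 >> 6 = 21,
 * and pkt_len becomes stab->data[21] (e.g. 1408 if the table rounds every
 * length up to the next 64-byte boundary); that value, not skb->len, is then
 * used for byte-based accounting via qdisc_pkt_len().
 */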
586 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
588 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
589 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
590 txt, qdisc->ops->id, qdisc->handle >> 16);
591 qdisc->flags |= TCQ_F_WARN_NONWC;
594 EXPORT_SYMBOL(qdisc_warn_nonwc);
596 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
598 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
602 __netif_schedule(qdisc_root(wd->qdisc));
605 return HRTIMER_NORESTART;
608 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
611 hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
612 wd->timer.function = qdisc_watchdog;
615 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
617 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
619 qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
621 EXPORT_SYMBOL(qdisc_watchdog_init);
623 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
626 if (test_bit(__QDISC_STATE_DEACTIVATED,
627 &qdisc_root_sleeping(wd->qdisc)->state))
630 if (hrtimer_is_queued(&wd->timer)) {
631 /* If timer is already set in [expires, expires + delta_ns],
632 * do not reprogram it.
634 if (wd->last_expires - expires <= delta_ns)
638 wd->last_expires = expires;
639 hrtimer_start_range_ns(&wd->timer,
640 ns_to_ktime(expires),
642 HRTIMER_MODE_ABS_PINNED);
644 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
646 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
648 hrtimer_cancel(&wd->timer);
650 EXPORT_SYMBOL(qdisc_watchdog_cancel);
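
/* Illustrative sketch (not part of the original file): how a shaping qdisc
 * (cf. sch_tbf) typically uses the watchdog - arm it from ->dequeue() when
 * the head packet is not yet allowed to leave. "example_sched_data" and
 * "next_tx_time" are made-up names; the expiry would be computed by the
 * shaper, and the watchdog initialized with qdisc_watchdog_init() in ->init().
 */
#if 0	/* example only */
struct example_sched_data {
	struct qdisc_watchdog watchdog;
	u64 next_tx_time;		/* earliest allowed departure, in ns */
};

static struct sk_buff *example_shaper_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = qdisc_peek_head(sch);

	if (!skb)
		return NULL;
	if (ktime_get_ns() < q->next_tx_time) {
		/* Not eligible yet: sleep until the packet may leave.
		 * Returning NULL here does NOT mean the queue is empty.
		 */
		qdisc_watchdog_schedule_ns(&q->watchdog, q->next_tx_time);
		return NULL;
	}
	return qdisc_dequeue_head(sch);
}
#endif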
652 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
654 struct hlist_head *h;
657 h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
660 for (i = 0; i < n; i++)
661 INIT_HLIST_HEAD(&h[i]);
666 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
668 struct Qdisc_class_common *cl;
669 struct hlist_node *next;
670 struct hlist_head *nhash, *ohash;
671 unsigned int nsize, nmask, osize;
674 /* Rehash when load factor exceeds 0.75 */
675 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
677 nsize = clhash->hashsize * 2;
679 nhash = qdisc_class_hash_alloc(nsize);
683 ohash = clhash->hash;
684 osize = clhash->hashsize;
687 for (i = 0; i < osize; i++) {
688 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
689 h = qdisc_class_hash(cl->classid, nmask);
690 hlist_add_head(&cl->hnode, &nhash[h]);
693 clhash->hash = nhash;
694 clhash->hashsize = nsize;
695 clhash->hashmask = nmask;
696 sch_tree_unlock(sch);
700 EXPORT_SYMBOL(qdisc_class_hash_grow);
702 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
704 unsigned int size = 4;
706 clhash->hash = qdisc_class_hash_alloc(size);
709 clhash->hashsize = size;
710 clhash->hashmask = size - 1;
711 clhash->hashelems = 0;
714 EXPORT_SYMBOL(qdisc_class_hash_init);
716 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
718 kvfree(clhash->hash);
720 EXPORT_SYMBOL(qdisc_class_hash_destroy);
722 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
723 struct Qdisc_class_common *cl)
727 INIT_HLIST_NODE(&cl->hnode);
728 h = qdisc_class_hash(cl->classid, clhash->hashmask);
729 hlist_add_head(&cl->hnode, &clhash->hash[h]);
732 EXPORT_SYMBOL(qdisc_class_hash_insert);
734 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
735 struct Qdisc_class_common *cl)
737 hlist_del(&cl->hnode);
740 EXPORT_SYMBOL(qdisc_class_hash_remove);
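
/* Illustrative sketch (not part of the original file): a classful qdisc
 * (cf. sch_htb) embeds a struct Qdisc_class_common in its per-class state
 * and drives the helpers above when classes are created. All "example_*"
 * names are made up.
 */
#if 0	/* example only */
struct example_class {
	struct Qdisc_class_common common;	/* classid + hash linkage */
	/* ... per-class scheduling state ... */
};

static struct example_class *example_find_class(struct Qdisc_class_hash *clhash,
						u32 classid)
{
	struct Qdisc_class_common *cl = qdisc_class_find(clhash, classid);

	return cl ? container_of(cl, struct example_class, common) : NULL;
}

static void example_attach_class(struct Qdisc *sch, struct Qdisc_class_hash *clhash,
				 struct example_class *cl, u32 classid)
{
	cl->common.classid = classid;
	sch_tree_lock(sch);
	qdisc_class_hash_insert(clhash, &cl->common);
	sch_tree_unlock(sch);
	/* May rehash; called outside the tree lock, as in existing users. */
	qdisc_class_hash_grow(sch, clhash);
}
#endif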
/* Allocate a unique handle from the space managed by the kernel.
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
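/* For example, a qdisc added without an explicit handle (e.g.
 * "tc qdisc add dev eth0 root htb") typically shows up as 8001:, the next
 * automatic allocation as 8002:, and so on; the counter is global and
 * handles already in use on the device are skipped (illustrative iproute2
 * invocation).
 */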
745 static u32 qdisc_alloc_handle(struct net_device *dev)
748 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
751 autohandle += TC_H_MAKE(0x10000U, 0);
752 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
753 autohandle = TC_H_MAKE(0x80000000U, 0);
754 if (!qdisc_lookup(dev, autohandle))
762 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
764 bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
765 const struct Qdisc_class_ops *cops;
771 if (n == 0 && len == 0)
773 drops = max_t(int, n, 0);
775 while ((parentid = sch->parent)) {
776 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
779 if (sch->flags & TCQ_F_NOPARENT)
		/* Notify the parent qdisc only if the child qdisc becomes empty.
		 *
		 * If the child was empty even before the update then the backlog
		 * counter is inconsistent and we skip the notification because
		 * the parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
790 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
791 !qdisc_is_offloaded);
792 /* TODO: perform the search on a per txq basis */
793 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
795 WARN_ON_ONCE(parentid != TC_H_ROOT);
798 cops = sch->ops->cl_ops;
799 if (notify && cops->qlen_notify) {
800 cl = cops->find(sch, parentid);
801 cops->qlen_notify(sch, cl);
804 sch->qstats.backlog -= len;
805 __qdisc_qstats_drop(sch, drops);
809 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
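
/* Illustrative sketch (not part of the original file): a leaf qdisc that
 * trims its queue (e.g. when a smaller limit is configured) reports the
 * removed packets/bytes up the tree with qdisc_tree_reduce_backlog(), so
 * that ancestor qlen/backlog counters stay consistent. Names are made up.
 */
#if 0	/* example only */
static void example_trim_queue(struct Qdisc *sch, u32 new_limit)
{
	unsigned int dropped_pkts = 0, dropped_bytes = 0;

	while (sch->q.qlen > new_limit) {
		struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);

		if (!skb)
			break;
		dropped_bytes += qdisc_pkt_len(skb);
		dropped_pkts++;
		qdisc_qstats_backlog_dec(sch, skb);
		rtnl_qdisc_drop(skb, sch);
	}
	/* Tell the ancestors how many packets/bytes vanished below them. */
	qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
}
#endif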
811 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
814 struct net_device *dev = qdisc_dev(sch);
817 sch->flags &= ~TCQ_F_OFFLOADED;
818 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
821 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
822 if (err == -EOPNOTSUPP)
826 sch->flags |= TCQ_F_OFFLOADED;
830 EXPORT_SYMBOL(qdisc_offload_dump_helper);
832 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
833 struct Qdisc *new, struct Qdisc *old,
834 enum tc_setup_type type, void *type_data,
835 struct netlink_ext_ack *extack)
837 bool any_qdisc_is_offloaded;
840 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
843 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
845 /* Don't report error if the graft is part of destroy operation. */
846 if (!err || !new || new == &noop_qdisc)
849 /* Don't report error if the parent, the old child and the new
850 * one are not offloaded.
852 any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
853 any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
854 any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
856 if (any_qdisc_is_offloaded)
857 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
859 EXPORT_SYMBOL(qdisc_offload_graft_helper);
861 static void qdisc_offload_graft_root(struct net_device *dev,
862 struct Qdisc *new, struct Qdisc *old,
863 struct netlink_ext_ack *extack)
865 struct tc_root_qopt_offload graft_offload = {
866 .command = TC_ROOT_GRAFT,
867 .handle = new ? new->handle : 0,
868 .ingress = (new && new->flags & TCQ_F_INGRESS) ||
869 (old && old->flags & TCQ_F_INGRESS),
872 qdisc_offload_graft_helper(dev, NULL, new, old,
873 TC_SETUP_ROOT_QDISC, &graft_offload, extack);
876 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
877 u32 portid, u32 seq, u16 flags, int event)
879 struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
880 struct gnet_stats_queue __percpu *cpu_qstats = NULL;
882 struct nlmsghdr *nlh;
883 unsigned char *b = skb_tail_pointer(skb);
885 struct qdisc_size_table *stab;
890 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
893 tcm = nlmsg_data(nlh);
894 tcm->tcm_family = AF_UNSPEC;
897 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
898 tcm->tcm_parent = clid;
899 tcm->tcm_handle = q->handle;
900 tcm->tcm_info = refcount_read(&q->refcnt);
901 if (nla_put_string(skb, TCA_KIND, q->ops->id))
902 goto nla_put_failure;
903 if (q->ops->ingress_block_get) {
904 block_index = q->ops->ingress_block_get(q);
906 nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
907 goto nla_put_failure;
909 if (q->ops->egress_block_get) {
910 block_index = q->ops->egress_block_get(q);
912 nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
913 goto nla_put_failure;
915 if (q->ops->dump && q->ops->dump(q, skb) < 0)
916 goto nla_put_failure;
917 if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
918 goto nla_put_failure;
919 qlen = qdisc_qlen_sum(q);
921 stab = rtnl_dereference(q->stab);
922 if (stab && qdisc_dump_stab(skb, stab) < 0)
923 goto nla_put_failure;
925 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
926 NULL, &d, TCA_PAD) < 0)
927 goto nla_put_failure;
929 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
930 goto nla_put_failure;
932 if (qdisc_is_percpu_stats(q)) {
933 cpu_bstats = q->cpu_bstats;
934 cpu_qstats = q->cpu_qstats;
937 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
938 &d, cpu_bstats, &q->bstats) < 0 ||
939 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
940 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
941 goto nla_put_failure;
943 if (gnet_stats_finish_copy(&d) < 0)
944 goto nla_put_failure;
946 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
955 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
957 if (q->flags & TCQ_F_BUILTIN)
959 if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
965 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
966 struct nlmsghdr *n, u32 clid,
967 struct Qdisc *old, struct Qdisc *new)
970 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
972 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
976 if (old && !tc_qdisc_dump_ignore(old, false)) {
977 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
978 0, RTM_DELQDISC) < 0)
981 if (new && !tc_qdisc_dump_ignore(new, false)) {
982 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
983 old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
988 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
989 n->nlmsg_flags & NLM_F_ECHO);
996 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
997 struct nlmsghdr *n, u32 clid,
998 struct Qdisc *old, struct Qdisc *new)
1001 qdisc_notify(net, skb, n, clid, old, new);
1007 static void qdisc_clear_nolock(struct Qdisc *sch)
1009 sch->flags &= ~TCQ_F_NOLOCK;
1010 if (!(sch->flags & TCQ_F_CPUSTATS))
1013 free_percpu(sch->cpu_bstats);
1014 free_percpu(sch->cpu_qstats);
1015 sch->cpu_bstats = NULL;
1016 sch->cpu_qstats = NULL;
1017 sch->flags &= ~TCQ_F_CPUSTATS;
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to the device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy the old qdisc.
 */
1029 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1030 struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1031 struct Qdisc *new, struct Qdisc *old,
1032 struct netlink_ext_ack *extack)
1034 struct Qdisc *q = old;
1035 struct net *net = dev_net(dev);
1037 if (parent == NULL) {
1038 unsigned int i, num_q, ingress;
1041 num_q = dev->num_tx_queues;
1042 if ((q && q->flags & TCQ_F_INGRESS) ||
1043 (new && new->flags & TCQ_F_INGRESS)) {
1046 if (!dev_ingress_queue(dev)) {
1047 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1052 if (dev->flags & IFF_UP)
1053 dev_deactivate(dev);
1055 qdisc_offload_graft_root(dev, new, old, extack);
1057 if (new && new->ops->attach)
1060 for (i = 0; i < num_q; i++) {
1061 struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1064 dev_queue = netdev_get_tx_queue(dev, i);
1066 old = dev_graft_qdisc(dev_queue, new);
1068 qdisc_refcount_inc(new);
1076 notify_and_destroy(net, skb, n, classid,
1078 if (new && !new->ops->attach)
1079 qdisc_refcount_inc(new);
1080 dev->qdisc = new ? : &noop_qdisc;
1082 if (new && new->ops->attach)
1083 new->ops->attach(new);
1085 notify_and_destroy(net, skb, n, classid, old, new);
1088 if (dev->flags & IFF_UP)
1091 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1095 /* Only support running class lockless if parent is lockless */
1096 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1097 qdisc_clear_nolock(new);
1099 if (!cops || !cops->graft)
1102 cl = cops->find(parent, classid);
1104 NL_SET_ERR_MSG(extack, "Specified class not found");
1108 err = cops->graft(parent, cl, new, &old, extack);
1111 notify_and_destroy(net, skb, n, classid, old, new);
1116 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1117 struct netlink_ext_ack *extack)
1121 if (tca[TCA_INGRESS_BLOCK]) {
1122 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1125 NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1128 if (!sch->ops->ingress_block_set) {
1129 NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1132 sch->ops->ingress_block_set(sch, block_index);
1134 if (tca[TCA_EGRESS_BLOCK]) {
1135 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1138 NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1141 if (!sch->ops->egress_block_set) {
1142 NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1145 sch->ops->egress_block_set(sch, block_index);
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
1156 static struct Qdisc *qdisc_create(struct net_device *dev,
1157 struct netdev_queue *dev_queue,
1158 struct Qdisc *p, u32 parent, u32 handle,
1159 struct nlattr **tca, int *errp,
1160 struct netlink_ext_ack *extack)
1163 struct nlattr *kind = tca[TCA_KIND];
1165 struct Qdisc_ops *ops;
1166 struct qdisc_size_table *stab;
1168 ops = qdisc_lookup_ops(kind);
1169 #ifdef CONFIG_MODULES
1170 if (ops == NULL && kind != NULL) {
1171 char name[IFNAMSIZ];
1172 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1173 /* We dropped the RTNL semaphore in order to
1174 * perform the module load. So, even if we
1175 * succeeded in loading the module we have to
1176 * tell the caller to replay the request. We
1177 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
1182 request_module("sch_%s", name);
1184 ops = qdisc_lookup_ops(kind);
			/* We will try qdisc_lookup_ops() again,
			 * so don't keep a reference.
			 */
1189 module_put(ops->owner);
1199 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1203 sch = qdisc_alloc(dev_queue, ops, extack);
1209 sch->parent = parent;
1211 if (handle == TC_H_INGRESS) {
1212 sch->flags |= TCQ_F_INGRESS;
1213 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1216 handle = qdisc_alloc_handle(dev);
1218 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1223 if (!netif_is_multiqueue(dev))
1224 sch->flags |= TCQ_F_ONETXQUEUE;
1227 sch->handle = handle;
	/* This exists to stay backward compatible with a userspace
	 * loophole, which allowed userspace to get IFF_NO_QUEUE
	 * behaviour on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
1235 if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1236 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1237 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1240 err = qdisc_block_indexes_set(sch, tca, extack);
1245 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1250 if (tca[TCA_STAB]) {
1251 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1253 err = PTR_ERR(stab);
1256 rcu_assign_pointer(sch->stab, stab);
1258 if (tca[TCA_RATE]) {
1259 seqcount_t *running;
1262 if (sch->flags & TCQ_F_MQROOT) {
1263 NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1267 if (sch->parent != TC_H_ROOT &&
1268 !(sch->flags & TCQ_F_INGRESS) &&
1269 (!p || !(p->flags & TCQ_F_MQROOT)))
1270 running = qdisc_root_sleeping_running(sch);
1272 running = &sch->running;
1274 err = gen_new_estimator(&sch->bstats,
1281 NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1286 qdisc_hash_add(sch, false);
1287 trace_qdisc_create(ops, dev, parent);
1292 /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1299 module_put(ops->owner);
	/*
	 * Any broken qdiscs that would require an ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
1309 qdisc_put_stab(rtnl_dereference(sch->stab));
1315 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1316 struct netlink_ext_ack *extack)
1318 struct qdisc_size_table *ostab, *stab = NULL;
1321 if (tca[TCA_OPTIONS]) {
1322 if (!sch->ops->change) {
1323 NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1326 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1327 NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1330 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1335 if (tca[TCA_STAB]) {
1336 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1338 return PTR_ERR(stab);
1341 ostab = rtnl_dereference(sch->stab);
1342 rcu_assign_pointer(sch->stab, stab);
1343 qdisc_put_stab(ostab);
1345 if (tca[TCA_RATE]) {
1346 /* NB: ignores errors from replace_estimator
1347 because change can't be undone. */
1348 if (sch->flags & TCQ_F_MQROOT)
1350 gen_replace_estimator(&sch->bstats,
1354 qdisc_root_sleeping_running(sch),
1361 struct check_loop_arg {
1362 struct qdisc_walker w;
1367 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1368 struct qdisc_walker *w);
1370 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1372 struct check_loop_arg arg;
1374 if (q->ops->cl_ops == NULL)
1377 arg.w.stop = arg.w.skip = arg.w.count = 0;
1378 arg.w.fn = check_loop_fn;
1381 q->ops->cl_ops->walk(q, &arg.w);
1382 return arg.w.stop ? -ELOOP : 0;
1386 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1389 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1390 struct check_loop_arg *arg = (struct check_loop_arg *)w;
1392 leaf = cops->leaf(q, cl);
1394 if (leaf == arg->p || arg->depth > 7)
1396 return check_loop(leaf, arg->p, arg->depth + 1);
1401 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1402 [TCA_KIND] = { .type = NLA_STRING },
1403 [TCA_RATE] = { .type = NLA_BINARY,
1404 .len = sizeof(struct tc_estimator) },
1405 [TCA_STAB] = { .type = NLA_NESTED },
1406 [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG },
1407 [TCA_CHAIN] = { .type = NLA_U32 },
1408 [TCA_INGRESS_BLOCK] = { .type = NLA_U32 },
1409 [TCA_EGRESS_BLOCK] = { .type = NLA_U32 },
1416 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1417 struct netlink_ext_ack *extack)
1419 struct net *net = sock_net(skb->sk);
1420 struct tcmsg *tcm = nlmsg_data(n);
1421 struct nlattr *tca[TCA_MAX + 1];
1422 struct net_device *dev;
1424 struct Qdisc *q = NULL;
1425 struct Qdisc *p = NULL;
1428 if ((n->nlmsg_type != RTM_GETQDISC) &&
1429 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1432 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1433 rtm_tca_policy, extack);
1437 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1441 clid = tcm->tcm_parent;
1443 if (clid != TC_H_ROOT) {
1444 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1445 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1447 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1450 q = qdisc_leaf(p, clid);
1451 } else if (dev_ingress_queue(dev)) {
1452 q = dev_ingress_queue(dev)->qdisc_sleeping;
1458 NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1462 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1463 NL_SET_ERR_MSG(extack, "Invalid handle");
1467 q = qdisc_lookup(dev, tcm->tcm_handle);
1469 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1474 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1475 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1479 if (n->nlmsg_type == RTM_DELQDISC) {
1481 NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1484 if (q->handle == 0) {
1485 NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1488 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1492 qdisc_notify(net, skb, n, clid, NULL, q);
1498 * Create/change qdisc.
1501 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1502 struct netlink_ext_ack *extack)
1504 struct net *net = sock_net(skb->sk);
1506 struct nlattr *tca[TCA_MAX + 1];
1507 struct net_device *dev;
1509 struct Qdisc *q, *p;
1512 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1516 /* Reinit, just in case something touches this. */
1517 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1518 rtm_tca_policy, extack);
1522 tcm = nlmsg_data(n);
1523 clid = tcm->tcm_parent;
1526 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1532 if (clid != TC_H_ROOT) {
1533 if (clid != TC_H_INGRESS) {
1534 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1536 NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1539 q = qdisc_leaf(p, clid);
1540 } else if (dev_ingress_queue_create(dev)) {
1541 q = dev_ingress_queue(dev)->qdisc_sleeping;
1547 /* It may be default qdisc, ignore it */
1548 if (q && q->handle == 0)
1551 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1552 if (tcm->tcm_handle) {
1553 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1554 NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1557 if (TC_H_MIN(tcm->tcm_handle)) {
1558 NL_SET_ERR_MSG(extack, "Invalid minor handle");
1561 q = qdisc_lookup(dev, tcm->tcm_handle);
1563 goto create_n_graft;
1564 if (n->nlmsg_flags & NLM_F_EXCL) {
1565 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1568 if (tca[TCA_KIND] &&
1569 nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1570 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1574 (p && check_loop(q, p, 0))) {
1575 NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1578 qdisc_refcount_inc(q);
1582 goto create_n_graft;
			/* This magic test requires explanation.
			 *
			 *   We know that some child q is already
			 *   attached to this parent and we have a choice:
			 *   either to change it or to create/graft a new one.
			 *
			 *   1. We are allowed to create/graft only
			 *   if both CREATE and REPLACE flags are set.
			 *
			 *   2. If EXCL is set, the requestor wanted to say
			 *   that the qdisc tcm_handle is not expected
			 *   to exist, so we choose create/graft too.
			 *
			 *   3. The last case is when no flags are set.
			 *   Alas, it is a sort of hole in the API: we
			 *   cannot decide what to do unambiguously.
			 *   For now we select create/graft if the
			 *   user gave a KIND which does not match the existing one.
			 */
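			/* For example, with an htb qdisc already attached at the
			 * root, "tc qdisc replace dev eth0 root fq_codel" (no handle
			 * given) arrives here with NLM_F_CREATE|NLM_F_REPLACE set and
			 * a KIND that differs from the existing qdisc, so the test
			 * below falls through to create_n_graft (illustrative
			 * iproute2 invocation).
			 */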
1603 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1604 (n->nlmsg_flags & NLM_F_REPLACE) &&
1605 ((n->nlmsg_flags & NLM_F_EXCL) ||
1607 nla_strcmp(tca[TCA_KIND], q->ops->id))))
1608 goto create_n_graft;
1612 if (!tcm->tcm_handle) {
1613 NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1616 q = qdisc_lookup(dev, tcm->tcm_handle);
1619 /* Change qdisc parameters */
1621 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1624 if (n->nlmsg_flags & NLM_F_EXCL) {
1625 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1628 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1629 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1632 err = qdisc_change(q, tca, extack);
1634 qdisc_notify(net, skb, n, clid, NULL, q);
1638 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1639 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1642 if (clid == TC_H_INGRESS) {
1643 if (dev_ingress_queue(dev)) {
1644 q = qdisc_create(dev, dev_ingress_queue(dev), p,
1645 tcm->tcm_parent, tcm->tcm_parent,
1648 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1652 struct netdev_queue *dev_queue;
1654 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1655 dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1657 dev_queue = p->dev_queue;
1659 dev_queue = netdev_get_tx_queue(dev, 0);
1661 q = qdisc_create(dev, dev_queue, p,
1662 tcm->tcm_parent, tcm->tcm_handle,
1672 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1682 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1683 struct netlink_callback *cb,
1684 int *q_idx_p, int s_q_idx, bool recur,
1685 bool dump_invisible)
1687 int ret = 0, q_idx = *q_idx_p;
1695 if (q_idx < s_q_idx) {
1698 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1699 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1700 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1706 /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1707 * itself has already been dumped.
1709 * If we've already dumped the top-level (ingress) qdisc above and the global
1710 * qdisc hashtable, we don't want to hit it again
1712 if (!qdisc_dev(root) || !recur)
1715 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1716 if (q_idx < s_q_idx) {
1720 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1721 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1722 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1736 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1738 struct net *net = sock_net(skb->sk);
1741 struct net_device *dev;
1742 const struct nlmsghdr *nlh = cb->nlh;
1743 struct nlattr *tca[TCA_MAX + 1];
1746 s_idx = cb->args[0];
1747 s_q_idx = q_idx = cb->args[1];
1752 err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1753 rtm_tca_policy, cb->extack);
1757 for_each_netdev(net, dev) {
1758 struct netdev_queue *dev_queue;
1766 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1767 true, tca[TCA_DUMP_INVISIBLE]) < 0)
1770 dev_queue = dev_ingress_queue(dev);
1772 tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1773 &q_idx, s_q_idx, false,
1774 tca[TCA_DUMP_INVISIBLE]) < 0)
1783 cb->args[1] = q_idx;
1790 /************************************************
1791 * Traffic classes manipulation. *
1792 ************************************************/
1794 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1796 u32 portid, u32 seq, u16 flags, int event)
1799 struct nlmsghdr *nlh;
1800 unsigned char *b = skb_tail_pointer(skb);
1802 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1805 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1807 goto out_nlmsg_trim;
1808 tcm = nlmsg_data(nlh);
1809 tcm->tcm_family = AF_UNSPEC;
1812 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1813 tcm->tcm_parent = q->handle;
1814 tcm->tcm_handle = q->handle;
1816 if (nla_put_string(skb, TCA_KIND, q->ops->id))
1817 goto nla_put_failure;
1818 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1819 goto nla_put_failure;
1821 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1822 NULL, &d, TCA_PAD) < 0)
1823 goto nla_put_failure;
1825 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1826 goto nla_put_failure;
1828 if (gnet_stats_finish_copy(&d) < 0)
1829 goto nla_put_failure;
1831 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1840 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1841 struct nlmsghdr *n, struct Qdisc *q,
1842 unsigned long cl, int event)
1844 struct sk_buff *skb;
1845 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1848 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1852 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1857 err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1858 n->nlmsg_flags & NLM_F_ECHO);
1864 static int tclass_del_notify(struct net *net,
1865 const struct Qdisc_class_ops *cops,
1866 struct sk_buff *oskb, struct nlmsghdr *n,
1867 struct Qdisc *q, unsigned long cl)
1869 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1870 struct sk_buff *skb;
1876 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1880 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1881 RTM_DELTCLASS) < 0) {
1886 err = cops->delete(q, cl);
1892 err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1893 n->nlmsg_flags & NLM_F_ECHO);
1899 #ifdef CONFIG_NET_CLS
1901 struct tcf_bind_args {
1902 struct tcf_walker w;
1908 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1910 struct tcf_bind_args *a = (void *)arg;
1912 if (tp->ops->bind_class) {
1913 struct Qdisc *q = tcf_block_q(tp->chain->block);
1916 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1922 struct tc_bind_class_args {
1923 struct qdisc_walker w;
1924 unsigned long new_cl;
1929 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1930 struct qdisc_walker *w)
1932 struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1933 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1934 struct tcf_block *block;
1935 struct tcf_chain *chain;
1937 block = cops->tcf_block(q, cl, NULL);
1940 for (chain = tcf_get_next_chain(block, NULL);
1942 chain = tcf_get_next_chain(block, chain)) {
1943 struct tcf_proto *tp;
1945 for (tp = tcf_get_next_proto(chain, NULL, true);
1946 tp; tp = tcf_get_next_proto(chain, tp, true)) {
1947 struct tcf_bind_args arg = {};
1949 arg.w.fn = tcf_node_bind;
1950 arg.classid = a->clid;
1953 tp->ops->walk(tp, &arg.w, true);
1960 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1961 unsigned long new_cl)
1963 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1964 struct tc_bind_class_args args = {};
1966 if (!cops->tcf_block)
1968 args.portid = portid;
1970 args.new_cl = new_cl;
1971 args.w.fn = tc_bind_class_walker;
1972 q->ops->cl_ops->walk(q, &args.w);
1977 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1978 unsigned long new_cl)
1984 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1985 struct netlink_ext_ack *extack)
1987 struct net *net = sock_net(skb->sk);
1988 struct tcmsg *tcm = nlmsg_data(n);
1989 struct nlattr *tca[TCA_MAX + 1];
1990 struct net_device *dev;
1991 struct Qdisc *q = NULL;
1992 const struct Qdisc_class_ops *cops;
1993 unsigned long cl = 0;
1994 unsigned long new_cl;
2000 if ((n->nlmsg_type != RTM_GETTCLASS) &&
2001 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2004 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2005 rtm_tca_policy, extack);
2009 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2014 parent == TC_H_UNSPEC - unspecified parent.
2015 parent == TC_H_ROOT - class is root, which has no parent.
2016 parent == X:0 - parent is root class.
2017 parent == X:Y - parent is a node in hierarchy.
2018 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
2020 handle == 0:0 - generate handle from kernel pool.
2021 handle == 0:Y - class is X:Y, where X:0 is qdisc.
2022 handle == X:Y - clear.
2023 handle == X:0 - root class.
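   For example, "tc class add dev eth0 parent 1:1 classid 1:10 ..." arrives
   here with tcm_parent == 1:1 and tcm_handle == 1:10, i.e. the X:Y case for
   both (illustrative iproute2 invocation).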
2026 /* Step 1. Determine qdisc handle X:0 */
2028 portid = tcm->tcm_parent;
2029 clid = tcm->tcm_handle;
2030 qid = TC_H_MAJ(clid);
2032 if (portid != TC_H_ROOT) {
2033 u32 qid1 = TC_H_MAJ(portid);
2036 /* If both majors are known, they must be identical. */
2041 } else if (qid == 0)
2042 qid = dev->qdisc->handle;
	/* Now qid is a genuine qdisc handle consistent with
	 * both parent and child.
	 *
	 * TC_H_MAJ(portid) may still be unspecified; complete it now.
	 */
2050 portid = TC_H_MAKE(qid, portid);
2053 qid = dev->qdisc->handle;
2056 /* OK. Locate qdisc */
2057 q = qdisc_lookup(dev, qid);
	/* And check that it supports classes */
2062 cops = q->ops->cl_ops;
2066 /* Now try to get class */
2068 if (portid == TC_H_ROOT)
2071 clid = TC_H_MAKE(qid, clid);
2074 cl = cops->find(q, clid);
2078 if (n->nlmsg_type != RTM_NEWTCLASS ||
2079 !(n->nlmsg_flags & NLM_F_CREATE))
2082 switch (n->nlmsg_type) {
2085 if (n->nlmsg_flags & NLM_F_EXCL)
2089 err = tclass_del_notify(net, cops, skb, n, q, cl);
		/* Unbind the class from its filters by binding them to class 0 */
2091 tc_bind_tclass(q, portid, clid, 0);
2094 err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2102 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2103 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2110 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2112 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class; we need to do the reverse binding. */
2115 tc_bind_tclass(q, portid, clid, new_cl);
2121 struct qdisc_dump_args {
2122 struct qdisc_walker w;
2123 struct sk_buff *skb;
2124 struct netlink_callback *cb;
2127 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2128 struct qdisc_walker *arg)
2130 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2132 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2133 a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2137 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2138 struct tcmsg *tcm, struct netlink_callback *cb,
2141 struct qdisc_dump_args arg;
2143 if (tc_qdisc_dump_ignore(q, false) ||
2144 *t_p < s_t || !q->ops->cl_ops ||
2146 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2151 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2152 arg.w.fn = qdisc_class_dump;
2156 arg.w.skip = cb->args[1];
2158 q->ops->cl_ops->walk(q, &arg.w);
2159 cb->args[1] = arg.w.count;
2166 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2167 struct tcmsg *tcm, struct netlink_callback *cb,
2176 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2179 if (!qdisc_dev(root))
2182 if (tcm->tcm_parent) {
2183 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2184 if (q && q != root &&
2185 tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2189 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2190 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2197 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2199 struct tcmsg *tcm = nlmsg_data(cb->nlh);
2200 struct net *net = sock_net(skb->sk);
2201 struct netdev_queue *dev_queue;
2202 struct net_device *dev;
2205 if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2207 dev = dev_get_by_index(net, tcm->tcm_ifindex);
2214 if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2217 dev_queue = dev_ingress_queue(dev);
2219 tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2230 #ifdef CONFIG_PROC_FS
2231 static int psched_show(struct seq_file *seq, void *v)
2233 seq_printf(seq, "%08x %08x %08x %08x\n",
2234 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2236 (u32)NSEC_PER_SEC / hrtimer_resolution);
2241 static int __net_init psched_net_init(struct net *net)
2243 struct proc_dir_entry *e;
2245 e = proc_create_single("psched", 0, net->proc_net, psched_show);
2252 static void __net_exit psched_net_exit(struct net *net)
2254 remove_proc_entry("psched", net->proc_net);
2257 static int __net_init psched_net_init(struct net *net)
2262 static void __net_exit psched_net_exit(struct net *net)
2267 static struct pernet_operations psched_net_ops = {
2268 .init = psched_net_init,
2269 .exit = psched_net_exit,
2272 static int __init pktsched_init(void)
2276 err = register_pernet_subsys(&psched_net_ops);
2278 pr_err("pktsched_init: "
2279 "cannot initialize per netns operations\n");
2283 register_qdisc(&pfifo_fast_ops);
2284 register_qdisc(&pfifo_qdisc_ops);
2285 register_qdisc(&bfifo_qdisc_ops);
2286 register_qdisc(&pfifo_head_drop_qdisc_ops);
2287 register_qdisc(&mq_qdisc_ops);
2288 register_qdisc(&noqueue_qdisc_ops);
2290 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2291 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2292 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2294 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2295 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2296 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2302 subsys_initcall(pktsched_init);