Merge tag 'erofs-for-5.5-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang...
[linux-2.6-microblaze.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 /*
36
37    Short review.
38    -------------
39
40    This file consists of two interrelated parts:
41
42    1. queueing disciplines manager frontend.
43    2. traffic classes manager frontend.
44
   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.
49
   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)
54
   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.
57
   The goal of the routines in this file is to translate
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.
63
64    All real intelligent work is done inside qdisc modules.
65
66
67
68    Every discipline has two major routines: enqueue and dequeue.
69
70    ---dequeue
71
   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that the
   discipline does not want to send anything this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.
78
79    ---enqueue
80
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP        - this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN          - this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore
88
89    Auxiliary routines:
90
91    ---peek
92
93    like dequeue but without removing a packet from the queue
94
95    ---reset
96
97    returns qdisc to initial state: purge all buffers, clear all
98    timers, counters (except for statistics) etc.
99
100    ---init
101
102    initializes newly created qdisc.
103
104    ---destroy
105
106    destroys resources allocated by init and during lifetime of qdisc.
107
108    ---change
109
110    changes qdisc parameters.
111  */
112
/* Protects the list of registered TC modules. It is a pure SMP lock. */
114 static DEFINE_RWLOCK(qdisc_mod_lock);
115
116
117 /************************************************
118  *      Queueing disciplines manipulation.      *
119  ************************************************/
120
121
122 /* The list of all installed queueing disciplines. */
123
124 static struct Qdisc_ops *qdisc_base;
125
126 /* Register/unregister queueing discipline */
127
128 int register_qdisc(struct Qdisc_ops *qops)
129 {
130         struct Qdisc_ops *q, **qp;
131         int rc = -EEXIST;
132
133         write_lock(&qdisc_mod_lock);
134         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
135                 if (!strcmp(qops->id, q->id))
136                         goto out;
137
138         if (qops->enqueue == NULL)
139                 qops->enqueue = noop_qdisc_ops.enqueue;
140         if (qops->peek == NULL) {
141                 if (qops->dequeue == NULL)
142                         qops->peek = noop_qdisc_ops.peek;
143                 else
144                         goto out_einval;
145         }
146         if (qops->dequeue == NULL)
147                 qops->dequeue = noop_qdisc_ops.dequeue;
148
149         if (qops->cl_ops) {
150                 const struct Qdisc_class_ops *cops = qops->cl_ops;
151
152                 if (!(cops->find && cops->walk && cops->leaf))
153                         goto out_einval;
154
155                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
156                         goto out_einval;
157         }
158
159         qops->next = NULL;
160         *qp = qops;
161         rc = 0;
162 out:
163         write_unlock(&qdisc_mod_lock);
164         return rc;
165
166 out_einval:
167         rc = -EINVAL;
168         goto out;
169 }
170 EXPORT_SYMBOL(register_qdisc);
171
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174         struct Qdisc_ops *q, **qp;
175         int err = -ENOENT;
176
177         write_lock(&qdisc_mod_lock);
178         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
179                 if (q == qops)
180                         break;
181         if (q) {
182                 *qp = q->next;
183                 q->next = NULL;
184                 err = 0;
185         }
186         write_unlock(&qdisc_mod_lock);
187         return err;
188 }
189 EXPORT_SYMBOL(unregister_qdisc);
190
191 /* Get default qdisc if not otherwise specified */
192 void qdisc_get_default(char *name, size_t len)
193 {
194         read_lock(&qdisc_mod_lock);
195         strlcpy(name, default_qdisc_ops->id, len);
196         read_unlock(&qdisc_mod_lock);
197 }
198
199 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
200 {
201         struct Qdisc_ops *q = NULL;
202
203         for (q = qdisc_base; q; q = q->next) {
204                 if (!strcmp(name, q->id)) {
205                         if (!try_module_get(q->owner))
206                                 q = NULL;
207                         break;
208                 }
209         }
210
211         return q;
212 }
213
214 /* Set new default qdisc to use */
215 int qdisc_set_default(const char *name)
216 {
217         const struct Qdisc_ops *ops;
218
219         if (!capable(CAP_NET_ADMIN))
220                 return -EPERM;
221
222         write_lock(&qdisc_mod_lock);
223         ops = qdisc_lookup_default(name);
224         if (!ops) {
225                 /* Not found, drop lock and try to load module */
226                 write_unlock(&qdisc_mod_lock);
227                 request_module("sch_%s", name);
228                 write_lock(&qdisc_mod_lock);
229
230                 ops = qdisc_lookup_default(name);
231         }
232
233         if (ops) {
234                 /* Set new default */
235                 module_put(default_qdisc_ops->owner);
236                 default_qdisc_ops = ops;
237         }
238         write_unlock(&qdisc_mod_lock);
239
240         return ops ? 0 : -ENOENT;
241 }
242
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: apply the default qdisc chosen in the kernel config
 * (CONFIG_DEFAULT_NET_SCH) and hand back qdisc_set_default()'s result.
 */
static int __init sch_default_qdisc(void)
{
	int err;

	err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
251
252 /* We know handle. Find qdisc among all qdisc's attached to device
253  * (root qdisc, all its children, children of children etc.)
254  * Note: caller either uses rtnl or rcu_read_lock()
255  */
256
257 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
258 {
259         struct Qdisc *q;
260
261         if (!qdisc_dev(root))
262                 return (root->handle == handle ? root : NULL);
263
264         if (!(root->flags & TCQ_F_BUILTIN) &&
265             root->handle == handle)
266                 return root;
267
268         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
269                 if (q->handle == handle)
270                         return q;
271         }
272         return NULL;
273 }
274
275 void qdisc_hash_add(struct Qdisc *q, bool invisible)
276 {
277         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
278                 ASSERT_RTNL();
279                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
280                 if (invisible)
281                         q->flags |= TCQ_F_INVISIBLE;
282         }
283 }
284 EXPORT_SYMBOL(qdisc_hash_add);
285
286 void qdisc_hash_del(struct Qdisc *q)
287 {
288         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
289                 ASSERT_RTNL();
290                 hash_del_rcu(&q->hash);
291         }
292 }
293 EXPORT_SYMBOL(qdisc_hash_del);
294
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296 {
297         struct Qdisc *q;
298
299         if (!handle)
300                 return NULL;
301         q = qdisc_match_from_root(dev->qdisc, handle);
302         if (q)
303                 goto out;
304
305         if (dev_ingress_queue(dev))
306                 q = qdisc_match_from_root(
307                         dev_ingress_queue(dev)->qdisc_sleeping,
308                         handle);
309 out:
310         return q;
311 }
312
313 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
314 {
315         struct netdev_queue *nq;
316         struct Qdisc *q;
317
318         if (!handle)
319                 return NULL;
320         q = qdisc_match_from_root(dev->qdisc, handle);
321         if (q)
322                 goto out;
323
324         nq = dev_ingress_queue_rcu(dev);
325         if (nq)
326                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
327 out:
328         return q;
329 }
330
331 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
332 {
333         unsigned long cl;
334         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
335
336         if (cops == NULL)
337                 return NULL;
338         cl = cops->find(p, classid);
339
340         if (cl == 0)
341                 return NULL;
342         return cops->leaf(p, cl);
343 }
344
345 /* Find queueing discipline by name */
346
347 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
348 {
349         struct Qdisc_ops *q = NULL;
350
351         if (kind) {
352                 read_lock(&qdisc_mod_lock);
353                 for (q = qdisc_base; q; q = q->next) {
354                         if (nla_strcmp(kind, q->id) == 0) {
355                                 if (!try_module_get(q->owner))
356                                         q = NULL;
357                                 break;
358                         }
359                 }
360                 read_unlock(&qdisc_mod_lock);
361         }
362         return q;
363 }
364
365 /* The linklayer setting were not transferred from iproute2, in older
366  * versions, and the rate tables lookup systems have been dropped in
367  * the kernel. To keep backward compatible with older iproute2 tc
368  * utils, we detect the linklayer setting by detecting if the rate
369  * table were modified.
370  *
371  * For linklayer ATM table entries, the rate table will be aligned to
372  * 48 bytes, thus some table entries will contain the same value.  The
373  * mpu (min packet unit) is also encoded into the old rate table, thus
374  * starting from the mpu, we find low and high table entries for
375  * mapping this cell.  If these entries contain the same value, when
376  * the rate tables have been modified for linklayer ATM.
377  *
378  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
379  * and then roundup to the next cell, calc the table entry one below,
380  * and compare.
381  */
382 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
383 {
384         int low       = roundup(r->mpu, 48);
385         int high      = roundup(low+1, 48);
386         int cell_low  = low >> r->cell_log;
387         int cell_high = (high >> r->cell_log) - 1;
388
389         /* rtab is too inaccurate at rates > 100Mbit/s */
390         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
391                 pr_debug("TC linklayer: Giving up ATM detection\n");
392                 return TC_LINKLAYER_ETHERNET;
393         }
394
395         if ((cell_high > cell_low) && (cell_high < 256)
396             && (rtab[cell_low] == rtab[cell_high])) {
397                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
398                          cell_low, cell_high, rtab[cell_high]);
399                 return TC_LINKLAYER_ATM;
400         }
401         return TC_LINKLAYER_ETHERNET;
402 }
403
404 static struct qdisc_rate_table *qdisc_rtab_list;
405
406 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
407                                         struct nlattr *tab,
408                                         struct netlink_ext_ack *extack)
409 {
410         struct qdisc_rate_table *rtab;
411
412         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
413             nla_len(tab) != TC_RTAB_SIZE) {
414                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
415                 return NULL;
416         }
417
418         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
419                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
420                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
421                         rtab->refcnt++;
422                         return rtab;
423                 }
424         }
425
426         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
427         if (rtab) {
428                 rtab->rate = *r;
429                 rtab->refcnt = 1;
430                 memcpy(rtab->data, nla_data(tab), 1024);
431                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
432                         r->linklayer = __detect_linklayer(r, rtab->data);
433                 rtab->next = qdisc_rtab_list;
434                 qdisc_rtab_list = rtab;
435         } else {
436                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
437         }
438         return rtab;
439 }
440 EXPORT_SYMBOL(qdisc_get_rtab);
441
442 void qdisc_put_rtab(struct qdisc_rate_table *tab)
443 {
444         struct qdisc_rate_table *rtab, **rtabp;
445
446         if (!tab || --tab->refcnt)
447                 return;
448
449         for (rtabp = &qdisc_rtab_list;
450              (rtab = *rtabp) != NULL;
451              rtabp = &rtab->next) {
452                 if (rtab == tab) {
453                         *rtabp = rtab->next;
454                         kfree(rtab);
455                         return;
456                 }
457         }
458 }
459 EXPORT_SYMBOL(qdisc_put_rtab);
460
461 static LIST_HEAD(qdisc_stab_list);
462
463 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
464         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
465         [TCA_STAB_DATA] = { .type = NLA_BINARY },
466 };
467
468 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
469                                                struct netlink_ext_ack *extack)
470 {
471         struct nlattr *tb[TCA_STAB_MAX + 1];
472         struct qdisc_size_table *stab;
473         struct tc_sizespec *s;
474         unsigned int tsize = 0;
475         u16 *tab = NULL;
476         int err;
477
478         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
479                                           extack);
480         if (err < 0)
481                 return ERR_PTR(err);
482         if (!tb[TCA_STAB_BASE]) {
483                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
484                 return ERR_PTR(-EINVAL);
485         }
486
487         s = nla_data(tb[TCA_STAB_BASE]);
488
489         if (s->tsize > 0) {
490                 if (!tb[TCA_STAB_DATA]) {
491                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
492                         return ERR_PTR(-EINVAL);
493                 }
494                 tab = nla_data(tb[TCA_STAB_DATA]);
495                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
496         }
497
498         if (tsize != s->tsize || (!tab && tsize > 0)) {
499                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
500                 return ERR_PTR(-EINVAL);
501         }
502
503         list_for_each_entry(stab, &qdisc_stab_list, list) {
504                 if (memcmp(&stab->szopts, s, sizeof(*s)))
505                         continue;
506                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
507                         continue;
508                 stab->refcnt++;
509                 return stab;
510         }
511
512         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
513         if (!stab)
514                 return ERR_PTR(-ENOMEM);
515
516         stab->refcnt = 1;
517         stab->szopts = *s;
518         if (tsize > 0)
519                 memcpy(stab->data, tab, tsize * sizeof(u16));
520
521         list_add_tail(&stab->list, &qdisc_stab_list);
522
523         return stab;
524 }
525
526 void qdisc_put_stab(struct qdisc_size_table *tab)
527 {
528         if (!tab)
529                 return;
530
531         if (--tab->refcnt == 0) {
532                 list_del(&tab->list);
533                 kfree_rcu(tab, rcu);
534         }
535 }
536 EXPORT_SYMBOL(qdisc_put_stab);
537
538 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
539 {
540         struct nlattr *nest;
541
542         nest = nla_nest_start_noflag(skb, TCA_STAB);
543         if (nest == NULL)
544                 goto nla_put_failure;
545         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
546                 goto nla_put_failure;
547         nla_nest_end(skb, nest);
548
549         return skb->len;
550
551 nla_put_failure:
552         return -1;
553 }
554
555 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
556                                const struct qdisc_size_table *stab)
557 {
558         int pkt_len, slot;
559
560         pkt_len = skb->len + stab->szopts.overhead;
561         if (unlikely(!stab->szopts.tsize))
562                 goto out;
563
564         slot = pkt_len + stab->szopts.cell_align;
565         if (unlikely(slot < 0))
566                 slot = 0;
567
568         slot >>= stab->szopts.cell_log;
569         if (likely(slot < stab->szopts.tsize))
570                 pkt_len = stab->data[slot];
571         else
572                 pkt_len = stab->data[stab->szopts.tsize - 1] *
573                                 (slot / stab->szopts.tsize) +
574                                 stab->data[slot % stab->szopts.tsize];
575
576         pkt_len <<= stab->szopts.size_log;
577 out:
578         if (unlikely(pkt_len < 1))
579                 pkt_len = 1;
580         qdisc_skb_cb(skb)->pkt_len = pkt_len;
581 }
582 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
583
584 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
585 {
586         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
587                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
588                         txt, qdisc->ops->id, qdisc->handle >> 16);
589                 qdisc->flags |= TCQ_F_WARN_NONWC;
590         }
591 }
592 EXPORT_SYMBOL(qdisc_warn_nonwc);
593
594 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
595 {
596         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
597                                                  timer);
598
599         rcu_read_lock();
600         __netif_schedule(qdisc_root(wd->qdisc));
601         rcu_read_unlock();
602
603         return HRTIMER_NORESTART;
604 }
605
606 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
607                                  clockid_t clockid)
608 {
609         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
610         wd->timer.function = qdisc_watchdog;
611         wd->qdisc = qdisc;
612 }
613 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
614
615 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
616 {
617         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
618 }
619 EXPORT_SYMBOL(qdisc_watchdog_init);
620
621 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
622 {
623         if (test_bit(__QDISC_STATE_DEACTIVATED,
624                      &qdisc_root_sleeping(wd->qdisc)->state))
625                 return;
626
627         if (wd->last_expires == expires)
628                 return;
629
630         wd->last_expires = expires;
631         hrtimer_start(&wd->timer,
632                       ns_to_ktime(expires),
633                       HRTIMER_MODE_ABS_PINNED);
634 }
635 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
636
637 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
638 {
639         hrtimer_cancel(&wd->timer);
640 }
641 EXPORT_SYMBOL(qdisc_watchdog_cancel);
642
643 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
644 {
645         struct hlist_head *h;
646         unsigned int i;
647
648         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
649
650         if (h != NULL) {
651                 for (i = 0; i < n; i++)
652                         INIT_HLIST_HEAD(&h[i]);
653         }
654         return h;
655 }
656
657 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
658 {
659         struct Qdisc_class_common *cl;
660         struct hlist_node *next;
661         struct hlist_head *nhash, *ohash;
662         unsigned int nsize, nmask, osize;
663         unsigned int i, h;
664
665         /* Rehash when load factor exceeds 0.75 */
666         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
667                 return;
668         nsize = clhash->hashsize * 2;
669         nmask = nsize - 1;
670         nhash = qdisc_class_hash_alloc(nsize);
671         if (nhash == NULL)
672                 return;
673
674         ohash = clhash->hash;
675         osize = clhash->hashsize;
676
677         sch_tree_lock(sch);
678         for (i = 0; i < osize; i++) {
679                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
680                         h = qdisc_class_hash(cl->classid, nmask);
681                         hlist_add_head(&cl->hnode, &nhash[h]);
682                 }
683         }
684         clhash->hash     = nhash;
685         clhash->hashsize = nsize;
686         clhash->hashmask = nmask;
687         sch_tree_unlock(sch);
688
689         kvfree(ohash);
690 }
691 EXPORT_SYMBOL(qdisc_class_hash_grow);
692
693 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
694 {
695         unsigned int size = 4;
696
697         clhash->hash = qdisc_class_hash_alloc(size);
698         if (!clhash->hash)
699                 return -ENOMEM;
700         clhash->hashsize  = size;
701         clhash->hashmask  = size - 1;
702         clhash->hashelems = 0;
703         return 0;
704 }
705 EXPORT_SYMBOL(qdisc_class_hash_init);
706
707 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
708 {
709         kvfree(clhash->hash);
710 }
711 EXPORT_SYMBOL(qdisc_class_hash_destroy);
712
713 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
714                              struct Qdisc_class_common *cl)
715 {
716         unsigned int h;
717
718         INIT_HLIST_NODE(&cl->hnode);
719         h = qdisc_class_hash(cl->classid, clhash->hashmask);
720         hlist_add_head(&cl->hnode, &clhash->hash[h]);
721         clhash->hashelems++;
722 }
723 EXPORT_SYMBOL(qdisc_class_hash_insert);
724
725 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
726                              struct Qdisc_class_common *cl)
727 {
728         hlist_del(&cl->hnode);
729         clhash->hashelems--;
730 }
731 EXPORT_SYMBOL(qdisc_class_hash_remove);
732
733 /* Allocate an unique handle from space managed by kernel
734  * Possible range is [8000-FFFF]:0000 (0x8000 values)
735  */
736 static u32 qdisc_alloc_handle(struct net_device *dev)
737 {
738         int i = 0x8000;
739         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
740
741         do {
742                 autohandle += TC_H_MAKE(0x10000U, 0);
743                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
744                         autohandle = TC_H_MAKE(0x80000000U, 0);
745                 if (!qdisc_lookup(dev, autohandle))
746                         return autohandle;
747                 cond_resched();
748         } while (--i > 0);
749
750         return 0;
751 }
752
753 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
754 {
755         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
756         const struct Qdisc_class_ops *cops;
757         unsigned long cl;
758         u32 parentid;
759         bool notify;
760         int drops;
761
762         if (n == 0 && len == 0)
763                 return;
764         drops = max_t(int, n, 0);
765         rcu_read_lock();
766         while ((parentid = sch->parent)) {
767                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
768                         break;
769
770                 if (sch->flags & TCQ_F_NOPARENT)
771                         break;
772                 /* Notify parent qdisc only if child qdisc becomes empty.
773                  *
774                  * If child was empty even before update then backlog
775                  * counter is screwed and we skip notification because
776                  * parent class is already passive.
777                  *
778                  * If the original child was offloaded then it is allowed
779                  * to be seem as empty, so the parent is notified anyway.
780                  */
781                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
782                                                        !qdisc_is_offloaded);
783                 /* TODO: perform the search on a per txq basis */
784                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
785                 if (sch == NULL) {
786                         WARN_ON_ONCE(parentid != TC_H_ROOT);
787                         break;
788                 }
789                 cops = sch->ops->cl_ops;
790                 if (notify && cops->qlen_notify) {
791                         cl = cops->find(sch, parentid);
792                         cops->qlen_notify(sch, cl);
793                 }
794                 sch->q.qlen -= n;
795                 sch->qstats.backlog -= len;
796                 __qdisc_qstats_drop(sch, drops);
797         }
798         rcu_read_unlock();
799 }
800 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
801
802 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
803                               void *type_data)
804 {
805         struct net_device *dev = qdisc_dev(sch);
806         int err;
807
808         sch->flags &= ~TCQ_F_OFFLOADED;
809         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
810                 return 0;
811
812         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
813         if (err == -EOPNOTSUPP)
814                 return 0;
815
816         if (!err)
817                 sch->flags |= TCQ_F_OFFLOADED;
818
819         return err;
820 }
821 EXPORT_SYMBOL(qdisc_offload_dump_helper);
822
823 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
824                                 struct Qdisc *new, struct Qdisc *old,
825                                 enum tc_setup_type type, void *type_data,
826                                 struct netlink_ext_ack *extack)
827 {
828         bool any_qdisc_is_offloaded;
829         int err;
830
831         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
832                 return;
833
834         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
835
836         /* Don't report error if the graft is part of destroy operation. */
837         if (!err || !new || new == &noop_qdisc)
838                 return;
839
840         /* Don't report error if the parent, the old child and the new
841          * one are not offloaded.
842          */
843         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
844         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
845         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
846
847         if (any_qdisc_is_offloaded)
848                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
849 }
850 EXPORT_SYMBOL(qdisc_offload_graft_helper);
851
852 static void qdisc_offload_graft_root(struct net_device *dev,
853                                      struct Qdisc *new, struct Qdisc *old,
854                                      struct netlink_ext_ack *extack)
855 {
856         struct tc_root_qopt_offload graft_offload = {
857                 .command        = TC_ROOT_GRAFT,
858                 .handle         = new ? new->handle : 0,
859                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
860                                   (old && old->flags & TCQ_F_INGRESS),
861         };
862
863         qdisc_offload_graft_helper(dev, NULL, new, old,
864                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
865 }
866
867 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
868                          u32 portid, u32 seq, u16 flags, int event)
869 {
870         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
871         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
872         struct tcmsg *tcm;
873         struct nlmsghdr  *nlh;
874         unsigned char *b = skb_tail_pointer(skb);
875         struct gnet_dump d;
876         struct qdisc_size_table *stab;
877         u32 block_index;
878         __u32 qlen;
879
880         cond_resched();
881         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
882         if (!nlh)
883                 goto out_nlmsg_trim;
884         tcm = nlmsg_data(nlh);
885         tcm->tcm_family = AF_UNSPEC;
886         tcm->tcm__pad1 = 0;
887         tcm->tcm__pad2 = 0;
888         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
889         tcm->tcm_parent = clid;
890         tcm->tcm_handle = q->handle;
891         tcm->tcm_info = refcount_read(&q->refcnt);
892         if (nla_put_string(skb, TCA_KIND, q->ops->id))
893                 goto nla_put_failure;
894         if (q->ops->ingress_block_get) {
895                 block_index = q->ops->ingress_block_get(q);
896                 if (block_index &&
897                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
898                         goto nla_put_failure;
899         }
900         if (q->ops->egress_block_get) {
901                 block_index = q->ops->egress_block_get(q);
902                 if (block_index &&
903                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
904                         goto nla_put_failure;
905         }
906         if (q->ops->dump && q->ops->dump(q, skb) < 0)
907                 goto nla_put_failure;
908         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
909                 goto nla_put_failure;
910         qlen = qdisc_qlen_sum(q);
911
912         stab = rtnl_dereference(q->stab);
913         if (stab && qdisc_dump_stab(skb, stab) < 0)
914                 goto nla_put_failure;
915
916         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
917                                          NULL, &d, TCA_PAD) < 0)
918                 goto nla_put_failure;
919
920         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
921                 goto nla_put_failure;
922
923         if (qdisc_is_percpu_stats(q)) {
924                 cpu_bstats = q->cpu_bstats;
925                 cpu_qstats = q->cpu_qstats;
926         }
927
928         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
929                                   &d, cpu_bstats, &q->bstats) < 0 ||
930             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
931             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
932                 goto nla_put_failure;
933
934         if (gnet_stats_finish_copy(&d) < 0)
935                 goto nla_put_failure;
936
937         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
938         return skb->len;
939
940 out_nlmsg_trim:
941 nla_put_failure:
942         nlmsg_trim(skb, b);
943         return -1;
944 }
945
946 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
947 {
948         if (q->flags & TCQ_F_BUILTIN)
949                 return true;
950         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
951                 return true;
952
953         return false;
954 }
955
956 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
957                         struct nlmsghdr *n, u32 clid,
958                         struct Qdisc *old, struct Qdisc *new)
959 {
960         struct sk_buff *skb;
961         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
962
963         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
964         if (!skb)
965                 return -ENOBUFS;
966
967         if (old && !tc_qdisc_dump_ignore(old, false)) {
968                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
969                                   0, RTM_DELQDISC) < 0)
970                         goto err_out;
971         }
972         if (new && !tc_qdisc_dump_ignore(new, false)) {
973                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
974                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
975                         goto err_out;
976         }
977
978         if (skb->len)
979                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
980                                       n->nlmsg_flags & NLM_F_ECHO);
981
982 err_out:
983         kfree_skb(skb);
984         return -EINVAL;
985 }
986
987 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
988                                struct nlmsghdr *n, u32 clid,
989                                struct Qdisc *old, struct Qdisc *new)
990 {
991         if (new || old)
992                 qdisc_notify(net, skb, n, clid, old, new);
993
994         if (old)
995                 qdisc_put(old);
996 }
997
998 static void qdisc_clear_nolock(struct Qdisc *sch)
999 {
1000         sch->flags &= ~TCQ_F_NOLOCK;
1001         if (!(sch->flags & TCQ_F_CPUSTATS))
1002                 return;
1003
1004         free_percpu(sch->cpu_bstats);
1005         free_percpu(sch->cpu_qstats);
1006         sch->cpu_bstats = NULL;
1007         sch->cpu_qstats = NULL;
1008         sch->flags &= ~TCQ_F_CPUSTATS;
1009 }
1010
1011 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1012  * to device "dev".
1013  *
1014  * When appropriate send a netlink notification using 'skb'
1015  * and "n".
1016  *
1017  * On success, destroy old qdisc.
1018  */
1019
1020 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1021                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1022                        struct Qdisc *new, struct Qdisc *old,
1023                        struct netlink_ext_ack *extack)
1024 {
1025         struct Qdisc *q = old;
1026         struct net *net = dev_net(dev);
1027
1028         if (parent == NULL) {
1029                 unsigned int i, num_q, ingress;
1030
1031                 ingress = 0;
1032                 num_q = dev->num_tx_queues;
1033                 if ((q && q->flags & TCQ_F_INGRESS) ||
1034                     (new && new->flags & TCQ_F_INGRESS)) {
1035                         num_q = 1;
1036                         ingress = 1;
1037                         if (!dev_ingress_queue(dev)) {
1038                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1039                                 return -ENOENT;
1040                         }
1041                 }
1042
1043                 if (dev->flags & IFF_UP)
1044                         dev_deactivate(dev);
1045
1046                 qdisc_offload_graft_root(dev, new, old, extack);
1047
1048                 if (new && new->ops->attach)
1049                         goto skip;
1050
1051                 for (i = 0; i < num_q; i++) {
1052                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1053
1054                         if (!ingress)
1055                                 dev_queue = netdev_get_tx_queue(dev, i);
1056
1057                         old = dev_graft_qdisc(dev_queue, new);
1058                         if (new && i > 0)
1059                                 qdisc_refcount_inc(new);
1060
1061                         if (!ingress)
1062                                 qdisc_put(old);
1063                 }
1064
1065 skip:
1066                 if (!ingress) {
1067                         notify_and_destroy(net, skb, n, classid,
1068                                            dev->qdisc, new);
1069                         if (new && !new->ops->attach)
1070                                 qdisc_refcount_inc(new);
1071                         dev->qdisc = new ? : &noop_qdisc;
1072
1073                         if (new && new->ops->attach)
1074                                 new->ops->attach(new);
1075                 } else {
1076                         notify_and_destroy(net, skb, n, classid, old, new);
1077                 }
1078
1079                 if (dev->flags & IFF_UP)
1080                         dev_activate(dev);
1081         } else {
1082                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1083                 unsigned long cl;
1084                 int err;
1085
1086                 /* Only support running class lockless if parent is lockless */
1087                 if (new && (new->flags & TCQ_F_NOLOCK) &&
1088                     parent && !(parent->flags & TCQ_F_NOLOCK))
1089                         qdisc_clear_nolock(new);
1090
1091                 if (!cops || !cops->graft)
1092                         return -EOPNOTSUPP;
1093
1094                 cl = cops->find(parent, classid);
1095                 if (!cl) {
1096                         NL_SET_ERR_MSG(extack, "Specified class not found");
1097                         return -ENOENT;
1098                 }
1099
1100                 err = cops->graft(parent, cl, new, &old, extack);
1101                 if (err)
1102                         return err;
1103                 notify_and_destroy(net, skb, n, classid, old, new);
1104         }
1105         return 0;
1106 }
1107
1108 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1109                                    struct netlink_ext_ack *extack)
1110 {
1111         u32 block_index;
1112
1113         if (tca[TCA_INGRESS_BLOCK]) {
1114                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1115
1116                 if (!block_index) {
1117                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1118                         return -EINVAL;
1119                 }
1120                 if (!sch->ops->ingress_block_set) {
1121                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1122                         return -EOPNOTSUPP;
1123                 }
1124                 sch->ops->ingress_block_set(sch, block_index);
1125         }
1126         if (tca[TCA_EGRESS_BLOCK]) {
1127                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1128
1129                 if (!block_index) {
1130                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1131                         return -EINVAL;
1132                 }
1133                 if (!sch->ops->egress_block_set) {
1134                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1135                         return -EOPNOTSUPP;
1136                 }
1137                 sch->ops->egress_block_set(sch, block_index);
1138         }
1139         return 0;
1140 }
1141
1142 /*
1143    Allocate and initialize new qdisc.
1144
1145    Parameters are passed via opt.
1146  */
1147
1148 static struct Qdisc *qdisc_create(struct net_device *dev,
1149                                   struct netdev_queue *dev_queue,
1150                                   struct Qdisc *p, u32 parent, u32 handle,
1151                                   struct nlattr **tca, int *errp,
1152                                   struct netlink_ext_ack *extack)
1153 {
1154         int err;
1155         struct nlattr *kind = tca[TCA_KIND];
1156         struct Qdisc *sch;
1157         struct Qdisc_ops *ops;
1158         struct qdisc_size_table *stab;
1159
1160         ops = qdisc_lookup_ops(kind);
1161 #ifdef CONFIG_MODULES
1162         if (ops == NULL && kind != NULL) {
1163                 char name[IFNAMSIZ];
1164                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1165                         /* We dropped the RTNL semaphore in order to
1166                          * perform the module load.  So, even if we
1167                          * succeeded in loading the module we have to
1168                          * tell the caller to replay the request.  We
1169                          * indicate this using -EAGAIN.
1170                          * We replay the request because the device may
1171                          * go away in the mean time.
1172                          */
1173                         rtnl_unlock();
1174                         request_module("sch_%s", name);
1175                         rtnl_lock();
1176                         ops = qdisc_lookup_ops(kind);
1177                         if (ops != NULL) {
1178                                 /* We will try again qdisc_lookup_ops,
1179                                  * so don't keep a reference.
1180                                  */
1181                                 module_put(ops->owner);
1182                                 err = -EAGAIN;
1183                                 goto err_out;
1184                         }
1185                 }
1186         }
1187 #endif
1188
1189         err = -ENOENT;
1190         if (!ops) {
1191                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1192                 goto err_out;
1193         }
1194
1195         sch = qdisc_alloc(dev_queue, ops, extack);
1196         if (IS_ERR(sch)) {
1197                 err = PTR_ERR(sch);
1198                 goto err_out2;
1199         }
1200
1201         sch->parent = parent;
1202
1203         if (handle == TC_H_INGRESS) {
1204                 sch->flags |= TCQ_F_INGRESS;
1205                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1206         } else {
1207                 if (handle == 0) {
1208                         handle = qdisc_alloc_handle(dev);
1209                         if (handle == 0) {
1210                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1211                                 err = -ENOSPC;
1212                                 goto err_out3;
1213                         }
1214                 }
1215                 if (!netif_is_multiqueue(dev))
1216                         sch->flags |= TCQ_F_ONETXQUEUE;
1217         }
1218
1219         sch->handle = handle;
1220
1221         /* This exist to keep backward compatible with a userspace
1222          * loophole, what allowed userspace to get IFF_NO_QUEUE
1223          * facility on older kernels by setting tx_queue_len=0 (prior
1224          * to qdisc init), and then forgot to reinit tx_queue_len
1225          * before again attaching a qdisc.
1226          */
1227         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1228                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1229                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1230         }
1231
1232         err = qdisc_block_indexes_set(sch, tca, extack);
1233         if (err)
1234                 goto err_out3;
1235
1236         if (ops->init) {
1237                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1238                 if (err != 0)
1239                         goto err_out5;
1240         }
1241
1242         if (tca[TCA_STAB]) {
1243                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1244                 if (IS_ERR(stab)) {
1245                         err = PTR_ERR(stab);
1246                         goto err_out4;
1247                 }
1248                 rcu_assign_pointer(sch->stab, stab);
1249         }
1250         if (tca[TCA_RATE]) {
1251                 seqcount_t *running;
1252
1253                 err = -EOPNOTSUPP;
1254                 if (sch->flags & TCQ_F_MQROOT) {
1255                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1256                         goto err_out4;
1257                 }
1258
1259                 if (sch->parent != TC_H_ROOT &&
1260                     !(sch->flags & TCQ_F_INGRESS) &&
1261                     (!p || !(p->flags & TCQ_F_MQROOT)))
1262                         running = qdisc_root_sleeping_running(sch);
1263                 else
1264                         running = &sch->running;
1265
1266                 err = gen_new_estimator(&sch->bstats,
1267                                         sch->cpu_bstats,
1268                                         &sch->rate_est,
1269                                         NULL,
1270                                         running,
1271                                         tca[TCA_RATE]);
1272                 if (err) {
1273                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1274                         goto err_out4;
1275                 }
1276         }
1277
1278         qdisc_hash_add(sch, false);
1279
1280         return sch;
1281
1282 err_out5:
1283         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1284         if (ops->destroy)
1285                 ops->destroy(sch);
1286 err_out3:
1287         dev_put(dev);
1288         qdisc_free(sch);
1289 err_out2:
1290         module_put(ops->owner);
1291 err_out:
1292         *errp = err;
1293         return NULL;
1294
1295 err_out4:
1296         /*
1297          * Any broken qdiscs that would require a ops->reset() here?
1298          * The qdisc was never in action so it shouldn't be necessary.
1299          */
1300         qdisc_put_stab(rtnl_dereference(sch->stab));
1301         if (ops->destroy)
1302                 ops->destroy(sch);
1303         goto err_out3;
1304 }
1305
1306 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1307                         struct netlink_ext_ack *extack)
1308 {
1309         struct qdisc_size_table *ostab, *stab = NULL;
1310         int err = 0;
1311
1312         if (tca[TCA_OPTIONS]) {
1313                 if (!sch->ops->change) {
1314                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1315                         return -EINVAL;
1316                 }
1317                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1318                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1319                         return -EOPNOTSUPP;
1320                 }
1321                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1322                 if (err)
1323                         return err;
1324         }
1325
1326         if (tca[TCA_STAB]) {
1327                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1328                 if (IS_ERR(stab))
1329                         return PTR_ERR(stab);
1330         }
1331
1332         ostab = rtnl_dereference(sch->stab);
1333         rcu_assign_pointer(sch->stab, stab);
1334         qdisc_put_stab(ostab);
1335
1336         if (tca[TCA_RATE]) {
1337                 /* NB: ignores errors from replace_estimator
1338                    because change can't be undone. */
1339                 if (sch->flags & TCQ_F_MQROOT)
1340                         goto out;
1341                 gen_replace_estimator(&sch->bstats,
1342                                       sch->cpu_bstats,
1343                                       &sch->rate_est,
1344                                       NULL,
1345                                       qdisc_root_sleeping_running(sch),
1346                                       tca[TCA_RATE]);
1347         }
1348 out:
1349         return 0;
1350 }
1351
1352 struct check_loop_arg {
1353         struct qdisc_walker     w;
1354         struct Qdisc            *p;
1355         int                     depth;
1356 };
1357
1358 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1359                          struct qdisc_walker *w);
1360
1361 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1362 {
1363         struct check_loop_arg   arg;
1364
1365         if (q->ops->cl_ops == NULL)
1366                 return 0;
1367
1368         arg.w.stop = arg.w.skip = arg.w.count = 0;
1369         arg.w.fn = check_loop_fn;
1370         arg.depth = depth;
1371         arg.p = p;
1372         q->ops->cl_ops->walk(q, &arg.w);
1373         return arg.w.stop ? -ELOOP : 0;
1374 }
1375
1376 static int
1377 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1378 {
1379         struct Qdisc *leaf;
1380         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1381         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1382
1383         leaf = cops->leaf(q, cl);
1384         if (leaf) {
1385                 if (leaf == arg->p || arg->depth > 7)
1386                         return -ELOOP;
1387                 return check_loop(leaf, arg->p, arg->depth + 1);
1388         }
1389         return 0;
1390 }
1391
1392 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1393         [TCA_KIND]              = { .type = NLA_STRING },
1394         [TCA_RATE]              = { .type = NLA_BINARY,
1395                                     .len = sizeof(struct tc_estimator) },
1396         [TCA_STAB]              = { .type = NLA_NESTED },
1397         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1398         [TCA_CHAIN]             = { .type = NLA_U32 },
1399         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1400         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1401 };
1402
1403 /*
1404  * Delete/get qdisc.
1405  */
1406
1407 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1408                         struct netlink_ext_ack *extack)
1409 {
1410         struct net *net = sock_net(skb->sk);
1411         struct tcmsg *tcm = nlmsg_data(n);
1412         struct nlattr *tca[TCA_MAX + 1];
1413         struct net_device *dev;
1414         u32 clid;
1415         struct Qdisc *q = NULL;
1416         struct Qdisc *p = NULL;
1417         int err;
1418
1419         if ((n->nlmsg_type != RTM_GETQDISC) &&
1420             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1421                 return -EPERM;
1422
1423         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1424                                      rtm_tca_policy, extack);
1425         if (err < 0)
1426                 return err;
1427
1428         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1429         if (!dev)
1430                 return -ENODEV;
1431
1432         clid = tcm->tcm_parent;
1433         if (clid) {
1434                 if (clid != TC_H_ROOT) {
1435                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1436                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1437                                 if (!p) {
1438                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1439                                         return -ENOENT;
1440                                 }
1441                                 q = qdisc_leaf(p, clid);
1442                         } else if (dev_ingress_queue(dev)) {
1443                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1444                         }
1445                 } else {
1446                         q = dev->qdisc;
1447                 }
1448                 if (!q) {
1449                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1450                         return -ENOENT;
1451                 }
1452
1453                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1454                         NL_SET_ERR_MSG(extack, "Invalid handle");
1455                         return -EINVAL;
1456                 }
1457         } else {
1458                 q = qdisc_lookup(dev, tcm->tcm_handle);
1459                 if (!q) {
1460                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1461                         return -ENOENT;
1462                 }
1463         }
1464
1465         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1466                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1467                 return -EINVAL;
1468         }
1469
1470         if (n->nlmsg_type == RTM_DELQDISC) {
1471                 if (!clid) {
1472                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1473                         return -EINVAL;
1474                 }
1475                 if (q->handle == 0) {
1476                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1477                         return -ENOENT;
1478                 }
1479                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1480                 if (err != 0)
1481                         return err;
1482         } else {
1483                 qdisc_notify(net, skb, n, clid, NULL, q);
1484         }
1485         return 0;
1486 }
1487
1488 /*
1489  * Create/change qdisc.
1490  */
1491
1492 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1493                            struct netlink_ext_ack *extack)
1494 {
1495         struct net *net = sock_net(skb->sk);
1496         struct tcmsg *tcm;
1497         struct nlattr *tca[TCA_MAX + 1];
1498         struct net_device *dev;
1499         u32 clid;
1500         struct Qdisc *q, *p;
1501         int err;
1502
1503         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1504                 return -EPERM;
1505
1506 replay:
1507         /* Reinit, just in case something touches this. */
1508         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1509                                      rtm_tca_policy, extack);
1510         if (err < 0)
1511                 return err;
1512
1513         tcm = nlmsg_data(n);
1514         clid = tcm->tcm_parent;
1515         q = p = NULL;
1516
1517         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1518         if (!dev)
1519                 return -ENODEV;
1520
1521
1522         if (clid) {
1523                 if (clid != TC_H_ROOT) {
1524                         if (clid != TC_H_INGRESS) {
1525                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1526                                 if (!p) {
1527                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1528                                         return -ENOENT;
1529                                 }
1530                                 q = qdisc_leaf(p, clid);
1531                         } else if (dev_ingress_queue_create(dev)) {
1532                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1533                         }
1534                 } else {
1535                         q = dev->qdisc;
1536                 }
1537
1538                 /* It may be default qdisc, ignore it */
1539                 if (q && q->handle == 0)
1540                         q = NULL;
1541
1542                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1543                         if (tcm->tcm_handle) {
1544                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1545                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1546                                         return -EEXIST;
1547                                 }
1548                                 if (TC_H_MIN(tcm->tcm_handle)) {
1549                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1550                                         return -EINVAL;
1551                                 }
1552                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1553                                 if (!q)
1554                                         goto create_n_graft;
1555                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1556                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1557                                         return -EEXIST;
1558                                 }
1559                                 if (tca[TCA_KIND] &&
1560                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1561                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1562                                         return -EINVAL;
1563                                 }
1564                                 if (q == p ||
1565                                     (p && check_loop(q, p, 0))) {
1566                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1567                                         return -ELOOP;
1568                                 }
1569                                 qdisc_refcount_inc(q);
1570                                 goto graft;
1571                         } else {
1572                                 if (!q)
1573                                         goto create_n_graft;
1574
1575                                 /* This magic test requires explanation.
1576                                  *
1577                                  *   We know, that some child q is already
1578                                  *   attached to this parent and have choice:
1579                                  *   either to change it or to create/graft new one.
1580                                  *
1581                                  *   1. We are allowed to create/graft only
1582                                  *   if CREATE and REPLACE flags are set.
1583                                  *
1584                                  *   2. If EXCL is set, requestor wanted to say,
1585                                  *   that qdisc tcm_handle is not expected
1586                                  *   to exist, so that we choose create/graft too.
1587                                  *
1588                                  *   3. The last case is when no flags are set.
1589                                  *   Alas, it is sort of hole in API, we
1590                                  *   cannot decide what to do unambiguously.
1591                                  *   For now we select create/graft, if
1592                                  *   user gave KIND, which does not match existing.
1593                                  */
1594                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1595                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1596                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1597                                      (tca[TCA_KIND] &&
1598                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1599                                         goto create_n_graft;
1600                         }
1601                 }
1602         } else {
1603                 if (!tcm->tcm_handle) {
1604                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1605                         return -EINVAL;
1606                 }
1607                 q = qdisc_lookup(dev, tcm->tcm_handle);
1608         }
1609
1610         /* Change qdisc parameters */
1611         if (!q) {
1612                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1613                 return -ENOENT;
1614         }
1615         if (n->nlmsg_flags & NLM_F_EXCL) {
1616                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1617                 return -EEXIST;
1618         }
1619         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1620                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1621                 return -EINVAL;
1622         }
1623         err = qdisc_change(q, tca, extack);
1624         if (err == 0)
1625                 qdisc_notify(net, skb, n, clid, NULL, q);
1626         return err;
1627
1628 create_n_graft:
1629         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1630                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1631                 return -ENOENT;
1632         }
1633         if (clid == TC_H_INGRESS) {
1634                 if (dev_ingress_queue(dev)) {
1635                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1636                                          tcm->tcm_parent, tcm->tcm_parent,
1637                                          tca, &err, extack);
1638                 } else {
1639                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1640                         err = -ENOENT;
1641                 }
1642         } else {
1643                 struct netdev_queue *dev_queue;
1644
1645                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1646                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1647                 else if (p)
1648                         dev_queue = p->dev_queue;
1649                 else
1650                         dev_queue = netdev_get_tx_queue(dev, 0);
1651
1652                 q = qdisc_create(dev, dev_queue, p,
1653                                  tcm->tcm_parent, tcm->tcm_handle,
1654                                  tca, &err, extack);
1655         }
1656         if (q == NULL) {
1657                 if (err == -EAGAIN)
1658                         goto replay;
1659                 return err;
1660         }
1661
1662 graft:
1663         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1664         if (err) {
1665                 if (q)
1666                         qdisc_put(q);
1667                 return err;
1668         }
1669
1670         return 0;
1671 }
1672
1673 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1674                               struct netlink_callback *cb,
1675                               int *q_idx_p, int s_q_idx, bool recur,
1676                               bool dump_invisible)
1677 {
1678         int ret = 0, q_idx = *q_idx_p;
1679         struct Qdisc *q;
1680         int b;
1681
1682         if (!root)
1683                 return 0;
1684
1685         q = root;
1686         if (q_idx < s_q_idx) {
1687                 q_idx++;
1688         } else {
1689                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1690                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1691                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1692                                   RTM_NEWQDISC) <= 0)
1693                         goto done;
1694                 q_idx++;
1695         }
1696
1697         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1698          * itself has already been dumped.
1699          *
1700          * If we've already dumped the top-level (ingress) qdisc above and the global
1701          * qdisc hashtable, we don't want to hit it again
1702          */
1703         if (!qdisc_dev(root) || !recur)
1704                 goto out;
1705
1706         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1707                 if (q_idx < s_q_idx) {
1708                         q_idx++;
1709                         continue;
1710                 }
1711                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1712                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1713                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1714                                   RTM_NEWQDISC) <= 0)
1715                         goto done;
1716                 q_idx++;
1717         }
1718
1719 out:
1720         *q_idx_p = q_idx;
1721         return ret;
1722 done:
1723         ret = -1;
1724         goto out;
1725 }
1726
1727 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1728 {
1729         struct net *net = sock_net(skb->sk);
1730         int idx, q_idx;
1731         int s_idx, s_q_idx;
1732         struct net_device *dev;
1733         const struct nlmsghdr *nlh = cb->nlh;
1734         struct nlattr *tca[TCA_MAX + 1];
1735         int err;
1736
1737         s_idx = cb->args[0];
1738         s_q_idx = q_idx = cb->args[1];
1739
1740         idx = 0;
1741         ASSERT_RTNL();
1742
1743         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1744                                      rtm_tca_policy, cb->extack);
1745         if (err < 0)
1746                 return err;
1747
1748         for_each_netdev(net, dev) {
1749                 struct netdev_queue *dev_queue;
1750
1751                 if (idx < s_idx)
1752                         goto cont;
1753                 if (idx > s_idx)
1754                         s_q_idx = 0;
1755                 q_idx = 0;
1756
1757                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1758                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1759                         goto done;
1760
1761                 dev_queue = dev_ingress_queue(dev);
1762                 if (dev_queue &&
1763                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1764                                        &q_idx, s_q_idx, false,
1765                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1766                         goto done;
1767
1768 cont:
1769                 idx++;
1770         }
1771
1772 done:
1773         cb->args[0] = idx;
1774         cb->args[1] = q_idx;
1775
1776         return skb->len;
1777 }
1778
1779
1780
1781 /************************************************
1782  *      Traffic classes manipulation.           *
1783  ************************************************/
1784
1785 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1786                           unsigned long cl,
1787                           u32 portid, u32 seq, u16 flags, int event)
1788 {
1789         struct tcmsg *tcm;
1790         struct nlmsghdr  *nlh;
1791         unsigned char *b = skb_tail_pointer(skb);
1792         struct gnet_dump d;
1793         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1794
1795         cond_resched();
1796         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1797         if (!nlh)
1798                 goto out_nlmsg_trim;
1799         tcm = nlmsg_data(nlh);
1800         tcm->tcm_family = AF_UNSPEC;
1801         tcm->tcm__pad1 = 0;
1802         tcm->tcm__pad2 = 0;
1803         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1804         tcm->tcm_parent = q->handle;
1805         tcm->tcm_handle = q->handle;
1806         tcm->tcm_info = 0;
1807         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1808                 goto nla_put_failure;
1809         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1810                 goto nla_put_failure;
1811
1812         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1813                                          NULL, &d, TCA_PAD) < 0)
1814                 goto nla_put_failure;
1815
1816         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1817                 goto nla_put_failure;
1818
1819         if (gnet_stats_finish_copy(&d) < 0)
1820                 goto nla_put_failure;
1821
1822         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1823         return skb->len;
1824
1825 out_nlmsg_trim:
1826 nla_put_failure:
1827         nlmsg_trim(skb, b);
1828         return -1;
1829 }
1830
1831 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1832                          struct nlmsghdr *n, struct Qdisc *q,
1833                          unsigned long cl, int event)
1834 {
1835         struct sk_buff *skb;
1836         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1837         int err = 0;
1838
1839         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1840         if (!skb)
1841                 return -ENOBUFS;
1842
1843         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1844                 kfree_skb(skb);
1845                 return -EINVAL;
1846         }
1847
1848         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1849                              n->nlmsg_flags & NLM_F_ECHO);
1850         if (err > 0)
1851                 err = 0;
1852         return err;
1853 }
1854
1855 static int tclass_del_notify(struct net *net,
1856                              const struct Qdisc_class_ops *cops,
1857                              struct sk_buff *oskb, struct nlmsghdr *n,
1858                              struct Qdisc *q, unsigned long cl)
1859 {
1860         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1861         struct sk_buff *skb;
1862         int err = 0;
1863
1864         if (!cops->delete)
1865                 return -EOPNOTSUPP;
1866
1867         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1868         if (!skb)
1869                 return -ENOBUFS;
1870
1871         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1872                            RTM_DELTCLASS) < 0) {
1873                 kfree_skb(skb);
1874                 return -EINVAL;
1875         }
1876
1877         err = cops->delete(q, cl);
1878         if (err) {
1879                 kfree_skb(skb);
1880                 return err;
1881         }
1882
1883         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1884                              n->nlmsg_flags & NLM_F_ECHO);
1885         if (err > 0)
1886                 err = 0;
1887         return err;
1888 }
1889
1890 #ifdef CONFIG_NET_CLS
1891
/* Walker state handed to tcf_node_bind() through its embedded tcf_walker. */
struct tcf_bind_args {
        /* Must stay the first member: tcf_node_bind() casts the walker
         * pointer it receives back to struct tcf_bind_args.
         */
        struct tcf_walker w;
        /* Class id forwarded to the classifier's ->bind_class() hook. */
        u32 classid;
        /* Class value forwarded to ->bind_class(); tc_ctl_tclass() passes 0
         * here when it unbinds a deleted class.
         */
        unsigned long cl;
};
1897
1898 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1899 {
1900         struct tcf_bind_args *a = (void *)arg;
1901
1902         if (tp->ops->bind_class) {
1903                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1904
1905                 sch_tree_lock(q);
1906                 tp->ops->bind_class(n, a->classid, a->cl);
1907                 sch_tree_unlock(q);
1908         }
1909         return 0;
1910 }
1911
1912 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1913                            unsigned long new_cl)
1914 {
1915         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1916         struct tcf_block *block;
1917         struct tcf_chain *chain;
1918         unsigned long cl;
1919
1920         cl = cops->find(q, portid);
1921         if (!cl)
1922                 return;
1923         if (!cops->tcf_block)
1924                 return;
1925         block = cops->tcf_block(q, cl, NULL);
1926         if (!block)
1927                 return;
1928         for (chain = tcf_get_next_chain(block, NULL);
1929              chain;
1930              chain = tcf_get_next_chain(block, chain)) {
1931                 struct tcf_proto *tp;
1932
1933                 for (tp = tcf_get_next_proto(chain, NULL, true);
1934                      tp; tp = tcf_get_next_proto(chain, tp, true)) {
1935                         struct tcf_bind_args arg = {};
1936
1937                         arg.w.fn = tcf_node_bind;
1938                         arg.classid = clid;
1939                         arg.cl = new_cl;
1940                         tp->ops->walk(tp, &arg.w, true);
1941                 }
1942         }
1943 }
1944
1945 #else
1946
/* Variant built when CONFIG_NET_CLS is not defined: intentionally a no-op. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
                           unsigned long new_cl)
{
}
1951
1952 #endif
1953
1954 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1955                          struct netlink_ext_ack *extack)
1956 {
1957         struct net *net = sock_net(skb->sk);
1958         struct tcmsg *tcm = nlmsg_data(n);
1959         struct nlattr *tca[TCA_MAX + 1];
1960         struct net_device *dev;
1961         struct Qdisc *q = NULL;
1962         const struct Qdisc_class_ops *cops;
1963         unsigned long cl = 0;
1964         unsigned long new_cl;
1965         u32 portid;
1966         u32 clid;
1967         u32 qid;
1968         int err;
1969
1970         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1971             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1972                 return -EPERM;
1973
1974         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1975                                      rtm_tca_policy, extack);
1976         if (err < 0)
1977                 return err;
1978
1979         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1980         if (!dev)
1981                 return -ENODEV;
1982
1983         /*
1984            parent == TC_H_UNSPEC - unspecified parent.
1985            parent == TC_H_ROOT   - class is root, which has no parent.
1986            parent == X:0         - parent is root class.
1987            parent == X:Y         - parent is a node in hierarchy.
1988            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1989
1990            handle == 0:0         - generate handle from kernel pool.
1991            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1992            handle == X:Y         - clear.
1993            handle == X:0         - root class.
1994          */
1995
1996         /* Step 1. Determine qdisc handle X:0 */
1997
1998         portid = tcm->tcm_parent;
1999         clid = tcm->tcm_handle;
2000         qid = TC_H_MAJ(clid);
2001
2002         if (portid != TC_H_ROOT) {
2003                 u32 qid1 = TC_H_MAJ(portid);
2004
2005                 if (qid && qid1) {
2006                         /* If both majors are known, they must be identical. */
2007                         if (qid != qid1)
2008                                 return -EINVAL;
2009                 } else if (qid1) {
2010                         qid = qid1;
2011                 } else if (qid == 0)
2012                         qid = dev->qdisc->handle;
2013
2014                 /* Now qid is genuine qdisc handle consistent
2015                  * both with parent and child.
2016                  *
2017                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2018                  */
2019                 if (portid)
2020                         portid = TC_H_MAKE(qid, portid);
2021         } else {
2022                 if (qid == 0)
2023                         qid = dev->qdisc->handle;
2024         }
2025
2026         /* OK. Locate qdisc */
2027         q = qdisc_lookup(dev, qid);
2028         if (!q)
2029                 return -ENOENT;
2030
2031         /* An check that it supports classes */
2032         cops = q->ops->cl_ops;
2033         if (cops == NULL)
2034                 return -EINVAL;
2035
2036         /* Now try to get class */
2037         if (clid == 0) {
2038                 if (portid == TC_H_ROOT)
2039                         clid = qid;
2040         } else
2041                 clid = TC_H_MAKE(qid, clid);
2042
2043         if (clid)
2044                 cl = cops->find(q, clid);
2045
2046         if (cl == 0) {
2047                 err = -ENOENT;
2048                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2049                     !(n->nlmsg_flags & NLM_F_CREATE))
2050                         goto out;
2051         } else {
2052                 switch (n->nlmsg_type) {
2053                 case RTM_NEWTCLASS:
2054                         err = -EEXIST;
2055                         if (n->nlmsg_flags & NLM_F_EXCL)
2056                                 goto out;
2057                         break;
2058                 case RTM_DELTCLASS:
2059                         err = tclass_del_notify(net, cops, skb, n, q, cl);
2060                         /* Unbind the class with flilters with 0 */
2061                         tc_bind_tclass(q, portid, clid, 0);
2062                         goto out;
2063                 case RTM_GETTCLASS:
2064                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2065                         goto out;
2066                 default:
2067                         err = -EINVAL;
2068                         goto out;
2069                 }
2070         }
2071
2072         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2073                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2074                 return -EOPNOTSUPP;
2075         }
2076
2077         new_cl = cl;
2078         err = -EOPNOTSUPP;
2079         if (cops->change)
2080                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2081         if (err == 0) {
2082                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2083                 /* We just create a new class, need to do reverse binding. */
2084                 if (cl != new_cl)
2085                         tc_bind_tclass(q, portid, clid, new_cl);
2086         }
2087 out:
2088         return err;
2089 }
2090
/* Walker state used by qdisc_class_dump() while dumping a qdisc's classes. */
struct qdisc_dump_args {
        /* Must stay the first member: qdisc_class_dump() casts the walker
         * pointer it receives back to struct qdisc_dump_args.
         */
        struct qdisc_walker     w;
        /* Buffer the class messages are appended to. */
        struct sk_buff          *skb;
        /* Dump callback; supplies the port id and sequence number. */
        struct netlink_callback *cb;
};
2096
2097 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2098                             struct qdisc_walker *arg)
2099 {
2100         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2101
2102         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2103                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2104                               RTM_NEWTCLASS);
2105 }
2106
2107 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2108                                 struct tcmsg *tcm, struct netlink_callback *cb,
2109                                 int *t_p, int s_t)
2110 {
2111         struct qdisc_dump_args arg;
2112
2113         if (tc_qdisc_dump_ignore(q, false) ||
2114             *t_p < s_t || !q->ops->cl_ops ||
2115             (tcm->tcm_parent &&
2116              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2117                 (*t_p)++;
2118                 return 0;
2119         }
2120         if (*t_p > s_t)
2121                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2122         arg.w.fn = qdisc_class_dump;
2123         arg.skb = skb;
2124         arg.cb = cb;
2125         arg.w.stop  = 0;
2126         arg.w.skip = cb->args[1];
2127         arg.w.count = 0;
2128         q->ops->cl_ops->walk(q, &arg.w);
2129         cb->args[1] = arg.w.count;
2130         if (arg.w.stop)
2131                 return -1;
2132         (*t_p)++;
2133         return 0;
2134 }
2135
2136 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2137                                struct tcmsg *tcm, struct netlink_callback *cb,
2138                                int *t_p, int s_t)
2139 {
2140         struct Qdisc *q;
2141         int b;
2142
2143         if (!root)
2144                 return 0;
2145
2146         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2147                 return -1;
2148
2149         if (!qdisc_dev(root))
2150                 return 0;
2151
2152         if (tcm->tcm_parent) {
2153                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2154                 if (q && q != root &&
2155                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2156                         return -1;
2157                 return 0;
2158         }
2159         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2160                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2161                         return -1;
2162         }
2163
2164         return 0;
2165 }
2166
2167 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2168 {
2169         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2170         struct net *net = sock_net(skb->sk);
2171         struct netdev_queue *dev_queue;
2172         struct net_device *dev;
2173         int t, s_t;
2174
2175         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2176                 return 0;
2177         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2178         if (!dev)
2179                 return 0;
2180
2181         s_t = cb->args[0];
2182         t = 0;
2183
2184         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2185                 goto done;
2186
2187         dev_queue = dev_ingress_queue(dev);
2188         if (dev_queue &&
2189             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2190                                 &t, s_t) < 0)
2191                 goto done;
2192
2193 done:
2194         cb->args[0] = t;
2195
2196         dev_put(dev);
2197         return skb->len;
2198 }
2199
2200 #ifdef CONFIG_PROC_FS
2201 static int psched_show(struct seq_file *seq, void *v)
2202 {
2203         seq_printf(seq, "%08x %08x %08x %08x\n",
2204                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2205                    1000000,
2206                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2207
2208         return 0;
2209 }
2210
2211 static int __net_init psched_net_init(struct net *net)
2212 {
2213         struct proc_dir_entry *e;
2214
2215         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2216         if (e == NULL)
2217                 return -ENOMEM;
2218
2219         return 0;
2220 }
2221
/* Per-netns exit: remove the "psched" entry created by psched_net_init(). */
static void __net_exit psched_net_exit(struct net *net)
{
        remove_proc_entry("psched", net->proc_net);
}
2226 #else
/* Variant built without CONFIG_PROC_FS: no proc entry to create. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}
2231
/* Variant built without CONFIG_PROC_FS: nothing to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2235 #endif
2236
/* Per-network-namespace hooks, registered by pktsched_init(). */
static struct pernet_operations psched_net_ops = {
        .init = psched_net_init,
        .exit = psched_net_exit,
};
2241
2242 static int __init pktsched_init(void)
2243 {
2244         int err;
2245
2246         err = register_pernet_subsys(&psched_net_ops);
2247         if (err) {
2248                 pr_err("pktsched_init: "
2249                        "cannot initialize per netns operations\n");
2250                 return err;
2251         }
2252
2253         register_qdisc(&pfifo_fast_ops);
2254         register_qdisc(&pfifo_qdisc_ops);
2255         register_qdisc(&bfifo_qdisc_ops);
2256         register_qdisc(&pfifo_head_drop_qdisc_ops);
2257         register_qdisc(&mq_qdisc_ops);
2258         register_qdisc(&noqueue_qdisc_ops);
2259
2260         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2261         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2262         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2263                       0);
2264         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2265         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2266         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2267                       0);
2268
2269         return 0;
2270 }
2271
2272 subsys_initcall(pktsched_init);