Merge tag 'microblaze-v4.18-rc3' of git://git.monstr.eu/linux-2.6-microblaze
[linux-2.6-microblaze.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
39
40 /*
41
42    Short review.
43    -------------
44
45    This file consists of two interrelated parts:
46
47    1. queueing disciplines manager frontend.
48    2. traffic classes manager frontend.
49
50    Generally, queueing discipline ("qdisc") is a black box,
51    which is able to enqueue packets and to dequeue them (when
52    device is ready to send something) in order and at times
53    determined by algorithm hidden in it.
54
55    qdisc's are divided to two categories:
56    - "queues", which have no internal structure visible from outside.
57    - "schedulers", which split all the packets to "traffic classes",
58      using "packet classifiers" (look at cls_api.c)
59
60    In turn, classes may have child qdiscs (as rule, queues)
61    attached to them etc. etc. etc.
62
63    The goal of the routines in this file is to translate
64    information supplied by the user in the form of handles
65    into a form more intelligible to the kernel, to perform sanity
66    checks and the part of the work that is common to all qdiscs,
67    and to provide rtnetlink notifications.
68
69    All real intelligent work is done inside qdisc modules.
70
71
72
73    Every discipline has two major routines: enqueue and dequeue.
74
75    ---dequeue
76
77    dequeue usually returns a skb to send. It is allowed to return NULL,
78    but it does not mean that queue is empty, it just means that
79    discipline does not want to send anything this time.
80    Queue is really empty if q->q.qlen == 0.
81    For complicated disciplines with multiple queues q->q is not
82    real packet queue, but however q->q.qlen must be valid.
83
84    ---enqueue
85
86    enqueue returns 0 if the packet was enqueued successfully.
87    If a packet (this one or another one) was dropped, it returns
88    a non-zero error code.
89    NET_XMIT_DROP        - this packet dropped
90      Expected action: do not backoff, but wait until queue will clear.
91    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
92      Expected action: backoff or ignore
93
94    Auxiliary routines:
95
96    ---peek
97
98    like dequeue but without removing a packet from the queue
99
100    ---reset
101
102    returns qdisc to initial state: purge all buffers, clear all
103    timers, counters (except for statistics) etc.
104
105    ---init
106
107    initializes newly created qdisc.
108
109    ---destroy
110
111    destroys resources allocated by init and during lifetime of qdisc.
112
113    ---change
114
115    changes qdisc parameters.
116  */
117
118 /* Protects list of registered TC modules. It is pure SMP lock. */
119 static DEFINE_RWLOCK(qdisc_mod_lock);
120
121
122 /************************************************
123  *      Queueing disciplines manipulation.      *
124  ************************************************/
125
126
127 /* The list of all installed queueing disciplines. */
128
129 static struct Qdisc_ops *qdisc_base;
130
131 /* Register/unregister queueing discipline */
132
133 int register_qdisc(struct Qdisc_ops *qops)
134 {
135         struct Qdisc_ops *q, **qp;
136         int rc = -EEXIST;
137
138         write_lock(&qdisc_mod_lock);
139         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140                 if (!strcmp(qops->id, q->id))
141                         goto out;
142
143         if (qops->enqueue == NULL)
144                 qops->enqueue = noop_qdisc_ops.enqueue;
145         if (qops->peek == NULL) {
146                 if (qops->dequeue == NULL)
147                         qops->peek = noop_qdisc_ops.peek;
148                 else
149                         goto out_einval;
150         }
151         if (qops->dequeue == NULL)
152                 qops->dequeue = noop_qdisc_ops.dequeue;
153
154         if (qops->cl_ops) {
155                 const struct Qdisc_class_ops *cops = qops->cl_ops;
156
157                 if (!(cops->find && cops->walk && cops->leaf))
158                         goto out_einval;
159
160                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161                         goto out_einval;
162         }
163
164         qops->next = NULL;
165         *qp = qops;
166         rc = 0;
167 out:
168         write_unlock(&qdisc_mod_lock);
169         return rc;
170
171 out_einval:
172         rc = -EINVAL;
173         goto out;
174 }
175 EXPORT_SYMBOL(register_qdisc);
176
177 int unregister_qdisc(struct Qdisc_ops *qops)
178 {
179         struct Qdisc_ops *q, **qp;
180         int err = -ENOENT;
181
182         write_lock(&qdisc_mod_lock);
183         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
184                 if (q == qops)
185                         break;
186         if (q) {
187                 *qp = q->next;
188                 q->next = NULL;
189                 err = 0;
190         }
191         write_unlock(&qdisc_mod_lock);
192         return err;
193 }
194 EXPORT_SYMBOL(unregister_qdisc);
195
196 /* Get default qdisc if not otherwise specified */
197 void qdisc_get_default(char *name, size_t len)
198 {
199         read_lock(&qdisc_mod_lock);
200         strlcpy(name, default_qdisc_ops->id, len);
201         read_unlock(&qdisc_mod_lock);
202 }
203
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
205 {
206         struct Qdisc_ops *q = NULL;
207
208         for (q = qdisc_base; q; q = q->next) {
209                 if (!strcmp(name, q->id)) {
210                         if (!try_module_get(q->owner))
211                                 q = NULL;
212                         break;
213                 }
214         }
215
216         return q;
217 }
218
219 /* Set new default qdisc to use */
220 int qdisc_set_default(const char *name)
221 {
222         const struct Qdisc_ops *ops;
223
224         if (!capable(CAP_NET_ADMIN))
225                 return -EPERM;
226
227         write_lock(&qdisc_mod_lock);
228         ops = qdisc_lookup_default(name);
229         if (!ops) {
230                 /* Not found, drop lock and try to load module */
231                 write_unlock(&qdisc_mod_lock);
232                 request_module("sch_%s", name);
233                 write_lock(&qdisc_mod_lock);
234
235                 ops = qdisc_lookup_default(name);
236         }
237
238         if (ops) {
239                 /* Set new default */
240                 module_put(default_qdisc_ops->owner);
241                 default_qdisc_ops = ops;
242         }
243         write_unlock(&qdisc_mod_lock);
244
245         return ops ? 0 : -ENOENT;
246 }
247
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: apply the default qdisc selected in the kernel config. */
static int __init sch_default_qdisc(void)
{
	int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
256
257 /* We know handle. Find qdisc among all qdisc's attached to device
258  * (root qdisc, all its children, children of children etc.)
259  * Note: caller either uses rtnl or rcu_read_lock()
260  */
261
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 {
264         struct Qdisc *q;
265
266         if (!qdisc_dev(root))
267                 return (root->handle == handle ? root : NULL);
268
269         if (!(root->flags & TCQ_F_BUILTIN) &&
270             root->handle == handle)
271                 return root;
272
273         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(dev->qdisc, handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         dev_ingress_queue(dev)->qdisc_sleeping,
313                         handle);
314 out:
315         return q;
316 }
317
318 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
319 {
320         unsigned long cl;
321         struct Qdisc *leaf;
322         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
323
324         if (cops == NULL)
325                 return NULL;
326         cl = cops->find(p, classid);
327
328         if (cl == 0)
329                 return NULL;
330         leaf = cops->leaf(p, cl);
331         return leaf;
332 }
333
334 /* Find queueing discipline by name */
335
336 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
337 {
338         struct Qdisc_ops *q = NULL;
339
340         if (kind) {
341                 read_lock(&qdisc_mod_lock);
342                 for (q = qdisc_base; q; q = q->next) {
343                         if (nla_strcmp(kind, q->id) == 0) {
344                                 if (!try_module_get(q->owner))
345                                         q = NULL;
346                                 break;
347                         }
348                 }
349                 read_unlock(&qdisc_mod_lock);
350         }
351         return q;
352 }
353
354 /* The linklayer setting was not transferred from iproute2 in older
355  * versions, and the rate table lookup systems have been dropped in
356  * the kernel. To stay backward compatible with older iproute2 tc
357  * utils, we detect the linklayer setting by detecting whether the rate
358  * table was modified.
359  *
360  * For linklayer ATM table entries, the rate table will be aligned to
361  * 48 bytes, thus some table entries will contain the same value.  The
362  * mpu (min packet unit) is also encoded into the old rate table, thus
363  * starting from the mpu, we find low and high table entries for
364  * mapping this cell.  If these entries contain the same value, then
365  * the rate tables have been modified for linklayer ATM.
366  *
367  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
368  * and then roundup to the next cell, calc the table entry one below,
369  * and compare.
370  */
371 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
372 {
373         int low       = roundup(r->mpu, 48);
374         int high      = roundup(low+1, 48);
375         int cell_low  = low >> r->cell_log;
376         int cell_high = (high >> r->cell_log) - 1;
377
378         /* rtab is too inaccurate at rates > 100Mbit/s */
379         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
380                 pr_debug("TC linklayer: Giving up ATM detection\n");
381                 return TC_LINKLAYER_ETHERNET;
382         }
383
384         if ((cell_high > cell_low) && (cell_high < 256)
385             && (rtab[cell_low] == rtab[cell_high])) {
386                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
387                          cell_low, cell_high, rtab[cell_high]);
388                 return TC_LINKLAYER_ATM;
389         }
390         return TC_LINKLAYER_ETHERNET;
391 }
392
393 static struct qdisc_rate_table *qdisc_rtab_list;
394
395 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
396                                         struct nlattr *tab,
397                                         struct netlink_ext_ack *extack)
398 {
399         struct qdisc_rate_table *rtab;
400
401         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
402             nla_len(tab) != TC_RTAB_SIZE) {
403                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
404                 return NULL;
405         }
406
407         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
408                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
409                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
410                         rtab->refcnt++;
411                         return rtab;
412                 }
413         }
414
415         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
416         if (rtab) {
417                 rtab->rate = *r;
418                 rtab->refcnt = 1;
419                 memcpy(rtab->data, nla_data(tab), 1024);
420                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
421                         r->linklayer = __detect_linklayer(r, rtab->data);
422                 rtab->next = qdisc_rtab_list;
423                 qdisc_rtab_list = rtab;
424         } else {
425                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
426         }
427         return rtab;
428 }
429 EXPORT_SYMBOL(qdisc_get_rtab);
430
431 void qdisc_put_rtab(struct qdisc_rate_table *tab)
432 {
433         struct qdisc_rate_table *rtab, **rtabp;
434
435         if (!tab || --tab->refcnt)
436                 return;
437
438         for (rtabp = &qdisc_rtab_list;
439              (rtab = *rtabp) != NULL;
440              rtabp = &rtab->next) {
441                 if (rtab == tab) {
442                         *rtabp = rtab->next;
443                         kfree(rtab);
444                         return;
445                 }
446         }
447 }
448 EXPORT_SYMBOL(qdisc_put_rtab);
449
450 static LIST_HEAD(qdisc_stab_list);
451
452 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
453         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
454         [TCA_STAB_DATA] = { .type = NLA_BINARY },
455 };
456
457 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
458                                                struct netlink_ext_ack *extack)
459 {
460         struct nlattr *tb[TCA_STAB_MAX + 1];
461         struct qdisc_size_table *stab;
462         struct tc_sizespec *s;
463         unsigned int tsize = 0;
464         u16 *tab = NULL;
465         int err;
466
467         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
468         if (err < 0)
469                 return ERR_PTR(err);
470         if (!tb[TCA_STAB_BASE]) {
471                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
472                 return ERR_PTR(-EINVAL);
473         }
474
475         s = nla_data(tb[TCA_STAB_BASE]);
476
477         if (s->tsize > 0) {
478                 if (!tb[TCA_STAB_DATA]) {
479                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
480                         return ERR_PTR(-EINVAL);
481                 }
482                 tab = nla_data(tb[TCA_STAB_DATA]);
483                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
484         }
485
486         if (tsize != s->tsize || (!tab && tsize > 0)) {
487                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
488                 return ERR_PTR(-EINVAL);
489         }
490
491         list_for_each_entry(stab, &qdisc_stab_list, list) {
492                 if (memcmp(&stab->szopts, s, sizeof(*s)))
493                         continue;
494                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
495                         continue;
496                 stab->refcnt++;
497                 return stab;
498         }
499
500         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
501         if (!stab)
502                 return ERR_PTR(-ENOMEM);
503
504         stab->refcnt = 1;
505         stab->szopts = *s;
506         if (tsize > 0)
507                 memcpy(stab->data, tab, tsize * sizeof(u16));
508
509         list_add_tail(&stab->list, &qdisc_stab_list);
510
511         return stab;
512 }
513
514 static void stab_kfree_rcu(struct rcu_head *head)
515 {
516         kfree(container_of(head, struct qdisc_size_table, rcu));
517 }
518
519 void qdisc_put_stab(struct qdisc_size_table *tab)
520 {
521         if (!tab)
522                 return;
523
524         if (--tab->refcnt == 0) {
525                 list_del(&tab->list);
526                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
527         }
528 }
529 EXPORT_SYMBOL(qdisc_put_stab);
530
531 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
532 {
533         struct nlattr *nest;
534
535         nest = nla_nest_start(skb, TCA_STAB);
536         if (nest == NULL)
537                 goto nla_put_failure;
538         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
539                 goto nla_put_failure;
540         nla_nest_end(skb, nest);
541
542         return skb->len;
543
544 nla_put_failure:
545         return -1;
546 }
547
548 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
549                                const struct qdisc_size_table *stab)
550 {
551         int pkt_len, slot;
552
553         pkt_len = skb->len + stab->szopts.overhead;
554         if (unlikely(!stab->szopts.tsize))
555                 goto out;
556
557         slot = pkt_len + stab->szopts.cell_align;
558         if (unlikely(slot < 0))
559                 slot = 0;
560
561         slot >>= stab->szopts.cell_log;
562         if (likely(slot < stab->szopts.tsize))
563                 pkt_len = stab->data[slot];
564         else
565                 pkt_len = stab->data[stab->szopts.tsize - 1] *
566                                 (slot / stab->szopts.tsize) +
567                                 stab->data[slot % stab->szopts.tsize];
568
569         pkt_len <<= stab->szopts.size_log;
570 out:
571         if (unlikely(pkt_len < 1))
572                 pkt_len = 1;
573         qdisc_skb_cb(skb)->pkt_len = pkt_len;
574 }
575 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
576
577 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
578 {
579         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
580                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
581                         txt, qdisc->ops->id, qdisc->handle >> 16);
582                 qdisc->flags |= TCQ_F_WARN_NONWC;
583         }
584 }
585 EXPORT_SYMBOL(qdisc_warn_nonwc);
586
587 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
588 {
589         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
590                                                  timer);
591
592         rcu_read_lock();
593         __netif_schedule(qdisc_root(wd->qdisc));
594         rcu_read_unlock();
595
596         return HRTIMER_NORESTART;
597 }
598
599 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
600 {
601         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
602         wd->timer.function = qdisc_watchdog;
603         wd->qdisc = qdisc;
604 }
605 EXPORT_SYMBOL(qdisc_watchdog_init);
606
607 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
608 {
609         if (test_bit(__QDISC_STATE_DEACTIVATED,
610                      &qdisc_root_sleeping(wd->qdisc)->state))
611                 return;
612
613         if (wd->last_expires == expires)
614                 return;
615
616         wd->last_expires = expires;
617         hrtimer_start(&wd->timer,
618                       ns_to_ktime(expires),
619                       HRTIMER_MODE_ABS_PINNED);
620 }
621 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
622
623 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
624 {
625         hrtimer_cancel(&wd->timer);
626 }
627 EXPORT_SYMBOL(qdisc_watchdog_cancel);
628
629 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
630 {
631         struct hlist_head *h;
632         unsigned int i;
633
634         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
635
636         if (h != NULL) {
637                 for (i = 0; i < n; i++)
638                         INIT_HLIST_HEAD(&h[i]);
639         }
640         return h;
641 }
642
643 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
644 {
645         struct Qdisc_class_common *cl;
646         struct hlist_node *next;
647         struct hlist_head *nhash, *ohash;
648         unsigned int nsize, nmask, osize;
649         unsigned int i, h;
650
651         /* Rehash when load factor exceeds 0.75 */
652         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
653                 return;
654         nsize = clhash->hashsize * 2;
655         nmask = nsize - 1;
656         nhash = qdisc_class_hash_alloc(nsize);
657         if (nhash == NULL)
658                 return;
659
660         ohash = clhash->hash;
661         osize = clhash->hashsize;
662
663         sch_tree_lock(sch);
664         for (i = 0; i < osize; i++) {
665                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
666                         h = qdisc_class_hash(cl->classid, nmask);
667                         hlist_add_head(&cl->hnode, &nhash[h]);
668                 }
669         }
670         clhash->hash     = nhash;
671         clhash->hashsize = nsize;
672         clhash->hashmask = nmask;
673         sch_tree_unlock(sch);
674
675         kvfree(ohash);
676 }
677 EXPORT_SYMBOL(qdisc_class_hash_grow);
678
679 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
680 {
681         unsigned int size = 4;
682
683         clhash->hash = qdisc_class_hash_alloc(size);
684         if (!clhash->hash)
685                 return -ENOMEM;
686         clhash->hashsize  = size;
687         clhash->hashmask  = size - 1;
688         clhash->hashelems = 0;
689         return 0;
690 }
691 EXPORT_SYMBOL(qdisc_class_hash_init);
692
693 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
694 {
695         kvfree(clhash->hash);
696 }
697 EXPORT_SYMBOL(qdisc_class_hash_destroy);
698
699 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
700                              struct Qdisc_class_common *cl)
701 {
702         unsigned int h;
703
704         INIT_HLIST_NODE(&cl->hnode);
705         h = qdisc_class_hash(cl->classid, clhash->hashmask);
706         hlist_add_head(&cl->hnode, &clhash->hash[h]);
707         clhash->hashelems++;
708 }
709 EXPORT_SYMBOL(qdisc_class_hash_insert);
710
711 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
712                              struct Qdisc_class_common *cl)
713 {
714         hlist_del(&cl->hnode);
715         clhash->hashelems--;
716 }
717 EXPORT_SYMBOL(qdisc_class_hash_remove);
718
719 /* Allocate an unique handle from space managed by kernel
720  * Possible range is [8000-FFFF]:0000 (0x8000 values)
721  */
722 static u32 qdisc_alloc_handle(struct net_device *dev)
723 {
724         int i = 0x8000;
725         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
726
727         do {
728                 autohandle += TC_H_MAKE(0x10000U, 0);
729                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
730                         autohandle = TC_H_MAKE(0x80000000U, 0);
731                 if (!qdisc_lookup(dev, autohandle))
732                         return autohandle;
733                 cond_resched();
734         } while (--i > 0);
735
736         return 0;
737 }
738
739 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
740                                unsigned int len)
741 {
742         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
743         const struct Qdisc_class_ops *cops;
744         unsigned long cl;
745         u32 parentid;
746         bool notify;
747         int drops;
748
749         if (n == 0 && len == 0)
750                 return;
751         drops = max_t(int, n, 0);
752         rcu_read_lock();
753         while ((parentid = sch->parent)) {
754                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
755                         break;
756
757                 if (sch->flags & TCQ_F_NOPARENT)
758                         break;
759                 /* Notify parent qdisc only if child qdisc becomes empty.
760                  *
761                  * If child was empty even before update then backlog
762                  * counter is screwed and we skip notification because
763                  * parent class is already passive.
764                  *
765                  * If the original child was offloaded then it is allowed
766                  * to be seem as empty, so the parent is notified anyway.
767                  */
768                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
769                                                        !qdisc_is_offloaded);
770                 /* TODO: perform the search on a per txq basis */
771                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
772                 if (sch == NULL) {
773                         WARN_ON_ONCE(parentid != TC_H_ROOT);
774                         break;
775                 }
776                 cops = sch->ops->cl_ops;
777                 if (notify && cops->qlen_notify) {
778                         cl = cops->find(sch, parentid);
779                         cops->qlen_notify(sch, cl);
780                 }
781                 sch->q.qlen -= n;
782                 sch->qstats.backlog -= len;
783                 __qdisc_qstats_drop(sch, drops);
784         }
785         rcu_read_unlock();
786 }
787 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
788
789 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
790                          u32 portid, u32 seq, u16 flags, int event)
791 {
792         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
793         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
794         struct tcmsg *tcm;
795         struct nlmsghdr  *nlh;
796         unsigned char *b = skb_tail_pointer(skb);
797         struct gnet_dump d;
798         struct qdisc_size_table *stab;
799         u32 block_index;
800         __u32 qlen;
801
802         cond_resched();
803         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
804         if (!nlh)
805                 goto out_nlmsg_trim;
806         tcm = nlmsg_data(nlh);
807         tcm->tcm_family = AF_UNSPEC;
808         tcm->tcm__pad1 = 0;
809         tcm->tcm__pad2 = 0;
810         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
811         tcm->tcm_parent = clid;
812         tcm->tcm_handle = q->handle;
813         tcm->tcm_info = refcount_read(&q->refcnt);
814         if (nla_put_string(skb, TCA_KIND, q->ops->id))
815                 goto nla_put_failure;
816         if (q->ops->ingress_block_get) {
817                 block_index = q->ops->ingress_block_get(q);
818                 if (block_index &&
819                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
820                         goto nla_put_failure;
821         }
822         if (q->ops->egress_block_get) {
823                 block_index = q->ops->egress_block_get(q);
824                 if (block_index &&
825                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
826                         goto nla_put_failure;
827         }
828         if (q->ops->dump && q->ops->dump(q, skb) < 0)
829                 goto nla_put_failure;
830         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
831                 goto nla_put_failure;
832         qlen = qdisc_qlen_sum(q);
833
834         stab = rtnl_dereference(q->stab);
835         if (stab && qdisc_dump_stab(skb, stab) < 0)
836                 goto nla_put_failure;
837
838         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
839                                          NULL, &d, TCA_PAD) < 0)
840                 goto nla_put_failure;
841
842         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
843                 goto nla_put_failure;
844
845         if (qdisc_is_percpu_stats(q)) {
846                 cpu_bstats = q->cpu_bstats;
847                 cpu_qstats = q->cpu_qstats;
848         }
849
850         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
851                                   &d, cpu_bstats, &q->bstats) < 0 ||
852             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
853             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
854                 goto nla_put_failure;
855
856         if (gnet_stats_finish_copy(&d) < 0)
857                 goto nla_put_failure;
858
859         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
860         return skb->len;
861
862 out_nlmsg_trim:
863 nla_put_failure:
864         nlmsg_trim(skb, b);
865         return -1;
866 }
867
868 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
869 {
870         if (q->flags & TCQ_F_BUILTIN)
871                 return true;
872         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
873                 return true;
874
875         return false;
876 }
877
878 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
879                         struct nlmsghdr *n, u32 clid,
880                         struct Qdisc *old, struct Qdisc *new)
881 {
882         struct sk_buff *skb;
883         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
884
885         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
886         if (!skb)
887                 return -ENOBUFS;
888
889         if (old && !tc_qdisc_dump_ignore(old, false)) {
890                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
891                                   0, RTM_DELQDISC) < 0)
892                         goto err_out;
893         }
894         if (new && !tc_qdisc_dump_ignore(new, false)) {
895                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
896                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
897                         goto err_out;
898         }
899
900         if (skb->len)
901                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
902                                       n->nlmsg_flags & NLM_F_ECHO);
903
904 err_out:
905         kfree_skb(skb);
906         return -EINVAL;
907 }
908
909 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
910                                struct nlmsghdr *n, u32 clid,
911                                struct Qdisc *old, struct Qdisc *new)
912 {
913         if (new || old)
914                 qdisc_notify(net, skb, n, clid, old, new);
915
916         if (old)
917                 qdisc_destroy(old);
918 }
919
920 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
921  * to device "dev".
922  *
923  * When appropriate send a netlink notification using 'skb'
924  * and "n".
925  *
926  * On success, destroy old qdisc.
927  */
928
929 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
930                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
931                        struct Qdisc *new, struct Qdisc *old,
932                        struct netlink_ext_ack *extack)
933 {
934         struct Qdisc *q = old;
935         struct net *net = dev_net(dev);
936         int err = 0;
937
938         if (parent == NULL) {
939                 unsigned int i, num_q, ingress;
940
941                 ingress = 0;
942                 num_q = dev->num_tx_queues;
943                 if ((q && q->flags & TCQ_F_INGRESS) ||
944                     (new && new->flags & TCQ_F_INGRESS)) {
945                         num_q = 1;
946                         ingress = 1;
947                         if (!dev_ingress_queue(dev)) {
948                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
949                                 return -ENOENT;
950                         }
951                 }
952
953                 if (dev->flags & IFF_UP)
954                         dev_deactivate(dev);
955
956                 if (new && new->ops->attach)
957                         goto skip;
958
959                 for (i = 0; i < num_q; i++) {
960                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
961
962                         if (!ingress)
963                                 dev_queue = netdev_get_tx_queue(dev, i);
964
965                         old = dev_graft_qdisc(dev_queue, new);
966                         if (new && i > 0)
967                                 qdisc_refcount_inc(new);
968
969                         if (!ingress)
970                                 qdisc_destroy(old);
971                 }
972
973 skip:
974                 if (!ingress) {
975                         notify_and_destroy(net, skb, n, classid,
976                                            dev->qdisc, new);
977                         if (new && !new->ops->attach)
978                                 qdisc_refcount_inc(new);
979                         dev->qdisc = new ? : &noop_qdisc;
980
981                         if (new && new->ops->attach)
982                                 new->ops->attach(new);
983                 } else {
984                         notify_and_destroy(net, skb, n, classid, old, new);
985                 }
986
987                 if (dev->flags & IFF_UP)
988                         dev_activate(dev);
989         } else {
990                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
991
992                 /* Only support running class lockless if parent is lockless */
993                 if (new && (new->flags & TCQ_F_NOLOCK) &&
994                     parent && !(parent->flags & TCQ_F_NOLOCK))
995                         new->flags &= ~TCQ_F_NOLOCK;
996
997                 err = -EOPNOTSUPP;
998                 if (cops && cops->graft) {
999                         unsigned long cl = cops->find(parent, classid);
1000
1001                         if (cl) {
1002                                 err = cops->graft(parent, cl, new, &old,
1003                                                   extack);
1004                         } else {
1005                                 NL_SET_ERR_MSG(extack, "Specified class not found");
1006                                 err = -ENOENT;
1007                         }
1008                 }
1009                 if (!err)
1010                         notify_and_destroy(net, skb, n, classid, old, new);
1011         }
1012         return err;
1013 }
1014
1015 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1016                                    struct netlink_ext_ack *extack)
1017 {
1018         u32 block_index;
1019
1020         if (tca[TCA_INGRESS_BLOCK]) {
1021                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1022
1023                 if (!block_index) {
1024                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1025                         return -EINVAL;
1026                 }
1027                 if (!sch->ops->ingress_block_set) {
1028                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1029                         return -EOPNOTSUPP;
1030                 }
1031                 sch->ops->ingress_block_set(sch, block_index);
1032         }
1033         if (tca[TCA_EGRESS_BLOCK]) {
1034                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1035
1036                 if (!block_index) {
1037                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1038                         return -EINVAL;
1039                 }
1040                 if (!sch->ops->egress_block_set) {
1041                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1042                         return -EOPNOTSUPP;
1043                 }
1044                 sch->ops->egress_block_set(sch, block_index);
1045         }
1046         return 0;
1047 }
1048
1049 /* lockdep annotation is needed for ingress; egress gets it only for name */
1050 static struct lock_class_key qdisc_tx_lock;
1051 static struct lock_class_key qdisc_rx_lock;
1052
1053 /*
1054    Allocate and initialize new qdisc.
1055
1056    Parameters are passed via opt.
1057  */
1058
1059 static struct Qdisc *qdisc_create(struct net_device *dev,
1060                                   struct netdev_queue *dev_queue,
1061                                   struct Qdisc *p, u32 parent, u32 handle,
1062                                   struct nlattr **tca, int *errp,
1063                                   struct netlink_ext_ack *extack)
1064 {
1065         int err;
1066         struct nlattr *kind = tca[TCA_KIND];
1067         struct Qdisc *sch;
1068         struct Qdisc_ops *ops;
1069         struct qdisc_size_table *stab;
1070
1071         ops = qdisc_lookup_ops(kind);
1072 #ifdef CONFIG_MODULES
1073         if (ops == NULL && kind != NULL) {
1074                 char name[IFNAMSIZ];
1075                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1076                         /* We dropped the RTNL semaphore in order to
1077                          * perform the module load.  So, even if we
1078                          * succeeded in loading the module we have to
1079                          * tell the caller to replay the request.  We
1080                          * indicate this using -EAGAIN.
1081                          * We replay the request because the device may
1082                          * go away in the mean time.
1083                          */
1084                         rtnl_unlock();
1085                         request_module("sch_%s", name);
1086                         rtnl_lock();
1087                         ops = qdisc_lookup_ops(kind);
1088                         if (ops != NULL) {
1089                                 /* We will try again qdisc_lookup_ops,
1090                                  * so don't keep a reference.
1091                                  */
1092                                 module_put(ops->owner);
1093                                 err = -EAGAIN;
1094                                 goto err_out;
1095                         }
1096                 }
1097         }
1098 #endif
1099
1100         err = -ENOENT;
1101         if (!ops) {
1102                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1103                 goto err_out;
1104         }
1105
1106         sch = qdisc_alloc(dev_queue, ops, extack);
1107         if (IS_ERR(sch)) {
1108                 err = PTR_ERR(sch);
1109                 goto err_out2;
1110         }
1111
1112         sch->parent = parent;
1113
1114         if (handle == TC_H_INGRESS) {
1115                 sch->flags |= TCQ_F_INGRESS;
1116                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1117                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
1118         } else {
1119                 if (handle == 0) {
1120                         handle = qdisc_alloc_handle(dev);
1121                         err = -ENOMEM;
1122                         if (handle == 0)
1123                                 goto err_out3;
1124                 }
1125                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1126                 if (!netif_is_multiqueue(dev))
1127                         sch->flags |= TCQ_F_ONETXQUEUE;
1128         }
1129
1130         sch->handle = handle;
1131
1132         /* This exist to keep backward compatible with a userspace
1133          * loophole, what allowed userspace to get IFF_NO_QUEUE
1134          * facility on older kernels by setting tx_queue_len=0 (prior
1135          * to qdisc init), and then forgot to reinit tx_queue_len
1136          * before again attaching a qdisc.
1137          */
1138         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1139                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1140                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1141         }
1142
1143         err = qdisc_block_indexes_set(sch, tca, extack);
1144         if (err)
1145                 goto err_out3;
1146
1147         if (ops->init) {
1148                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1149                 if (err != 0)
1150                         goto err_out5;
1151         }
1152
1153         if (tca[TCA_STAB]) {
1154                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1155                 if (IS_ERR(stab)) {
1156                         err = PTR_ERR(stab);
1157                         goto err_out4;
1158                 }
1159                 rcu_assign_pointer(sch->stab, stab);
1160         }
1161         if (tca[TCA_RATE]) {
1162                 seqcount_t *running;
1163
1164                 err = -EOPNOTSUPP;
1165                 if (sch->flags & TCQ_F_MQROOT) {
1166                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1167                         goto err_out4;
1168                 }
1169
1170                 if (sch->parent != TC_H_ROOT &&
1171                     !(sch->flags & TCQ_F_INGRESS) &&
1172                     (!p || !(p->flags & TCQ_F_MQROOT)))
1173                         running = qdisc_root_sleeping_running(sch);
1174                 else
1175                         running = &sch->running;
1176
1177                 err = gen_new_estimator(&sch->bstats,
1178                                         sch->cpu_bstats,
1179                                         &sch->rate_est,
1180                                         NULL,
1181                                         running,
1182                                         tca[TCA_RATE]);
1183                 if (err) {
1184                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1185                         goto err_out4;
1186                 }
1187         }
1188
1189         qdisc_hash_add(sch, false);
1190
1191         return sch;
1192
1193 err_out5:
1194         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1195         if (ops->destroy)
1196                 ops->destroy(sch);
1197 err_out3:
1198         dev_put(dev);
1199         qdisc_free(sch);
1200 err_out2:
1201         module_put(ops->owner);
1202 err_out:
1203         *errp = err;
1204         return NULL;
1205
1206 err_out4:
1207         /*
1208          * Any broken qdiscs that would require a ops->reset() here?
1209          * The qdisc was never in action so it shouldn't be necessary.
1210          */
1211         qdisc_put_stab(rtnl_dereference(sch->stab));
1212         if (ops->destroy)
1213                 ops->destroy(sch);
1214         goto err_out3;
1215 }
1216
1217 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1218                         struct netlink_ext_ack *extack)
1219 {
1220         struct qdisc_size_table *ostab, *stab = NULL;
1221         int err = 0;
1222
1223         if (tca[TCA_OPTIONS]) {
1224                 if (!sch->ops->change) {
1225                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1226                         return -EINVAL;
1227                 }
1228                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1229                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1230                         return -EOPNOTSUPP;
1231                 }
1232                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1233                 if (err)
1234                         return err;
1235         }
1236
1237         if (tca[TCA_STAB]) {
1238                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1239                 if (IS_ERR(stab))
1240                         return PTR_ERR(stab);
1241         }
1242
1243         ostab = rtnl_dereference(sch->stab);
1244         rcu_assign_pointer(sch->stab, stab);
1245         qdisc_put_stab(ostab);
1246
1247         if (tca[TCA_RATE]) {
1248                 /* NB: ignores errors from replace_estimator
1249                    because change can't be undone. */
1250                 if (sch->flags & TCQ_F_MQROOT)
1251                         goto out;
1252                 gen_replace_estimator(&sch->bstats,
1253                                       sch->cpu_bstats,
1254                                       &sch->rate_est,
1255                                       NULL,
1256                                       qdisc_root_sleeping_running(sch),
1257                                       tca[TCA_RATE]);
1258         }
1259 out:
1260         return 0;
1261 }
1262
1263 struct check_loop_arg {
1264         struct qdisc_walker     w;
1265         struct Qdisc            *p;
1266         int                     depth;
1267 };
1268
1269 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1270                          struct qdisc_walker *w);
1271
1272 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1273 {
1274         struct check_loop_arg   arg;
1275
1276         if (q->ops->cl_ops == NULL)
1277                 return 0;
1278
1279         arg.w.stop = arg.w.skip = arg.w.count = 0;
1280         arg.w.fn = check_loop_fn;
1281         arg.depth = depth;
1282         arg.p = p;
1283         q->ops->cl_ops->walk(q, &arg.w);
1284         return arg.w.stop ? -ELOOP : 0;
1285 }
1286
1287 static int
1288 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1289 {
1290         struct Qdisc *leaf;
1291         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1292         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1293
1294         leaf = cops->leaf(q, cl);
1295         if (leaf) {
1296                 if (leaf == arg->p || arg->depth > 7)
1297                         return -ELOOP;
1298                 return check_loop(leaf, arg->p, arg->depth + 1);
1299         }
1300         return 0;
1301 }
1302
1303 /*
1304  * Delete/get qdisc.
1305  */
1306
1307 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1308                         struct netlink_ext_ack *extack)
1309 {
1310         struct net *net = sock_net(skb->sk);
1311         struct tcmsg *tcm = nlmsg_data(n);
1312         struct nlattr *tca[TCA_MAX + 1];
1313         struct net_device *dev;
1314         u32 clid;
1315         struct Qdisc *q = NULL;
1316         struct Qdisc *p = NULL;
1317         int err;
1318
1319         if ((n->nlmsg_type != RTM_GETQDISC) &&
1320             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1321                 return -EPERM;
1322
1323         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1324         if (err < 0)
1325                 return err;
1326
1327         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1328         if (!dev)
1329                 return -ENODEV;
1330
1331         clid = tcm->tcm_parent;
1332         if (clid) {
1333                 if (clid != TC_H_ROOT) {
1334                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1335                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1336                                 if (!p) {
1337                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1338                                         return -ENOENT;
1339                                 }
1340                                 q = qdisc_leaf(p, clid);
1341                         } else if (dev_ingress_queue(dev)) {
1342                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1343                         }
1344                 } else {
1345                         q = dev->qdisc;
1346                 }
1347                 if (!q) {
1348                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1349                         return -ENOENT;
1350                 }
1351
1352                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1353                         NL_SET_ERR_MSG(extack, "Invalid handle");
1354                         return -EINVAL;
1355                 }
1356         } else {
1357                 q = qdisc_lookup(dev, tcm->tcm_handle);
1358                 if (!q) {
1359                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1360                         return -ENOENT;
1361                 }
1362         }
1363
1364         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1365                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1366                 return -EINVAL;
1367         }
1368
1369         if (n->nlmsg_type == RTM_DELQDISC) {
1370                 if (!clid) {
1371                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1372                         return -EINVAL;
1373                 }
1374                 if (q->handle == 0) {
1375                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1376                         return -ENOENT;
1377                 }
1378                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1379                 if (err != 0)
1380                         return err;
1381         } else {
1382                 qdisc_notify(net, skb, n, clid, NULL, q);
1383         }
1384         return 0;
1385 }
1386
1387 /*
1388  * Create/change qdisc.
1389  */
1390
1391 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1392                            struct netlink_ext_ack *extack)
1393 {
1394         struct net *net = sock_net(skb->sk);
1395         struct tcmsg *tcm;
1396         struct nlattr *tca[TCA_MAX + 1];
1397         struct net_device *dev;
1398         u32 clid;
1399         struct Qdisc *q, *p;
1400         int err;
1401
1402         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1403                 return -EPERM;
1404
1405 replay:
1406         /* Reinit, just in case something touches this. */
1407         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1408         if (err < 0)
1409                 return err;
1410
1411         tcm = nlmsg_data(n);
1412         clid = tcm->tcm_parent;
1413         q = p = NULL;
1414
1415         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1416         if (!dev)
1417                 return -ENODEV;
1418
1419
1420         if (clid) {
1421                 if (clid != TC_H_ROOT) {
1422                         if (clid != TC_H_INGRESS) {
1423                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1424                                 if (!p) {
1425                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1426                                         return -ENOENT;
1427                                 }
1428                                 q = qdisc_leaf(p, clid);
1429                         } else if (dev_ingress_queue_create(dev)) {
1430                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1431                         }
1432                 } else {
1433                         q = dev->qdisc;
1434                 }
1435
1436                 /* It may be default qdisc, ignore it */
1437                 if (q && q->handle == 0)
1438                         q = NULL;
1439
1440                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1441                         if (tcm->tcm_handle) {
1442                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1443                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1444                                         return -EEXIST;
1445                                 }
1446                                 if (TC_H_MIN(tcm->tcm_handle)) {
1447                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1448                                         return -EINVAL;
1449                                 }
1450                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1451                                 if (!q)
1452                                         goto create_n_graft;
1453                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1454                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1455                                         return -EEXIST;
1456                                 }
1457                                 if (tca[TCA_KIND] &&
1458                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1459                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1460                                         return -EINVAL;
1461                                 }
1462                                 if (q == p ||
1463                                     (p && check_loop(q, p, 0))) {
1464                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1465                                         return -ELOOP;
1466                                 }
1467                                 qdisc_refcount_inc(q);
1468                                 goto graft;
1469                         } else {
1470                                 if (!q)
1471                                         goto create_n_graft;
1472
1473                                 /* This magic test requires explanation.
1474                                  *
1475                                  *   We know, that some child q is already
1476                                  *   attached to this parent and have choice:
1477                                  *   either to change it or to create/graft new one.
1478                                  *
1479                                  *   1. We are allowed to create/graft only
1480                                  *   if CREATE and REPLACE flags are set.
1481                                  *
1482                                  *   2. If EXCL is set, requestor wanted to say,
1483                                  *   that qdisc tcm_handle is not expected
1484                                  *   to exist, so that we choose create/graft too.
1485                                  *
1486                                  *   3. The last case is when no flags are set.
1487                                  *   Alas, it is sort of hole in API, we
1488                                  *   cannot decide what to do unambiguously.
1489                                  *   For now we select create/graft, if
1490                                  *   user gave KIND, which does not match existing.
1491                                  */
1492                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1493                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1494                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1495                                      (tca[TCA_KIND] &&
1496                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1497                                         goto create_n_graft;
1498                         }
1499                 }
1500         } else {
1501                 if (!tcm->tcm_handle) {
1502                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1503                         return -EINVAL;
1504                 }
1505                 q = qdisc_lookup(dev, tcm->tcm_handle);
1506         }
1507
1508         /* Change qdisc parameters */
1509         if (!q) {
1510                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1511                 return -ENOENT;
1512         }
1513         if (n->nlmsg_flags & NLM_F_EXCL) {
1514                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1515                 return -EEXIST;
1516         }
1517         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1518                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1519                 return -EINVAL;
1520         }
1521         err = qdisc_change(q, tca, extack);
1522         if (err == 0)
1523                 qdisc_notify(net, skb, n, clid, NULL, q);
1524         return err;
1525
1526 create_n_graft:
1527         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1528                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1529                 return -ENOENT;
1530         }
1531         if (clid == TC_H_INGRESS) {
1532                 if (dev_ingress_queue(dev)) {
1533                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1534                                          tcm->tcm_parent, tcm->tcm_parent,
1535                                          tca, &err, extack);
1536                 } else {
1537                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1538                         err = -ENOENT;
1539                 }
1540         } else {
1541                 struct netdev_queue *dev_queue;
1542
1543                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1544                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1545                 else if (p)
1546                         dev_queue = p->dev_queue;
1547                 else
1548                         dev_queue = netdev_get_tx_queue(dev, 0);
1549
1550                 q = qdisc_create(dev, dev_queue, p,
1551                                  tcm->tcm_parent, tcm->tcm_handle,
1552                                  tca, &err, extack);
1553         }
1554         if (q == NULL) {
1555                 if (err == -EAGAIN)
1556                         goto replay;
1557                 return err;
1558         }
1559
1560 graft:
1561         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1562         if (err) {
1563                 if (q)
1564                         qdisc_destroy(q);
1565                 return err;
1566         }
1567
1568         return 0;
1569 }
1570
1571 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1572                               struct netlink_callback *cb,
1573                               int *q_idx_p, int s_q_idx, bool recur,
1574                               bool dump_invisible)
1575 {
1576         int ret = 0, q_idx = *q_idx_p;
1577         struct Qdisc *q;
1578         int b;
1579
1580         if (!root)
1581                 return 0;
1582
1583         q = root;
1584         if (q_idx < s_q_idx) {
1585                 q_idx++;
1586         } else {
1587                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1588                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1589                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1590                                   RTM_NEWQDISC) <= 0)
1591                         goto done;
1592                 q_idx++;
1593         }
1594
1595         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1596          * itself has already been dumped.
1597          *
1598          * If we've already dumped the top-level (ingress) qdisc above and the global
1599          * qdisc hashtable, we don't want to hit it again
1600          */
1601         if (!qdisc_dev(root) || !recur)
1602                 goto out;
1603
1604         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1605                 if (q_idx < s_q_idx) {
1606                         q_idx++;
1607                         continue;
1608                 }
1609                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1610                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1611                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1612                                   RTM_NEWQDISC) <= 0)
1613                         goto done;
1614                 q_idx++;
1615         }
1616
1617 out:
1618         *q_idx_p = q_idx;
1619         return ret;
1620 done:
1621         ret = -1;
1622         goto out;
1623 }
1624
1625 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1626 {
1627         struct net *net = sock_net(skb->sk);
1628         int idx, q_idx;
1629         int s_idx, s_q_idx;
1630         struct net_device *dev;
1631         const struct nlmsghdr *nlh = cb->nlh;
1632         struct nlattr *tca[TCA_MAX + 1];
1633         int err;
1634
1635         s_idx = cb->args[0];
1636         s_q_idx = q_idx = cb->args[1];
1637
1638         idx = 0;
1639         ASSERT_RTNL();
1640
1641         err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
1642         if (err < 0)
1643                 return err;
1644
1645         for_each_netdev(net, dev) {
1646                 struct netdev_queue *dev_queue;
1647
1648                 if (idx < s_idx)
1649                         goto cont;
1650                 if (idx > s_idx)
1651                         s_q_idx = 0;
1652                 q_idx = 0;
1653
1654                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1655                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1656                         goto done;
1657
1658                 dev_queue = dev_ingress_queue(dev);
1659                 if (dev_queue &&
1660                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1661                                        &q_idx, s_q_idx, false,
1662                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1663                         goto done;
1664
1665 cont:
1666                 idx++;
1667         }
1668
1669 done:
1670         cb->args[0] = idx;
1671         cb->args[1] = q_idx;
1672
1673         return skb->len;
1674 }
1675
1676
1677
1678 /************************************************
1679  *      Traffic classes manipulation.           *
1680  ************************************************/
1681
1682 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1683                           unsigned long cl,
1684                           u32 portid, u32 seq, u16 flags, int event)
1685 {
1686         struct tcmsg *tcm;
1687         struct nlmsghdr  *nlh;
1688         unsigned char *b = skb_tail_pointer(skb);
1689         struct gnet_dump d;
1690         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1691
1692         cond_resched();
1693         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1694         if (!nlh)
1695                 goto out_nlmsg_trim;
1696         tcm = nlmsg_data(nlh);
1697         tcm->tcm_family = AF_UNSPEC;
1698         tcm->tcm__pad1 = 0;
1699         tcm->tcm__pad2 = 0;
1700         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1701         tcm->tcm_parent = q->handle;
1702         tcm->tcm_handle = q->handle;
1703         tcm->tcm_info = 0;
1704         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1705                 goto nla_put_failure;
1706         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1707                 goto nla_put_failure;
1708
1709         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1710                                          NULL, &d, TCA_PAD) < 0)
1711                 goto nla_put_failure;
1712
1713         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1714                 goto nla_put_failure;
1715
1716         if (gnet_stats_finish_copy(&d) < 0)
1717                 goto nla_put_failure;
1718
1719         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1720         return skb->len;
1721
1722 out_nlmsg_trim:
1723 nla_put_failure:
1724         nlmsg_trim(skb, b);
1725         return -1;
1726 }
1727
1728 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1729                          struct nlmsghdr *n, struct Qdisc *q,
1730                          unsigned long cl, int event)
1731 {
1732         struct sk_buff *skb;
1733         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1734
1735         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1736         if (!skb)
1737                 return -ENOBUFS;
1738
1739         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1740                 kfree_skb(skb);
1741                 return -EINVAL;
1742         }
1743
1744         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1745                               n->nlmsg_flags & NLM_F_ECHO);
1746 }
1747
1748 static int tclass_del_notify(struct net *net,
1749                              const struct Qdisc_class_ops *cops,
1750                              struct sk_buff *oskb, struct nlmsghdr *n,
1751                              struct Qdisc *q, unsigned long cl)
1752 {
1753         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1754         struct sk_buff *skb;
1755         int err = 0;
1756
1757         if (!cops->delete)
1758                 return -EOPNOTSUPP;
1759
1760         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1761         if (!skb)
1762                 return -ENOBUFS;
1763
1764         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1765                            RTM_DELTCLASS) < 0) {
1766                 kfree_skb(skb);
1767                 return -EINVAL;
1768         }
1769
1770         err = cops->delete(q, cl);
1771         if (err) {
1772                 kfree_skb(skb);
1773                 return err;
1774         }
1775
1776         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1777                               n->nlmsg_flags & NLM_F_ECHO);
1778 }
1779
1780 #ifdef CONFIG_NET_CLS
1781
1782 struct tcf_bind_args {
1783         struct tcf_walker w;
1784         u32 classid;
1785         unsigned long cl;
1786 };
1787
1788 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1789 {
1790         struct tcf_bind_args *a = (void *)arg;
1791
1792         if (tp->ops->bind_class) {
1793                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1794
1795                 sch_tree_lock(q);
1796                 tp->ops->bind_class(n, a->classid, a->cl);
1797                 sch_tree_unlock(q);
1798         }
1799         return 0;
1800 }
1801
1802 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1803                            unsigned long new_cl)
1804 {
1805         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1806         struct tcf_block *block;
1807         struct tcf_chain *chain;
1808         unsigned long cl;
1809
1810         cl = cops->find(q, portid);
1811         if (!cl)
1812                 return;
1813         block = cops->tcf_block(q, cl, NULL);
1814         if (!block)
1815                 return;
1816         list_for_each_entry(chain, &block->chain_list, list) {
1817                 struct tcf_proto *tp;
1818
1819                 for (tp = rtnl_dereference(chain->filter_chain);
1820                      tp; tp = rtnl_dereference(tp->next)) {
1821                         struct tcf_bind_args arg = {};
1822
1823                         arg.w.fn = tcf_node_bind;
1824                         arg.classid = clid;
1825                         arg.cl = new_cl;
1826                         tp->ops->walk(tp, &arg.w);
1827                 }
1828         }
1829 }
1830
1831 #else
1832
1833 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1834                            unsigned long new_cl)
1835 {
1836 }
1837
1838 #endif
1839
1840 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1841                          struct netlink_ext_ack *extack)
1842 {
1843         struct net *net = sock_net(skb->sk);
1844         struct tcmsg *tcm = nlmsg_data(n);
1845         struct nlattr *tca[TCA_MAX + 1];
1846         struct net_device *dev;
1847         struct Qdisc *q = NULL;
1848         const struct Qdisc_class_ops *cops;
1849         unsigned long cl = 0;
1850         unsigned long new_cl;
1851         u32 portid;
1852         u32 clid;
1853         u32 qid;
1854         int err;
1855
1856         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1857             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1858                 return -EPERM;
1859
1860         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1861         if (err < 0)
1862                 return err;
1863
1864         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1865         if (!dev)
1866                 return -ENODEV;
1867
1868         /*
1869            parent == TC_H_UNSPEC - unspecified parent.
1870            parent == TC_H_ROOT   - class is root, which has no parent.
1871            parent == X:0         - parent is root class.
1872            parent == X:Y         - parent is a node in hierarchy.
1873            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1874
1875            handle == 0:0         - generate handle from kernel pool.
1876            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1877            handle == X:Y         - clear.
1878            handle == X:0         - root class.
1879          */
1880
1881         /* Step 1. Determine qdisc handle X:0 */
1882
1883         portid = tcm->tcm_parent;
1884         clid = tcm->tcm_handle;
1885         qid = TC_H_MAJ(clid);
1886
1887         if (portid != TC_H_ROOT) {
1888                 u32 qid1 = TC_H_MAJ(portid);
1889
1890                 if (qid && qid1) {
1891                         /* If both majors are known, they must be identical. */
1892                         if (qid != qid1)
1893                                 return -EINVAL;
1894                 } else if (qid1) {
1895                         qid = qid1;
1896                 } else if (qid == 0)
1897                         qid = dev->qdisc->handle;
1898
1899                 /* Now qid is genuine qdisc handle consistent
1900                  * both with parent and child.
1901                  *
1902                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1903                  */
1904                 if (portid)
1905                         portid = TC_H_MAKE(qid, portid);
1906         } else {
1907                 if (qid == 0)
1908                         qid = dev->qdisc->handle;
1909         }
1910
1911         /* OK. Locate qdisc */
1912         q = qdisc_lookup(dev, qid);
1913         if (!q)
1914                 return -ENOENT;
1915
1916         /* An check that it supports classes */
1917         cops = q->ops->cl_ops;
1918         if (cops == NULL)
1919                 return -EINVAL;
1920
1921         /* Now try to get class */
1922         if (clid == 0) {
1923                 if (portid == TC_H_ROOT)
1924                         clid = qid;
1925         } else
1926                 clid = TC_H_MAKE(qid, clid);
1927
1928         if (clid)
1929                 cl = cops->find(q, clid);
1930
1931         if (cl == 0) {
1932                 err = -ENOENT;
1933                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1934                     !(n->nlmsg_flags & NLM_F_CREATE))
1935                         goto out;
1936         } else {
1937                 switch (n->nlmsg_type) {
1938                 case RTM_NEWTCLASS:
1939                         err = -EEXIST;
1940                         if (n->nlmsg_flags & NLM_F_EXCL)
1941                                 goto out;
1942                         break;
1943                 case RTM_DELTCLASS:
1944                         err = tclass_del_notify(net, cops, skb, n, q, cl);
1945                         /* Unbind the class with flilters with 0 */
1946                         tc_bind_tclass(q, portid, clid, 0);
1947                         goto out;
1948                 case RTM_GETTCLASS:
1949                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1950                         goto out;
1951                 default:
1952                         err = -EINVAL;
1953                         goto out;
1954                 }
1955         }
1956
1957         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1958                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
1959                 return -EOPNOTSUPP;
1960         }
1961
1962         new_cl = cl;
1963         err = -EOPNOTSUPP;
1964         if (cops->change)
1965                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
1966         if (err == 0) {
1967                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1968                 /* We just create a new class, need to do reverse binding. */
1969                 if (cl != new_cl)
1970                         tc_bind_tclass(q, portid, clid, new_cl);
1971         }
1972 out:
1973         return err;
1974 }
1975
1976 struct qdisc_dump_args {
1977         struct qdisc_walker     w;
1978         struct sk_buff          *skb;
1979         struct netlink_callback *cb;
1980 };
1981
1982 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1983                             struct qdisc_walker *arg)
1984 {
1985         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1986
1987         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1988                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1989                               RTM_NEWTCLASS);
1990 }
1991
1992 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1993                                 struct tcmsg *tcm, struct netlink_callback *cb,
1994                                 int *t_p, int s_t)
1995 {
1996         struct qdisc_dump_args arg;
1997
1998         if (tc_qdisc_dump_ignore(q, false) ||
1999             *t_p < s_t || !q->ops->cl_ops ||
2000             (tcm->tcm_parent &&
2001              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2002                 (*t_p)++;
2003                 return 0;
2004         }
2005         if (*t_p > s_t)
2006                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2007         arg.w.fn = qdisc_class_dump;
2008         arg.skb = skb;
2009         arg.cb = cb;
2010         arg.w.stop  = 0;
2011         arg.w.skip = cb->args[1];
2012         arg.w.count = 0;
2013         q->ops->cl_ops->walk(q, &arg.w);
2014         cb->args[1] = arg.w.count;
2015         if (arg.w.stop)
2016                 return -1;
2017         (*t_p)++;
2018         return 0;
2019 }
2020
2021 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2022                                struct tcmsg *tcm, struct netlink_callback *cb,
2023                                int *t_p, int s_t)
2024 {
2025         struct Qdisc *q;
2026         int b;
2027
2028         if (!root)
2029                 return 0;
2030
2031         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2032                 return -1;
2033
2034         if (!qdisc_dev(root))
2035                 return 0;
2036
2037         if (tcm->tcm_parent) {
2038                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2039                 if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2040                         return -1;
2041                 return 0;
2042         }
2043         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2044                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2045                         return -1;
2046         }
2047
2048         return 0;
2049 }
2050
2051 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2052 {
2053         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2054         struct net *net = sock_net(skb->sk);
2055         struct netdev_queue *dev_queue;
2056         struct net_device *dev;
2057         int t, s_t;
2058
2059         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2060                 return 0;
2061         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2062         if (!dev)
2063                 return 0;
2064
2065         s_t = cb->args[0];
2066         t = 0;
2067
2068         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2069                 goto done;
2070
2071         dev_queue = dev_ingress_queue(dev);
2072         if (dev_queue &&
2073             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2074                                 &t, s_t) < 0)
2075                 goto done;
2076
2077 done:
2078         cb->args[0] = t;
2079
2080         dev_put(dev);
2081         return skb->len;
2082 }
2083
2084 #ifdef CONFIG_PROC_FS
2085 static int psched_show(struct seq_file *seq, void *v)
2086 {
2087         seq_printf(seq, "%08x %08x %08x %08x\n",
2088                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2089                    1000000,
2090                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2091
2092         return 0;
2093 }
2094
2095 static int __net_init psched_net_init(struct net *net)
2096 {
2097         struct proc_dir_entry *e;
2098
2099         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2100         if (e == NULL)
2101                 return -ENOMEM;
2102
2103         return 0;
2104 }
2105
2106 static void __net_exit psched_net_exit(struct net *net)
2107 {
2108         remove_proc_entry("psched", net->proc_net);
2109 }
2110 #else
2111 static int __net_init psched_net_init(struct net *net)
2112 {
2113         return 0;
2114 }
2115
2116 static void __net_exit psched_net_exit(struct net *net)
2117 {
2118 }
2119 #endif
2120
2121 static struct pernet_operations psched_net_ops = {
2122         .init = psched_net_init,
2123         .exit = psched_net_exit,
2124 };
2125
2126 static int __init pktsched_init(void)
2127 {
2128         int err;
2129
2130         err = register_pernet_subsys(&psched_net_ops);
2131         if (err) {
2132                 pr_err("pktsched_init: "
2133                        "cannot initialize per netns operations\n");
2134                 return err;
2135         }
2136
2137         register_qdisc(&pfifo_fast_ops);
2138         register_qdisc(&pfifo_qdisc_ops);
2139         register_qdisc(&bfifo_qdisc_ops);
2140         register_qdisc(&pfifo_head_drop_qdisc_ops);
2141         register_qdisc(&mq_qdisc_ops);
2142         register_qdisc(&noqueue_qdisc_ops);
2143
2144         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2145         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2146         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2147                       0);
2148         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2149         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2150         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2151                       0);
2152
2153         return 0;
2154 }
2155
2156 subsys_initcall(pktsched_init);