Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
[linux-2.6-microblaze.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/slab.h>
31 #include <linux/hashtable.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37 #include <net/pkt_cls.h>
38
39 /*
40
41    Short review.
42    -------------
43
44    This file consists of two interrelated parts:
45
46    1. queueing disciplines manager frontend.
47    2. traffic classes manager frontend.
48
49    Generally, a queueing discipline ("qdisc") is a black box
50    that is able to enqueue packets and to dequeue them (when the
51    device is ready to send something) in the order and at the times
52    determined by the algorithm hidden inside it.
53
54    qdiscs are divided into two categories:
55    - "queues", which have no internal structure visible from outside.
56    - "schedulers", which split all the packets into "traffic classes",
57      using "packet classifiers" (look at cls_api.c)
58
59    In turn, classes may have child qdiscs (as a rule, queues)
60    attached to them, and so on.
61
62    The goal of the routines in this file is to translate the
63    information supplied by the user in the form of handles into a
64    form more intelligible to the kernel, to perform some sanity
65    checks and the part of the work that is common to all qdiscs,
66    and to provide rtnetlink notifications.
67
68    All real intelligent work is done inside qdisc modules.
69
70
71
72    Every discipline has two major routines: enqueue and dequeue.
73
74    ---dequeue
75
76    dequeue usually returns a skb to send. It is allowed to return NULL,
77    but that does not mean that the queue is empty; it just means that
78    the discipline does not want to send anything this time.
79    The queue is really empty only if q->q.qlen == 0.
80    For complicated disciplines with multiple queues q->q is not a
81    real packet queue, but q->q.qlen must nevertheless be valid.
82
83    ---enqueue
84
85    enqueue returns 0 if the packet was enqueued successfully.
86    If a packet (this one or another one) was dropped, it returns
87    a non-zero error code.
88    NET_XMIT_DROP        - this packet was dropped
89      Expected action: do not back off, but wait until the queue clears.
90    NET_XMIT_CN          - this packet was probably enqueued, but another one was dropped.
91      Expected action: back off or ignore
92
93    Auxiliary routines:
94
95    ---peek
96
97    like dequeue but without removing a packet from the queue
98
99    ---reset
100
101    returns qdisc to initial state: purge all buffers, clear all
102    timers, counters (except for statistics) etc.
103
104    ---init
105
106    initializes newly created qdisc.
107
108    ---destroy
109
110    destroys resources allocated by init and during lifetime of qdisc.
111
112    ---change
113
114    changes qdisc parameters.
115  */
116
117 /* Protects the list of registered TC modules. It is a pure SMP lock. */
118 static DEFINE_RWLOCK(qdisc_mod_lock);
119
120
121 /************************************************
122  *      Queueing disciplines manipulation.      *
123  ************************************************/
124
125
126 /* The list of all installed queueing disciplines. */
127
128 static struct Qdisc_ops *qdisc_base;
129
130 /* Register/unregister queueing discipline */
131
132 int register_qdisc(struct Qdisc_ops *qops)
133 {
134         struct Qdisc_ops *q, **qp;
135         int rc = -EEXIST;
136
137         write_lock(&qdisc_mod_lock);
138         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
139                 if (!strcmp(qops->id, q->id))
140                         goto out;
141
142         if (qops->enqueue == NULL)
143                 qops->enqueue = noop_qdisc_ops.enqueue;
144         if (qops->peek == NULL) {
145                 if (qops->dequeue == NULL)
146                         qops->peek = noop_qdisc_ops.peek;
147                 else
148                         goto out_einval;
149         }
150         if (qops->dequeue == NULL)
151                 qops->dequeue = noop_qdisc_ops.dequeue;
152
153         if (qops->cl_ops) {
154                 const struct Qdisc_class_ops *cops = qops->cl_ops;
155
156                 if (!(cops->find && cops->walk && cops->leaf))
157                         goto out_einval;
158
159                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
160                         goto out_einval;
161         }
162
163         qops->next = NULL;
164         *qp = qops;
165         rc = 0;
166 out:
167         write_unlock(&qdisc_mod_lock);
168         return rc;
169
170 out_einval:
171         rc = -EINVAL;
172         goto out;
173 }
174 EXPORT_SYMBOL(register_qdisc);
175
176 int unregister_qdisc(struct Qdisc_ops *qops)
177 {
178         struct Qdisc_ops *q, **qp;
179         int err = -ENOENT;
180
181         write_lock(&qdisc_mod_lock);
182         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
183                 if (q == qops)
184                         break;
185         if (q) {
186                 *qp = q->next;
187                 q->next = NULL;
188                 err = 0;
189         }
190         write_unlock(&qdisc_mod_lock);
191         return err;
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198         read_lock(&qdisc_mod_lock);
199         strlcpy(name, default_qdisc_ops->id, len);
200         read_unlock(&qdisc_mod_lock);
201 }
202
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214
215         return q;
216 }
217
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module("sch_%s", name);
232                 write_lock(&qdisc_mod_lock);
233
234                 ops = qdisc_lookup_default(name);
235         }
236
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243
244         return ops ? 0 : -ENOENT;
245 }
246
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: apply the default qdisc chosen in the kernel config. */
static int __init sch_default_qdisc(void)
{
	int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
255
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
273                 if (q->handle == handle)
274                         return q;
275         }
276         return NULL;
277 }
278
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282                 ASSERT_RTNL();
283                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284                 if (invisible)
285                         q->flags |= TCQ_F_INVISIBLE;
286         }
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293                 ASSERT_RTNL();
294                 hash_del_rcu(&q->hash);
295         }
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301         struct Qdisc *q;
302
303         if (!handle)
304                 return NULL;
305         q = qdisc_match_from_root(dev->qdisc, handle);
306         if (q)
307                 goto out;
308
309         if (dev_ingress_queue(dev))
310                 q = qdisc_match_from_root(
311                         dev_ingress_queue(dev)->qdisc_sleeping,
312                         handle);
313 out:
314         return q;
315 }
316
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319         struct netdev_queue *nq;
320         struct Qdisc *q;
321
322         if (!handle)
323                 return NULL;
324         q = qdisc_match_from_root(dev->qdisc, handle);
325         if (q)
326                 goto out;
327
328         nq = dev_ingress_queue_rcu(dev);
329         if (nq)
330                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331 out:
332         return q;
333 }
334
335 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
336 {
337         unsigned long cl;
338         struct Qdisc *leaf;
339         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
340
341         if (cops == NULL)
342                 return NULL;
343         cl = cops->find(p, classid);
344
345         if (cl == 0)
346                 return NULL;
347         leaf = cops->leaf(p, cl);
348         return leaf;
349 }
350
351 /* Find queueing discipline by name */
352
353 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 {
355         struct Qdisc_ops *q = NULL;
356
357         if (kind) {
358                 read_lock(&qdisc_mod_lock);
359                 for (q = qdisc_base; q; q = q->next) {
360                         if (nla_strcmp(kind, q->id) == 0) {
361                                 if (!try_module_get(q->owner))
362                                         q = NULL;
363                                 break;
364                         }
365                 }
366                 read_unlock(&qdisc_mod_lock);
367         }
368         return q;
369 }
370
371 /* The linklayer setting were not transferred from iproute2, in older
372  * versions, and the rate tables lookup systems have been dropped in
373  * the kernel. To keep backward compatible with older iproute2 tc
374  * utils, we detect the linklayer setting by detecting if the rate
375  * table were modified.
376  *
377  * For linklayer ATM table entries, the rate table will be aligned to
378  * 48 bytes, thus some table entries will contain the same value.  The
379  * mpu (min packet unit) is also encoded into the old rate table, thus
380  * starting from the mpu, we find low and high table entries for
381  * mapping this cell.  If these entries contain the same value, when
382  * the rate tables have been modified for linklayer ATM.
383  *
384  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
385  * and then roundup to the next cell, calc the table entry one below,
386  * and compare.
387  */
388 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 {
390         int low       = roundup(r->mpu, 48);
391         int high      = roundup(low+1, 48);
392         int cell_low  = low >> r->cell_log;
393         int cell_high = (high >> r->cell_log) - 1;
394
395         /* rtab is too inaccurate at rates > 100Mbit/s */
396         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
397                 pr_debug("TC linklayer: Giving up ATM detection\n");
398                 return TC_LINKLAYER_ETHERNET;
399         }
400
401         if ((cell_high > cell_low) && (cell_high < 256)
402             && (rtab[cell_low] == rtab[cell_high])) {
403                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
404                          cell_low, cell_high, rtab[cell_high]);
405                 return TC_LINKLAYER_ATM;
406         }
407         return TC_LINKLAYER_ETHERNET;
408 }
409
410 static struct qdisc_rate_table *qdisc_rtab_list;
411
412 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413                                         struct nlattr *tab,
414                                         struct netlink_ext_ack *extack)
415 {
416         struct qdisc_rate_table *rtab;
417
418         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
419             nla_len(tab) != TC_RTAB_SIZE) {
420                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
421                 return NULL;
422         }
423
424         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
425                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
426                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
427                         rtab->refcnt++;
428                         return rtab;
429                 }
430         }
431
432         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
433         if (rtab) {
434                 rtab->rate = *r;
435                 rtab->refcnt = 1;
436                 memcpy(rtab->data, nla_data(tab), 1024);
437                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
438                         r->linklayer = __detect_linklayer(r, rtab->data);
439                 rtab->next = qdisc_rtab_list;
440                 qdisc_rtab_list = rtab;
441         } else {
442                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
443         }
444         return rtab;
445 }
446 EXPORT_SYMBOL(qdisc_get_rtab);
447
448 void qdisc_put_rtab(struct qdisc_rate_table *tab)
449 {
450         struct qdisc_rate_table *rtab, **rtabp;
451
452         if (!tab || --tab->refcnt)
453                 return;
454
455         for (rtabp = &qdisc_rtab_list;
456              (rtab = *rtabp) != NULL;
457              rtabp = &rtab->next) {
458                 if (rtab == tab) {
459                         *rtabp = rtab->next;
460                         kfree(rtab);
461                         return;
462                 }
463         }
464 }
465 EXPORT_SYMBOL(qdisc_put_rtab);
466
467 static LIST_HEAD(qdisc_stab_list);
468
469 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
470         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
471         [TCA_STAB_DATA] = { .type = NLA_BINARY },
472 };
473
474 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
475                                                struct netlink_ext_ack *extack)
476 {
477         struct nlattr *tb[TCA_STAB_MAX + 1];
478         struct qdisc_size_table *stab;
479         struct tc_sizespec *s;
480         unsigned int tsize = 0;
481         u16 *tab = NULL;
482         int err;
483
484         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
485         if (err < 0)
486                 return ERR_PTR(err);
487         if (!tb[TCA_STAB_BASE]) {
488                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
489                 return ERR_PTR(-EINVAL);
490         }
491
492         s = nla_data(tb[TCA_STAB_BASE]);
493
494         if (s->tsize > 0) {
495                 if (!tb[TCA_STAB_DATA]) {
496                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
497                         return ERR_PTR(-EINVAL);
498                 }
499                 tab = nla_data(tb[TCA_STAB_DATA]);
500                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
501         }
502
503         if (tsize != s->tsize || (!tab && tsize > 0)) {
504                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
505                 return ERR_PTR(-EINVAL);
506         }
507
508         list_for_each_entry(stab, &qdisc_stab_list, list) {
509                 if (memcmp(&stab->szopts, s, sizeof(*s)))
510                         continue;
511                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
512                         continue;
513                 stab->refcnt++;
514                 return stab;
515         }
516
517         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
518         if (!stab)
519                 return ERR_PTR(-ENOMEM);
520
521         stab->refcnt = 1;
522         stab->szopts = *s;
523         if (tsize > 0)
524                 memcpy(stab->data, tab, tsize * sizeof(u16));
525
526         list_add_tail(&stab->list, &qdisc_stab_list);
527
528         return stab;
529 }
530
531 static void stab_kfree_rcu(struct rcu_head *head)
532 {
533         kfree(container_of(head, struct qdisc_size_table, rcu));
534 }
535
536 void qdisc_put_stab(struct qdisc_size_table *tab)
537 {
538         if (!tab)
539                 return;
540
541         if (--tab->refcnt == 0) {
542                 list_del(&tab->list);
543                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
544         }
545 }
546 EXPORT_SYMBOL(qdisc_put_stab);
547
548 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
549 {
550         struct nlattr *nest;
551
552         nest = nla_nest_start(skb, TCA_STAB);
553         if (nest == NULL)
554                 goto nla_put_failure;
555         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
556                 goto nla_put_failure;
557         nla_nest_end(skb, nest);
558
559         return skb->len;
560
561 nla_put_failure:
562         return -1;
563 }
564
565 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
566                                const struct qdisc_size_table *stab)
567 {
568         int pkt_len, slot;
569
570         pkt_len = skb->len + stab->szopts.overhead;
571         if (unlikely(!stab->szopts.tsize))
572                 goto out;
573
574         slot = pkt_len + stab->szopts.cell_align;
575         if (unlikely(slot < 0))
576                 slot = 0;
577
578         slot >>= stab->szopts.cell_log;
579         if (likely(slot < stab->szopts.tsize))
580                 pkt_len = stab->data[slot];
581         else
582                 pkt_len = stab->data[stab->szopts.tsize - 1] *
583                                 (slot / stab->szopts.tsize) +
584                                 stab->data[slot % stab->szopts.tsize];
585
586         pkt_len <<= stab->szopts.size_log;
587 out:
588         if (unlikely(pkt_len < 1))
589                 pkt_len = 1;
590         qdisc_skb_cb(skb)->pkt_len = pkt_len;
591 }
592 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
593
594 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
595 {
596         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
597                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
598                         txt, qdisc->ops->id, qdisc->handle >> 16);
599                 qdisc->flags |= TCQ_F_WARN_NONWC;
600         }
601 }
602 EXPORT_SYMBOL(qdisc_warn_nonwc);
603
604 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
605 {
606         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
607                                                  timer);
608
609         rcu_read_lock();
610         __netif_schedule(qdisc_root(wd->qdisc));
611         rcu_read_unlock();
612
613         return HRTIMER_NORESTART;
614 }
615
616 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
617                                  clockid_t clockid)
618 {
619         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
620         wd->timer.function = qdisc_watchdog;
621         wd->qdisc = qdisc;
622 }
623 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
624
625 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
626 {
627         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
628 }
629 EXPORT_SYMBOL(qdisc_watchdog_init);
630
631 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
632 {
633         if (test_bit(__QDISC_STATE_DEACTIVATED,
634                      &qdisc_root_sleeping(wd->qdisc)->state))
635                 return;
636
637         if (wd->last_expires == expires)
638                 return;
639
640         wd->last_expires = expires;
641         hrtimer_start(&wd->timer,
642                       ns_to_ktime(expires),
643                       HRTIMER_MODE_ABS_PINNED);
644 }
645 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
646
647 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
648 {
649         hrtimer_cancel(&wd->timer);
650 }
651 EXPORT_SYMBOL(qdisc_watchdog_cancel);
652
653 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
654 {
655         struct hlist_head *h;
656         unsigned int i;
657
658         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
659
660         if (h != NULL) {
661                 for (i = 0; i < n; i++)
662                         INIT_HLIST_HEAD(&h[i]);
663         }
664         return h;
665 }
666
667 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
668 {
669         struct Qdisc_class_common *cl;
670         struct hlist_node *next;
671         struct hlist_head *nhash, *ohash;
672         unsigned int nsize, nmask, osize;
673         unsigned int i, h;
674
675         /* Rehash when load factor exceeds 0.75 */
676         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
677                 return;
678         nsize = clhash->hashsize * 2;
679         nmask = nsize - 1;
680         nhash = qdisc_class_hash_alloc(nsize);
681         if (nhash == NULL)
682                 return;
683
684         ohash = clhash->hash;
685         osize = clhash->hashsize;
686
687         sch_tree_lock(sch);
688         for (i = 0; i < osize; i++) {
689                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
690                         h = qdisc_class_hash(cl->classid, nmask);
691                         hlist_add_head(&cl->hnode, &nhash[h]);
692                 }
693         }
694         clhash->hash     = nhash;
695         clhash->hashsize = nsize;
696         clhash->hashmask = nmask;
697         sch_tree_unlock(sch);
698
699         kvfree(ohash);
700 }
701 EXPORT_SYMBOL(qdisc_class_hash_grow);
702
703 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
704 {
705         unsigned int size = 4;
706
707         clhash->hash = qdisc_class_hash_alloc(size);
708         if (!clhash->hash)
709                 return -ENOMEM;
710         clhash->hashsize  = size;
711         clhash->hashmask  = size - 1;
712         clhash->hashelems = 0;
713         return 0;
714 }
715 EXPORT_SYMBOL(qdisc_class_hash_init);
716
717 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
718 {
719         kvfree(clhash->hash);
720 }
721 EXPORT_SYMBOL(qdisc_class_hash_destroy);
722
723 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
724                              struct Qdisc_class_common *cl)
725 {
726         unsigned int h;
727
728         INIT_HLIST_NODE(&cl->hnode);
729         h = qdisc_class_hash(cl->classid, clhash->hashmask);
730         hlist_add_head(&cl->hnode, &clhash->hash[h]);
731         clhash->hashelems++;
732 }
733 EXPORT_SYMBOL(qdisc_class_hash_insert);
734
735 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
736                              struct Qdisc_class_common *cl)
737 {
738         hlist_del(&cl->hnode);
739         clhash->hashelems--;
740 }
741 EXPORT_SYMBOL(qdisc_class_hash_remove);
742
743 /* Allocate an unique handle from space managed by kernel
744  * Possible range is [8000-FFFF]:0000 (0x8000 values)
745  */
746 static u32 qdisc_alloc_handle(struct net_device *dev)
747 {
748         int i = 0x8000;
749         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
750
751         do {
752                 autohandle += TC_H_MAKE(0x10000U, 0);
753                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
754                         autohandle = TC_H_MAKE(0x80000000U, 0);
755                 if (!qdisc_lookup(dev, autohandle))
756                         return autohandle;
757                 cond_resched();
758         } while (--i > 0);
759
760         return 0;
761 }
762
763 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
764                                unsigned int len)
765 {
766         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
767         const struct Qdisc_class_ops *cops;
768         unsigned long cl;
769         u32 parentid;
770         bool notify;
771         int drops;
772
773         if (n == 0 && len == 0)
774                 return;
775         drops = max_t(int, n, 0);
776         rcu_read_lock();
777         while ((parentid = sch->parent)) {
778                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
779                         break;
780
781                 if (sch->flags & TCQ_F_NOPARENT)
782                         break;
783                 /* Notify parent qdisc only if child qdisc becomes empty.
784                  *
785                  * If child was empty even before update then backlog
786                  * counter is screwed and we skip notification because
787                  * parent class is already passive.
788                  *
789                  * If the original child was offloaded then it is allowed
790                  * to be seem as empty, so the parent is notified anyway.
791                  */
792                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
793                                                        !qdisc_is_offloaded);
794                 /* TODO: perform the search on a per txq basis */
795                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
796                 if (sch == NULL) {
797                         WARN_ON_ONCE(parentid != TC_H_ROOT);
798                         break;
799                 }
800                 cops = sch->ops->cl_ops;
801                 if (notify && cops->qlen_notify) {
802                         cl = cops->find(sch, parentid);
803                         cops->qlen_notify(sch, cl);
804                 }
805                 sch->q.qlen -= n;
806                 sch->qstats.backlog -= len;
807                 __qdisc_qstats_drop(sch, drops);
808         }
809         rcu_read_unlock();
810 }
811 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
812
813 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
814                          u32 portid, u32 seq, u16 flags, int event)
815 {
816         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
817         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
818         struct tcmsg *tcm;
819         struct nlmsghdr  *nlh;
820         unsigned char *b = skb_tail_pointer(skb);
821         struct gnet_dump d;
822         struct qdisc_size_table *stab;
823         u32 block_index;
824         __u32 qlen;
825
826         cond_resched();
827         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
828         if (!nlh)
829                 goto out_nlmsg_trim;
830         tcm = nlmsg_data(nlh);
831         tcm->tcm_family = AF_UNSPEC;
832         tcm->tcm__pad1 = 0;
833         tcm->tcm__pad2 = 0;
834         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
835         tcm->tcm_parent = clid;
836         tcm->tcm_handle = q->handle;
837         tcm->tcm_info = refcount_read(&q->refcnt);
838         if (nla_put_string(skb, TCA_KIND, q->ops->id))
839                 goto nla_put_failure;
840         if (q->ops->ingress_block_get) {
841                 block_index = q->ops->ingress_block_get(q);
842                 if (block_index &&
843                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
844                         goto nla_put_failure;
845         }
846         if (q->ops->egress_block_get) {
847                 block_index = q->ops->egress_block_get(q);
848                 if (block_index &&
849                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
850                         goto nla_put_failure;
851         }
852         if (q->ops->dump && q->ops->dump(q, skb) < 0)
853                 goto nla_put_failure;
854         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
855                 goto nla_put_failure;
856         qlen = qdisc_qlen_sum(q);
857
858         stab = rtnl_dereference(q->stab);
859         if (stab && qdisc_dump_stab(skb, stab) < 0)
860                 goto nla_put_failure;
861
862         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
863                                          NULL, &d, TCA_PAD) < 0)
864                 goto nla_put_failure;
865
866         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
867                 goto nla_put_failure;
868
869         if (qdisc_is_percpu_stats(q)) {
870                 cpu_bstats = q->cpu_bstats;
871                 cpu_qstats = q->cpu_qstats;
872         }
873
874         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
875                                   &d, cpu_bstats, &q->bstats) < 0 ||
876             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
877             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
878                 goto nla_put_failure;
879
880         if (gnet_stats_finish_copy(&d) < 0)
881                 goto nla_put_failure;
882
883         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
884         return skb->len;
885
886 out_nlmsg_trim:
887 nla_put_failure:
888         nlmsg_trim(skb, b);
889         return -1;
890 }
891
892 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
893 {
894         if (q->flags & TCQ_F_BUILTIN)
895                 return true;
896         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
897                 return true;
898
899         return false;
900 }
901
902 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
903                         struct nlmsghdr *n, u32 clid,
904                         struct Qdisc *old, struct Qdisc *new)
905 {
906         struct sk_buff *skb;
907         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
908
909         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
910         if (!skb)
911                 return -ENOBUFS;
912
913         if (old && !tc_qdisc_dump_ignore(old, false)) {
914                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
915                                   0, RTM_DELQDISC) < 0)
916                         goto err_out;
917         }
918         if (new && !tc_qdisc_dump_ignore(new, false)) {
919                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
920                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
921                         goto err_out;
922         }
923
924         if (skb->len)
925                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
926                                       n->nlmsg_flags & NLM_F_ECHO);
927
928 err_out:
929         kfree_skb(skb);
930         return -EINVAL;
931 }
932
933 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
934                                struct nlmsghdr *n, u32 clid,
935                                struct Qdisc *old, struct Qdisc *new)
936 {
937         if (new || old)
938                 qdisc_notify(net, skb, n, clid, old, new);
939
940         if (old)
941                 qdisc_put(old);
942 }
943
944 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
945  * to device "dev".
946  *
947  * When appropriate send a netlink notification using 'skb'
948  * and "n".
949  *
950  * On success, destroy old qdisc.
951  */
952
953 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
954                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
955                        struct Qdisc *new, struct Qdisc *old,
956                        struct netlink_ext_ack *extack)
957 {
958         struct Qdisc *q = old;
959         struct net *net = dev_net(dev);
960         int err = 0;
961
962         if (parent == NULL) {
963                 unsigned int i, num_q, ingress;
964
965                 ingress = 0;
966                 num_q = dev->num_tx_queues;
967                 if ((q && q->flags & TCQ_F_INGRESS) ||
968                     (new && new->flags & TCQ_F_INGRESS)) {
969                         num_q = 1;
970                         ingress = 1;
971                         if (!dev_ingress_queue(dev)) {
972                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
973                                 return -ENOENT;
974                         }
975                 }
976
977                 if (dev->flags & IFF_UP)
978                         dev_deactivate(dev);
979
980                 if (new && new->ops->attach)
981                         goto skip;
982
983                 for (i = 0; i < num_q; i++) {
984                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
985
986                         if (!ingress)
987                                 dev_queue = netdev_get_tx_queue(dev, i);
988
989                         old = dev_graft_qdisc(dev_queue, new);
990                         if (new && i > 0)
991                                 qdisc_refcount_inc(new);
992
993                         if (!ingress)
994                                 qdisc_put(old);
995                 }
996
997 skip:
998                 if (!ingress) {
999                         notify_and_destroy(net, skb, n, classid,
1000                                            dev->qdisc, new);
1001                         if (new && !new->ops->attach)
1002                                 qdisc_refcount_inc(new);
1003                         dev->qdisc = new ? : &noop_qdisc;
1004
1005                         if (new && new->ops->attach)
1006                                 new->ops->attach(new);
1007                 } else {
1008                         notify_and_destroy(net, skb, n, classid, old, new);
1009                 }
1010
1011                 if (dev->flags & IFF_UP)
1012                         dev_activate(dev);
1013         } else {
1014                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1015
1016                 /* Only support running class lockless if parent is lockless */
1017                 if (new && (new->flags & TCQ_F_NOLOCK) &&
1018                     parent && !(parent->flags & TCQ_F_NOLOCK))
1019                         new->flags &= ~TCQ_F_NOLOCK;
1020
1021                 err = -EOPNOTSUPP;
1022                 if (cops && cops->graft) {
1023                         unsigned long cl = cops->find(parent, classid);
1024
1025                         if (cl) {
1026                                 err = cops->graft(parent, cl, new, &old,
1027                                                   extack);
1028                         } else {
1029                                 NL_SET_ERR_MSG(extack, "Specified class not found");
1030                                 err = -ENOENT;
1031                         }
1032                 }
1033                 if (!err)
1034                         notify_and_destroy(net, skb, n, classid, old, new);
1035         }
1036         return err;
1037 }
1038
1039 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1040                                    struct netlink_ext_ack *extack)
1041 {
1042         u32 block_index;
1043
1044         if (tca[TCA_INGRESS_BLOCK]) {
1045                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1046
1047                 if (!block_index) {
1048                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1049                         return -EINVAL;
1050                 }
1051                 if (!sch->ops->ingress_block_set) {
1052                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1053                         return -EOPNOTSUPP;
1054                 }
1055                 sch->ops->ingress_block_set(sch, block_index);
1056         }
1057         if (tca[TCA_EGRESS_BLOCK]) {
1058                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1059
1060                 if (!block_index) {
1061                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1062                         return -EINVAL;
1063                 }
1064                 if (!sch->ops->egress_block_set) {
1065                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1066                         return -EOPNOTSUPP;
1067                 }
1068                 sch->ops->egress_block_set(sch, block_index);
1069         }
1070         return 0;
1071 }
1072
1073 /*
1074    Allocate and initialize new qdisc.
1075
1076    Parameters are passed via opt.
1077  */
1078
1079 static struct Qdisc *qdisc_create(struct net_device *dev,
1080                                   struct netdev_queue *dev_queue,
1081                                   struct Qdisc *p, u32 parent, u32 handle,
1082                                   struct nlattr **tca, int *errp,
1083                                   struct netlink_ext_ack *extack)
1084 {
1085         int err;
1086         struct nlattr *kind = tca[TCA_KIND];
1087         struct Qdisc *sch;
1088         struct Qdisc_ops *ops;
1089         struct qdisc_size_table *stab;
1090
1091         ops = qdisc_lookup_ops(kind);
1092 #ifdef CONFIG_MODULES
1093         if (ops == NULL && kind != NULL) {
1094                 char name[IFNAMSIZ];
1095                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1096                         /* We dropped the RTNL semaphore in order to
1097                          * perform the module load.  So, even if we
1098                          * succeeded in loading the module we have to
1099                          * tell the caller to replay the request.  We
1100                          * indicate this using -EAGAIN.
1101                          * We replay the request because the device may
1102                          * go away in the mean time.
1103                          */
1104                         rtnl_unlock();
1105                         request_module("sch_%s", name);
1106                         rtnl_lock();
1107                         ops = qdisc_lookup_ops(kind);
1108                         if (ops != NULL) {
1109                                 /* We will try again qdisc_lookup_ops,
1110                                  * so don't keep a reference.
1111                                  */
1112                                 module_put(ops->owner);
1113                                 err = -EAGAIN;
1114                                 goto err_out;
1115                         }
1116                 }
1117         }
1118 #endif
1119
1120         err = -ENOENT;
1121         if (!ops) {
1122                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1123                 goto err_out;
1124         }
1125
1126         sch = qdisc_alloc(dev_queue, ops, extack);
1127         if (IS_ERR(sch)) {
1128                 err = PTR_ERR(sch);
1129                 goto err_out2;
1130         }
1131
1132         sch->parent = parent;
1133
1134         if (handle == TC_H_INGRESS) {
1135                 sch->flags |= TCQ_F_INGRESS;
1136                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1137         } else {
1138                 if (handle == 0) {
1139                         handle = qdisc_alloc_handle(dev);
1140                         err = -ENOMEM;
1141                         if (handle == 0)
1142                                 goto err_out3;
1143                 }
1144                 if (!netif_is_multiqueue(dev))
1145                         sch->flags |= TCQ_F_ONETXQUEUE;
1146         }
1147
1148         sch->handle = handle;
1149
1150         /* This exist to keep backward compatible with a userspace
1151          * loophole, what allowed userspace to get IFF_NO_QUEUE
1152          * facility on older kernels by setting tx_queue_len=0 (prior
1153          * to qdisc init), and then forgot to reinit tx_queue_len
1154          * before again attaching a qdisc.
1155          */
1156         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1157                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1158                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1159         }
1160
1161         err = qdisc_block_indexes_set(sch, tca, extack);
1162         if (err)
1163                 goto err_out3;
1164
1165         if (ops->init) {
1166                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1167                 if (err != 0)
1168                         goto err_out5;
1169         }
1170
1171         if (tca[TCA_STAB]) {
1172                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1173                 if (IS_ERR(stab)) {
1174                         err = PTR_ERR(stab);
1175                         goto err_out4;
1176                 }
1177                 rcu_assign_pointer(sch->stab, stab);
1178         }
1179         if (tca[TCA_RATE]) {
1180                 seqcount_t *running;
1181
1182                 err = -EOPNOTSUPP;
1183                 if (sch->flags & TCQ_F_MQROOT) {
1184                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1185                         goto err_out4;
1186                 }
1187
1188                 if (sch->parent != TC_H_ROOT &&
1189                     !(sch->flags & TCQ_F_INGRESS) &&
1190                     (!p || !(p->flags & TCQ_F_MQROOT)))
1191                         running = qdisc_root_sleeping_running(sch);
1192                 else
1193                         running = &sch->running;
1194
1195                 err = gen_new_estimator(&sch->bstats,
1196                                         sch->cpu_bstats,
1197                                         &sch->rate_est,
1198                                         NULL,
1199                                         running,
1200                                         tca[TCA_RATE]);
1201                 if (err) {
1202                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1203                         goto err_out4;
1204                 }
1205         }
1206
1207         qdisc_hash_add(sch, false);
1208
1209         return sch;
1210
1211 err_out5:
1212         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1213         if (ops->destroy)
1214                 ops->destroy(sch);
1215 err_out3:
1216         dev_put(dev);
1217         qdisc_free(sch);
1218 err_out2:
1219         module_put(ops->owner);
1220 err_out:
1221         *errp = err;
1222         return NULL;
1223
1224 err_out4:
1225         /*
1226          * Any broken qdiscs that would require a ops->reset() here?
1227          * The qdisc was never in action so it shouldn't be necessary.
1228          */
1229         qdisc_put_stab(rtnl_dereference(sch->stab));
1230         if (ops->destroy)
1231                 ops->destroy(sch);
1232         goto err_out3;
1233 }
1234
1235 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1236                         struct netlink_ext_ack *extack)
1237 {
1238         struct qdisc_size_table *ostab, *stab = NULL;
1239         int err = 0;
1240
1241         if (tca[TCA_OPTIONS]) {
1242                 if (!sch->ops->change) {
1243                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1244                         return -EINVAL;
1245                 }
1246                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1247                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1248                         return -EOPNOTSUPP;
1249                 }
1250                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1251                 if (err)
1252                         return err;
1253         }
1254
1255         if (tca[TCA_STAB]) {
1256                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1257                 if (IS_ERR(stab))
1258                         return PTR_ERR(stab);
1259         }
1260
1261         ostab = rtnl_dereference(sch->stab);
1262         rcu_assign_pointer(sch->stab, stab);
1263         qdisc_put_stab(ostab);
1264
1265         if (tca[TCA_RATE]) {
1266                 /* NB: ignores errors from replace_estimator
1267                    because change can't be undone. */
1268                 if (sch->flags & TCQ_F_MQROOT)
1269                         goto out;
1270                 gen_replace_estimator(&sch->bstats,
1271                                       sch->cpu_bstats,
1272                                       &sch->rate_est,
1273                                       NULL,
1274                                       qdisc_root_sleeping_running(sch),
1275                                       tca[TCA_RATE]);
1276         }
1277 out:
1278         return 0;
1279 }
1280
1281 struct check_loop_arg {
1282         struct qdisc_walker     w;
1283         struct Qdisc            *p;
1284         int                     depth;
1285 };
1286
1287 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1288                          struct qdisc_walker *w);
1289
1290 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1291 {
1292         struct check_loop_arg   arg;
1293
1294         if (q->ops->cl_ops == NULL)
1295                 return 0;
1296
1297         arg.w.stop = arg.w.skip = arg.w.count = 0;
1298         arg.w.fn = check_loop_fn;
1299         arg.depth = depth;
1300         arg.p = p;
1301         q->ops->cl_ops->walk(q, &arg.w);
1302         return arg.w.stop ? -ELOOP : 0;
1303 }
1304
1305 static int
1306 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1307 {
1308         struct Qdisc *leaf;
1309         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1310         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1311
1312         leaf = cops->leaf(q, cl);
1313         if (leaf) {
1314                 if (leaf == arg->p || arg->depth > 7)
1315                         return -ELOOP;
1316                 return check_loop(leaf, arg->p, arg->depth + 1);
1317         }
1318         return 0;
1319 }
1320
1321 /*
1322  * Delete/get qdisc.
1323  */
1324
1325 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1326                         struct netlink_ext_ack *extack)
1327 {
1328         struct net *net = sock_net(skb->sk);
1329         struct tcmsg *tcm = nlmsg_data(n);
1330         struct nlattr *tca[TCA_MAX + 1];
1331         struct net_device *dev;
1332         u32 clid;
1333         struct Qdisc *q = NULL;
1334         struct Qdisc *p = NULL;
1335         int err;
1336
1337         if ((n->nlmsg_type != RTM_GETQDISC) &&
1338             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1339                 return -EPERM;
1340
1341         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1342         if (err < 0)
1343                 return err;
1344
1345         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1346         if (!dev)
1347                 return -ENODEV;
1348
1349         clid = tcm->tcm_parent;
1350         if (clid) {
1351                 if (clid != TC_H_ROOT) {
1352                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1353                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1354                                 if (!p) {
1355                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1356                                         return -ENOENT;
1357                                 }
1358                                 q = qdisc_leaf(p, clid);
1359                         } else if (dev_ingress_queue(dev)) {
1360                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1361                         }
1362                 } else {
1363                         q = dev->qdisc;
1364                 }
1365                 if (!q) {
1366                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1367                         return -ENOENT;
1368                 }
1369
1370                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1371                         NL_SET_ERR_MSG(extack, "Invalid handle");
1372                         return -EINVAL;
1373                 }
1374         } else {
1375                 q = qdisc_lookup(dev, tcm->tcm_handle);
1376                 if (!q) {
1377                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1378                         return -ENOENT;
1379                 }
1380         }
1381
1382         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1383                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1384                 return -EINVAL;
1385         }
1386
1387         if (n->nlmsg_type == RTM_DELQDISC) {
1388                 if (!clid) {
1389                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1390                         return -EINVAL;
1391                 }
1392                 if (q->handle == 0) {
1393                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1394                         return -ENOENT;
1395                 }
1396                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1397                 if (err != 0)
1398                         return err;
1399         } else {
1400                 qdisc_notify(net, skb, n, clid, NULL, q);
1401         }
1402         return 0;
1403 }
1404
1405 /*
1406  * Create/change qdisc.
1407  */
1408
1409 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1410                            struct netlink_ext_ack *extack)
1411 {
1412         struct net *net = sock_net(skb->sk);
1413         struct tcmsg *tcm;
1414         struct nlattr *tca[TCA_MAX + 1];
1415         struct net_device *dev;
1416         u32 clid;
1417         struct Qdisc *q, *p;
1418         int err;
1419
1420         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1421                 return -EPERM;
1422
1423 replay:
1424         /* Reinit, just in case something touches this. */
1425         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1426         if (err < 0)
1427                 return err;
1428
1429         tcm = nlmsg_data(n);
1430         clid = tcm->tcm_parent;
1431         q = p = NULL;
1432
1433         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1434         if (!dev)
1435                 return -ENODEV;
1436
1437
1438         if (clid) {
1439                 if (clid != TC_H_ROOT) {
1440                         if (clid != TC_H_INGRESS) {
1441                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1442                                 if (!p) {
1443                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1444                                         return -ENOENT;
1445                                 }
1446                                 q = qdisc_leaf(p, clid);
1447                         } else if (dev_ingress_queue_create(dev)) {
1448                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1449                         }
1450                 } else {
1451                         q = dev->qdisc;
1452                 }
1453
1454                 /* It may be default qdisc, ignore it */
1455                 if (q && q->handle == 0)
1456                         q = NULL;
1457
1458                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1459                         if (tcm->tcm_handle) {
1460                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1461                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1462                                         return -EEXIST;
1463                                 }
1464                                 if (TC_H_MIN(tcm->tcm_handle)) {
1465                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1466                                         return -EINVAL;
1467                                 }
1468                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1469                                 if (!q)
1470                                         goto create_n_graft;
1471                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1472                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1473                                         return -EEXIST;
1474                                 }
1475                                 if (tca[TCA_KIND] &&
1476                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1477                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1478                                         return -EINVAL;
1479                                 }
1480                                 if (q == p ||
1481                                     (p && check_loop(q, p, 0))) {
1482                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1483                                         return -ELOOP;
1484                                 }
1485                                 qdisc_refcount_inc(q);
1486                                 goto graft;
1487                         } else {
1488                                 if (!q)
1489                                         goto create_n_graft;
1490
1491                                 /* This magic test requires explanation.
1492                                  *
1493                                  *   We know, that some child q is already
1494                                  *   attached to this parent and have choice:
1495                                  *   either to change it or to create/graft new one.
1496                                  *
1497                                  *   1. We are allowed to create/graft only
1498                                  *   if CREATE and REPLACE flags are set.
1499                                  *
1500                                  *   2. If EXCL is set, requestor wanted to say,
1501                                  *   that qdisc tcm_handle is not expected
1502                                  *   to exist, so that we choose create/graft too.
1503                                  *
1504                                  *   3. The last case is when no flags are set.
1505                                  *   Alas, it is sort of hole in API, we
1506                                  *   cannot decide what to do unambiguously.
1507                                  *   For now we select create/graft, if
1508                                  *   user gave KIND, which does not match existing.
1509                                  */
1510                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1511                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1512                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1513                                      (tca[TCA_KIND] &&
1514                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1515                                         goto create_n_graft;
1516                         }
1517                 }
1518         } else {
1519                 if (!tcm->tcm_handle) {
1520                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1521                         return -EINVAL;
1522                 }
1523                 q = qdisc_lookup(dev, tcm->tcm_handle);
1524         }
1525
1526         /* Change qdisc parameters */
1527         if (!q) {
1528                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1529                 return -ENOENT;
1530         }
1531         if (n->nlmsg_flags & NLM_F_EXCL) {
1532                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1533                 return -EEXIST;
1534         }
1535         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1536                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1537                 return -EINVAL;
1538         }
1539         err = qdisc_change(q, tca, extack);
1540         if (err == 0)
1541                 qdisc_notify(net, skb, n, clid, NULL, q);
1542         return err;
1543
1544 create_n_graft:
1545         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1546                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1547                 return -ENOENT;
1548         }
1549         if (clid == TC_H_INGRESS) {
1550                 if (dev_ingress_queue(dev)) {
1551                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1552                                          tcm->tcm_parent, tcm->tcm_parent,
1553                                          tca, &err, extack);
1554                 } else {
1555                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1556                         err = -ENOENT;
1557                 }
1558         } else {
1559                 struct netdev_queue *dev_queue;
1560
1561                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1562                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1563                 else if (p)
1564                         dev_queue = p->dev_queue;
1565                 else
1566                         dev_queue = netdev_get_tx_queue(dev, 0);
1567
1568                 q = qdisc_create(dev, dev_queue, p,
1569                                  tcm->tcm_parent, tcm->tcm_handle,
1570                                  tca, &err, extack);
1571         }
1572         if (q == NULL) {
1573                 if (err == -EAGAIN)
1574                         goto replay;
1575                 return err;
1576         }
1577
1578 graft:
1579         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1580         if (err) {
1581                 if (q)
1582                         qdisc_put(q);
1583                 return err;
1584         }
1585
1586         return 0;
1587 }
1588
1589 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1590                               struct netlink_callback *cb,
1591                               int *q_idx_p, int s_q_idx, bool recur,
1592                               bool dump_invisible)
1593 {
1594         int ret = 0, q_idx = *q_idx_p;
1595         struct Qdisc *q;
1596         int b;
1597
1598         if (!root)
1599                 return 0;
1600
1601         q = root;
1602         if (q_idx < s_q_idx) {
1603                 q_idx++;
1604         } else {
1605                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1606                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1607                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1608                                   RTM_NEWQDISC) <= 0)
1609                         goto done;
1610                 q_idx++;
1611         }
1612
1613         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1614          * itself has already been dumped.
1615          *
1616          * If we've already dumped the top-level (ingress) qdisc above and the global
1617          * qdisc hashtable, we don't want to hit it again
1618          */
1619         if (!qdisc_dev(root) || !recur)
1620                 goto out;
1621
1622         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1623                 if (q_idx < s_q_idx) {
1624                         q_idx++;
1625                         continue;
1626                 }
1627                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1628                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1629                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1630                                   RTM_NEWQDISC) <= 0)
1631                         goto done;
1632                 q_idx++;
1633         }
1634
1635 out:
1636         *q_idx_p = q_idx;
1637         return ret;
1638 done:
1639         ret = -1;
1640         goto out;
1641 }
1642
1643 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1644 {
1645         struct net *net = sock_net(skb->sk);
1646         int idx, q_idx;
1647         int s_idx, s_q_idx;
1648         struct net_device *dev;
1649         const struct nlmsghdr *nlh = cb->nlh;
1650         struct nlattr *tca[TCA_MAX + 1];
1651         int err;
1652
1653         s_idx = cb->args[0];
1654         s_q_idx = q_idx = cb->args[1];
1655
1656         idx = 0;
1657         ASSERT_RTNL();
1658
1659         err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
1660         if (err < 0)
1661                 return err;
1662
1663         for_each_netdev(net, dev) {
1664                 struct netdev_queue *dev_queue;
1665
1666                 if (idx < s_idx)
1667                         goto cont;
1668                 if (idx > s_idx)
1669                         s_q_idx = 0;
1670                 q_idx = 0;
1671
1672                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1673                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1674                         goto done;
1675
1676                 dev_queue = dev_ingress_queue(dev);
1677                 if (dev_queue &&
1678                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1679                                        &q_idx, s_q_idx, false,
1680                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1681                         goto done;
1682
1683 cont:
1684                 idx++;
1685         }
1686
1687 done:
1688         cb->args[0] = idx;
1689         cb->args[1] = q_idx;
1690
1691         return skb->len;
1692 }
1693
1694
1695
1696 /************************************************
1697  *      Traffic classes manipulation.           *
1698  ************************************************/
1699
1700 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1701                           unsigned long cl,
1702                           u32 portid, u32 seq, u16 flags, int event)
1703 {
1704         struct tcmsg *tcm;
1705         struct nlmsghdr  *nlh;
1706         unsigned char *b = skb_tail_pointer(skb);
1707         struct gnet_dump d;
1708         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1709
1710         cond_resched();
1711         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1712         if (!nlh)
1713                 goto out_nlmsg_trim;
1714         tcm = nlmsg_data(nlh);
1715         tcm->tcm_family = AF_UNSPEC;
1716         tcm->tcm__pad1 = 0;
1717         tcm->tcm__pad2 = 0;
1718         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1719         tcm->tcm_parent = q->handle;
1720         tcm->tcm_handle = q->handle;
1721         tcm->tcm_info = 0;
1722         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1723                 goto nla_put_failure;
1724         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1725                 goto nla_put_failure;
1726
1727         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1728                                          NULL, &d, TCA_PAD) < 0)
1729                 goto nla_put_failure;
1730
1731         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1732                 goto nla_put_failure;
1733
1734         if (gnet_stats_finish_copy(&d) < 0)
1735                 goto nla_put_failure;
1736
1737         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1738         return skb->len;
1739
1740 out_nlmsg_trim:
1741 nla_put_failure:
1742         nlmsg_trim(skb, b);
1743         return -1;
1744 }
1745
1746 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1747                          struct nlmsghdr *n, struct Qdisc *q,
1748                          unsigned long cl, int event)
1749 {
1750         struct sk_buff *skb;
1751         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1752
1753         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1754         if (!skb)
1755                 return -ENOBUFS;
1756
1757         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1758                 kfree_skb(skb);
1759                 return -EINVAL;
1760         }
1761
1762         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1763                               n->nlmsg_flags & NLM_F_ECHO);
1764 }
1765
1766 static int tclass_del_notify(struct net *net,
1767                              const struct Qdisc_class_ops *cops,
1768                              struct sk_buff *oskb, struct nlmsghdr *n,
1769                              struct Qdisc *q, unsigned long cl)
1770 {
1771         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1772         struct sk_buff *skb;
1773         int err = 0;
1774
1775         if (!cops->delete)
1776                 return -EOPNOTSUPP;
1777
1778         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1779         if (!skb)
1780                 return -ENOBUFS;
1781
1782         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1783                            RTM_DELTCLASS) < 0) {
1784                 kfree_skb(skb);
1785                 return -EINVAL;
1786         }
1787
1788         err = cops->delete(q, cl);
1789         if (err) {
1790                 kfree_skb(skb);
1791                 return err;
1792         }
1793
1794         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1795                               n->nlmsg_flags & NLM_F_ECHO);
1796 }
1797
1798 #ifdef CONFIG_NET_CLS
1799
1800 struct tcf_bind_args {
1801         struct tcf_walker w;
1802         u32 classid;
1803         unsigned long cl;
1804 };
1805
1806 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1807 {
1808         struct tcf_bind_args *a = (void *)arg;
1809
1810         if (tp->ops->bind_class) {
1811                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1812
1813                 sch_tree_lock(q);
1814                 tp->ops->bind_class(n, a->classid, a->cl);
1815                 sch_tree_unlock(q);
1816         }
1817         return 0;
1818 }
1819
1820 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1821                            unsigned long new_cl)
1822 {
1823         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1824         struct tcf_block *block;
1825         struct tcf_chain *chain;
1826         unsigned long cl;
1827
1828         cl = cops->find(q, portid);
1829         if (!cl)
1830                 return;
1831         block = cops->tcf_block(q, cl, NULL);
1832         if (!block)
1833                 return;
1834         list_for_each_entry(chain, &block->chain_list, list) {
1835                 struct tcf_proto *tp;
1836
1837                 for (tp = rtnl_dereference(chain->filter_chain);
1838                      tp; tp = rtnl_dereference(tp->next)) {
1839                         struct tcf_bind_args arg = {};
1840
1841                         arg.w.fn = tcf_node_bind;
1842                         arg.classid = clid;
1843                         arg.cl = new_cl;
1844                         tp->ops->walk(tp, &arg.w);
1845                 }
1846         }
1847 }
1848
1849 #else
1850
1851 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1852                            unsigned long new_cl)
1853 {
1854 }
1855
1856 #endif
1857
1858 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1859                          struct netlink_ext_ack *extack)
1860 {
1861         struct net *net = sock_net(skb->sk);
1862         struct tcmsg *tcm = nlmsg_data(n);
1863         struct nlattr *tca[TCA_MAX + 1];
1864         struct net_device *dev;
1865         struct Qdisc *q = NULL;
1866         const struct Qdisc_class_ops *cops;
1867         unsigned long cl = 0;
1868         unsigned long new_cl;
1869         u32 portid;
1870         u32 clid;
1871         u32 qid;
1872         int err;
1873
1874         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1875             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1876                 return -EPERM;
1877
1878         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1879         if (err < 0)
1880                 return err;
1881
1882         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1883         if (!dev)
1884                 return -ENODEV;
1885
1886         /*
1887            parent == TC_H_UNSPEC - unspecified parent.
1888            parent == TC_H_ROOT   - class is root, which has no parent.
1889            parent == X:0         - parent is root class.
1890            parent == X:Y         - parent is a node in hierarchy.
1891            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1892
1893            handle == 0:0         - generate handle from kernel pool.
1894            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1895            handle == X:Y         - clear.
1896            handle == X:0         - root class.
1897          */
1898
1899         /* Step 1. Determine qdisc handle X:0 */
1900
1901         portid = tcm->tcm_parent;
1902         clid = tcm->tcm_handle;
1903         qid = TC_H_MAJ(clid);
1904
1905         if (portid != TC_H_ROOT) {
1906                 u32 qid1 = TC_H_MAJ(portid);
1907
1908                 if (qid && qid1) {
1909                         /* If both majors are known, they must be identical. */
1910                         if (qid != qid1)
1911                                 return -EINVAL;
1912                 } else if (qid1) {
1913                         qid = qid1;
1914                 } else if (qid == 0)
1915                         qid = dev->qdisc->handle;
1916
1917                 /* Now qid is genuine qdisc handle consistent
1918                  * both with parent and child.
1919                  *
1920                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1921                  */
1922                 if (portid)
1923                         portid = TC_H_MAKE(qid, portid);
1924         } else {
1925                 if (qid == 0)
1926                         qid = dev->qdisc->handle;
1927         }
1928
1929         /* OK. Locate qdisc */
1930         q = qdisc_lookup(dev, qid);
1931         if (!q)
1932                 return -ENOENT;
1933
1934         /* An check that it supports classes */
1935         cops = q->ops->cl_ops;
1936         if (cops == NULL)
1937                 return -EINVAL;
1938
1939         /* Now try to get class */
1940         if (clid == 0) {
1941                 if (portid == TC_H_ROOT)
1942                         clid = qid;
1943         } else
1944                 clid = TC_H_MAKE(qid, clid);
1945
1946         if (clid)
1947                 cl = cops->find(q, clid);
1948
1949         if (cl == 0) {
1950                 err = -ENOENT;
1951                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1952                     !(n->nlmsg_flags & NLM_F_CREATE))
1953                         goto out;
1954         } else {
1955                 switch (n->nlmsg_type) {
1956                 case RTM_NEWTCLASS:
1957                         err = -EEXIST;
1958                         if (n->nlmsg_flags & NLM_F_EXCL)
1959                                 goto out;
1960                         break;
1961                 case RTM_DELTCLASS:
1962                         err = tclass_del_notify(net, cops, skb, n, q, cl);
1963                         /* Unbind the class with flilters with 0 */
1964                         tc_bind_tclass(q, portid, clid, 0);
1965                         goto out;
1966                 case RTM_GETTCLASS:
1967                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1968                         goto out;
1969                 default:
1970                         err = -EINVAL;
1971                         goto out;
1972                 }
1973         }
1974
1975         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1976                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
1977                 return -EOPNOTSUPP;
1978         }
1979
1980         new_cl = cl;
1981         err = -EOPNOTSUPP;
1982         if (cops->change)
1983                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
1984         if (err == 0) {
1985                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1986                 /* We just create a new class, need to do reverse binding. */
1987                 if (cl != new_cl)
1988                         tc_bind_tclass(q, portid, clid, new_cl);
1989         }
1990 out:
1991         return err;
1992 }
1993
1994 struct qdisc_dump_args {
1995         struct qdisc_walker     w;
1996         struct sk_buff          *skb;
1997         struct netlink_callback *cb;
1998 };
1999
2000 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2001                             struct qdisc_walker *arg)
2002 {
2003         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2004
2005         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2006                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2007                               RTM_NEWTCLASS);
2008 }
2009
2010 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2011                                 struct tcmsg *tcm, struct netlink_callback *cb,
2012                                 int *t_p, int s_t)
2013 {
2014         struct qdisc_dump_args arg;
2015
2016         if (tc_qdisc_dump_ignore(q, false) ||
2017             *t_p < s_t || !q->ops->cl_ops ||
2018             (tcm->tcm_parent &&
2019              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2020                 (*t_p)++;
2021                 return 0;
2022         }
2023         if (*t_p > s_t)
2024                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2025         arg.w.fn = qdisc_class_dump;
2026         arg.skb = skb;
2027         arg.cb = cb;
2028         arg.w.stop  = 0;
2029         arg.w.skip = cb->args[1];
2030         arg.w.count = 0;
2031         q->ops->cl_ops->walk(q, &arg.w);
2032         cb->args[1] = arg.w.count;
2033         if (arg.w.stop)
2034                 return -1;
2035         (*t_p)++;
2036         return 0;
2037 }
2038
2039 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2040                                struct tcmsg *tcm, struct netlink_callback *cb,
2041                                int *t_p, int s_t)
2042 {
2043         struct Qdisc *q;
2044         int b;
2045
2046         if (!root)
2047                 return 0;
2048
2049         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2050                 return -1;
2051
2052         if (!qdisc_dev(root))
2053                 return 0;
2054
2055         if (tcm->tcm_parent) {
2056                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2057                 if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2058                         return -1;
2059                 return 0;
2060         }
2061         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2062                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2063                         return -1;
2064         }
2065
2066         return 0;
2067 }
2068
2069 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2070 {
2071         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2072         struct net *net = sock_net(skb->sk);
2073         struct netdev_queue *dev_queue;
2074         struct net_device *dev;
2075         int t, s_t;
2076
2077         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2078                 return 0;
2079         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2080         if (!dev)
2081                 return 0;
2082
2083         s_t = cb->args[0];
2084         t = 0;
2085
2086         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2087                 goto done;
2088
2089         dev_queue = dev_ingress_queue(dev);
2090         if (dev_queue &&
2091             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2092                                 &t, s_t) < 0)
2093                 goto done;
2094
2095 done:
2096         cb->args[0] = t;
2097
2098         dev_put(dev);
2099         return skb->len;
2100 }
2101
2102 #ifdef CONFIG_PROC_FS
2103 static int psched_show(struct seq_file *seq, void *v)
2104 {
2105         seq_printf(seq, "%08x %08x %08x %08x\n",
2106                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2107                    1000000,
2108                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2109
2110         return 0;
2111 }
2112
2113 static int __net_init psched_net_init(struct net *net)
2114 {
2115         struct proc_dir_entry *e;
2116
2117         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2118         if (e == NULL)
2119                 return -ENOMEM;
2120
2121         return 0;
2122 }
2123
2124 static void __net_exit psched_net_exit(struct net *net)
2125 {
2126         remove_proc_entry("psched", net->proc_net);
2127 }
2128 #else
2129 static int __net_init psched_net_init(struct net *net)
2130 {
2131         return 0;
2132 }
2133
2134 static void __net_exit psched_net_exit(struct net *net)
2135 {
2136 }
2137 #endif
2138
2139 static struct pernet_operations psched_net_ops = {
2140         .init = psched_net_init,
2141         .exit = psched_net_exit,
2142 };
2143
2144 static int __init pktsched_init(void)
2145 {
2146         int err;
2147
2148         err = register_pernet_subsys(&psched_net_ops);
2149         if (err) {
2150                 pr_err("pktsched_init: "
2151                        "cannot initialize per netns operations\n");
2152                 return err;
2153         }
2154
2155         register_qdisc(&pfifo_fast_ops);
2156         register_qdisc(&pfifo_qdisc_ops);
2157         register_qdisc(&bfifo_qdisc_ops);
2158         register_qdisc(&pfifo_head_drop_qdisc_ops);
2159         register_qdisc(&mq_qdisc_ops);
2160         register_qdisc(&noqueue_qdisc_ops);
2161
2162         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2163         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2164         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2165                       0);
2166         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2167         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2168         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2169                       0);
2170
2171         return 0;
2172 }
2173
2174 subsys_initcall(pktsched_init);