11ebba60da3b4c865f902743e651bbc3fabc85e3
[linux-2.6-microblaze.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
52    Qdiscs are divided into two categories:
53    - "queues", which have no internal structure visible from outside.
54    - "schedulers", which split all the packets to "traffic classes",
55      using "packet classifiers" (look at cls_api.c)
56
57    In turn, classes may have child qdiscs (as rule, queues)
58    attached to them etc. etc. etc.
59
60    The goal of the routines in this file is to translate the
61    information supplied by the user in the form of handles
62    into a form more intelligible to the kernel, to perform sanity
63    checks and the part of the work which is common to all qdiscs,
64    and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
74    dequeue usually returns a skb to send. It is allowed to return NULL,
75    but it does not mean that queue is empty, it just means that
76    discipline does not want to send anything this time.
77    Queue is really empty if q->q.qlen == 0.
78    For complicated disciplines with multiple queues q->q is not
79    real packet queue, but however q->q.qlen must be valid.
80
81    ---enqueue
82
83    enqueue returns 0, if packet was enqueued successfully.
84    If packet (this one or another one) was dropped, it returns
85    not zero error code.
86    NET_XMIT_DROP        - this packet dropped
87      Expected action: do not backoff, but wait until queue will clear.
88    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
89      Expected action: backoff or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
115 /* Protects list of registered TC modules. It is pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122
123
124 /* The list of all installed queueing disciplines. */
125
126 static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
174 int unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189         return err;
190 }
191 EXPORT_SYMBOL(unregister_qdisc);
192
193 /* Get default qdisc if not otherwise specified */
194 void qdisc_get_default(char *name, size_t len)
195 {
196         read_lock(&qdisc_mod_lock);
197         strlcpy(name, default_qdisc_ops->id, len);
198         read_unlock(&qdisc_mod_lock);
199 }
200
201 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
202 {
203         struct Qdisc_ops *q = NULL;
204
205         for (q = qdisc_base; q; q = q->next) {
206                 if (!strcmp(name, q->id)) {
207                         if (!try_module_get(q->owner))
208                                 q = NULL;
209                         break;
210                 }
211         }
212
213         return q;
214 }
215
216 /* Set new default qdisc to use */
217 int qdisc_set_default(const char *name)
218 {
219         const struct Qdisc_ops *ops;
220
221         if (!capable(CAP_NET_ADMIN))
222                 return -EPERM;
223
224         write_lock(&qdisc_mod_lock);
225         ops = qdisc_lookup_default(name);
226         if (!ops) {
227                 /* Not found, drop lock and try to load module */
228                 write_unlock(&qdisc_mod_lock);
229                 request_module("sch_%s", name);
230                 write_lock(&qdisc_mod_lock);
231
232                 ops = qdisc_lookup_default(name);
233         }
234
235         if (ops) {
236                 /* Set new default */
237                 module_put(default_qdisc_ops->owner);
238                 default_qdisc_ops = ops;
239         }
240         write_unlock(&qdisc_mod_lock);
241
242         return ops ? 0 : -ENOENT;
243 }
244
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the build-time configured default discipline.
 * Returns whatever qdisc_set_default() returns.
 */
static int __init sch_default_qdisc(void)
{
	const char *configured = CONFIG_DEFAULT_NET_SCH;

	return qdisc_set_default(configured);
}
late_initcall(sch_default_qdisc);
#endif
253
254 /* We know handle. Find qdisc among all qdisc's attached to device
255  * (root qdisc, all its children, children of children etc.)
256  * Note: caller either uses rtnl or rcu_read_lock()
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!qdisc_dev(root))
264                 return (root->handle == handle ? root : NULL);
265
266         if (!(root->flags & TCQ_F_BUILTIN) &&
267             root->handle == handle)
268                 return root;
269
270         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
271                 if (q->handle == handle)
272                         return q;
273         }
274         return NULL;
275 }
276
277 void qdisc_hash_add(struct Qdisc *q, bool invisible)
278 {
279         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
280                 ASSERT_RTNL();
281                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
282                 if (invisible)
283                         q->flags |= TCQ_F_INVISIBLE;
284         }
285 }
286 EXPORT_SYMBOL(qdisc_hash_add);
287
288 void qdisc_hash_del(struct Qdisc *q)
289 {
290         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
291                 ASSERT_RTNL();
292                 hash_del_rcu(&q->hash);
293         }
294 }
295 EXPORT_SYMBOL(qdisc_hash_del);
296
297 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
298 {
299         struct Qdisc *q;
300
301         if (!handle)
302                 return NULL;
303         q = qdisc_match_from_root(dev->qdisc, handle);
304         if (q)
305                 goto out;
306
307         if (dev_ingress_queue(dev))
308                 q = qdisc_match_from_root(
309                         dev_ingress_queue(dev)->qdisc_sleeping,
310                         handle);
311 out:
312         return q;
313 }
314
315 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
316 {
317         struct netdev_queue *nq;
318         struct Qdisc *q;
319
320         if (!handle)
321                 return NULL;
322         q = qdisc_match_from_root(dev->qdisc, handle);
323         if (q)
324                 goto out;
325
326         nq = dev_ingress_queue_rcu(dev);
327         if (nq)
328                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
329 out:
330         return q;
331 }
332
333 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
334 {
335         unsigned long cl;
336         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
337
338         if (cops == NULL)
339                 return NULL;
340         cl = cops->find(p, classid);
341
342         if (cl == 0)
343                 return NULL;
344         return cops->leaf(p, cl);
345 }
346
347 /* Find queueing discipline by name */
348
349 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
350 {
351         struct Qdisc_ops *q = NULL;
352
353         if (kind) {
354                 read_lock(&qdisc_mod_lock);
355                 for (q = qdisc_base; q; q = q->next) {
356                         if (nla_strcmp(kind, q->id) == 0) {
357                                 if (!try_module_get(q->owner))
358                                         q = NULL;
359                                 break;
360                         }
361                 }
362                 read_unlock(&qdisc_mod_lock);
363         }
364         return q;
365 }
366
367 /* The linklayer setting were not transferred from iproute2, in older
368  * versions, and the rate tables lookup systems have been dropped in
369  * the kernel. To keep backward compatible with older iproute2 tc
370  * utils, we detect the linklayer setting by detecting if the rate
371  * table were modified.
372  *
373  * For linklayer ATM table entries, the rate table will be aligned to
374  * 48 bytes, thus some table entries will contain the same value.  The
375  * mpu (min packet unit) is also encoded into the old rate table, thus
376  * starting from the mpu, we find low and high table entries for
377  * mapping this cell.  If these entries contain the same value, when
378  * the rate tables have been modified for linklayer ATM.
379  *
380  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
381  * and then roundup to the next cell, calc the table entry one below,
382  * and compare.
383  */
384 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
385 {
386         int low       = roundup(r->mpu, 48);
387         int high      = roundup(low+1, 48);
388         int cell_low  = low >> r->cell_log;
389         int cell_high = (high >> r->cell_log) - 1;
390
391         /* rtab is too inaccurate at rates > 100Mbit/s */
392         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
393                 pr_debug("TC linklayer: Giving up ATM detection\n");
394                 return TC_LINKLAYER_ETHERNET;
395         }
396
397         if ((cell_high > cell_low) && (cell_high < 256)
398             && (rtab[cell_low] == rtab[cell_high])) {
399                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
400                          cell_low, cell_high, rtab[cell_high]);
401                 return TC_LINKLAYER_ATM;
402         }
403         return TC_LINKLAYER_ETHERNET;
404 }
405
406 static struct qdisc_rate_table *qdisc_rtab_list;
407
408 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
409                                         struct nlattr *tab,
410                                         struct netlink_ext_ack *extack)
411 {
412         struct qdisc_rate_table *rtab;
413
414         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
415             nla_len(tab) != TC_RTAB_SIZE) {
416                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
417                 return NULL;
418         }
419
420         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
421                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
422                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
423                         rtab->refcnt++;
424                         return rtab;
425                 }
426         }
427
428         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
429         if (rtab) {
430                 rtab->rate = *r;
431                 rtab->refcnt = 1;
432                 memcpy(rtab->data, nla_data(tab), 1024);
433                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
434                         r->linklayer = __detect_linklayer(r, rtab->data);
435                 rtab->next = qdisc_rtab_list;
436                 qdisc_rtab_list = rtab;
437         } else {
438                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
439         }
440         return rtab;
441 }
442 EXPORT_SYMBOL(qdisc_get_rtab);
443
444 void qdisc_put_rtab(struct qdisc_rate_table *tab)
445 {
446         struct qdisc_rate_table *rtab, **rtabp;
447
448         if (!tab || --tab->refcnt)
449                 return;
450
451         for (rtabp = &qdisc_rtab_list;
452              (rtab = *rtabp) != NULL;
453              rtabp = &rtab->next) {
454                 if (rtab == tab) {
455                         *rtabp = rtab->next;
456                         kfree(rtab);
457                         return;
458                 }
459         }
460 }
461 EXPORT_SYMBOL(qdisc_put_rtab);
462
463 static LIST_HEAD(qdisc_stab_list);
464
465 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
466         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
467         [TCA_STAB_DATA] = { .type = NLA_BINARY },
468 };
469
470 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
471                                                struct netlink_ext_ack *extack)
472 {
473         struct nlattr *tb[TCA_STAB_MAX + 1];
474         struct qdisc_size_table *stab;
475         struct tc_sizespec *s;
476         unsigned int tsize = 0;
477         u16 *tab = NULL;
478         int err;
479
480         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
481                                           extack);
482         if (err < 0)
483                 return ERR_PTR(err);
484         if (!tb[TCA_STAB_BASE]) {
485                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
486                 return ERR_PTR(-EINVAL);
487         }
488
489         s = nla_data(tb[TCA_STAB_BASE]);
490
491         if (s->tsize > 0) {
492                 if (!tb[TCA_STAB_DATA]) {
493                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
494                         return ERR_PTR(-EINVAL);
495                 }
496                 tab = nla_data(tb[TCA_STAB_DATA]);
497                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
498         }
499
500         if (tsize != s->tsize || (!tab && tsize > 0)) {
501                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
502                 return ERR_PTR(-EINVAL);
503         }
504
505         list_for_each_entry(stab, &qdisc_stab_list, list) {
506                 if (memcmp(&stab->szopts, s, sizeof(*s)))
507                         continue;
508                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
509                         continue;
510                 stab->refcnt++;
511                 return stab;
512         }
513
514         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
515         if (!stab)
516                 return ERR_PTR(-ENOMEM);
517
518         stab->refcnt = 1;
519         stab->szopts = *s;
520         if (tsize > 0)
521                 memcpy(stab->data, tab, tsize * sizeof(u16));
522
523         list_add_tail(&stab->list, &qdisc_stab_list);
524
525         return stab;
526 }
527
528 void qdisc_put_stab(struct qdisc_size_table *tab)
529 {
530         if (!tab)
531                 return;
532
533         if (--tab->refcnt == 0) {
534                 list_del(&tab->list);
535                 kfree_rcu(tab, rcu);
536         }
537 }
538 EXPORT_SYMBOL(qdisc_put_stab);
539
540 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
541 {
542         struct nlattr *nest;
543
544         nest = nla_nest_start_noflag(skb, TCA_STAB);
545         if (nest == NULL)
546                 goto nla_put_failure;
547         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
548                 goto nla_put_failure;
549         nla_nest_end(skb, nest);
550
551         return skb->len;
552
553 nla_put_failure:
554         return -1;
555 }
556
557 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
558                                const struct qdisc_size_table *stab)
559 {
560         int pkt_len, slot;
561
562         pkt_len = skb->len + stab->szopts.overhead;
563         if (unlikely(!stab->szopts.tsize))
564                 goto out;
565
566         slot = pkt_len + stab->szopts.cell_align;
567         if (unlikely(slot < 0))
568                 slot = 0;
569
570         slot >>= stab->szopts.cell_log;
571         if (likely(slot < stab->szopts.tsize))
572                 pkt_len = stab->data[slot];
573         else
574                 pkt_len = stab->data[stab->szopts.tsize - 1] *
575                                 (slot / stab->szopts.tsize) +
576                                 stab->data[slot % stab->szopts.tsize];
577
578         pkt_len <<= stab->szopts.size_log;
579 out:
580         if (unlikely(pkt_len < 1))
581                 pkt_len = 1;
582         qdisc_skb_cb(skb)->pkt_len = pkt_len;
583 }
584 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
585
586 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
587 {
588         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
589                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
590                         txt, qdisc->ops->id, qdisc->handle >> 16);
591                 qdisc->flags |= TCQ_F_WARN_NONWC;
592         }
593 }
594 EXPORT_SYMBOL(qdisc_warn_nonwc);
595
596 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
597 {
598         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
599                                                  timer);
600
601         rcu_read_lock();
602         __netif_schedule(qdisc_root(wd->qdisc));
603         rcu_read_unlock();
604
605         return HRTIMER_NORESTART;
606 }
607
608 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
609                                  clockid_t clockid)
610 {
611         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
612         wd->timer.function = qdisc_watchdog;
613         wd->qdisc = qdisc;
614 }
615 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
616
617 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
618 {
619         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
620 }
621 EXPORT_SYMBOL(qdisc_watchdog_init);
622
623 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
624                                       u64 delta_ns)
625 {
626         if (test_bit(__QDISC_STATE_DEACTIVATED,
627                      &qdisc_root_sleeping(wd->qdisc)->state))
628                 return;
629
630         if (hrtimer_is_queued(&wd->timer)) {
631                 /* If timer is already set in [expires, expires + delta_ns],
632                  * do not reprogram it.
633                  */
634                 if (wd->last_expires - expires <= delta_ns)
635                         return;
636         }
637
638         wd->last_expires = expires;
639         hrtimer_start_range_ns(&wd->timer,
640                                ns_to_ktime(expires),
641                                delta_ns,
642                                HRTIMER_MODE_ABS_PINNED);
643 }
644 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
645
646 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
647 {
648         hrtimer_cancel(&wd->timer);
649 }
650 EXPORT_SYMBOL(qdisc_watchdog_cancel);
651
652 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
653 {
654         struct hlist_head *h;
655         unsigned int i;
656
657         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
658
659         if (h != NULL) {
660                 for (i = 0; i < n; i++)
661                         INIT_HLIST_HEAD(&h[i]);
662         }
663         return h;
664 }
665
666 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
667 {
668         struct Qdisc_class_common *cl;
669         struct hlist_node *next;
670         struct hlist_head *nhash, *ohash;
671         unsigned int nsize, nmask, osize;
672         unsigned int i, h;
673
674         /* Rehash when load factor exceeds 0.75 */
675         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
676                 return;
677         nsize = clhash->hashsize * 2;
678         nmask = nsize - 1;
679         nhash = qdisc_class_hash_alloc(nsize);
680         if (nhash == NULL)
681                 return;
682
683         ohash = clhash->hash;
684         osize = clhash->hashsize;
685
686         sch_tree_lock(sch);
687         for (i = 0; i < osize; i++) {
688                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
689                         h = qdisc_class_hash(cl->classid, nmask);
690                         hlist_add_head(&cl->hnode, &nhash[h]);
691                 }
692         }
693         clhash->hash     = nhash;
694         clhash->hashsize = nsize;
695         clhash->hashmask = nmask;
696         sch_tree_unlock(sch);
697
698         kvfree(ohash);
699 }
700 EXPORT_SYMBOL(qdisc_class_hash_grow);
701
702 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
703 {
704         unsigned int size = 4;
705
706         clhash->hash = qdisc_class_hash_alloc(size);
707         if (!clhash->hash)
708                 return -ENOMEM;
709         clhash->hashsize  = size;
710         clhash->hashmask  = size - 1;
711         clhash->hashelems = 0;
712         return 0;
713 }
714 EXPORT_SYMBOL(qdisc_class_hash_init);
715
716 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
717 {
718         kvfree(clhash->hash);
719 }
720 EXPORT_SYMBOL(qdisc_class_hash_destroy);
721
722 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
723                              struct Qdisc_class_common *cl)
724 {
725         unsigned int h;
726
727         INIT_HLIST_NODE(&cl->hnode);
728         h = qdisc_class_hash(cl->classid, clhash->hashmask);
729         hlist_add_head(&cl->hnode, &clhash->hash[h]);
730         clhash->hashelems++;
731 }
732 EXPORT_SYMBOL(qdisc_class_hash_insert);
733
734 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
735                              struct Qdisc_class_common *cl)
736 {
737         hlist_del(&cl->hnode);
738         clhash->hashelems--;
739 }
740 EXPORT_SYMBOL(qdisc_class_hash_remove);
741
742 /* Allocate a unique handle from the space managed by the kernel.
743  * Possible range is [8000-FFFF]:0000 (0x8000 values)
744  */
745 static u32 qdisc_alloc_handle(struct net_device *dev)
746 {
747         int i = 0x8000;
748         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
749
750         do {
751                 autohandle += TC_H_MAKE(0x10000U, 0);
752                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
753                         autohandle = TC_H_MAKE(0x80000000U, 0);
754                 if (!qdisc_lookup(dev, autohandle))
755                         return autohandle;
756                 cond_resched();
757         } while (--i > 0);
758
759         return 0;
760 }
761
762 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
763 {
764         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
765         const struct Qdisc_class_ops *cops;
766         unsigned long cl;
767         u32 parentid;
768         bool notify;
769         int drops;
770
771         if (n == 0 && len == 0)
772                 return;
773         drops = max_t(int, n, 0);
774         rcu_read_lock();
775         while ((parentid = sch->parent)) {
776                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
777                         break;
778
779                 if (sch->flags & TCQ_F_NOPARENT)
780                         break;
781                 /* Notify parent qdisc only if child qdisc becomes empty.
782                  *
783                  * If child was empty even before update then backlog
784                  * counter is screwed and we skip notification because
785                  * parent class is already passive.
786                  *
787                  * If the original child was offloaded then it is allowed
788                  * to be seem as empty, so the parent is notified anyway.
789                  */
790                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
791                                                        !qdisc_is_offloaded);
792                 /* TODO: perform the search on a per txq basis */
793                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
794                 if (sch == NULL) {
795                         WARN_ON_ONCE(parentid != TC_H_ROOT);
796                         break;
797                 }
798                 cops = sch->ops->cl_ops;
799                 if (notify && cops->qlen_notify) {
800                         cl = cops->find(sch, parentid);
801                         cops->qlen_notify(sch, cl);
802                 }
803                 sch->q.qlen -= n;
804                 sch->qstats.backlog -= len;
805                 __qdisc_qstats_drop(sch, drops);
806         }
807         rcu_read_unlock();
808 }
809 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
810
811 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
812                               void *type_data)
813 {
814         struct net_device *dev = qdisc_dev(sch);
815         int err;
816
817         sch->flags &= ~TCQ_F_OFFLOADED;
818         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
819                 return 0;
820
821         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
822         if (err == -EOPNOTSUPP)
823                 return 0;
824
825         if (!err)
826                 sch->flags |= TCQ_F_OFFLOADED;
827
828         return err;
829 }
830 EXPORT_SYMBOL(qdisc_offload_dump_helper);
831
832 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
833                                 struct Qdisc *new, struct Qdisc *old,
834                                 enum tc_setup_type type, void *type_data,
835                                 struct netlink_ext_ack *extack)
836 {
837         bool any_qdisc_is_offloaded;
838         int err;
839
840         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
841                 return;
842
843         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
844
845         /* Don't report error if the graft is part of destroy operation. */
846         if (!err || !new || new == &noop_qdisc)
847                 return;
848
849         /* Don't report error if the parent, the old child and the new
850          * one are not offloaded.
851          */
852         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
853         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
854         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
855
856         if (any_qdisc_is_offloaded)
857                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
858 }
859 EXPORT_SYMBOL(qdisc_offload_graft_helper);
860
861 static void qdisc_offload_graft_root(struct net_device *dev,
862                                      struct Qdisc *new, struct Qdisc *old,
863                                      struct netlink_ext_ack *extack)
864 {
865         struct tc_root_qopt_offload graft_offload = {
866                 .command        = TC_ROOT_GRAFT,
867                 .handle         = new ? new->handle : 0,
868                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
869                                   (old && old->flags & TCQ_F_INGRESS),
870         };
871
872         qdisc_offload_graft_helper(dev, NULL, new, old,
873                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
874 }
875
876 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
877                          u32 portid, u32 seq, u16 flags, int event)
878 {
879         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
880         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
881         struct tcmsg *tcm;
882         struct nlmsghdr  *nlh;
883         unsigned char *b = skb_tail_pointer(skb);
884         struct gnet_dump d;
885         struct qdisc_size_table *stab;
886         u32 block_index;
887         __u32 qlen;
888
889         cond_resched();
890         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
891         if (!nlh)
892                 goto out_nlmsg_trim;
893         tcm = nlmsg_data(nlh);
894         tcm->tcm_family = AF_UNSPEC;
895         tcm->tcm__pad1 = 0;
896         tcm->tcm__pad2 = 0;
897         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
898         tcm->tcm_parent = clid;
899         tcm->tcm_handle = q->handle;
900         tcm->tcm_info = refcount_read(&q->refcnt);
901         if (nla_put_string(skb, TCA_KIND, q->ops->id))
902                 goto nla_put_failure;
903         if (q->ops->ingress_block_get) {
904                 block_index = q->ops->ingress_block_get(q);
905                 if (block_index &&
906                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
907                         goto nla_put_failure;
908         }
909         if (q->ops->egress_block_get) {
910                 block_index = q->ops->egress_block_get(q);
911                 if (block_index &&
912                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
913                         goto nla_put_failure;
914         }
915         if (q->ops->dump && q->ops->dump(q, skb) < 0)
916                 goto nla_put_failure;
917         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
918                 goto nla_put_failure;
919         qlen = qdisc_qlen_sum(q);
920
921         stab = rtnl_dereference(q->stab);
922         if (stab && qdisc_dump_stab(skb, stab) < 0)
923                 goto nla_put_failure;
924
925         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
926                                          NULL, &d, TCA_PAD) < 0)
927                 goto nla_put_failure;
928
929         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
930                 goto nla_put_failure;
931
932         if (qdisc_is_percpu_stats(q)) {
933                 cpu_bstats = q->cpu_bstats;
934                 cpu_qstats = q->cpu_qstats;
935         }
936
937         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
938                                   &d, cpu_bstats, &q->bstats) < 0 ||
939             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
940             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
941                 goto nla_put_failure;
942
943         if (gnet_stats_finish_copy(&d) < 0)
944                 goto nla_put_failure;
945
946         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
947         return skb->len;
948
949 out_nlmsg_trim:
950 nla_put_failure:
951         nlmsg_trim(skb, b);
952         return -1;
953 }
954
955 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
956 {
957         if (q->flags & TCQ_F_BUILTIN)
958                 return true;
959         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
960                 return true;
961
962         return false;
963 }
964
965 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
966                         struct nlmsghdr *n, u32 clid,
967                         struct Qdisc *old, struct Qdisc *new)
968 {
969         struct sk_buff *skb;
970         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
971
972         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
973         if (!skb)
974                 return -ENOBUFS;
975
976         if (old && !tc_qdisc_dump_ignore(old, false)) {
977                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
978                                   0, RTM_DELQDISC) < 0)
979                         goto err_out;
980         }
981         if (new && !tc_qdisc_dump_ignore(new, false)) {
982                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
983                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
984                         goto err_out;
985         }
986
987         if (skb->len)
988                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
989                                       n->nlmsg_flags & NLM_F_ECHO);
990
991 err_out:
992         kfree_skb(skb);
993         return -EINVAL;
994 }
995
996 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
997                                struct nlmsghdr *n, u32 clid,
998                                struct Qdisc *old, struct Qdisc *new)
999 {
1000         if (new || old)
1001                 qdisc_notify(net, skb, n, clid, old, new);
1002
1003         if (old)
1004                 qdisc_put(old);
1005 }
1006
1007 static void qdisc_clear_nolock(struct Qdisc *sch)
1008 {
1009         sch->flags &= ~TCQ_F_NOLOCK;
1010         if (!(sch->flags & TCQ_F_CPUSTATS))
1011                 return;
1012
1013         free_percpu(sch->cpu_bstats);
1014         free_percpu(sch->cpu_qstats);
1015         sch->cpu_bstats = NULL;
1016         sch->cpu_qstats = NULL;
1017         sch->flags &= ~TCQ_F_CPUSTATS;
1018 }
1019
1020 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1021  * to device "dev".
1022  *
1023  * When appropriate send a netlink notification using 'skb'
1024  * and "n".
1025  *
1026  * On success, destroy old qdisc.
1027  */
1028
1029 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1030                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1031                        struct Qdisc *new, struct Qdisc *old,
1032                        struct netlink_ext_ack *extack)
1033 {
1034         struct Qdisc *q = old;
1035         struct net *net = dev_net(dev);
1036
1037         if (parent == NULL) {
1038                 unsigned int i, num_q, ingress;
1039
1040                 ingress = 0;
1041                 num_q = dev->num_tx_queues;
1042                 if ((q && q->flags & TCQ_F_INGRESS) ||
1043                     (new && new->flags & TCQ_F_INGRESS)) {
1044                         num_q = 1;
1045                         ingress = 1;
1046                         if (!dev_ingress_queue(dev)) {
1047                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1048                                 return -ENOENT;
1049                         }
1050                 }
1051
1052                 if (dev->flags & IFF_UP)
1053                         dev_deactivate(dev);
1054
1055                 qdisc_offload_graft_root(dev, new, old, extack);
1056
1057                 if (new && new->ops->attach)
1058                         goto skip;
1059
1060                 for (i = 0; i < num_q; i++) {
1061                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1062
1063                         if (!ingress)
1064                                 dev_queue = netdev_get_tx_queue(dev, i);
1065
1066                         old = dev_graft_qdisc(dev_queue, new);
1067                         if (new && i > 0)
1068                                 qdisc_refcount_inc(new);
1069
1070                         if (!ingress)
1071                                 qdisc_put(old);
1072                 }
1073
1074 skip:
1075                 if (!ingress) {
1076                         notify_and_destroy(net, skb, n, classid,
1077                                            dev->qdisc, new);
1078                         if (new && !new->ops->attach)
1079                                 qdisc_refcount_inc(new);
1080                         dev->qdisc = new ? : &noop_qdisc;
1081
1082                         if (new && new->ops->attach)
1083                                 new->ops->attach(new);
1084                 } else {
1085                         notify_and_destroy(net, skb, n, classid, old, new);
1086                 }
1087
1088                 if (dev->flags & IFF_UP)
1089                         dev_activate(dev);
1090         } else {
1091                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1092                 unsigned long cl;
1093                 int err;
1094
1095                 /* Only support running class lockless if parent is lockless */
1096                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1097                         qdisc_clear_nolock(new);
1098
1099                 if (!cops || !cops->graft)
1100                         return -EOPNOTSUPP;
1101
1102                 cl = cops->find(parent, classid);
1103                 if (!cl) {
1104                         NL_SET_ERR_MSG(extack, "Specified class not found");
1105                         return -ENOENT;
1106                 }
1107
1108                 err = cops->graft(parent, cl, new, &old, extack);
1109                 if (err)
1110                         return err;
1111                 notify_and_destroy(net, skb, n, classid, old, new);
1112         }
1113         return 0;
1114 }
1115
1116 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1117                                    struct netlink_ext_ack *extack)
1118 {
1119         u32 block_index;
1120
1121         if (tca[TCA_INGRESS_BLOCK]) {
1122                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1123
1124                 if (!block_index) {
1125                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1126                         return -EINVAL;
1127                 }
1128                 if (!sch->ops->ingress_block_set) {
1129                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1130                         return -EOPNOTSUPP;
1131                 }
1132                 sch->ops->ingress_block_set(sch, block_index);
1133         }
1134         if (tca[TCA_EGRESS_BLOCK]) {
1135                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1136
1137                 if (!block_index) {
1138                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1139                         return -EINVAL;
1140                 }
1141                 if (!sch->ops->egress_block_set) {
1142                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1143                         return -EOPNOTSUPP;
1144                 }
1145                 sch->ops->egress_block_set(sch, block_index);
1146         }
1147         return 0;
1148 }
1149
1150 /*
1151    Allocate and initialize new qdisc.
1152
1153    Parameters are passed via opt.
1154  */
1155
1156 static struct Qdisc *qdisc_create(struct net_device *dev,
1157                                   struct netdev_queue *dev_queue,
1158                                   struct Qdisc *p, u32 parent, u32 handle,
1159                                   struct nlattr **tca, int *errp,
1160                                   struct netlink_ext_ack *extack)
1161 {
1162         int err;
1163         struct nlattr *kind = tca[TCA_KIND];
1164         struct Qdisc *sch;
1165         struct Qdisc_ops *ops;
1166         struct qdisc_size_table *stab;
1167
1168         ops = qdisc_lookup_ops(kind);
1169 #ifdef CONFIG_MODULES
1170         if (ops == NULL && kind != NULL) {
1171                 char name[IFNAMSIZ];
1172                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1173                         /* We dropped the RTNL semaphore in order to
1174                          * perform the module load.  So, even if we
1175                          * succeeded in loading the module we have to
1176                          * tell the caller to replay the request.  We
1177                          * indicate this using -EAGAIN.
1178                          * We replay the request because the device may
1179                          * go away in the mean time.
1180                          */
1181                         rtnl_unlock();
1182                         request_module("sch_%s", name);
1183                         rtnl_lock();
1184                         ops = qdisc_lookup_ops(kind);
1185                         if (ops != NULL) {
1186                                 /* We will try again qdisc_lookup_ops,
1187                                  * so don't keep a reference.
1188                                  */
1189                                 module_put(ops->owner);
1190                                 err = -EAGAIN;
1191                                 goto err_out;
1192                         }
1193                 }
1194         }
1195 #endif
1196
1197         err = -ENOENT;
1198         if (!ops) {
1199                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1200                 goto err_out;
1201         }
1202
1203         sch = qdisc_alloc(dev_queue, ops, extack);
1204         if (IS_ERR(sch)) {
1205                 err = PTR_ERR(sch);
1206                 goto err_out2;
1207         }
1208
1209         sch->parent = parent;
1210
1211         if (handle == TC_H_INGRESS) {
1212                 sch->flags |= TCQ_F_INGRESS;
1213                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1214         } else {
1215                 if (handle == 0) {
1216                         handle = qdisc_alloc_handle(dev);
1217                         if (handle == 0) {
1218                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1219                                 err = -ENOSPC;
1220                                 goto err_out3;
1221                         }
1222                 }
1223                 if (!netif_is_multiqueue(dev))
1224                         sch->flags |= TCQ_F_ONETXQUEUE;
1225         }
1226
1227         sch->handle = handle;
1228
1229         /* This exist to keep backward compatible with a userspace
1230          * loophole, what allowed userspace to get IFF_NO_QUEUE
1231          * facility on older kernels by setting tx_queue_len=0 (prior
1232          * to qdisc init), and then forgot to reinit tx_queue_len
1233          * before again attaching a qdisc.
1234          */
1235         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1236                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1237                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1238         }
1239
1240         err = qdisc_block_indexes_set(sch, tca, extack);
1241         if (err)
1242                 goto err_out3;
1243
1244         if (ops->init) {
1245                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1246                 if (err != 0)
1247                         goto err_out5;
1248         }
1249
1250         if (tca[TCA_STAB]) {
1251                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1252                 if (IS_ERR(stab)) {
1253                         err = PTR_ERR(stab);
1254                         goto err_out4;
1255                 }
1256                 rcu_assign_pointer(sch->stab, stab);
1257         }
1258         if (tca[TCA_RATE]) {
1259                 seqcount_t *running;
1260
1261                 err = -EOPNOTSUPP;
1262                 if (sch->flags & TCQ_F_MQROOT) {
1263                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1264                         goto err_out4;
1265                 }
1266
1267                 if (sch->parent != TC_H_ROOT &&
1268                     !(sch->flags & TCQ_F_INGRESS) &&
1269                     (!p || !(p->flags & TCQ_F_MQROOT)))
1270                         running = qdisc_root_sleeping_running(sch);
1271                 else
1272                         running = &sch->running;
1273
1274                 err = gen_new_estimator(&sch->bstats,
1275                                         sch->cpu_bstats,
1276                                         &sch->rate_est,
1277                                         NULL,
1278                                         running,
1279                                         tca[TCA_RATE]);
1280                 if (err) {
1281                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1282                         goto err_out4;
1283                 }
1284         }
1285
1286         qdisc_hash_add(sch, false);
1287         trace_qdisc_create(ops, dev, parent);
1288
1289         return sch;
1290
1291 err_out5:
1292         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1293         if (ops->destroy)
1294                 ops->destroy(sch);
1295 err_out3:
1296         dev_put(dev);
1297         qdisc_free(sch);
1298 err_out2:
1299         module_put(ops->owner);
1300 err_out:
1301         *errp = err;
1302         return NULL;
1303
1304 err_out4:
1305         /*
1306          * Any broken qdiscs that would require a ops->reset() here?
1307          * The qdisc was never in action so it shouldn't be necessary.
1308          */
1309         qdisc_put_stab(rtnl_dereference(sch->stab));
1310         if (ops->destroy)
1311                 ops->destroy(sch);
1312         goto err_out3;
1313 }
1314
1315 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1316                         struct netlink_ext_ack *extack)
1317 {
1318         struct qdisc_size_table *ostab, *stab = NULL;
1319         int err = 0;
1320
1321         if (tca[TCA_OPTIONS]) {
1322                 if (!sch->ops->change) {
1323                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1324                         return -EINVAL;
1325                 }
1326                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1327                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1328                         return -EOPNOTSUPP;
1329                 }
1330                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1331                 if (err)
1332                         return err;
1333         }
1334
1335         if (tca[TCA_STAB]) {
1336                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1337                 if (IS_ERR(stab))
1338                         return PTR_ERR(stab);
1339         }
1340
1341         ostab = rtnl_dereference(sch->stab);
1342         rcu_assign_pointer(sch->stab, stab);
1343         qdisc_put_stab(ostab);
1344
1345         if (tca[TCA_RATE]) {
1346                 /* NB: ignores errors from replace_estimator
1347                    because change can't be undone. */
1348                 if (sch->flags & TCQ_F_MQROOT)
1349                         goto out;
1350                 gen_replace_estimator(&sch->bstats,
1351                                       sch->cpu_bstats,
1352                                       &sch->rate_est,
1353                                       NULL,
1354                                       qdisc_root_sleeping_running(sch),
1355                                       tca[TCA_RATE]);
1356         }
1357 out:
1358         return 0;
1359 }
1360
1361 struct check_loop_arg {
1362         struct qdisc_walker     w;
1363         struct Qdisc            *p;
1364         int                     depth;
1365 };
1366
1367 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1368                          struct qdisc_walker *w);
1369
1370 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1371 {
1372         struct check_loop_arg   arg;
1373
1374         if (q->ops->cl_ops == NULL)
1375                 return 0;
1376
1377         arg.w.stop = arg.w.skip = arg.w.count = 0;
1378         arg.w.fn = check_loop_fn;
1379         arg.depth = depth;
1380         arg.p = p;
1381         q->ops->cl_ops->walk(q, &arg.w);
1382         return arg.w.stop ? -ELOOP : 0;
1383 }
1384
1385 static int
1386 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1387 {
1388         struct Qdisc *leaf;
1389         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1390         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1391
1392         leaf = cops->leaf(q, cl);
1393         if (leaf) {
1394                 if (leaf == arg->p || arg->depth > 7)
1395                         return -ELOOP;
1396                 return check_loop(leaf, arg->p, arg->depth + 1);
1397         }
1398         return 0;
1399 }
1400
1401 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1402         [TCA_KIND]              = { .type = NLA_STRING },
1403         [TCA_RATE]              = { .type = NLA_BINARY,
1404                                     .len = sizeof(struct tc_estimator) },
1405         [TCA_STAB]              = { .type = NLA_NESTED },
1406         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1407         [TCA_CHAIN]             = { .type = NLA_U32 },
1408         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1409         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1410 };
1411
1412 /*
1413  * Delete/get qdisc.
1414  */
1415
1416 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1417                         struct netlink_ext_ack *extack)
1418 {
1419         struct net *net = sock_net(skb->sk);
1420         struct tcmsg *tcm = nlmsg_data(n);
1421         struct nlattr *tca[TCA_MAX + 1];
1422         struct net_device *dev;
1423         u32 clid;
1424         struct Qdisc *q = NULL;
1425         struct Qdisc *p = NULL;
1426         int err;
1427
1428         if ((n->nlmsg_type != RTM_GETQDISC) &&
1429             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1430                 return -EPERM;
1431
1432         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1433                                      rtm_tca_policy, extack);
1434         if (err < 0)
1435                 return err;
1436
1437         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1438         if (!dev)
1439                 return -ENODEV;
1440
1441         clid = tcm->tcm_parent;
1442         if (clid) {
1443                 if (clid != TC_H_ROOT) {
1444                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1445                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1446                                 if (!p) {
1447                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1448                                         return -ENOENT;
1449                                 }
1450                                 q = qdisc_leaf(p, clid);
1451                         } else if (dev_ingress_queue(dev)) {
1452                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1453                         }
1454                 } else {
1455                         q = dev->qdisc;
1456                 }
1457                 if (!q) {
1458                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1459                         return -ENOENT;
1460                 }
1461
1462                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1463                         NL_SET_ERR_MSG(extack, "Invalid handle");
1464                         return -EINVAL;
1465                 }
1466         } else {
1467                 q = qdisc_lookup(dev, tcm->tcm_handle);
1468                 if (!q) {
1469                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1470                         return -ENOENT;
1471                 }
1472         }
1473
1474         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1475                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1476                 return -EINVAL;
1477         }
1478
1479         if (n->nlmsg_type == RTM_DELQDISC) {
1480                 if (!clid) {
1481                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1482                         return -EINVAL;
1483                 }
1484                 if (q->handle == 0) {
1485                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1486                         return -ENOENT;
1487                 }
1488                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1489                 if (err != 0)
1490                         return err;
1491         } else {
1492                 qdisc_notify(net, skb, n, clid, NULL, q);
1493         }
1494         return 0;
1495 }
1496
1497 /*
1498  * Create/change qdisc.
1499  */
1500
1501 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1502                            struct netlink_ext_ack *extack)
1503 {
1504         struct net *net = sock_net(skb->sk);
1505         struct tcmsg *tcm;
1506         struct nlattr *tca[TCA_MAX + 1];
1507         struct net_device *dev;
1508         u32 clid;
1509         struct Qdisc *q, *p;
1510         int err;
1511
1512         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1513                 return -EPERM;
1514
1515 replay:
1516         /* Reinit, just in case something touches this. */
1517         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1518                                      rtm_tca_policy, extack);
1519         if (err < 0)
1520                 return err;
1521
1522         tcm = nlmsg_data(n);
1523         clid = tcm->tcm_parent;
1524         q = p = NULL;
1525
1526         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1527         if (!dev)
1528                 return -ENODEV;
1529
1530
1531         if (clid) {
1532                 if (clid != TC_H_ROOT) {
1533                         if (clid != TC_H_INGRESS) {
1534                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1535                                 if (!p) {
1536                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1537                                         return -ENOENT;
1538                                 }
1539                                 q = qdisc_leaf(p, clid);
1540                         } else if (dev_ingress_queue_create(dev)) {
1541                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1542                         }
1543                 } else {
1544                         q = dev->qdisc;
1545                 }
1546
1547                 /* It may be default qdisc, ignore it */
1548                 if (q && q->handle == 0)
1549                         q = NULL;
1550
1551                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1552                         if (tcm->tcm_handle) {
1553                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1554                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1555                                         return -EEXIST;
1556                                 }
1557                                 if (TC_H_MIN(tcm->tcm_handle)) {
1558                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1559                                         return -EINVAL;
1560                                 }
1561                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1562                                 if (!q)
1563                                         goto create_n_graft;
1564                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1565                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1566                                         return -EEXIST;
1567                                 }
1568                                 if (tca[TCA_KIND] &&
1569                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1570                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1571                                         return -EINVAL;
1572                                 }
1573                                 if (q == p ||
1574                                     (p && check_loop(q, p, 0))) {
1575                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1576                                         return -ELOOP;
1577                                 }
1578                                 qdisc_refcount_inc(q);
1579                                 goto graft;
1580                         } else {
1581                                 if (!q)
1582                                         goto create_n_graft;
1583
1584                                 /* This magic test requires explanation.
1585                                  *
1586                                  *   We know, that some child q is already
1587                                  *   attached to this parent and have choice:
1588                                  *   either to change it or to create/graft new one.
1589                                  *
1590                                  *   1. We are allowed to create/graft only
1591                                  *   if CREATE and REPLACE flags are set.
1592                                  *
1593                                  *   2. If EXCL is set, requestor wanted to say,
1594                                  *   that qdisc tcm_handle is not expected
1595                                  *   to exist, so that we choose create/graft too.
1596                                  *
1597                                  *   3. The last case is when no flags are set.
1598                                  *   Alas, it is sort of hole in API, we
1599                                  *   cannot decide what to do unambiguously.
1600                                  *   For now we select create/graft, if
1601                                  *   user gave KIND, which does not match existing.
1602                                  */
1603                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1604                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1605                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1606                                      (tca[TCA_KIND] &&
1607                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1608                                         goto create_n_graft;
1609                         }
1610                 }
1611         } else {
1612                 if (!tcm->tcm_handle) {
1613                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1614                         return -EINVAL;
1615                 }
1616                 q = qdisc_lookup(dev, tcm->tcm_handle);
1617         }
1618
1619         /* Change qdisc parameters */
1620         if (!q) {
1621                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1622                 return -ENOENT;
1623         }
1624         if (n->nlmsg_flags & NLM_F_EXCL) {
1625                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1626                 return -EEXIST;
1627         }
1628         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1629                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1630                 return -EINVAL;
1631         }
1632         err = qdisc_change(q, tca, extack);
1633         if (err == 0)
1634                 qdisc_notify(net, skb, n, clid, NULL, q);
1635         return err;
1636
1637 create_n_graft:
1638         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1639                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1640                 return -ENOENT;
1641         }
1642         if (clid == TC_H_INGRESS) {
1643                 if (dev_ingress_queue(dev)) {
1644                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1645                                          tcm->tcm_parent, tcm->tcm_parent,
1646                                          tca, &err, extack);
1647                 } else {
1648                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1649                         err = -ENOENT;
1650                 }
1651         } else {
1652                 struct netdev_queue *dev_queue;
1653
1654                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1655                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1656                 else if (p)
1657                         dev_queue = p->dev_queue;
1658                 else
1659                         dev_queue = netdev_get_tx_queue(dev, 0);
1660
1661                 q = qdisc_create(dev, dev_queue, p,
1662                                  tcm->tcm_parent, tcm->tcm_handle,
1663                                  tca, &err, extack);
1664         }
1665         if (q == NULL) {
1666                 if (err == -EAGAIN)
1667                         goto replay;
1668                 return err;
1669         }
1670
1671 graft:
1672         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1673         if (err) {
1674                 if (q)
1675                         qdisc_put(q);
1676                 return err;
1677         }
1678
1679         return 0;
1680 }
1681
1682 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1683                               struct netlink_callback *cb,
1684                               int *q_idx_p, int s_q_idx, bool recur,
1685                               bool dump_invisible)
1686 {
1687         int ret = 0, q_idx = *q_idx_p;
1688         struct Qdisc *q;
1689         int b;
1690
1691         if (!root)
1692                 return 0;
1693
1694         q = root;
1695         if (q_idx < s_q_idx) {
1696                 q_idx++;
1697         } else {
1698                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1699                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1700                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1701                                   RTM_NEWQDISC) <= 0)
1702                         goto done;
1703                 q_idx++;
1704         }
1705
1706         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1707          * itself has already been dumped.
1708          *
1709          * If we've already dumped the top-level (ingress) qdisc above and the global
1710          * qdisc hashtable, we don't want to hit it again
1711          */
1712         if (!qdisc_dev(root) || !recur)
1713                 goto out;
1714
1715         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1716                 if (q_idx < s_q_idx) {
1717                         q_idx++;
1718                         continue;
1719                 }
1720                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1721                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1722                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1723                                   RTM_NEWQDISC) <= 0)
1724                         goto done;
1725                 q_idx++;
1726         }
1727
1728 out:
1729         *q_idx_p = q_idx;
1730         return ret;
1731 done:
1732         ret = -1;
1733         goto out;
1734 }
1735
1736 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1737 {
1738         struct net *net = sock_net(skb->sk);
1739         int idx, q_idx;
1740         int s_idx, s_q_idx;
1741         struct net_device *dev;
1742         const struct nlmsghdr *nlh = cb->nlh;
1743         struct nlattr *tca[TCA_MAX + 1];
1744         int err;
1745
1746         s_idx = cb->args[0];
1747         s_q_idx = q_idx = cb->args[1];
1748
1749         idx = 0;
1750         ASSERT_RTNL();
1751
1752         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1753                                      rtm_tca_policy, cb->extack);
1754         if (err < 0)
1755                 return err;
1756
1757         for_each_netdev(net, dev) {
1758                 struct netdev_queue *dev_queue;
1759
1760                 if (idx < s_idx)
1761                         goto cont;
1762                 if (idx > s_idx)
1763                         s_q_idx = 0;
1764                 q_idx = 0;
1765
1766                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1767                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1768                         goto done;
1769
1770                 dev_queue = dev_ingress_queue(dev);
1771                 if (dev_queue &&
1772                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1773                                        &q_idx, s_q_idx, false,
1774                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1775                         goto done;
1776
1777 cont:
1778                 idx++;
1779         }
1780
1781 done:
1782         cb->args[0] = idx;
1783         cb->args[1] = q_idx;
1784
1785         return skb->len;
1786 }
1787
1788
1789
1790 /************************************************
1791  *      Traffic classes manipulation.           *
1792  ************************************************/
1793
1794 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1795                           unsigned long cl,
1796                           u32 portid, u32 seq, u16 flags, int event)
1797 {
1798         struct tcmsg *tcm;
1799         struct nlmsghdr  *nlh;
1800         unsigned char *b = skb_tail_pointer(skb);
1801         struct gnet_dump d;
1802         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1803
1804         cond_resched();
1805         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1806         if (!nlh)
1807                 goto out_nlmsg_trim;
1808         tcm = nlmsg_data(nlh);
1809         tcm->tcm_family = AF_UNSPEC;
1810         tcm->tcm__pad1 = 0;
1811         tcm->tcm__pad2 = 0;
1812         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1813         tcm->tcm_parent = q->handle;
1814         tcm->tcm_handle = q->handle;
1815         tcm->tcm_info = 0;
1816         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1817                 goto nla_put_failure;
1818         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1819                 goto nla_put_failure;
1820
1821         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1822                                          NULL, &d, TCA_PAD) < 0)
1823                 goto nla_put_failure;
1824
1825         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1826                 goto nla_put_failure;
1827
1828         if (gnet_stats_finish_copy(&d) < 0)
1829                 goto nla_put_failure;
1830
1831         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1832         return skb->len;
1833
1834 out_nlmsg_trim:
1835 nla_put_failure:
1836         nlmsg_trim(skb, b);
1837         return -1;
1838 }
1839
1840 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1841                          struct nlmsghdr *n, struct Qdisc *q,
1842                          unsigned long cl, int event)
1843 {
1844         struct sk_buff *skb;
1845         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1846         int err = 0;
1847
1848         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1849         if (!skb)
1850                 return -ENOBUFS;
1851
1852         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1853                 kfree_skb(skb);
1854                 return -EINVAL;
1855         }
1856
1857         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1858                              n->nlmsg_flags & NLM_F_ECHO);
1859         if (err > 0)
1860                 err = 0;
1861         return err;
1862 }
1863
1864 static int tclass_del_notify(struct net *net,
1865                              const struct Qdisc_class_ops *cops,
1866                              struct sk_buff *oskb, struct nlmsghdr *n,
1867                              struct Qdisc *q, unsigned long cl)
1868 {
1869         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1870         struct sk_buff *skb;
1871         int err = 0;
1872
1873         if (!cops->delete)
1874                 return -EOPNOTSUPP;
1875
1876         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1877         if (!skb)
1878                 return -ENOBUFS;
1879
1880         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1881                            RTM_DELTCLASS) < 0) {
1882                 kfree_skb(skb);
1883                 return -EINVAL;
1884         }
1885
1886         err = cops->delete(q, cl);
1887         if (err) {
1888                 kfree_skb(skb);
1889                 return err;
1890         }
1891
1892         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1893                              n->nlmsg_flags & NLM_F_ECHO);
1894         if (err > 0)
1895                 err = 0;
1896         return err;
1897 }
1898
1899 #ifdef CONFIG_NET_CLS
1900
1901 struct tcf_bind_args {
1902         struct tcf_walker w;
1903         unsigned long base;
1904         unsigned long cl;
1905         u32 classid;
1906 };
1907
1908 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1909 {
1910         struct tcf_bind_args *a = (void *)arg;
1911
1912         if (tp->ops->bind_class) {
1913                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1914
1915                 sch_tree_lock(q);
1916                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1917                 sch_tree_unlock(q);
1918         }
1919         return 0;
1920 }
1921
1922 struct tc_bind_class_args {
1923         struct qdisc_walker w;
1924         unsigned long new_cl;
1925         u32 portid;
1926         u32 clid;
1927 };
1928
1929 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1930                                 struct qdisc_walker *w)
1931 {
1932         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1933         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1934         struct tcf_block *block;
1935         struct tcf_chain *chain;
1936
1937         block = cops->tcf_block(q, cl, NULL);
1938         if (!block)
1939                 return 0;
1940         for (chain = tcf_get_next_chain(block, NULL);
1941              chain;
1942              chain = tcf_get_next_chain(block, chain)) {
1943                 struct tcf_proto *tp;
1944
1945                 for (tp = tcf_get_next_proto(chain, NULL, true);
1946                      tp; tp = tcf_get_next_proto(chain, tp, true)) {
1947                         struct tcf_bind_args arg = {};
1948
1949                         arg.w.fn = tcf_node_bind;
1950                         arg.classid = a->clid;
1951                         arg.base = cl;
1952                         arg.cl = a->new_cl;
1953                         tp->ops->walk(tp, &arg.w, true);
1954                 }
1955         }
1956
1957         return 0;
1958 }
1959
1960 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1961                            unsigned long new_cl)
1962 {
1963         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1964         struct tc_bind_class_args args = {};
1965
1966         if (!cops->tcf_block)
1967                 return;
1968         args.portid = portid;
1969         args.clid = clid;
1970         args.new_cl = new_cl;
1971         args.w.fn = tc_bind_class_walker;
1972         q->ops->cl_ops->walk(q, &args.w);
1973 }
1974
1975 #else
1976
1977 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1978                            unsigned long new_cl)
1979 {
1980 }
1981
1982 #endif
1983
1984 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1985                          struct netlink_ext_ack *extack)
1986 {
1987         struct net *net = sock_net(skb->sk);
1988         struct tcmsg *tcm = nlmsg_data(n);
1989         struct nlattr *tca[TCA_MAX + 1];
1990         struct net_device *dev;
1991         struct Qdisc *q = NULL;
1992         const struct Qdisc_class_ops *cops;
1993         unsigned long cl = 0;
1994         unsigned long new_cl;
1995         u32 portid;
1996         u32 clid;
1997         u32 qid;
1998         int err;
1999
2000         if ((n->nlmsg_type != RTM_GETTCLASS) &&
2001             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2002                 return -EPERM;
2003
2004         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2005                                      rtm_tca_policy, extack);
2006         if (err < 0)
2007                 return err;
2008
2009         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2010         if (!dev)
2011                 return -ENODEV;
2012
2013         /*
2014            parent == TC_H_UNSPEC - unspecified parent.
2015            parent == TC_H_ROOT   - class is root, which has no parent.
2016            parent == X:0         - parent is root class.
2017            parent == X:Y         - parent is a node in hierarchy.
2018            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2019
2020            handle == 0:0         - generate handle from kernel pool.
2021            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2022            handle == X:Y         - clear.
2023            handle == X:0         - root class.
2024          */
2025
2026         /* Step 1. Determine qdisc handle X:0 */
2027
2028         portid = tcm->tcm_parent;
2029         clid = tcm->tcm_handle;
2030         qid = TC_H_MAJ(clid);
2031
2032         if (portid != TC_H_ROOT) {
2033                 u32 qid1 = TC_H_MAJ(portid);
2034
2035                 if (qid && qid1) {
2036                         /* If both majors are known, they must be identical. */
2037                         if (qid != qid1)
2038                                 return -EINVAL;
2039                 } else if (qid1) {
2040                         qid = qid1;
2041                 } else if (qid == 0)
2042                         qid = dev->qdisc->handle;
2043
2044                 /* Now qid is genuine qdisc handle consistent
2045                  * both with parent and child.
2046                  *
2047                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2048                  */
2049                 if (portid)
2050                         portid = TC_H_MAKE(qid, portid);
2051         } else {
2052                 if (qid == 0)
2053                         qid = dev->qdisc->handle;
2054         }
2055
2056         /* OK. Locate qdisc */
2057         q = qdisc_lookup(dev, qid);
2058         if (!q)
2059                 return -ENOENT;
2060
2061         /* An check that it supports classes */
2062         cops = q->ops->cl_ops;
2063         if (cops == NULL)
2064                 return -EINVAL;
2065
2066         /* Now try to get class */
2067         if (clid == 0) {
2068                 if (portid == TC_H_ROOT)
2069                         clid = qid;
2070         } else
2071                 clid = TC_H_MAKE(qid, clid);
2072
2073         if (clid)
2074                 cl = cops->find(q, clid);
2075
2076         if (cl == 0) {
2077                 err = -ENOENT;
2078                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2079                     !(n->nlmsg_flags & NLM_F_CREATE))
2080                         goto out;
2081         } else {
2082                 switch (n->nlmsg_type) {
2083                 case RTM_NEWTCLASS:
2084                         err = -EEXIST;
2085                         if (n->nlmsg_flags & NLM_F_EXCL)
2086                                 goto out;
2087                         break;
2088                 case RTM_DELTCLASS:
2089                         err = tclass_del_notify(net, cops, skb, n, q, cl);
2090                         /* Unbind the class with flilters with 0 */
2091                         tc_bind_tclass(q, portid, clid, 0);
2092                         goto out;
2093                 case RTM_GETTCLASS:
2094                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2095                         goto out;
2096                 default:
2097                         err = -EINVAL;
2098                         goto out;
2099                 }
2100         }
2101
2102         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2103                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2104                 return -EOPNOTSUPP;
2105         }
2106
2107         new_cl = cl;
2108         err = -EOPNOTSUPP;
2109         if (cops->change)
2110                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2111         if (err == 0) {
2112                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2113                 /* We just create a new class, need to do reverse binding. */
2114                 if (cl != new_cl)
2115                         tc_bind_tclass(q, portid, clid, new_cl);
2116         }
2117 out:
2118         return err;
2119 }
2120
2121 struct qdisc_dump_args {
2122         struct qdisc_walker     w;
2123         struct sk_buff          *skb;
2124         struct netlink_callback *cb;
2125 };
2126
2127 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2128                             struct qdisc_walker *arg)
2129 {
2130         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2131
2132         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2133                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2134                               RTM_NEWTCLASS);
2135 }
2136
2137 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2138                                 struct tcmsg *tcm, struct netlink_callback *cb,
2139                                 int *t_p, int s_t)
2140 {
2141         struct qdisc_dump_args arg;
2142
2143         if (tc_qdisc_dump_ignore(q, false) ||
2144             *t_p < s_t || !q->ops->cl_ops ||
2145             (tcm->tcm_parent &&
2146              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2147                 (*t_p)++;
2148                 return 0;
2149         }
2150         if (*t_p > s_t)
2151                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2152         arg.w.fn = qdisc_class_dump;
2153         arg.skb = skb;
2154         arg.cb = cb;
2155         arg.w.stop  = 0;
2156         arg.w.skip = cb->args[1];
2157         arg.w.count = 0;
2158         q->ops->cl_ops->walk(q, &arg.w);
2159         cb->args[1] = arg.w.count;
2160         if (arg.w.stop)
2161                 return -1;
2162         (*t_p)++;
2163         return 0;
2164 }
2165
2166 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2167                                struct tcmsg *tcm, struct netlink_callback *cb,
2168                                int *t_p, int s_t)
2169 {
2170         struct Qdisc *q;
2171         int b;
2172
2173         if (!root)
2174                 return 0;
2175
2176         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2177                 return -1;
2178
2179         if (!qdisc_dev(root))
2180                 return 0;
2181
2182         if (tcm->tcm_parent) {
2183                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2184                 if (q && q != root &&
2185                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2186                         return -1;
2187                 return 0;
2188         }
2189         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2190                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2191                         return -1;
2192         }
2193
2194         return 0;
2195 }
2196
2197 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2198 {
2199         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2200         struct net *net = sock_net(skb->sk);
2201         struct netdev_queue *dev_queue;
2202         struct net_device *dev;
2203         int t, s_t;
2204
2205         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2206                 return 0;
2207         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2208         if (!dev)
2209                 return 0;
2210
2211         s_t = cb->args[0];
2212         t = 0;
2213
2214         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2215                 goto done;
2216
2217         dev_queue = dev_ingress_queue(dev);
2218         if (dev_queue &&
2219             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2220                                 &t, s_t) < 0)
2221                 goto done;
2222
2223 done:
2224         cb->args[0] = t;
2225
2226         dev_put(dev);
2227         return skb->len;
2228 }
2229
2230 #ifdef CONFIG_PROC_FS
2231 static int psched_show(struct seq_file *seq, void *v)
2232 {
2233         seq_printf(seq, "%08x %08x %08x %08x\n",
2234                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2235                    1000000,
2236                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2237
2238         return 0;
2239 }
2240
2241 static int __net_init psched_net_init(struct net *net)
2242 {
2243         struct proc_dir_entry *e;
2244
2245         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2246         if (e == NULL)
2247                 return -ENOMEM;
2248
2249         return 0;
2250 }
2251
2252 static void __net_exit psched_net_exit(struct net *net)
2253 {
2254         remove_proc_entry("psched", net->proc_net);
2255 }
2256 #else
2257 static int __net_init psched_net_init(struct net *net)
2258 {
2259         return 0;
2260 }
2261
2262 static void __net_exit psched_net_exit(struct net *net)
2263 {
2264 }
2265 #endif
2266
2267 static struct pernet_operations psched_net_ops = {
2268         .init = psched_net_init,
2269         .exit = psched_net_exit,
2270 };
2271
2272 static int __init pktsched_init(void)
2273 {
2274         int err;
2275
2276         err = register_pernet_subsys(&psched_net_ops);
2277         if (err) {
2278                 pr_err("pktsched_init: "
2279                        "cannot initialize per netns operations\n");
2280                 return err;
2281         }
2282
2283         register_qdisc(&pfifo_fast_ops);
2284         register_qdisc(&pfifo_qdisc_ops);
2285         register_qdisc(&bfifo_qdisc_ops);
2286         register_qdisc(&pfifo_head_drop_qdisc_ops);
2287         register_qdisc(&mq_qdisc_ops);
2288         register_qdisc(&noqueue_qdisc_ops);
2289
2290         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2291         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2292         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2293                       0);
2294         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2295         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2296         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2297                       0);
2298
2299         return 0;
2300 }
2301
2302 subsys_initcall(pktsched_init);