// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_api.c  Packet scheduler API.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

#include <trace/events/qdisc.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform the sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP        - this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN          - this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
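
/* For orientation only (an illustrative sketch, not part of the original
 * file): the ops structure a simple "queue"-style discipline would fill in
 * before handing it to register_qdisc() below. The names my_enqueue,
 * my_dequeue, my_init and struct my_sched_data are hypothetical;
 * qdisc_peek_dequeued() and qdisc_reset_queue() are generic helpers from
 * <net/sch_generic.h>.
 *
 *	static struct Qdisc_ops my_qdisc_ops __read_mostly = {
 *		.id		= "myqdisc",
 *		.priv_size	= sizeof(struct my_sched_data),
 *		.enqueue	= my_enqueue,
 *		.dequeue	= my_dequeue,
 *		.peek		= qdisc_peek_dequeued,
 *		.init		= my_init,
 *		.reset		= qdisc_reset_queue,
 *		.owner		= THIS_MODULE,
 *	};
 */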

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *      Queueing disciplines manipulation.      *
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
        struct Qdisc_ops *q, **qp;
        int rc = -EEXIST;

        write_lock(&qdisc_mod_lock);
        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
                if (!strcmp(qops->id, q->id))
                        goto out;

        if (qops->enqueue == NULL)
                qops->enqueue = noop_qdisc_ops.enqueue;
        if (qops->peek == NULL) {
                if (qops->dequeue == NULL)
                        qops->peek = noop_qdisc_ops.peek;
                else
                        goto out_einval;
        }
        if (qops->dequeue == NULL)
                qops->dequeue = noop_qdisc_ops.dequeue;

        if (qops->cl_ops) {
                const struct Qdisc_class_ops *cops = qops->cl_ops;

                if (!(cops->find && cops->walk && cops->leaf))
                        goto out_einval;

                if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
                        goto out_einval;
        }

        qops->next = NULL;
        *qp = qops;
        rc = 0;
out:
        write_unlock(&qdisc_mod_lock);
        return rc;

out_einval:
        rc = -EINVAL;
        goto out;
}
EXPORT_SYMBOL(register_qdisc);
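
/* Typical usage (an illustrative sketch, not part of the original file): a
 * qdisc module registers its ops on load and unregisters them on unload.
 * my_qdisc_ops is the hypothetical structure from the sketch above.
 *
 *	static int __init my_module_init(void)
 *	{
 *		return register_qdisc(&my_qdisc_ops);
 *	}
 *
 *	static void __exit my_module_exit(void)
 *	{
 *		unregister_qdisc(&my_qdisc_ops);
 *	}
 *
 *	module_init(my_module_init);
 *	module_exit(my_module_exit);
 */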

int unregister_qdisc(struct Qdisc_ops *qops)
{
        struct Qdisc_ops *q, **qp;
        int err = -ENOENT;

        write_lock(&qdisc_mod_lock);
        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
                if (q == qops)
                        break;
        if (q) {
                *qp = q->next;
                q->next = NULL;
                err = 0;
        }
        write_unlock(&qdisc_mod_lock);
        return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
        read_lock(&qdisc_mod_lock);
        strlcpy(name, default_qdisc_ops->id, len);
        read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
        struct Qdisc_ops *q = NULL;

        for (q = qdisc_base; q; q = q->next) {
                if (!strcmp(name, q->id)) {
                        if (!try_module_get(q->owner))
                                q = NULL;
                        break;
                }
        }

        return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
        const struct Qdisc_ops *ops;

        if (!capable(CAP_NET_ADMIN))
                return -EPERM;

        write_lock(&qdisc_mod_lock);
        ops = qdisc_lookup_default(name);
        if (!ops) {
                /* Not found, drop lock and try to load module */
                write_unlock(&qdisc_mod_lock);
                request_module("sch_%s", name);
                write_lock(&qdisc_mod_lock);

                ops = qdisc_lookup_default(name);
        }

        if (ops) {
                /* Set new default */
                module_put(default_qdisc_ops->owner);
                default_qdisc_ops = ops;
        }
        write_unlock(&qdisc_mod_lock);

        return ops ? 0 : -ENOENT;
}

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
        return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
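
/* Aside (illustrative, not part of the original file): at runtime the same
 * path can be reached through the net.core.default_qdisc sysctl, e.g.
 *
 *	# sysctl -w net.core.default_qdisc=fq_codel
 *
 * after which newly attached default root qdiscs use the named discipline.
 */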

/* We know the handle. Find the qdisc among all qdiscs attached to the device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
        struct Qdisc *q;

        if (!qdisc_dev(root))
                return (root->handle == handle ? root : NULL);

        if (!(root->flags & TCQ_F_BUILTIN) &&
            root->handle == handle)
                return root;

        hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
                                   lockdep_rtnl_is_held()) {
                if (q->handle == handle)
                        return q;
        }
        return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
                ASSERT_RTNL();
                hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
                if (invisible)
                        q->flags |= TCQ_F_INVISIBLE;
        }
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
                ASSERT_RTNL();
                hash_del_rcu(&q->hash);
        }
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
        struct Qdisc *q;

        if (!handle)
                return NULL;
        q = qdisc_match_from_root(dev->qdisc, handle);
        if (q)
                goto out;

        if (dev_ingress_queue(dev))
                q = qdisc_match_from_root(
                        dev_ingress_queue(dev)->qdisc_sleeping,
                        handle);
out:
        return q;
}

struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
        struct netdev_queue *nq;
        struct Qdisc *q;

        if (!handle)
                return NULL;
        q = qdisc_match_from_root(dev->qdisc, handle);
        if (q)
                goto out;

        nq = dev_ingress_queue_rcu(dev);
        if (nq)
                q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
out:
        return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
        unsigned long cl;
        const struct Qdisc_class_ops *cops = p->ops->cl_ops;

        if (cops == NULL)
                return NULL;
        cl = cops->find(p, classid);

        if (cl == 0)
                return NULL;
        return cops->leaf(p, cl);
}
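
/* Aside (illustrative, not part of the original file): a 32-bit handle packs
 * a 16-bit major number (the qdisc) and a 16-bit minor number (the class);
 * the helpers come from <linux/pkt_sched.h>:
 *
 *	u32 h = TC_H_MAKE(0x80010000U, 0x10);
 *	TC_H_MAJ(h);	// 0x80010000 - identifies qdisc "8001:"
 *	TC_H_MIN(h);	// 0x00000010 - identifies class "8001:10"
 *
 * qdisc_lookup() keys on the major part; qdisc_leaf() resolves the minor
 * part to the child qdisc attached to that class.
 */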

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
        struct Qdisc_ops *q = NULL;

        if (kind) {
                read_lock(&qdisc_mod_lock);
                for (q = qdisc_base; q; q = q->next) {
                        if (nla_strcmp(kind, q->id) == 0) {
                                if (!try_module_get(q->owner))
                                        q = NULL;
                                break;
                        }
                }
                read_unlock(&qdisc_mod_lock);
        }
        return q;
}

/* The linklayer setting was not transferred from iproute2, in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To stay backward compatible with older iproute2 tc
 * utils, we detect the linklayer setting by checking whether the rate
 * table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing the two.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
        int low       = roundup(r->mpu, 48);
        int high      = roundup(low+1, 48);
        int cell_low  = low >> r->cell_log;
        int cell_high = (high >> r->cell_log) - 1;

        /* rtab is too inaccurate at rates > 100Mbit/s */
        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
                pr_debug("TC linklayer: Giving up ATM detection\n");
                return TC_LINKLAYER_ETHERNET;
        }

        if ((cell_high > cell_low) && (cell_high < 256)
            && (rtab[cell_low] == rtab[cell_high])) {
                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
                         cell_low, cell_high, rtab[cell_high]);
                return TC_LINKLAYER_ATM;
        }
        return TC_LINKLAYER_ETHERNET;
}
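
/* Worked example (illustrative, not part of the original file): with mpu = 0
 * and cell_log = 3 (each rtab slot covers 8 bytes), low = roundup(0, 48) = 0
 * and high = roundup(1, 48) = 48, so cell_low = 0 and cell_high = 48/8 - 1 = 5.
 * On an ATM-aligned table every packet size from 1 to 48 bytes costs one
 * 53-byte cell, so rtab[0] == rtab[5] and TC_LINKLAYER_ATM is reported; on a
 * plain Ethernet table the per-size costs differ and the check fails.
 */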

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
                                        struct nlattr *tab,
                                        struct netlink_ext_ack *extack)
{
        struct qdisc_rate_table *rtab;

        if (tab == NULL || r->rate == 0 ||
            r->cell_log == 0 || r->cell_log >= 32 ||
            nla_len(tab) != TC_RTAB_SIZE) {
                NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
                return NULL;
        }

        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
                        rtab->refcnt++;
                        return rtab;
                }
        }

        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
        if (rtab) {
                rtab->rate = *r;
                rtab->refcnt = 1;
                memcpy(rtab->data, nla_data(tab), 1024);
                if (r->linklayer == TC_LINKLAYER_UNAWARE)
                        r->linklayer = __detect_linklayer(r, rtab->data);
                rtab->next = qdisc_rtab_list;
                qdisc_rtab_list = rtab;
        } else {
                NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
        }
        return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
        struct qdisc_rate_table *rtab, **rtabp;

        if (!tab || --tab->refcnt)
                return;

        for (rtabp = &qdisc_rtab_list;
             (rtab = *rtabp) != NULL;
             rtabp = &rtab->next) {
                if (rtab == tab) {
                        *rtabp = rtab->next;
                        kfree(rtab);
                        return;
                }
        }
}
EXPORT_SYMBOL(qdisc_put_rtab);
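
/* Typical caller pattern (a sketch, assuming a hypothetical qdisc whose
 * netlink options carry a struct tc_ratespec plus rate table in a
 * TCA_MYQDISC_RTAB attribute):
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_MYQDISC_RTAB], extack);
 *	if (!rtab)
 *		return -EINVAL;
 *	...
 *	qdisc_put_rtab(rtab);	// on destroy, drops the shared refcount
 *
 * Tables with identical rate specs and data are shared via qdisc_rtab_list
 * rather than duplicated per qdisc.
 */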

static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
                                               struct netlink_ext_ack *extack)
{
        struct nlattr *tb[TCA_STAB_MAX + 1];
        struct qdisc_size_table *stab;
        struct tc_sizespec *s;
        unsigned int tsize = 0;
        u16 *tab = NULL;
        int err;

        err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
                                          extack);
        if (err < 0)
                return ERR_PTR(err);
        if (!tb[TCA_STAB_BASE]) {
                NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
                return ERR_PTR(-EINVAL);
        }

        s = nla_data(tb[TCA_STAB_BASE]);

        if (s->tsize > 0) {
                if (!tb[TCA_STAB_DATA]) {
                        NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
                        return ERR_PTR(-EINVAL);
                }
                tab = nla_data(tb[TCA_STAB_DATA]);
                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
        }

        if (tsize != s->tsize || (!tab && tsize > 0)) {
                NL_SET_ERR_MSG(extack, "Invalid size of size table");
                return ERR_PTR(-EINVAL);
        }

        list_for_each_entry(stab, &qdisc_stab_list, list) {
                if (memcmp(&stab->szopts, s, sizeof(*s)))
                        continue;
                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
                        continue;
                stab->refcnt++;
                return stab;
        }

        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
        if (!stab)
                return ERR_PTR(-ENOMEM);

        stab->refcnt = 1;
        stab->szopts = *s;
        if (tsize > 0)
                memcpy(stab->data, tab, tsize * sizeof(u16));

        list_add_tail(&stab->list, &qdisc_stab_list);

        return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
        if (!tab)
                return;

        if (--tab->refcnt == 0) {
                list_del(&tab->list);
                kfree_rcu(tab, rcu);
        }
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
        struct nlattr *nest;

        nest = nla_nest_start_noflag(skb, TCA_STAB);
        if (nest == NULL)
                goto nla_put_failure;
        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
                goto nla_put_failure;
        nla_nest_end(skb, nest);

        return skb->len;

nla_put_failure:
        return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
                               const struct qdisc_size_table *stab)
{
        int pkt_len, slot;

        pkt_len = skb->len + stab->szopts.overhead;
        if (unlikely(!stab->szopts.tsize))
                goto out;

        slot = pkt_len + stab->szopts.cell_align;
        if (unlikely(slot < 0))
                slot = 0;

        slot >>= stab->szopts.cell_log;
        if (likely(slot < stab->szopts.tsize))
                pkt_len = stab->data[slot];
        else
                pkt_len = stab->data[stab->szopts.tsize - 1] *
                                (slot / stab->szopts.tsize) +
                                stab->data[slot % stab->szopts.tsize];

        pkt_len <<= stab->szopts.size_log;
out:
        if (unlikely(pkt_len < 1))
                pkt_len = 1;
        qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
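
/* Worked example (illustrative, not part of the original file): with
 * overhead = 24, cell_align = 0, cell_log = 6, size_log = 0 and tsize = 512,
 * a 1000-byte skb gives pkt_len = 1024, slot = 1024 >> 6 = 16, so the
 * accounted length becomes stab->data[16]. Slots past the end of the table
 * are extrapolated from the last entry, as the code above shows.
 */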

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
                        txt, qdisc->ops->id, qdisc->handle >> 16);
                qdisc->flags |= TCQ_F_WARN_NONWC;
        }
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
                                                 timer);

        rcu_read_lock();
        __netif_schedule(qdisc_root(wd->qdisc));
        rcu_read_unlock();

        return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
                                 clockid_t clockid)
{
        hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
        wd->timer.function = qdisc_watchdog;
        wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
        qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
                                      u64 delta_ns)
{
        if (test_bit(__QDISC_STATE_DEACTIVATED,
                     &qdisc_root_sleeping(wd->qdisc)->state))
                return;

        if (hrtimer_is_queued(&wd->timer)) {
                /* If timer is already set in [expires, expires + delta_ns],
                 * do not reprogram it.
                 */
                if (wd->last_expires - expires <= delta_ns)
                        return;
        }

        wd->last_expires = expires;
        hrtimer_start_range_ns(&wd->timer,
                               ns_to_ktime(expires),
                               delta_ns,
                               HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
        hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
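
/* Typical usage (a sketch, assuming a hypothetical shaping qdisc with a
 * struct qdisc_watchdog embedded in its private data): the watchdog rearms
 * the device when the next packet becomes eligible to send.
 *
 *	// in ->init():
 *	qdisc_watchdog_init(&q->watchdog, sch);
 *
 *	// in ->dequeue(), when the head packet is not yet due:
 *	qdisc_watchdog_schedule_ns(&q->watchdog, next_send_time_ns);
 *	return NULL;
 *
 *	// in ->reset() / ->destroy():
 *	qdisc_watchdog_cancel(&q->watchdog);
 *
 * qdisc_watchdog_schedule_ns() is the <net/pkt_sched.h> wrapper around
 * qdisc_watchdog_schedule_range_ns() with delta_ns == 0.
 */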

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
        struct hlist_head *h;
        unsigned int i;

        h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

        if (h != NULL) {
                for (i = 0; i < n; i++)
                        INIT_HLIST_HEAD(&h[i]);
        }
        return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
        struct Qdisc_class_common *cl;
        struct hlist_node *next;
        struct hlist_head *nhash, *ohash;
        unsigned int nsize, nmask, osize;
        unsigned int i, h;

        /* Rehash when load factor exceeds 0.75 */
        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
                return;
        nsize = clhash->hashsize * 2;
        nmask = nsize - 1;
        nhash = qdisc_class_hash_alloc(nsize);
        if (nhash == NULL)
                return;

        ohash = clhash->hash;
        osize = clhash->hashsize;

        sch_tree_lock(sch);
        for (i = 0; i < osize; i++) {
                hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
                        h = qdisc_class_hash(cl->classid, nmask);
                        hlist_add_head(&cl->hnode, &nhash[h]);
                }
        }
        clhash->hash     = nhash;
        clhash->hashsize = nsize;
        clhash->hashmask = nmask;
        sch_tree_unlock(sch);

        kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
        unsigned int size = 4;

        clhash->hash = qdisc_class_hash_alloc(size);
        if (!clhash->hash)
                return -ENOMEM;
        clhash->hashsize  = size;
        clhash->hashmask  = size - 1;
        clhash->hashelems = 0;
        return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
        kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
                             struct Qdisc_class_common *cl)
{
        unsigned int h;

        INIT_HLIST_NODE(&cl->hnode);
        h = qdisc_class_hash(cl->classid, clhash->hashmask);
        hlist_add_head(&cl->hnode, &clhash->hash[h]);
        clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
                             struct Qdisc_class_common *cl)
{
        hlist_del(&cl->hnode);
        clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
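
/* Typical caller pattern (a sketch, assuming a hypothetical classful qdisc
 * whose per-class struct my_class embeds struct Qdisc_class_common):
 *
 *	// in ->init():
 *	err = qdisc_class_hash_init(&q->clhash);
 *
 *	// when a class is created:
 *	cl->common.classid = classid;
 *	sch_tree_lock(sch);
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	sch_tree_unlock(sch);
 *	qdisc_class_hash_grow(sch, &q->clhash);	// rehashes past 0.75 load
 *
 *	// in ->destroy():
 *	qdisc_class_hash_destroy(&q->clhash);
 */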

/* Allocate a unique handle from the space managed by the kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
        int i = 0x8000;
        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

        do {
                autohandle += TC_H_MAKE(0x10000U, 0);
                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
                        autohandle = TC_H_MAKE(0x80000000U, 0);
                if (!qdisc_lookup(dev, autohandle))
                        return autohandle;
                cond_resched();
        } while (--i > 0);

        return 0;
}

void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
        bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
        const struct Qdisc_class_ops *cops;
        unsigned long cl;
        u32 parentid;
        bool notify;
        int drops;

        if (n == 0 && len == 0)
                return;
        drops = max_t(int, n, 0);
        rcu_read_lock();
        while ((parentid = sch->parent)) {
                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
                        break;

                if (sch->flags & TCQ_F_NOPARENT)
                        break;
                /* Notify parent qdisc only if child qdisc becomes empty.
                 *
                 * If child was empty even before update then backlog
                 * counter is screwed and we skip notification because
                 * parent class is already passive.
                 *
                 * If the original child was offloaded then it is allowed
                 * to be seen as empty, so the parent is notified anyway.
                 */
                notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
                                                       !qdisc_is_offloaded);
                /* TODO: perform the search on a per txq basis */
                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
                if (sch == NULL) {
                        WARN_ON_ONCE(parentid != TC_H_ROOT);
                        break;
                }
                cops = sch->ops->cl_ops;
                if (notify && cops->qlen_notify) {
                        cl = cops->find(sch, parentid);
                        cops->qlen_notify(sch, cl);
                }
                sch->q.qlen -= n;
                sch->qstats.backlog -= len;
                __qdisc_qstats_drop(sch, drops);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
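
/* Typical caller pattern (a sketch): a qdisc that drops queued packets
 * outside its own enqueue path, e.g. when ->change() shrinks its limit, must
 * propagate the loss up the tree so ancestor qlen/backlog counters stay
 * consistent:
 *
 *	unsigned int dropped_pkts = 0, dropped_bytes = 0;
 *
 *	while (sch->q.qlen > q->limit) {
 *		struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
 *
 *		dropped_bytes += qdisc_pkt_len(skb);
 *		dropped_pkts++;
 *		qdisc_qstats_backlog_dec(sch, skb);	// fix our own counters
 *		rtnl_kfree_skbs(skb, skb);
 *	}
 *	qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
 */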

int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
                              void *type_data)
{
        struct net_device *dev = qdisc_dev(sch);
        int err;

        sch->flags &= ~TCQ_F_OFFLOADED;
        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
                return 0;

        err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
        if (err == -EOPNOTSUPP)
                return 0;

        if (!err)
                sch->flags |= TCQ_F_OFFLOADED;

        return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);

void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                                struct Qdisc *new, struct Qdisc *old,
                                enum tc_setup_type type, void *type_data,
                                struct netlink_ext_ack *extack)
{
        bool any_qdisc_is_offloaded;
        int err;

        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
                return;

        err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

        /* Don't report error if the graft is part of destroy operation. */
        if (!err || !new || new == &noop_qdisc)
                return;

        /* Don't report error if the parent, the old child and the new
         * one are not offloaded.
         */
        any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
        any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
        any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

        if (any_qdisc_is_offloaded)
                NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);

static void qdisc_offload_graft_root(struct net_device *dev,
                                     struct Qdisc *new, struct Qdisc *old,
                                     struct netlink_ext_ack *extack)
{
        struct tc_root_qopt_offload graft_offload = {
                .command        = TC_ROOT_GRAFT,
                .handle         = new ? new->handle : 0,
                .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
                                  (old && old->flags & TCQ_F_INGRESS),
        };

        qdisc_offload_graft_helper(dev, NULL, new, old,
                                   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 portid, u32 seq, u16 flags, int event)
{
        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
        struct tcmsg *tcm;
        struct nlmsghdr *nlh;
        unsigned char *b = skb_tail_pointer(skb);
        struct gnet_dump d;
        struct qdisc_size_table *stab;
        u32 block_index;
        __u32 qlen;

        cond_resched();
        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
        if (!nlh)
                goto out_nlmsg_trim;
        tcm = nlmsg_data(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = refcount_read(&q->refcnt);
        if (nla_put_string(skb, TCA_KIND, q->ops->id))
                goto nla_put_failure;
        if (q->ops->ingress_block_get) {
                block_index = q->ops->ingress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        if (q->ops->egress_block_get) {
                block_index = q->ops->egress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto nla_put_failure;
        if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
                goto nla_put_failure;
        qlen = qdisc_qlen_sum(q);

        stab = rtnl_dereference(q->stab);
        if (stab && qdisc_dump_stab(skb, stab) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         NULL, &d, TCA_PAD) < 0)
                goto nla_put_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;

        if (qdisc_is_percpu_stats(q)) {
                cpu_bstats = q->cpu_bstats;
                cpu_qstats = q->cpu_qstats;
        }

        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
                                  &d, cpu_bstats, &q->bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

out_nlmsg_trim:
nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
        if (q->flags & TCQ_F_BUILTIN)
                return true;
        if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
                return true;

        return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
                        struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new)
{
        struct sk_buff *skb;
        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        if (old && !tc_qdisc_dump_ignore(old, false)) {
                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
                                  0, RTM_DELQDISC) < 0)
                        goto err_out;
        }
        if (new && !tc_qdisc_dump_ignore(new, false)) {
                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
                        goto err_out;
        }

        if (skb->len)
                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
                                      n->nlmsg_flags & NLM_F_ECHO);

err_out:
        kfree_skb(skb);
        return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
                               struct nlmsghdr *n, u32 clid,
                               struct Qdisc *old, struct Qdisc *new)
{
        if (new || old)
                qdisc_notify(net, skb, n, clid, old, new);

        if (old)
                qdisc_put(old);
}

static void qdisc_clear_nolock(struct Qdisc *sch)
{
        sch->flags &= ~TCQ_F_NOLOCK;
        if (!(sch->flags & TCQ_F_CPUSTATS))
                return;

        free_percpu(sch->cpu_bstats);
        free_percpu(sch->cpu_qstats);
        sch->cpu_bstats = NULL;
        sch->cpu_qstats = NULL;
        sch->flags &= ~TCQ_F_CPUSTATS;
}

/* Graft qdisc 'new' to class 'classid' of qdisc 'parent' or
 * to device 'dev'.
 *
 * When appropriate send a netlink notification using 'skb'
 * and 'n'.
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
                       struct Qdisc *new, struct Qdisc *old,
                       struct netlink_ext_ack *extack)
{
        struct Qdisc *q = old;
        struct net *net = dev_net(dev);

        if (parent == NULL) {
                unsigned int i, num_q, ingress;

                ingress = 0;
                num_q = dev->num_tx_queues;
                if ((q && q->flags & TCQ_F_INGRESS) ||
                    (new && new->flags & TCQ_F_INGRESS)) {
                        num_q = 1;
                        ingress = 1;
                        if (!dev_ingress_queue(dev)) {
                                NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
                                return -ENOENT;
                        }
                }

                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);

                qdisc_offload_graft_root(dev, new, old, extack);

                if (new && new->ops->attach)
                        goto skip;

                for (i = 0; i < num_q; i++) {
                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);

                        if (!ingress)
                                dev_queue = netdev_get_tx_queue(dev, i);

                        old = dev_graft_qdisc(dev_queue, new);
                        if (new && i > 0)
                                qdisc_refcount_inc(new);

                        if (!ingress)
                                qdisc_put(old);
                }

skip:
                if (!ingress) {
                        notify_and_destroy(net, skb, n, classid,
                                           dev->qdisc, new);
                        if (new && !new->ops->attach)
                                qdisc_refcount_inc(new);
                        dev->qdisc = new ? : &noop_qdisc;

                        if (new && new->ops->attach)
                                new->ops->attach(new);
                } else {
                        notify_and_destroy(net, skb, n, classid, old, new);
                }

                if (dev->flags & IFF_UP)
                        dev_activate(dev);
        } else {
                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
                unsigned long cl;
                int err;

                /* Only support running class lockless if parent is lockless */
                if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
                        qdisc_clear_nolock(new);

                if (!cops || !cops->graft)
                        return -EOPNOTSUPP;

                cl = cops->find(parent, classid);
                if (!cl) {
                        NL_SET_ERR_MSG(extack, "Specified class not found");
                        return -ENOENT;
                }

                err = cops->graft(parent, cl, new, &old, extack);
                if (err)
                        return err;
                notify_and_destroy(net, skb, n, classid, old, new);
        }
        return 0;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
                                   struct netlink_ext_ack *extack)
{
        u32 block_index;

        if (tca[TCA_INGRESS_BLOCK]) {
                block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

                if (!block_index) {
                        NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
                        return -EINVAL;
                }
                if (!sch->ops->ingress_block_set) {
                        NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
                        return -EOPNOTSUPP;
                }
                sch->ops->ingress_block_set(sch, block_index);
        }
        if (tca[TCA_EGRESS_BLOCK]) {
                block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

                if (!block_index) {
                        NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
                        return -EINVAL;
                }
                if (!sch->ops->egress_block_set) {
                        NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
                        return -EOPNOTSUPP;
                }
                sch->ops->egress_block_set(sch, block_index);
        }
        return 0;
}

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
                                  struct netdev_queue *dev_queue,
                                  struct Qdisc *p, u32 parent, u32 handle,
                                  struct nlattr **tca, int *errp,
                                  struct netlink_ext_ack *extack)
{
        int err;
        struct nlattr *kind = tca[TCA_KIND];
        struct Qdisc *sch;
        struct Qdisc_ops *ops;
        struct qdisc_size_table *stab;

        ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
        if (ops == NULL && kind != NULL) {
                char name[IFNAMSIZ];
                if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
                        /* We dropped the RTNL semaphore in order to
                         * perform the module load.  So, even if we
                         * succeeded in loading the module we have to
                         * tell the caller to replay the request.  We
                         * indicate this using -EAGAIN.
                         * We replay the request because the device may
                         * go away in the mean time.
                         */
                        rtnl_unlock();
                        request_module("sch_%s", name);
                        rtnl_lock();
                        ops = qdisc_lookup_ops(kind);
                        if (ops != NULL) {
                                /* We will try qdisc_lookup_ops again,
                                 * so don't keep a reference.
                                 */
                                module_put(ops->owner);
                                err = -EAGAIN;
                                goto err_out;
                        }
                }
        }
#endif

        err = -ENOENT;
        if (!ops) {
                NL_SET_ERR_MSG(extack, "Specified qdisc not found");
                goto err_out;
        }

        sch = qdisc_alloc(dev_queue, ops, extack);
        if (IS_ERR(sch)) {
                err = PTR_ERR(sch);
                goto err_out2;
        }

        sch->parent = parent;

        if (handle == TC_H_INGRESS) {
                sch->flags |= TCQ_F_INGRESS;
                handle = TC_H_MAKE(TC_H_INGRESS, 0);
        } else {
                if (handle == 0) {
                        handle = qdisc_alloc_handle(dev);
                        if (handle == 0) {
                                NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
                                err = -ENOSPC;
                                goto err_out3;
                        }
                }
                if (!netif_is_multiqueue(dev))
                        sch->flags |= TCQ_F_ONETXQUEUE;
        }

        sch->handle = handle;

        /* This exists to keep backward compatibility with a userspace
         * loophole that allowed userspace to get IFF_NO_QUEUE
         * behaviour on older kernels by setting tx_queue_len=0 (prior
         * to qdisc init), and then forgetting to reinit tx_queue_len
         * before again attaching a qdisc.
         */
        if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
                netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
        }

        err = qdisc_block_indexes_set(sch, tca, extack);
        if (err)
                goto err_out3;

        if (ops->init) {
                err = ops->init(sch, tca[TCA_OPTIONS], extack);
                if (err != 0)
                        goto err_out5;
        }

        if (tca[TCA_STAB]) {
                stab = qdisc_get_stab(tca[TCA_STAB], extack);
                if (IS_ERR(stab)) {
                        err = PTR_ERR(stab);
                        goto err_out4;
                }
                rcu_assign_pointer(sch->stab, stab);
        }
        if (tca[TCA_RATE]) {
                seqcount_t *running;

                err = -EOPNOTSUPP;
                if (sch->flags & TCQ_F_MQROOT) {
                        NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
                        goto err_out4;
                }

                if (sch->parent != TC_H_ROOT &&
                    !(sch->flags & TCQ_F_INGRESS) &&
                    (!p || !(p->flags & TCQ_F_MQROOT)))
                        running = qdisc_root_sleeping_running(sch);
                else
                        running = &sch->running;

                err = gen_new_estimator(&sch->bstats,
                                        sch->cpu_bstats,
                                        &sch->rate_est,
                                        NULL,
                                        running,
                                        tca[TCA_RATE]);
                if (err) {
                        NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
                        goto err_out4;
                }
        }

        qdisc_hash_add(sch, false);
        trace_qdisc_create(ops, dev, parent);

        return sch;

err_out5:
        /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
        if (ops->destroy)
                ops->destroy(sch);
err_out3:
        dev_put(dev);
        qdisc_free(sch);
err_out2:
        module_put(ops->owner);
err_out:
        *errp = err;
        return NULL;

err_out4:
        /*
         * Any broken qdiscs that would require a ops->reset() here?
         * The qdisc was never in action so it shouldn't be necessary.
         */
        qdisc_put_stab(rtnl_dereference(sch->stab));
        if (ops->destroy)
                ops->destroy(sch);
        goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
                        struct netlink_ext_ack *extack)
{
        struct qdisc_size_table *ostab, *stab = NULL;
        int err = 0;

        if (tca[TCA_OPTIONS]) {
                if (!sch->ops->change) {
                        NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
                        return -EINVAL;
                }
                if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
                        NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
                        return -EOPNOTSUPP;
                }
                err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
                if (err)
                        return err;
        }

        if (tca[TCA_STAB]) {
                stab = qdisc_get_stab(tca[TCA_STAB], extack);
                if (IS_ERR(stab))
                        return PTR_ERR(stab);
        }

        ostab = rtnl_dereference(sch->stab);
        rcu_assign_pointer(sch->stab, stab);
        qdisc_put_stab(ostab);

        if (tca[TCA_RATE]) {
                /* NB: ignores errors from replace_estimator
                 * because change can't be undone.
                 */
                if (sch->flags & TCQ_F_MQROOT)
                        goto out;
                gen_replace_estimator(&sch->bstats,
                                      sch->cpu_bstats,
                                      &sch->rate_est,
                                      NULL,
                                      qdisc_root_sleeping_running(sch),
                                      tca[TCA_RATE]);
        }
out:
        return 0;
}

struct check_loop_arg {
        struct qdisc_walker     w;
        struct Qdisc            *p;
        int                     depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
                         struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
        struct check_loop_arg   arg;

        if (q->ops->cl_ops == NULL)
                return 0;

        arg.w.stop = arg.w.skip = arg.w.count = 0;
        arg.w.fn = check_loop_fn;
        arg.depth = depth;
        arg.p = p;
        q->ops->cl_ops->walk(q, &arg.w);
        return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
        struct Qdisc *leaf;
        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
        struct check_loop_arg *arg = (struct check_loop_arg *)w;

        leaf = cops->leaf(q, cl);
        if (leaf) {
                if (leaf == arg->p || arg->depth > 7)
                        return -ELOOP;
                return check_loop(leaf, arg->p, arg->depth + 1);
        }
        return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
        [TCA_KIND]              = { .type = NLA_STRING },
        [TCA_RATE]              = { .type = NLA_BINARY,
                                    .len = sizeof(struct tc_estimator) },
        [TCA_STAB]              = { .type = NLA_NESTED },
        [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
        [TCA_CHAIN]             = { .type = NLA_U32 },
        [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
        [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
};
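
/* Aside (illustrative, not part of the original file): a typical request
 * arriving here, e.g.
 *
 *	tc qdisc add dev eth0 root handle 1: htb
 *
 * carries tcm_ifindex = eth0's ifindex, tcm_parent = TC_H_ROOT,
 * tcm_handle = 0x00010000 and TCA_KIND = "htb"; the attributes are
 * validated against rtm_tca_policy above before being acted upon.
 */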

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm = nlmsg_data(n);
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q = NULL;
        struct Qdisc *p = NULL;
        int err;

        if ((n->nlmsg_type != RTM_GETQDISC) &&
            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
                                     rtm_tca_policy, extack);
        if (err < 0)
                return err;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;

        clid = tcm->tcm_parent;
        if (clid) {
                if (clid != TC_H_ROOT) {
                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
                                if (!p) {
                                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
                                        return -ENOENT;
                                }
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue(dev)) {
                                q = dev_ingress_queue(dev)->qdisc_sleeping;
                        }
                } else {
                        q = dev->qdisc;
                }
                if (!q) {
                        NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
                        return -ENOENT;
                }

                if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
                        NL_SET_ERR_MSG(extack, "Invalid handle");
                        return -EINVAL;
                }
        } else {
                q = qdisc_lookup(dev, tcm->tcm_handle);
                if (!q) {
                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
                        return -ENOENT;
                }
        }

        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
                NL_SET_ERR_MSG(extack, "Invalid qdisc name");
                return -EINVAL;
        }

        if (n->nlmsg_type == RTM_DELQDISC) {
                if (!clid) {
                        NL_SET_ERR_MSG(extack, "Classid cannot be zero");
                        return -EINVAL;
                }
                if (q->handle == 0) {
                        NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
                        return -ENOENT;
                }
                err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
                if (err != 0)
                        return err;
        } else {
                qdisc_notify(net, skb, n, clid, NULL, q);
        }
        return 0;
}
1498
1499 /*
1500  * Create/change qdisc.
1501  */
1502
1503 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1504                            struct netlink_ext_ack *extack)
1505 {
1506         struct net *net = sock_net(skb->sk);
1507         struct tcmsg *tcm;
1508         struct nlattr *tca[TCA_MAX + 1];
1509         struct net_device *dev;
1510         u32 clid;
1511         struct Qdisc *q, *p;
1512         int err;
1513
1514         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1515                 return -EPERM;
1516
1517 replay:
1518         /* Reinit, just in case something touches this. */
1519         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1520                                      rtm_tca_policy, extack);
1521         if (err < 0)
1522                 return err;
1523
1524         tcm = nlmsg_data(n);
1525         clid = tcm->tcm_parent;
1526         q = p = NULL;
1527
1528         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1529         if (!dev)
1530                 return -ENODEV;
1531
1532
1533         if (clid) {
1534                 if (clid != TC_H_ROOT) {
1535                         if (clid != TC_H_INGRESS) {
1536                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1537                                 if (!p) {
1538                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1539                                         return -ENOENT;
1540                                 }
1541                                 q = qdisc_leaf(p, clid);
1542                         } else if (dev_ingress_queue_create(dev)) {
1543                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1544                         }
1545                 } else {
1546                         q = dev->qdisc;
1547                 }
1548
1549                 /* It may be the default qdisc; ignore it. */
1550                 if (q && q->handle == 0)
1551                         q = NULL;
1552
1553                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1554                         if (tcm->tcm_handle) {
1555                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1556                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1557                                         return -EEXIST;
1558                                 }
1559                                 if (TC_H_MIN(tcm->tcm_handle)) {
1560                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1561                                         return -EINVAL;
1562                                 }
1563                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1564                                 if (!q)
1565                                         goto create_n_graft;
1566                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1567                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1568                                         return -EEXIST;
1569                                 }
1570                                 if (tca[TCA_KIND] &&
1571                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1572                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1573                                         return -EINVAL;
1574                                 }
1575                                 if (q == p ||
1576                                     (p && check_loop(q, p, 0))) {
1577                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1578                                         return -ELOOP;
1579                                 }
1580                                 qdisc_refcount_inc(q);
1581                                 goto graft;
1582                         } else {
1583                                 if (!q)
1584                                         goto create_n_graft;
1585
1586                                 /* This magic test requires explanation.
1587                                  *
1588                                  *   We know that some child q is already
1589                                  *   attached to this parent and we have a choice:
1590                                  *   either change it or create/graft a new one.
1591                                  *
1592                                  *   1. We are allowed to create/graft only
1593                                  *   if both CREATE and REPLACE flags are set.
1594                                  *
1595                                  *   2. If EXCL is set, the requester asserted
1596                                  *   that a qdisc with tcm_handle is not expected
1597                                  *   to exist, so we choose create/graft too.
1598                                  *
1599                                  *   3. The last case is when no flags are set.
1600                                  *   Alas, this is a hole in the API: we cannot
1601                                  *   decide what to do unambiguously.
1602                                  *   For now we select create/graft if the user
1603                                  *   gave a KIND that does not match the existing one.
1604                                  */
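                                /* For example, with the flags iproute2's tc
                                 * is assumed to set: "tc qdisc replace"
                                 * sends NLM_F_CREATE|NLM_F_REPLACE, so a
                                 * mismatching TCA_KIND goes to
                                 * create_n_graft, while "tc qdisc change"
                                 * sends no flags and always modifies the
                                 * existing qdisc in place.
                                 */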
1605                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1606                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1607                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1608                                      (tca[TCA_KIND] &&
1609                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1610                                         goto create_n_graft;
1611                         }
1612                 }
1613         } else {
1614                 if (!tcm->tcm_handle) {
1615                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1616                         return -EINVAL;
1617                 }
1618                 q = qdisc_lookup(dev, tcm->tcm_handle);
1619         }
1620
1621         /* Change qdisc parameters */
1622         if (!q) {
1623                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1624                 return -ENOENT;
1625         }
1626         if (n->nlmsg_flags & NLM_F_EXCL) {
1627                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1628                 return -EEXIST;
1629         }
1630         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1631                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1632                 return -EINVAL;
1633         }
1634         err = qdisc_change(q, tca, extack);
1635         if (err == 0)
1636                 qdisc_notify(net, skb, n, clid, NULL, q);
1637         return err;
1638
1639 create_n_graft:
1640         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1641                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1642                 return -ENOENT;
1643         }
1644         if (clid == TC_H_INGRESS) {
1645                 if (dev_ingress_queue(dev)) {
1646                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1647                                          tcm->tcm_parent, tcm->tcm_parent,
1648                                          tca, &err, extack);
1649                 } else {
1650                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1651                         err = -ENOENT;
1652                 }
1653         } else {
1654                 struct netdev_queue *dev_queue;
1655
1656                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1657                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1658                 else if (p)
1659                         dev_queue = p->dev_queue;
1660                 else
1661                         dev_queue = netdev_get_tx_queue(dev, 0);
1662
1663                 q = qdisc_create(dev, dev_queue, p,
1664                                  tcm->tcm_parent, tcm->tcm_handle,
1665                                  tca, &err, extack);
1666         }
1667         if (q == NULL) {
1668                 if (err == -EAGAIN)
1669                         goto replay;
1670                 return err;
1671         }
1672
1673 graft:
1674         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1675         if (err) {
1676                 if (q)
1677                         qdisc_put(q);
1678                 return err;
1679         }
1680
1681         return 0;
1682 }
1683
1684 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1685                               struct netlink_callback *cb,
1686                               int *q_idx_p, int s_q_idx, bool recur,
1687                               bool dump_invisible)
1688 {
1689         int ret = 0, q_idx = *q_idx_p;
1690         struct Qdisc *q;
1691         int b;
1692
1693         if (!root)
1694                 return 0;
1695
1696         q = root;
1697         if (q_idx < s_q_idx) {
1698                 q_idx++;
1699         } else {
1700                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1701                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1702                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1703                                   RTM_NEWQDISC) <= 0)
1704                         goto done;
1705                 q_idx++;
1706         }
1707
1708         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1709          * itself has already been dumped.
1710          *
1711          * If we've already dumped the top-level (ingress) qdisc above, we do not
1712          * want to walk the device's qdisc hashtable and hit it again.
1713          */
1714         if (!qdisc_dev(root) || !recur)
1715                 goto out;
1716
1717         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1718                 if (q_idx < s_q_idx) {
1719                         q_idx++;
1720                         continue;
1721                 }
1722                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1723                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1724                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1725                                   RTM_NEWQDISC) <= 0)
1726                         goto done;
1727                 q_idx++;
1728         }
1729
1730 out:
1731         *q_idx_p = q_idx;
1732         return ret;
1733 done:
1734         ret = -1;
1735         goto out;
1736 }
1737
1738 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1739 {
1740         struct net *net = sock_net(skb->sk);
1741         int idx, q_idx;
1742         int s_idx, s_q_idx;
1743         struct net_device *dev;
1744         const struct nlmsghdr *nlh = cb->nlh;
1745         struct nlattr *tca[TCA_MAX + 1];
1746         int err;
1747
1748         s_idx = cb->args[0];
1749         s_q_idx = q_idx = cb->args[1];
1750
1751         idx = 0;
1752         ASSERT_RTNL();
1753
1754         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1755                                      rtm_tca_policy, cb->extack);
1756         if (err < 0)
1757                 return err;
1758
1759         for_each_netdev(net, dev) {
1760                 struct netdev_queue *dev_queue;
1761
1762                 if (idx < s_idx)
1763                         goto cont;
1764                 if (idx > s_idx)
1765                         s_q_idx = 0;
1766                 q_idx = 0;
1767
1768                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1769                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1770                         goto done;
1771
1772                 dev_queue = dev_ingress_queue(dev);
1773                 if (dev_queue &&
1774                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1775                                        &q_idx, s_q_idx, false,
1776                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1777                         goto done;
1778
1779 cont:
1780                 idx++;
1781         }
1782
1783 done:
1784         cb->args[0] = idx;
1785         cb->args[1] = q_idx;
1786
1787         return skb->len;
1788 }
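/* Resume protocol for interrupted dumps (a sketch of the code above):
 * cb->args[0] holds the number of fully dumped devices and cb->args[1] the
 * qdisc index within the current device, so the next callback invocation
 * skips straight to where the previous skb filled up.
 */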
1789
1790
1791
1792 /************************************************
1793  *      Traffic classes manipulation.           *
1794  ************************************************/
1795
1796 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1797                           unsigned long cl,
1798                           u32 portid, u32 seq, u16 flags, int event)
1799 {
1800         struct tcmsg *tcm;
1801         struct nlmsghdr  *nlh;
1802         unsigned char *b = skb_tail_pointer(skb);
1803         struct gnet_dump d;
1804         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1805
1806         cond_resched();
1807         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1808         if (!nlh)
1809                 goto out_nlmsg_trim;
1810         tcm = nlmsg_data(nlh);
1811         tcm->tcm_family = AF_UNSPEC;
1812         tcm->tcm__pad1 = 0;
1813         tcm->tcm__pad2 = 0;
1814         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1815         tcm->tcm_parent = q->handle;
1816         tcm->tcm_handle = q->handle;
1817         tcm->tcm_info = 0;
1818         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1819                 goto nla_put_failure;
1820         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1821                 goto nla_put_failure;
1822
1823         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1824                                          NULL, &d, TCA_PAD) < 0)
1825                 goto nla_put_failure;
1826
1827         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1828                 goto nla_put_failure;
1829
1830         if (gnet_stats_finish_copy(&d) < 0)
1831                 goto nla_put_failure;
1832
1833         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1834         return skb->len;
1835
1836 out_nlmsg_trim:
1837 nla_put_failure:
1838         nlmsg_trim(skb, b);
1839         return -1;
1840 }
1841
1842 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1843                          struct nlmsghdr *n, struct Qdisc *q,
1844                          unsigned long cl, int event)
1845 {
1846         struct sk_buff *skb;
1847         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1848         int err = 0;
1849
1850         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1851         if (!skb)
1852                 return -ENOBUFS;
1853
1854         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1855                 kfree_skb(skb);
1856                 return -EINVAL;
1857         }
1858
1859         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1860                              n->nlmsg_flags & NLM_F_ECHO);
1861         if (err > 0)
1862                 err = 0;
1863         return err;
1864 }
1865
1866 static int tclass_del_notify(struct net *net,
1867                              const struct Qdisc_class_ops *cops,
1868                              struct sk_buff *oskb, struct nlmsghdr *n,
1869                              struct Qdisc *q, unsigned long cl,
1870                              struct netlink_ext_ack *extack)
1871 {
1872         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1873         struct sk_buff *skb;
1874         int err = 0;
1875
1876         if (!cops->delete)
1877                 return -EOPNOTSUPP;
1878
1879         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1880         if (!skb)
1881                 return -ENOBUFS;
1882
1883         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1884                            RTM_DELTCLASS) < 0) {
1885                 kfree_skb(skb);
1886                 return -EINVAL;
1887         }
1888
1889         err = cops->delete(q, cl, extack);
1890         if (err) {
1891                 kfree_skb(skb);
1892                 return err;
1893         }
1894
1895         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1896                              n->nlmsg_flags & NLM_F_ECHO);
1897         if (err > 0)
1898                 err = 0;
1899         return err;
1900 }
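/* Note the ordering in tclass_del_notify(): the notification skb is filled
 * in *before* cops->delete() runs, since the class and anything
 * tc_fill_tclass() reads from it may be freed once the delete succeeds.
 */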
1901
1902 #ifdef CONFIG_NET_CLS
1903
1904 struct tcf_bind_args {
1905         struct tcf_walker w;
1906         unsigned long base;
1907         unsigned long cl;
1908         u32 classid;
1909 };
1910
1911 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1912 {
1913         struct tcf_bind_args *a = (void *)arg;
1914
1915         if (tp->ops->bind_class) {
1916                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1917
1918                 sch_tree_lock(q);
1919                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1920                 sch_tree_unlock(q);
1921         }
1922         return 0;
1923 }
1924
1925 struct tc_bind_class_args {
1926         struct qdisc_walker w;
1927         unsigned long new_cl;
1928         u32 portid;
1929         u32 clid;
1930 };
1931
1932 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1933                                 struct qdisc_walker *w)
1934 {
1935         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1936         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1937         struct tcf_block *block;
1938         struct tcf_chain *chain;
1939
1940         block = cops->tcf_block(q, cl, NULL);
1941         if (!block)
1942                 return 0;
1943         for (chain = tcf_get_next_chain(block, NULL);
1944              chain;
1945              chain = tcf_get_next_chain(block, chain)) {
1946                 struct tcf_proto *tp;
1947
1948                 for (tp = tcf_get_next_proto(chain, NULL);
1949                      tp; tp = tcf_get_next_proto(chain, tp)) {
1950                         struct tcf_bind_args arg = {};
1951
1952                         arg.w.fn = tcf_node_bind;
1953                         arg.classid = a->clid;
1954                         arg.base = cl;
1955                         arg.cl = a->new_cl;
1956                         tp->ops->walk(tp, &arg.w, true);
1957                 }
1958         }
1959
1960         return 0;
1961 }
1962
1963 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1964                            unsigned long new_cl)
1965 {
1966         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1967         struct tc_bind_class_args args = {};
1968
1969         if (!cops->tcf_block)
1970                 return;
1971         args.portid = portid;
1972         args.clid = clid;
1973         args.new_cl = new_cl;
1974         args.w.fn = tc_bind_class_walker;
1975         cops->walk(q, &args.w);
1976 }
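/* A sketch of why the walk above exists: filters cache the class they map
 * to via ->bind_class().  tc_bind_tclass() re-binds every filter whose
 * classid matches: to 0 when the class is deleted, or to the new class when
 * one is created, so no filter keeps a stale class pointer.
 */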
1977
1978 #else
1979
1980 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1981                            unsigned long new_cl)
1982 {
1983 }
1984
1985 #endif
1986
1987 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1988                          struct netlink_ext_ack *extack)
1989 {
1990         struct net *net = sock_net(skb->sk);
1991         struct tcmsg *tcm = nlmsg_data(n);
1992         struct nlattr *tca[TCA_MAX + 1];
1993         struct net_device *dev;
1994         struct Qdisc *q = NULL;
1995         const struct Qdisc_class_ops *cops;
1996         unsigned long cl = 0;
1997         unsigned long new_cl;
1998         u32 portid;
1999         u32 clid;
2000         u32 qid;
2001         int err;
2002
2003         if ((n->nlmsg_type != RTM_GETTCLASS) &&
2004             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2005                 return -EPERM;
2006
2007         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2008                                      rtm_tca_policy, extack);
2009         if (err < 0)
2010                 return err;
2011
2012         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2013         if (!dev)
2014                 return -ENODEV;
2015
2016         /*
2017            parent == TC_H_UNSPEC - unspecified parent.
2018            parent == TC_H_ROOT   - class is root, which has no parent.
2019            parent == X:0         - parent is root class.
2020            parent == X:Y         - parent is a node in hierarchy.
2021            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2022
2023            handle == 0:0         - generate handle from kernel pool.
2024            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2025            handle == X:Y         - fully specified handle.
2026            handle == X:0         - root class.
2027          */
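        /* A worked example (TC_H_* helpers are from uapi pkt_sched.h; the tc
         * command is illustrative): "tc class add ... parent 1:1 classid 1:10"
         * arrives with tcm_parent == 0x10001 and tcm_handle == 0x10010, and
         * TC_H_MAJ(0x10010) == 0x10000 names the owning qdisc "1:".
         */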
2028
2029         /* Step 1. Determine qdisc handle X:0 */
2030
2031         portid = tcm->tcm_parent;
2032         clid = tcm->tcm_handle;
2033         qid = TC_H_MAJ(clid);
2034
2035         if (portid != TC_H_ROOT) {
2036                 u32 qid1 = TC_H_MAJ(portid);
2037
2038                 if (qid && qid1) {
2039                         /* If both majors are known, they must be identical. */
2040                         if (qid != qid1)
2041                                 return -EINVAL;
2042                 } else if (qid1) {
2043                         qid = qid1;
2044                 } else if (qid == 0)
2045                         qid = dev->qdisc->handle;
2046
2047                 /* Now qid is a genuine qdisc handle consistent with
2048                  * both parent and child.
2049                  *
2050                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2051                  */
2052                 if (portid)
2053                         portid = TC_H_MAKE(qid, portid);
2054         } else {
2055                 if (qid == 0)
2056                         qid = dev->qdisc->handle;
2057         }
2058
2059         /* OK. Locate qdisc */
2060         q = qdisc_lookup(dev, qid);
2061         if (!q)
2062                 return -ENOENT;
2063
2064         /* And check that it supports classes */
2065         cops = q->ops->cl_ops;
2066         if (cops == NULL)
2067                 return -EINVAL;
2068
2069         /* Now try to get class */
2070         if (clid == 0) {
2071                 if (portid == TC_H_ROOT)
2072                         clid = qid;
2073         } else
2074                 clid = TC_H_MAKE(qid, clid);
2075
2076         if (clid)
2077                 cl = cops->find(q, clid);
2078
2079         if (cl == 0) {
2080                 err = -ENOENT;
2081                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2082                     !(n->nlmsg_flags & NLM_F_CREATE))
2083                         goto out;
2084         } else {
2085                 switch (n->nlmsg_type) {
2086                 case RTM_NEWTCLASS:
2087                         err = -EEXIST;
2088                         if (n->nlmsg_flags & NLM_F_EXCL)
2089                                 goto out;
2090                         break;
2091                 case RTM_DELTCLASS:
2092                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2093                         /* Unbind the class from its filters by re-binding them to 0 */
2094                         tc_bind_tclass(q, portid, clid, 0);
2095                         goto out;
2096                 case RTM_GETTCLASS:
2097                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2098                         goto out;
2099                 default:
2100                         err = -EINVAL;
2101                         goto out;
2102                 }
2103         }
2104
2105         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2106                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2107                 return -EOPNOTSUPP;
2108         }
2109
2110         new_cl = cl;
2111         err = -EOPNOTSUPP;
2112         if (cops->change)
2113                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2114         if (err == 0) {
2115                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2116                 /* We just created a new class; do the reverse binding to it. */
2117                 if (cl != new_cl)
2118                         tc_bind_tclass(q, portid, clid, new_cl);
2119         }
2120 out:
2121         return err;
2122 }
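/* For example (iproute2 syntax; values are illustrative only):
 * "tc class add dev eth0 parent 1: classid 1:10 htb rate 1mbit" arrives
 * here as RTM_NEWTCLASS with tcm_parent == 0x10000 and tcm_handle ==
 * 0x10010, and ends up in the htb qdisc's cops->change().
 */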
2123
2124 struct qdisc_dump_args {
2125         struct qdisc_walker     w;
2126         struct sk_buff          *skb;
2127         struct netlink_callback *cb;
2128 };
2129
2130 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2131                             struct qdisc_walker *arg)
2132 {
2133         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2134
2135         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2136                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2137                               RTM_NEWTCLASS);
2138 }
2139
2140 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2141                                 struct tcmsg *tcm, struct netlink_callback *cb,
2142                                 int *t_p, int s_t)
2143 {
2144         struct qdisc_dump_args arg;
2145
2146         if (tc_qdisc_dump_ignore(q, false) ||
2147             *t_p < s_t || !q->ops->cl_ops ||
2148             (tcm->tcm_parent &&
2149              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2150                 (*t_p)++;
2151                 return 0;
2152         }
2153         if (*t_p > s_t)
2154                 memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
2155         arg.w.fn = qdisc_class_dump;
2156         arg.skb = skb;
2157         arg.cb = cb;
2158         arg.w.stop = 0;
2159         arg.w.skip = cb->args[1];
2160         arg.w.count = 0;
2161         q->ops->cl_ops->walk(q, &arg.w);
2162         cb->args[1] = arg.w.count;
2163         if (arg.w.stop)
2164                 return -1;
2165         (*t_p)++;
2166         return 0;
2167 }
2168
2169 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2170                                struct tcmsg *tcm, struct netlink_callback *cb,
2171                                int *t_p, int s_t)
2172 {
2173         struct Qdisc *q;
2174         int b;
2175
2176         if (!root)
2177                 return 0;
2178
2179         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2180                 return -1;
2181
2182         if (!qdisc_dev(root))
2183                 return 0;
2184
2185         if (tcm->tcm_parent) {
2186                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2187                 if (q && q != root &&
2188                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2189                         return -1;
2190                 return 0;
2191         }
2192         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2193                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2194                         return -1;
2195         }
2196
2197         return 0;
2198 }
2199
2200 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2201 {
2202         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2203         struct net *net = sock_net(skb->sk);
2204         struct netdev_queue *dev_queue;
2205         struct net_device *dev;
2206         int t, s_t;
2207
2208         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2209                 return 0;
2210         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2211         if (!dev)
2212                 return 0;
2213
2214         s_t = cb->args[0];
2215         t = 0;
2216
2217         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2218                 goto done;
2219
2220         dev_queue = dev_ingress_queue(dev);
2221         if (dev_queue &&
2222             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2223                                 &t, s_t) < 0)
2224                 goto done;
2225
2226 done:
2227         cb->args[0] = t;
2228
2229         dev_put(dev);
2230         return skb->len;
2231 }
2232
2233 #ifdef CONFIG_PROC_FS
2234 static int psched_show(struct seq_file *seq, void *v)
2235 {
2236         seq_printf(seq, "%08x %08x %08x %08x\n",
2237                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2238                    1000000,
2239                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2240
2241         return 0;
2242 }
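/* Example of the resulting /proc/net/psched line on a box with
 * high-resolution timers enabled (hrtimer_resolution == 1 ns assumed):
 *
 *   000003e8 00000040 000f4240 3b9aca00
 *
 * i.e. 1000 ns per simulated microsecond, 64 ns per psched tick
 * (1 << PSCHED_SHIFT), the historical 1000000 constant, and a 10^9 Hz
 * clock resolution.
 */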
2243
2244 static int __net_init psched_net_init(struct net *net)
2245 {
2246         struct proc_dir_entry *e;
2247
2248         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2249         if (e == NULL)
2250                 return -ENOMEM;
2251
2252         return 0;
2253 }
2254
2255 static void __net_exit psched_net_exit(struct net *net)
2256 {
2257         remove_proc_entry("psched", net->proc_net);
2258 }
2259 #else
2260 static int __net_init psched_net_init(struct net *net)
2261 {
2262         return 0;
2263 }
2264
2265 static void __net_exit psched_net_exit(struct net *net)
2266 {
2267 }
2268 #endif
2269
2270 static struct pernet_operations psched_net_ops = {
2271         .init = psched_net_init,
2272         .exit = psched_net_exit,
2273 };
2274
2275 static int __init pktsched_init(void)
2276 {
2277         int err;
2278
2279         err = register_pernet_subsys(&psched_net_ops);
2280         if (err) {
2281                 pr_err("pktsched_init: cannot initialize per netns operations\n");
2283                 return err;
2284         }
2285
2286         register_qdisc(&pfifo_fast_ops);
2287         register_qdisc(&pfifo_qdisc_ops);
2288         register_qdisc(&bfifo_qdisc_ops);
2289         register_qdisc(&pfifo_head_drop_qdisc_ops);
2290         register_qdisc(&mq_qdisc_ops);
2291         register_qdisc(&noqueue_qdisc_ops);
2292
2293         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2294         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2295         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2296                       0);
2297         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2298         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2299         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2300                       0);
2301
2302         return 0;
2303 }
2304
2305 subsys_initcall(pktsched_init);