alpha: fix TIF_NOTIFY_SIGNAL handling
[linux-2.6-microblaze.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
52    qdiscs are divided into two categories:
53    - "queues", which have no internal structure visible from outside.
54    - "schedulers", which split all the packets to "traffic classes",
55      using "packet classifiers" (look at cls_api.c)
56
57    In turn, classes may have child qdiscs (as rule, queues)
58    attached to them etc. etc. etc.
59
60    The goal of the routines in this file is to translate
61    information supplied by the user in the form of handles
62    into a form more intelligible to the kernel, to perform some
63    sanity checks and the part of the work that is common to all
64    qdiscs, and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
74    dequeue usually returns a skb to send. It is allowed to return NULL,
75    but it does not mean that queue is empty, it just means that
76    discipline does not want to send anything this time.
77    Queue is really empty if q->q.qlen == 0.
78    For complicated disciplines with multiple queues q->q is not
79    real packet queue, but however q->q.qlen must be valid.
80
81    ---enqueue
82
83    enqueue returns 0 if the packet was enqueued successfully.
84    If a packet (this one or another one) was dropped, it returns
85    a non-zero error code.
86    NET_XMIT_DROP        - this packet dropped
87      Expected action: do not backoff, but wait until queue will clear.
88    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
89      Expected action: backoff or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
115 /* Protects list of registered TC modules. It is pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122
123
124 /* The list of all installed queueing disciplines. */
125
126 static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
174 void unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189
190         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
191 }
192 EXPORT_SYMBOL(unregister_qdisc);
193
194 /* Get default qdisc if not otherwise specified */
195 void qdisc_get_default(char *name, size_t len)
196 {
197         read_lock(&qdisc_mod_lock);
198         strscpy(name, default_qdisc_ops->id, len);
199         read_unlock(&qdisc_mod_lock);
200 }
201
202 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
203 {
204         struct Qdisc_ops *q = NULL;
205
206         for (q = qdisc_base; q; q = q->next) {
207                 if (!strcmp(name, q->id)) {
208                         if (!try_module_get(q->owner))
209                                 q = NULL;
210                         break;
211                 }
212         }
213
214         return q;
215 }
216
217 /* Set new default qdisc to use */
218 int qdisc_set_default(const char *name)
219 {
220         const struct Qdisc_ops *ops;
221
222         if (!capable(CAP_NET_ADMIN))
223                 return -EPERM;
224
225         write_lock(&qdisc_mod_lock);
226         ops = qdisc_lookup_default(name);
227         if (!ops) {
228                 /* Not found, drop lock and try to load module */
229                 write_unlock(&qdisc_mod_lock);
230                 request_module("sch_%s", name);
231                 write_lock(&qdisc_mod_lock);
232
233                 ops = qdisc_lookup_default(name);
234         }
235
236         if (ops) {
237                 /* Set new default */
238                 module_put(default_qdisc_ops->owner);
239                 default_qdisc_ops = ops;
240         }
241         write_unlock(&qdisc_mod_lock);
242
243         return ops ? 0 : -ENOENT;
244 }
245
#ifdef CONFIG_NET_SCH_DEFAULT
/*
 * Late initcall that installs the default qdisc named by the kernel
 * configuration (CONFIG_DEFAULT_NET_SCH). The result of
 * qdisc_set_default() is returned unchanged.
 */
static int __init sch_default_qdisc(void)
{
        int rc = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

        return rc;
}
late_initcall(sch_default_qdisc);
#endif
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256  * (root qdisc, all its children, children of children etc.)
257  * Note: caller either uses rtnl or rcu_read_lock()
258  */
259
260 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
261 {
262         struct Qdisc *q;
263
264         if (!qdisc_dev(root))
265                 return (root->handle == handle ? root : NULL);
266
267         if (!(root->flags & TCQ_F_BUILTIN) &&
268             root->handle == handle)
269                 return root;
270
271         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
272                                    lockdep_rtnl_is_held()) {
273                 if (q->handle == handle)
274                         return q;
275         }
276         return NULL;
277 }
278
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282                 ASSERT_RTNL();
283                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284                 if (invisible)
285                         q->flags |= TCQ_F_INVISIBLE;
286         }
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293                 ASSERT_RTNL();
294                 hash_del_rcu(&q->hash);
295         }
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301         struct Qdisc *q;
302
303         if (!handle)
304                 return NULL;
305         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
306         if (q)
307                 goto out;
308
309         if (dev_ingress_queue(dev))
310                 q = qdisc_match_from_root(
311                         dev_ingress_queue(dev)->qdisc_sleeping,
312                         handle);
313 out:
314         return q;
315 }
316
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319         struct netdev_queue *nq;
320         struct Qdisc *q;
321
322         if (!handle)
323                 return NULL;
324         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
325         if (q)
326                 goto out;
327
328         nq = dev_ingress_queue_rcu(dev);
329         if (nq)
330                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331 out:
332         return q;
333 }
334
335 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
336 {
337         unsigned long cl;
338         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
339
340         if (cops == NULL)
341                 return NULL;
342         cl = cops->find(p, classid);
343
344         if (cl == 0)
345                 return NULL;
346         return cops->leaf(p, cl);
347 }
348
349 /* Find queueing discipline by name */
350
351 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
352 {
353         struct Qdisc_ops *q = NULL;
354
355         if (kind) {
356                 read_lock(&qdisc_mod_lock);
357                 for (q = qdisc_base; q; q = q->next) {
358                         if (nla_strcmp(kind, q->id) == 0) {
359                                 if (!try_module_get(q->owner))
360                                         q = NULL;
361                                 break;
362                         }
363                 }
364                 read_unlock(&qdisc_mod_lock);
365         }
366         return q;
367 }
368
369 /* The linklayer setting were not transferred from iproute2, in older
370  * versions, and the rate tables lookup systems have been dropped in
371  * the kernel. To keep backward compatible with older iproute2 tc
372  * utils, we detect the linklayer setting by detecting if the rate
373  * table were modified.
374  *
375  * For linklayer ATM table entries, the rate table will be aligned to
376  * 48 bytes, thus some table entries will contain the same value.  The
377  * mpu (min packet unit) is also encoded into the old rate table, thus
378  * starting from the mpu, we find low and high table entries for
379  * mapping this cell.  If these entries contain the same value, when
380  * the rate tables have been modified for linklayer ATM.
381  *
382  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
383  * and then roundup to the next cell, calc the table entry one below,
384  * and compare.
385  */
386 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
387 {
388         int low       = roundup(r->mpu, 48);
389         int high      = roundup(low+1, 48);
390         int cell_low  = low >> r->cell_log;
391         int cell_high = (high >> r->cell_log) - 1;
392
393         /* rtab is too inaccurate at rates > 100Mbit/s */
394         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
395                 pr_debug("TC linklayer: Giving up ATM detection\n");
396                 return TC_LINKLAYER_ETHERNET;
397         }
398
399         if ((cell_high > cell_low) && (cell_high < 256)
400             && (rtab[cell_low] == rtab[cell_high])) {
401                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
402                          cell_low, cell_high, rtab[cell_high]);
403                 return TC_LINKLAYER_ATM;
404         }
405         return TC_LINKLAYER_ETHERNET;
406 }
407
408 static struct qdisc_rate_table *qdisc_rtab_list;
409
410 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
411                                         struct nlattr *tab,
412                                         struct netlink_ext_ack *extack)
413 {
414         struct qdisc_rate_table *rtab;
415
416         if (tab == NULL || r->rate == 0 ||
417             r->cell_log == 0 || r->cell_log >= 32 ||
418             nla_len(tab) != TC_RTAB_SIZE) {
419                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
420                 return NULL;
421         }
422
423         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
424                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
425                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
426                         rtab->refcnt++;
427                         return rtab;
428                 }
429         }
430
431         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
432         if (rtab) {
433                 rtab->rate = *r;
434                 rtab->refcnt = 1;
435                 memcpy(rtab->data, nla_data(tab), 1024);
436                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
437                         r->linklayer = __detect_linklayer(r, rtab->data);
438                 rtab->next = qdisc_rtab_list;
439                 qdisc_rtab_list = rtab;
440         } else {
441                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
442         }
443         return rtab;
444 }
445 EXPORT_SYMBOL(qdisc_get_rtab);
446
447 void qdisc_put_rtab(struct qdisc_rate_table *tab)
448 {
449         struct qdisc_rate_table *rtab, **rtabp;
450
451         if (!tab || --tab->refcnt)
452                 return;
453
454         for (rtabp = &qdisc_rtab_list;
455              (rtab = *rtabp) != NULL;
456              rtabp = &rtab->next) {
457                 if (rtab == tab) {
458                         *rtabp = rtab->next;
459                         kfree(rtab);
460                         return;
461                 }
462         }
463 }
464 EXPORT_SYMBOL(qdisc_put_rtab);
465
466 static LIST_HEAD(qdisc_stab_list);
467
468 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
469         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
470         [TCA_STAB_DATA] = { .type = NLA_BINARY },
471 };
472
473 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
474                                                struct netlink_ext_ack *extack)
475 {
476         struct nlattr *tb[TCA_STAB_MAX + 1];
477         struct qdisc_size_table *stab;
478         struct tc_sizespec *s;
479         unsigned int tsize = 0;
480         u16 *tab = NULL;
481         int err;
482
483         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
484                                           extack);
485         if (err < 0)
486                 return ERR_PTR(err);
487         if (!tb[TCA_STAB_BASE]) {
488                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
489                 return ERR_PTR(-EINVAL);
490         }
491
492         s = nla_data(tb[TCA_STAB_BASE]);
493
494         if (s->tsize > 0) {
495                 if (!tb[TCA_STAB_DATA]) {
496                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
497                         return ERR_PTR(-EINVAL);
498                 }
499                 tab = nla_data(tb[TCA_STAB_DATA]);
500                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
501         }
502
503         if (tsize != s->tsize || (!tab && tsize > 0)) {
504                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
505                 return ERR_PTR(-EINVAL);
506         }
507
508         list_for_each_entry(stab, &qdisc_stab_list, list) {
509                 if (memcmp(&stab->szopts, s, sizeof(*s)))
510                         continue;
511                 if (tsize > 0 &&
512                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
513                         continue;
514                 stab->refcnt++;
515                 return stab;
516         }
517
518         if (s->size_log > STAB_SIZE_LOG_MAX ||
519             s->cell_log > STAB_SIZE_LOG_MAX) {
520                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
521                 return ERR_PTR(-EINVAL);
522         }
523
524         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
525         if (!stab)
526                 return ERR_PTR(-ENOMEM);
527
528         stab->refcnt = 1;
529         stab->szopts = *s;
530         if (tsize > 0)
531                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
532
533         list_add_tail(&stab->list, &qdisc_stab_list);
534
535         return stab;
536 }
537
538 void qdisc_put_stab(struct qdisc_size_table *tab)
539 {
540         if (!tab)
541                 return;
542
543         if (--tab->refcnt == 0) {
544                 list_del(&tab->list);
545                 kfree_rcu(tab, rcu);
546         }
547 }
548 EXPORT_SYMBOL(qdisc_put_stab);
549
550 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
551 {
552         struct nlattr *nest;
553
554         nest = nla_nest_start_noflag(skb, TCA_STAB);
555         if (nest == NULL)
556                 goto nla_put_failure;
557         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
558                 goto nla_put_failure;
559         nla_nest_end(skb, nest);
560
561         return skb->len;
562
563 nla_put_failure:
564         return -1;
565 }
566
567 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
568                                const struct qdisc_size_table *stab)
569 {
570         int pkt_len, slot;
571
572         pkt_len = skb->len + stab->szopts.overhead;
573         if (unlikely(!stab->szopts.tsize))
574                 goto out;
575
576         slot = pkt_len + stab->szopts.cell_align;
577         if (unlikely(slot < 0))
578                 slot = 0;
579
580         slot >>= stab->szopts.cell_log;
581         if (likely(slot < stab->szopts.tsize))
582                 pkt_len = stab->data[slot];
583         else
584                 pkt_len = stab->data[stab->szopts.tsize - 1] *
585                                 (slot / stab->szopts.tsize) +
586                                 stab->data[slot % stab->szopts.tsize];
587
588         pkt_len <<= stab->szopts.size_log;
589 out:
590         if (unlikely(pkt_len < 1))
591                 pkt_len = 1;
592         qdisc_skb_cb(skb)->pkt_len = pkt_len;
593 }
594 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
595
596 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
597 {
598         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
599                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
600                         txt, qdisc->ops->id, qdisc->handle >> 16);
601                 qdisc->flags |= TCQ_F_WARN_NONWC;
602         }
603 }
604 EXPORT_SYMBOL(qdisc_warn_nonwc);
605
606 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
607 {
608         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
609                                                  timer);
610
611         rcu_read_lock();
612         __netif_schedule(qdisc_root(wd->qdisc));
613         rcu_read_unlock();
614
615         return HRTIMER_NORESTART;
616 }
617
618 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
619                                  clockid_t clockid)
620 {
621         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
622         wd->timer.function = qdisc_watchdog;
623         wd->qdisc = qdisc;
624 }
625 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
626
627 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
628 {
629         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
630 }
631 EXPORT_SYMBOL(qdisc_watchdog_init);
632
633 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
634                                       u64 delta_ns)
635 {
636         if (test_bit(__QDISC_STATE_DEACTIVATED,
637                      &qdisc_root_sleeping(wd->qdisc)->state))
638                 return;
639
640         if (hrtimer_is_queued(&wd->timer)) {
641                 /* If timer is already set in [expires, expires + delta_ns],
642                  * do not reprogram it.
643                  */
644                 if (wd->last_expires - expires <= delta_ns)
645                         return;
646         }
647
648         wd->last_expires = expires;
649         hrtimer_start_range_ns(&wd->timer,
650                                ns_to_ktime(expires),
651                                delta_ns,
652                                HRTIMER_MODE_ABS_PINNED);
653 }
654 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
655
656 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
657 {
658         hrtimer_cancel(&wd->timer);
659 }
660 EXPORT_SYMBOL(qdisc_watchdog_cancel);
661
662 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
663 {
664         struct hlist_head *h;
665         unsigned int i;
666
667         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
668
669         if (h != NULL) {
670                 for (i = 0; i < n; i++)
671                         INIT_HLIST_HEAD(&h[i]);
672         }
673         return h;
674 }
675
676 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
677 {
678         struct Qdisc_class_common *cl;
679         struct hlist_node *next;
680         struct hlist_head *nhash, *ohash;
681         unsigned int nsize, nmask, osize;
682         unsigned int i, h;
683
684         /* Rehash when load factor exceeds 0.75 */
685         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
686                 return;
687         nsize = clhash->hashsize * 2;
688         nmask = nsize - 1;
689         nhash = qdisc_class_hash_alloc(nsize);
690         if (nhash == NULL)
691                 return;
692
693         ohash = clhash->hash;
694         osize = clhash->hashsize;
695
696         sch_tree_lock(sch);
697         for (i = 0; i < osize; i++) {
698                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
699                         h = qdisc_class_hash(cl->classid, nmask);
700                         hlist_add_head(&cl->hnode, &nhash[h]);
701                 }
702         }
703         clhash->hash     = nhash;
704         clhash->hashsize = nsize;
705         clhash->hashmask = nmask;
706         sch_tree_unlock(sch);
707
708         kvfree(ohash);
709 }
710 EXPORT_SYMBOL(qdisc_class_hash_grow);
711
712 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
713 {
714         unsigned int size = 4;
715
716         clhash->hash = qdisc_class_hash_alloc(size);
717         if (!clhash->hash)
718                 return -ENOMEM;
719         clhash->hashsize  = size;
720         clhash->hashmask  = size - 1;
721         clhash->hashelems = 0;
722         return 0;
723 }
724 EXPORT_SYMBOL(qdisc_class_hash_init);
725
726 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
727 {
728         kvfree(clhash->hash);
729 }
730 EXPORT_SYMBOL(qdisc_class_hash_destroy);
731
732 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
733                              struct Qdisc_class_common *cl)
734 {
735         unsigned int h;
736
737         INIT_HLIST_NODE(&cl->hnode);
738         h = qdisc_class_hash(cl->classid, clhash->hashmask);
739         hlist_add_head(&cl->hnode, &clhash->hash[h]);
740         clhash->hashelems++;
741 }
742 EXPORT_SYMBOL(qdisc_class_hash_insert);
743
744 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
745                              struct Qdisc_class_common *cl)
746 {
747         hlist_del(&cl->hnode);
748         clhash->hashelems--;
749 }
750 EXPORT_SYMBOL(qdisc_class_hash_remove);
751
752 /* Allocate an unique handle from space managed by kernel
753  * Possible range is [8000-FFFF]:0000 (0x8000 values)
754  */
755 static u32 qdisc_alloc_handle(struct net_device *dev)
756 {
757         int i = 0x8000;
758         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
759
760         do {
761                 autohandle += TC_H_MAKE(0x10000U, 0);
762                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
763                         autohandle = TC_H_MAKE(0x80000000U, 0);
764                 if (!qdisc_lookup(dev, autohandle))
765                         return autohandle;
766                 cond_resched();
767         } while (--i > 0);
768
769         return 0;
770 }
771
772 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
773 {
774         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
775         const struct Qdisc_class_ops *cops;
776         unsigned long cl;
777         u32 parentid;
778         bool notify;
779         int drops;
780
781         if (n == 0 && len == 0)
782                 return;
783         drops = max_t(int, n, 0);
784         rcu_read_lock();
785         while ((parentid = sch->parent)) {
786                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
787                         break;
788
789                 if (sch->flags & TCQ_F_NOPARENT)
790                         break;
791                 /* Notify parent qdisc only if child qdisc becomes empty.
792                  *
793                  * If child was empty even before update then backlog
794                  * counter is screwed and we skip notification because
795                  * parent class is already passive.
796                  *
797                  * If the original child was offloaded then it is allowed
798                  * to be seem as empty, so the parent is notified anyway.
799                  */
800                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
801                                                        !qdisc_is_offloaded);
802                 /* TODO: perform the search on a per txq basis */
803                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
804                 if (sch == NULL) {
805                         WARN_ON_ONCE(parentid != TC_H_ROOT);
806                         break;
807                 }
808                 cops = sch->ops->cl_ops;
809                 if (notify && cops->qlen_notify) {
810                         cl = cops->find(sch, parentid);
811                         cops->qlen_notify(sch, cl);
812                 }
813                 sch->q.qlen -= n;
814                 sch->qstats.backlog -= len;
815                 __qdisc_qstats_drop(sch, drops);
816         }
817         rcu_read_unlock();
818 }
819 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
820
821 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
822                               void *type_data)
823 {
824         struct net_device *dev = qdisc_dev(sch);
825         int err;
826
827         sch->flags &= ~TCQ_F_OFFLOADED;
828         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
829                 return 0;
830
831         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
832         if (err == -EOPNOTSUPP)
833                 return 0;
834
835         if (!err)
836                 sch->flags |= TCQ_F_OFFLOADED;
837
838         return err;
839 }
840 EXPORT_SYMBOL(qdisc_offload_dump_helper);
841
842 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
843                                 struct Qdisc *new, struct Qdisc *old,
844                                 enum tc_setup_type type, void *type_data,
845                                 struct netlink_ext_ack *extack)
846 {
847         bool any_qdisc_is_offloaded;
848         int err;
849
850         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
851                 return;
852
853         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
854
855         /* Don't report error if the graft is part of destroy operation. */
856         if (!err || !new || new == &noop_qdisc)
857                 return;
858
859         /* Don't report error if the parent, the old child and the new
860          * one are not offloaded.
861          */
862         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
863         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
864         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
865
866         if (any_qdisc_is_offloaded)
867                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
868 }
869 EXPORT_SYMBOL(qdisc_offload_graft_helper);
870
871 void qdisc_offload_query_caps(struct net_device *dev,
872                               enum tc_setup_type type,
873                               void *caps, size_t caps_len)
874 {
875         const struct net_device_ops *ops = dev->netdev_ops;
876         struct tc_query_caps_base base = {
877                 .type = type,
878                 .caps = caps,
879         };
880
881         memset(caps, 0, caps_len);
882
883         if (ops->ndo_setup_tc)
884                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
885 }
886 EXPORT_SYMBOL(qdisc_offload_query_caps);
887
888 static void qdisc_offload_graft_root(struct net_device *dev,
889                                      struct Qdisc *new, struct Qdisc *old,
890                                      struct netlink_ext_ack *extack)
891 {
892         struct tc_root_qopt_offload graft_offload = {
893                 .command        = TC_ROOT_GRAFT,
894                 .handle         = new ? new->handle : 0,
895                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
896                                   (old && old->flags & TCQ_F_INGRESS),
897         };
898
899         qdisc_offload_graft_helper(dev, NULL, new, old,
900                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
901 }
902
903 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
904                          u32 portid, u32 seq, u16 flags, int event)
905 {
906         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
907         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
908         struct tcmsg *tcm;
909         struct nlmsghdr  *nlh;
910         unsigned char *b = skb_tail_pointer(skb);
911         struct gnet_dump d;
912         struct qdisc_size_table *stab;
913         u32 block_index;
914         __u32 qlen;
915
916         cond_resched();
917         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
918         if (!nlh)
919                 goto out_nlmsg_trim;
920         tcm = nlmsg_data(nlh);
921         tcm->tcm_family = AF_UNSPEC;
922         tcm->tcm__pad1 = 0;
923         tcm->tcm__pad2 = 0;
924         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
925         tcm->tcm_parent = clid;
926         tcm->tcm_handle = q->handle;
927         tcm->tcm_info = refcount_read(&q->refcnt);
928         if (nla_put_string(skb, TCA_KIND, q->ops->id))
929                 goto nla_put_failure;
930         if (q->ops->ingress_block_get) {
931                 block_index = q->ops->ingress_block_get(q);
932                 if (block_index &&
933                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
934                         goto nla_put_failure;
935         }
936         if (q->ops->egress_block_get) {
937                 block_index = q->ops->egress_block_get(q);
938                 if (block_index &&
939                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
940                         goto nla_put_failure;
941         }
942         if (q->ops->dump && q->ops->dump(q, skb) < 0)
943                 goto nla_put_failure;
944         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
945                 goto nla_put_failure;
946         qlen = qdisc_qlen_sum(q);
947
948         stab = rtnl_dereference(q->stab);
949         if (stab && qdisc_dump_stab(skb, stab) < 0)
950                 goto nla_put_failure;
951
952         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
953                                          NULL, &d, TCA_PAD) < 0)
954                 goto nla_put_failure;
955
956         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
957                 goto nla_put_failure;
958
959         if (qdisc_is_percpu_stats(q)) {
960                 cpu_bstats = q->cpu_bstats;
961                 cpu_qstats = q->cpu_qstats;
962         }
963
964         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
965             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
966             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
967                 goto nla_put_failure;
968
969         if (gnet_stats_finish_copy(&d) < 0)
970                 goto nla_put_failure;
971
972         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
973         return skb->len;
974
975 out_nlmsg_trim:
976 nla_put_failure:
977         nlmsg_trim(skb, b);
978         return -1;
979 }
980
981 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
982 {
983         if (q->flags & TCQ_F_BUILTIN)
984                 return true;
985         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
986                 return true;
987
988         return false;
989 }
990
991 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
992                         struct nlmsghdr *n, u32 clid,
993                         struct Qdisc *old, struct Qdisc *new)
994 {
995         struct sk_buff *skb;
996         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
997
998         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
999         if (!skb)
1000                 return -ENOBUFS;
1001
1002         if (old && !tc_qdisc_dump_ignore(old, false)) {
1003                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1004                                   0, RTM_DELQDISC) < 0)
1005                         goto err_out;
1006         }
1007         if (new && !tc_qdisc_dump_ignore(new, false)) {
1008                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1009                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1010                         goto err_out;
1011         }
1012
1013         if (skb->len)
1014                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1015                                       n->nlmsg_flags & NLM_F_ECHO);
1016
1017 err_out:
1018         kfree_skb(skb);
1019         return -EINVAL;
1020 }
1021
1022 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1023                                struct nlmsghdr *n, u32 clid,
1024                                struct Qdisc *old, struct Qdisc *new)
1025 {
1026         if (new || old)
1027                 qdisc_notify(net, skb, n, clid, old, new);
1028
1029         if (old)
1030                 qdisc_put(old);
1031 }
1032
1033 static void qdisc_clear_nolock(struct Qdisc *sch)
1034 {
1035         sch->flags &= ~TCQ_F_NOLOCK;
1036         if (!(sch->flags & TCQ_F_CPUSTATS))
1037                 return;
1038
1039         free_percpu(sch->cpu_bstats);
1040         free_percpu(sch->cpu_qstats);
1041         sch->cpu_bstats = NULL;
1042         sch->cpu_qstats = NULL;
1043         sch->flags &= ~TCQ_F_CPUSTATS;
1044 }
1045
1046 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1047  * to device "dev".
1048  *
1049  * When appropriate send a netlink notification using 'skb'
1050  * and "n".
1051  *
1052  * On success, destroy old qdisc.
1053  */
1054
1055 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1056                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1057                        struct Qdisc *new, struct Qdisc *old,
1058                        struct netlink_ext_ack *extack)
1059 {
1060         struct Qdisc *q = old;
1061         struct net *net = dev_net(dev);
1062
1063         if (parent == NULL) {
1064                 unsigned int i, num_q, ingress;
1065
1066                 ingress = 0;
1067                 num_q = dev->num_tx_queues;
1068                 if ((q && q->flags & TCQ_F_INGRESS) ||
1069                     (new && new->flags & TCQ_F_INGRESS)) {
1070                         num_q = 1;
1071                         ingress = 1;
1072                         if (!dev_ingress_queue(dev)) {
1073                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1074                                 return -ENOENT;
1075                         }
1076                 }
1077
1078                 if (dev->flags & IFF_UP)
1079                         dev_deactivate(dev);
1080
1081                 qdisc_offload_graft_root(dev, new, old, extack);
1082
1083                 if (new && new->ops->attach && !ingress)
1084                         goto skip;
1085
1086                 for (i = 0; i < num_q; i++) {
1087                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1088
1089                         if (!ingress)
1090                                 dev_queue = netdev_get_tx_queue(dev, i);
1091
1092                         old = dev_graft_qdisc(dev_queue, new);
1093                         if (new && i > 0)
1094                                 qdisc_refcount_inc(new);
1095
1096                         if (!ingress)
1097                                 qdisc_put(old);
1098                 }
1099
1100 skip:
1101                 if (!ingress) {
1102                         notify_and_destroy(net, skb, n, classid,
1103                                            rtnl_dereference(dev->qdisc), new);
1104                         if (new && !new->ops->attach)
1105                                 qdisc_refcount_inc(new);
1106                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1107
1108                         if (new && new->ops->attach)
1109                                 new->ops->attach(new);
1110                 } else {
1111                         notify_and_destroy(net, skb, n, classid, old, new);
1112                 }
1113
1114                 if (dev->flags & IFF_UP)
1115                         dev_activate(dev);
1116         } else {
1117                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1118                 unsigned long cl;
1119                 int err;
1120
1121                 /* Only support running class lockless if parent is lockless */
1122                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1123                         qdisc_clear_nolock(new);
1124
1125                 if (!cops || !cops->graft)
1126                         return -EOPNOTSUPP;
1127
1128                 cl = cops->find(parent, classid);
1129                 if (!cl) {
1130                         NL_SET_ERR_MSG(extack, "Specified class not found");
1131                         return -ENOENT;
1132                 }
1133
1134                 err = cops->graft(parent, cl, new, &old, extack);
1135                 if (err)
1136                         return err;
1137                 notify_and_destroy(net, skb, n, classid, old, new);
1138         }
1139         return 0;
1140 }
1141
1142 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1143                                    struct netlink_ext_ack *extack)
1144 {
1145         u32 block_index;
1146
1147         if (tca[TCA_INGRESS_BLOCK]) {
1148                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1149
1150                 if (!block_index) {
1151                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1152                         return -EINVAL;
1153                 }
1154                 if (!sch->ops->ingress_block_set) {
1155                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1156                         return -EOPNOTSUPP;
1157                 }
1158                 sch->ops->ingress_block_set(sch, block_index);
1159         }
1160         if (tca[TCA_EGRESS_BLOCK]) {
1161                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1162
1163                 if (!block_index) {
1164                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1165                         return -EINVAL;
1166                 }
1167                 if (!sch->ops->egress_block_set) {
1168                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1169                         return -EOPNOTSUPP;
1170                 }
1171                 sch->ops->egress_block_set(sch, block_index);
1172         }
1173         return 0;
1174 }
1175
1176 /*
1177    Allocate and initialize new qdisc.
1178
1179    Parameters are passed via opt.
1180  */
1181
1182 static struct Qdisc *qdisc_create(struct net_device *dev,
1183                                   struct netdev_queue *dev_queue,
1184                                   u32 parent, u32 handle,
1185                                   struct nlattr **tca, int *errp,
1186                                   struct netlink_ext_ack *extack)
1187 {
1188         int err;
1189         struct nlattr *kind = tca[TCA_KIND];
1190         struct Qdisc *sch;
1191         struct Qdisc_ops *ops;
1192         struct qdisc_size_table *stab;
1193
1194         ops = qdisc_lookup_ops(kind);
1195 #ifdef CONFIG_MODULES
1196         if (ops == NULL && kind != NULL) {
1197                 char name[IFNAMSIZ];
1198                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1199                         /* We dropped the RTNL semaphore in order to
1200                          * perform the module load.  So, even if we
1201                          * succeeded in loading the module we have to
1202                          * tell the caller to replay the request.  We
1203                          * indicate this using -EAGAIN.
1204                          * We replay the request because the device may
1205                          * go away in the mean time.
1206                          */
1207                         rtnl_unlock();
1208                         request_module("sch_%s", name);
1209                         rtnl_lock();
1210                         ops = qdisc_lookup_ops(kind);
1211                         if (ops != NULL) {
1212                                 /* We will try again qdisc_lookup_ops,
1213                                  * so don't keep a reference.
1214                                  */
1215                                 module_put(ops->owner);
1216                                 err = -EAGAIN;
1217                                 goto err_out;
1218                         }
1219                 }
1220         }
1221 #endif
1222
1223         err = -ENOENT;
1224         if (!ops) {
1225                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1226                 goto err_out;
1227         }
1228
1229         sch = qdisc_alloc(dev_queue, ops, extack);
1230         if (IS_ERR(sch)) {
1231                 err = PTR_ERR(sch);
1232                 goto err_out2;
1233         }
1234
1235         sch->parent = parent;
1236
1237         if (handle == TC_H_INGRESS) {
1238                 sch->flags |= TCQ_F_INGRESS;
1239                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1240         } else {
1241                 if (handle == 0) {
1242                         handle = qdisc_alloc_handle(dev);
1243                         if (handle == 0) {
1244                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1245                                 err = -ENOSPC;
1246                                 goto err_out3;
1247                         }
1248                 }
1249                 if (!netif_is_multiqueue(dev))
1250                         sch->flags |= TCQ_F_ONETXQUEUE;
1251         }
1252
1253         sch->handle = handle;
1254
1255         /* This exist to keep backward compatible with a userspace
1256          * loophole, what allowed userspace to get IFF_NO_QUEUE
1257          * facility on older kernels by setting tx_queue_len=0 (prior
1258          * to qdisc init), and then forgot to reinit tx_queue_len
1259          * before again attaching a qdisc.
1260          */
1261         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1262                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1263                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1264         }
1265
1266         err = qdisc_block_indexes_set(sch, tca, extack);
1267         if (err)
1268                 goto err_out3;
1269
1270         if (ops->init) {
1271                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1272                 if (err != 0)
1273                         goto err_out5;
1274         }
1275
1276         if (tca[TCA_STAB]) {
1277                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1278                 if (IS_ERR(stab)) {
1279                         err = PTR_ERR(stab);
1280                         goto err_out4;
1281                 }
1282                 rcu_assign_pointer(sch->stab, stab);
1283         }
1284         if (tca[TCA_RATE]) {
1285                 err = -EOPNOTSUPP;
1286                 if (sch->flags & TCQ_F_MQROOT) {
1287                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1288                         goto err_out4;
1289                 }
1290
1291                 err = gen_new_estimator(&sch->bstats,
1292                                         sch->cpu_bstats,
1293                                         &sch->rate_est,
1294                                         NULL,
1295                                         true,
1296                                         tca[TCA_RATE]);
1297                 if (err) {
1298                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1299                         goto err_out4;
1300                 }
1301         }
1302
1303         qdisc_hash_add(sch, false);
1304         trace_qdisc_create(ops, dev, parent);
1305
1306         return sch;
1307
1308 err_out5:
1309         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1310         if (ops->destroy)
1311                 ops->destroy(sch);
1312 err_out3:
1313         netdev_put(dev, &sch->dev_tracker);
1314         qdisc_free(sch);
1315 err_out2:
1316         module_put(ops->owner);
1317 err_out:
1318         *errp = err;
1319         return NULL;
1320
1321 err_out4:
1322         /*
1323          * Any broken qdiscs that would require a ops->reset() here?
1324          * The qdisc was never in action so it shouldn't be necessary.
1325          */
1326         qdisc_put_stab(rtnl_dereference(sch->stab));
1327         if (ops->destroy)
1328                 ops->destroy(sch);
1329         goto err_out3;
1330 }
1331
1332 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1333                         struct netlink_ext_ack *extack)
1334 {
1335         struct qdisc_size_table *ostab, *stab = NULL;
1336         int err = 0;
1337
1338         if (tca[TCA_OPTIONS]) {
1339                 if (!sch->ops->change) {
1340                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1341                         return -EINVAL;
1342                 }
1343                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1344                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1345                         return -EOPNOTSUPP;
1346                 }
1347                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1348                 if (err)
1349                         return err;
1350         }
1351
1352         if (tca[TCA_STAB]) {
1353                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1354                 if (IS_ERR(stab))
1355                         return PTR_ERR(stab);
1356         }
1357
1358         ostab = rtnl_dereference(sch->stab);
1359         rcu_assign_pointer(sch->stab, stab);
1360         qdisc_put_stab(ostab);
1361
1362         if (tca[TCA_RATE]) {
1363                 /* NB: ignores errors from replace_estimator
1364                    because change can't be undone. */
1365                 if (sch->flags & TCQ_F_MQROOT)
1366                         goto out;
1367                 gen_replace_estimator(&sch->bstats,
1368                                       sch->cpu_bstats,
1369                                       &sch->rate_est,
1370                                       NULL,
1371                                       true,
1372                                       tca[TCA_RATE]);
1373         }
1374 out:
1375         return 0;
1376 }
1377
1378 struct check_loop_arg {
1379         struct qdisc_walker     w;
1380         struct Qdisc            *p;
1381         int                     depth;
1382 };
1383
1384 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1385                          struct qdisc_walker *w);
1386
1387 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1388 {
1389         struct check_loop_arg   arg;
1390
1391         if (q->ops->cl_ops == NULL)
1392                 return 0;
1393
1394         arg.w.stop = arg.w.skip = arg.w.count = 0;
1395         arg.w.fn = check_loop_fn;
1396         arg.depth = depth;
1397         arg.p = p;
1398         q->ops->cl_ops->walk(q, &arg.w);
1399         return arg.w.stop ? -ELOOP : 0;
1400 }
1401
1402 static int
1403 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1404 {
1405         struct Qdisc *leaf;
1406         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1407         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1408
1409         leaf = cops->leaf(q, cl);
1410         if (leaf) {
1411                 if (leaf == arg->p || arg->depth > 7)
1412                         return -ELOOP;
1413                 return check_loop(leaf, arg->p, arg->depth + 1);
1414         }
1415         return 0;
1416 }
1417
1418 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1419         [TCA_KIND]              = { .type = NLA_STRING },
1420         [TCA_RATE]              = { .type = NLA_BINARY,
1421                                     .len = sizeof(struct tc_estimator) },
1422         [TCA_STAB]              = { .type = NLA_NESTED },
1423         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1424         [TCA_CHAIN]             = { .type = NLA_U32 },
1425         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1426         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1427 };
1428
1429 /*
1430  * Delete/get qdisc.
1431  */
1432
1433 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1434                         struct netlink_ext_ack *extack)
1435 {
1436         struct net *net = sock_net(skb->sk);
1437         struct tcmsg *tcm = nlmsg_data(n);
1438         struct nlattr *tca[TCA_MAX + 1];
1439         struct net_device *dev;
1440         u32 clid;
1441         struct Qdisc *q = NULL;
1442         struct Qdisc *p = NULL;
1443         int err;
1444
1445         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1446                                      rtm_tca_policy, extack);
1447         if (err < 0)
1448                 return err;
1449
1450         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1451         if (!dev)
1452                 return -ENODEV;
1453
1454         clid = tcm->tcm_parent;
1455         if (clid) {
1456                 if (clid != TC_H_ROOT) {
1457                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1458                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1459                                 if (!p) {
1460                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1461                                         return -ENOENT;
1462                                 }
1463                                 q = qdisc_leaf(p, clid);
1464                         } else if (dev_ingress_queue(dev)) {
1465                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1466                         }
1467                 } else {
1468                         q = rtnl_dereference(dev->qdisc);
1469                 }
1470                 if (!q) {
1471                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1472                         return -ENOENT;
1473                 }
1474
1475                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1476                         NL_SET_ERR_MSG(extack, "Invalid handle");
1477                         return -EINVAL;
1478                 }
1479         } else {
1480                 q = qdisc_lookup(dev, tcm->tcm_handle);
1481                 if (!q) {
1482                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1483                         return -ENOENT;
1484                 }
1485         }
1486
1487         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1488                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1489                 return -EINVAL;
1490         }
1491
1492         if (n->nlmsg_type == RTM_DELQDISC) {
1493                 if (!clid) {
1494                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1495                         return -EINVAL;
1496                 }
1497                 if (q->handle == 0) {
1498                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1499                         return -ENOENT;
1500                 }
1501                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1502                 if (err != 0)
1503                         return err;
1504         } else {
1505                 qdisc_notify(net, skb, n, clid, NULL, q);
1506         }
1507         return 0;
1508 }
1509
1510 /*
1511  * Create/change qdisc.
1512  */
1513
1514 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1515                            struct netlink_ext_ack *extack)
1516 {
1517         struct net *net = sock_net(skb->sk);
1518         struct tcmsg *tcm;
1519         struct nlattr *tca[TCA_MAX + 1];
1520         struct net_device *dev;
1521         u32 clid;
1522         struct Qdisc *q, *p;
1523         int err;
1524
1525 replay:
1526         /* Reinit, just in case something touches this. */
1527         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1528                                      rtm_tca_policy, extack);
1529         if (err < 0)
1530                 return err;
1531
1532         tcm = nlmsg_data(n);
1533         clid = tcm->tcm_parent;
1534         q = p = NULL;
1535
1536         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1537         if (!dev)
1538                 return -ENODEV;
1539
1540
1541         if (clid) {
1542                 if (clid != TC_H_ROOT) {
1543                         if (clid != TC_H_INGRESS) {
1544                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1545                                 if (!p) {
1546                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1547                                         return -ENOENT;
1548                                 }
1549                                 q = qdisc_leaf(p, clid);
1550                         } else if (dev_ingress_queue_create(dev)) {
1551                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1552                         }
1553                 } else {
1554                         q = rtnl_dereference(dev->qdisc);
1555                 }
1556
1557                 /* It may be default qdisc, ignore it */
1558                 if (q && q->handle == 0)
1559                         q = NULL;
1560
1561                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1562                         if (tcm->tcm_handle) {
1563                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1564                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1565                                         return -EEXIST;
1566                                 }
1567                                 if (TC_H_MIN(tcm->tcm_handle)) {
1568                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1569                                         return -EINVAL;
1570                                 }
1571                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1572                                 if (!q)
1573                                         goto create_n_graft;
1574                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1575                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1576                                         return -EEXIST;
1577                                 }
1578                                 if (tca[TCA_KIND] &&
1579                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1580                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1581                                         return -EINVAL;
1582                                 }
1583                                 if (q == p ||
1584                                     (p && check_loop(q, p, 0))) {
1585                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1586                                         return -ELOOP;
1587                                 }
1588                                 qdisc_refcount_inc(q);
1589                                 goto graft;
1590                         } else {
1591                                 if (!q)
1592                                         goto create_n_graft;
1593
1594                                 /* This magic test requires explanation.
1595                                  *
1596                                  *   We know, that some child q is already
1597                                  *   attached to this parent and have choice:
1598                                  *   either to change it or to create/graft new one.
1599                                  *
1600                                  *   1. We are allowed to create/graft only
1601                                  *   if CREATE and REPLACE flags are set.
1602                                  *
1603                                  *   2. If EXCL is set, requestor wanted to say,
1604                                  *   that qdisc tcm_handle is not expected
1605                                  *   to exist, so that we choose create/graft too.
1606                                  *
1607                                  *   3. The last case is when no flags are set.
1608                                  *   Alas, it is sort of hole in API, we
1609                                  *   cannot decide what to do unambiguously.
1610                                  *   For now we select create/graft, if
1611                                  *   user gave KIND, which does not match existing.
1612                                  */
1613                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1614                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1615                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1616                                      (tca[TCA_KIND] &&
1617                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1618                                         goto create_n_graft;
1619                         }
1620                 }
1621         } else {
1622                 if (!tcm->tcm_handle) {
1623                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1624                         return -EINVAL;
1625                 }
1626                 q = qdisc_lookup(dev, tcm->tcm_handle);
1627         }
1628
1629         /* Change qdisc parameters */
1630         if (!q) {
1631                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1632                 return -ENOENT;
1633         }
1634         if (n->nlmsg_flags & NLM_F_EXCL) {
1635                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1636                 return -EEXIST;
1637         }
1638         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1639                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1640                 return -EINVAL;
1641         }
1642         err = qdisc_change(q, tca, extack);
1643         if (err == 0)
1644                 qdisc_notify(net, skb, n, clid, NULL, q);
1645         return err;
1646
1647 create_n_graft:
1648         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1649                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1650                 return -ENOENT;
1651         }
1652         if (clid == TC_H_INGRESS) {
1653                 if (dev_ingress_queue(dev)) {
1654                         q = qdisc_create(dev, dev_ingress_queue(dev),
1655                                          tcm->tcm_parent, tcm->tcm_parent,
1656                                          tca, &err, extack);
1657                 } else {
1658                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1659                         err = -ENOENT;
1660                 }
1661         } else {
1662                 struct netdev_queue *dev_queue;
1663
1664                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1665                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1666                 else if (p)
1667                         dev_queue = p->dev_queue;
1668                 else
1669                         dev_queue = netdev_get_tx_queue(dev, 0);
1670
1671                 q = qdisc_create(dev, dev_queue,
1672                                  tcm->tcm_parent, tcm->tcm_handle,
1673                                  tca, &err, extack);
1674         }
1675         if (q == NULL) {
1676                 if (err == -EAGAIN)
1677                         goto replay;
1678                 return err;
1679         }
1680
1681 graft:
1682         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1683         if (err) {
1684                 if (q)
1685                         qdisc_put(q);
1686                 return err;
1687         }
1688
1689         return 0;
1690 }
1691
1692 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1693                               struct netlink_callback *cb,
1694                               int *q_idx_p, int s_q_idx, bool recur,
1695                               bool dump_invisible)
1696 {
1697         int ret = 0, q_idx = *q_idx_p;
1698         struct Qdisc *q;
1699         int b;
1700
1701         if (!root)
1702                 return 0;
1703
1704         q = root;
1705         if (q_idx < s_q_idx) {
1706                 q_idx++;
1707         } else {
1708                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1709                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1710                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1711                                   RTM_NEWQDISC) <= 0)
1712                         goto done;
1713                 q_idx++;
1714         }
1715
1716         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1717          * itself has already been dumped.
1718          *
1719          * If we've already dumped the top-level (ingress) qdisc above and the global
1720          * qdisc hashtable, we don't want to hit it again
1721          */
1722         if (!qdisc_dev(root) || !recur)
1723                 goto out;
1724
1725         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1726                 if (q_idx < s_q_idx) {
1727                         q_idx++;
1728                         continue;
1729                 }
1730                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1731                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1732                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1733                                   RTM_NEWQDISC) <= 0)
1734                         goto done;
1735                 q_idx++;
1736         }
1737
1738 out:
1739         *q_idx_p = q_idx;
1740         return ret;
1741 done:
1742         ret = -1;
1743         goto out;
1744 }
1745
1746 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1747 {
1748         struct net *net = sock_net(skb->sk);
1749         int idx, q_idx;
1750         int s_idx, s_q_idx;
1751         struct net_device *dev;
1752         const struct nlmsghdr *nlh = cb->nlh;
1753         struct nlattr *tca[TCA_MAX + 1];
1754         int err;
1755
1756         s_idx = cb->args[0];
1757         s_q_idx = q_idx = cb->args[1];
1758
1759         idx = 0;
1760         ASSERT_RTNL();
1761
1762         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1763                                      rtm_tca_policy, cb->extack);
1764         if (err < 0)
1765                 return err;
1766
1767         for_each_netdev(net, dev) {
1768                 struct netdev_queue *dev_queue;
1769
1770                 if (idx < s_idx)
1771                         goto cont;
1772                 if (idx > s_idx)
1773                         s_q_idx = 0;
1774                 q_idx = 0;
1775
1776                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1777                                        skb, cb, &q_idx, s_q_idx,
1778                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1779                         goto done;
1780
1781                 dev_queue = dev_ingress_queue(dev);
1782                 if (dev_queue &&
1783                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1784                                        &q_idx, s_q_idx, false,
1785                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1786                         goto done;
1787
1788 cont:
1789                 idx++;
1790         }
1791
1792 done:
1793         cb->args[0] = idx;
1794         cb->args[1] = q_idx;
1795
1796         return skb->len;
1797 }
1798
1799
1800
1801 /************************************************
1802  *      Traffic classes manipulation.           *
1803  ************************************************/
1804
1805 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1806                           unsigned long cl,
1807                           u32 portid, u32 seq, u16 flags, int event)
1808 {
1809         struct tcmsg *tcm;
1810         struct nlmsghdr  *nlh;
1811         unsigned char *b = skb_tail_pointer(skb);
1812         struct gnet_dump d;
1813         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1814
1815         cond_resched();
1816         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1817         if (!nlh)
1818                 goto out_nlmsg_trim;
1819         tcm = nlmsg_data(nlh);
1820         tcm->tcm_family = AF_UNSPEC;
1821         tcm->tcm__pad1 = 0;
1822         tcm->tcm__pad2 = 0;
1823         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1824         tcm->tcm_parent = q->handle;
1825         tcm->tcm_handle = q->handle;
1826         tcm->tcm_info = 0;
1827         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1828                 goto nla_put_failure;
1829         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1830                 goto nla_put_failure;
1831
1832         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1833                                          NULL, &d, TCA_PAD) < 0)
1834                 goto nla_put_failure;
1835
1836         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1837                 goto nla_put_failure;
1838
1839         if (gnet_stats_finish_copy(&d) < 0)
1840                 goto nla_put_failure;
1841
1842         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1843         return skb->len;
1844
1845 out_nlmsg_trim:
1846 nla_put_failure:
1847         nlmsg_trim(skb, b);
1848         return -1;
1849 }
1850
1851 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1852                          struct nlmsghdr *n, struct Qdisc *q,
1853                          unsigned long cl, int event)
1854 {
1855         struct sk_buff *skb;
1856         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1857
1858         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1859         if (!skb)
1860                 return -ENOBUFS;
1861
1862         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1863                 kfree_skb(skb);
1864                 return -EINVAL;
1865         }
1866
1867         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1868                               n->nlmsg_flags & NLM_F_ECHO);
1869 }
1870
1871 static int tclass_del_notify(struct net *net,
1872                              const struct Qdisc_class_ops *cops,
1873                              struct sk_buff *oskb, struct nlmsghdr *n,
1874                              struct Qdisc *q, unsigned long cl,
1875                              struct netlink_ext_ack *extack)
1876 {
1877         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1878         struct sk_buff *skb;
1879         int err = 0;
1880
1881         if (!cops->delete)
1882                 return -EOPNOTSUPP;
1883
1884         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1885         if (!skb)
1886                 return -ENOBUFS;
1887
1888         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1889                            RTM_DELTCLASS) < 0) {
1890                 kfree_skb(skb);
1891                 return -EINVAL;
1892         }
1893
1894         err = cops->delete(q, cl, extack);
1895         if (err) {
1896                 kfree_skb(skb);
1897                 return err;
1898         }
1899
1900         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1901                              n->nlmsg_flags & NLM_F_ECHO);
1902         return err;
1903 }
1904
1905 #ifdef CONFIG_NET_CLS
1906
1907 struct tcf_bind_args {
1908         struct tcf_walker w;
1909         unsigned long base;
1910         unsigned long cl;
1911         u32 classid;
1912 };
1913
1914 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1915 {
1916         struct tcf_bind_args *a = (void *)arg;
1917
1918         if (n && tp->ops->bind_class) {
1919                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1920
1921                 sch_tree_lock(q);
1922                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1923                 sch_tree_unlock(q);
1924         }
1925         return 0;
1926 }
1927
1928 struct tc_bind_class_args {
1929         struct qdisc_walker w;
1930         unsigned long new_cl;
1931         u32 portid;
1932         u32 clid;
1933 };
1934
1935 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1936                                 struct qdisc_walker *w)
1937 {
1938         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1939         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1940         struct tcf_block *block;
1941         struct tcf_chain *chain;
1942
1943         block = cops->tcf_block(q, cl, NULL);
1944         if (!block)
1945                 return 0;
1946         for (chain = tcf_get_next_chain(block, NULL);
1947              chain;
1948              chain = tcf_get_next_chain(block, chain)) {
1949                 struct tcf_proto *tp;
1950
1951                 for (tp = tcf_get_next_proto(chain, NULL);
1952                      tp; tp = tcf_get_next_proto(chain, tp)) {
1953                         struct tcf_bind_args arg = {};
1954
1955                         arg.w.fn = tcf_node_bind;
1956                         arg.classid = a->clid;
1957                         arg.base = cl;
1958                         arg.cl = a->new_cl;
1959                         tp->ops->walk(tp, &arg.w, true);
1960                 }
1961         }
1962
1963         return 0;
1964 }
1965
1966 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1967                            unsigned long new_cl)
1968 {
1969         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1970         struct tc_bind_class_args args = {};
1971
1972         if (!cops->tcf_block)
1973                 return;
1974         args.portid = portid;
1975         args.clid = clid;
1976         args.new_cl = new_cl;
1977         args.w.fn = tc_bind_class_walker;
1978         q->ops->cl_ops->walk(q, &args.w);
1979 }
1980
1981 #else
1982
1983 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1984                            unsigned long new_cl)
1985 {
1986 }
1987
1988 #endif
1989
1990 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1991                          struct netlink_ext_ack *extack)
1992 {
1993         struct net *net = sock_net(skb->sk);
1994         struct tcmsg *tcm = nlmsg_data(n);
1995         struct nlattr *tca[TCA_MAX + 1];
1996         struct net_device *dev;
1997         struct Qdisc *q = NULL;
1998         const struct Qdisc_class_ops *cops;
1999         unsigned long cl = 0;
2000         unsigned long new_cl;
2001         u32 portid;
2002         u32 clid;
2003         u32 qid;
2004         int err;
2005
2006         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2007                                      rtm_tca_policy, extack);
2008         if (err < 0)
2009                 return err;
2010
2011         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2012         if (!dev)
2013                 return -ENODEV;
2014
2015         /*
2016            parent == TC_H_UNSPEC - unspecified parent.
2017            parent == TC_H_ROOT   - class is root, which has no parent.
2018            parent == X:0         - parent is root class.
2019            parent == X:Y         - parent is a node in hierarchy.
2020            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2021
2022            handle == 0:0         - generate handle from kernel pool.
2023            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2024            handle == X:Y         - clear.
2025            handle == X:0         - root class.
2026          */
2027
2028         /* Step 1. Determine qdisc handle X:0 */
2029
2030         portid = tcm->tcm_parent;
2031         clid = tcm->tcm_handle;
2032         qid = TC_H_MAJ(clid);
2033
2034         if (portid != TC_H_ROOT) {
2035                 u32 qid1 = TC_H_MAJ(portid);
2036
2037                 if (qid && qid1) {
2038                         /* If both majors are known, they must be identical. */
2039                         if (qid != qid1)
2040                                 return -EINVAL;
2041                 } else if (qid1) {
2042                         qid = qid1;
2043                 } else if (qid == 0)
2044                         qid = rtnl_dereference(dev->qdisc)->handle;
2045
2046                 /* Now qid is genuine qdisc handle consistent
2047                  * both with parent and child.
2048                  *
2049                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2050                  */
2051                 if (portid)
2052                         portid = TC_H_MAKE(qid, portid);
2053         } else {
2054                 if (qid == 0)
2055                         qid = rtnl_dereference(dev->qdisc)->handle;
2056         }
2057
2058         /* OK. Locate qdisc */
2059         q = qdisc_lookup(dev, qid);
2060         if (!q)
2061                 return -ENOENT;
2062
2063         /* An check that it supports classes */
2064         cops = q->ops->cl_ops;
2065         if (cops == NULL)
2066                 return -EINVAL;
2067
2068         /* Now try to get class */
2069         if (clid == 0) {
2070                 if (portid == TC_H_ROOT)
2071                         clid = qid;
2072         } else
2073                 clid = TC_H_MAKE(qid, clid);
2074
2075         if (clid)
2076                 cl = cops->find(q, clid);
2077
2078         if (cl == 0) {
2079                 err = -ENOENT;
2080                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2081                     !(n->nlmsg_flags & NLM_F_CREATE))
2082                         goto out;
2083         } else {
2084                 switch (n->nlmsg_type) {
2085                 case RTM_NEWTCLASS:
2086                         err = -EEXIST;
2087                         if (n->nlmsg_flags & NLM_F_EXCL)
2088                                 goto out;
2089                         break;
2090                 case RTM_DELTCLASS:
2091                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2092                         /* Unbind the class with flilters with 0 */
2093                         tc_bind_tclass(q, portid, clid, 0);
2094                         goto out;
2095                 case RTM_GETTCLASS:
2096                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2097                         goto out;
2098                 default:
2099                         err = -EINVAL;
2100                         goto out;
2101                 }
2102         }
2103
2104         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2105                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2106                 return -EOPNOTSUPP;
2107         }
2108
2109         new_cl = cl;
2110         err = -EOPNOTSUPP;
2111         if (cops->change)
2112                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2113         if (err == 0) {
2114                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2115                 /* We just create a new class, need to do reverse binding. */
2116                 if (cl != new_cl)
2117                         tc_bind_tclass(q, portid, clid, new_cl);
2118         }
2119 out:
2120         return err;
2121 }
2122
2123 struct qdisc_dump_args {
2124         struct qdisc_walker     w;
2125         struct sk_buff          *skb;
2126         struct netlink_callback *cb;
2127 };
2128
2129 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2130                             struct qdisc_walker *arg)
2131 {
2132         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2133
2134         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2135                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2136                               RTM_NEWTCLASS);
2137 }
2138
2139 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2140                                 struct tcmsg *tcm, struct netlink_callback *cb,
2141                                 int *t_p, int s_t)
2142 {
2143         struct qdisc_dump_args arg;
2144
2145         if (tc_qdisc_dump_ignore(q, false) ||
2146             *t_p < s_t || !q->ops->cl_ops ||
2147             (tcm->tcm_parent &&
2148              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2149                 (*t_p)++;
2150                 return 0;
2151         }
2152         if (*t_p > s_t)
2153                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2154         arg.w.fn = qdisc_class_dump;
2155         arg.skb = skb;
2156         arg.cb = cb;
2157         arg.w.stop  = 0;
2158         arg.w.skip = cb->args[1];
2159         arg.w.count = 0;
2160         q->ops->cl_ops->walk(q, &arg.w);
2161         cb->args[1] = arg.w.count;
2162         if (arg.w.stop)
2163                 return -1;
2164         (*t_p)++;
2165         return 0;
2166 }
2167
2168 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2169                                struct tcmsg *tcm, struct netlink_callback *cb,
2170                                int *t_p, int s_t, bool recur)
2171 {
2172         struct Qdisc *q;
2173         int b;
2174
2175         if (!root)
2176                 return 0;
2177
2178         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2179                 return -1;
2180
2181         if (!qdisc_dev(root) || !recur)
2182                 return 0;
2183
2184         if (tcm->tcm_parent) {
2185                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2186                 if (q && q != root &&
2187                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2188                         return -1;
2189                 return 0;
2190         }
2191         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2192                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2193                         return -1;
2194         }
2195
2196         return 0;
2197 }
2198
2199 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2200 {
2201         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2202         struct net *net = sock_net(skb->sk);
2203         struct netdev_queue *dev_queue;
2204         struct net_device *dev;
2205         int t, s_t;
2206
2207         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2208                 return 0;
2209         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2210         if (!dev)
2211                 return 0;
2212
2213         s_t = cb->args[0];
2214         t = 0;
2215
2216         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2217                                 skb, tcm, cb, &t, s_t, true) < 0)
2218                 goto done;
2219
2220         dev_queue = dev_ingress_queue(dev);
2221         if (dev_queue &&
2222             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2223                                 &t, s_t, false) < 0)
2224                 goto done;
2225
2226 done:
2227         cb->args[0] = t;
2228
2229         dev_put(dev);
2230         return skb->len;
2231 }
2232
2233 #ifdef CONFIG_PROC_FS
2234 static int psched_show(struct seq_file *seq, void *v)
2235 {
2236         seq_printf(seq, "%08x %08x %08x %08x\n",
2237                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2238                    1000000,
2239                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2240
2241         return 0;
2242 }
2243
2244 static int __net_init psched_net_init(struct net *net)
2245 {
2246         struct proc_dir_entry *e;
2247
2248         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2249         if (e == NULL)
2250                 return -ENOMEM;
2251
2252         return 0;
2253 }
2254
2255 static void __net_exit psched_net_exit(struct net *net)
2256 {
2257         remove_proc_entry("psched", net->proc_net);
2258 }
2259 #else
2260 static int __net_init psched_net_init(struct net *net)
2261 {
2262         return 0;
2263 }
2264
2265 static void __net_exit psched_net_exit(struct net *net)
2266 {
2267 }
2268 #endif
2269
2270 static struct pernet_operations psched_net_ops = {
2271         .init = psched_net_init,
2272         .exit = psched_net_exit,
2273 };
2274
2275 static int __init pktsched_init(void)
2276 {
2277         int err;
2278
2279         err = register_pernet_subsys(&psched_net_ops);
2280         if (err) {
2281                 pr_err("pktsched_init: "
2282                        "cannot initialize per netns operations\n");
2283                 return err;
2284         }
2285
2286         register_qdisc(&pfifo_fast_ops);
2287         register_qdisc(&pfifo_qdisc_ops);
2288         register_qdisc(&bfifo_qdisc_ops);
2289         register_qdisc(&pfifo_head_drop_qdisc_ops);
2290         register_qdisc(&mq_qdisc_ops);
2291         register_qdisc(&noqueue_qdisc_ops);
2292
2293         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2294         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2295         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2296                       0);
2297         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2298         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2299         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2300                       0);
2301
2302         return 0;
2303 }
2304
2305 subsys_initcall(pktsched_init);