1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_cbq.c  Class-Based Queueing discipline.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7
8 #include <linux/module.h>
9 #include <linux/slab.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/string.h>
13 #include <linux/errno.h>
14 #include <linux/skbuff.h>
15 #include <net/netlink.h>
16 #include <net/pkt_sched.h>
17 #include <net/pkt_cls.h>
18
19
20 /*      Class-Based Queueing (CBQ) algorithm.
21         =======================================
22
23         Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
24                  Management Models for Packet Networks",
25                  IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
26
27                  [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
28
29                  [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
30                  Parameters", 1996
31
32                  [4] Sally Floyd and Michael Speer, "Experimental Results
33                  for Class-Based Queueing", 1998, not published.
34
35         -----------------------------------------------------------------------
36
37         Algorithm skeleton was taken from NS simulator cbq.cc.
38         If someone wants to check this code against the LBL version,
39         he should take into account that ONLY the skeleton was borrowed,
40         the implementation is different. Particularly:
41
42         --- The WRR algorithm is different. Our version looks more
43         reasonable (I hope) and works when quanta are allowed to be
44         less than MTU, which is always the case when real-time classes
45         have small rates. Note that the statement of [3] is
46         incomplete; delay may actually be estimated even if the class
47         per-round allotment is less than MTU. Namely, if per-round
48         allotment is W*r_i, and r_1+...+r_k = r < 1
49
50         delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
51
52         In the worst case we have IntServ estimate with D = W*r+k*MTU
53         and C = MTU*r. The proof (if correct at all) is trivial.
54
55
56         --- It seems that cbq-2.0 is not very accurate. At least, I cannot
57         interpret some places, which look like wrong translations
58         from NS. Anyone is advised to find these differences
59         and explain to me why I am wrong 8).
60
61         --- Linux has no EOI event, so we cannot estimate true class
62         idle time. The workaround is to consider the next dequeue event
63         as a sign that the previous packet has finished. This is wrong because of
64         internal device queueing, but on a permanently loaded link it is true.
65         Moreover, combined with clock integrator, this scheme looks
66         very close to an ideal solution.  */
67
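/*      Worked example of the delay bound above (numbers are illustrative
        only, not taken from [1]-[4]): with link bandwidth B = 10 Mbit/s,
        MTU = 1500 bytes, k = 4 real-time classes with r_i = 0.1 each
        (so r = 0.4) and per-round allotment W*r_i = 500 bytes (W = 5000),

        delay_i <= (3*5000*0.4 + 5000*0.4 + 4*1500)/B = 14000 bytes/B ~= 11.2 ms

        which matches the IntServ-style estimate with D = W*r + k*MTU = 8000
        and C = MTU*r = 600 bytes, since C/r_i + D = 6000 + 8000 = 14000.  */
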
68 struct cbq_sched_data;
69
70
71 struct cbq_class {
72         struct Qdisc_class_common common;
73         struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
74
75 /* Parameters */
76         unsigned char           priority;       /* class priority */
77         unsigned char           priority2;      /* priority to be used after overlimit */
78         unsigned char           ewma_log;       /* time constant for idle time calculation */
79
80         u32                     defmap;
81
82         /* Link-sharing scheduler parameters */
83         long                    maxidle;        /* Class parameters: see below. */
84         long                    offtime;
85         long                    minidle;
86         u32                     avpkt;
87         struct qdisc_rate_table *R_tab;
88
89         /* General scheduler (WRR) parameters */
90         long                    allot;
91         long                    quantum;        /* Allotment per WRR round */
92         long                    weight;         /* Relative allotment: see below */
93
94         struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
95         struct cbq_class        *split;         /* Ptr to split node */
96         struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
97         struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
98         struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
99                                                    parent otherwise */
100         struct cbq_class        *sibling;       /* Sibling chain */
101         struct cbq_class        *children;      /* Pointer to children chain */
102
103         struct Qdisc            *q;             /* Elementary queueing discipline */
104
105
106 /* Variables */
107         unsigned char           cpriority;      /* Effective priority */
108         unsigned char           delayed;
109         unsigned char           level;          /* level of the class in hierarchy:
110                                                    0 for leaf classes, and maximal
111                                                    level of children + 1 for nodes.
112                                                  */
113
114         psched_time_t           last;           /* Last end of service */
115         psched_time_t           undertime;
116         long                    avgidle;
117         long                    deficit;        /* Saved deficit for WRR */
118         psched_time_t           penalized;
119         struct gnet_stats_basic_sync bstats;
120         struct gnet_stats_queue qstats;
121         struct net_rate_estimator __rcu *rate_est;
122         struct tc_cbq_xstats    xstats;
123
124         struct tcf_proto __rcu  *filter_list;
125         struct tcf_block        *block;
126
127         int                     filters;
128
129         struct cbq_class        *defaults[TC_PRIO_MAX + 1];
130 };
131
132 struct cbq_sched_data {
133         struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
134         int                     nclasses[TC_CBQ_MAXPRIO + 1];
135         unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
136
137         struct cbq_class        link;
138
139         unsigned int            activemask;
140         struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
141                                                                    with backlog */
142
143 #ifdef CONFIG_NET_CLS_ACT
144         struct cbq_class        *rx_class;
145 #endif
146         struct cbq_class        *tx_class;
147         struct cbq_class        *tx_borrowed;
148         int                     tx_len;
149         psched_time_t           now;            /* Cached timestamp */
150         unsigned int            pmask;
151
152         struct qdisc_watchdog   watchdog;       /* Watchdog timer,
153                                                    started when CBQ has
154                                                    backlog, but cannot
155                                                    transmit just now */
156         psched_tdiff_t          wd_expires;
157         int                     toplevel;
158         u32                     hgenerator;
159 };
160
161
162 #define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
163
164 static inline struct cbq_class *
165 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
166 {
167         struct Qdisc_class_common *clc;
168
169         clc = qdisc_class_find(&q->clhash, classid);
170         if (clc == NULL)
171                 return NULL;
172         return container_of(clc, struct cbq_class, common);
173 }
174
175 #ifdef CONFIG_NET_CLS_ACT
176
177 static struct cbq_class *
178 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
179 {
180         struct cbq_class *cl;
181
182         for (cl = this->tparent; cl; cl = cl->tparent) {
183                 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
184
185                 if (new != NULL && new != this)
186                         return new;
187         }
188         return NULL;
189 }
190
191 #endif
192
193 /* Classify packet. The procedure is pretty complicated, but
194  * it allows us to combine link sharing and priority scheduling
195  * transparently.
196  *
197  * Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
198  * so that classification resolves to split nodes. Then packets are classified
199  * by logical priority, or a more specific classifier may be attached
200  * to the split node.
201  */
202
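/* Example (handles hypothetical): with this qdisc installed as 1:, a packet
 * whose skb->priority already holds the classid 1:10 (set e.g. via
 * SO_PRIORITY or an act_skbedit action) is caught by Step 1 below and goes
 * straight to class 1:10; only otherwise is the filter chain walked.
 */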
203 static struct cbq_class *
204 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
205 {
206         struct cbq_sched_data *q = qdisc_priv(sch);
207         struct cbq_class *head = &q->link;
208         struct cbq_class **defmap;
209         struct cbq_class *cl = NULL;
210         u32 prio = skb->priority;
211         struct tcf_proto *fl;
212         struct tcf_result res;
213
214         /*
215          *  Step 1. If skb->priority points to one of our classes, use it.
216          */
217         if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
218             (cl = cbq_class_lookup(q, prio)) != NULL)
219                 return cl;
220
221         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
222         for (;;) {
223                 int result = 0;
224                 defmap = head->defaults;
225
226                 fl = rcu_dereference_bh(head->filter_list);
227                 /*
228                  * Step 2+n. Apply classifier.
229                  */
230                 result = tcf_classify(skb, NULL, fl, &res, true);
231                 if (!fl || result < 0)
232                         goto fallback;
233
234                 cl = (void *)res.class;
235                 if (!cl) {
236                         if (TC_H_MAJ(res.classid))
237                                 cl = cbq_class_lookup(q, res.classid);
238                         else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
239                                 cl = defmap[TC_PRIO_BESTEFFORT];
240
241                         if (cl == NULL)
242                                 goto fallback;
243                 }
244                 if (cl->level >= head->level)
245                         goto fallback;
246 #ifdef CONFIG_NET_CLS_ACT
247                 switch (result) {
248                 case TC_ACT_QUEUED:
249                 case TC_ACT_STOLEN:
250                 case TC_ACT_TRAP:
251                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
252                         fallthrough;
253                 case TC_ACT_SHOT:
254                         return NULL;
255                 case TC_ACT_RECLASSIFY:
256                         return cbq_reclassify(skb, cl);
257                 }
258 #endif
259                 if (cl->level == 0)
260                         return cl;
261
262                 /*
263                  * Step 3+n. If the classifier selected a link-sharing class,
264                  *         apply the agency-specific classifier.
265                  *         Repeat this procedure until we hit a leaf node.
266                  */
267                 head = cl;
268         }
269
270 fallback:
271         cl = head;
272
273         /*
274          * Step 4. No success...
275          */
276         if (TC_H_MAJ(prio) == 0 &&
277             !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
278             !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
279                 return head;
280
281         return cl;
282 }
283
284 /*
285  * A packet has just been enqueued to an empty class.
286  * cbq_activate_class adds the class to the tail of the active class list
287  * of its priority band.
288  */
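/*
 * The active list of each band is a circular singly linked list threaded
 * through ->next_alive; q->active[prio] points at its tail (the most
 * recently activated class), so a dequeue round starts at
 * q->active[prio]->next_alive.  E.g. with tail T whose next is head H,
 * activating C gives T->C->H..., and C becomes the new tail.
 */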
289
290 static inline void cbq_activate_class(struct cbq_class *cl)
291 {
292         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
293         int prio = cl->cpriority;
294         struct cbq_class *cl_tail;
295
296         cl_tail = q->active[prio];
297         q->active[prio] = cl;
298
299         if (cl_tail != NULL) {
300                 cl->next_alive = cl_tail->next_alive;
301                 cl_tail->next_alive = cl;
302         } else {
303                 cl->next_alive = cl;
304                 q->activemask |= (1<<prio);
305         }
306 }
307
308 /*
309  * Unlink the class from the active chain.
310  * Note that this same procedure is also done directly in cbq_dequeue*
311  * during the round-robin procedure.
312  */
313
314 static void cbq_deactivate_class(struct cbq_class *this)
315 {
316         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
317         int prio = this->cpriority;
318         struct cbq_class *cl;
319         struct cbq_class *cl_prev = q->active[prio];
320
321         do {
322                 cl = cl_prev->next_alive;
323                 if (cl == this) {
324                         cl_prev->next_alive = cl->next_alive;
325                         cl->next_alive = NULL;
326
327                         if (cl == q->active[prio]) {
328                                 q->active[prio] = cl_prev;
329                                 if (cl == q->active[prio]) {
330                                         q->active[prio] = NULL;
331                                         q->activemask &= ~(1<<prio);
332                                         return;
333                                 }
334                         }
335                         return;
336                 }
337         } while ((cl_prev = cl) != q->active[prio]);
338 }
339
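/*
 * q->toplevel is the top-level link-sharing cutoff: cbq_under_limit() refuses
 * to borrow from ancestors whose level exceeds it.  On enqueue,
 * cbq_mark_toplevel() lowers the cutoff to the level of the first class in
 * the borrow chain that is currently underlimit (its undertime lies in the
 * past).
 */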
340 static void
341 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
342 {
343         int toplevel = q->toplevel;
344
345         if (toplevel > cl->level) {
346                 psched_time_t now = psched_get_time();
347
348                 do {
349                         if (cl->undertime < now) {
350                                 q->toplevel = cl->level;
351                                 return;
352                         }
353                 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
354         }
355 }
356
357 static int
358 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
359             struct sk_buff **to_free)
360 {
361         struct cbq_sched_data *q = qdisc_priv(sch);
362         int ret;
363         struct cbq_class *cl = cbq_classify(skb, sch, &ret);
364
365 #ifdef CONFIG_NET_CLS_ACT
366         q->rx_class = cl;
367 #endif
368         if (cl == NULL) {
369                 if (ret & __NET_XMIT_BYPASS)
370                         qdisc_qstats_drop(sch);
371                 __qdisc_drop(skb, to_free);
372                 return ret;
373         }
374
375         ret = qdisc_enqueue(skb, cl->q, to_free);
376         if (ret == NET_XMIT_SUCCESS) {
377                 sch->q.qlen++;
378                 cbq_mark_toplevel(q, cl);
379                 if (!cl->next_alive)
380                         cbq_activate_class(cl);
381                 return ret;
382         }
383
384         if (net_xmit_drop_count(ret)) {
385                 qdisc_qstats_drop(sch);
386                 cbq_mark_toplevel(q, cl);
387                 cl->qstats.drops++;
388         }
389         return ret;
390 }
391
392 /* Overlimit action: penalize leaf class by adding offtime */
393 static void cbq_overlimit(struct cbq_class *cl)
394 {
395         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
396         psched_tdiff_t delay = cl->undertime - q->now;
397
398         if (!cl->delayed) {
399                 delay += cl->offtime;
400
401                 /*
402                  * The class goes to sleep, so it will have no
403                  * chance to work off its avgidle. Let's forgive it 8)
404                  *
405                  * BTW cbq-2.0 has a bug in this
406                  * place; apparently they forgot to shift it by cl->ewma_log.
407                  */
408                 if (cl->avgidle < 0)
409                         delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
410                 if (cl->avgidle < cl->minidle)
411                         cl->avgidle = cl->minidle;
412                 if (delay <= 0)
413                         delay = 1;
414                 cl->undertime = q->now + delay;
415
416                 cl->xstats.overactions++;
417                 cl->delayed = 1;
418         }
419         if (q->wd_expires == 0 || q->wd_expires > delay)
420                 q->wd_expires = delay;
421
422         /* Dirty work! We must schedule wakeups based on
423          * real available rate, rather than leaf rate,
424          * which may be tiny (even zero).
425          */
426         if (q->toplevel == TC_CBQ_MAXLEVEL) {
427                 struct cbq_class *b;
428                 psched_tdiff_t base_delay = q->wd_expires;
429
430                 for (b = cl->borrow; b; b = b->borrow) {
431                         delay = b->undertime - q->now;
432                         if (delay < base_delay) {
433                                 if (delay <= 0)
434                                         delay = 1;
435                                 base_delay = delay;
436                         }
437                 }
438
439                 q->wd_expires = base_delay;
440         }
441 }
442
443 /*
444  * This is a mission-critical procedure.
445  *
446  * We "regenerate" the toplevel cutoff if the transmitting class
447  * has backlog and is not regulated. This is not part of the
448  * original CBQ description, but it looks more reasonable.
449  * Probably it is wrong. This question needs further investigation.
450  */
451
452 static inline void
453 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
454                     struct cbq_class *borrowed)
455 {
456         if (cl && q->toplevel >= borrowed->level) {
457                 if (cl->q->q.qlen > 1) {
458                         do {
459                                 if (borrowed->undertime == PSCHED_PASTPERFECT) {
460                                         q->toplevel = borrowed->level;
461                                         return;
462                                 }
463                         } while ((borrowed = borrowed->borrow) != NULL);
464                 }
465 #if 0
466         /* This is not necessary now. Uncommenting it
467            will save CPU cycles but decrease fairness.
468          */
469                 q->toplevel = TC_CBQ_MAXLEVEL;
470 #endif
471         }
472 }
473
474 static void
475 cbq_update(struct cbq_sched_data *q)
476 {
477         struct cbq_class *this = q->tx_class;
478         struct cbq_class *cl = this;
479         int len = q->tx_len;
480         psched_time_t now;
481
482         q->tx_class = NULL;
483         /* Time integrator. We calculate the EOS (end-of-service) time
484          * by adding the expected packet transmission time.
485          */
486         now = q->now + L2T(&q->link, len);
487
488         for ( ; cl; cl = cl->share) {
489                 long avgidle = cl->avgidle;
490                 long idle;
491
492                 _bstats_update(&cl->bstats, len, 1);
493
494                 /*
495                  * (now - last) is total time between packet right edges.
496                  * (last_pktlen/rate) is "virtual" busy time, so that
497                  *
498                  *      idle = (now - last) - last_pktlen/rate
499                  */
500
501                 idle = now - cl->last;
502                 if ((unsigned long)idle > 128*1024*1024) {
503                         avgidle = cl->maxidle;
504                 } else {
505                         idle -= L2T(cl, len);
506
507                 /* true_avgidle := (1-W)*true_avgidle + W*idle,
508                  * where W=2^{-ewma_log}. But cl->avgidle is scaled:
509                  * cl->avgidle == true_avgidle/W,
510                  * hence:
511                  */
512                         avgidle += idle - (avgidle>>cl->ewma_log);
513                 }
514
515                 if (avgidle <= 0) {
516                         /* Overlimit or at-limit */
517
518                         if (avgidle < cl->minidle)
519                                 avgidle = cl->minidle;
520
521                         cl->avgidle = avgidle;
522
523                         /* Calculate the expected time when this class
524                          * will be allowed to send.
525                          * That occurs when:
526                          * (1-W)*true_avgidle + W*delay = 0, i.e.
527                          * idle = (1/W - 1)*(-true_avgidle)
528                          * or
529                          * idle = (1 - W)*(-cl->avgidle);
530                          */
531                         idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
532
533                         /*
534                          * That is not all.
535                          * To maintain the rate allocated to the class,
536                          * we add to undertime the virtual clock time
537                          * necessary to complete the transmitted packet.
538                          * (len/phys_bandwidth has already passed
539                          * by the moment of cbq_update.)
540                          */
541
542                         idle -= L2T(&q->link, len);
543                         idle += L2T(cl, len);
544
545                         cl->undertime = now + idle;
546                 } else {
547                         /* Underlimit */
548
549                         cl->undertime = PSCHED_PASTPERFECT;
550                         if (avgidle > cl->maxidle)
551                                 cl->avgidle = cl->maxidle;
552                         else
553                                 cl->avgidle = avgidle;
554                 }
555                 if ((s64)(now - cl->last) > 0)
556                         cl->last = now;
557         }
558
559         cbq_update_toplevel(q, this, q->tx_borrowed);
560 }
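
/*
 * Numeric sketch of the scaled EWMA above (illustrative only, not compiled):
 * with ewma_log = 5, W = 1/32, so cl->avgidle stores 32 * true_avgidle.
 * Starting from avgidle = 3200 (true value 100) and a measured idle of 0,
 * the update gives 3200 + 0 - (3200 >> 5) = 3100, i.e. true value 96.875,
 * which is exactly (1 - W)*100 + W*0.
 */
#if 0
static long cbq_scaled_ewma_example(long scaled_avgidle, long idle, int ewma_log)
{
	/* same form as the in-line update in cbq_update() */
	return scaled_avgidle + idle - (scaled_avgidle >> ewma_log);
}
#endif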
561
562 static inline struct cbq_class *
563 cbq_under_limit(struct cbq_class *cl)
564 {
565         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
566         struct cbq_class *this_cl = cl;
567
568         if (cl->tparent == NULL)
569                 return cl;
570
571         if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
572                 cl->delayed = 0;
573                 return cl;
574         }
575
576         do {
577                 /* This is a very suspicious place. Currently the overlimit
578                  * action is generated for non-bounded classes
579                  * only if the link is completely congested.
580                  * Though this agrees with the ancestor-only paradigm,
581                  * it looks very stupid. In particular,
582                  * it means that this chunk of code will either
583                  * never be called or result in strong amplification
584                  * of burstiness. Dangerous, silly, and yet
585                  * no other solution exists.
586                  */
587                 cl = cl->borrow;
588                 if (!cl) {
589                         this_cl->qstats.overlimits++;
590                         cbq_overlimit(this_cl);
591                         return NULL;
592                 }
593                 if (cl->level > q->toplevel)
594                         return NULL;
595         } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
596
597         cl->delayed = 0;
598         return cl;
599 }
600
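/*
 * One WRR pass over the active classes of a single priority band.  Each
 * packet sent is charged against the class's deficit, and a class whose
 * deficit has run out gets ->quantum more credit and is skipped until the
 * next round; with a quantum of roughly one MTU a class therefore sends on
 * the order of one full-sized packet per round.
 */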
601 static inline struct sk_buff *
602 cbq_dequeue_prio(struct Qdisc *sch, int prio)
603 {
604         struct cbq_sched_data *q = qdisc_priv(sch);
605         struct cbq_class *cl_tail, *cl_prev, *cl;
606         struct sk_buff *skb;
607         int deficit;
608
609         cl_tail = cl_prev = q->active[prio];
610         cl = cl_prev->next_alive;
611
612         do {
613                 deficit = 0;
614
615                 /* Start round */
616                 do {
617                         struct cbq_class *borrow = cl;
618
619                         if (cl->q->q.qlen &&
620                             (borrow = cbq_under_limit(cl)) == NULL)
621                                 goto skip_class;
622
623                         if (cl->deficit <= 0) {
624                                 /* Class exhausted its allotment for
625                                  * this round. Switch to the next one.
626                                  */
627                                 deficit = 1;
628                                 cl->deficit += cl->quantum;
629                                 goto next_class;
630                         }
631
632                         skb = cl->q->dequeue(cl->q);
633
634                         /* Class did not give us any skb :-(
635                          * This can happen even if cl->q->q.qlen != 0,
636                          * e.g. if cl->q == "tbf"
637                          */
638                         if (skb == NULL)
639                                 goto skip_class;
640
641                         cl->deficit -= qdisc_pkt_len(skb);
642                         q->tx_class = cl;
643                         q->tx_borrowed = borrow;
644                         if (borrow != cl) {
645 #ifndef CBQ_XSTATS_BORROWS_BYTES
646                                 borrow->xstats.borrows++;
647                                 cl->xstats.borrows++;
648 #else
649                                 borrow->xstats.borrows += qdisc_pkt_len(skb);
650                                 cl->xstats.borrows += qdisc_pkt_len(skb);
651 #endif
652                         }
653                         q->tx_len = qdisc_pkt_len(skb);
654
655                         if (cl->deficit <= 0) {
656                                 q->active[prio] = cl;
657                                 cl = cl->next_alive;
658                                 cl->deficit += cl->quantum;
659                         }
660                         return skb;
661
662 skip_class:
663                         if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
664                                 /* Class is empty or penalized.
665                                  * Unlink it from active chain.
666                                  */
667                                 cl_prev->next_alive = cl->next_alive;
668                                 cl->next_alive = NULL;
669
670                                 /* Did cl_tail point to it? */
671                                 if (cl == cl_tail) {
672                                         /* Repair it! */
673                                         cl_tail = cl_prev;
674
675                                         /* Was it the last class in this band? */
676                                         if (cl == cl_tail) {
677                                                 /* Kill the band! */
678                                                 q->active[prio] = NULL;
679                                                 q->activemask &= ~(1<<prio);
680                                                 if (cl->q->q.qlen)
681                                                         cbq_activate_class(cl);
682                                                 return NULL;
683                                         }
684
685                                         q->active[prio] = cl_tail;
686                                 }
687                                 if (cl->q->q.qlen)
688                                         cbq_activate_class(cl);
689
690                                 cl = cl_prev;
691                         }
692
693 next_class:
694                         cl_prev = cl;
695                         cl = cl->next_alive;
696                 } while (cl_prev != cl_tail);
697         } while (deficit);
698
699         q->active[prio] = cl_prev;
700
701         return NULL;
702 }
703
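/*
 * Scan the priority bands in order.  q->activemask has bit <prio> set for
 * every band with backlogged classes, and ffz(~mask) yields its lowest set
 * bit, i.e. the highest-priority (lowest-numbered) non-empty band.
 */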
704 static inline struct sk_buff *
705 cbq_dequeue_1(struct Qdisc *sch)
706 {
707         struct cbq_sched_data *q = qdisc_priv(sch);
708         struct sk_buff *skb;
709         unsigned int activemask;
710
711         activemask = q->activemask & 0xFF;
712         while (activemask) {
713                 int prio = ffz(~activemask);
714                 activemask &= ~(1<<prio);
715                 skb = cbq_dequeue_prio(sch, prio);
716                 if (skb)
717                         return skb;
718         }
719         return NULL;
720 }
721
722 static struct sk_buff *
723 cbq_dequeue(struct Qdisc *sch)
724 {
725         struct sk_buff *skb;
726         struct cbq_sched_data *q = qdisc_priv(sch);
727         psched_time_t now;
728
729         now = psched_get_time();
730
731         if (q->tx_class)
732                 cbq_update(q);
733
734         q->now = now;
735
736         for (;;) {
737                 q->wd_expires = 0;
738
739                 skb = cbq_dequeue_1(sch);
740                 if (skb) {
741                         qdisc_bstats_update(sch, skb);
742                         sch->q.qlen--;
743                         return skb;
744                 }
745
746                 /* All the classes are overlimit.
747                  *
748                  * This is possible if:
749                  *
750                  * 1. The scheduler is empty.
751                  * 2. The toplevel cutoff inhibited borrowing.
752                  * 3. The root class is overlimit.
753                  *
754                  * Reset the 2nd and 3rd conditions and retry.
755                  *
756                  * Note that NS and cbq-2.0 are buggy here: peeking
757                  * an arbitrary class is appropriate for ancestor-only
758                  * sharing, but not for the toplevel algorithm.
759                  *
760                  * Our version is better but slower, because it requires
761                  * two passes; that is unavoidable with top-level sharing.
762                  */
763
764                 if (q->toplevel == TC_CBQ_MAXLEVEL &&
765                     q->link.undertime == PSCHED_PASTPERFECT)
766                         break;
767
768                 q->toplevel = TC_CBQ_MAXLEVEL;
769                 q->link.undertime = PSCHED_PASTPERFECT;
770         }
771
772         /* No packets in the scheduler, or nobody wants to give them to us :-(
773          * Sigh... start the watchdog timer in the latter case.
774          */
775
776         if (sch->q.qlen) {
777                 qdisc_qstats_overlimit(sch);
778                 if (q->wd_expires)
779                         qdisc_watchdog_schedule(&q->watchdog,
780                                                 now + q->wd_expires);
781         }
782         return NULL;
783 }
784
785 /* CBQ class maintenance routines */
786
787 static void cbq_adjust_levels(struct cbq_class *this)
788 {
789         if (this == NULL)
790                 return;
791
792         do {
793                 int level = 0;
794                 struct cbq_class *cl;
795
796                 cl = this->children;
797                 if (cl) {
798                         do {
799                                 if (cl->level > level)
800                                         level = cl->level;
801                         } while ((cl = cl->sibling) != this->children);
802                 }
803                 this->level = level + 1;
804         } while ((this = this->tparent) != NULL);
805 }
806
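/*
 * Re-derive every quantum in a band as allot scaled by the class's weight
 * relative to the band average: quantum = weight*allot*nclasses/sum(weights).
 * E.g. (illustrative) three classes with weights 1, 1 and 2 and allot 1514
 * get quanta of about 1135, 1135 and 2271 bytes.
 */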
807 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
808 {
809         struct cbq_class *cl;
810         unsigned int h;
811
812         if (q->quanta[prio] == 0)
813                 return;
814
815         for (h = 0; h < q->clhash.hashsize; h++) {
816                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
817                         /* BUGGGG... Beware! This expression suffers from
818                          * arithmetic overflow!
819                          */
820                         if (cl->priority == prio) {
821                                 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
822                                         q->quanta[prio];
823                         }
824                         if (cl->quantum <= 0 ||
825                             cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
826                                 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
827                                         cl->common.classid, cl->quantum);
828                                 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
829                         }
830                 }
831         }
832 }
833
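/*
 * Bit i of cl->defmap marks the class as a default candidate for priority i
 * at its split node; cbq_sync_defmap() rebuilds split->defaults[] so that
 * each slot points at the lowest-level class advertising that bit.
 */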
834 static void cbq_sync_defmap(struct cbq_class *cl)
835 {
836         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
837         struct cbq_class *split = cl->split;
838         unsigned int h;
839         int i;
840
841         if (split == NULL)
842                 return;
843
844         for (i = 0; i <= TC_PRIO_MAX; i++) {
845                 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
846                         split->defaults[i] = NULL;
847         }
848
849         for (i = 0; i <= TC_PRIO_MAX; i++) {
850                 int level = split->level;
851
852                 if (split->defaults[i])
853                         continue;
854
855                 for (h = 0; h < q->clhash.hashsize; h++) {
856                         struct cbq_class *c;
857
858                         hlist_for_each_entry(c, &q->clhash.hash[h],
859                                              common.hnode) {
860                                 if (c->split == split && c->level < level &&
861                                     c->defmap & (1<<i)) {
862                                         split->defaults[i] = c;
863                                         level = c->level;
864                                 }
865                         }
866                 }
867         }
868 }
869
870 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
871 {
872         struct cbq_class *split = NULL;
873
874         if (splitid == 0) {
875                 split = cl->split;
876                 if (!split)
877                         return;
878                 splitid = split->common.classid;
879         }
880
881         if (split == NULL || split->common.classid != splitid) {
882                 for (split = cl->tparent; split; split = split->tparent)
883                         if (split->common.classid == splitid)
884                                 break;
885         }
886
887         if (split == NULL)
888                 return;
889
890         if (cl->split != split) {
891                 cl->defmap = 0;
892                 cbq_sync_defmap(cl);
893                 cl->split = split;
894                 cl->defmap = def & mask;
895         } else
896                 cl->defmap = (cl->defmap & ~mask) | (def & mask);
897
898         cbq_sync_defmap(cl);
899 }
900
901 static void cbq_unlink_class(struct cbq_class *this)
902 {
903         struct cbq_class *cl, **clp;
904         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
905
906         qdisc_class_hash_remove(&q->clhash, &this->common);
907
908         if (this->tparent) {
909                 clp = &this->sibling;
910                 cl = *clp;
911                 do {
912                         if (cl == this) {
913                                 *clp = cl->sibling;
914                                 break;
915                         }
916                         clp = &cl->sibling;
917                 } while ((cl = *clp) != this->sibling);
918
919                 if (this->tparent->children == this) {
920                         this->tparent->children = this->sibling;
921                         if (this->sibling == this)
922                                 this->tparent->children = NULL;
923                 }
924         } else {
925                 WARN_ON(this->sibling != this);
926         }
927 }
928
929 static void cbq_link_class(struct cbq_class *this)
930 {
931         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
932         struct cbq_class *parent = this->tparent;
933
934         this->sibling = this;
935         qdisc_class_hash_insert(&q->clhash, &this->common);
936
937         if (parent == NULL)
938                 return;
939
940         if (parent->children == NULL) {
941                 parent->children = this;
942         } else {
943                 this->sibling = parent->children->sibling;
944                 parent->children->sibling = this;
945         }
946 }
947
948 static void
949 cbq_reset(struct Qdisc *sch)
950 {
951         struct cbq_sched_data *q = qdisc_priv(sch);
952         struct cbq_class *cl;
953         int prio;
954         unsigned int h;
955
956         q->activemask = 0;
957         q->pmask = 0;
958         q->tx_class = NULL;
959         q->tx_borrowed = NULL;
960         qdisc_watchdog_cancel(&q->watchdog);
961         q->toplevel = TC_CBQ_MAXLEVEL;
962         q->now = psched_get_time();
963
964         for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
965                 q->active[prio] = NULL;
966
967         for (h = 0; h < q->clhash.hashsize; h++) {
968                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
969                         qdisc_reset(cl->q);
970
971                         cl->next_alive = NULL;
972                         cl->undertime = PSCHED_PASTPERFECT;
973                         cl->avgidle = cl->maxidle;
974                         cl->deficit = cl->quantum;
975                         cl->cpriority = cl->priority;
976                 }
977         }
978         sch->q.qlen = 0;
979 }
980
981
982 static void cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
983 {
984         if (lss->change & TCF_CBQ_LSS_FLAGS) {
985                 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
986                 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
987         }
988         if (lss->change & TCF_CBQ_LSS_EWMA)
989                 cl->ewma_log = lss->ewma_log;
990         if (lss->change & TCF_CBQ_LSS_AVPKT)
991                 cl->avpkt = lss->avpkt;
992         if (lss->change & TCF_CBQ_LSS_MINIDLE)
993                 cl->minidle = -(long)lss->minidle;
994         if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
995                 cl->maxidle = lss->maxidle;
996                 cl->avgidle = lss->maxidle;
997         }
998         if (lss->change & TCF_CBQ_LSS_OFFTIME)
999                 cl->offtime = lss->offtime;
1000 }
1001
1002 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1003 {
1004         q->nclasses[cl->priority]--;
1005         q->quanta[cl->priority] -= cl->weight;
1006         cbq_normalize_quanta(q, cl->priority);
1007 }
1008
1009 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1010 {
1011         q->nclasses[cl->priority]++;
1012         q->quanta[cl->priority] += cl->weight;
1013         cbq_normalize_quanta(q, cl->priority);
1014 }
1015
1016 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1017 {
1018         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1019
1020         if (wrr->allot)
1021                 cl->allot = wrr->allot;
1022         if (wrr->weight)
1023                 cl->weight = wrr->weight;
1024         if (wrr->priority) {
1025                 cl->priority = wrr->priority - 1;
1026                 cl->cpriority = cl->priority;
1027                 if (cl->priority >= cl->priority2)
1028                         cl->priority2 = TC_CBQ_MAXPRIO - 1;
1029         }
1030
1031         cbq_addprio(q, cl);
1032         return 0;
1033 }
1034
1035 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1036 {
1037         cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1038         return 0;
1039 }
1040
1041 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1042         [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1043         [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1044         [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1045         [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1046         [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1047         [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1048         [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1049 };
1050
1051 static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1052                          struct nlattr *opt,
1053                          struct netlink_ext_ack *extack)
1054 {
1055         int err;
1056
1057         if (!opt) {
1058                 NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1059                 return -EINVAL;
1060         }
1061
1062         err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1063                                           cbq_policy, extack);
1064         if (err < 0)
1065                 return err;
1066
1067         if (tb[TCA_CBQ_WRROPT]) {
1068                 const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1069
1070                 if (wrr->priority > TC_CBQ_MAXPRIO) {
1071                         NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1072                         err = -EINVAL;
1073                 }
1074         }
1075         return err;
1076 }
1077
1078 static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1079                     struct netlink_ext_ack *extack)
1080 {
1081         struct cbq_sched_data *q = qdisc_priv(sch);
1082         struct nlattr *tb[TCA_CBQ_MAX + 1];
1083         struct tc_ratespec *r;
1084         int err;
1085
1086         qdisc_watchdog_init(&q->watchdog, sch);
1087
1088         err = cbq_opt_parse(tb, opt, extack);
1089         if (err < 0)
1090                 return err;
1091
1092         if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1093                 NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1094                 return -EINVAL;
1095         }
1096
1097         r = nla_data(tb[TCA_CBQ_RATE]);
1098
1099         q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1100         if (!q->link.R_tab)
1101                 return -EINVAL;
1102
1103         err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1104         if (err)
1105                 goto put_rtab;
1106
1107         err = qdisc_class_hash_init(&q->clhash);
1108         if (err < 0)
1109                 goto put_block;
1110
1111         q->link.sibling = &q->link;
1112         q->link.common.classid = sch->handle;
1113         q->link.qdisc = sch;
1114         q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1115                                       sch->handle, NULL);
1116         if (!q->link.q)
1117                 q->link.q = &noop_qdisc;
1118         else
1119                 qdisc_hash_add(q->link.q, true);
1120
1121         q->link.priority = TC_CBQ_MAXPRIO - 1;
1122         q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1123         q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1124         q->link.allot = psched_mtu(qdisc_dev(sch));
1125         q->link.quantum = q->link.allot;
1126         q->link.weight = q->link.R_tab->rate.rate;
1127
1128         q->link.ewma_log = TC_CBQ_DEF_EWMA;
1129         q->link.avpkt = q->link.allot/2;
1130         q->link.minidle = -0x7FFFFFFF;
1131
1132         q->toplevel = TC_CBQ_MAXLEVEL;
1133         q->now = psched_get_time();
1134
1135         cbq_link_class(&q->link);
1136
1137         if (tb[TCA_CBQ_LSSOPT])
1138                 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1139
1140         cbq_addprio(q, &q->link);
1141         return 0;
1142
1143 put_block:
1144         tcf_block_put(q->link.block);
1145
1146 put_rtab:
1147         qdisc_put_rtab(q->link.R_tab);
1148         return err;
1149 }
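
/*
 * Typical configuration (sketch only, values illustrative):
 *
 *   tc qdisc add dev eth0 root handle 1: cbq bandwidth 100mbit avpkt 1000
 *   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100mbit \
 *           rate 6mbit weight 600kbit allot 1514 prio 5 avpkt 1000 bounded
 *
 * The first command lands in cbq_init() above, the second in
 * cbq_change_class() further below.
 */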
1150
1151 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1152 {
1153         unsigned char *b = skb_tail_pointer(skb);
1154
1155         if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1156                 goto nla_put_failure;
1157         return skb->len;
1158
1159 nla_put_failure:
1160         nlmsg_trim(skb, b);
1161         return -1;
1162 }
1163
1164 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1165 {
1166         unsigned char *b = skb_tail_pointer(skb);
1167         struct tc_cbq_lssopt opt;
1168
1169         opt.flags = 0;
1170         if (cl->borrow == NULL)
1171                 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1172         if (cl->share == NULL)
1173                 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1174         opt.ewma_log = cl->ewma_log;
1175         opt.level = cl->level;
1176         opt.avpkt = cl->avpkt;
1177         opt.maxidle = cl->maxidle;
1178         opt.minidle = (u32)(-cl->minidle);
1179         opt.offtime = cl->offtime;
1180         opt.change = ~0;
1181         if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1182                 goto nla_put_failure;
1183         return skb->len;
1184
1185 nla_put_failure:
1186         nlmsg_trim(skb, b);
1187         return -1;
1188 }
1189
1190 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1191 {
1192         unsigned char *b = skb_tail_pointer(skb);
1193         struct tc_cbq_wrropt opt;
1194
1195         memset(&opt, 0, sizeof(opt));
1196         opt.flags = 0;
1197         opt.allot = cl->allot;
1198         opt.priority = cl->priority + 1;
1199         opt.cpriority = cl->cpriority + 1;
1200         opt.weight = cl->weight;
1201         if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1202                 goto nla_put_failure;
1203         return skb->len;
1204
1205 nla_put_failure:
1206         nlmsg_trim(skb, b);
1207         return -1;
1208 }
1209
1210 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1211 {
1212         unsigned char *b = skb_tail_pointer(skb);
1213         struct tc_cbq_fopt opt;
1214
1215         if (cl->split || cl->defmap) {
1216                 opt.split = cl->split ? cl->split->common.classid : 0;
1217                 opt.defmap = cl->defmap;
1218                 opt.defchange = ~0;
1219                 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1220                         goto nla_put_failure;
1221         }
1222         return skb->len;
1223
1224 nla_put_failure:
1225         nlmsg_trim(skb, b);
1226         return -1;
1227 }
1228
1229 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1230 {
1231         if (cbq_dump_lss(skb, cl) < 0 ||
1232             cbq_dump_rate(skb, cl) < 0 ||
1233             cbq_dump_wrr(skb, cl) < 0 ||
1234             cbq_dump_fopt(skb, cl) < 0)
1235                 return -1;
1236         return 0;
1237 }
1238
1239 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1240 {
1241         struct cbq_sched_data *q = qdisc_priv(sch);
1242         struct nlattr *nest;
1243
1244         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1245         if (nest == NULL)
1246                 goto nla_put_failure;
1247         if (cbq_dump_attr(skb, &q->link) < 0)
1248                 goto nla_put_failure;
1249         return nla_nest_end(skb, nest);
1250
1251 nla_put_failure:
1252         nla_nest_cancel(skb, nest);
1253         return -1;
1254 }
1255
1256 static int
1257 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1258 {
1259         struct cbq_sched_data *q = qdisc_priv(sch);
1260
1261         q->link.xstats.avgidle = q->link.avgidle;
1262         return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1263 }
1264
1265 static int
1266 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1267                struct sk_buff *skb, struct tcmsg *tcm)
1268 {
1269         struct cbq_class *cl = (struct cbq_class *)arg;
1270         struct nlattr *nest;
1271
1272         if (cl->tparent)
1273                 tcm->tcm_parent = cl->tparent->common.classid;
1274         else
1275                 tcm->tcm_parent = TC_H_ROOT;
1276         tcm->tcm_handle = cl->common.classid;
1277         tcm->tcm_info = cl->q->handle;
1278
1279         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1280         if (nest == NULL)
1281                 goto nla_put_failure;
1282         if (cbq_dump_attr(skb, cl) < 0)
1283                 goto nla_put_failure;
1284         return nla_nest_end(skb, nest);
1285
1286 nla_put_failure:
1287         nla_nest_cancel(skb, nest);
1288         return -1;
1289 }
1290
1291 static int
1292 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1293         struct gnet_dump *d)
1294 {
1295         struct cbq_sched_data *q = qdisc_priv(sch);
1296         struct cbq_class *cl = (struct cbq_class *)arg;
1297         __u32 qlen;
1298
1299         cl->xstats.avgidle = cl->avgidle;
1300         cl->xstats.undertime = 0;
1301         qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1302
1303         if (cl->undertime != PSCHED_PASTPERFECT)
1304                 cl->xstats.undertime = cl->undertime - q->now;
1305
1306         if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1307             gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1308             gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1309                 return -1;
1310
1311         return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1312 }
1313
1314 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1315                      struct Qdisc **old, struct netlink_ext_ack *extack)
1316 {
1317         struct cbq_class *cl = (struct cbq_class *)arg;
1318
1319         if (new == NULL) {
1320                 new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1321                                         cl->common.classid, extack);
1322                 if (new == NULL)
1323                         return -ENOBUFS;
1324         }
1325
1326         *old = qdisc_replace(sch, new, &cl->q);
1327         return 0;
1328 }
1329
1330 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1331 {
1332         struct cbq_class *cl = (struct cbq_class *)arg;
1333
1334         return cl->q;
1335 }
1336
1337 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1338 {
1339         struct cbq_class *cl = (struct cbq_class *)arg;
1340
1341         cbq_deactivate_class(cl);
1342 }
1343
1344 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1345 {
1346         struct cbq_sched_data *q = qdisc_priv(sch);
1347
1348         return (unsigned long)cbq_class_lookup(q, classid);
1349 }
1350
1351 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1352 {
1353         struct cbq_sched_data *q = qdisc_priv(sch);
1354
1355         WARN_ON(cl->filters);
1356
1357         tcf_block_put(cl->block);
1358         qdisc_put(cl->q);
1359         qdisc_put_rtab(cl->R_tab);
1360         gen_kill_estimator(&cl->rate_est);
1361         if (cl != &q->link)
1362                 kfree(cl);
1363 }
1364
1365 static void cbq_destroy(struct Qdisc *sch)
1366 {
1367         struct cbq_sched_data *q = qdisc_priv(sch);
1368         struct hlist_node *next;
1369         struct cbq_class *cl;
1370         unsigned int h;
1371
1372 #ifdef CONFIG_NET_CLS_ACT
1373         q->rx_class = NULL;
1374 #endif
1375         /*
1376          * Filters must be destroyed first because we don't destroy the
1377          * classes from root to leaves, which means that filters can still
1378          * be bound to classes which have been destroyed already. --TGR '04
1379          */
1380         for (h = 0; h < q->clhash.hashsize; h++) {
1381                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1382                         tcf_block_put(cl->block);
1383                         cl->block = NULL;
1384                 }
1385         }
1386         for (h = 0; h < q->clhash.hashsize; h++) {
1387                 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1388                                           common.hnode)
1389                         cbq_destroy_class(sch, cl);
1390         }
1391         qdisc_class_hash_destroy(&q->clhash);
1392 }
1393
1394 static int
1395 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1396                  unsigned long *arg, struct netlink_ext_ack *extack)
1397 {
1398         int err;
1399         struct cbq_sched_data *q = qdisc_priv(sch);
1400         struct cbq_class *cl = (struct cbq_class *)*arg;
1401         struct nlattr *opt = tca[TCA_OPTIONS];
1402         struct nlattr *tb[TCA_CBQ_MAX + 1];
1403         struct cbq_class *parent;
1404         struct qdisc_rate_table *rtab = NULL;
1405
1406         err = cbq_opt_parse(tb, opt, extack);
1407         if (err < 0)
1408                 return err;
1409
1410         if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1411                 NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1412                 return -EOPNOTSUPP;
1413         }
1414
1415         if (cl) {
1416                 /* Check parent */
1417                 if (parentid) {
1418                         if (cl->tparent &&
1419                             cl->tparent->common.classid != parentid) {
1420                                 NL_SET_ERR_MSG(extack, "Invalid parent id");
1421                                 return -EINVAL;
1422                         }
1423                         if (!cl->tparent && parentid != TC_H_ROOT) {
1424                                 NL_SET_ERR_MSG(extack, "Parent must be root");
1425                                 return -EINVAL;
1426                         }
1427                 }
1428
1429                 if (tb[TCA_CBQ_RATE]) {
1430                         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1431                                               tb[TCA_CBQ_RTAB], extack);
1432                         if (rtab == NULL)
1433                                 return -EINVAL;
1434                 }
1435
1436                 if (tca[TCA_RATE]) {
1437                         err = gen_replace_estimator(&cl->bstats, NULL,
1438                                                     &cl->rate_est,
1439                                                     NULL,
1440                                                     true,
1441                                                     tca[TCA_RATE]);
1442                         if (err) {
1443                                 NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1444                                 qdisc_put_rtab(rtab);
1445                                 return err;
1446                         }
1447                 }
1448
1449                 /* Change class parameters */
1450                 sch_tree_lock(sch);
1451
1452                 if (cl->next_alive != NULL)
1453                         cbq_deactivate_class(cl);
1454
1455                 if (rtab) {
1456                         qdisc_put_rtab(cl->R_tab);
1457                         cl->R_tab = rtab;
1458                 }
1459
1460                 if (tb[TCA_CBQ_LSSOPT])
1461                         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1462
1463                 if (tb[TCA_CBQ_WRROPT]) {
1464                         cbq_rmprio(q, cl);
1465                         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1466                 }
1467
1468                 if (tb[TCA_CBQ_FOPT])
1469                         cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1470
1471                 if (cl->q->q.qlen)
1472                         cbq_activate_class(cl);
1473
1474                 sch_tree_unlock(sch);
1475
1476                 return 0;
1477         }
1478
1479         if (parentid == TC_H_ROOT)
1480                 return -EINVAL;
1481
1482         if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1483                 NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1484                 return -EINVAL;
1485         }
1486
1487         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1488                               extack);
1489         if (rtab == NULL)
1490                 return -EINVAL;
1491
1492         if (classid) {
1493                 err = -EINVAL;
1494                 if (TC_H_MAJ(classid ^ sch->handle) ||
1495                     cbq_class_lookup(q, classid)) {
1496                         NL_SET_ERR_MSG(extack, "Specified class not found");
1497                         goto failure;
1498                 }
1499         } else {
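                /* No classid supplied: generate a free minor number above
                 * 0x8000 using the rotating hgenerator counter; if nothing
                 * is free after 0x8000 attempts, fail with -ENOSR.
                 */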
1500                 int i;
1501                 classid = TC_H_MAKE(sch->handle, 0x8000);
1502
1503                 for (i = 0; i < 0x8000; i++) {
1504                         if (++q->hgenerator >= 0x8000)
1505                                 q->hgenerator = 1;
1506                         if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1507                                 break;
1508                 }
1509                 err = -ENOSR;
1510                 if (i >= 0x8000) {
1511                         NL_SET_ERR_MSG(extack, "Unable to generate classid");
1512                         goto failure;
1513                 }
1514                 classid = classid|q->hgenerator;
1515         }
1516
1517         parent = &q->link;
1518         if (parentid) {
1519                 parent = cbq_class_lookup(q, parentid);
1520                 err = -EINVAL;
1521                 if (!parent) {
1522                         NL_SET_ERR_MSG(extack, "Failed to find parentid");
1523                         goto failure;
1524                 }
1525         }
1526
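        /* Allocate the new class and give it a default pfifo child qdisc,
         * falling back to the noop qdisc if that allocation fails.
         */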
1527         err = -ENOBUFS;
1528         cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1529         if (cl == NULL)
1530                 goto failure;
1531
1532         gnet_stats_basic_sync_init(&cl->bstats);
1533         err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1534         if (err) {
1535                 kfree(cl);
1536                 goto failure;
1537         }
1538
1539         if (tca[TCA_RATE]) {
1540                 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1541                                         NULL, true, tca[TCA_RATE]);
1542                 if (err) {
1543                         NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1544                         tcf_block_put(cl->block);
1545                         kfree(cl);
1546                         goto failure;
1547                 }
1548         }
1549
1550         cl->R_tab = rtab;
1551         rtab = NULL;
1552         cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1553                                   NULL);
1554         if (!cl->q)
1555                 cl->q = &noop_qdisc;
1556         else
1557                 qdisc_hash_add(cl->q, true);
1558
1559         cl->common.classid = classid;
1560         cl->tparent = parent;
1561         cl->qdisc = sch;
1562         cl->allot = parent->allot;
1563         cl->quantum = cl->allot;
1564         cl->weight = cl->R_tab->rate.rate;
1565
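        /* Link the class into the tree and apply the LSS/WRR parameters,
         * inheriting ewma_log, maxidle and avpkt from the root link class
         * where the user left them unset.
         */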
1566         sch_tree_lock(sch);
1567         cbq_link_class(cl);
1568         cl->borrow = cl->tparent;
1569         if (cl->tparent != &q->link)
1570                 cl->share = cl->tparent;
1571         cbq_adjust_levels(parent);
1572         cl->minidle = -0x7FFFFFFF;
1573         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1574         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1575         if (cl->ewma_log == 0)
1576                 cl->ewma_log = q->link.ewma_log;
1577         if (cl->maxidle == 0)
1578                 cl->maxidle = q->link.maxidle;
1579         if (cl->avpkt == 0)
1580                 cl->avpkt = q->link.avpkt;
1581         if (tb[TCA_CBQ_FOPT])
1582                 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1583         sch_tree_unlock(sch);
1584
1585         qdisc_class_hash_grow(sch, &q->clhash);
1586
1587         *arg = (unsigned long)cl;
1588         return 0;
1589
1590 failure:
1591         qdisc_put_rtab(rtab);
1592         return err;
1593 }
1594
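/* Delete a class.  Deletion is refused with -EBUSY while filters are still
 * bound to the class, while it still has children, or for the built-in root
 * (link) class; otherwise the class is drained, unlinked and destroyed.
 */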
1595 static int cbq_delete(struct Qdisc *sch, unsigned long arg,
1596                       struct netlink_ext_ack *extack)
1597 {
1598         struct cbq_sched_data *q = qdisc_priv(sch);
1599         struct cbq_class *cl = (struct cbq_class *)arg;
1600
1601         if (cl->filters || cl->children || cl == &q->link)
1602                 return -EBUSY;
1603
1604         sch_tree_lock(sch);
1605
1606         qdisc_purge_queue(cl->q);
1607
1608         if (cl->next_alive)
1609                 cbq_deactivate_class(cl);
1610
1611         if (q->tx_borrowed == cl)
1612                 q->tx_borrowed = q->tx_class;
1613         if (q->tx_class == cl) {
1614                 q->tx_class = NULL;
1615                 q->tx_borrowed = NULL;
1616         }
1617 #ifdef CONFIG_NET_CLS_ACT
1618         if (q->rx_class == cl)
1619                 q->rx_class = NULL;
1620 #endif
1621
1622         cbq_unlink_class(cl);
1623         cbq_adjust_levels(cl->tparent);
1624         cl->defmap = 0;
1625         cbq_sync_defmap(cl);
1626
1627         cbq_rmprio(q, cl);
1628         sch_tree_unlock(sch);
1629
1630         cbq_destroy_class(sch, cl);
1631         return 0;
1632 }
1633
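/* Return the tcf block that filters attach to; a NULL class argument
 * refers to the root (link) class.
 */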
1634 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1635                                        struct netlink_ext_ack *extack)
1636 {
1637         struct cbq_sched_data *q = qdisc_priv(sch);
1638         struct cbq_class *cl = (struct cbq_class *)arg;
1639
1640         if (cl == NULL)
1641                 cl = &q->link;
1642
1643         return cl->block;
1644 }
1645
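/* Bind a filter result to a class.  Binding is refused when the target
 * class is not strictly below the class the filter is attached to; on
 * success the per-class filter count is bumped so the class cannot be
 * deleted while filters still point at it.
 */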
1646 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1647                                      u32 classid)
1648 {
1649         struct cbq_sched_data *q = qdisc_priv(sch);
1650         struct cbq_class *p = (struct cbq_class *)parent;
1651         struct cbq_class *cl = cbq_class_lookup(q, classid);
1652
1653         if (cl) {
1654                 if (p && p->level <= cl->level)
1655                         return 0;
1656                 cl->filters++;
1657                 return (unsigned long)cl;
1658         }
1659         return 0;
1660 }
1661
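/* Undo cbq_bind_filter(): drop the per-class filter count. */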
1662 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1663 {
1664         struct cbq_class *cl = (struct cbq_class *)arg;
1665
1666         cl->filters--;
1667 }
1668
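/* Walk all classes in the hash table, honouring the walker's skip, count
 * and stop state.
 */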
1669 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1670 {
1671         struct cbq_sched_data *q = qdisc_priv(sch);
1672         struct cbq_class *cl;
1673         unsigned int h;
1674
1675         if (arg->stop)
1676                 return;
1677
1678         for (h = 0; h < q->clhash.hashsize; h++) {
1679                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1680                         if (arg->count < arg->skip) {
1681                                 arg->count++;
1682                                 continue;
1683                         }
1684                         if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1685                                 arg->stop = 1;
1686                                 return;
1687                         }
1688                         arg->count++;
1689                 }
1690         }
1691 }
1692
1693 static const struct Qdisc_class_ops cbq_class_ops = {
1694         .graft          =       cbq_graft,
1695         .leaf           =       cbq_leaf,
1696         .qlen_notify    =       cbq_qlen_notify,
1697         .find           =       cbq_find,
1698         .change         =       cbq_change_class,
1699         .delete         =       cbq_delete,
1700         .walk           =       cbq_walk,
1701         .tcf_block      =       cbq_tcf_block,
1702         .bind_tcf       =       cbq_bind_filter,
1703         .unbind_tcf     =       cbq_unbind_filter,
1704         .dump           =       cbq_dump_class,
1705         .dump_stats     =       cbq_dump_class_stats,
1706 };
1707
1708 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1709         .next           =       NULL,
1710         .cl_ops         =       &cbq_class_ops,
1711         .id             =       "cbq",
1712         .priv_size      =       sizeof(struct cbq_sched_data),
1713         .enqueue        =       cbq_enqueue,
1714         .dequeue        =       cbq_dequeue,
1715         .peek           =       qdisc_peek_dequeued,
1716         .init           =       cbq_init,
1717         .reset          =       cbq_reset,
1718         .destroy        =       cbq_destroy,
1719         .change         =       NULL,
1720         .dump           =       cbq_dump,
1721         .dump_stats     =       cbq_dump_stats,
1722         .owner          =       THIS_MODULE,
1723 };
1724
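/* Module glue: register/unregister the cbq qdisc with the scheduler core. */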
1725 static int __init cbq_module_init(void)
1726 {
1727         return register_qdisc(&cbq_qdisc_ops);
1728 }
1729 static void __exit cbq_module_exit(void)
1730 {
1731         unregister_qdisc(&cbq_qdisc_ops);
1732 }
1733 module_init(cbq_module_init)
1734 module_exit(cbq_module_exit)
1735 MODULE_LICENSE("GPL");