1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * net/sched/sch_htb.c Hierarchical token bucket, feed tree version
5 * Authors: Martin Devera, <devik@cdi.cz>
7 * Credits (in time order) for older HTB versions:
8 * Stef Coene <stef.coene@docum.org>
9 * HTB support at LARTC mailing list
10 * Ondrej Kraus, <krauso@barr.cz>
11 * found missing INIT_QDISC(htb)
12 * Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13 * helped a lot to locate nasty class stall bug
14 * Andi Kleen, Jamal Hadi, Bert Hubert
15 * code review and helpful comments on shaping
16 * Tomasz Wrona, <tw@eter.tym.pl>
17 * created test case so that I was able to fix nasty bug
19 * spotted bug in dequeue code and helped with fix
21 * fixed requeue routine
22 * and many others. thanks.
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
43 ========================================================================
44 HTB is like TBF with multiple classes. It is also similar to CBQ because
45 it allows assigning a priority to each class in the hierarchy.
46 In fact it is another implementation of Floyd's formal sharing.
49 Each class is assigned a level. Leaves ALWAYS have level 0 and root
50 classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
51 one less than their parent.
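/*
 * A minimal illustrative setup from userspace (a sketch using iproute2 tc;
 * the device name and the rates are example values, not taken from this
 * file):
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 20
 *   tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:10 htb rate 60mbit  ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:20 htb rate 40mbit  ceil 100mbit
 *
 * Here 1:1 is an interior class while 1:10 and 1:20 are leaves that may
 * borrow from 1:1 up to their ceil; unclassified traffic goes to the
 * "default" minor id (1:20 above).
 */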
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011 /* major must be matched with number supplied by TC as version */
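/* HTB_VER >> 16 (here 3) is the protocol major: it must equal TC_HTB_PROTOVER
 * and is what userspace passes in tc_htb_glob.version at init time; the full
 * value, including the 0x0011 minor, is reported back by htb_dump().
 */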
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
#endif
61 /* Module parameter and sysfs export */
62 module_param (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
65 static int htb_rate_est = 0; /* whether to set up a default rate estimator for htb classes */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
69 /* used internally to keep the status of a single class */
71 HTB_CANT_SEND, /* class can't send and can't borrow */
72 HTB_MAY_BORROW, /* class can't send but may borrow */
73 HTB_CAN_SEND /* class can send */
82 /* When a class changes from state 1->2 (HTB_MAY_BORROW to HTB_CAN_SEND)
83 * and disconnects from the parent's feed, we lose the ptr value and start
84 * from the first child again. Here we store the classid of the
85 * last valid ptr (used when ptr is NULL).
90 /* interior & leaf nodes; props specific to leaves are marked L:
91 * To reduce false sharing, place mostly read fields at beginning,
92 * and mostly written ones at the end.
95 struct Qdisc_class_common common;
96 struct psched_ratecfg rate;
97 struct psched_ratecfg ceil;
98 s64 buffer, cbuffer;/* token bucket depth/rate */
99 s64 mbuffer; /* max wait time */
100 u32 prio; /* these two are used only by leaves... */
101 int quantum; /* but stored for parent-to-leaf return */
103 struct tcf_proto __rcu *filter_list; /* class attached filters */
104 struct tcf_block *block;
107 int level; /* our level (see above) */
108 unsigned int children;
109 struct htb_class *parent; /* parent class */
111 struct net_rate_estimator __rcu *rate_est;
114 * Written often fields
116 struct gnet_stats_basic_packed bstats;
117 struct gnet_stats_basic_packed bstats_bias;
118 struct tc_htb_xstats xstats; /* our special stats */
120 /* token bucket parameters */
121 s64 tokens, ctokens;/* current number of tokens */
122 s64 t_c; /* checkpoint time */
125 struct htb_class_leaf {
126 int deficit[TC_HTB_MAXDEPTH];
129 struct htb_class_inner {
130 struct htb_prio clprio[TC_HTB_NUMPRIO];
135 int prio_activity; /* for which prios are we active */
136 enum htb_cmode cmode; /* current mode of the class */
137 struct rb_node pq_node; /* node for event queue */
138 struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
140 unsigned int drops ____cacheline_aligned_in_smp;
141 unsigned int overlimits;
145 struct rb_root wait_pq;
146 struct htb_prio hprio[TC_HTB_NUMPRIO];
150 struct Qdisc_class_hash clhash;
151 int defcls; /* class where unclassified flows go to */
152 int rate2quantum; /* quant = rate / rate2quantum */
154 /* filters for qdisc itself */
155 struct tcf_proto __rcu *filter_list;
156 struct tcf_block *block;
158 #define HTB_WARN_TOOMANYEVENTS 0x1
159 unsigned int warned; /* only one warning */
161 struct work_struct work;
163 /* non shaped skbs; let them go directly thru */
164 struct qdisc_skb_head direct_queue;
168 struct qdisc_watchdog watchdog;
170 s64 now; /* cached dequeue time */
172 /* time of nearest event per level (row) */
173 s64 near_ev_cache[TC_HTB_MAXDEPTH];
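/* bit p of row_mask[level] is set while at least one HTB_CAN_SEND class
 * at that level is active at priority p, i.e. while the corresponding
 * hlevel[level].hprio[p].row tree is non-empty
 */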
175 int row_mask[TC_HTB_MAXDEPTH];
177 struct htb_level hlevel[TC_HTB_MAXDEPTH];
179 struct Qdisc **direct_qdiscs;
180 unsigned int num_direct_qdiscs;
185 /* find class in global hash table using given handle */
186 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
188 struct htb_sched *q = qdisc_priv(sch);
189 struct Qdisc_class_common *clc;
191 clc = qdisc_class_find(&q->clhash, handle);
194 return container_of(clc, struct htb_class, common);
197 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
199 return (unsigned long)htb_find(handle, sch);
202 * htb_classify - classify a packet into class
204 * It returns NULL if the packet should be dropped or -1 if the packet
205 * should be passed directly thru. In all other cases a leaf class is returned.
206 * We allow direct class selection by classid in priority. Then we examine
207 * filters in qdisc and in inner nodes (if higher filter points to the inner
208 * node). If we end up with classid MAJOR:0 we enqueue the skb into special
209 * internal fifo (direct). These packets then go directly thru. If we still
210 * have no valid leaf we try to use MAJOR:default leaf. If still unsuccessful,
211 * we finish and return the direct queue.
213 #define HTB_DIRECT ((struct htb_class *)-1L)
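/*
 * Example outcomes (assuming a qdisc handle of 1: and a default class 1:20):
 *   skb->priority == 1:0            -> HTB_DIRECT (bypasses shaping)
 *   skb->priority == 1:10, a leaf   -> class 1:10, no filters consulted
 *   no filter matches               -> default class 1:20, or HTB_DIRECT if
 *                                      1:20 does not exist or is not a leaf
 */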
215 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
218 struct htb_sched *q = qdisc_priv(sch);
219 struct htb_class *cl;
220 struct tcf_result res;
221 struct tcf_proto *tcf;
224 /* allow to select class by setting skb->priority to valid classid;
225 * note that nfmark can be used too by attaching filter fw with no rules in it
228 if (skb->priority == sch->handle)
229 return HTB_DIRECT; /* X:0 (direct flow) selected */
230 cl = htb_find(skb->priority, sch);
234 /* Start with inner filter chain if a non-leaf class is selected */
235 tcf = rcu_dereference_bh(cl->filter_list);
237 tcf = rcu_dereference_bh(q->filter_list);
240 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
241 while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
242 #ifdef CONFIG_NET_CLS_ACT
247 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
253 cl = (void *)res.class;
255 if (res.classid == sch->handle)
256 return HTB_DIRECT; /* X:0 (direct flow) */
257 cl = htb_find(res.classid, sch);
259 break; /* filter selected invalid classid */
262 return cl; /* we hit leaf; return it */
264 /* we have got inner class; apply inner filter chain */
265 tcf = rcu_dereference_bh(cl->filter_list);
267 /* classification failed; try to use default class */
268 cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
269 if (!cl || cl->level)
270 return HTB_DIRECT; /* bad default .. this is safe bet */
275 * htb_add_to_id_tree - adds class to the round robin list
276 * @root: the root of the tree
277 * @cl: the class to add
278 * @prio: the given prio in class
280 * Routine adds class to the list (actually tree) sorted by classid.
281 * Make sure that class is not already on such list for given prio.
283 static void htb_add_to_id_tree(struct rb_root *root,
284 struct htb_class *cl, int prio)
286 struct rb_node **p = &root->rb_node, *parent = NULL;
291 c = rb_entry(parent, struct htb_class, node[prio]);
293 if (cl->common.classid > c->common.classid)
294 p = &parent->rb_right;
296 p = &parent->rb_left;
298 rb_link_node(&cl->node[prio], parent, p);
299 rb_insert_color(&cl->node[prio], root);
303 * htb_add_to_wait_tree - adds class to the event queue with delay
304 * @q: the priority event queue
305 * @cl: the class to add
306 * @delay: delay in nanoseconds
308 * The class is added to the priority event queue to indicate that the class
309 * will change its mode at time cl->pq_key (nanoseconds). Make sure that the
310 * class is not already in the queue.
312 static void htb_add_to_wait_tree(struct htb_sched *q,
313 struct htb_class *cl, s64 delay)
315 struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
317 cl->pq_key = q->now + delay;
318 if (cl->pq_key == q->now)
321 /* update the nearest event cache */
322 if (q->near_ev_cache[cl->level] > cl->pq_key)
323 q->near_ev_cache[cl->level] = cl->pq_key;
328 c = rb_entry(parent, struct htb_class, pq_node);
329 if (cl->pq_key >= c->pq_key)
330 p = &parent->rb_right;
332 p = &parent->rb_left;
334 rb_link_node(&cl->pq_node, parent, p);
335 rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
339 * htb_next_rb_node - finds next node in binary tree
340 * @n: the current node in binary tree
342 * When we are past last key we return NULL.
343 * Average complexity is 2 steps per call.
345 static inline void htb_next_rb_node(struct rb_node **n)
351 * htb_add_class_to_row - add class to its row
352 * @q: the priority event queue
353 * @cl: the class to add
354 * @mask: the given priorities in class in bitmap
356 * The class is added to row at priorities marked in mask.
357 * It does nothing if mask == 0.
359 static inline void htb_add_class_to_row(struct htb_sched *q,
360 struct htb_class *cl, int mask)
362 q->row_mask[cl->level] |= mask;
364 int prio = ffz(~mask);
365 mask &= ~(1 << prio);
366 htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
370 /* If this triggers, it is a bug in this code, but it need not be fatal */
371 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
373 if (RB_EMPTY_NODE(rb)) {
383 * htb_remove_class_from_row - removes class from its row
384 * @q: the priority event queue
385 * @cl: the class to remove
386 * @mask: the given priorities in class in bitmap
388 * The class is removed from row at priorities marked in mask.
389 * It does nothing if mask == 0.
391 static inline void htb_remove_class_from_row(struct htb_sched *q,
392 struct htb_class *cl, int mask)
395 struct htb_level *hlevel = &q->hlevel[cl->level];
398 int prio = ffz(~mask);
399 struct htb_prio *hprio = &hlevel->hprio[prio];
401 mask &= ~(1 << prio);
402 if (hprio->ptr == cl->node + prio)
403 htb_next_rb_node(&hprio->ptr);
405 htb_safe_rb_erase(cl->node + prio, &hprio->row);
406 if (!hprio->row.rb_node)
409 q->row_mask[cl->level] &= ~m;
413 * htb_activate_prios - creates the active class's feed chain
414 * @q: the priority event queue
415 * @cl: the class to activate
417 * The class is connected to ancestors and/or appropriate rows
418 * for the priorities it participates in. cl->cmode must be the new
419 * (activated) mode. It does nothing if cl->prio_activity == 0.
421 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
423 struct htb_class *p = cl->parent;
424 long m, mask = cl->prio_activity;
426 while (cl->cmode == HTB_MAY_BORROW && p && mask) {
432 if (p->inner.clprio[prio].feed.rb_node)
433 /* parent already has its feed in use, so
434 * reset the bit in mask as parent is already ok
436 mask &= ~(1 << prio);
438 htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
440 p->prio_activity |= mask;
445 if (cl->cmode == HTB_CAN_SEND && mask)
446 htb_add_class_to_row(q, cl, mask);
450 * htb_deactivate_prios - remove class from feed chain
451 * @q: the priority event queue
452 * @cl: the class to deactivate
454 * cl->cmode must represent old mode (before deactivation). It does
455 * nothing if cl->prio_activity == 0. Class is removed from all feed chains and rows.
458 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
460 struct htb_class *p = cl->parent;
461 long m, mask = cl->prio_activity;
463 while (cl->cmode == HTB_MAY_BORROW && p && mask) {
470 if (p->inner.clprio[prio].ptr == cl->node + prio) {
471 /* we are removing child which is pointed to from
472 * parent feed - forget the pointer but remember the classid
475 p->inner.clprio[prio].last_ptr_id = cl->common.classid;
476 p->inner.clprio[prio].ptr = NULL;
479 htb_safe_rb_erase(cl->node + prio,
480 &p->inner.clprio[prio].feed);
482 if (!p->inner.clprio[prio].feed.rb_node)
486 p->prio_activity &= ~mask;
491 if (cl->cmode == HTB_CAN_SEND && mask)
492 htb_remove_class_from_row(q, cl, mask);
495 static inline s64 htb_lowater(const struct htb_class *cl)
498 return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
502 static inline s64 htb_hiwater(const struct htb_class *cl)
505 return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
512 * htb_class_mode - computes and returns current class mode
513 * @cl: the target class
514 * @diff: diff time in microseconds
516 * It computes cl's mode at time cl->t_c+diff and returns it. If mode
517 * is not HTB_CAN_SEND then cl->pq_key is updated to the time difference
518 * from now to the time when cl will change its state.
519 * Also it is worth noting that the class mode doesn't change simply
520 * at cl->{c,}tokens == 0 but rather there can be a hysteresis over the
521 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
522 * mode transitions per time unit. The speed gain is about 1/6.
524 static inline enum htb_cmode
525 htb_class_mode(struct htb_class *cl, s64 *diff)
529 if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
531 return HTB_CANT_SEND;
534 if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
538 return HTB_MAY_BORROW;
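/*
 * With htb_hysteresis disabled (the default), the logic above reduces to:
 *   projected ctokens < 0  -> HTB_CANT_SEND  (over ceil)
 *   projected tokens >= 0  -> HTB_CAN_SEND   (under rate)
 *   otherwise              -> HTB_MAY_BORROW (over rate, under ceil)
 * With hysteresis enabled the thresholds move to -cbuffer / -buffer depending
 * on the current mode, so a class does not flip modes on every small change.
 */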
542 * htb_change_class_mode - changes the class's mode
543 * @q: the priority event queue
544 * @cl: the target class
545 * @diff: diff time in microseconds
547 * This should be the only way to change a class's mode under normal
548 * circumstances. The routine will update feed list linkage, change the mode
549 * and add the class to the wait event queue if appropriate. The new mode
550 * should be different from the old one and cl->pq_key has to be valid if
551 * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
554 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
556 enum htb_cmode new_mode = htb_class_mode(cl, diff);
558 if (new_mode == cl->cmode)
561 if (new_mode == HTB_CANT_SEND) {
566 if (cl->prio_activity) { /* not necessary: speed optimization */
567 if (cl->cmode != HTB_CANT_SEND)
568 htb_deactivate_prios(q, cl);
569 cl->cmode = new_mode;
570 if (new_mode != HTB_CANT_SEND)
571 htb_activate_prios(q, cl);
573 cl->cmode = new_mode;
577 * htb_activate - inserts leaf cl into appropriate active feeds
579 * Routine learns the (new) priority of the leaf and activates the feed
580 * chain for that prio. It can safely be called on an already active leaf.
583 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
585 WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
587 if (!cl->prio_activity) {
588 cl->prio_activity = 1 << cl->prio;
589 htb_activate_prios(q, cl);
594 * htb_deactivate - remove leaf cl from active feeds
596 * Make sure that the leaf is active. In other words it can't be called
597 * with a non-active leaf.
599 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
601 WARN_ON(!cl->prio_activity);
603 htb_deactivate_prios(q, cl);
604 cl->prio_activity = 0;
607 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
608 struct sk_buff **to_free)
611 unsigned int len = qdisc_pkt_len(skb);
612 struct htb_sched *q = qdisc_priv(sch);
613 struct htb_class *cl = htb_classify(skb, sch, &ret);
615 if (cl == HTB_DIRECT) {
616 /* enqueue to helper queue */
617 if (q->direct_queue.qlen < q->direct_qlen) {
618 __qdisc_enqueue_tail(skb, &q->direct_queue);
621 return qdisc_drop(skb, sch, to_free);
623 #ifdef CONFIG_NET_CLS_ACT
625 if (ret & __NET_XMIT_BYPASS)
626 qdisc_qstats_drop(sch);
627 __qdisc_drop(skb, to_free);
630 } else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
631 to_free)) != NET_XMIT_SUCCESS) {
632 if (net_xmit_drop_count(ret)) {
633 qdisc_qstats_drop(sch);
641 sch->qstats.backlog += len;
643 return NET_XMIT_SUCCESS;
646 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
648 s64 toks = diff + cl->tokens;
650 if (toks > cl->buffer)
652 toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
653 if (toks <= -cl->mbuffer)
654 toks = 1 - cl->mbuffer;
659 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
661 s64 toks = diff + cl->ctokens;
663 if (toks > cl->cbuffer)
665 toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
666 if (toks <= -cl->mbuffer)
667 toks = 1 - cl->mbuffer;
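/*
 * In both helpers above the token count is advanced by the elapsed time
 * (diff), capped at the bucket depth (buffer/cbuffer), charged for the
 * transmitted bytes via psched_l2t_ns(), and floored at 1 - mbuffer so a
 * long idle period or a huge packet cannot push it arbitrarily far negative.
 */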
673 * htb_charge_class - charges amount "bytes" to leaf and ancestors
675 * Routine assumes that packet "bytes" long was dequeued from leaf cl
676 * borrowing from "level". It accounts bytes to ceil leaky bucket for
677 * leaf and all ancestors and to rate bucket for ancestors at levels
678 * "level" and higher. It also handles possible change of mode resulting
679 * from the update. Note that mode can also increase here (MAY_BORROW to
680 * CAN_SEND) because we can use a more precise clock than the event queue here.
681 * In such a case we remove the class from the event queue first.
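/*
 * Example: a packet dequeued from a leaf that borrowed at level 2 is charged
 * against the ceil (ctokens) bucket of the leaf and of every ancestor, but
 * against the rate (tokens) bucket only for classes at level 2 and above,
 * since those are the ones whose own rate paid for the transmission.
 */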
683 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
684 int level, struct sk_buff *skb)
686 int bytes = qdisc_pkt_len(skb);
687 enum htb_cmode old_mode;
691 diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
692 if (cl->level >= level) {
693 if (cl->level == level)
695 htb_accnt_tokens(cl, bytes, diff);
697 cl->xstats.borrows++;
698 cl->tokens += diff; /* we moved t_c; update tokens */
700 htb_accnt_ctokens(cl, bytes, diff);
703 old_mode = cl->cmode;
705 htb_change_class_mode(q, cl, &diff);
706 if (old_mode != cl->cmode) {
707 if (old_mode != HTB_CAN_SEND)
708 htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
709 if (cl->cmode != HTB_CAN_SEND)
710 htb_add_to_wait_tree(q, cl, diff);
713 /* update basic stats except for leaves which are already updated */
715 bstats_update(&cl->bstats, skb);
722 * htb_do_events - make mode changes to classes at the level
724 * Scans event queue for pending events and applies them. Returns time of
725 * next pending event (0 for no event in pq, q->now for too many events).
726 * Note: Applied are events whose cl->pq_key <= q->now.
728 static s64 htb_do_events(struct htb_sched *q, const int level,
731 /* don't run for longer than 2 jiffies; 2 is used instead of
732 * 1 to simplify things when jiffy is going to be incremented too soon.
735 unsigned long stop_at = start + 2;
736 struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
738 while (time_before(jiffies, stop_at)) {
739 struct htb_class *cl;
741 struct rb_node *p = rb_first(wait_pq);
746 cl = rb_entry(p, struct htb_class, pq_node);
747 if (cl->pq_key > q->now)
750 htb_safe_rb_erase(p, wait_pq);
751 diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
752 htb_change_class_mode(q, cl, &diff);
753 if (cl->cmode != HTB_CAN_SEND)
754 htb_add_to_wait_tree(q, cl, diff);
757 /* too much load - let's continue after a break for scheduling */
758 if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
759 pr_warn("htb: too many events!\n");
760 q->warned |= HTB_WARN_TOOMANYEVENTS;
766 /* Returns class->node+prio from the id-tree where the class's id is >= id.
767 * NULL if no such one exists.
769 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
772 struct rb_node *r = NULL;
774 struct htb_class *cl =
775 rb_entry(n, struct htb_class, node[prio]);
777 if (id > cl->common.classid) {
779 } else if (id < cl->common.classid) {
790 * htb_lookup_leaf - returns next leaf class in DRR order
792 * Find the leaf that the current feed pointer points to.
794 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
798 struct rb_node *root;
799 struct rb_node **pptr;
801 } stk[TC_HTB_MAXDEPTH], *sp = stk;
803 BUG_ON(!hprio->row.rb_node);
804 sp->root = hprio->row.rb_node;
805 sp->pptr = &hprio->ptr;
806 sp->pid = &hprio->last_ptr_id;
808 for (i = 0; i < 65535; i++) {
809 if (!*sp->pptr && *sp->pid) {
810 /* ptr was invalidated but id is valid - try to recover
811 * the original or next ptr
814 htb_id_find_next_upper(prio, sp->root, *sp->pid);
816 *sp->pid = 0; /* ptr is valid now, so remove this hint as it
817 * can become out of date quickly
819 if (!*sp->pptr) { /* we are at right end; rewind & go up */
820 *sp->pptr = sp->root;
821 while ((*sp->pptr)->rb_left)
822 *sp->pptr = (*sp->pptr)->rb_left;
829 htb_next_rb_node(sp->pptr);
832 struct htb_class *cl;
833 struct htb_prio *clp;
835 cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
838 clp = &cl->inner.clprio[prio];
839 (++sp)->root = clp->feed.rb_node;
840 sp->pptr = &clp->ptr;
841 sp->pid = &clp->last_ptr_id;
848 /* dequeues packet at given priority and level; call only if
849 * you are sure that there is active class at prio/level
851 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
854 struct sk_buff *skb = NULL;
855 struct htb_class *cl, *start;
856 struct htb_level *hlevel = &q->hlevel[level];
857 struct htb_prio *hprio = &hlevel->hprio[prio];
859 /* look initial class up in the row */
860 start = cl = htb_lookup_leaf(hprio, prio);
867 /* class can be empty - it is unlikely but can be true if leaf
868 * qdisc drops packets in enqueue routine or if someone used
869 * graft operation on the leaf since last dequeue;
870 * simply deactivate and skip such class
872 if (unlikely(cl->leaf.q->q.qlen == 0)) {
873 struct htb_class *next;
874 htb_deactivate(q, cl);
876 /* row/level might become empty */
877 if ((q->row_mask[level] & (1 << prio)) == 0)
880 next = htb_lookup_leaf(hprio, prio);
882 if (cl == start) /* fix start if we just deleted it */
888 skb = cl->leaf.q->dequeue(cl->leaf.q);
889 if (likely(skb != NULL))
892 qdisc_warn_nonwc("htb", cl->leaf.q);
893 htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
894 &q->hlevel[0].hprio[prio].ptr);
895 cl = htb_lookup_leaf(hprio, prio);
897 } while (cl != start);
899 if (likely(skb != NULL)) {
900 bstats_update(&cl->bstats, skb);
901 cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
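/* Classic DRR step: each dequeued packet reduces the deficit at this
 * level; once it goes negative the deficit is refilled by the class's
 * quantum and the round-robin pointer advances so that sibling classes
 * get their turn before this one sends again.
 */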
902 if (cl->leaf.deficit[level] < 0) {
903 cl->leaf.deficit[level] += cl->quantum;
904 htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
905 &q->hlevel[0].hprio[prio].ptr);
907 /* this used to be after charge_class but this constellation
908 * gives us slightly better performance
910 if (!cl->leaf.q->q.qlen)
911 htb_deactivate(q, cl);
912 htb_charge_class(q, cl, level, skb);
917 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
920 struct htb_sched *q = qdisc_priv(sch);
923 unsigned long start_at;
925 /* try to dequeue direct packets as high prio (!) to minimize cpu work */
926 skb = __qdisc_dequeue_head(&q->direct_queue);
929 qdisc_bstats_update(sch, skb);
930 qdisc_qstats_backlog_dec(sch, skb);
937 q->now = ktime_get_ns();
940 next_event = q->now + 5LLU * NSEC_PER_SEC;
942 for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
943 /* common case optimization - skip event handler quickly */
945 s64 event = q->near_ev_cache[level];
947 if (q->now >= event) {
948 event = htb_do_events(q, level, start_at);
950 event = q->now + NSEC_PER_SEC;
951 q->near_ev_cache[level] = event;
954 if (next_event > event)
957 m = ~q->row_mask[level];
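/* The inverted mask lets the loop below walk the set bits of
 * row_mask[level] starting from the lowest prio number, so the highest
 * priority active classes are tried first.
 */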
958 while (m != (int)(-1)) {
962 skb = htb_dequeue_tree(q, prio, level);
963 if (likely(skb != NULL))
967 if (likely(next_event > q->now))
968 qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
970 schedule_work(&q->work);
975 /* reset all classes */
976 /* always called under BH & queue lock */
977 static void htb_reset(struct Qdisc *sch)
979 struct htb_sched *q = qdisc_priv(sch);
980 struct htb_class *cl;
983 for (i = 0; i < q->clhash.hashsize; i++) {
984 hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
986 memset(&cl->inner, 0, sizeof(cl->inner));
988 if (cl->leaf.q && !q->offload)
989 qdisc_reset(cl->leaf.q);
991 cl->prio_activity = 0;
992 cl->cmode = HTB_CAN_SEND;
995 qdisc_watchdog_cancel(&q->watchdog);
996 __qdisc_reset_queue(&q->direct_queue);
998 sch->qstats.backlog = 0;
999 memset(q->hlevel, 0, sizeof(q->hlevel));
1000 memset(q->row_mask, 0, sizeof(q->row_mask));
1003 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1004 [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) },
1005 [TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) },
1006 [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1007 [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1008 [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1009 [TCA_HTB_RATE64] = { .type = NLA_U64 },
1010 [TCA_HTB_CEIL64] = { .type = NLA_U64 },
1011 [TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1014 static void htb_work_func(struct work_struct *work)
1016 struct htb_sched *q = container_of(work, struct htb_sched, work);
1017 struct Qdisc *sch = q->watchdog.qdisc;
1020 __netif_schedule(qdisc_root(sch));
1024 static void htb_set_lockdep_class_child(struct Qdisc *q)
1026 static struct lock_class_key child_key;
1028 lockdep_set_class(qdisc_lock(q), &child_key);
1031 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1033 return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1036 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1037 struct netlink_ext_ack *extack)
1039 struct net_device *dev = qdisc_dev(sch);
1040 struct tc_htb_qopt_offload offload_opt;
1041 struct htb_sched *q = qdisc_priv(sch);
1042 struct nlattr *tb[TCA_HTB_MAX + 1];
1043 struct tc_htb_glob *gopt;
1048 qdisc_watchdog_init(&q->watchdog, sch);
1049 INIT_WORK(&q->work, htb_work_func);
1054 err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1058 err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1063 if (!tb[TCA_HTB_INIT])
1066 gopt = nla_data(tb[TCA_HTB_INIT]);
1067 if (gopt->version != HTB_VER >> 16)
1070 offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1073 if (sch->parent != TC_H_ROOT)
1076 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
1079 q->num_direct_qdiscs = dev->real_num_tx_queues;
1080 q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
1081 sizeof(*q->direct_qdiscs),
1083 if (!q->direct_qdiscs)
1087 err = qdisc_class_hash_init(&q->clhash);
1089 goto err_free_direct_qdiscs;
1091 qdisc_skb_head_init(&q->direct_queue);
1093 if (tb[TCA_HTB_DIRECT_QLEN])
1094 q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1096 q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1098 if ((q->rate2quantum = gopt->rate2quantum) < 1)
1099 q->rate2quantum = 1;
1100 q->defcls = gopt->defcls;
1105 for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1106 struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1107 struct Qdisc *qdisc;
1109 qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1110 TC_H_MAKE(sch->handle, 0), extack);
1113 goto err_free_qdiscs;
1116 htb_set_lockdep_class_child(qdisc);
1117 q->direct_qdiscs[ntx] = qdisc;
1118 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1121 sch->flags |= TCQ_F_MQROOT;
1123 offload_opt = (struct tc_htb_qopt_offload) {
1124 .command = TC_HTB_CREATE,
1125 .parent_classid = TC_H_MAJ(sch->handle) >> 16,
1126 .classid = TC_H_MIN(q->defcls),
1129 err = htb_offload(dev, &offload_opt);
1131 goto err_free_qdiscs;
1133 /* Defer this assignment, so that htb_destroy skips offload-related
1134 * parts (especially calling ndo_setup_tc) on errors.
1141 for (ntx = 0; ntx < q->num_direct_qdiscs && q->direct_qdiscs[ntx];
1143 qdisc_put(q->direct_qdiscs[ntx]);
1145 qdisc_class_hash_destroy(&q->clhash);
1146 /* Prevent use-after-free and double-free when htb_destroy gets called.
1148 q->clhash.hash = NULL;
1149 q->clhash.hashsize = 0;
1151 err_free_direct_qdiscs:
1152 kfree(q->direct_qdiscs);
1153 q->direct_qdiscs = NULL;
1157 static void htb_attach_offload(struct Qdisc *sch)
1159 struct net_device *dev = qdisc_dev(sch);
1160 struct htb_sched *q = qdisc_priv(sch);
1163 for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1164 struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1166 old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1168 qdisc_hash_add(qdisc, false);
1170 for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1171 struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1172 struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1177 kfree(q->direct_qdiscs);
1178 q->direct_qdiscs = NULL;
1181 static void htb_attach_software(struct Qdisc *sch)
1183 struct net_device *dev = qdisc_dev(sch);
1186 /* Resemble qdisc_graft behavior. */
1187 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1188 struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1189 struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1191 qdisc_refcount_inc(sch);
1197 static void htb_attach(struct Qdisc *sch)
1199 struct htb_sched *q = qdisc_priv(sch);
1202 htb_attach_offload(sch);
1204 htb_attach_software(sch);
1207 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1209 struct htb_sched *q = qdisc_priv(sch);
1210 struct nlattr *nest;
1211 struct tc_htb_glob gopt;
1214 sch->flags |= TCQ_F_OFFLOADED;
1216 sch->flags &= ~TCQ_F_OFFLOADED;
1218 sch->qstats.overlimits = q->overlimits;
1219 /* It's safe to not acquire the qdisc lock. As we hold RTNL,
1220 * no change can happen on the qdisc parameters.
1223 gopt.direct_pkts = q->direct_pkts;
1224 gopt.version = HTB_VER;
1225 gopt.rate2quantum = q->rate2quantum;
1226 gopt.defcls = q->defcls;
1229 nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1231 goto nla_put_failure;
1232 if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1233 nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1234 goto nla_put_failure;
1235 if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1236 goto nla_put_failure;
1238 return nla_nest_end(skb, nest);
1241 nla_nest_cancel(skb, nest);
1245 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1246 struct sk_buff *skb, struct tcmsg *tcm)
1248 struct htb_class *cl = (struct htb_class *)arg;
1249 struct htb_sched *q = qdisc_priv(sch);
1250 struct nlattr *nest;
1251 struct tc_htb_opt opt;
1253 /* It's safe to not acquire the qdisc lock. As we hold RTNL,
1254 * no change can happen on the class parameters.
1256 tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1257 tcm->tcm_handle = cl->common.classid;
1258 if (!cl->level && cl->leaf.q)
1259 tcm->tcm_info = cl->leaf.q->handle;
1261 nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1263 goto nla_put_failure;
1265 memset(&opt, 0, sizeof(opt));
1267 psched_ratecfg_getrate(&opt.rate, &cl->rate);
1268 opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1269 psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1270 opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1271 opt.quantum = cl->quantum;
1272 opt.prio = cl->prio;
1273 opt.level = cl->level;
1274 if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1275 goto nla_put_failure;
1276 if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1277 goto nla_put_failure;
1278 if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1279 nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1281 goto nla_put_failure;
1282 if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1283 nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1285 goto nla_put_failure;
1287 return nla_nest_end(skb, nest);
1290 nla_nest_cancel(skb, nest);
1294 static void htb_offload_aggregate_stats(struct htb_sched *q,
1295 struct htb_class *cl)
1297 struct htb_class *c;
1300 memset(&cl->bstats, 0, sizeof(cl->bstats));
1302 for (i = 0; i < q->clhash.hashsize; i++) {
1303 hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1304 struct htb_class *p = c;
1306 while (p && p->level < cl->level)
1312 cl->bstats.bytes += c->bstats_bias.bytes;
1313 cl->bstats.packets += c->bstats_bias.packets;
1314 if (c->level == 0) {
1315 cl->bstats.bytes += c->leaf.q->bstats.bytes;
1316 cl->bstats.packets += c->leaf.q->bstats.packets;
1323 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1325 struct htb_class *cl = (struct htb_class *)arg;
1326 struct htb_sched *q = qdisc_priv(sch);
1327 struct gnet_stats_queue qs = {
1329 .overlimits = cl->overlimits,
1333 if (!cl->level && cl->leaf.q)
1334 qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1336 cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1338 cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1344 cl->bstats = cl->leaf.q->bstats;
1346 memset(&cl->bstats, 0, sizeof(cl->bstats));
1347 cl->bstats.bytes += cl->bstats_bias.bytes;
1348 cl->bstats.packets += cl->bstats_bias.packets;
1350 htb_offload_aggregate_stats(q, cl);
1354 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1355 d, NULL, &cl->bstats) < 0 ||
1356 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1357 gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1360 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1363 static struct netdev_queue *
1364 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1366 struct net_device *dev = qdisc_dev(sch);
1367 struct tc_htb_qopt_offload offload_opt;
1368 struct htb_sched *q = qdisc_priv(sch);
1372 return sch->dev_queue;
1374 offload_opt = (struct tc_htb_qopt_offload) {
1375 .command = TC_HTB_LEAF_QUERY_QUEUE,
1376 .classid = TC_H_MIN(tcm->tcm_parent),
1378 err = htb_offload(dev, &offload_opt);
1379 if (err || offload_opt.qid >= dev->num_tx_queues)
1381 return netdev_get_tx_queue(dev, offload_opt.qid);
1384 static struct Qdisc *
1385 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1387 struct net_device *dev = dev_queue->dev;
1388 struct Qdisc *old_q;
1390 if (dev->flags & IFF_UP)
1391 dev_deactivate(dev);
1392 old_q = dev_graft_qdisc(dev_queue, new_q);
1394 new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1395 if (dev->flags & IFF_UP)
1401 static void htb_offload_move_qdisc(struct Qdisc *sch, u16 qid_old, u16 qid_new)
1403 struct netdev_queue *queue_old, *queue_new;
1404 struct net_device *dev = qdisc_dev(sch);
1405 struct Qdisc *qdisc;
1407 queue_old = netdev_get_tx_queue(dev, qid_old);
1408 queue_new = netdev_get_tx_queue(dev, qid_new);
1410 if (dev->flags & IFF_UP)
1411 dev_deactivate(dev);
1412 qdisc = dev_graft_qdisc(queue_old, NULL);
1413 qdisc->dev_queue = queue_new;
1414 qdisc = dev_graft_qdisc(queue_new, qdisc);
1415 if (dev->flags & IFF_UP)
1418 WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1421 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1422 struct Qdisc **old, struct netlink_ext_ack *extack)
1424 struct netdev_queue *dev_queue = sch->dev_queue;
1425 struct htb_class *cl = (struct htb_class *)arg;
1426 struct htb_sched *q = qdisc_priv(sch);
1427 struct Qdisc *old_q;
1433 dev_queue = new->dev_queue;
1434 WARN_ON(dev_queue != cl->leaf.q->dev_queue);
1438 new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1439 cl->common.classid, extack);
1445 htb_set_lockdep_class_child(new);
1446 /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1447 qdisc_refcount_inc(new);
1448 old_q = htb_graft_helper(dev_queue, new);
1451 *old = qdisc_replace(sch, new, &cl->leaf.q);
1454 WARN_ON(old_q != *old);
1461 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1463 struct htb_class *cl = (struct htb_class *)arg;
1464 return !cl->level ? cl->leaf.q : NULL;
1467 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1469 struct htb_class *cl = (struct htb_class *)arg;
1471 htb_deactivate(qdisc_priv(sch), cl);
1474 static inline int htb_parent_last_child(struct htb_class *cl)
1477 /* the root class */
1479 if (cl->parent->children > 1)
1480 /* not the last child */
1485 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1486 struct Qdisc *new_q)
1488 struct htb_sched *q = qdisc_priv(sch);
1489 struct htb_class *parent = cl->parent;
1491 WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1493 if (parent->cmode != HTB_CAN_SEND)
1494 htb_safe_rb_erase(&parent->pq_node,
1495 &q->hlevel[parent->level].wait_pq);
1498 memset(&parent->inner, 0, sizeof(parent->inner));
1499 parent->leaf.q = new_q ? new_q : &noop_qdisc;
1500 parent->tokens = parent->buffer;
1501 parent->ctokens = parent->cbuffer;
1502 parent->t_c = ktime_get_ns();
1503 parent->cmode = HTB_CAN_SEND;
1506 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1507 struct netdev_queue *dev_queue,
1508 struct Qdisc *new_q)
1510 struct Qdisc *old_q;
1512 /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1513 qdisc_refcount_inc(new_q);
1514 old_q = htb_graft_helper(dev_queue, new_q);
1515 WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1518 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1519 bool last_child, bool destroying,
1520 struct netlink_ext_ack *extack)
1522 struct tc_htb_qopt_offload offload_opt;
1523 struct Qdisc *q = cl->leaf.q;
1524 struct Qdisc *old = NULL;
1532 /* On destroy of HTB, two cases are possible:
1533 * 1. q is a normal qdisc, but q->dev_queue has noop qdisc.
1534 * 2. q is a noop qdisc (for nodes that were inner),
1535 * q->dev_queue is noop_netdev_queue.
1537 old = htb_graft_helper(q->dev_queue, NULL);
1543 cl->parent->bstats_bias.bytes += q->bstats.bytes;
1544 cl->parent->bstats_bias.packets += q->bstats.packets;
1547 offload_opt = (struct tc_htb_qopt_offload) {
1548 .command = !last_child ? TC_HTB_LEAF_DEL :
1549 destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1550 TC_HTB_LEAF_DEL_LAST,
1551 .classid = cl->common.classid,
1554 err = htb_offload(qdisc_dev(sch), &offload_opt);
1556 if (!err || destroying)
1559 htb_graft_helper(q->dev_queue, old);
1564 if (!err && offload_opt.moved_qid != 0) {
1566 q->dev_queue = netdev_get_tx_queue(qdisc_dev(sch),
1569 htb_offload_move_qdisc(sch, offload_opt.moved_qid,
1576 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1579 WARN_ON(!cl->leaf.q);
1580 qdisc_put(cl->leaf.q);
1582 gen_kill_estimator(&cl->rate_est);
1583 tcf_block_put(cl->block);
1587 static void htb_destroy(struct Qdisc *sch)
1589 struct net_device *dev = qdisc_dev(sch);
1590 struct tc_htb_qopt_offload offload_opt;
1591 struct htb_sched *q = qdisc_priv(sch);
1592 struct hlist_node *next;
1593 bool nonempty, changed;
1594 struct htb_class *cl;
1597 cancel_work_sync(&q->work);
1598 qdisc_watchdog_cancel(&q->watchdog);
1599 /* This line used to be after htb_destroy_class call below
1600 * and surprisingly it worked in 2.4. But it must precede it
1601 * because filters need their target class alive to be able to call
1602 * unbind_filter on it (without Oops).
1604 tcf_block_put(q->block);
1606 for (i = 0; i < q->clhash.hashsize; i++) {
1607 hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1608 tcf_block_put(cl->block);
1616 for (i = 0; i < q->clhash.hashsize; i++) {
1617 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1622 htb_destroy_class(sch, cl);
1633 last_child = htb_parent_last_child(cl);
1634 htb_destroy_class_offload(sch, cl, last_child,
1636 qdisc_class_hash_remove(&q->clhash,
1639 cl->parent->children--;
1641 htb_parent_to_leaf(sch, cl, NULL);
1642 htb_destroy_class(sch, cl);
1648 qdisc_class_hash_destroy(&q->clhash);
1649 __qdisc_reset_queue(&q->direct_queue);
1654 offload_opt = (struct tc_htb_qopt_offload) {
1655 .command = TC_HTB_DESTROY,
1657 htb_offload(dev, &offload_opt);
1659 if (!q->direct_qdiscs)
1661 for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1662 qdisc_put(q->direct_qdiscs[i]);
1663 kfree(q->direct_qdiscs);
1666 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1667 struct netlink_ext_ack *extack)
1669 struct htb_sched *q = qdisc_priv(sch);
1670 struct htb_class *cl = (struct htb_class *)arg;
1671 struct Qdisc *new_q = NULL;
1675 /* TODO: why don't we allow deleting a subtree? references? does the
1676 * tc subsys guarantee us that in htb_destroy it holds no class
1677 * refs so that we can remove children safely there ?
1679 if (cl->children || cl->filter_cnt)
1682 if (!cl->level && htb_parent_last_child(cl))
1686 err = htb_destroy_class_offload(sch, cl, last_child, false,
1693 struct netdev_queue *dev_queue;
1695 dev_queue = q->offload ? cl->leaf.q->dev_queue : sch->dev_queue;
1696 new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1697 cl->parent->common.classid,
1701 htb_set_lockdep_class_child(new_q);
1702 htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1710 qdisc_purge_queue(cl->leaf.q);
1712 /* delete from hash and active; remainder in destroy_class */
1713 qdisc_class_hash_remove(&q->clhash, &cl->common);
1715 cl->parent->children--;
1717 if (cl->prio_activity)
1718 htb_deactivate(q, cl);
1720 if (cl->cmode != HTB_CAN_SEND)
1721 htb_safe_rb_erase(&cl->pq_node,
1722 &q->hlevel[cl->level].wait_pq);
1725 htb_parent_to_leaf(sch, cl, new_q);
1727 sch_tree_unlock(sch);
1729 htb_destroy_class(sch, cl);
1733 static int htb_change_class(struct Qdisc *sch, u32 classid,
1734 u32 parentid, struct nlattr **tca,
1735 unsigned long *arg, struct netlink_ext_ack *extack)
1738 struct htb_sched *q = qdisc_priv(sch);
1739 struct htb_class *cl = (struct htb_class *)*arg, *parent;
1740 struct tc_htb_qopt_offload offload_opt;
1741 struct nlattr *opt = tca[TCA_OPTIONS];
1742 struct nlattr *tb[TCA_HTB_MAX + 1];
1743 struct Qdisc *parent_qdisc = NULL;
1744 struct netdev_queue *dev_queue;
1745 struct tc_htb_opt *hopt;
1749 /* extract all subattrs from opt attr */
1753 err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1759 if (tb[TCA_HTB_PARMS] == NULL)
1762 parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1764 hopt = nla_data(tb[TCA_HTB_PARMS]);
1765 if (!hopt->rate.rate || !hopt->ceil.rate)
1768 /* Keeping backward compatibility with rate_table based iproute2 tc */
1769 if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1770 qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1773 if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1774 qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1777 rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1778 ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1780 if (!cl) { /* new class */
1781 struct net_device *dev = qdisc_dev(sch);
1782 struct Qdisc *new_q, *old_q;
1786 struct gnet_estimator opt;
1789 .nla_len = nla_attr_size(sizeof(est.opt)),
1790 .nla_type = TCA_RATE,
1793 /* 4s interval, 16s averaging constant */
1799 /* check for valid classid */
1800 if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1801 htb_find(classid, sch))
1804 /* check maximal depth */
1805 if (parent && parent->parent && parent->parent->level < 2) {
1806 pr_err("htb: tree is too deep\n");
1810 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1814 err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1819 if (htb_rate_est || tca[TCA_RATE]) {
1820 err = gen_new_estimator(&cl->bstats, NULL,
1823 qdisc_root_sleeping_running(sch),
1824 tca[TCA_RATE] ? : &est.nla);
1830 RB_CLEAR_NODE(&cl->pq_node);
1832 for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1833 RB_CLEAR_NODE(&cl->node[prio]);
1835 cl->common.classid = classid;
1837 /* Make sure nothing interrupts us in between the two
1838 * ndo_setup_tc calls.
1842 /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1843 * so it can't be used inside of sch_tree_lock
1844 * -- thanks to Karlis Peisenieks
1847 dev_queue = sch->dev_queue;
1848 } else if (!(parent && !parent->level)) {
1849 /* Assign a dev_queue to this classid. */
1850 offload_opt = (struct tc_htb_qopt_offload) {
1851 .command = TC_HTB_LEAF_ALLOC_QUEUE,
1852 .classid = cl->common.classid,
1853 .parent_classid = parent ?
1854 TC_H_MIN(parent->common.classid) :
1855 TC_HTB_CLASSID_ROOT,
1856 .rate = max_t(u64, hopt->rate.rate, rate64),
1857 .ceil = max_t(u64, hopt->ceil.rate, ceil64),
1860 err = htb_offload(dev, &offload_opt);
1862 pr_err("htb: TC_HTB_LEAF_ALLOC_QUEUE failed with err = %d\n",
1864 goto err_kill_estimator;
1866 dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1867 } else { /* First child. */
1868 dev_queue = parent->leaf.q->dev_queue;
1869 old_q = htb_graft_helper(dev_queue, NULL);
1870 WARN_ON(old_q != parent->leaf.q);
1871 offload_opt = (struct tc_htb_qopt_offload) {
1872 .command = TC_HTB_LEAF_TO_INNER,
1873 .classid = cl->common.classid,
1875 TC_H_MIN(parent->common.classid),
1876 .rate = max_t(u64, hopt->rate.rate, rate64),
1877 .ceil = max_t(u64, hopt->ceil.rate, ceil64),
1880 err = htb_offload(dev, &offload_opt);
1882 pr_err("htb: TC_HTB_LEAF_TO_INNER failed with err = %d\n",
1884 htb_graft_helper(dev_queue, old_q);
1885 goto err_kill_estimator;
1887 parent->bstats_bias.bytes += old_q->bstats.bytes;
1888 parent->bstats_bias.packets += old_q->bstats.packets;
1891 new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1895 htb_set_lockdep_class_child(new_q);
1896 /* One ref for cl->leaf.q, the other for
1899 qdisc_refcount_inc(new_q);
1901 old_q = htb_graft_helper(dev_queue, new_q);
1902 /* No qdisc_put needed. */
1903 WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1906 if (parent && !parent->level) {
1907 /* turn parent into inner node */
1908 qdisc_purge_queue(parent->leaf.q);
1909 parent_qdisc = parent->leaf.q;
1910 if (parent->prio_activity)
1911 htb_deactivate(q, parent);
1913 /* remove from evt list because of level change */
1914 if (parent->cmode != HTB_CAN_SEND) {
1915 htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1916 parent->cmode = HTB_CAN_SEND;
1918 parent->level = (parent->parent ? parent->parent->level
1919 : TC_HTB_MAXDEPTH) - 1;
1920 memset(&parent->inner, 0, sizeof(parent->inner));
1923 /* leaf (we) needs elementary qdisc */
1924 cl->leaf.q = new_q ? new_q : &noop_qdisc;
1926 cl->parent = parent;
1928 /* set class to be in HTB_CAN_SEND state */
1929 cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1930 cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1931 cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */
1932 cl->t_c = ktime_get_ns();
1933 cl->cmode = HTB_CAN_SEND;
1935 /* attach to the hash list and parent's family */
1936 qdisc_class_hash_insert(&q->clhash, &cl->common);
1939 if (cl->leaf.q != &noop_qdisc)
1940 qdisc_hash_add(cl->leaf.q, true);
1942 if (tca[TCA_RATE]) {
1943 err = gen_replace_estimator(&cl->bstats, NULL,
1946 qdisc_root_sleeping_running(sch),
1953 struct net_device *dev = qdisc_dev(sch);
1955 offload_opt = (struct tc_htb_qopt_offload) {
1956 .command = TC_HTB_NODE_MODIFY,
1957 .classid = cl->common.classid,
1958 .rate = max_t(u64, hopt->rate.rate, rate64),
1959 .ceil = max_t(u64, hopt->ceil.rate, ceil64),
1962 err = htb_offload(dev, &offload_opt);
1964 /* Estimator was replaced, and rollback may fail
1965 * as well, so we don't try to recover it, and
1966 * the estimator won't work properly with the
1967 * offload anyway, because bstats are updated
1968 * only when the stats are queried.
1976 psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1977 psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1979 /* there used to be a nasty bug here: we have to check that the node
1980 * is really a leaf before changing cl->leaf !
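/*
 * When the user does not supply a quantum it is derived from the rate as
 * quantum = rate_bytes_ps / r2q. For example (illustrative numbers only):
 * rate 1mbit = 125000 bytes/s with the common iproute2 default of r2q = 10
 * gives quantum = 12500 bytes. Very low rates fall below the 1000-byte
 * floor and very high rates exceed the 200000-byte cap, both of which
 * trigger the "Consider r2q change" warning below.
 */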
1983 u64 quantum = cl->rate.rate_bytes_ps;
1985 do_div(quantum, q->rate2quantum);
1986 cl->quantum = min_t(u64, quantum, INT_MAX);
1988 if (!hopt->quantum && cl->quantum < 1000) {
1992 if (!hopt->quantum && cl->quantum > 200000) {
1994 cl->quantum = 200000;
1997 cl->quantum = hopt->quantum;
1998 if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1999 cl->prio = TC_HTB_NUMPRIO - 1;
2002 cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2003 cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2005 sch_tree_unlock(sch);
2006 qdisc_put(parent_qdisc);
2009 pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
2010 cl->common.classid, (warn == -1 ? "small" : "big"));
2012 qdisc_class_hash_grow(sch, &q->clhash);
2014 *arg = (unsigned long)cl;
2018 gen_kill_estimator(&cl->rate_est);
2020 tcf_block_put(cl->block);
2026 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2027 struct netlink_ext_ack *extack)
2029 struct htb_sched *q = qdisc_priv(sch);
2030 struct htb_class *cl = (struct htb_class *)arg;
2032 return cl ? cl->block : q->block;
2035 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2038 struct htb_class *cl = htb_find(classid, sch);
2040 /*if (cl && !cl->level) return 0;
2041 * The line above used to be there to prevent attaching filters to
2042 * leaves. But at least tc_index filter uses this just to get class
2043 * for other reasons so that we have to allow for it.
2045 * 19.6.2002 As Werner explained it is ok - bind filter is just
2046 * another way to "lock" the class - unlike "get" this lock can
2047 * be broken by class during destroy IIUC.
2051 return (unsigned long)cl;
2054 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2056 struct htb_class *cl = (struct htb_class *)arg;
2062 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2064 struct htb_sched *q = qdisc_priv(sch);
2065 struct htb_class *cl;
2071 for (i = 0; i < q->clhash.hashsize; i++) {
2072 hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2073 if (arg->count < arg->skip) {
2077 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2086 static const struct Qdisc_class_ops htb_class_ops = {
2087 .select_queue = htb_select_queue,
2090 .qlen_notify = htb_qlen_notify,
2092 .change = htb_change_class,
2093 .delete = htb_delete,
2095 .tcf_block = htb_tcf_block,
2096 .bind_tcf = htb_bind_filter,
2097 .unbind_tcf = htb_unbind_filter,
2098 .dump = htb_dump_class,
2099 .dump_stats = htb_dump_class_stats,
2102 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2103 .cl_ops = &htb_class_ops,
2105 .priv_size = sizeof(struct htb_sched),
2106 .enqueue = htb_enqueue,
2107 .dequeue = htb_dequeue,
2108 .peek = qdisc_peek_dequeued,
2110 .attach = htb_attach,
2112 .destroy = htb_destroy,
2114 .owner = THIS_MODULE,
2117 static int __init htb_module_init(void)
2119 return register_qdisc(&htb_qdisc_ops);
2121 static void __exit htb_module_exit(void)
2123 unregister_qdisc(&htb_qdisc_ops);
2126 module_init(htb_module_init)
2127 module_exit(htb_module_exit)
2128 MODULE_LICENSE("GPL");