net/sched/sch_cake.c
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *                 (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52
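/* Editor's illustrative example (not part of the upstream source): a typical
 * way to instantiate this qdisc with iproute2, leaning on those defaults.
 * The device name and shaped rate are placeholders; option keywords follow
 * tc-cake(8):
 *
 *	tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv3 nat
 *
 * This enables the shaper at 20 Mbit/s with three Diffserv tins and NAT-aware
 * flow hashing; everything else (AQM targets, overhead, etc.) is auto-tuned.
 */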
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82
83 /* struct cobalt_params - contains codel and blue parameters
 84  * @interval:   codel initial drop interval
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91         u64     interval;
92         u64     target;
93         u64     mtu_time;
94         u32     p_inc;
95         u32     p_dec;
96 };
97
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:              codel dropping frequency
100  * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
101  * @drop_next:          time to drop next packet, or when we dropped last
102  * @blue_timer:         Blue time to next drop
103  * @p_drop:             BLUE drop probability (0.32 fxp)
104  * @dropping:           set if in dropping state
105  * @ecn_marked:         set if marked
106  */
107 struct cobalt_vars {
108         u32     count;
109         u32     rec_inv_sqrt;
110         ktime_t drop_next;
111         ktime_t blue_timer;
112         u32     p_drop;
113         bool    dropping;
114         bool    ecn_marked;
115 };
116
117 enum {
118         CAKE_SET_NONE = 0,
119         CAKE_SET_SPARSE,
120         CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121         CAKE_SET_BULK,
122         CAKE_SET_DECAYING
123 };
124
125 struct cake_flow {
126         /* this stuff is all needed per-flow at dequeue time */
127         struct sk_buff    *head;
128         struct sk_buff    *tail;
129         struct list_head  flowchain;
130         s32               deficit;
131         u32               dropped;
132         struct cobalt_vars cvars;
133         u16               srchost; /* index into cake_host table */
134         u16               dsthost;
135         u8                set;
136 }; /* please try to keep this structure <= 64 bytes */
137
138 struct cake_host {
139         u32 srchost_tag;
140         u32 dsthost_tag;
141         u16 srchost_bulk_flow_count;
142         u16 dsthost_bulk_flow_count;
143 };
144
145 struct cake_heap_entry {
146         u16 t:3, b:10;
147 };
148
149 struct cake_tin_data {
150         struct cake_flow flows[CAKE_QUEUES];
151         u32     backlogs[CAKE_QUEUES];
152         u32     tags[CAKE_QUEUES]; /* for set association */
153         u16     overflow_idx[CAKE_QUEUES];
154         struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155         u16     flow_quantum;
156
157         struct cobalt_params cparams;
158         u32     drop_overlimit;
159         u16     bulk_flow_count;
160         u16     sparse_flow_count;
161         u16     decaying_flow_count;
162         u16     unresponsive_flow_count;
163
164         u32     max_skblen;
165
166         struct list_head new_flows;
167         struct list_head old_flows;
168         struct list_head decaying_flows;
169
170         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171         ktime_t time_next_packet;
172         u64     tin_rate_ns;
173         u64     tin_rate_bps;
174         u16     tin_rate_shft;
175
176         u16     tin_quantum;
177         s32     tin_deficit;
178         u32     tin_backlog;
179         u32     tin_dropped;
180         u32     tin_ecn_mark;
181
182         u32     packets;
183         u64     bytes;
184
185         u32     ack_drops;
186
187         /* moving averages */
188         u64 avge_delay;
189         u64 peak_delay;
190         u64 base_delay;
191
192         /* hash function stats */
193         u32     way_directs;
194         u32     way_hits;
195         u32     way_misses;
196         u32     way_collisions;
197 }; /* number of tins is small, so size of this struct doesn't matter much */
198
199 struct cake_sched_data {
200         struct tcf_proto __rcu *filter_list; /* optional external classifier */
201         struct tcf_block *block;
202         struct cake_tin_data *tins;
203
204         struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
205         u16             overflow_timeout;
206
207         u16             tin_cnt;
208         u8              tin_mode;
209         u8              flow_mode;
210         u8              ack_filter;
211         u8              atm_mode;
212
213         u32             fwmark_mask;
214         u16             fwmark_shft;
215
216         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
217         u16             rate_shft;
218         ktime_t         time_next_packet;
219         ktime_t         failsafe_next_packet;
220         u64             rate_ns;
221         u64             rate_bps;
222         u16             rate_flags;
223         s16             rate_overhead;
224         u16             rate_mpu;
225         u64             interval;
226         u64             target;
227
228         /* resource tracking */
229         u32             buffer_used;
230         u32             buffer_max_used;
231         u32             buffer_limit;
232         u32             buffer_config_limit;
233
234         /* indices for dequeue */
235         u16             cur_tin;
236         u16             cur_flow;
237
238         struct qdisc_watchdog watchdog;
239         const u8        *tin_index;
240         const u8        *tin_order;
241
242         /* bandwidth capacity estimate */
243         ktime_t         last_packet_time;
244         ktime_t         avg_window_begin;
245         u64             avg_packet_interval;
246         u64             avg_window_bytes;
247         u64             avg_peak_bandwidth;
248         ktime_t         last_reconfig_time;
249
250         /* packet length stats */
251         u32             avg_netoff;
252         u16             max_netlen;
253         u16             max_adjlen;
254         u16             min_netlen;
255         u16             min_adjlen;
256 };
257
258 enum {
259         CAKE_FLAG_OVERHEAD         = BIT(0),
260         CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
261         CAKE_FLAG_INGRESS          = BIT(2),
262         CAKE_FLAG_WASH             = BIT(3),
263         CAKE_FLAG_SPLIT_GSO        = BIT(4)
264 };
265
266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
267  * obtain the best features of each.  Codel is excellent on flows which
268  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
269  * unresponsive flows.
270  */
271
272 struct cobalt_skb_cb {
273         ktime_t enqueue_time;
274         u32     adjusted_len;
275 };
276
277 static u64 us_to_ns(u64 us)
278 {
279         return us * NSEC_PER_USEC;
280 }
281
282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
283 {
284         qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
285         return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
286 }
287
288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
289 {
290         return get_cobalt_cb(skb)->enqueue_time;
291 }
292
293 static void cobalt_set_enqueue_time(struct sk_buff *skb,
294                                     ktime_t now)
295 {
296         get_cobalt_cb(skb)->enqueue_time = now;
297 }
298
299 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
300
301 /* Diffserv lookup tables */
302
303 static const u8 precedence[] = {
304         0, 0, 0, 0, 0, 0, 0, 0,
305         1, 1, 1, 1, 1, 1, 1, 1,
306         2, 2, 2, 2, 2, 2, 2, 2,
307         3, 3, 3, 3, 3, 3, 3, 3,
308         4, 4, 4, 4, 4, 4, 4, 4,
309         5, 5, 5, 5, 5, 5, 5, 5,
310         6, 6, 6, 6, 6, 6, 6, 6,
311         7, 7, 7, 7, 7, 7, 7, 7,
312 };
313
314 static const u8 diffserv8[] = {
315         2, 5, 1, 2, 4, 2, 2, 2,
316         0, 2, 1, 2, 1, 2, 1, 2,
317         5, 2, 4, 2, 4, 2, 4, 2,
318         3, 2, 3, 2, 3, 2, 3, 2,
319         6, 2, 3, 2, 3, 2, 3, 2,
320         6, 2, 2, 2, 6, 2, 6, 2,
321         7, 2, 2, 2, 2, 2, 2, 2,
322         7, 2, 2, 2, 2, 2, 2, 2,
323 };
324
325 static const u8 diffserv4[] = {
326         0, 2, 0, 0, 2, 0, 0, 0,
327         1, 0, 0, 0, 0, 0, 0, 0,
328         2, 0, 2, 0, 2, 0, 2, 0,
329         2, 0, 2, 0, 2, 0, 2, 0,
330         3, 0, 2, 0, 2, 0, 2, 0,
331         3, 0, 0, 0, 3, 0, 3, 0,
332         3, 0, 0, 0, 0, 0, 0, 0,
333         3, 0, 0, 0, 0, 0, 0, 0,
334 };
335
336 static const u8 diffserv3[] = {
337         0, 0, 0, 0, 2, 0, 0, 0,
338         1, 0, 0, 0, 0, 0, 0, 0,
339         0, 0, 0, 0, 0, 0, 0, 0,
340         0, 0, 0, 0, 0, 0, 0, 0,
341         0, 0, 0, 0, 0, 0, 0, 0,
342         0, 0, 0, 0, 2, 0, 2, 0,
343         2, 0, 0, 0, 0, 0, 0, 0,
344         2, 0, 0, 0, 0, 0, 0, 0,
345 };
346
347 static const u8 besteffort[] = {
348         0, 0, 0, 0, 0, 0, 0, 0,
349         0, 0, 0, 0, 0, 0, 0, 0,
350         0, 0, 0, 0, 0, 0, 0, 0,
351         0, 0, 0, 0, 0, 0, 0, 0,
352         0, 0, 0, 0, 0, 0, 0, 0,
353         0, 0, 0, 0, 0, 0, 0, 0,
354         0, 0, 0, 0, 0, 0, 0, 0,
355         0, 0, 0, 0, 0, 0, 0, 0,
356 };
357
358 /* tin priority order for stats dumping */
359
360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
361 static const u8 bulk_order[] = {1, 0, 2, 3};
362
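/* Editor's illustrative note (not upstream code): each table above is indexed
 * by the 6-bit DSCP value and yields a tin number, e.g.
 *
 *	u8 tin = diffserv3[dscp];	// dscp in 0..63
 *
 * With diffserv3[], DSCP 8 (CS1) selects tin 1 (the low-priority bulk tin)
 * and DSCP 46 (EF) selects tin 2 (the latency-sensitive tin), while most
 * other codepoints fall into tin 0 (best effort).  normal_order[] and
 * bulk_order[] only affect the order in which tins appear in stats dumps.
 */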
363 #define REC_INV_SQRT_CACHE (16)
364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
365
366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
367  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
368  *
369  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
370  */
371
372 static void cobalt_newton_step(struct cobalt_vars *vars)
373 {
374         u32 invsqrt, invsqrt2;
375         u64 val;
376
377         invsqrt = vars->rec_inv_sqrt;
378         invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
379         val = (3LL << 32) - ((u64)vars->count * invsqrt2);
380
381         val >>= 2; /* avoid overflow in following multiply */
382         val = (val * invsqrt) >> (32 - 2 + 1);
383
384         vars->rec_inv_sqrt = val;
385 }
386
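/* Editor's worked example (not upstream code): with count == 4 the true value
 * of 1/sqrt(count) is 0.5, i.e. rec_inv_sqrt == 0x80000000 in Q0.32.  One
 * Newton step from that point computes
 *
 *	invsqrt2 = (0x80000000 * 0x80000000) >> 32 = 0x40000000   (~0.25)
 *	val      = 0x300000000 - 4 * 0x40000000    = 0x200000000  (~2.0)
 *	val      = ((val >> 2) * 0x80000000) >> 31 = 0x80000000   (~0.5)
 *
 * so the estimate is a fixed point of the iteration, as expected.
 */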
387 static void cobalt_invsqrt(struct cobalt_vars *vars)
388 {
389         if (vars->count < REC_INV_SQRT_CACHE)
390                 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
391         else
392                 cobalt_newton_step(vars);
393 }
394
395 /* There is a big difference in timing between the accurate values placed in
396  * the cache and the approximations given by a single Newton step for small
397  * count values, particularly when stepping from count 1 to 2 or vice versa.
398  * Above 16, a single Newton step gives sufficient accuracy in either
399  * direction, given the precision stored.
400  *
401  * The magnitude of the error when stepping up to count 2 is such as to give
402  * the value that *should* have been produced at count 4.
403  */
404
405 static void cobalt_cache_init(void)
406 {
407         struct cobalt_vars v;
408
409         memset(&v, 0, sizeof(v));
410         v.rec_inv_sqrt = ~0U;
411         cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
412
413         for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
414                 cobalt_newton_step(&v);
415                 cobalt_newton_step(&v);
416                 cobalt_newton_step(&v);
417                 cobalt_newton_step(&v);
418
419                 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
420         }
421 }
422
423 static void cobalt_vars_init(struct cobalt_vars *vars)
424 {
425         memset(vars, 0, sizeof(*vars));
426
427         if (!cobalt_rec_inv_sqrt_cache[0]) {
428                 cobalt_cache_init();
429                 cobalt_rec_inv_sqrt_cache[0] = ~0;
430         }
431 }
432
433 /* CoDel control_law is t + interval/sqrt(count)
434  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
435  * both sqrt() and divide operation.
436  */
437 static ktime_t cobalt_control(ktime_t t,
438                               u64 interval,
439                               u32 rec_inv_sqrt)
440 {
441         return ktime_add_ns(t, reciprocal_scale(interval,
442                                                 rec_inv_sqrt));
443 }
444
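/* Editor's note (illustrative, not upstream): reciprocal_scale() computes
 * ((u64)val * ep_ro) >> 32 with both arguments taken as u32, so the addend
 * here is interval / sqrt(count) in Q0.32 arithmetic.  For example, with the
 * default 100 ms interval and count == 4 (rec_inv_sqrt ~= 0x80000000):
 *
 *	drop_next = t + ((100000000ULL * 0x80000000) >> 32) = t + 50 ms
 */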
445 /* Call this when a packet had to be dropped due to queue overflow.  Returns
446  * true if the BLUE state was quiescent before but active after this call.
447  */
448 static bool cobalt_queue_full(struct cobalt_vars *vars,
449                               struct cobalt_params *p,
450                               ktime_t now)
451 {
452         bool up = false;
453
454         if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
455                 up = !vars->p_drop;
456                 vars->p_drop += p->p_inc;
457                 if (vars->p_drop < p->p_inc)
458                         vars->p_drop = ~0;
459                 vars->blue_timer = now;
460         }
461         vars->dropping = true;
462         vars->drop_next = now;
463         if (!vars->count)
464                 vars->count = 1;
465
466         return up;
467 }
468
469 /* Call this when the queue was serviced but turned out to be empty.  Returns
470  * true if the BLUE state was active before but quiescent after this call.
471  */
472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
473                                struct cobalt_params *p,
474                                ktime_t now)
475 {
476         bool down = false;
477
478         if (vars->p_drop &&
479             ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
480                 if (vars->p_drop < p->p_dec)
481                         vars->p_drop = 0;
482                 else
483                         vars->p_drop -= p->p_dec;
484                 vars->blue_timer = now;
485                 down = !vars->p_drop;
486         }
487         vars->dropping = false;
488
489         if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
490                 vars->count--;
491                 cobalt_invsqrt(vars);
492                 vars->drop_next = cobalt_control(vars->drop_next,
493                                                  p->interval,
494                                                  vars->rec_inv_sqrt);
495         }
496
497         return down;
498 }
499
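/* Editor's note (illustrative, not upstream): p_drop is a probability in
 * 0.32 fixed point, so the chance of a BLUE drop is p_drop / 2^32 -- e.g.
 * p_drop == 0x40000000 means 25%.  cobalt_queue_full() bumps it by p_inc on
 * queue overflow and cobalt_queue_empty() backs it off by p_dec when the
 * queue drains, with both adjustments rate-limited by blue_timer to at most
 * one per 'target' interval.  The probability is applied later, in
 * cobalt_should_drop(), as prandom_u32() < p_drop.
 */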
500 /* Call this with a freshly dequeued packet for possible congestion marking.
501  * Returns true as an instruction to drop the packet, false for delivery.
502  */
503 static bool cobalt_should_drop(struct cobalt_vars *vars,
504                                struct cobalt_params *p,
505                                ktime_t now,
506                                struct sk_buff *skb,
507                                u32 bulk_flows)
508 {
509         bool next_due, over_target, drop = false;
510         ktime_t schedule;
511         u64 sojourn;
512
513 /* The 'schedule' variable records, in its sign, whether 'now' is before or
514  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
515  * scheduling decision is actually branched, without destroying that
516  * information.  Similarly, the first 'schedule' value calculated is preserved
517  * in the boolean 'next_due'.
518  *
519  * As for 'drop_next', we take advantage of the fact that 'interval' is both
520  * the delay between first exceeding 'target' and the first signalling event,
521  * *and* the scaling factor for the signalling frequency.  It's therefore very
522  * natural to use a single mechanism for both purposes, and eliminates a
523  * significant amount of reference Codel's spaghetti code.  To help with this,
524  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
525  * as possible to 1.0 in fixed-point.
526  */
527
528         sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
529         schedule = ktime_sub(now, vars->drop_next);
530         over_target = sojourn > p->target &&
531                       sojourn > p->mtu_time * bulk_flows * 2 &&
532                       sojourn > p->mtu_time * 4;
533         next_due = vars->count && ktime_to_ns(schedule) >= 0;
534
535         vars->ecn_marked = false;
536
537         if (over_target) {
538                 if (!vars->dropping) {
539                         vars->dropping = true;
540                         vars->drop_next = cobalt_control(now,
541                                                          p->interval,
542                                                          vars->rec_inv_sqrt);
543                 }
544                 if (!vars->count)
545                         vars->count = 1;
546         } else if (vars->dropping) {
547                 vars->dropping = false;
548         }
549
550         if (next_due && vars->dropping) {
551                 /* Use ECN mark if possible, otherwise drop */
552                 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
553
554                 vars->count++;
555                 if (!vars->count)
556                         vars->count--;
557                 cobalt_invsqrt(vars);
558                 vars->drop_next = cobalt_control(vars->drop_next,
559                                                  p->interval,
560                                                  vars->rec_inv_sqrt);
561                 schedule = ktime_sub(now, vars->drop_next);
562         } else {
563                 while (next_due) {
564                         vars->count--;
565                         cobalt_invsqrt(vars);
566                         vars->drop_next = cobalt_control(vars->drop_next,
567                                                          p->interval,
568                                                          vars->rec_inv_sqrt);
569                         schedule = ktime_sub(now, vars->drop_next);
570                         next_due = vars->count && ktime_to_ns(schedule) >= 0;
571                 }
572         }
573
574         /* Simple BLUE implementation.  Lack of ECN is deliberate. */
575         if (vars->p_drop)
576                 drop |= (prandom_u32() < vars->p_drop);
577
578         /* Overload the drop_next field as an activity timeout */
579         if (!vars->count)
580                 vars->drop_next = ktime_add_ns(now, p->interval);
581         else if (ktime_to_ns(schedule) > 0 && !drop)
582                 vars->drop_next = now;
583
584         return drop;
585 }
586
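/* Editor's worked example (not upstream code): marking/dropping only starts
 * once the sojourn time exceeds all three bounds computed above, i.e.
 *
 *	sojourn > max(target, 2 * mtu_time * bulk_flows, 4 * mtu_time)
 *
 * With a 5 ms target, a 1 ms mtu_time (1500 bytes at 12 Mbit/s) and four
 * bulk flows, that threshold is max(5, 8, 4) = 8 ms, so short bursts caused
 * purely by fair-queue scheduling of many flows are tolerated.
 */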
587 static void cake_update_flowkeys(struct flow_keys *keys,
588                                  const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591         struct nf_conntrack_tuple tuple = {};
592         bool rev = !skb->_nfct;
593
594         if (tc_skb_protocol(skb) != htons(ETH_P_IP))
595                 return;
596
597         if (!nf_ct_get_tuple_skb(&tuple, skb))
598                 return;
599
600         keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
601         keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
602
603         if (keys->ports.ports) {
604                 keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
605                 keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
606         }
607 #endif
608 }
609
610 /* Cake has several subtle multi-bit flow_mode settings; the masked
611  * comparisons below therefore also match triple-isolate mode.
612  */
613
614 static bool cake_dsrc(int flow_mode)
615 {
616         return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
617 }
618
619 static bool cake_ddst(int flow_mode)
620 {
621         return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
622 }
623
624 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
625                      int flow_mode, u16 flow_override, u16 host_override)
626 {
627         u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
628         u16 reduced_hash, srchost_idx, dsthost_idx;
629         struct flow_keys keys, host_keys;
630
631         if (unlikely(flow_mode == CAKE_FLOW_NONE))
632                 return 0;
633
634         /* If both overrides are set we can skip packet dissection entirely */
635         if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
636             (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
637                 goto skip_hash;
638
639         skb_flow_dissect_flow_keys(skb, &keys,
640                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
641
642         if (flow_mode & CAKE_FLOW_NAT_FLAG)
643                 cake_update_flowkeys(&keys, skb);
644
645         /* flow_hash_from_keys() sorts the addresses by value, so we have
646          * to preserve their order in a separate data structure to treat
647          * src and dst host addresses as independently selectable.
648          */
649         host_keys = keys;
650         host_keys.ports.ports     = 0;
651         host_keys.basic.ip_proto  = 0;
652         host_keys.keyid.keyid     = 0;
653         host_keys.tags.flow_label = 0;
654
655         switch (host_keys.control.addr_type) {
656         case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
657                 host_keys.addrs.v4addrs.src = 0;
658                 dsthost_hash = flow_hash_from_keys(&host_keys);
659                 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
660                 host_keys.addrs.v4addrs.dst = 0;
661                 srchost_hash = flow_hash_from_keys(&host_keys);
662                 break;
663
664         case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
665                 memset(&host_keys.addrs.v6addrs.src, 0,
666                        sizeof(host_keys.addrs.v6addrs.src));
667                 dsthost_hash = flow_hash_from_keys(&host_keys);
668                 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
669                 memset(&host_keys.addrs.v6addrs.dst, 0,
670                        sizeof(host_keys.addrs.v6addrs.dst));
671                 srchost_hash = flow_hash_from_keys(&host_keys);
672                 break;
673
674         default:
675                 dsthost_hash = 0;
676                 srchost_hash = 0;
677         }
678
679         /* This *must* be after the above switch, since as a
680          * side-effect it sorts the src and dst addresses.
681          */
682         if (flow_mode & CAKE_FLOW_FLOWS)
683                 flow_hash = flow_hash_from_keys(&keys);
684
685 skip_hash:
686         if (flow_override)
687                 flow_hash = flow_override - 1;
688         if (host_override) {
689                 dsthost_hash = host_override - 1;
690                 srchost_hash = host_override - 1;
691         }
692
693         if (!(flow_mode & CAKE_FLOW_FLOWS)) {
694                 if (flow_mode & CAKE_FLOW_SRC_IP)
695                         flow_hash ^= srchost_hash;
696
697                 if (flow_mode & CAKE_FLOW_DST_IP)
698                         flow_hash ^= dsthost_hash;
699         }
700
701         reduced_hash = flow_hash % CAKE_QUEUES;
702
703         /* set-associative hashing */
704         /* fast path if no hash collision (direct lookup succeeds) */
705         if (likely(q->tags[reduced_hash] == flow_hash &&
706                    q->flows[reduced_hash].set)) {
707                 q->way_directs++;
708         } else {
709                 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
710                 u32 outer_hash = reduced_hash - inner_hash;
711                 bool allocate_src = false;
712                 bool allocate_dst = false;
713                 u32 i, k;
714
715                 /* check if any active queue in the set is reserved for
716                  * this flow.
717                  */
718                 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
719                      i++, k = (k + 1) % CAKE_SET_WAYS) {
720                         if (q->tags[outer_hash + k] == flow_hash) {
721                                 if (i)
722                                         q->way_hits++;
723
724                                 if (!q->flows[outer_hash + k].set) {
725                                         /* need to increment host refcnts */
726                                         allocate_src = cake_dsrc(flow_mode);
727                                         allocate_dst = cake_ddst(flow_mode);
728                                 }
729
730                                 goto found;
731                         }
732                 }
733
734                 /* no queue is reserved for this flow, look for an
735                  * empty one.
736                  */
737                 for (i = 0; i < CAKE_SET_WAYS;
738                          i++, k = (k + 1) % CAKE_SET_WAYS) {
739                         if (!q->flows[outer_hash + k].set) {
740                                 q->way_misses++;
741                                 allocate_src = cake_dsrc(flow_mode);
742                                 allocate_dst = cake_ddst(flow_mode);
743                                 goto found;
744                         }
745                 }
746
747                 /* With no empty queues, default to the original
748                  * queue, accept the collision, update the host tags.
749                  */
750                 q->way_collisions++;
751                 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
752                         q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
753                         q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
754                 }
755                 allocate_src = cake_dsrc(flow_mode);
756                 allocate_dst = cake_ddst(flow_mode);
757 found:
758                 /* reserve queue for future packets in same flow */
759                 reduced_hash = outer_hash + k;
760                 q->tags[reduced_hash] = flow_hash;
761
762                 if (allocate_src) {
763                         srchost_idx = srchost_hash % CAKE_QUEUES;
764                         inner_hash = srchost_idx % CAKE_SET_WAYS;
765                         outer_hash = srchost_idx - inner_hash;
766                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
767                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
768                                 if (q->hosts[outer_hash + k].srchost_tag ==
769                                     srchost_hash)
770                                         goto found_src;
771                         }
772                         for (i = 0; i < CAKE_SET_WAYS;
773                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
774                                 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
775                                         break;
776                         }
777                         q->hosts[outer_hash + k].srchost_tag = srchost_hash;
778 found_src:
779                         srchost_idx = outer_hash + k;
780                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
781                                 q->hosts[srchost_idx].srchost_bulk_flow_count++;
782                         q->flows[reduced_hash].srchost = srchost_idx;
783                 }
784
785                 if (allocate_dst) {
786                         dsthost_idx = dsthost_hash % CAKE_QUEUES;
787                         inner_hash = dsthost_idx % CAKE_SET_WAYS;
788                         outer_hash = dsthost_idx - inner_hash;
789                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
790                              i++, k = (k + 1) % CAKE_SET_WAYS) {
791                                 if (q->hosts[outer_hash + k].dsthost_tag ==
792                                     dsthost_hash)
793                                         goto found_dst;
794                         }
795                         for (i = 0; i < CAKE_SET_WAYS;
796                              i++, k = (k + 1) % CAKE_SET_WAYS) {
797                                 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
798                                         break;
799                         }
800                         q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
801 found_dst:
802                         dsthost_idx = outer_hash + k;
803                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
804                                 q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
805                         q->flows[reduced_hash].dsthost = dsthost_idx;
806                 }
807         }
808
809         return reduced_hash;
810 }
811
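/* Editor's note (illustrative, not upstream): with CAKE_QUEUES == 1024 and
 * CAKE_SET_WAYS == 8, the flow table behaves as 128 sets of 8 ways.  A
 * reduced_hash of, say, 339 gives inner_hash = 339 % 8 = 3 and
 * outer_hash = 336, so the probe order is queues 339, 340, ..., 343, 336,
 * 337, 338: first looking for a queue already tagged with this flow_hash,
 * then for an empty way, and only then accepting a collision.
 */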
812 /* helper functions : might be changed when/if skb use a standard list_head */
813 /* remove one skb from head of slot queue */
814
815 static struct sk_buff *dequeue_head(struct cake_flow *flow)
816 {
817         struct sk_buff *skb = flow->head;
818
819         if (skb) {
820                 flow->head = skb->next;
821                 skb_mark_not_on_list(skb);
822         }
823
824         return skb;
825 }
826
827 /* add skb to flow queue (tail add) */
828
829 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
830 {
831         if (!flow->head)
832                 flow->head = skb;
833         else
834                 flow->tail->next = skb;
835         flow->tail = skb;
836         skb->next = NULL;
837 }
838
839 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
840                                     struct ipv6hdr *buf)
841 {
842         unsigned int offset = skb_network_offset(skb);
843         struct iphdr *iph;
844
845         iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
846
847         if (!iph)
848                 return NULL;
849
850         if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
851                 return skb_header_pointer(skb, offset + iph->ihl * 4,
852                                           sizeof(struct ipv6hdr), buf);
853
854         else if (iph->version == 4)
855                 return iph;
856
857         else if (iph->version == 6)
858                 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
859                                           buf);
860
861         return NULL;
862 }
863
864 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
865                                       void *buf, unsigned int bufsize)
866 {
867         unsigned int offset = skb_network_offset(skb);
868         const struct ipv6hdr *ipv6h;
869         const struct tcphdr *tcph;
870         const struct iphdr *iph;
871         struct ipv6hdr _ipv6h;
872         struct tcphdr _tcph;
873
874         ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
875
876         if (!ipv6h)
877                 return NULL;
878
879         if (ipv6h->version == 4) {
880                 iph = (struct iphdr *)ipv6h;
881                 offset += iph->ihl * 4;
882
883                 /* special-case 6in4 tunnelling, as that is a common way to get
884                  * v6 connectivity in the home
885                  */
886                 if (iph->protocol == IPPROTO_IPV6) {
887                         ipv6h = skb_header_pointer(skb, offset,
888                                                    sizeof(_ipv6h), &_ipv6h);
889
890                         if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
891                                 return NULL;
892
893                         offset += sizeof(struct ipv6hdr);
894
895                 } else if (iph->protocol != IPPROTO_TCP) {
896                         return NULL;
897                 }
898
899         } else if (ipv6h->version == 6) {
900                 if (ipv6h->nexthdr != IPPROTO_TCP)
901                         return NULL;
902
903                 offset += sizeof(struct ipv6hdr);
904         } else {
905                 return NULL;
906         }
907
908         tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
909         if (!tcph)
910                 return NULL;
911
912         return skb_header_pointer(skb, offset,
913                                   min(__tcp_hdrlen(tcph), bufsize), buf);
914 }
915
916 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
917                                    int code, int *oplen)
918 {
919         /* inspired by tcp_parse_options in tcp_input.c */
920         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
921         const u8 *ptr = (const u8 *)(tcph + 1);
922
923         while (length > 0) {
924                 int opcode = *ptr++;
925                 int opsize;
926
927                 if (opcode == TCPOPT_EOL)
928                         break;
929                 if (opcode == TCPOPT_NOP) {
930                         length--;
931                         continue;
932                 }
933                 opsize = *ptr++;
934                 if (opsize < 2 || opsize > length)
935                         break;
936
937                 if (opcode == code) {
938                         *oplen = opsize;
939                         return ptr;
940                 }
941
942                 ptr += opsize - 2;
943                 length -= opsize;
944         }
945
946         return NULL;
947 }
948
949 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
950  * bytes than the other. In the case where both sequences ACK bytes that the
951  * other doesn't, A is considered greater. DSACKs in A also make A
952  * considered greater.
953  *
954  * @return -1, 0 or 1 as normal compare functions
955  */
956 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
957                                   const struct tcphdr *tcph_b)
958 {
959         const struct tcp_sack_block_wire *sack_a, *sack_b;
960         u32 ack_seq_a = ntohl(tcph_a->ack_seq);
961         u32 bytes_a = 0, bytes_b = 0;
962         int oplen_a, oplen_b;
963         bool first = true;
964
965         sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
966         sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
967
968         /* pointers point to option contents */
969         oplen_a -= TCPOLEN_SACK_BASE;
970         oplen_b -= TCPOLEN_SACK_BASE;
971
972         if (sack_a && oplen_a >= sizeof(*sack_a) &&
973             (!sack_b || oplen_b < sizeof(*sack_b)))
974                 return -1;
975         else if (sack_b && oplen_b >= sizeof(*sack_b) &&
976                  (!sack_a || oplen_a < sizeof(*sack_a)))
977                 return 1;
978         else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
979                  (!sack_b || oplen_b < sizeof(*sack_b)))
980                 return 0;
981
982         while (oplen_a >= sizeof(*sack_a)) {
983                 const struct tcp_sack_block_wire *sack_tmp = sack_b;
984                 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
985                 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
986                 int oplen_tmp = oplen_b;
987                 bool found = false;
988
989                 /* DSACK; always considered greater to prevent dropping */
990                 if (before(start_a, ack_seq_a))
991                         return -1;
992
993                 bytes_a += end_a - start_a;
994
995                 while (oplen_tmp >= sizeof(*sack_tmp)) {
996                         u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
997                         u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
998
999                         /* first time through we count the total size */
1000                         if (first)
1001                                 bytes_b += end_b - start_b;
1002
1003                         if (!after(start_b, start_a) && !before(end_b, end_a)) {
1004                                 found = true;
1005                                 if (!first)
1006                                         break;
1007                         }
1008                         oplen_tmp -= sizeof(*sack_tmp);
1009                         sack_tmp++;
1010                 }
1011
1012                 if (!found)
1013                         return -1;
1014
1015                 oplen_a -= sizeof(*sack_a);
1016                 sack_a++;
1017                 first = false;
1018         }
1019
1020         /* If we made it this far, all ranges SACKed by A are covered by B, so
1021          * either the SACKs are equal, or B SACKs more bytes.
1022          */
1023         return bytes_b > bytes_a ? 1 : 0;
1024 }
1025
1026 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1027                                  u32 *tsval, u32 *tsecr)
1028 {
1029         const u8 *ptr;
1030         int opsize;
1031
1032         ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1033
1034         if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1035                 *tsval = get_unaligned_be32(ptr);
1036                 *tsecr = get_unaligned_be32(ptr + 4);
1037         }
1038 }
1039
1040 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1041                                u32 tstamp_new, u32 tsecr_new)
1042 {
1043         /* inspired by tcp_parse_options in tcp_input.c */
1044         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1045         const u8 *ptr = (const u8 *)(tcph + 1);
1046         u32 tstamp, tsecr;
1047
1048         /* 3 reserved flags must be unset to avoid future breakage
1049          * ACK must be set
1050          * ECE/CWR are handled separately
1051          * All other flags URG/PSH/RST/SYN/FIN must be unset
1052          * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1053          * 0x00C00000 = CWR/ECE (handled separately)
1054          * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1055          */
1056         if (((tcp_flag_word(tcph) &
1057               cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1058                 return false;
1059
1060         while (length > 0) {
1061                 int opcode = *ptr++;
1062                 int opsize;
1063
1064                 if (opcode == TCPOPT_EOL)
1065                         break;
1066                 if (opcode == TCPOPT_NOP) {
1067                         length--;
1068                         continue;
1069                 }
1070                 opsize = *ptr++;
1071                 if (opsize < 2 || opsize > length)
1072                         break;
1073
1074                 switch (opcode) {
1075                 case TCPOPT_MD5SIG: /* doesn't influence state */
1076                         break;
1077
1078                 case TCPOPT_SACK: /* stricter checking performed later */
1079                         if (opsize % 8 != 2)
1080                                 return false;
1081                         break;
1082
1083                 case TCPOPT_TIMESTAMP:
1084                         /* only drop timestamps lower than new */
1085                         if (opsize != TCPOLEN_TIMESTAMP)
1086                                 return false;
1087                         tstamp = get_unaligned_be32(ptr);
1088                         tsecr = get_unaligned_be32(ptr + 4);
1089                         if (after(tstamp, tstamp_new) ||
1090                             after(tsecr, tsecr_new))
1091                                 return false;
1092                         break;
1093
1094                 case TCPOPT_MSS:  /* these should only be set on SYN */
1095                 case TCPOPT_WINDOW:
1096                 case TCPOPT_SACK_PERM:
1097                 case TCPOPT_FASTOPEN:
1098                 case TCPOPT_EXP:
1099                 default: /* don't drop if any unknown options are present */
1100                         return false;
1101                 }
1102
1103                 ptr += opsize - 2;
1104                 length -= opsize;
1105         }
1106
1107         return true;
1108 }
1109
1110 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1111                                        struct cake_flow *flow)
1112 {
1113         bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1114         struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1115         struct sk_buff *skb_check, *skb_prev = NULL;
1116         const struct ipv6hdr *ipv6h, *ipv6h_check;
1117         unsigned char _tcph[64], _tcph_check[64];
1118         const struct tcphdr *tcph, *tcph_check;
1119         const struct iphdr *iph, *iph_check;
1120         struct ipv6hdr _iph, _iph_check;
1121         const struct sk_buff *skb;
1122         int seglen, num_found = 0;
1123         u32 tstamp = 0, tsecr = 0;
1124         __be32 elig_flags = 0;
1125         int sack_comp;
1126
1127         /* no other possible ACKs to filter */
1128         if (flow->head == flow->tail)
1129                 return NULL;
1130
1131         skb = flow->tail;
1132         tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1133         iph = cake_get_iphdr(skb, &_iph);
1134         if (!tcph)
1135                 return NULL;
1136
1137         cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1138
1139         /* the 'triggering' packet need only have the ACK flag set.
1140          * also check that SYN is not set, as there won't be any previous ACKs.
1141          */
1142         if ((tcp_flag_word(tcph) &
1143              (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1144                 return NULL;
1145
1146         /* the 'triggering' ACK is at the tail of the queue, we have already
1147          * returned if it is the only packet in the flow. loop through the rest
1148          * of the queue looking for pure ACKs with the same 5-tuple as the
1149          * triggering one.
1150          */
1151         for (skb_check = flow->head;
1152              skb_check && skb_check != skb;
1153              skb_prev = skb_check, skb_check = skb_check->next) {
1154                 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1155                 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1156                                              sizeof(_tcph_check));
1157
1158                 /* only TCP packets with matching 5-tuple are eligible, and only
1159                  * drop safe headers
1160                  */
1161                 if (!tcph_check || iph->version != iph_check->version ||
1162                     tcph_check->source != tcph->source ||
1163                     tcph_check->dest != tcph->dest)
1164                         continue;
1165
1166                 if (iph_check->version == 4) {
1167                         if (iph_check->saddr != iph->saddr ||
1168                             iph_check->daddr != iph->daddr)
1169                                 continue;
1170
1171                         seglen = ntohs(iph_check->tot_len) -
1172                                        (4 * iph_check->ihl);
1173                 } else if (iph_check->version == 6) {
1174                         ipv6h = (struct ipv6hdr *)iph;
1175                         ipv6h_check = (struct ipv6hdr *)iph_check;
1176
1177                         if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1178                             ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1179                                 continue;
1180
1181                         seglen = ntohs(ipv6h_check->payload_len);
1182                 } else {
1183                         WARN_ON(1);  /* shouldn't happen */
1184                         continue;
1185                 }
1186
1187                 /* If the ECE/CWR flags changed from the previous eligible
1188                  * packet in the same flow, we should no longer be dropping that
1189                  * previous packet as this would lose information.
1190                  */
1191                 if (elig_ack && (tcp_flag_word(tcph_check) &
1192                                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1193                         elig_ack = NULL;
1194                         elig_ack_prev = NULL;
1195                         num_found--;
1196                 }
1197
1198                 /* Check TCP options and flags, don't drop ACKs with segment
1199                  * data, and don't drop ACKs with a higher cumulative ACK
1200                  * counter than the triggering packet. Check ACK seqno here to
1201                  * avoid parsing SACK options of packets we are going to exclude
1202                  * anyway.
1203                  */
1204                 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1205                     (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1206                     after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1207                         continue;
1208
1209                 /* Check SACK options. The triggering packet must SACK more data
1210                  * than the ACK under consideration, or SACK the same range but
1211                  * have a larger cumulative ACK counter. The latter is a
1212                  * pathological case, but is contained in the following check
1213                  * anyway, just to be safe.
1214                  */
1215                 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1216
1217                 if (sack_comp < 0 ||
1218                     (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1219                      sack_comp == 0))
1220                         continue;
1221
1222                 /* At this point we have found an eligible pure ACK to drop; if
1223                  * we are in aggressive mode, we are done. Otherwise, keep
1224                  * searching unless this is the second eligible ACK we
1225                  * found.
1226                  *
1227                  * Since we want to drop ACK closest to the head of the queue,
1228                  * save the first eligible ACK we find, even if we need to loop
1229                  * again.
1230                  */
1231                 if (!elig_ack) {
1232                         elig_ack = skb_check;
1233                         elig_ack_prev = skb_prev;
1234                         elig_flags = (tcp_flag_word(tcph_check)
1235                                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1236                 }
1237
1238                 if (num_found++ > 0)
1239                         goto found;
1240         }
1241
1242         /* We made it through the queue without finding two eligible ACKs. If
1243          * we found a single eligible ACK we can drop it in aggressive mode if
1244          * we can guarantee that this does not interfere with ECN flag
1245          * information. We ensure this by dropping it only if the enqueued
1246          * packet is consecutive with the eligible ACK, and their flags match.
1247          */
1248         if (elig_ack && aggressive && elig_ack->next == skb &&
1249             (elig_flags == (tcp_flag_word(tcph) &
1250                             (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1251                 goto found;
1252
1253         return NULL;
1254
1255 found:
1256         if (elig_ack_prev)
1257                 elig_ack_prev->next = elig_ack->next;
1258         else
1259                 flow->head = elig_ack->next;
1260
1261         skb_mark_not_on_list(elig_ack);
1262
1263         return elig_ack;
1264 }
1265
1266 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1267 {
1268         avg -= avg >> shift;
1269         avg += sample >> shift;
1270         return avg;
1271 }
1272
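/* Editor's note (illustrative, not upstream): this is a standard exponentially
 * weighted moving average with weight 2^-shift on the new sample, e.g.
 *
 *	avg = cake_ewma(avg, sample, 8);	// new sample worth 1/256
 *
 * which is how cake_overhead() below smooths the observed network-layer
 * offset into avg_netoff.
 */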
1273 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1274 {
1275         if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1276                 len -= off;
1277
1278         if (q->max_netlen < len)
1279                 q->max_netlen = len;
1280         if (q->min_netlen > len)
1281                 q->min_netlen = len;
1282
1283         len += q->rate_overhead;
1284
1285         if (len < q->rate_mpu)
1286                 len = q->rate_mpu;
1287
1288         if (q->atm_mode == CAKE_ATM_ATM) {
1289                 len += 47;
1290                 len /= 48;
1291                 len *= 53;
1292         } else if (q->atm_mode == CAKE_ATM_PTM) {
1293                 /* Add one byte per 64 bytes or part thereof.
1294                  * This is conservative and easier to calculate than the
1295                  * precise value.
1296                  */
1297                 len += (len + 63) / 64;
1298         }
1299
1300         if (q->max_adjlen < len)
1301                 q->max_adjlen = len;
1302         if (q->min_adjlen > len)
1303                 q->min_adjlen = len;
1304
1305         return len;
1306 }
1307
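/* Editor's worked example (not upstream code): the ATM branch rounds the
 * adjusted length up to whole 48-byte cell payloads and charges 53 bytes per
 * cell, while the PTM branch adds the 64/65B encoding cost:
 *
 *	ATM: len = 100  ->  (100 + 47) / 48 = 3 cells  ->  3 * 53 = 159 bytes
 *	PTM: len = 100  ->  100 + (100 + 63) / 64      =  100 + 2  = 102 bytes
 *
 * Both are applied after rate_overhead and rate_mpu, matching how the link
 * layer actually frames the packet.
 */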
1308 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1309 {
1310         const struct skb_shared_info *shinfo = skb_shinfo(skb);
1311         unsigned int hdr_len, last_len = 0;
1312         u32 off = skb_network_offset(skb);
1313         u32 len = qdisc_pkt_len(skb);
1314         u16 segs = 1;
1315
1316         q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1317
1318         if (!shinfo->gso_size)
1319                 return cake_calc_overhead(q, len, off);
1320
1321         /* borrowed from qdisc_pkt_len_init() */
1322         hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1323
1324         /* + transport layer */
1325         if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1326                                                 SKB_GSO_TCPV6))) {
1327                 const struct tcphdr *th;
1328                 struct tcphdr _tcphdr;
1329
1330                 th = skb_header_pointer(skb, skb_transport_offset(skb),
1331                                         sizeof(_tcphdr), &_tcphdr);
1332                 if (likely(th))
1333                         hdr_len += __tcp_hdrlen(th);
1334         } else {
1335                 struct udphdr _udphdr;
1336
1337                 if (skb_header_pointer(skb, skb_transport_offset(skb),
1338                                        sizeof(_udphdr), &_udphdr))
1339                         hdr_len += sizeof(struct udphdr);
1340         }
1341
1342         if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1343                 segs = DIV_ROUND_UP(skb->len - hdr_len,
1344                                     shinfo->gso_size);
1345         else
1346                 segs = shinfo->gso_segs;
1347
1348         len = shinfo->gso_size + hdr_len;
1349         last_len = skb->len - shinfo->gso_size * (segs - 1);
1350
1351         return (cake_calc_overhead(q, len, off) * (segs - 1) +
1352                 cake_calc_overhead(q, last_len, off));
1353 }
1354
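/* Editor's worked example (not upstream code): for a GSO super-packet the
 * overhead is charged per segment that will eventually hit the wire.  E.g. a
 * TSO skb carrying 4000 bytes of payload with gso_size 1448 and hdr_len 66
 * (skb->len == 4066, segs == 3) is accounted as
 *
 *	2 * cake_calc_overhead(q, 1448 + 66, off) +
 *	    cake_calc_overhead(q, 4066 - 2 * 1448, off)
 *
 * i.e. two full 1514-byte segments plus a final 1170-byte one.
 */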
1355 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1356 {
1357         struct cake_heap_entry ii = q->overflow_heap[i];
1358         struct cake_heap_entry jj = q->overflow_heap[j];
1359
1360         q->overflow_heap[i] = jj;
1361         q->overflow_heap[j] = ii;
1362
1363         q->tins[ii.t].overflow_idx[ii.b] = j;
1364         q->tins[jj.t].overflow_idx[jj.b] = i;
1365 }
1366
1367 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1368 {
1369         struct cake_heap_entry ii = q->overflow_heap[i];
1370
1371         return q->tins[ii.t].backlogs[ii.b];
1372 }
1373
1374 static void cake_heapify(struct cake_sched_data *q, u16 i)
1375 {
1376         static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1377         u32 mb = cake_heap_get_backlog(q, i);
1378         u32 m = i;
1379
1380         while (m < a) {
1381                 u32 l = m + m + 1;
1382                 u32 r = l + 1;
1383
1384                 if (l < a) {
1385                         u32 lb = cake_heap_get_backlog(q, l);
1386
1387                         if (lb > mb) {
1388                                 m  = l;
1389                                 mb = lb;
1390                         }
1391                 }
1392
1393                 if (r < a) {
1394                         u32 rb = cake_heap_get_backlog(q, r);
1395
1396                         if (rb > mb) {
1397                                 m  = r;
1398                                 mb = rb;
1399                         }
1400                 }
1401
1402                 if (m != i) {
1403                         cake_heap_swap(q, i, m);
1404                         i = m;
1405                 } else {
1406                         break;
1407                 }
1408         }
1409 }
1410
1411 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1412 {
1413         while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1414                 u16 p = (i - 1) >> 1;
1415                 u32 ib = cake_heap_get_backlog(q, i);
1416                 u32 pb = cake_heap_get_backlog(q, p);
1417
1418                 if (ib > pb) {
1419                         cake_heap_swap(q, i, p);
1420                         i = p;
1421                 } else {
1422                         break;
1423                 }
1424         }
1425 }
1426
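/* Editor's note (illustrative, not upstream): overflow_heap[] is a binary
 * max-heap over the backlogs of every (tin, queue) pair, stored in the usual
 * implicit array form: node i has children 2i + 1 and 2i + 2 and parent
 * (i - 1) / 2, so e.g. node 5 has children 11 and 12 and parent 2.
 * cake_heapify() sifts an entry down, cake_heapify_up() sifts it up, and the
 * root (index 0) is always the longest queue, which cake_drop() below prunes
 * first when the buffer overflows.
 */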
1427 static int cake_advance_shaper(struct cake_sched_data *q,
1428                                struct cake_tin_data *b,
1429                                struct sk_buff *skb,
1430                                ktime_t now, bool drop)
1431 {
1432         u32 len = get_cobalt_cb(skb)->adjusted_len;
1433
1434         /* charge packet bandwidth to this tin
1435          * and to the global shaper.
1436          */
1437         if (q->rate_ns) {
1438                 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1439                 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1440                 u64 failsafe_dur = global_dur + (global_dur >> 1);
1441
1442                 if (ktime_before(b->time_next_packet, now))
1443                         b->time_next_packet = ktime_add_ns(b->time_next_packet,
1444                                                            tin_dur);
1445
1446                 else if (ktime_before(b->time_next_packet,
1447                                       ktime_add_ns(now, tin_dur)))
1448                         b->time_next_packet = ktime_add_ns(now, tin_dur);
1449
1450                 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1451                                                    global_dur);
1452                 if (!drop)
1453                         q->failsafe_next_packet =
1454                                 ktime_add_ns(q->failsafe_next_packet,
1455                                              failsafe_dur);
1456         }
1457         return len;
1458 }
1459
1460 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1461 {
1462         struct cake_sched_data *q = qdisc_priv(sch);
1463         ktime_t now = ktime_get();
1464         u32 idx = 0, tin = 0, len;
1465         struct cake_heap_entry qq;
1466         struct cake_tin_data *b;
1467         struct cake_flow *flow;
1468         struct sk_buff *skb;
1469
1470         if (!q->overflow_timeout) {
1471                 int i;
1472                 /* Build fresh max-heap */
1473                 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1474                         cake_heapify(q, i);
1475         }
1476         q->overflow_timeout = 65535;
1477
1478         /* select longest queue for pruning */
1479         qq  = q->overflow_heap[0];
1480         tin = qq.t;
1481         idx = qq.b;
1482
1483         b = &q->tins[tin];
1484         flow = &b->flows[idx];
1485         skb = dequeue_head(flow);
1486         if (unlikely(!skb)) {
1487                 /* heap has gone wrong, rebuild it next time */
1488                 q->overflow_timeout = 0;
1489                 return idx + (tin << 16);
1490         }
1491
1492         if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1493                 b->unresponsive_flow_count++;
1494
1495         len = qdisc_pkt_len(skb);
1496         q->buffer_used      -= skb->truesize;
1497         b->backlogs[idx]    -= len;
1498         b->tin_backlog      -= len;
1499         sch->qstats.backlog -= len;
1500         qdisc_tree_reduce_backlog(sch, 1, len);
1501
1502         flow->dropped++;
1503         b->tin_dropped++;
1504         sch->qstats.drops++;
1505
1506         if (q->rate_flags & CAKE_FLAG_INGRESS)
1507                 cake_advance_shaper(q, b, skb, now, true);
1508
1509         __qdisc_drop(skb, to_free);
1510         sch->q.qlen--;
1511
1512         cake_heapify(q, 0);
1513
1514         return idx + (tin << 16);
1515 }
1516
1517 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1518 {
1519         int wlen = skb_network_offset(skb);
1520         u8 dscp;
1521
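        /* Example: an IPv4 packet marked EF carries dsfield 0xb8, so
         * dscp = 0xb8 >> 2 = 46.  With "wash" enabled the DSCP bits are
         * cleared while INET_ECN_MASK preserves the two ECN bits, so
         * congestion marking still works on washed traffic.
         */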
1522         switch (tc_skb_protocol(skb)) {
1523         case htons(ETH_P_IP):
1524                 wlen += sizeof(struct iphdr);
1525                 if (!pskb_may_pull(skb, wlen) ||
1526                     skb_try_make_writable(skb, wlen))
1527                         return 0;
1528
1529                 dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1530                 if (wash && dscp)
1531                         ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1532                 return dscp;
1533
1534         case htons(ETH_P_IPV6):
1535                 wlen += sizeof(struct ipv6hdr);
1536                 if (!pskb_may_pull(skb, wlen) ||
1537                     skb_try_make_writable(skb, wlen))
1538                         return 0;
1539
1540                 dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1541                 if (wash && dscp)
1542                         ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1543                 return dscp;
1544
1545         case htons(ETH_P_ARP):
1546                 return 0x38;  /* CS7 - Net Control */
1547
1548         default:
1549                 /* If there is no Diffserv field, treat as best-effort */
1550                 return 0;
1551         }
1552 }
1553
1554 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1555                                              struct sk_buff *skb)
1556 {
1557         struct cake_sched_data *q = qdisc_priv(sch);
1558         u32 tin, mark;
1559         u8 dscp;
1560
1561         /* Tin selection: Default to diffserv-based selection, allow overriding
1562          * using firewall marks or skb->priority.
1563          */
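        /* For illustration: a fwmark mask of 0xff00 (TCA_CAKE_FWMARK) gives
         * fwmark_shft = __ffs(0xff00) = 8, so skb->mark 0x0200 maps to
         * mark 2 and selects q->tin_order[1] below, skipping the DSCP lookup.
         * Marks of zero or above tin_cnt fall through to the priority- and
         * DSCP-based selection, and besteffort mode overrides everything.
         */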
1564         dscp = cake_handle_diffserv(skb,
1565                                     q->rate_flags & CAKE_FLAG_WASH);
1566         mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1567
1568         if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1569                 tin = 0;
1570
1571         else if (mark && mark <= q->tin_cnt)
1572                 tin = q->tin_order[mark - 1];
1573
1574         else if (TC_H_MAJ(skb->priority) == sch->handle &&
1575                  TC_H_MIN(skb->priority) > 0 &&
1576                  TC_H_MIN(skb->priority) <= q->tin_cnt)
1577                 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1578
1579         else {
1580                 tin = q->tin_index[dscp];
1581
1582                 if (unlikely(tin >= q->tin_cnt))
1583                         tin = 0;
1584         }
1585
1586         return &q->tins[tin];
1587 }
1588
1589 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1590                          struct sk_buff *skb, int flow_mode, int *qerr)
1591 {
1592         struct cake_sched_data *q = qdisc_priv(sch);
1593         struct tcf_proto *filter;
1594         struct tcf_result res;
1595         u16 flow = 0, host = 0;
1596         int result;
1597
1598         filter = rcu_dereference_bh(q->filter_list);
1599         if (!filter)
1600                 goto hash;
1601
1602         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1603         result = tcf_classify(skb, filter, &res, false);
1604
1605         if (result >= 0) {
1606 #ifdef CONFIG_NET_CLS_ACT
1607                 switch (result) {
1608                 case TC_ACT_STOLEN:
1609                 case TC_ACT_QUEUED:
1610                 case TC_ACT_TRAP:
1611                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1612                         /* fall through */
1613                 case TC_ACT_SHOT:
1614                         return 0;
1615                 }
1616 #endif
1617                 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1618                         flow = TC_H_MIN(res.classid);
1619                 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1620                         host = TC_H_MAJ(res.classid) >> 16;
1621         }
1622 hash:
1623         *t = cake_select_tin(sch, skb);
1624         return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1625 }
1626
1627 static void cake_reconfigure(struct Qdisc *sch);
1628
1629 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1630                         struct sk_buff **to_free)
1631 {
1632         struct cake_sched_data *q = qdisc_priv(sch);
1633         int len = qdisc_pkt_len(skb);
1634         int uninitialized_var(ret);
1635         struct sk_buff *ack = NULL;
1636         ktime_t now = ktime_get();
1637         struct cake_tin_data *b;
1638         struct cake_flow *flow;
1639         u32 idx;
1640
1641         /* choose flow to insert into */
1642         idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1643         if (idx == 0) {
1644                 if (ret & __NET_XMIT_BYPASS)
1645                         qdisc_qstats_drop(sch);
1646                 __qdisc_drop(skb, to_free);
1647                 return ret;
1648         }
1649         idx--;
1650         flow = &b->flows[idx];
1651
1652         /* ensure shaper state isn't stale */
1653         if (!b->tin_backlog) {
1654                 if (ktime_before(b->time_next_packet, now))
1655                         b->time_next_packet = now;
1656
1657                 if (!sch->q.qlen) {
1658                         if (ktime_before(q->time_next_packet, now)) {
1659                                 q->failsafe_next_packet = now;
1660                                 q->time_next_packet = now;
1661                         } else if (ktime_after(q->time_next_packet, now) &&
1662                                    ktime_after(q->failsafe_next_packet, now)) {
1663                                 u64 next =
1664                                         min(ktime_to_ns(q->time_next_packet),
1665                                             ktime_to_ns(
1666                                                    q->failsafe_next_packet));
1667                                 sch->qstats.overlimits++;
1668                                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1669                         }
1670                 }
1671         }
1672
1673         if (unlikely(len > b->max_skblen))
1674                 b->max_skblen = len;
1675
1676         if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1677                 struct sk_buff *segs, *nskb;
1678                 netdev_features_t features = netif_skb_features(skb);
1679                 unsigned int slen = 0, numsegs = 0;
1680
1681                 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1682                 if (IS_ERR_OR_NULL(segs))
1683                         return qdisc_drop(skb, sch, to_free);
1684
1685                 skb_list_walk_safe(segs, segs, nskb) {
1686                         skb_mark_not_on_list(segs);
1687                         qdisc_skb_cb(segs)->pkt_len = segs->len;
1688                         cobalt_set_enqueue_time(segs, now);
1689                         get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1690                                                                           segs);
1691                         flow_queue_add(flow, segs);
1692
1693                         sch->q.qlen++;
1694                         numsegs++;
1695                         slen += segs->len;
1696                         q->buffer_used += segs->truesize;
1697                         b->packets++;
1698                 }
1699
1700                 /* stats */
1701                 b->bytes            += slen;
1702                 b->backlogs[idx]    += slen;
1703                 b->tin_backlog      += slen;
1704                 sch->qstats.backlog += slen;
1705                 q->avg_window_bytes += slen;
1706
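                /* Ancestor qdiscs accounted one packet of the original
                 * length; the negative deltas here (1 - numsegs packets,
                 * len - slen bytes) grow their qlen/backlog to match the
                 * segments actually queued above.
                 */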
1707                 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1708                 consume_skb(skb);
1709         } else {
1710                 /* not splitting */
1711                 cobalt_set_enqueue_time(skb, now);
1712                 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1713                 flow_queue_add(flow, skb);
1714
1715                 if (q->ack_filter)
1716                         ack = cake_ack_filter(q, flow);
1717
1718                 if (ack) {
1719                         b->ack_drops++;
1720                         sch->qstats.drops++;
1721                         b->bytes += qdisc_pkt_len(ack);
1722                         len -= qdisc_pkt_len(ack);
1723                         q->buffer_used += skb->truesize - ack->truesize;
1724                         if (q->rate_flags & CAKE_FLAG_INGRESS)
1725                                 cake_advance_shaper(q, b, ack, now, true);
1726
1727                         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1728                         consume_skb(ack);
1729                 } else {
1730                         sch->q.qlen++;
1731                         q->buffer_used      += skb->truesize;
1732                 }
1733
1734                 /* stats */
1735                 b->packets++;
1736                 b->bytes            += len;
1737                 b->backlogs[idx]    += len;
1738                 b->tin_backlog      += len;
1739                 sch->qstats.backlog += len;
1740                 q->avg_window_bytes += len;
1741         }
1742
1743         if (q->overflow_timeout)
1744                 cake_heapify_up(q, b->overflow_idx[idx]);
1745
1746         /* incoming bandwidth capacity estimate */
1747         if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1748                 u64 packet_interval =
1749                         ktime_to_ns(ktime_sub(now, q->last_packet_time));
1750
1751                 if (packet_interval > NSEC_PER_SEC)
1752                         packet_interval = NSEC_PER_SEC;
1753
1754                 /* filter out short-term bursts, eg. wifi aggregation */
1755                 q->avg_packet_interval =
1756                         cake_ewma(q->avg_packet_interval,
1757                                   packet_interval,
1758                                   (packet_interval > q->avg_packet_interval ?
1759                                           2 : 8));
1760
1761                 q->last_packet_time = now;
1762
1763                 if (packet_interval > q->avg_packet_interval) {
1764                         u64 window_interval =
1765                                 ktime_to_ns(ktime_sub(now,
1766                                                       q->avg_window_begin));
1767                         u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1768
1769                         b = div64_u64(b, window_interval);
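                        /* b is an instantaneous estimate in bytes per second,
                         * e.g. 1,000,000 bytes seen over a 100 ms window gives
                         * 10,000,000 B/s (80 Mbit/s).  The EWMA below tracks
                         * the peak, and the shaper rate is refreshed to 15/16
                         * (93.75%) of that peak, rate-limited by the 250 ms
                         * guard on last_reconfig_time.
                         */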
1770                         q->avg_peak_bandwidth =
1771                                 cake_ewma(q->avg_peak_bandwidth, b,
1772                                           b > q->avg_peak_bandwidth ? 2 : 8);
1773                         q->avg_window_bytes = 0;
1774                         q->avg_window_begin = now;
1775
1776                         if (ktime_after(now,
1777                                         ktime_add_ms(q->last_reconfig_time,
1778                                                      250))) {
1779                                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1780                                 cake_reconfigure(sch);
1781                         }
1782                 }
1783         } else {
1784                 q->avg_window_bytes = 0;
1785                 q->last_packet_time = now;
1786         }
1787
1788         /* flowchain */
1789         if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1790                 struct cake_host *srchost = &b->hosts[flow->srchost];
1791                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1792                 u16 host_load = 1;
1793
1794                 if (!flow->set) {
1795                         list_add_tail(&flow->flowchain, &b->new_flows);
1796                 } else {
1797                         b->decaying_flow_count--;
1798                         list_move_tail(&flow->flowchain, &b->new_flows);
1799                 }
1800                 flow->set = CAKE_SET_SPARSE;
1801                 b->sparse_flow_count++;
1802
1803                 if (cake_dsrc(q->flow_mode))
1804                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
1805
1806                 if (cake_ddst(q->flow_mode))
1807                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1808
1809                 flow->deficit = (b->flow_quantum *
1810                                  quantum_div[host_load]) >> 16;
1811         } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1812                 struct cake_host *srchost = &b->hosts[flow->srchost];
1813                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1814
1815                 /* this flow was empty, accounted as a sparse flow, but actually
1816                  * in the bulk rotation.
1817                  */
1818                 flow->set = CAKE_SET_BULK;
1819                 b->sparse_flow_count--;
1820                 b->bulk_flow_count++;
1821
1822                 if (cake_dsrc(q->flow_mode))
1823                         srchost->srchost_bulk_flow_count++;
1824
1825                 if (cake_ddst(q->flow_mode))
1826                         dsthost->dsthost_bulk_flow_count++;
1827
1828         }
1829
1830         if (q->buffer_used > q->buffer_max_used)
1831                 q->buffer_max_used = q->buffer_used;
1832
1833         if (q->buffer_used > q->buffer_limit) {
1834                 u32 dropped = 0;
1835
1836                 while (q->buffer_used > q->buffer_limit) {
1837                         dropped++;
1838                         cake_drop(sch, to_free);
1839                 }
1840                 b->drop_overlimit += dropped;
1841         }
1842         return NET_XMIT_SUCCESS;
1843 }
1844
1845 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1846 {
1847         struct cake_sched_data *q = qdisc_priv(sch);
1848         struct cake_tin_data *b = &q->tins[q->cur_tin];
1849         struct cake_flow *flow = &b->flows[q->cur_flow];
1850         struct sk_buff *skb = NULL;
1851         u32 len;
1852
1853         if (flow->head) {
1854                 skb = dequeue_head(flow);
1855                 len = qdisc_pkt_len(skb);
1856                 b->backlogs[q->cur_flow] -= len;
1857                 b->tin_backlog           -= len;
1858                 sch->qstats.backlog      -= len;
1859                 q->buffer_used           -= skb->truesize;
1860                 sch->q.qlen--;
1861
1862                 if (q->overflow_timeout)
1863                         cake_heapify(q, b->overflow_idx[q->cur_flow]);
1864         }
1865         return skb;
1866 }
1867
1868 /* Discard leftover packets from a tin no longer in use. */
1869 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1870 {
1871         struct cake_sched_data *q = qdisc_priv(sch);
1872         struct sk_buff *skb;
1873
1874         q->cur_tin = tin;
1875         for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1876                 while (!!(skb = cake_dequeue_one(sch)))
1877                         kfree_skb(skb);
1878 }
1879
1880 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1881 {
1882         struct cake_sched_data *q = qdisc_priv(sch);
1883         struct cake_tin_data *b = &q->tins[q->cur_tin];
1884         struct cake_host *srchost, *dsthost;
1885         ktime_t now = ktime_get();
1886         struct cake_flow *flow;
1887         struct list_head *head;
1888         bool first_flow = true;
1889         struct sk_buff *skb;
1890         u16 host_load;
1891         u64 delay;
1892         u32 len;
1893
1894 begin:
1895         if (!sch->q.qlen)
1896                 return NULL;
1897
1898         /* global hard shaper */
1899         if (ktime_after(q->time_next_packet, now) &&
1900             ktime_after(q->failsafe_next_packet, now)) {
1901                 u64 next = min(ktime_to_ns(q->time_next_packet),
1902                                ktime_to_ns(q->failsafe_next_packet));
1903
1904                 sch->qstats.overlimits++;
1905                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1906                 return NULL;
1907         }
1908
1909         /* Choose a class to work on. */
1910         if (!q->rate_ns) {
1911                 /* In unlimited mode, can't rely on shaper timings, just balance
1912                  * with DRR
1913                  */
1914                 bool wrapped = false, empty = true;
1915
1916                 while (b->tin_deficit < 0 ||
1917                        !(b->sparse_flow_count + b->bulk_flow_count)) {
1918                         if (b->tin_deficit <= 0)
1919                                 b->tin_deficit += b->tin_quantum;
1920                         if (b->sparse_flow_count + b->bulk_flow_count)
1921                                 empty = false;
1922
1923                         q->cur_tin++;
1924                         b++;
1925                         if (q->cur_tin >= q->tin_cnt) {
1926                                 q->cur_tin = 0;
1927                                 b = q->tins;
1928
1929                                 if (wrapped) {
1930                                         /* It's possible for q->qlen to be
1931                                          * nonzero when we actually have no
1932                                          * packets anywhere.
1933                                          */
1934                                         if (empty)
1935                                                 return NULL;
1936                                 } else {
1937                                         wrapped = true;
1938                                 }
1939                         }
1940                 }
1941         } else {
1942                 /* In shaped mode, choose:
1943                  * - Highest-priority tin with queue and meeting schedule, or
1944                  * - The earliest-scheduled tin with queue.
1945                  */
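                /* Note the <= comparisons: if several tins are already due
                 * (time_to_pkt <= 0) the later tin in the scan overwrites the
                 * earlier one, and if none are due the <= on best_time breaks
                 * ties the same way, so the two rules above are implemented
                 * in a single pass.
                 */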
1946                 ktime_t best_time = KTIME_MAX;
1947                 int tin, best_tin = 0;
1948
1949                 for (tin = 0; tin < q->tin_cnt; tin++) {
1950                         b = q->tins + tin;
1951                         if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1952                                 ktime_t time_to_pkt =
1953                                         ktime_sub(b->time_next_packet, now);
1954
1955                                 if (ktime_to_ns(time_to_pkt) <= 0 ||
1956                                     ktime_compare(time_to_pkt,
1957                                                   best_time) <= 0) {
1958                                         best_time = time_to_pkt;
1959                                         best_tin = tin;
1960                                 }
1961                         }
1962                 }
1963
1964                 q->cur_tin = best_tin;
1965                 b = q->tins + best_tin;
1966
1967                 /* No point in going further if no packets to deliver. */
1968                 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1969                         return NULL;
1970         }
1971
1972 retry:
1973         /* service this class */
1974         head = &b->decaying_flows;
1975         if (!first_flow || list_empty(head)) {
1976                 head = &b->new_flows;
1977                 if (list_empty(head)) {
1978                         head = &b->old_flows;
1979                         if (unlikely(list_empty(head))) {
1980                                 head = &b->decaying_flows;
1981                                 if (unlikely(list_empty(head)))
1982                                         goto begin;
1983                         }
1984                 }
1985         }
1986         flow = list_first_entry(head, struct cake_flow, flowchain);
1987         q->cur_flow = flow - b->flows;
1988         first_flow = false;
1989
1990         /* triple isolation (modified DRR++) */
1991         srchost = &b->hosts[flow->srchost];
1992         dsthost = &b->hosts[flow->dsthost];
1993         host_load = 1;
1994
1995         /* flow isolation (DRR++) */
1996         if (flow->deficit <= 0) {
1997                 /* Keep all flows with deficits out of the sparse and decaying
1998                  * rotations.  No non-empty flow can go into the decaying
1999                  * rotation, so they can't get deficits.
2000                  */
2001                 if (flow->set == CAKE_SET_SPARSE) {
2002                         if (flow->head) {
2003                                 b->sparse_flow_count--;
2004                                 b->bulk_flow_count++;
2005
2006                                 if (cake_dsrc(q->flow_mode))
2007                                         srchost->srchost_bulk_flow_count++;
2008
2009                                 if (cake_ddst(q->flow_mode))
2010                                         dsthost->dsthost_bulk_flow_count++;
2011
2012                                 flow->set = CAKE_SET_BULK;
2013                         } else {
2014                                 /* we've moved it to the bulk rotation for
2015                                  * correct deficit accounting but we still want
2016                                  * to count it as a sparse flow, not a bulk one.
2017                                  */
2018                                 flow->set = CAKE_SET_SPARSE_WAIT;
2019                         }
2020                 }
2021
2022                 if (cake_dsrc(q->flow_mode))
2023                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
2024
2025                 if (cake_ddst(q->flow_mode))
2026                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2027
2028                 WARN_ON(host_load > CAKE_QUEUES);
2029
2030                 /* The shifted prandom_u32() is a way to apply dithering to
2031                  * avoid accumulating roundoff errors
2032                  */
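                /* For example, with flow_quantum 1514 and host_load 3,
                 * quantum_div[3] = 65535 / 3 = 21845 and the product 33073330
                 * is 504 * 65536 + 43186; adding prandom_u32() >> 16 (a value
                 * in [0, 65535]) before the final shift yields 505 about 66%
                 * of the time and 504 otherwise, averaging ~504.67, i.e.
                 * 1514 / 3 without systematic rounding loss.
                 */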
2033                 flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2034                                   (prandom_u32() >> 16)) >> 16;
2035                 list_move_tail(&flow->flowchain, &b->old_flows);
2036
2037                 goto retry;
2038         }
2039
2040         /* Retrieve a packet via the AQM */
2041         while (1) {
2042                 skb = cake_dequeue_one(sch);
2043                 if (!skb) {
2044                         /* this queue was actually empty */
2045                         if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2046                                 b->unresponsive_flow_count--;
2047
2048                         if (flow->cvars.p_drop || flow->cvars.count ||
2049                             ktime_before(now, flow->cvars.drop_next)) {
2050                                 /* keep in the flowchain until the state has
2051                                  * decayed to rest
2052                                  */
2053                                 list_move_tail(&flow->flowchain,
2054                                                &b->decaying_flows);
2055                                 if (flow->set == CAKE_SET_BULK) {
2056                                         b->bulk_flow_count--;
2057
2058                                         if (cake_dsrc(q->flow_mode))
2059                                                 srchost->srchost_bulk_flow_count--;
2060
2061                                         if (cake_ddst(q->flow_mode))
2062                                                 dsthost->dsthost_bulk_flow_count--;
2063
2064                                         b->decaying_flow_count++;
2065                                 } else if (flow->set == CAKE_SET_SPARSE ||
2066                                            flow->set == CAKE_SET_SPARSE_WAIT) {
2067                                         b->sparse_flow_count--;
2068                                         b->decaying_flow_count++;
2069                                 }
2070                                 flow->set = CAKE_SET_DECAYING;
2071                         } else {
2072                                 /* remove empty queue from the flowchain */
2073                                 list_del_init(&flow->flowchain);
2074                                 if (flow->set == CAKE_SET_SPARSE ||
2075                                     flow->set == CAKE_SET_SPARSE_WAIT)
2076                                         b->sparse_flow_count--;
2077                                 else if (flow->set == CAKE_SET_BULK) {
2078                                         b->bulk_flow_count--;
2079
2080                                         if (cake_dsrc(q->flow_mode))
2081                                                 srchost->srchost_bulk_flow_count--;
2082
2083                                         if (cake_ddst(q->flow_mode))
2084                                                 dsthost->dsthost_bulk_flow_count--;
2085
2086                                 } else
2087                                         b->decaying_flow_count--;
2088
2089                                 flow->set = CAKE_SET_NONE;
2090                         }
2091                         goto begin;
2092                 }
2093
2094                 /* Last packet in queue may be marked, shouldn't be dropped */
2095                 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2096                                         (b->bulk_flow_count *
2097                                          !!(q->rate_flags &
2098                                             CAKE_FLAG_INGRESS))) ||
2099                     !flow->head)
2100                         break;
2101
2102                 /* drop this packet, get another one */
2103                 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2104                         len = cake_advance_shaper(q, b, skb,
2105                                                   now, true);
2106                         flow->deficit -= len;
2107                         b->tin_deficit -= len;
2108                 }
2109                 flow->dropped++;
2110                 b->tin_dropped++;
2111                 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2112                 qdisc_qstats_drop(sch);
2113                 kfree_skb(skb);
2114                 if (q->rate_flags & CAKE_FLAG_INGRESS)
2115                         goto retry;
2116         }
2117
2118         b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2119         qdisc_bstats_update(sch, skb);
2120
2121         /* collect delay stats */
2122         delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2123         b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2124         b->peak_delay = cake_ewma(b->peak_delay, delay,
2125                                   delay > b->peak_delay ? 2 : 8);
2126         b->base_delay = cake_ewma(b->base_delay, delay,
2127                                   delay < b->base_delay ? 2 : 8);
2128
2129         len = cake_advance_shaper(q, b, skb, now, false);
2130         flow->deficit -= len;
2131         b->tin_deficit -= len;
2132
2133         if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2134                 u64 next = min(ktime_to_ns(q->time_next_packet),
2135                                ktime_to_ns(q->failsafe_next_packet));
2136
2137                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2138         } else if (!sch->q.qlen) {
2139                 int i;
2140
2141                 for (i = 0; i < q->tin_cnt; i++) {
2142                         if (q->tins[i].decaying_flow_count) {
2143                                 ktime_t next =
2144                                         ktime_add_ns(now,
2145                                                      q->tins[i].cparams.target);
2146
2147                                 qdisc_watchdog_schedule_ns(&q->watchdog,
2148                                                            ktime_to_ns(next));
2149                                 break;
2150                         }
2151                 }
2152         }
2153
2154         if (q->overflow_timeout)
2155                 q->overflow_timeout--;
2156
2157         return skb;
2158 }
2159
2160 static void cake_reset(struct Qdisc *sch)
2161 {
2162         u32 c;
2163
2164         for (c = 0; c < CAKE_MAX_TINS; c++)
2165                 cake_clear_tin(sch, c);
2166 }
2167
2168 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2169         [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2170         [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2171         [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2172         [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2173         [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2174         [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2175         [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2176         [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2177         [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2178         [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2179         [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2180         [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2181         [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2182         [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2183         [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
2184         [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
2185         [TCA_CAKE_FWMARK]        = { .type = NLA_U32 },
2186 };
2187
2188 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2189                           u64 target_ns, u64 rtt_est_ns)
2190 {
2191         /* convert byte-rate into time-per-byte
2192          * so it will always unwedge in reasonable time.
2193          */
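        /* Fixed-point example (rate is in bytes per second, cf. the autorate
         * estimator in cake_enqueue()): rate = 12,500,000 B/s (100 Mbit/s)
         * gives rate_ns = (NSEC_PER_SEC << 34) / rate = 80 << 34, which the
         * loop below normalises to 80 << 27 with rate_shft = 27, i.e. 80 ns
         * per byte.  An MTU-sized byte_target of 1514 then makes
         * byte_target_ns ~= 121 us and cparams.target stays at the 5 ms
         * default; at 1 Mbit/s the same math gives ~12 ms and target grows to
         * ~18 ms, so the AQM target scales automatically with the rate.
         */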
2194         static const u64 MIN_RATE = 64;
2195         u32 byte_target = mtu;
2196         u64 byte_target_ns;
2197         u8  rate_shft = 0;
2198         u64 rate_ns = 0;
2199
2200         b->flow_quantum = 1514;
2201         if (rate) {
2202                 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2203                 rate_shft = 34;
2204                 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2205                 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2206                 while (!!(rate_ns >> 34)) {
2207                         rate_ns >>= 1;
2208                         rate_shft--;
2209                 }
2210         } /* else unlimited, ie. zero delay */
2211
2212         b->tin_rate_bps  = rate;
2213         b->tin_rate_ns   = rate_ns;
2214         b->tin_rate_shft = rate_shft;
2215
2216         byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2217
2218         b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2219         b->cparams.interval = max(rtt_est_ns +
2220                                      b->cparams.target - target_ns,
2221                                      b->cparams.target * 2);
2222         b->cparams.mtu_time = byte_target_ns;
2223         b->cparams.p_inc = 1 << 24; /* 1/256 */
2224         b->cparams.p_dec = 1 << 20; /* 1/4096 */
2225 }
2226
2227 static int cake_config_besteffort(struct Qdisc *sch)
2228 {
2229         struct cake_sched_data *q = qdisc_priv(sch);
2230         struct cake_tin_data *b = &q->tins[0];
2231         u32 mtu = psched_mtu(qdisc_dev(sch));
2232         u64 rate = q->rate_bps;
2233
2234         q->tin_cnt = 1;
2235
2236         q->tin_index = besteffort;
2237         q->tin_order = normal_order;
2238
2239         cake_set_rate(b, rate, mtu,
2240                       us_to_ns(q->target), us_to_ns(q->interval));
2241         b->tin_quantum = 65535;
2242
2243         return 0;
2244 }
2245
2246 static int cake_config_precedence(struct Qdisc *sch)
2247 {
2248         /* convert high-level (user visible) parameters into internal format */
2249         struct cake_sched_data *q = qdisc_priv(sch);
2250         u32 mtu = psched_mtu(qdisc_dev(sch));
2251         u64 rate = q->rate_bps;
2252         u32 quantum = 256;
2253         u32 i;
2254
2255         q->tin_cnt = 8;
2256         q->tin_index = precedence;
2257         q->tin_order = normal_order;
2258
2259         for (i = 0; i < q->tin_cnt; i++) {
2260                 struct cake_tin_data *b = &q->tins[i];
2261
2262                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2263                               us_to_ns(q->interval));
2264
2265                 b->tin_quantum = max_t(u16, 1U, quantum);
2266
2267                 /* calculate next class's parameters */
2268                 rate  *= 7;
2269                 rate >>= 3;
2270
2271                 quantum  *= 7;
2272                 quantum >>= 3;
2273         }
2274
2275         return 0;
2276 }
2277
2278 /*      List of known Diffserv codepoints:
2279  *
2280  *      Least Effort (CS1)
2281  *      Best Effort (CS0)
2282  *      Max Reliability & LLT "Lo" (TOS1)
2283  *      Max Throughput (TOS2)
2284  *      Min Delay (TOS4)
2285  *      LLT "La" (TOS5)
2286  *      Assured Forwarding 1 (AF1x) - x3
2287  *      Assured Forwarding 2 (AF2x) - x3
2288  *      Assured Forwarding 3 (AF3x) - x3
2289  *      Assured Forwarding 4 (AF4x) - x3
2290  *      Precedence Class 2 (CS2)
2291  *      Precedence Class 3 (CS3)
2292  *      Precedence Class 4 (CS4)
2293  *      Precedence Class 5 (CS5)
2294  *      Precedence Class 6 (CS6)
2295  *      Precedence Class 7 (CS7)
2296  *      Voice Admit (VA)
2297  *      Expedited Forwarding (EF)
2298  *
2299  *      Total 25 codepoints.
2300  */
2301
2302 /*      List of traffic classes in RFC 4594:
2303  *              (roughly descending order of contended priority)
2304  *              (roughly ascending order of uncontended throughput)
2305  *
2306  *      Network Control (CS6,CS7)      - routing traffic
2307  *      Telephony (EF,VA)         - aka. VoIP streams
2308  *      Signalling (CS5)               - VoIP setup
2309  *      Multimedia Conferencing (AF4x) - aka. video calls
2310  *      Realtime Interactive (CS4)     - eg. games
2311  *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2312  *      Broadcast Video (CS3)
2313  *      Low Latency Data (AF2x,TOS4)      - eg. database
2314  *      Ops, Admin, Management (CS2,TOS1) - eg. ssh
2315  *      Standard Service (CS0 & unrecognised codepoints)
2316  *      High Throughput Data (AF1x,TOS2)  - eg. web traffic
2317  *      Low Priority Data (CS1)           - eg. BitTorrent
2318  *
2319  *      Total 12 traffic classes.
2320  */
2321
2322 static int cake_config_diffserv8(struct Qdisc *sch)
2323 {
2324 /*      Pruned list of traffic classes for typical applications:
2325  *
2326  *              Network Control          (CS6, CS7)
2327  *              Minimum Latency          (EF, VA, CS5, CS4)
2328  *              Interactive Shell        (CS2, TOS1)
2329  *              Low Latency Transactions (AF2x, TOS4)
2330  *              Video Streaming          (AF4x, AF3x, CS3)
2331  *              Bog Standard             (CS0 etc.)
2332  *              High Throughput          (AF1x, TOS2)
2333  *              Background Traffic       (CS1)
2334  *
2335  *              Total 8 traffic classes.
2336  */
2337
2338         struct cake_sched_data *q = qdisc_priv(sch);
2339         u32 mtu = psched_mtu(qdisc_dev(sch));
2340         u64 rate = q->rate_bps;
2341         u32 quantum = 256;
2342         u32 i;
2343
2344         q->tin_cnt = 8;
2345
2346         /* codepoint to class mapping */
2347         q->tin_index = diffserv8;
2348         q->tin_order = normal_order;
2349
2350         /* class characteristics */
2351         for (i = 0; i < q->tin_cnt; i++) {
2352                 struct cake_tin_data *b = &q->tins[i];
2353
2354                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2355                               us_to_ns(q->interval));
2356
2357                 b->tin_quantum = max_t(u16, 1U, quantum);
2358
2359                 /* calculate next class's parameters */
2360                 rate  *= 7;
2361                 rate >>= 3;
2362
2363                 quantum  *= 7;
2364                 quantum >>= 3;
2365         }
2366
2367         return 0;
2368 }
2369
2370 static int cake_config_diffserv4(struct Qdisc *sch)
2371 {
2372 /*  Further pruned list of traffic classes for four-class system:
2373  *
2374  *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2375  *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2376  *          Best Effort        (CS0, AF1x, TOS2, and those not specified)
2377  *          Background Traffic (CS1)
2378  *
2379  *              Total 4 traffic classes.
2380  */
2381
2382         struct cake_sched_data *q = qdisc_priv(sch);
2383         u32 mtu = psched_mtu(qdisc_dev(sch));
2384         u64 rate = q->rate_bps;
2385         u32 quantum = 1024;
2386
2387         q->tin_cnt = 4;
2388
2389         /* codepoint to class mapping */
2390         q->tin_index = diffserv4;
2391         q->tin_order = bulk_order;
2392
2393         /* class characteristics */
2394         cake_set_rate(&q->tins[0], rate, mtu,
2395                       us_to_ns(q->target), us_to_ns(q->interval));
2396         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2397                       us_to_ns(q->target), us_to_ns(q->interval));
2398         cake_set_rate(&q->tins[2], rate >> 1, mtu,
2399                       us_to_ns(q->target), us_to_ns(q->interval));
2400         cake_set_rate(&q->tins[3], rate >> 2, mtu,
2401                       us_to_ns(q->target), us_to_ns(q->interval));
2402
2403         /* bandwidth-sharing weights */
2404         q->tins[0].tin_quantum = quantum;
2405         q->tins[1].tin_quantum = quantum >> 4;
2406         q->tins[2].tin_quantum = quantum >> 1;
2407         q->tins[3].tin_quantum = quantum >> 2;
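        /* Net effect: the four tins get shaper thresholds of 100%, 6.25%,
         * 50% and 25% of the base rate (rate, rate >> 4, rate >> 1,
         * rate >> 2) and DRR quanta in the matching 16:1:8:4 ratio
         * (1024, 64, 512, 256), which keeps the unshaped DRR split in
         * cake_dequeue() consistent with the shaped thresholds.
         */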
2408
2409         return 0;
2410 }
2411
2412 static int cake_config_diffserv3(struct Qdisc *sch)
2413 {
2414 /*  Simplified Diffserv structure with 3 tins.
2415  *              Low Priority            (CS1)
2416  *              Best Effort
2417  *              Latency Sensitive       (TOS4, VA, EF, CS6, CS7)
2418  */
2419         struct cake_sched_data *q = qdisc_priv(sch);
2420         u32 mtu = psched_mtu(qdisc_dev(sch));
2421         u64 rate = q->rate_bps;
2422         u32 quantum = 1024;
2423
2424         q->tin_cnt = 3;
2425
2426         /* codepoint to class mapping */
2427         q->tin_index = diffserv3;
2428         q->tin_order = bulk_order;
2429
2430         /* class characteristics */
2431         cake_set_rate(&q->tins[0], rate, mtu,
2432                       us_to_ns(q->target), us_to_ns(q->interval));
2433         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2434                       us_to_ns(q->target), us_to_ns(q->interval));
2435         cake_set_rate(&q->tins[2], rate >> 2, mtu,
2436                       us_to_ns(q->target), us_to_ns(q->interval));
2437
2438         /* bandwidth-sharing weights */
2439         q->tins[0].tin_quantum = quantum;
2440         q->tins[1].tin_quantum = quantum >> 4;
2441         q->tins[2].tin_quantum = quantum >> 2;
2442
2443         return 0;
2444 }
2445
2446 static void cake_reconfigure(struct Qdisc *sch)
2447 {
2448         struct cake_sched_data *q = qdisc_priv(sch);
2449         int c, ft;
2450
2451         switch (q->tin_mode) {
2452         case CAKE_DIFFSERV_BESTEFFORT:
2453                 ft = cake_config_besteffort(sch);
2454                 break;
2455
2456         case CAKE_DIFFSERV_PRECEDENCE:
2457                 ft = cake_config_precedence(sch);
2458                 break;
2459
2460         case CAKE_DIFFSERV_DIFFSERV8:
2461                 ft = cake_config_diffserv8(sch);
2462                 break;
2463
2464         case CAKE_DIFFSERV_DIFFSERV4:
2465                 ft = cake_config_diffserv4(sch);
2466                 break;
2467
2468         case CAKE_DIFFSERV_DIFFSERV3:
2469         default:
2470                 ft = cake_config_diffserv3(sch);
2471                 break;
2472         }
2473
2474         for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2475                 cake_clear_tin(sch, c);
2476                 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2477         }
2478
2479         q->rate_ns   = q->tins[ft].tin_rate_ns;
2480         q->rate_shft = q->tins[ft].tin_rate_shft;
2481
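        /* Buffer sizing: without an explicit memory limit the buffer is set
         * to roughly four times the bandwidth-delay product,
         * rate_bps * interval_us / (USEC_PER_SEC / 4); e.g. 12,500,000 B/s at
         * the default 100 ms interval gives 5,000,000 bytes, with a 4 MiB
         * floor.  The min()/max() at the end of this function then caps the
         * result at max(sch->limit * MTU, the configured limit).
         */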
2482         if (q->buffer_config_limit) {
2483                 q->buffer_limit = q->buffer_config_limit;
2484         } else if (q->rate_bps) {
2485                 u64 t = q->rate_bps * q->interval;
2486
2487                 do_div(t, USEC_PER_SEC / 4);
2488                 q->buffer_limit = max_t(u32, t, 4U << 20);
2489         } else {
2490                 q->buffer_limit = ~0;
2491         }
2492
2493         sch->flags &= ~TCQ_F_CAN_BYPASS;
2494
2495         q->buffer_limit = min(q->buffer_limit,
2496                               max(sch->limit * psched_mtu(qdisc_dev(sch)),
2497                                   q->buffer_config_limit));
2498 }
2499
2500 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2501                        struct netlink_ext_ack *extack)
2502 {
2503         struct cake_sched_data *q = qdisc_priv(sch);
2504         struct nlattr *tb[TCA_CAKE_MAX + 1];
2505         int err;
2506
2507         if (!opt)
2508                 return -EINVAL;
2509
2510         err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2511                                           extack);
2512         if (err < 0)
2513                 return err;
2514
2515         if (tb[TCA_CAKE_NAT]) {
2516 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2517                 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2518                 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2519                         !!nla_get_u32(tb[TCA_CAKE_NAT]);
2520 #else
2521                 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2522                                     "No conntrack support in kernel");
2523                 return -EOPNOTSUPP;
2524 #endif
2525         }
2526
2527         if (tb[TCA_CAKE_BASE_RATE64])
2528                 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2529
2530         if (tb[TCA_CAKE_DIFFSERV_MODE])
2531                 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2532
2533         if (tb[TCA_CAKE_WASH]) {
2534                 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2535                         q->rate_flags |= CAKE_FLAG_WASH;
2536                 else
2537                         q->rate_flags &= ~CAKE_FLAG_WASH;
2538         }
2539
2540         if (tb[TCA_CAKE_FLOW_MODE])
2541                 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2542                                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2543                                         CAKE_FLOW_MASK));
2544
2545         if (tb[TCA_CAKE_ATM])
2546                 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2547
2548         if (tb[TCA_CAKE_OVERHEAD]) {
2549                 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2550                 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2551
2552                 q->max_netlen = 0;
2553                 q->max_adjlen = 0;
2554                 q->min_netlen = ~0;
2555                 q->min_adjlen = ~0;
2556         }
2557
2558         if (tb[TCA_CAKE_RAW]) {
2559                 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2560
2561                 q->max_netlen = 0;
2562                 q->max_adjlen = 0;
2563                 q->min_netlen = ~0;
2564                 q->min_adjlen = ~0;
2565         }
2566
2567         if (tb[TCA_CAKE_MPU])
2568                 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2569
2570         if (tb[TCA_CAKE_RTT]) {
2571                 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2572
2573                 if (!q->interval)
2574                         q->interval = 1;
2575         }
2576
2577         if (tb[TCA_CAKE_TARGET]) {
2578                 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2579
2580                 if (!q->target)
2581                         q->target = 1;
2582         }
2583
2584         if (tb[TCA_CAKE_AUTORATE]) {
2585                 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2586                         q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2587                 else
2588                         q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2589         }
2590
2591         if (tb[TCA_CAKE_INGRESS]) {
2592                 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2593                         q->rate_flags |= CAKE_FLAG_INGRESS;
2594                 else
2595                         q->rate_flags &= ~CAKE_FLAG_INGRESS;
2596         }
2597
2598         if (tb[TCA_CAKE_ACK_FILTER])
2599                 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2600
2601         if (tb[TCA_CAKE_MEMORY])
2602                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2603
2604         if (tb[TCA_CAKE_SPLIT_GSO]) {
2605                 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2606                         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2607                 else
2608                         q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2609         }
2610
2611         if (tb[TCA_CAKE_FWMARK]) {
2612                 q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2613                 q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2614         }
2615
2616         if (q->tins) {
2617                 sch_tree_lock(sch);
2618                 cake_reconfigure(sch);
2619                 sch_tree_unlock(sch);
2620         }
2621
2622         return 0;
2623 }
2624
2625 static void cake_destroy(struct Qdisc *sch)
2626 {
2627         struct cake_sched_data *q = qdisc_priv(sch);
2628
2629         qdisc_watchdog_cancel(&q->watchdog);
2630         tcf_block_put(q->block);
2631         kvfree(q->tins);
2632 }
2633
2634 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2635                      struct netlink_ext_ack *extack)
2636 {
2637         struct cake_sched_data *q = qdisc_priv(sch);
2638         int i, j, err;
2639
2640         sch->limit = 10240;
2641         q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2642         q->flow_mode  = CAKE_FLOW_TRIPLE;
2643
2644         q->rate_bps = 0; /* unlimited by default */
2645
2646         q->interval = 100000; /* 100ms default */
2647         q->target   =   5000; /* 5ms: codel RFC argues
2648                                * for 5 to 10% of interval
2649                                */
2650         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2651         q->cur_tin = 0;
2652         q->cur_flow  = 0;
2653
2654         qdisc_watchdog_init(&q->watchdog, sch);
2655
2656         if (opt) {
2657                 err = cake_change(sch, opt, extack);
2658
2659                 if (err)
2660                         return err;
2661         }
2662
2663         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2664         if (err)
2665                 return err;
2666
2667         quantum_div[0] = ~0;
2668         for (i = 1; i <= CAKE_QUEUES; i++)
2669                 quantum_div[i] = 65535 / i;
2670
2671         q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2672                            GFP_KERNEL);
2673         if (!q->tins)
2674                 goto nomem;
2675
2676         for (i = 0; i < CAKE_MAX_TINS; i++) {
2677                 struct cake_tin_data *b = q->tins + i;
2678
2679                 INIT_LIST_HEAD(&b->new_flows);
2680                 INIT_LIST_HEAD(&b->old_flows);
2681                 INIT_LIST_HEAD(&b->decaying_flows);
2682                 b->sparse_flow_count = 0;
2683                 b->bulk_flow_count = 0;
2684                 b->decaying_flow_count = 0;
2685
2686                 for (j = 0; j < CAKE_QUEUES; j++) {
2687                         struct cake_flow *flow = b->flows + j;
2688                         u32 k = j * CAKE_MAX_TINS + i;
2689
2690                         INIT_LIST_HEAD(&flow->flowchain);
2691                         cobalt_vars_init(&flow->cvars);
2692
2693                         q->overflow_heap[k].t = i;
2694                         q->overflow_heap[k].b = j;
2695                         b->overflow_idx[j] = k;
2696                 }
2697         }
2698
2699         cake_reconfigure(sch);
2700         q->avg_peak_bandwidth = q->rate_bps;
2701         q->min_netlen = ~0;
2702         q->min_adjlen = ~0;
2703         return 0;
2704
2705 nomem:
2706         cake_destroy(sch);
2707         return -ENOMEM;
2708 }
2709
2710 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2711 {
2712         struct cake_sched_data *q = qdisc_priv(sch);
2713         struct nlattr *opts;
2714
2715         opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2716         if (!opts)
2717                 goto nla_put_failure;
2718
2719         if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2720                               TCA_CAKE_PAD))
2721                 goto nla_put_failure;
2722
2723         if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2724                         q->flow_mode & CAKE_FLOW_MASK))
2725                 goto nla_put_failure;
2726
2727         if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2728                 goto nla_put_failure;
2729
2730         if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2731                 goto nla_put_failure;
2732
2733         if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2734                 goto nla_put_failure;
2735
2736         if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2737                         !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2738                 goto nla_put_failure;
2739
2740         if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2741                         !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2742                 goto nla_put_failure;
2743
2744         if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2745                 goto nla_put_failure;
2746
2747         if (nla_put_u32(skb, TCA_CAKE_NAT,
2748                         !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2749                 goto nla_put_failure;
2750
2751         if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2752                 goto nla_put_failure;
2753
2754         if (nla_put_u32(skb, TCA_CAKE_WASH,
2755                         !!(q->rate_flags & CAKE_FLAG_WASH)))
2756                 goto nla_put_failure;
2757
2758         if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2759                 goto nla_put_failure;
2760
2761         if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2762                 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2763                         goto nla_put_failure;
2764
2765         if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2766                 goto nla_put_failure;
2767
2768         if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2769                 goto nla_put_failure;
2770
2771         if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2772                         !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2773                 goto nla_put_failure;
2774
2775         if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2776                 goto nla_put_failure;
2777
2778         return nla_nest_end(skb, opts);
2779
2780 nla_put_failure:
2781         return -1;
2782 }
2783
2784 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2785 {
2786         struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2787         struct cake_sched_data *q = qdisc_priv(sch);
2788         struct nlattr *tstats, *ts;
2789         int i;
2790
2791         if (!stats)
2792                 return -1;
2793
2794 #define PUT_STAT_U32(attr, data) do {                                  \
2795                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2796                         goto nla_put_failure;                          \
2797         } while (0)
2798 #define PUT_STAT_U64(attr, data) do {                                  \
2799                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2800                                         data, TCA_CAKE_STATS_PAD)) \
2801                         goto nla_put_failure;                          \
2802         } while (0)
2803
2804         PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2805         PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2806         PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2807         PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2808         PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2809         PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2810         PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2811         PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2812
2813 #undef PUT_STAT_U32
2814 #undef PUT_STAT_U64
2815
2816         tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2817         if (!tstats)
2818                 goto nla_put_failure;
2819
2820 #define PUT_TSTAT_U32(attr, data) do {                                  \
2821                 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2822                         goto nla_put_failure;                           \
2823         } while (0)
2824 #define PUT_TSTAT_U64(attr, data) do {                                  \
2825                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2826                                         data, TCA_CAKE_TIN_STATS_PAD))  \
2827                         goto nla_put_failure;                           \
2828         } while (0)
2829
2830         for (i = 0; i < q->tin_cnt; i++) {
2831                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2832
2833                 ts = nla_nest_start_noflag(d->skb, i + 1);
2834                 if (!ts)
2835                         goto nla_put_failure;
2836
2837                 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2838                 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2839                 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2840
2841                 PUT_TSTAT_U32(TARGET_US,
2842                               ktime_to_us(ns_to_ktime(b->cparams.target)));
2843                 PUT_TSTAT_U32(INTERVAL_US,
2844                               ktime_to_us(ns_to_ktime(b->cparams.interval)));
2845
2846                 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2847                 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2848                 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2849                 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2850
2851                 PUT_TSTAT_U32(PEAK_DELAY_US,
2852                               ktime_to_us(ns_to_ktime(b->peak_delay)));
2853                 PUT_TSTAT_U32(AVG_DELAY_US,
2854                               ktime_to_us(ns_to_ktime(b->avge_delay)));
2855                 PUT_TSTAT_U32(BASE_DELAY_US,
2856                               ktime_to_us(ns_to_ktime(b->base_delay)));
2857
2858                 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2859                 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2860                 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2861
2862                 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2863                                             b->decaying_flow_count);
2864                 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2865                 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2866                 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2867
2868                 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2869                 nla_nest_end(d->skb, ts);
2870         }
2871
2872 #undef PUT_TSTAT_U32
2873 #undef PUT_TSTAT_U64
2874
2875         nla_nest_end(d->skb, tstats);
2876         return nla_nest_end(d->skb, stats);
2877
2878 nla_put_failure:
2879         nla_nest_cancel(d->skb, stats);
2880         return -1;
2881 }
2882
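/* CAKE does not support configurable child classes or inner qdiscs, so the
 * class operations below are largely stubs.  They exist so that tc filters
 * can be attached and so that the individual flow queues can be inspected
 * as read-only pseudo-classes.
 */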
2883 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2884 {
2885         return NULL;
2886 }
2887
2888 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2889 {
2890         return 0;
2891 }
2892
2893 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2894                                u32 classid)
2895 {
2896         return 0;
2897 }
2898
2899 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2900 {
2901 }
2902
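/* Filters can only be attached at the root; per-flow pseudo-classes do not
 * carry their own filter blocks.
 */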
2903 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2904                                         struct netlink_ext_ack *extack)
2905 {
2906         struct cake_sched_data *q = qdisc_priv(sch);
2907
2908         if (cl)
2909                 return NULL;
2910         return q->block;
2911 }
2912
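/* A class dump only reports the minor handle; everything interesting about
 * a flow queue is exported via cake_dump_class_stats() below.
 */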
2913 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2914                            struct sk_buff *skb, struct tcmsg *tcm)
2915 {
2916         tcm->tcm_handle |= TC_H_MIN(cl);
2917         return 0;
2918 }
2919
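/* Export the state of a single flow queue: its current queue length,
 * backlog and drop count, plus the DRR deficit and COBALT variables
 * (drop state, count, BLUE drop probability and the associated timers).
 */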
2920 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2921                                  struct gnet_dump *d)
2922 {
2923         struct cake_sched_data *q = qdisc_priv(sch);
2924         const struct cake_flow *flow = NULL;
2925         struct gnet_stats_queue qs = { 0 };
2926         struct nlattr *stats;
2927         u32 idx = cl - 1;
2928
2929         if (idx < CAKE_QUEUES * q->tin_cnt) {
2930                 const struct cake_tin_data *b =
2931                         &q->tins[q->tin_order[idx / CAKE_QUEUES]];
2932                 const struct sk_buff *skb;
2933
2934                 flow = &b->flows[idx % CAKE_QUEUES];
2935
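                /* Count queued packets under the qdisc tree lock so the
                 * snapshot is consistent with concurrent enqueue/dequeue.
                 */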
2936                 if (flow->head) {
2937                         sch_tree_lock(sch);
2938                         skb = flow->head;
2939                         while (skb) {
2940                                 qs.qlen++;
2941                                 skb = skb->next;
2942                         }
2943                         sch_tree_unlock(sch);
2944                 }
2945                 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2946                 qs.drops = flow->dropped;
2947         }
2948         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2949                 return -1;
2950         if (flow) {
2951                 ktime_t now = ktime_get();
2952
2953                 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2954                 if (!stats)
2955                         return -1;
2956
2957 #define PUT_STAT_U32(attr, data) do {                                  \
2958                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2959                         goto nla_put_failure;                          \
2960         } while (0)
2961 #define PUT_STAT_S32(attr, data) do {                                  \
2962                 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2963                         goto nla_put_failure;                          \
2964         } while (0)
2965
2966                 PUT_STAT_S32(DEFICIT, flow->deficit);
2967                 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2968                 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2969                 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2970                 if (flow->cvars.p_drop) {
2971                         PUT_STAT_S32(BLUE_TIMER_US,
2972                                      ktime_to_us(
2973                                              ktime_sub(now,
2974                                                      flow->cvars.blue_timer)));
2975                 }
2976                 if (flow->cvars.dropping) {
2977                         PUT_STAT_S32(DROP_NEXT_US,
2978                                      ktime_to_us(
2979                                              ktime_sub(now,
2980                                                        flow->cvars.drop_next)));
2981                 }
2982
2983                 if (nla_nest_end(d->skb, stats) < 0)
2984                         return -1;
2985         }
2986
2987         return 0;
2988
2989 nla_put_failure:
2990         nla_nest_cancel(d->skb, stats);
2991         return -1;
2992 }
2993
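/* Enumerate the flow queues as classes.  Flows that are not on any
 * flowchain are skipped (though still counted), and class IDs are encoded
 * as tin * CAKE_QUEUES + flow + 1 so that zero remains "no class".
 */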
2994 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2995 {
2996         struct cake_sched_data *q = qdisc_priv(sch);
2997         unsigned int i, j;
2998
2999         if (arg->stop)
3000                 return;
3001
3002         for (i = 0; i < q->tin_cnt; i++) {
3003                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3004
3005                 for (j = 0; j < CAKE_QUEUES; j++) {
3006                         if (list_empty(&b->flows[j].flowchain) ||
3007                             arg->count < arg->skip) {
3008                                 arg->count++;
3009                                 continue;
3010                         }
3011                         if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3012                                 arg->stop = 1;
3013                                 break;
3014                         }
3015                         arg->count++;
3016                 }
3017         }
3018 }
3019
3020 static const struct Qdisc_class_ops cake_class_ops = {
3021         .leaf           =       cake_leaf,
3022         .find           =       cake_find,
3023         .tcf_block      =       cake_tcf_block,
3024         .bind_tcf       =       cake_bind,
3025         .unbind_tcf     =       cake_unbind,
3026         .dump           =       cake_dump_class,
3027         .dump_stats     =       cake_dump_class_stats,
3028         .walk           =       cake_walk,
3029 };
3030
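/* Qdisc operations; the enqueue/dequeue, configuration and dump hooks are
 * defined earlier in this file.
 */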
3031 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3032         .cl_ops         =       &cake_class_ops,
3033         .id             =       "cake",
3034         .priv_size      =       sizeof(struct cake_sched_data),
3035         .enqueue        =       cake_enqueue,
3036         .dequeue        =       cake_dequeue,
3037         .peek           =       qdisc_peek_dequeued,
3038         .init           =       cake_init,
3039         .reset          =       cake_reset,
3040         .destroy        =       cake_destroy,
3041         .change         =       cake_change,
3042         .dump           =       cake_dump,
3043         .dump_stats     =       cake_dump_stats,
3044         .owner          =       THIS_MODULE,
3045 };
3046
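/* Module plumbing: register the qdisc ops on load and unregister them on
 * unload.
 */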
3047 static int __init cake_module_init(void)
3048 {
3049         return register_qdisc(&cake_qdisc_ops);
3050 }
3051
3052 static void __exit cake_module_exit(void)
3053 {
3054         unregister_qdisc(&cake_qdisc_ops);
3055 }
3056
3057 module_init(cake_module_init)
3058 module_exit(cake_module_exit)
3059 MODULE_AUTHOR("Jonathan Morton");
3060 MODULE_LICENSE("Dual BSD/GPL");
3061 MODULE_DESCRIPTION("The CAKE shaper.");