net/sched/sch_cake.c
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *                 (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (e.g. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating on.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
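/* Example use (options parsed by the iproute2 'tc' cake support; the exact
 * option set depends on the iproute2 version):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * configures a 20 Mbit/s shaper with four Diffserv tins and NAT-aware flow
 * hashing.  Without a bandwidth the shaper stays in unlimited mode and only
 * the AQM and flow isolation are active.
 */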
52
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:   codel initial drop interval
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91         u64     interval;
92         u64     target;
93         u64     mtu_time;
94         u32     p_inc;
95         u32     p_dec;
96 };
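/* The defaults mirror Codel: roughly a 5 ms target and a 100 ms interval.
 * As the header comment notes, these are auto-tuned from the configured
 * bandwidth, so at low rates the target is scaled up towards the
 * serialisation delay of an MTU-sized packet (tracked in @mtu_time).
 */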
97
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:              codel dropping frequency
100  * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
101  * @drop_next:          time to drop next packet, or when we dropped last
102  * @blue_timer:         Blue time to next drop
103  * @p_drop:             BLUE drop probability (0.32 fxp)
104  * @dropping:           set if in dropping state
105  * @ecn_marked:         set if marked
106  */
107 struct cobalt_vars {
108         u32     count;
109         u32     rec_inv_sqrt;
110         ktime_t drop_next;
111         ktime_t blue_timer;
112         u32     p_drop;
113         bool    dropping;
114         bool    ecn_marked;
115 };
116
117 enum {
118         CAKE_SET_NONE = 0,
119         CAKE_SET_SPARSE,
120         CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121         CAKE_SET_BULK,
122         CAKE_SET_DECAYING
123 };
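/* Rough lifecycle (see the enqueue/dequeue paths later in this file): a new
 * flow starts as SPARSE on the new_flows list, becomes BULK on old_flows
 * once its initial deficit is spent, and is parked as DECAYING when its
 * queue drains so that its COBALT state and host counts can age out before
 * the slot is reused.
 */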
124
125 struct cake_flow {
126         /* this stuff is all needed per-flow at dequeue time */
127         struct sk_buff    *head;
128         struct sk_buff    *tail;
129         struct list_head  flowchain;
130         s32               deficit;
131         u32               dropped;
132         struct cobalt_vars cvars;
133         u16               srchost; /* index into cake_host table */
134         u16               dsthost;
135         u8                set;
136 }; /* please try to keep this structure <= 64 bytes */
137
138 struct cake_host {
139         u32 srchost_tag;
140         u32 dsthost_tag;
141         u16 srchost_bulk_flow_count;
142         u16 dsthost_bulk_flow_count;
143 };
144
145 struct cake_heap_entry {
146         u16 t:3, b:10;
147 };
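/* t:3 indexes one of up to CAKE_MAX_TINS (8) tins and b:10 one of
 * CAKE_QUEUES (1024) flows within it, so each heap entry packs a
 * (tin, flow) coordinate into 13 bits of a u16.
 */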
148
149 struct cake_tin_data {
150         struct cake_flow flows[CAKE_QUEUES];
151         u32     backlogs[CAKE_QUEUES];
152         u32     tags[CAKE_QUEUES]; /* for set association */
153         u16     overflow_idx[CAKE_QUEUES];
154         struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155         u16     flow_quantum;
156
157         struct cobalt_params cparams;
158         u32     drop_overlimit;
159         u16     bulk_flow_count;
160         u16     sparse_flow_count;
161         u16     decaying_flow_count;
162         u16     unresponsive_flow_count;
163
164         u32     max_skblen;
165
166         struct list_head new_flows;
167         struct list_head old_flows;
168         struct list_head decaying_flows;
169
170         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171         ktime_t time_next_packet;
172         u64     tin_rate_ns;
173         u64     tin_rate_bps;
174         u16     tin_rate_shft;
175
176         u16     tin_quantum;
177         s32     tin_deficit;
178         u32     tin_backlog;
179         u32     tin_dropped;
180         u32     tin_ecn_mark;
181
182         u32     packets;
183         u64     bytes;
184
185         u32     ack_drops;
186
187         /* moving averages */
188         u64 avge_delay;
189         u64 peak_delay;
190         u64 base_delay;
191
192         /* hash function stats */
193         u32     way_directs;
194         u32     way_hits;
195         u32     way_misses;
196         u32     way_collisions;
197 }; /* number of tins is small, so size of this struct doesn't matter much */
198
199 struct cake_sched_data {
200         struct tcf_proto __rcu *filter_list; /* optional external classifier */
201         struct tcf_block *block;
202         struct cake_tin_data *tins;
203
204         struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
205         u16             overflow_timeout;
206
207         u16             tin_cnt;
208         u8              tin_mode;
209         u8              flow_mode;
210         u8              ack_filter;
211         u8              atm_mode;
212
213         u32             fwmark_mask;
214         u16             fwmark_shft;
215
216         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
217         u16             rate_shft;
218         ktime_t         time_next_packet;
219         ktime_t         failsafe_next_packet;
220         u64             rate_ns;
221         u64             rate_bps;
222         u16             rate_flags;
223         s16             rate_overhead;
224         u16             rate_mpu;
225         u64             interval;
226         u64             target;
227
228         /* resource tracking */
229         u32             buffer_used;
230         u32             buffer_max_used;
231         u32             buffer_limit;
232         u32             buffer_config_limit;
233
234         /* indices for dequeue */
235         u16             cur_tin;
236         u16             cur_flow;
237
238         struct qdisc_watchdog watchdog;
239         const u8        *tin_index;
240         const u8        *tin_order;
241
242         /* bandwidth capacity estimate */
243         ktime_t         last_packet_time;
244         ktime_t         avg_window_begin;
245         u64             avg_packet_interval;
246         u64             avg_window_bytes;
247         u64             avg_peak_bandwidth;
248         ktime_t         last_reconfig_time;
249
250         /* packet length stats */
251         u32             avg_netoff;
252         u16             max_netlen;
253         u16             max_adjlen;
254         u16             min_netlen;
255         u16             min_adjlen;
256 };
257
258 enum {
259         CAKE_FLAG_OVERHEAD         = BIT(0),
260         CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
261         CAKE_FLAG_INGRESS          = BIT(2),
262         CAKE_FLAG_WASH             = BIT(3),
263         CAKE_FLAG_SPLIT_GSO        = BIT(4)
264 };
265
266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
267  * obtain the best features of each.  Codel is excellent on flows which
268  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
269  * unresponsive flows.
270  */
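/* Decision sketch, as implemented in cobalt_should_drop() below: while the
 * packet sojourn time stays above target, Codel marks or drops on its
 * 1/sqrt(count) schedule; independently, BLUE drops with probability p_drop,
 * which only increases on actual queue overflow (cobalt_queue_full) and
 * decays once the queue runs empty (cobalt_queue_empty).
 */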
271
272 struct cobalt_skb_cb {
273         ktime_t enqueue_time;
274         u32     adjusted_len;
275 };
276
277 static u64 us_to_ns(u64 us)
278 {
279         return us * NSEC_PER_USEC;
280 }
281
282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
283 {
284         qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
285         return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
286 }
287
288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
289 {
290         return get_cobalt_cb(skb)->enqueue_time;
291 }
292
293 static void cobalt_set_enqueue_time(struct sk_buff *skb,
294                                     ktime_t now)
295 {
296         get_cobalt_cb(skb)->enqueue_time = now;
297 }
298
299 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
300
301 /* Diffserv lookup tables */
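/* Each table below has 64 entries indexed by DSCP (the upper six bits of the
 * IPv4 TOS / IPv6 Traffic Class byte); the stored value is the tin that
 * codepoint maps to.  diffserv3/4/8 condense the codepoint space into 3, 4
 * or 8 tins respectively, while besteffort maps everything to tin 0.
 */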
302
303 static const u8 precedence[] = {
304         0, 0, 0, 0, 0, 0, 0, 0,
305         1, 1, 1, 1, 1, 1, 1, 1,
306         2, 2, 2, 2, 2, 2, 2, 2,
307         3, 3, 3, 3, 3, 3, 3, 3,
308         4, 4, 4, 4, 4, 4, 4, 4,
309         5, 5, 5, 5, 5, 5, 5, 5,
310         6, 6, 6, 6, 6, 6, 6, 6,
311         7, 7, 7, 7, 7, 7, 7, 7,
312 };
313
314 static const u8 diffserv8[] = {
315         2, 0, 1, 2, 4, 2, 2, 2,
316         1, 2, 1, 2, 1, 2, 1, 2,
317         5, 2, 4, 2, 4, 2, 4, 2,
318         3, 2, 3, 2, 3, 2, 3, 2,
319         6, 2, 3, 2, 3, 2, 3, 2,
320         6, 2, 2, 2, 6, 2, 6, 2,
321         7, 2, 2, 2, 2, 2, 2, 2,
322         7, 2, 2, 2, 2, 2, 2, 2,
323 };
324
325 static const u8 diffserv4[] = {
326         0, 1, 0, 0, 2, 0, 0, 0,
327         1, 0, 0, 0, 0, 0, 0, 0,
328         2, 0, 2, 0, 2, 0, 2, 0,
329         2, 0, 2, 0, 2, 0, 2, 0,
330         3, 0, 2, 0, 2, 0, 2, 0,
331         3, 0, 0, 0, 3, 0, 3, 0,
332         3, 0, 0, 0, 0, 0, 0, 0,
333         3, 0, 0, 0, 0, 0, 0, 0,
334 };
335
336 static const u8 diffserv3[] = {
337         0, 1, 0, 0, 2, 0, 0, 0,
338         1, 0, 0, 0, 0, 0, 0, 0,
339         0, 0, 0, 0, 0, 0, 0, 0,
340         0, 0, 0, 0, 0, 0, 0, 0,
341         0, 0, 0, 0, 0, 0, 0, 0,
342         0, 0, 0, 0, 2, 0, 2, 0,
343         2, 0, 0, 0, 0, 0, 0, 0,
344         2, 0, 0, 0, 0, 0, 0, 0,
345 };
346
347 static const u8 besteffort[] = {
348         0, 0, 0, 0, 0, 0, 0, 0,
349         0, 0, 0, 0, 0, 0, 0, 0,
350         0, 0, 0, 0, 0, 0, 0, 0,
351         0, 0, 0, 0, 0, 0, 0, 0,
352         0, 0, 0, 0, 0, 0, 0, 0,
353         0, 0, 0, 0, 0, 0, 0, 0,
354         0, 0, 0, 0, 0, 0, 0, 0,
355         0, 0, 0, 0, 0, 0, 0, 0,
356 };
357
358 /* tin priority order for stats dumping */
359
360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
361 static const u8 bulk_order[] = {1, 0, 2, 3};
362
363 #define REC_INV_SQRT_CACHE (16)
364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
365
366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
367  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
368  *
369  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
370  */
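/* Worked check: for count = 4 the true value is 1/sqrt(4) = 0.5, i.e.
 * 0x80000000 in Q0.32.  Feeding that back in gives 3 - 4 * 0.25 = 2, and
 * (0.5 / 2) * 2 = 0.5 again, so the exact answer is a fixed point of the
 * iteration; starting nearby, it converges quadratically.
 */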
371
372 static void cobalt_newton_step(struct cobalt_vars *vars)
373 {
374         u32 invsqrt, invsqrt2;
375         u64 val;
376
377         invsqrt = vars->rec_inv_sqrt;
378         invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
379         val = (3LL << 32) - ((u64)vars->count * invsqrt2);
380
381         val >>= 2; /* avoid overflow in following multiply */
382         val = (val * invsqrt) >> (32 - 2 + 1);
383
384         vars->rec_inv_sqrt = val;
385 }
386
387 static void cobalt_invsqrt(struct cobalt_vars *vars)
388 {
389         if (vars->count < REC_INV_SQRT_CACHE)
390                 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
391         else
392                 cobalt_newton_step(vars);
393 }
394
395 /* There is a big difference in timing between the accurate values placed in
396  * the cache and the approximations given by a single Newton step for small
397  * count values, particularly when stepping from count 1 to 2 or vice versa.
398  * Above 16, a single Newton step gives sufficient accuracy in either
399  * direction, given the precision stored.
400  *
401  * The magnitude of the error when stepping up to count 2 is large enough to
402  * yield the value that *should* have been produced at count 4.
403  */
404
405 static void cobalt_cache_init(void)
406 {
407         struct cobalt_vars v;
408
409         memset(&v, 0, sizeof(v));
410         v.rec_inv_sqrt = ~0U;
411         cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
412
413         for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
414                 cobalt_newton_step(&v);
415                 cobalt_newton_step(&v);
416                 cobalt_newton_step(&v);
417                 cobalt_newton_step(&v);
418
419                 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
420         }
421 }
422
423 static void cobalt_vars_init(struct cobalt_vars *vars)
424 {
425         memset(vars, 0, sizeof(*vars));
426
427         if (!cobalt_rec_inv_sqrt_cache[0]) {
428                 cobalt_cache_init();
429                 cobalt_rec_inv_sqrt_cache[0] = ~0;
430         }
431 }
432
433 /* CoDel control_law is t + interval/sqrt(count)
434  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
435  * both sqrt() and divide operation.
436  */
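/* For example, with interval = 100 ms and count = 4, rec_inv_sqrt is about
 * 0x80000000 (0.5 in Q0.32), so reciprocal_scale(interval, rec_inv_sqrt)
 * yields 100 ms * 0.5 = 50 ms, i.e. interval/sqrt(count) after t.
 */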
437 static ktime_t cobalt_control(ktime_t t,
438                               u64 interval,
439                               u32 rec_inv_sqrt)
440 {
441         return ktime_add_ns(t, reciprocal_scale(interval,
442                                                 rec_inv_sqrt));
443 }
444
445 /* Call this when a packet had to be dropped due to queue overflow.  Returns
446  * true if the BLUE state was quiescent before but active after this call.
447  */
448 static bool cobalt_queue_full(struct cobalt_vars *vars,
449                               struct cobalt_params *p,
450                               ktime_t now)
451 {
452         bool up = false;
453
454         if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
455                 up = !vars->p_drop;
456                 vars->p_drop += p->p_inc;
457                 if (vars->p_drop < p->p_inc)
458                         vars->p_drop = ~0;
459                 vars->blue_timer = now;
460         }
461         vars->dropping = true;
462         vars->drop_next = now;
463         if (!vars->count)
464                 vars->count = 1;
465
466         return up;
467 }
468
469 /* Call this when the queue was serviced but turned out to be empty.  Returns
470  * true if the BLUE state was active before but quiescent after this call.
471  */
472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
473                                struct cobalt_params *p,
474                                ktime_t now)
475 {
476         bool down = false;
477
478         if (vars->p_drop &&
479             ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
480                 if (vars->p_drop < p->p_dec)
481                         vars->p_drop = 0;
482                 else
483                         vars->p_drop -= p->p_dec;
484                 vars->blue_timer = now;
485                 down = !vars->p_drop;
486         }
487         vars->dropping = false;
488
489         if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
490                 vars->count--;
491                 cobalt_invsqrt(vars);
492                 vars->drop_next = cobalt_control(vars->drop_next,
493                                                  p->interval,
494                                                  vars->rec_inv_sqrt);
495         }
496
497         return down;
498 }
499
500 /* Call this with a freshly dequeued packet for possible congestion marking.
501  * Returns true as an instruction to drop the packet, false for delivery.
502  */
503 static bool cobalt_should_drop(struct cobalt_vars *vars,
504                                struct cobalt_params *p,
505                                ktime_t now,
506                                struct sk_buff *skb,
507                                u32 bulk_flows)
508 {
509         bool next_due, over_target, drop = false;
510         ktime_t schedule;
511         u64 sojourn;
512
513 /* The 'schedule' variable records, in its sign, whether 'now' is before or
514  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
515  * scheduling decision is actually branched, without destroying that
516  * information.  Similarly, the first 'schedule' value calculated is preserved
517  * in the boolean 'next_due'.
518  *
519  * As for 'drop_next', we take advantage of the fact that 'interval' is both
520  * the delay between first exceeding 'target' and the first signalling event,
521  * *and* the scaling factor for the signalling frequency.  It's therefore very
522  * natural to use a single mechanism for both purposes, and eliminates a
523  * significant amount of reference Codel's spaghetti code.  To help with this,
524  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
525  * as possible to 1.0 in fixed-point.
526  */
527
528         sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
529         schedule = ktime_sub(now, vars->drop_next);
530         over_target = sojourn > p->target &&
531                       sojourn > p->mtu_time * bulk_flows * 2 &&
532                       sojourn > p->mtu_time * 4;
533         next_due = vars->count && ktime_to_ns(schedule) >= 0;
534
535         vars->ecn_marked = false;
536
537         if (over_target) {
538                 if (!vars->dropping) {
539                         vars->dropping = true;
540                         vars->drop_next = cobalt_control(now,
541                                                          p->interval,
542                                                          vars->rec_inv_sqrt);
543                 }
544                 if (!vars->count)
545                         vars->count = 1;
546         } else if (vars->dropping) {
547                 vars->dropping = false;
548         }
549
550         if (next_due && vars->dropping) {
551                 /* Use ECN mark if possible, otherwise drop */
552                 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
553
554                 vars->count++;
555                 if (!vars->count)
556                         vars->count--;
557                 cobalt_invsqrt(vars);
558                 vars->drop_next = cobalt_control(vars->drop_next,
559                                                  p->interval,
560                                                  vars->rec_inv_sqrt);
561                 schedule = ktime_sub(now, vars->drop_next);
562         } else {
563                 while (next_due) {
564                         vars->count--;
565                         cobalt_invsqrt(vars);
566                         vars->drop_next = cobalt_control(vars->drop_next,
567                                                          p->interval,
568                                                          vars->rec_inv_sqrt);
569                         schedule = ktime_sub(now, vars->drop_next);
570                         next_due = vars->count && ktime_to_ns(schedule) >= 0;
571                 }
572         }
573
574         /* Simple BLUE implementation.  Lack of ECN is deliberate. */
575         if (vars->p_drop)
576                 drop |= (prandom_u32() < vars->p_drop);
577
578         /* Overload the drop_next field as an activity timeout */
579         if (!vars->count)
580                 vars->drop_next = ktime_add_ns(now, p->interval);
581         else if (ktime_to_ns(schedule) > 0 && !drop)
582                 vars->drop_next = now;
583
584         return drop;
585 }
586
587 static bool cake_update_flowkeys(struct flow_keys *keys,
588                                  const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591         struct nf_conntrack_tuple tuple = {};
592         bool rev = !skb->_nfct, upd = false;
593         __be32 ip;
594
595         if (skb_protocol(skb, true) != htons(ETH_P_IP))
596                 return false;
597
598         if (!nf_ct_get_tuple_skb(&tuple, skb))
599                 return false;
600
601         ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
602         if (ip != keys->addrs.v4addrs.src) {
603                 keys->addrs.v4addrs.src = ip;
604                 upd = true;
605         }
606         ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
607         if (ip != keys->addrs.v4addrs.dst) {
608                 keys->addrs.v4addrs.dst = ip;
609                 upd = true;
610         }
611
612         if (keys->ports.ports) {
613                 __be16 port;
614
615                 port = rev ? tuple.dst.u.all : tuple.src.u.all;
616                 if (port != keys->ports.src) {
617                         keys->ports.src = port;
618                         upd = true;
619                 }
620                 port = rev ? tuple.src.u.all : tuple.dst.u.all;
621                 if (port != keys->ports.dst) {
622                         keys->ports.dst = port;
623                         upd = true;
624                 }
625         }
626         return upd;
627 #else
628         return false;
629 #endif
630 }
631
632 /* CAKE's dual flow modes are encoded as multi-bit values, so the checks
633  *  below also match triple-isolate mode, which sets both constituent bits.
634  */
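/* The uapi values (include/uapi/linux/pkt_sched.h) are CAKE_FLOW_SRC_IP = 1,
 * CAKE_FLOW_DST_IP = 2 and CAKE_FLOW_FLOWS = 4, with CAKE_FLOW_DUAL_SRC = 5,
 * CAKE_FLOW_DUAL_DST = 6 and CAKE_FLOW_TRIPLE = 7 formed by OR-ing them;
 * hence the mask-and-compare below is also true in triple-isolate mode.
 */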
635
636 static bool cake_dsrc(int flow_mode)
637 {
638         return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
639 }
640
641 static bool cake_ddst(int flow_mode)
642 {
643         return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
644 }
645
646 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
647                      int flow_mode, u16 flow_override, u16 host_override)
648 {
649         bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
650         bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
651         bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
652         u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
653         u16 reduced_hash, srchost_idx, dsthost_idx;
654         struct flow_keys keys, host_keys;
655         bool use_skbhash = skb->l4_hash;
656
657         if (unlikely(flow_mode == CAKE_FLOW_NONE))
658                 return 0;
659
660         /* If both overrides are set, or we can use the SKB hash and nat mode is
661          * disabled, we can skip packet dissection entirely. If nat mode is
662          * enabled there's another check below after doing the conntrack lookup.
663          */
664         if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
665                 goto skip_hash;
666
667         skb_flow_dissect_flow_keys(skb, &keys,
668                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
669
670         /* Don't use the SKB hash if we change the lookup keys from conntrack */
671         if (nat_enabled && cake_update_flowkeys(&keys, skb))
672                 use_skbhash = false;
673
674         /* If we can still use the SKB hash and don't need the host hash, we can
675          * skip the rest of the hashing procedure
676          */
677         if (use_skbhash && !hash_hosts)
678                 goto skip_hash;
679
680         /* flow_hash_from_keys() sorts the addresses by value, so we have
681          * to preserve their order in a separate data structure to treat
682          * src and dst host addresses as independently selectable.
683          */
684         host_keys = keys;
685         host_keys.ports.ports     = 0;
686         host_keys.basic.ip_proto  = 0;
687         host_keys.keyid.keyid     = 0;
688         host_keys.tags.flow_label = 0;
689
690         switch (host_keys.control.addr_type) {
691         case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
692                 host_keys.addrs.v4addrs.src = 0;
693                 dsthost_hash = flow_hash_from_keys(&host_keys);
694                 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
695                 host_keys.addrs.v4addrs.dst = 0;
696                 srchost_hash = flow_hash_from_keys(&host_keys);
697                 break;
698
699         case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
700                 memset(&host_keys.addrs.v6addrs.src, 0,
701                        sizeof(host_keys.addrs.v6addrs.src));
702                 dsthost_hash = flow_hash_from_keys(&host_keys);
703                 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
704                 memset(&host_keys.addrs.v6addrs.dst, 0,
705                        sizeof(host_keys.addrs.v6addrs.dst));
706                 srchost_hash = flow_hash_from_keys(&host_keys);
707                 break;
708
709         default:
710                 dsthost_hash = 0;
711                 srchost_hash = 0;
712         }
713
714         /* This *must* be after the above switch, since as a
715          * side-effect it sorts the src and dst addresses.
716          */
717         if (hash_flows && !use_skbhash)
718                 flow_hash = flow_hash_from_keys(&keys);
719
720 skip_hash:
721         if (flow_override)
722                 flow_hash = flow_override - 1;
723         else if (use_skbhash)
724                 flow_hash = skb->hash;
725         if (host_override) {
726                 dsthost_hash = host_override - 1;
727                 srchost_hash = host_override - 1;
728         }
729
730         if (!(flow_mode & CAKE_FLOW_FLOWS)) {
731                 if (flow_mode & CAKE_FLOW_SRC_IP)
732                         flow_hash ^= srchost_hash;
733
734                 if (flow_mode & CAKE_FLOW_DST_IP)
735                         flow_hash ^= dsthost_hash;
736         }
737
738         reduced_hash = flow_hash % CAKE_QUEUES;
739
740         /* set-associative hashing */
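        /* The 1024 queues form 128 sets of CAKE_SET_WAYS (8) consecutive
         * entries; a colliding flow may occupy any way within its set, so a
         * lookup probes at most eight tags while collisions stay far rarer
         * than with a direct-mapped table.
         */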
741         /* fast path if no hash collision (direct lookup succeeds) */
742         if (likely(q->tags[reduced_hash] == flow_hash &&
743                    q->flows[reduced_hash].set)) {
744                 q->way_directs++;
745         } else {
746                 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
747                 u32 outer_hash = reduced_hash - inner_hash;
748                 bool allocate_src = false;
749                 bool allocate_dst = false;
750                 u32 i, k;
751
752                 /* check if any active queue in the set is reserved for
753                  * this flow.
754                  */
755                 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
756                      i++, k = (k + 1) % CAKE_SET_WAYS) {
757                         if (q->tags[outer_hash + k] == flow_hash) {
758                                 if (i)
759                                         q->way_hits++;
760
761                                 if (!q->flows[outer_hash + k].set) {
762                                         /* need to increment host refcnts */
763                                         allocate_src = cake_dsrc(flow_mode);
764                                         allocate_dst = cake_ddst(flow_mode);
765                                 }
766
767                                 goto found;
768                         }
769                 }
770
771                 /* no queue is reserved for this flow, look for an
772                  * empty one.
773                  */
774                 for (i = 0; i < CAKE_SET_WAYS;
775                          i++, k = (k + 1) % CAKE_SET_WAYS) {
776                         if (!q->flows[outer_hash + k].set) {
777                                 q->way_misses++;
778                                 allocate_src = cake_dsrc(flow_mode);
779                                 allocate_dst = cake_ddst(flow_mode);
780                                 goto found;
781                         }
782                 }
783
784                 /* With no empty queues, default to the original
785                  * queue, accept the collision, update the host tags.
786                  */
787                 q->way_collisions++;
788                 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
789                         q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
790                         q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
791                 }
792                 allocate_src = cake_dsrc(flow_mode);
793                 allocate_dst = cake_ddst(flow_mode);
794 found:
795                 /* reserve queue for future packets in same flow */
796                 reduced_hash = outer_hash + k;
797                 q->tags[reduced_hash] = flow_hash;
798
799                 if (allocate_src) {
800                         srchost_idx = srchost_hash % CAKE_QUEUES;
801                         inner_hash = srchost_idx % CAKE_SET_WAYS;
802                         outer_hash = srchost_idx - inner_hash;
803                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
804                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
805                                 if (q->hosts[outer_hash + k].srchost_tag ==
806                                     srchost_hash)
807                                         goto found_src;
808                         }
809                         for (i = 0; i < CAKE_SET_WAYS;
810                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
811                                 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
812                                         break;
813                         }
814                         q->hosts[outer_hash + k].srchost_tag = srchost_hash;
815 found_src:
816                         srchost_idx = outer_hash + k;
817                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
818                                 q->hosts[srchost_idx].srchost_bulk_flow_count++;
819                         q->flows[reduced_hash].srchost = srchost_idx;
820                 }
821
822                 if (allocate_dst) {
823                         dsthost_idx = dsthost_hash % CAKE_QUEUES;
824                         inner_hash = dsthost_idx % CAKE_SET_WAYS;
825                         outer_hash = dsthost_idx - inner_hash;
826                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
827                              i++, k = (k + 1) % CAKE_SET_WAYS) {
828                                 if (q->hosts[outer_hash + k].dsthost_tag ==
829                                     dsthost_hash)
830                                         goto found_dst;
831                         }
832                         for (i = 0; i < CAKE_SET_WAYS;
833                              i++, k = (k + 1) % CAKE_SET_WAYS) {
834                                 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
835                                         break;
836                         }
837                         q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
838 found_dst:
839                         dsthost_idx = outer_hash + k;
840                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
841                                 q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
842                         q->flows[reduced_hash].dsthost = dsthost_idx;
843                 }
844         }
845
846         return reduced_hash;
847 }
848
849 /* helper functions: might be changed when/if skb uses a standard list_head */
850 /* remove one skb from head of slot queue */
851
852 static struct sk_buff *dequeue_head(struct cake_flow *flow)
853 {
854         struct sk_buff *skb = flow->head;
855
856         if (skb) {
857                 flow->head = skb->next;
858                 skb_mark_not_on_list(skb);
859         }
860
861         return skb;
862 }
863
864 /* add skb to flow queue (tail add) */
865
866 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
867 {
868         if (!flow->head)
869                 flow->head = skb;
870         else
871                 flow->tail->next = skb;
872         flow->tail = skb;
873         skb->next = NULL;
874 }
875
876 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
877                                     struct ipv6hdr *buf)
878 {
879         unsigned int offset = skb_network_offset(skb);
880         struct iphdr *iph;
881
882         iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
883
884         if (!iph)
885                 return NULL;
886
887         if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
888                 return skb_header_pointer(skb, offset + iph->ihl * 4,
889                                           sizeof(struct ipv6hdr), buf);
890
891         else if (iph->version == 4)
892                 return iph;
893
894         else if (iph->version == 6)
895                 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
896                                           buf);
897
898         return NULL;
899 }
900
901 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
902                                       void *buf, unsigned int bufsize)
903 {
904         unsigned int offset = skb_network_offset(skb);
905         const struct ipv6hdr *ipv6h;
906         const struct tcphdr *tcph;
907         const struct iphdr *iph;
908         struct ipv6hdr _ipv6h;
909         struct tcphdr _tcph;
910
911         ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
912
913         if (!ipv6h)
914                 return NULL;
915
916         if (ipv6h->version == 4) {
917                 iph = (struct iphdr *)ipv6h;
918                 offset += iph->ihl * 4;
919
920                 /* special-case 6in4 tunnelling, as that is a common way to get
921                  * v6 connectivity in the home
922                  */
923                 if (iph->protocol == IPPROTO_IPV6) {
924                         ipv6h = skb_header_pointer(skb, offset,
925                                                    sizeof(_ipv6h), &_ipv6h);
926
927                         if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
928                                 return NULL;
929
930                         offset += sizeof(struct ipv6hdr);
931
932                 } else if (iph->protocol != IPPROTO_TCP) {
933                         return NULL;
934                 }
935
936         } else if (ipv6h->version == 6) {
937                 if (ipv6h->nexthdr != IPPROTO_TCP)
938                         return NULL;
939
940                 offset += sizeof(struct ipv6hdr);
941         } else {
942                 return NULL;
943         }
944
945         tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
946         if (!tcph || tcph->doff < 5)
947                 return NULL;
948
949         return skb_header_pointer(skb, offset,
950                                   min(__tcp_hdrlen(tcph), bufsize), buf);
951 }
952
953 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
954                                    int code, int *oplen)
955 {
956         /* inspired by tcp_parse_options in tcp_input.c */
957         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
958         const u8 *ptr = (const u8 *)(tcph + 1);
959
960         while (length > 0) {
961                 int opcode = *ptr++;
962                 int opsize;
963
964                 if (opcode == TCPOPT_EOL)
965                         break;
966                 if (opcode == TCPOPT_NOP) {
967                         length--;
968                         continue;
969                 }
970                 if (length < 2)
971                         break;
972                 opsize = *ptr++;
973                 if (opsize < 2 || opsize > length)
974                         break;
975
976                 if (opcode == code) {
977                         *oplen = opsize;
978                         return ptr;
979                 }
980
981                 ptr += opsize - 2;
982                 length -= opsize;
983         }
984
985         return NULL;
986 }
987
988 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
989  * bytes than the other. In the case where both sequences SACK bytes that the
990  * other doesn't, A is considered greater. DSACKs in A also make A count as
991  * greater.
992  *
993  * @return -1, 0 or 1 as normal compare functions
994  */
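/* Example: if A SACKs only [1000,2000) while B SACKs [1000,3000), every
 * block in A is covered by B and B covers more bytes, so this returns 1.
 */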
995 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
996                                   const struct tcphdr *tcph_b)
997 {
998         const struct tcp_sack_block_wire *sack_a, *sack_b;
999         u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1000         u32 bytes_a = 0, bytes_b = 0;
1001         int oplen_a, oplen_b;
1002         bool first = true;
1003
1004         sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1005         sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1006
1007         /* pointers point to option contents */
1008         oplen_a -= TCPOLEN_SACK_BASE;
1009         oplen_b -= TCPOLEN_SACK_BASE;
1010
1011         if (sack_a && oplen_a >= sizeof(*sack_a) &&
1012             (!sack_b || oplen_b < sizeof(*sack_b)))
1013                 return -1;
1014         else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1015                  (!sack_a || oplen_a < sizeof(*sack_a)))
1016                 return 1;
1017         else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1018                  (!sack_b || oplen_b < sizeof(*sack_b)))
1019                 return 0;
1020
1021         while (oplen_a >= sizeof(*sack_a)) {
1022                 const struct tcp_sack_block_wire *sack_tmp = sack_b;
1023                 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1024                 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1025                 int oplen_tmp = oplen_b;
1026                 bool found = false;
1027
1028                 /* DSACK; always considered greater to prevent dropping */
1029                 if (before(start_a, ack_seq_a))
1030                         return -1;
1031
1032                 bytes_a += end_a - start_a;
1033
1034                 while (oplen_tmp >= sizeof(*sack_tmp)) {
1035                         u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1036                         u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1037
1038                         /* first time through we count the total size */
1039                         if (first)
1040                                 bytes_b += end_b - start_b;
1041
1042                         if (!after(start_b, start_a) && !before(end_b, end_a)) {
1043                                 found = true;
1044                                 if (!first)
1045                                         break;
1046                         }
1047                         oplen_tmp -= sizeof(*sack_tmp);
1048                         sack_tmp++;
1049                 }
1050
1051                 if (!found)
1052                         return -1;
1053
1054                 oplen_a -= sizeof(*sack_a);
1055                 sack_a++;
1056                 first = false;
1057         }
1058
1059         /* If we made it this far, all ranges SACKed by A are covered by B, so
1060          * either the SACKs are equal, or B SACKs more bytes.
1061          */
1062         return bytes_b > bytes_a ? 1 : 0;
1063 }
1064
1065 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1066                                  u32 *tsval, u32 *tsecr)
1067 {
1068         const u8 *ptr;
1069         int opsize;
1070
1071         ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1072
1073         if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1074                 *tsval = get_unaligned_be32(ptr);
1075                 *tsecr = get_unaligned_be32(ptr + 4);
1076         }
1077 }
1078
1079 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1080                                u32 tstamp_new, u32 tsecr_new)
1081 {
1082         /* inspired by tcp_parse_options in tcp_input.c */
1083         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1084         const u8 *ptr = (const u8 *)(tcph + 1);
1085         u32 tstamp, tsecr;
1086
1087         /* 3 reserved flags must be unset to avoid future breakage
1088          * ACK must be set
1089          * ECE/CWR are handled separately
1090          * All other flags URG/PSH/RST/SYN/FIN must be unset
1091          * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1092          * 0x00C00000 = CWR/ECE (handled separately)
1093          * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1094          */
1095         if (((tcp_flag_word(tcph) &
1096               cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1097                 return false;
1098
1099         while (length > 0) {
1100                 int opcode = *ptr++;
1101                 int opsize;
1102
1103                 if (opcode == TCPOPT_EOL)
1104                         break;
1105                 if (opcode == TCPOPT_NOP) {
1106                         length--;
1107                         continue;
1108                 }
1109                 if (length < 2)
1110                         break;
1111                 opsize = *ptr++;
1112                 if (opsize < 2 || opsize > length)
1113                         break;
1114
1115                 switch (opcode) {
1116                 case TCPOPT_MD5SIG: /* doesn't influence state */
1117                         break;
1118
1119                 case TCPOPT_SACK: /* stricter checking performed later */
1120                         if (opsize % 8 != 2)
1121                                 return false;
1122                         break;
1123
1124                 case TCPOPT_TIMESTAMP:
1125                         /* only drop timestamps lower than new */
1126                         if (opsize != TCPOLEN_TIMESTAMP)
1127                                 return false;
1128                         tstamp = get_unaligned_be32(ptr);
1129                         tsecr = get_unaligned_be32(ptr + 4);
1130                         if (after(tstamp, tstamp_new) ||
1131                             after(tsecr, tsecr_new))
1132                                 return false;
1133                         break;
1134
1135                 case TCPOPT_MSS:  /* these should only be set on SYN */
1136                 case TCPOPT_WINDOW:
1137                 case TCPOPT_SACK_PERM:
1138                 case TCPOPT_FASTOPEN:
1139                 case TCPOPT_EXP:
1140                 default: /* don't drop if any unknown options are present */
1141                         return false;
1142                 }
1143
1144                 ptr += opsize - 2;
1145                 length -= opsize;
1146         }
1147
1148         return true;
1149 }
1150
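/* Scan the flow's queue for a pure ACK that the newly enqueued ACK at the
 * tail makes redundant: same 5-tuple, no payload, and no newer information
 * in cumulative-ACK, SACK or ECN terms.  In conservative mode a drop needs
 * at least two eligible ACKs in the queue; aggressive mode may also drop a
 * single eligible ACK adjacent to the new one with matching ECE/CWR flags.
 * Returns the unlinked skb to drop, or NULL.
 */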
1151 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1152                                        struct cake_flow *flow)
1153 {
1154         bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1155         struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1156         struct sk_buff *skb_check, *skb_prev = NULL;
1157         const struct ipv6hdr *ipv6h, *ipv6h_check;
1158         unsigned char _tcph[64], _tcph_check[64];
1159         const struct tcphdr *tcph, *tcph_check;
1160         const struct iphdr *iph, *iph_check;
1161         struct ipv6hdr _iph, _iph_check;
1162         const struct sk_buff *skb;
1163         int seglen, num_found = 0;
1164         u32 tstamp = 0, tsecr = 0;
1165         __be32 elig_flags = 0;
1166         int sack_comp;
1167
1168         /* no other possible ACKs to filter */
1169         if (flow->head == flow->tail)
1170                 return NULL;
1171
1172         skb = flow->tail;
1173         tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1174         iph = cake_get_iphdr(skb, &_iph);
1175         if (!tcph)
1176                 return NULL;
1177
1178         cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1179
1180         /* the 'triggering' packet need only have the ACK flag set.
1181          * also check that SYN is not set, as there won't be any previous ACKs.
1182          */
1183         if ((tcp_flag_word(tcph) &
1184              (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1185                 return NULL;
1186
1187         /* the 'triggering' ACK is at the tail of the queue, we have already
1188          * returned if it is the only packet in the flow. loop through the rest
1189          * of the queue looking for pure ACKs with the same 5-tuple as the
1190          * triggering one.
1191          */
1192         for (skb_check = flow->head;
1193              skb_check && skb_check != skb;
1194              skb_prev = skb_check, skb_check = skb_check->next) {
1195                 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1196                 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1197                                              sizeof(_tcph_check));
1198
1199                 /* only TCP packets with matching 5-tuple are eligible, and only
1200                  * drop safe headers
1201                  */
1202                 if (!tcph_check || iph->version != iph_check->version ||
1203                     tcph_check->source != tcph->source ||
1204                     tcph_check->dest != tcph->dest)
1205                         continue;
1206
1207                 if (iph_check->version == 4) {
1208                         if (iph_check->saddr != iph->saddr ||
1209                             iph_check->daddr != iph->daddr)
1210                                 continue;
1211
1212                         seglen = ntohs(iph_check->tot_len) -
1213                                        (4 * iph_check->ihl);
1214                 } else if (iph_check->version == 6) {
1215                         ipv6h = (struct ipv6hdr *)iph;
1216                         ipv6h_check = (struct ipv6hdr *)iph_check;
1217
1218                         if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1219                             ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1220                                 continue;
1221
1222                         seglen = ntohs(ipv6h_check->payload_len);
1223                 } else {
1224                         WARN_ON(1);  /* shouldn't happen */
1225                         continue;
1226                 }
1227
1228                 /* If the ECE/CWR flags changed from the previous eligible
1229                  * packet in the same flow, we should no longer be dropping that
1230                  * previous packet as this would lose information.
1231                  */
1232                 if (elig_ack && (tcp_flag_word(tcph_check) &
1233                                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1234                         elig_ack = NULL;
1235                         elig_ack_prev = NULL;
1236                         num_found--;
1237                 }
1238
1239                 /* Check TCP options and flags, don't drop ACKs with segment
1240                  * data, and don't drop ACKs with a higher cumulative ACK
1241                  * counter than the triggering packet. Check ACK seqno here to
1242                  * avoid parsing SACK options of packets we are going to exclude
1243                  * anyway.
1244                  */
1245                 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1246                     (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1247                     after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1248                         continue;
1249
1250                 /* Check SACK options. The triggering packet must SACK more data
1251                  * than the ACK under consideration, or SACK the same range but
1252                  * have a larger cumulative ACK counter. The latter is a
1253                  * pathological case, but is contained in the following check
1254                  * anyway, just to be safe.
1255                  */
1256                 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1257
1258                 if (sack_comp < 0 ||
1259                     (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1260                      sack_comp == 0))
1261                         continue;
1262
1263                 /* At this point we have found an eligible pure ACK to drop; if
1264                  * we are in aggressive mode, we are done. Otherwise, keep
1265                  * searching unless this is the second eligible ACK we
1266                  * found.
1267                  *
1268                  * Since we want to drop ACK closest to the head of the queue,
1269                  * save the first eligible ACK we find, even if we need to loop
1270                  * again.
1271                  */
1272                 if (!elig_ack) {
1273                         elig_ack = skb_check;
1274                         elig_ack_prev = skb_prev;
1275                         elig_flags = (tcp_flag_word(tcph_check)
1276                                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1277                 }
1278
1279                 if (num_found++ > 0)
1280                         goto found;
1281         }
1282
1283         /* We made it through the queue without finding two eligible ACKs. If
1284          * we found a single eligible ACK we can drop it in aggressive mode if
1285          * we can guarantee that this does not interfere with ECN flag
1286          * information. We ensure this by dropping it only if the enqueued
1287          * packet is consecutive with the eligible ACK, and their flags match.
1288          */
1289         if (elig_ack && aggressive && elig_ack->next == skb &&
1290             (elig_flags == (tcp_flag_word(tcph) &
1291                             (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1292                 goto found;
1293
1294         return NULL;
1295
1296 found:
1297         if (elig_ack_prev)
1298                 elig_ack_prev->next = elig_ack->next;
1299         else
1300                 flow->head = elig_ack->next;
1301
1302         skb_mark_not_on_list(elig_ack);
1303
1304         return elig_ack;
1305 }
1306
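/* Exponentially-weighted moving average: with shift = 8 each new sample
 * contributes 1/256 of its value, so roughly 63% of a step change is
 * absorbed after 256 samples.
 */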
1307 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1308 {
1309         avg -= avg >> shift;
1310         avg += sample >> shift;
1311         return avg;
1312 }
1313
1314 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1315 {
1316         if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1317                 len -= off;
1318
1319         if (q->max_netlen < len)
1320                 q->max_netlen = len;
1321         if (q->min_netlen > len)
1322                 q->min_netlen = len;
1323
1324         len += q->rate_overhead;
1325
1326         if (len < q->rate_mpu)
1327                 len = q->rate_mpu;
1328
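        /* ATM AAL5 pads each frame to a whole number of 48-byte cell
         * payloads and every cell carries a 5-byte header, so the on-wire
         * size is ceil(len / 48) * 53; e.g. a 100-byte frame occupies
         * 3 cells = 159 bytes.
         */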
1329         if (q->atm_mode == CAKE_ATM_ATM) {
1330                 len += 47;
1331                 len /= 48;
1332                 len *= 53;
1333         } else if (q->atm_mode == CAKE_ATM_PTM) {
1334                 /* Add one byte per 64 bytes or part thereof.
1335                  * This is conservative and easier to calculate than the
1336                  * precise value.
1337                  */
1338                 len += (len + 63) / 64;
1339         }
1340
1341         if (q->max_adjlen < len)
1342                 q->max_adjlen = len;
1343         if (q->min_adjlen > len)
1344                 q->min_adjlen = len;
1345
1346         return len;
1347 }
1348
1349 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1350 {
1351         const struct skb_shared_info *shinfo = skb_shinfo(skb);
1352         unsigned int hdr_len, last_len = 0;
1353         u32 off = skb_network_offset(skb);
1354         u32 len = qdisc_pkt_len(skb);
1355         u16 segs = 1;
1356
1357         q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1358
1359         if (!shinfo->gso_size)
1360                 return cake_calc_overhead(q, len, off);
1361
1362         /* borrowed from qdisc_pkt_len_init() */
1363         hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1364
1365         /* + transport layer */
1366         if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1367                                                 SKB_GSO_TCPV6))) {
1368                 const struct tcphdr *th;
1369                 struct tcphdr _tcphdr;
1370
1371                 th = skb_header_pointer(skb, skb_transport_offset(skb),
1372                                         sizeof(_tcphdr), &_tcphdr);
1373                 if (likely(th))
1374                         hdr_len += __tcp_hdrlen(th);
1375         } else {
1376                 struct udphdr _udphdr;
1377
1378                 if (skb_header_pointer(skb, skb_transport_offset(skb),
1379                                        sizeof(_udphdr), &_udphdr))
1380                         hdr_len += sizeof(struct udphdr);
1381         }
1382
1383         if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1384                 segs = DIV_ROUND_UP(skb->len - hdr_len,
1385                                     shinfo->gso_size);
1386         else
1387                 segs = shinfo->gso_segs;
1388
1389         len = shinfo->gso_size + hdr_len;
1390         last_len = skb->len - shinfo->gso_size * (segs - 1);
1391
1392         return (cake_calc_overhead(q, len, off) * (segs - 1) +
1393                 cake_calc_overhead(q, last_len, off));
1394 }
1395
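/* The overflow heap is a binary max-heap over every (tin, flow) pair, keyed
 * by that queue's backlog.  overflow_idx[] records each queue's current heap
 * position so a swap can be mirrored in O(1), and cake_drop() reads the heap
 * root to find the longest queue in the whole qdisc when the buffer limit is
 * exceeded.
 */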
1396 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1397 {
1398         struct cake_heap_entry ii = q->overflow_heap[i];
1399         struct cake_heap_entry jj = q->overflow_heap[j];
1400
1401         q->overflow_heap[i] = jj;
1402         q->overflow_heap[j] = ii;
1403
1404         q->tins[ii.t].overflow_idx[ii.b] = j;
1405         q->tins[jj.t].overflow_idx[jj.b] = i;
1406 }
1407
1408 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1409 {
1410         struct cake_heap_entry ii = q->overflow_heap[i];
1411
1412         return q->tins[ii.t].backlogs[ii.b];
1413 }
1414
1415 static void cake_heapify(struct cake_sched_data *q, u16 i)
1416 {
1417         static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1418         u32 mb = cake_heap_get_backlog(q, i);
1419         u32 m = i;
1420
1421         while (m < a) {
1422                 u32 l = m + m + 1;
1423                 u32 r = l + 1;
1424
1425                 if (l < a) {
1426                         u32 lb = cake_heap_get_backlog(q, l);
1427
1428                         if (lb > mb) {
1429                                 m  = l;
1430                                 mb = lb;
1431                         }
1432                 }
1433
1434                 if (r < a) {
1435                         u32 rb = cake_heap_get_backlog(q, r);
1436
1437                         if (rb > mb) {
1438                                 m  = r;
1439                                 mb = rb;
1440                         }
1441                 }
1442
1443                 if (m != i) {
1444                         cake_heap_swap(q, i, m);
1445                         i = m;
1446                 } else {
1447                         break;
1448                 }
1449         }
1450 }
1451
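/* The overflow heap managed above is an implicit binary max-heap over all
 * CAKE_MAX_TINS * CAKE_QUEUES (8 * 1024 = 8192) per-queue backlogs: node i
 * has children 2*i + 1 and 2*i + 2, and parent (i - 1) / 2, e.g.
 *
 *            0
 *          /   \
 *         1     2
 *        / \   / \
 *       3   4 5   6
 *
 * overflow_idx[] stores each queue's current heap slot, so cake_heap_swap()
 * can maintain the reverse mapping in O(1) and cake_drop() can always prune
 * the queue with the largest backlog from slot 0.
 */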
1452 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1453 {
1454         while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1455                 u16 p = (i - 1) >> 1;
1456                 u32 ib = cake_heap_get_backlog(q, i);
1457                 u32 pb = cake_heap_get_backlog(q, p);
1458
1459                 if (ib > pb) {
1460                         cake_heap_swap(q, i, p);
1461                         i = p;
1462                 } else {
1463                         break;
1464                 }
1465         }
1466 }
1467
1468 static int cake_advance_shaper(struct cake_sched_data *q,
1469                                struct cake_tin_data *b,
1470                                struct sk_buff *skb,
1471                                ktime_t now, bool drop)
1472 {
1473         u32 len = get_cobalt_cb(skb)->adjusted_len;
1474
1475         /* charge packet bandwidth to this tin
1476          * and to the global shaper.
1477          */
1478         if (q->rate_ns) {
1479                 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1480                 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1481                 u64 failsafe_dur = global_dur + (global_dur >> 1);
1482
1483                 if (ktime_before(b->time_next_packet, now))
1484                         b->time_next_packet = ktime_add_ns(b->time_next_packet,
1485                                                            tin_dur);
1486
1487                 else if (ktime_before(b->time_next_packet,
1488                                       ktime_add_ns(now, tin_dur)))
1489                         b->time_next_packet = ktime_add_ns(now, tin_dur);
1490
1491                 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1492                                                    global_dur);
1493                 if (!drop)
1494                         q->failsafe_next_packet = \
1495                                 ktime_add_ns(q->failsafe_next_packet,
1496                                              failsafe_dur);
1497         }
1498         return len;
1499 }
1500
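/* Worked example for the charging above, assuming a 100 Mbit/s
 * (12.5 Mbyte/s) rate for both the tin and the global shaper, which
 * cake_set_rate() encodes as roughly 80 ns of transmit time per byte:
 * a 1514-byte packet yields tin_dur = global_dur ~= 1514 * 80 ns
 * ~= 121 us, plus (when not dropping) a failsafe_dur of 1.5 times that,
 * ~182 us.  A per-tin clock that has fallen behind 'now' is advanced by
 * tin_dur from its old value; one that is already ahead of 'now' ends up
 * at least tin_dur ahead of it.
 */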
1501 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1502 {
1503         struct cake_sched_data *q = qdisc_priv(sch);
1504         ktime_t now = ktime_get();
1505         u32 idx = 0, tin = 0, len;
1506         struct cake_heap_entry qq;
1507         struct cake_tin_data *b;
1508         struct cake_flow *flow;
1509         struct sk_buff *skb;
1510
1511         if (!q->overflow_timeout) {
1512                 int i;
1513                 /* Build fresh max-heap */
1514                 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1515                         cake_heapify(q, i);
1516         }
1517         q->overflow_timeout = 65535;
1518
1519         /* select longest queue for pruning */
1520         qq  = q->overflow_heap[0];
1521         tin = qq.t;
1522         idx = qq.b;
1523
1524         b = &q->tins[tin];
1525         flow = &b->flows[idx];
1526         skb = dequeue_head(flow);
1527         if (unlikely(!skb)) {
1528                 /* heap has gone wrong, rebuild it next time */
1529                 q->overflow_timeout = 0;
1530                 return idx + (tin << 16);
1531         }
1532
1533         if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1534                 b->unresponsive_flow_count++;
1535
1536         len = qdisc_pkt_len(skb);
1537         q->buffer_used      -= skb->truesize;
1538         b->backlogs[idx]    -= len;
1539         b->tin_backlog      -= len;
1540         sch->qstats.backlog -= len;
1541         qdisc_tree_reduce_backlog(sch, 1, len);
1542
1543         flow->dropped++;
1544         b->tin_dropped++;
1545         sch->qstats.drops++;
1546
1547         if (q->rate_flags & CAKE_FLAG_INGRESS)
1548                 cake_advance_shaper(q, b, skb, now, true);
1549
1550         __qdisc_drop(skb, to_free);
1551         sch->q.qlen--;
1552
1553         cake_heapify(q, 0);
1554
1555         return idx + (tin << 16);
1556 }
1557
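/* The return value packs the pruned queue's location as idx + (tin << 16);
 * e.g. a drop from tin 2, queue 371 returns 0x20173.  q->overflow_timeout
 * doubles as a "heap is valid" flag: it is set to 65535 here, decremented
 * once per successful dequeue, and the max-heap is rebuilt from scratch
 * only once it has decayed back to zero.
 */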
1558 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1559 {
1560         const int offset = skb_network_offset(skb);
1561         u16 *buf, buf_;
1562         u8 dscp;
1563
1564         switch (skb_protocol(skb, true)) {
1565         case htons(ETH_P_IP):
1566                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1567                 if (unlikely(!buf))
1568                         return 0;
1569
1570                 /* ToS is in the second byte of iphdr */
1571                 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1572
1573                 if (wash && dscp) {
1574                         const int wlen = offset + sizeof(struct iphdr);
1575
1576                         if (!pskb_may_pull(skb, wlen) ||
1577                             skb_try_make_writable(skb, wlen))
1578                                 return 0;
1579
1580                         ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1581                 }
1582
1583                 return dscp;
1584
1585         case htons(ETH_P_IPV6):
1586                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1587                 if (unlikely(!buf))
1588                         return 0;
1589
1590                 /* Traffic class is in the first and second bytes of ipv6hdr */
1591                 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1592
1593                 if (wash && dscp) {
1594                         const int wlen = offset + sizeof(struct ipv6hdr);
1595
1596                         if (!pskb_may_pull(skb, wlen) ||
1597                             skb_try_make_writable(skb, wlen))
1598                                 return 0;
1599
1600                         ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1601                 }
1602
1603                 return dscp;
1604
1605         case htons(ETH_P_ARP):
1606                 return 0x38;  /* CS7 - Net Control */
1607
1608         default:
1609                 /* If there is no Diffserv field, treat as best-effort */
1610                 return 0;
1611         }
1612 }
1613
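/* Worked example: an IPv4 header with ToS byte 0xb8 (EF, ECN not set)
 * yields dscp = 0xb8 >> 2 = 0x2e (46).  With 'wash' enabled the DSCP bits
 * are then cleared in place, preserving only the two ECN bits, but the
 * value returned (and therefore used for tin selection) is the one read
 * before washing.
 */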
1614 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1615                                              struct sk_buff *skb)
1616 {
1617         struct cake_sched_data *q = qdisc_priv(sch);
1618         u32 tin, mark;
1619         bool wash;
1620         u8 dscp;
1621
1622         /* Tin selection: default to Diffserv-based selection, but allow overrides
1623          * via firewall marks or skb->priority.  Parse the DSCP early if washing is
1624          * enabled; otherwise defer it so the parse is skipped when an override applies.
1625          */
1626         mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1627         wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1628         if (wash)
1629                 dscp = cake_handle_diffserv(skb, wash);
1630
1631         if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1632                 tin = 0;
1633
1634         else if (mark && mark <= q->tin_cnt)
1635                 tin = q->tin_order[mark - 1];
1636
1637         else if (TC_H_MAJ(skb->priority) == sch->handle &&
1638                  TC_H_MIN(skb->priority) > 0 &&
1639                  TC_H_MIN(skb->priority) <= q->tin_cnt)
1640                 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1641
1642         else {
1643                 if (!wash)
1644                         dscp = cake_handle_diffserv(skb, wash);
1645                 tin = q->tin_index[dscp];
1646
1647                 if (unlikely(tin >= q->tin_cnt))
1648                         tin = 0;
1649         }
1650
1651         return &q->tins[tin];
1652 }
1653
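/* Worked example of the override precedence above, assuming "fwmark 0xf0"
 * was configured (so fwmark_shft = __ffs(0xf0) = 4) on a diffserv3
 * instance: a packet with skb->mark = 0x30 gives
 * mark = (0x30 & 0xf0) >> 4 = 3, which is within tin_cnt and therefore
 * selects q->tin_order[2] regardless of its DSCP.  Only packets whose
 * masked mark is zero or out of range fall through to skb->priority and
 * then to Diffserv-based selection.
 */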
1654 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1655                          struct sk_buff *skb, int flow_mode, int *qerr)
1656 {
1657         struct cake_sched_data *q = qdisc_priv(sch);
1658         struct tcf_proto *filter;
1659         struct tcf_result res;
1660         u16 flow = 0, host = 0;
1661         int result;
1662
1663         filter = rcu_dereference_bh(q->filter_list);
1664         if (!filter)
1665                 goto hash;
1666
1667         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1668         result = tcf_classify(skb, filter, &res, false);
1669
1670         if (result >= 0) {
1671 #ifdef CONFIG_NET_CLS_ACT
1672                 switch (result) {
1673                 case TC_ACT_STOLEN:
1674                 case TC_ACT_QUEUED:
1675                 case TC_ACT_TRAP:
1676                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1677                         fallthrough;
1678                 case TC_ACT_SHOT:
1679                         return 0;
1680                 }
1681 #endif
1682                 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1683                         flow = TC_H_MIN(res.classid);
1684                 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1685                         host = TC_H_MAJ(res.classid) >> 16;
1686         }
1687 hash:
1688         *t = cake_select_tin(sch, skb);
1689         return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1690 }
1691
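/* A tc filter attached to the qdisc can pin packets to specific queues:
 * a classid minor number between 1 and CAKE_QUEUES is passed to
 * cake_hash() as a flow override, and a major number in the same range
 * (carried in the upper 16 bits) as a host override for host-fairness
 * accounting.  Tin selection itself is still performed by
 * cake_select_tin() above.
 */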
1692 static void cake_reconfigure(struct Qdisc *sch);
1693
1694 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1695                         struct sk_buff **to_free)
1696 {
1697         struct cake_sched_data *q = qdisc_priv(sch);
1698         int len = qdisc_pkt_len(skb);
1699         int ret;
1700         struct sk_buff *ack = NULL;
1701         ktime_t now = ktime_get();
1702         struct cake_tin_data *b;
1703         struct cake_flow *flow;
1704         u32 idx;
1705
1706         /* choose flow to insert into */
1707         idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1708         if (idx == 0) {
1709                 if (ret & __NET_XMIT_BYPASS)
1710                         qdisc_qstats_drop(sch);
1711                 __qdisc_drop(skb, to_free);
1712                 return ret;
1713         }
1714         idx--;
1715         flow = &b->flows[idx];
1716
1717         /* ensure shaper state isn't stale */
1718         if (!b->tin_backlog) {
1719                 if (ktime_before(b->time_next_packet, now))
1720                         b->time_next_packet = now;
1721
1722                 if (!sch->q.qlen) {
1723                         if (ktime_before(q->time_next_packet, now)) {
1724                                 q->failsafe_next_packet = now;
1725                                 q->time_next_packet = now;
1726                         } else if (ktime_after(q->time_next_packet, now) &&
1727                                    ktime_after(q->failsafe_next_packet, now)) {
1728                                 u64 next = \
1729                                         min(ktime_to_ns(q->time_next_packet),
1730                                             ktime_to_ns(
1731                                                    q->failsafe_next_packet));
1732                                 sch->qstats.overlimits++;
1733                                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1734                         }
1735                 }
1736         }
1737
1738         if (unlikely(len > b->max_skblen))
1739                 b->max_skblen = len;
1740
1741         if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1742                 struct sk_buff *segs, *nskb;
1743                 netdev_features_t features = netif_skb_features(skb);
1744                 unsigned int slen = 0, numsegs = 0;
1745
1746                 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1747                 if (IS_ERR_OR_NULL(segs))
1748                         return qdisc_drop(skb, sch, to_free);
1749
1750                 skb_list_walk_safe(segs, segs, nskb) {
1751                         skb_mark_not_on_list(segs);
1752                         qdisc_skb_cb(segs)->pkt_len = segs->len;
1753                         cobalt_set_enqueue_time(segs, now);
1754                         get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1755                                                                           segs);
1756                         flow_queue_add(flow, segs);
1757
1758                         sch->q.qlen++;
1759                         numsegs++;
1760                         slen += segs->len;
1761                         q->buffer_used += segs->truesize;
1762                         b->packets++;
1763                 }
1764
1765                 /* stats */
1766                 b->bytes            += slen;
1767                 b->backlogs[idx]    += slen;
1768                 b->tin_backlog      += slen;
1769                 sch->qstats.backlog += slen;
1770                 q->avg_window_bytes += slen;
1771
1772                 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1773                 consume_skb(skb);
1774         } else {
1775                 /* not splitting */
1776                 cobalt_set_enqueue_time(skb, now);
1777                 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1778                 flow_queue_add(flow, skb);
1779
1780                 if (q->ack_filter)
1781                         ack = cake_ack_filter(q, flow);
1782
1783                 if (ack) {
1784                         b->ack_drops++;
1785                         sch->qstats.drops++;
1786                         b->bytes += qdisc_pkt_len(ack);
1787                         len -= qdisc_pkt_len(ack);
1788                         q->buffer_used += skb->truesize - ack->truesize;
1789                         if (q->rate_flags & CAKE_FLAG_INGRESS)
1790                                 cake_advance_shaper(q, b, ack, now, true);
1791
1792                         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1793                         consume_skb(ack);
1794                 } else {
1795                         sch->q.qlen++;
1796                         q->buffer_used      += skb->truesize;
1797                 }
1798
1799                 /* stats */
1800                 b->packets++;
1801                 b->bytes            += len;
1802                 b->backlogs[idx]    += len;
1803                 b->tin_backlog      += len;
1804                 sch->qstats.backlog += len;
1805                 q->avg_window_bytes += len;
1806         }
1807
1808         if (q->overflow_timeout)
1809                 cake_heapify_up(q, b->overflow_idx[idx]);
1810
1811         /* incoming bandwidth capacity estimate */
1812         if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1813                 u64 packet_interval = \
1814                         ktime_to_ns(ktime_sub(now, q->last_packet_time));
1815
1816                 if (packet_interval > NSEC_PER_SEC)
1817                         packet_interval = NSEC_PER_SEC;
1818
1819                 /* filter out short-term bursts, eg. wifi aggregation */
1820                 q->avg_packet_interval = \
1821                         cake_ewma(q->avg_packet_interval,
1822                                   packet_interval,
1823                                   (packet_interval > q->avg_packet_interval ?
1824                                           2 : 8));
1825
1826                 q->last_packet_time = now;
1827
1828                 if (packet_interval > q->avg_packet_interval) {
1829                         u64 window_interval = \
1830                                 ktime_to_ns(ktime_sub(now,
1831                                                       q->avg_window_begin));
1832                         u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1833
1834                         b = div64_u64(b, window_interval);
1835                         q->avg_peak_bandwidth =
1836                                 cake_ewma(q->avg_peak_bandwidth, b,
1837                                           b > q->avg_peak_bandwidth ? 2 : 8);
1838                         q->avg_window_bytes = 0;
1839                         q->avg_window_begin = now;
1840
1841                         if (ktime_after(now,
1842                                         ktime_add_ms(q->last_reconfig_time,
1843                                                      250))) {
1844                                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1845                                 cake_reconfigure(sch);
1846                         }
1847                 }
1848         } else {
1849                 q->avg_window_bytes = 0;
1850                 q->last_packet_time = now;
1851         }
1852
1853         /* flowchain */
1854         if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1855                 struct cake_host *srchost = &b->hosts[flow->srchost];
1856                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1857                 u16 host_load = 1;
1858
1859                 if (!flow->set) {
1860                         list_add_tail(&flow->flowchain, &b->new_flows);
1861                 } else {
1862                         b->decaying_flow_count--;
1863                         list_move_tail(&flow->flowchain, &b->new_flows);
1864                 }
1865                 flow->set = CAKE_SET_SPARSE;
1866                 b->sparse_flow_count++;
1867
1868                 if (cake_dsrc(q->flow_mode))
1869                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
1870
1871                 if (cake_ddst(q->flow_mode))
1872                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1873
1874                 flow->deficit = (b->flow_quantum *
1875                                  quantum_div[host_load]) >> 16;
1876         } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1877                 struct cake_host *srchost = &b->hosts[flow->srchost];
1878                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1879
1880                 /* This flow was empty, so it was still accounted as a sparse
1881                  * flow, but it is actually in the bulk rotation; update the counts.
1882                  */
1883                 flow->set = CAKE_SET_BULK;
1884                 b->sparse_flow_count--;
1885                 b->bulk_flow_count++;
1886
1887                 if (cake_dsrc(q->flow_mode))
1888                         srchost->srchost_bulk_flow_count++;
1889
1890                 if (cake_ddst(q->flow_mode))
1891                         dsthost->dsthost_bulk_flow_count++;
1892
1893         }
1894
1895         if (q->buffer_used > q->buffer_max_used)
1896                 q->buffer_max_used = q->buffer_used;
1897
1898         if (q->buffer_used > q->buffer_limit) {
1899                 u32 dropped = 0;
1900
1901                 while (q->buffer_used > q->buffer_limit) {
1902                         dropped++;
1903                         cake_drop(sch, to_free);
1904                 }
1905                 b->drop_overlimit += dropped;
1906         }
1907         return NET_XMIT_SUCCESS;
1908 }
1909
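/* The autorate-ingress estimator in cake_enqueue() above uses two
 * asymmetric EWMAs: packet gaps and per-window throughput are averaged
 * with cake_ewma(), whose shift argument gives the new sample a weight of
 * roughly 1/2^shift, using a fast shift of 2 (1/4) when the sample is
 * rising and a slow shift of 8 (1/256) when it is falling, so the peak
 * bandwidth estimate rises quickly and decays slowly.  When more than
 * 250 ms have passed since the last recorded reconfiguration time, the
 * shaper is set to 15/16 of the estimate; e.g. an estimated 16 Mbyte/s of
 * capacity programs a 15 Mbyte/s shaping rate via cake_reconfigure().
 */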
1910 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1911 {
1912         struct cake_sched_data *q = qdisc_priv(sch);
1913         struct cake_tin_data *b = &q->tins[q->cur_tin];
1914         struct cake_flow *flow = &b->flows[q->cur_flow];
1915         struct sk_buff *skb = NULL;
1916         u32 len;
1917
1918         if (flow->head) {
1919                 skb = dequeue_head(flow);
1920                 len = qdisc_pkt_len(skb);
1921                 b->backlogs[q->cur_flow] -= len;
1922                 b->tin_backlog           -= len;
1923                 sch->qstats.backlog      -= len;
1924                 q->buffer_used           -= skb->truesize;
1925                 sch->q.qlen--;
1926
1927                 if (q->overflow_timeout)
1928                         cake_heapify(q, b->overflow_idx[q->cur_flow]);
1929         }
1930         return skb;
1931 }
1932
1933 /* Discard leftover packets from a tin no longer in use. */
1934 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1935 {
1936         struct cake_sched_data *q = qdisc_priv(sch);
1937         struct sk_buff *skb;
1938
1939         q->cur_tin = tin;
1940         for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1941                 while (!!(skb = cake_dequeue_one(sch)))
1942                         kfree_skb(skb);
1943 }
1944
1945 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1946 {
1947         struct cake_sched_data *q = qdisc_priv(sch);
1948         struct cake_tin_data *b = &q->tins[q->cur_tin];
1949         struct cake_host *srchost, *dsthost;
1950         ktime_t now = ktime_get();
1951         struct cake_flow *flow;
1952         struct list_head *head;
1953         bool first_flow = true;
1954         struct sk_buff *skb;
1955         u16 host_load;
1956         u64 delay;
1957         u32 len;
1958
1959 begin:
1960         if (!sch->q.qlen)
1961                 return NULL;
1962
1963         /* global hard shaper */
1964         if (ktime_after(q->time_next_packet, now) &&
1965             ktime_after(q->failsafe_next_packet, now)) {
1966                 u64 next = min(ktime_to_ns(q->time_next_packet),
1967                                ktime_to_ns(q->failsafe_next_packet));
1968
1969                 sch->qstats.overlimits++;
1970                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1971                 return NULL;
1972         }
1973
1974         /* Choose a class to work on. */
1975         if (!q->rate_ns) {
1976                 /* In unlimited mode, can't rely on shaper timings, just balance
1977                  * with DRR
1978                  */
1979                 bool wrapped = false, empty = true;
1980
1981                 while (b->tin_deficit < 0 ||
1982                        !(b->sparse_flow_count + b->bulk_flow_count)) {
1983                         if (b->tin_deficit <= 0)
1984                                 b->tin_deficit += b->tin_quantum;
1985                         if (b->sparse_flow_count + b->bulk_flow_count)
1986                                 empty = false;
1987
1988                         q->cur_tin++;
1989                         b++;
1990                         if (q->cur_tin >= q->tin_cnt) {
1991                                 q->cur_tin = 0;
1992                                 b = q->tins;
1993
1994                                 if (wrapped) {
1995                                         /* It's possible for q->qlen to be
1996                                          * nonzero when we actually have no
1997                                          * packets anywhere.
1998                                          */
1999                                         if (empty)
2000                                                 return NULL;
2001                                 } else {
2002                                         wrapped = true;
2003                                 }
2004                         }
2005                 }
2006         } else {
2007                 /* In shaped mode, choose:
2008                  * - Highest-priority tin with queue and meeting schedule, or
2009                  * - The earliest-scheduled tin with queue.
2010                  */
2011                 ktime_t best_time = KTIME_MAX;
2012                 int tin, best_tin = 0;
2013
2014                 for (tin = 0; tin < q->tin_cnt; tin++) {
2015                         b = q->tins + tin;
2016                         if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2017                                 ktime_t time_to_pkt = \
2018                                         ktime_sub(b->time_next_packet, now);
2019
2020                                 if (ktime_to_ns(time_to_pkt) <= 0 ||
2021                                     ktime_compare(time_to_pkt,
2022                                                   best_time) <= 0) {
2023                                         best_time = time_to_pkt;
2024                                         best_tin = tin;
2025                                 }
2026                         }
2027                 }
2028
2029                 q->cur_tin = best_tin;
2030                 b = q->tins + best_tin;
2031
2032                 /* No point in going further if no packets to deliver. */
2033                 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2034                         return NULL;
2035         }
2036
2037 retry:
2038         /* service this class */
2039         head = &b->decaying_flows;
2040         if (!first_flow || list_empty(head)) {
2041                 head = &b->new_flows;
2042                 if (list_empty(head)) {
2043                         head = &b->old_flows;
2044                         if (unlikely(list_empty(head))) {
2045                                 head = &b->decaying_flows;
2046                                 if (unlikely(list_empty(head)))
2047                                         goto begin;
2048                         }
2049                 }
2050         }
2051         flow = list_first_entry(head, struct cake_flow, flowchain);
2052         q->cur_flow = flow - b->flows;
2053         first_flow = false;
2054
2055         /* triple isolation (modified DRR++) */
2056         srchost = &b->hosts[flow->srchost];
2057         dsthost = &b->hosts[flow->dsthost];
2058         host_load = 1;
2059
2060         /* flow isolation (DRR++) */
2061         if (flow->deficit <= 0) {
2062                 /* Keep all flows with deficits out of the sparse and decaying
2063                  * rotations.  No non-empty flow can go into the decaying
2064                  * rotation, so flows there cannot acquire deficits.
2065                  */
2066                 if (flow->set == CAKE_SET_SPARSE) {
2067                         if (flow->head) {
2068                                 b->sparse_flow_count--;
2069                                 b->bulk_flow_count++;
2070
2071                                 if (cake_dsrc(q->flow_mode))
2072                                         srchost->srchost_bulk_flow_count++;
2073
2074                                 if (cake_ddst(q->flow_mode))
2075                                         dsthost->dsthost_bulk_flow_count++;
2076
2077                                 flow->set = CAKE_SET_BULK;
2078                         } else {
2079                                 /* we've moved it to the bulk rotation for
2080                                  * correct deficit accounting but we still want
2081                                  * to count it as a sparse flow, not a bulk one.
2082                                  */
2083                                 flow->set = CAKE_SET_SPARSE_WAIT;
2084                         }
2085                 }
2086
2087                 if (cake_dsrc(q->flow_mode))
2088                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
2089
2090                 if (cake_ddst(q->flow_mode))
2091                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2092
2093                 WARN_ON(host_load > CAKE_QUEUES);
2094
2095                 /* The shifted prandom_u32() is a way to apply dithering to
2096                  * avoid accumulating roundoff errors
2097                  */
2098                 flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2099                                   (prandom_u32() >> 16)) >> 16;
2100                 list_move_tail(&flow->flowchain, &b->old_flows);
2101
2102                 goto retry;
2103         }
2104
2105         /* Retrieve a packet via the AQM */
2106         while (1) {
2107                 skb = cake_dequeue_one(sch);
2108                 if (!skb) {
2109                         /* this queue was actually empty */
2110                         if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2111                                 b->unresponsive_flow_count--;
2112
2113                         if (flow->cvars.p_drop || flow->cvars.count ||
2114                             ktime_before(now, flow->cvars.drop_next)) {
2115                                 /* keep in the flowchain until the state has
2116                                  * decayed to rest
2117                                  */
2118                                 list_move_tail(&flow->flowchain,
2119                                                &b->decaying_flows);
2120                                 if (flow->set == CAKE_SET_BULK) {
2121                                         b->bulk_flow_count--;
2122
2123                                         if (cake_dsrc(q->flow_mode))
2124                                                 srchost->srchost_bulk_flow_count--;
2125
2126                                         if (cake_ddst(q->flow_mode))
2127                                                 dsthost->dsthost_bulk_flow_count--;
2128
2129                                         b->decaying_flow_count++;
2130                                 } else if (flow->set == CAKE_SET_SPARSE ||
2131                                            flow->set == CAKE_SET_SPARSE_WAIT) {
2132                                         b->sparse_flow_count--;
2133                                         b->decaying_flow_count++;
2134                                 }
2135                                 flow->set = CAKE_SET_DECAYING;
2136                         } else {
2137                                 /* remove empty queue from the flowchain */
2138                                 list_del_init(&flow->flowchain);
2139                                 if (flow->set == CAKE_SET_SPARSE ||
2140                                     flow->set == CAKE_SET_SPARSE_WAIT)
2141                                         b->sparse_flow_count--;
2142                                 else if (flow->set == CAKE_SET_BULK) {
2143                                         b->bulk_flow_count--;
2144
2145                                         if (cake_dsrc(q->flow_mode))
2146                                                 srchost->srchost_bulk_flow_count--;
2147
2148                                         if (cake_ddst(q->flow_mode))
2149                                                 dsthost->dsthost_bulk_flow_count--;
2150
2151                                 } else
2152                                         b->decaying_flow_count--;
2153
2154                                 flow->set = CAKE_SET_NONE;
2155                         }
2156                         goto begin;
2157                 }
2158
2159                 /* Last packet in queue may be marked, shouldn't be dropped */
2160                 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2161                                         (b->bulk_flow_count *
2162                                          !!(q->rate_flags &
2163                                             CAKE_FLAG_INGRESS))) ||
2164                     !flow->head)
2165                         break;
2166
2167                 /* drop this packet, get another one */
2168                 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2169                         len = cake_advance_shaper(q, b, skb,
2170                                                   now, true);
2171                         flow->deficit -= len;
2172                         b->tin_deficit -= len;
2173                 }
2174                 flow->dropped++;
2175                 b->tin_dropped++;
2176                 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2177                 qdisc_qstats_drop(sch);
2178                 kfree_skb(skb);
2179                 if (q->rate_flags & CAKE_FLAG_INGRESS)
2180                         goto retry;
2181         }
2182
2183         b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2184         qdisc_bstats_update(sch, skb);
2185
2186         /* collect delay stats */
2187         delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2188         b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2189         b->peak_delay = cake_ewma(b->peak_delay, delay,
2190                                   delay > b->peak_delay ? 2 : 8);
2191         b->base_delay = cake_ewma(b->base_delay, delay,
2192                                   delay < b->base_delay ? 2 : 8);
2193
2194         len = cake_advance_shaper(q, b, skb, now, false);
2195         flow->deficit -= len;
2196         b->tin_deficit -= len;
2197
2198         if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2199                 u64 next = min(ktime_to_ns(q->time_next_packet),
2200                                ktime_to_ns(q->failsafe_next_packet));
2201
2202                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2203         } else if (!sch->q.qlen) {
2204                 int i;
2205
2206                 for (i = 0; i < q->tin_cnt; i++) {
2207                         if (q->tins[i].decaying_flow_count) {
2208                                 ktime_t next = \
2209                                         ktime_add_ns(now,
2210                                                      q->tins[i].cparams.target);
2211
2212                                 qdisc_watchdog_schedule_ns(&q->watchdog,
2213                                                            ktime_to_ns(next));
2214                                 break;
2215                         }
2216                 }
2217         }
2218
2219         if (q->overflow_timeout)
2220                 q->overflow_timeout--;
2221
2222         return skb;
2223 }
2224
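/* The deficit refill in cake_dequeue() above implements "one quantum
 * divided by host_load" with a reciprocal table and random dithering:
 * quantum_div[h] = 65535 / h (set up in cake_init()).  For
 * flow_quantum = 1514 and host_load = 3, 1514 * 21845 = 33073330, and
 * adding prandom_u32() >> 16 (0..65535) before the final >> 16 yields
 * 504 or 505, averaging ~504.7, i.e. 1514 / 3 without accumulating
 * rounding bias against flows on busy hosts.
 */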
2225 static void cake_reset(struct Qdisc *sch)
2226 {
2227         u32 c;
2228
2229         for (c = 0; c < CAKE_MAX_TINS; c++)
2230                 cake_clear_tin(sch, c);
2231 }
2232
2233 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2234         [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2235         [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2236         [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2237         [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2238         [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2239         [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2240         [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2241         [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2242         [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2243         [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2244         [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2245         [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2246         [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2247         [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2248         [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
2249         [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
2250         [TCA_CAKE_FWMARK]        = { .type = NLA_U32 },
2251 };
2252
2253 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2254                           u64 target_ns, u64 rtt_est_ns)
2255 {
2256         /* Convert the byte rate into time-per-byte; the MIN_RATE clamp
2257          * below ensures the shaper always unwedges in reasonable time.
2258          */
2259         static const u64 MIN_RATE = 64;
2260         u32 byte_target = mtu;
2261         u64 byte_target_ns;
2262         u8  rate_shft = 0;
2263         u64 rate_ns = 0;
2264
2265         b->flow_quantum = 1514;
2266         if (rate) {
2267                 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2268                 rate_shft = 34;
2269                 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2270                 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2271                 while (!!(rate_ns >> 34)) {
2272                         rate_ns >>= 1;
2273                         rate_shft--;
2274                 }
2275         } /* else unlimited, ie. zero delay */
2276
2277         b->tin_rate_bps  = rate;
2278         b->tin_rate_ns   = rate_ns;
2279         b->tin_rate_shft = rate_shft;
2280
2281         byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2282
2283         b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2284         b->cparams.interval = max(rtt_est_ns +
2285                                      b->cparams.target - target_ns,
2286                                      b->cparams.target * 2);
2287         b->cparams.mtu_time = byte_target_ns;
2288         b->cparams.p_inc = 1 << 24; /* 1/256 */
2289         b->cparams.p_dec = 1 << 20; /* 1/4096 */
2290 }
2291
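/* Worked example for the conversion above, assuming rate = 12500000
 * (12.5 Mbyte/s, i.e. 100 Mbit/s) and mtu = 1518:
 *
 *   rate_ns = (NSEC_PER_SEC << 34) / 12500000 = 80 << 34;
 *   normalising until rate_ns < 2^34 gives rate_ns = 80 << 27 and
 *   rate_shft = 27, so (len * rate_ns) >> rate_shft = len * 80 ns,
 *   i.e. 80 ns of transmit time per byte.
 *
 *   byte_target_ns = 1518 * 80 ns ~= 121 us, so cparams.target stays at
 *   the default 5 ms (max(1.5 * 121 us, 5 ms)) and cparams.interval at
 *   100 ms.  Only when one MTU takes longer than ~3.3 ms to serialise,
 *   i.e. at rates of a few Mbit/s and below, do target and interval
 *   scale up automatically.
 */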
2292 static int cake_config_besteffort(struct Qdisc *sch)
2293 {
2294         struct cake_sched_data *q = qdisc_priv(sch);
2295         struct cake_tin_data *b = &q->tins[0];
2296         u32 mtu = psched_mtu(qdisc_dev(sch));
2297         u64 rate = q->rate_bps;
2298
2299         q->tin_cnt = 1;
2300
2301         q->tin_index = besteffort;
2302         q->tin_order = normal_order;
2303
2304         cake_set_rate(b, rate, mtu,
2305                       us_to_ns(q->target), us_to_ns(q->interval));
2306         b->tin_quantum = 65535;
2307
2308         return 0;
2309 }
2310
2311 static int cake_config_precedence(struct Qdisc *sch)
2312 {
2313         /* convert high-level (user visible) parameters into internal format */
2314         struct cake_sched_data *q = qdisc_priv(sch);
2315         u32 mtu = psched_mtu(qdisc_dev(sch));
2316         u64 rate = q->rate_bps;
2317         u32 quantum = 256;
2318         u32 i;
2319
2320         q->tin_cnt = 8;
2321         q->tin_index = precedence;
2322         q->tin_order = normal_order;
2323
2324         for (i = 0; i < q->tin_cnt; i++) {
2325                 struct cake_tin_data *b = &q->tins[i];
2326
2327                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2328                               us_to_ns(q->interval));
2329
2330                 b->tin_quantum = max_t(u16, 1U, quantum);
2331
2332                 /* calculate next class's parameters */
2333                 rate  *= 7;
2334                 rate >>= 3;
2335
2336                 quantum  *= 7;
2337                 quantum >>= 3;
2338         }
2339
2340         return 0;
2341 }
2342
2343 /*      List of known Diffserv codepoints:
2344  *
2345  *      Least Effort (CS1, LE)
2346  *      Best Effort (CS0)
2347  *      Max Reliability & LLT "Lo" (TOS1)
2348  *      Max Throughput (TOS2)
2349  *      Min Delay (TOS4)
2350  *      LLT "La" (TOS5)
2351  *      Assured Forwarding 1 (AF1x) - x3
2352  *      Assured Forwarding 2 (AF2x) - x3
2353  *      Assured Forwarding 3 (AF3x) - x3
2354  *      Assured Forwarding 4 (AF4x) - x3
2355  *      Precedence Class 2 (CS2)
2356  *      Precedence Class 3 (CS3)
2357  *      Precedence Class 4 (CS4)
2358  *      Precedence Class 5 (CS5)
2359  *      Precedence Class 6 (CS6)
2360  *      Precedence Class 7 (CS7)
2361  *      Voice Admit (VA)
2362  *      Expedited Forwarding (EF)
2363  *
2364  *      Total 25 codepoints.
2365  */
2366
2367 /*      List of traffic classes in RFC 4594, updated by RFC 8622:
2368  *              (roughly descending order of contended priority)
2369  *              (roughly ascending order of uncontended throughput)
2370  *
2371  *      Network Control (CS6,CS7)      - routing traffic
2372  *      Telephony (EF,VA)         - aka. VoIP streams
2373  *      Signalling (CS5)               - VoIP setup
2374  *      Multimedia Conferencing (AF4x) - aka. video calls
2375  *      Realtime Interactive (CS4)     - eg. games
2376  *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2377  *      Broadcast Video (CS3)
2378  *      Low Latency Data (AF2x,TOS4)      - eg. database
2379  *      Ops, Admin, Management (CS2,TOS1) - eg. ssh
2380  *      Standard Service (CS0 & unrecognised codepoints)
2381  *      High Throughput Data (AF1x,TOS2)  - eg. web traffic
2382  *      Low Priority Data (CS1,LE)        - eg. BitTorrent
2383  *
2384  *      Total 12 traffic classes.
2385  */
2386
2387 static int cake_config_diffserv8(struct Qdisc *sch)
2388 {
2389 /*      Pruned list of traffic classes for typical applications:
2390  *
2391  *              Network Control          (CS6, CS7)
2392  *              Minimum Latency          (EF, VA, CS5, CS4)
2393  *              Interactive Shell        (CS2, TOS1)
2394  *              Low Latency Transactions (AF2x, TOS4)
2395  *              Video Streaming          (AF4x, AF3x, CS3)
2396  *              Bog Standard             (CS0 etc.)
2397  *              High Throughput          (AF1x, TOS2)
2398  *              Background Traffic       (CS1, LE)
2399  *
2400  *              Total 8 traffic classes.
2401  */
2402
2403         struct cake_sched_data *q = qdisc_priv(sch);
2404         u32 mtu = psched_mtu(qdisc_dev(sch));
2405         u64 rate = q->rate_bps;
2406         u32 quantum = 256;
2407         u32 i;
2408
2409         q->tin_cnt = 8;
2410
2411         /* codepoint to class mapping */
2412         q->tin_index = diffserv8;
2413         q->tin_order = normal_order;
2414
2415         /* class characteristics */
2416         for (i = 0; i < q->tin_cnt; i++) {
2417                 struct cake_tin_data *b = &q->tins[i];
2418
2419                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2420                               us_to_ns(q->interval));
2421
2422                 b->tin_quantum = max_t(u16, 1U, quantum);
2423
2424                 /* calculate next class's parameters */
2425                 rate  *= 7;
2426                 rate >>= 3;
2427
2428                 quantum  *= 7;
2429                 quantum >>= 3;
2430         }
2431
2432         return 0;
2433 }
2434
2435 static int cake_config_diffserv4(struct Qdisc *sch)
2436 {
2437 /*  Further pruned list of traffic classes for four-class system:
2438  *
2439  *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2440  *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2441  *          Best Effort        (CS0, AF1x, TOS2, and those not specified)
2442  *          Background Traffic (CS1, LE)
2443  *
2444  *              Total 4 traffic classes.
2445  */
2446
2447         struct cake_sched_data *q = qdisc_priv(sch);
2448         u32 mtu = psched_mtu(qdisc_dev(sch));
2449         u64 rate = q->rate_bps;
2450         u32 quantum = 1024;
2451
2452         q->tin_cnt = 4;
2453
2454         /* codepoint to class mapping */
2455         q->tin_index = diffserv4;
2456         q->tin_order = bulk_order;
2457
2458         /* class characteristics */
2459         cake_set_rate(&q->tins[0], rate, mtu,
2460                       us_to_ns(q->target), us_to_ns(q->interval));
2461         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2462                       us_to_ns(q->target), us_to_ns(q->interval));
2463         cake_set_rate(&q->tins[2], rate >> 1, mtu,
2464                       us_to_ns(q->target), us_to_ns(q->interval));
2465         cake_set_rate(&q->tins[3], rate >> 2, mtu,
2466                       us_to_ns(q->target), us_to_ns(q->interval));
2467
2468         /* bandwidth-sharing weights */
2469         q->tins[0].tin_quantum = quantum;
2470         q->tins[1].tin_quantum = quantum >> 4;
2471         q->tins[2].tin_quantum = quantum >> 1;
2472         q->tins[3].tin_quantum = quantum >> 2;
2473
2474         return 0;
2475 }
2476
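/* Illustrative numbers for the four-tin layout above, assuming a
 * 100 Mbit/s base rate: tin 0 is shaped at the full 100 Mbit/s, tin 1 at
 * 1/16 (6.25 Mbit/s), tin 2 at 1/2 (50 Mbit/s) and tin 3 at 1/4
 * (25 Mbit/s); in tc's naming these are the Best Effort, Bulk, Video and
 * Voice tins.  The matching DRR weights (1024, 64, 512, 256) give the
 * same proportions when tins are running above their thresholds, and
 * bulk_order puts the Bulk tin first in the reporting order and in the
 * mapping used by fwmark/priority overrides.
 */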
2477 static int cake_config_diffserv3(struct Qdisc *sch)
2478 {
2479 /*  Simplified Diffserv structure with 3 tins.
2480  *              Low Priority            (CS1, LE)
2481  *              Best Effort
2482  *              Latency Sensitive       (TOS4, VA, EF, CS6, CS7)
2483  */
2484         struct cake_sched_data *q = qdisc_priv(sch);
2485         u32 mtu = psched_mtu(qdisc_dev(sch));
2486         u64 rate = q->rate_bps;
2487         u32 quantum = 1024;
2488
2489         q->tin_cnt = 3;
2490
2491         /* codepoint to class mapping */
2492         q->tin_index = diffserv3;
2493         q->tin_order = bulk_order;
2494
2495         /* class characteristics */
2496         cake_set_rate(&q->tins[0], rate, mtu,
2497                       us_to_ns(q->target), us_to_ns(q->interval));
2498         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2499                       us_to_ns(q->target), us_to_ns(q->interval));
2500         cake_set_rate(&q->tins[2], rate >> 2, mtu,
2501                       us_to_ns(q->target), us_to_ns(q->interval));
2502
2503         /* bandwidth-sharing weights */
2504         q->tins[0].tin_quantum = quantum;
2505         q->tins[1].tin_quantum = quantum >> 4;
2506         q->tins[2].tin_quantum = quantum >> 2;
2507
2508         return 0;
2509 }
2510
2511 static void cake_reconfigure(struct Qdisc *sch)
2512 {
2513         struct cake_sched_data *q = qdisc_priv(sch);
2514         int c, ft;
2515
2516         switch (q->tin_mode) {
2517         case CAKE_DIFFSERV_BESTEFFORT:
2518                 ft = cake_config_besteffort(sch);
2519                 break;
2520
2521         case CAKE_DIFFSERV_PRECEDENCE:
2522                 ft = cake_config_precedence(sch);
2523                 break;
2524
2525         case CAKE_DIFFSERV_DIFFSERV8:
2526                 ft = cake_config_diffserv8(sch);
2527                 break;
2528
2529         case CAKE_DIFFSERV_DIFFSERV4:
2530                 ft = cake_config_diffserv4(sch);
2531                 break;
2532
2533         case CAKE_DIFFSERV_DIFFSERV3:
2534         default:
2535                 ft = cake_config_diffserv3(sch);
2536                 break;
2537         }
2538
2539         for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2540                 cake_clear_tin(sch, c);
2541                 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2542         }
2543
2544         q->rate_ns   = q->tins[ft].tin_rate_ns;
2545         q->rate_shft = q->tins[ft].tin_rate_shft;
2546
2547         if (q->buffer_config_limit) {
2548                 q->buffer_limit = q->buffer_config_limit;
2549         } else if (q->rate_bps) {
2550                 u64 t = q->rate_bps * q->interval;
2551
2552                 do_div(t, USEC_PER_SEC / 4);
2553                 q->buffer_limit = max_t(u32, t, 4U << 20);
2554         } else {
2555                 q->buffer_limit = ~0;
2556         }
2557
2558         sch->flags &= ~TCQ_F_CAN_BYPASS;
2559
2560         q->buffer_limit = min(q->buffer_limit,
2561                               max(sch->limit * psched_mtu(qdisc_dev(sch)),
2562                                   q->buffer_config_limit));
2563 }
2564
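/* Buffer sizing example for the limit computed above, assuming no
 * explicit "memory" setting, rate_bps = 12500000 (12.5 Mbyte/s) and the
 * default interval of 100000 us:
 *
 *   t = 12500000 * 100000 / (1000000 / 4) = 5000000
 *
 * i.e. four intervals' worth of data (5 Mbyte).  The result is raised to
 * the 4 Mbyte floor only if smaller, and finally capped at sch->limit
 * packets of MTU size (about 15 Mbyte with the defaults), so the 5 Mbyte
 * figure stands here.
 */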
2565 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2566                        struct netlink_ext_ack *extack)
2567 {
2568         struct cake_sched_data *q = qdisc_priv(sch);
2569         struct nlattr *tb[TCA_CAKE_MAX + 1];
2570         int err;
2571
2572         if (!opt)
2573                 return -EINVAL;
2574
2575         err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2576                                           extack);
2577         if (err < 0)
2578                 return err;
2579
2580         if (tb[TCA_CAKE_NAT]) {
2581 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2582                 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2583                 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2584                         !!nla_get_u32(tb[TCA_CAKE_NAT]);
2585 #else
2586                 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2587                                     "No conntrack support in kernel");
2588                 return -EOPNOTSUPP;
2589 #endif
2590         }
2591
2592         if (tb[TCA_CAKE_BASE_RATE64])
2593                 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2594
2595         if (tb[TCA_CAKE_DIFFSERV_MODE])
2596                 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2597
2598         if (tb[TCA_CAKE_WASH]) {
2599                 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2600                         q->rate_flags |= CAKE_FLAG_WASH;
2601                 else
2602                         q->rate_flags &= ~CAKE_FLAG_WASH;
2603         }
2604
2605         if (tb[TCA_CAKE_FLOW_MODE])
2606                 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2607                                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2608                                         CAKE_FLOW_MASK));
2609
2610         if (tb[TCA_CAKE_ATM])
2611                 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2612
2613         if (tb[TCA_CAKE_OVERHEAD]) {
2614                 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2615                 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2616
2617                 q->max_netlen = 0;
2618                 q->max_adjlen = 0;
2619                 q->min_netlen = ~0;
2620                 q->min_adjlen = ~0;
2621         }
2622
2623         if (tb[TCA_CAKE_RAW]) {
2624                 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2625
2626                 q->max_netlen = 0;
2627                 q->max_adjlen = 0;
2628                 q->min_netlen = ~0;
2629                 q->min_adjlen = ~0;
2630         }
2631
2632         if (tb[TCA_CAKE_MPU])
2633                 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2634
2635         if (tb[TCA_CAKE_RTT]) {
2636                 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2637
2638                 if (!q->interval)
2639                         q->interval = 1;
2640         }
2641
2642         if (tb[TCA_CAKE_TARGET]) {
2643                 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2644
2645                 if (!q->target)
2646                         q->target = 1;
2647         }
2648
2649         if (tb[TCA_CAKE_AUTORATE]) {
2650                 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2651                         q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2652                 else
2653                         q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2654         }
2655
2656         if (tb[TCA_CAKE_INGRESS]) {
2657                 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2658                         q->rate_flags |= CAKE_FLAG_INGRESS;
2659                 else
2660                         q->rate_flags &= ~CAKE_FLAG_INGRESS;
2661         }
2662
2663         if (tb[TCA_CAKE_ACK_FILTER])
2664                 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2665
2666         if (tb[TCA_CAKE_MEMORY])
2667                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2668
2669         if (tb[TCA_CAKE_SPLIT_GSO]) {
2670                 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2671                         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2672                 else
2673                         q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2674         }
2675
2676         if (tb[TCA_CAKE_FWMARK]) {
2677                 q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2678                 q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2679         }
2680
2681         if (q->tins) {
2682                 sch_tree_lock(sch);
2683                 cake_reconfigure(sch);
2684                 sch_tree_unlock(sch);
2685         }
2686
2687         return 0;
2688 }
2689
2690 static void cake_destroy(struct Qdisc *sch)
2691 {
2692         struct cake_sched_data *q = qdisc_priv(sch);
2693
2694         qdisc_watchdog_cancel(&q->watchdog);
2695         tcf_block_put(q->block);
2696         kvfree(q->tins);
2697 }
2698
2699 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2700                      struct netlink_ext_ack *extack)
2701 {
2702         struct cake_sched_data *q = qdisc_priv(sch);
2703         int i, j, err;
2704
2705         sch->limit = 10240;
2706         q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2707         q->flow_mode  = CAKE_FLOW_TRIPLE;
2708
2709         q->rate_bps = 0; /* unlimited by default */
2710
2711         q->interval = 100000; /* 100ms default */
2712         q->target   =   5000; /* 5ms: codel RFC argues
2713                                * for 5 to 10% of interval
2714                                */
2715         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2716         q->cur_tin = 0;
2717         q->cur_flow  = 0;
2718
2719         qdisc_watchdog_init(&q->watchdog, sch);
2720
2721         if (opt) {
2722                 err = cake_change(sch, opt, extack);
2723
2724                 if (err)
2725                         return err;
2726         }
2727
2728         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2729         if (err)
2730                 return err;
2731
2732         quantum_div[0] = ~0;
2733         for (i = 1; i <= CAKE_QUEUES; i++)
2734                 quantum_div[i] = 65535 / i;
2735
2736         q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2737                            GFP_KERNEL);
2738         if (!q->tins)
2739                 goto nomem;
2740
2741         for (i = 0; i < CAKE_MAX_TINS; i++) {
2742                 struct cake_tin_data *b = q->tins + i;
2743
2744                 INIT_LIST_HEAD(&b->new_flows);
2745                 INIT_LIST_HEAD(&b->old_flows);
2746                 INIT_LIST_HEAD(&b->decaying_flows);
2747                 b->sparse_flow_count = 0;
2748                 b->bulk_flow_count = 0;
2749                 b->decaying_flow_count = 0;
2750
2751                 for (j = 0; j < CAKE_QUEUES; j++) {
2752                         struct cake_flow *flow = b->flows + j;
2753                         u32 k = j * CAKE_MAX_TINS + i;
2754
2755                         INIT_LIST_HEAD(&flow->flowchain);
2756                         cobalt_vars_init(&flow->cvars);
2757
2758                         q->overflow_heap[k].t = i;
2759                         q->overflow_heap[k].b = j;
2760                         b->overflow_idx[j] = k;
2761                 }
2762         }
2763
2764         cake_reconfigure(sch);
2765         q->avg_peak_bandwidth = q->rate_bps;
2766         q->min_netlen = ~0;
2767         q->min_adjlen = ~0;
2768         return 0;
2769
2770 nomem:
2771         cake_destroy(sch);
2772         return -ENOMEM;
2773 }
2774
static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (!opts)
		goto nla_put_failure;

	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
			      TCA_CAKE_PAD))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
			q->flow_mode & CAKE_FLOW_MASK))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_NAT,
			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_WASH,
			!!(q->rate_flags & CAKE_FLAG_WASH)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
		goto nla_put_failure;

	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
			goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}

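/* Dump global and per-tin statistics into a TCA_STATS_APP nest: capacity
 * estimate, memory limit/usage, and for each tin the rate threshold, delay
 * estimates, drop/mark counters and flow counts.
 */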
static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
	struct cake_sched_data *q = qdisc_priv(sch);
	struct nlattr *tstats, *ts;
	int i;

	if (!stats)
		return -1;

#define PUT_STAT_U32(attr, data) do {				       \
		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
			goto nla_put_failure;			       \
	} while (0)
#define PUT_STAT_U64(attr, data) do {				       \
		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
					data, TCA_CAKE_STATS_PAD)) \
			goto nla_put_failure;			       \
	} while (0)

	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);

#undef PUT_STAT_U32
#undef PUT_STAT_U64

	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
	if (!tstats)
		goto nla_put_failure;

#define PUT_TSTAT_U32(attr, data) do {					\
		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
			goto nla_put_failure;				\
	} while (0)
#define PUT_TSTAT_U64(attr, data) do {					\
		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
					data, TCA_CAKE_TIN_STATS_PAD))	\
			goto nla_put_failure;				\
	} while (0)

	for (i = 0; i < q->tin_cnt; i++) {
		struct cake_tin_data *b = &q->tins[q->tin_order[i]];

		ts = nla_nest_start_noflag(d->skb, i + 1);
		if (!ts)
			goto nla_put_failure;

		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);

		PUT_TSTAT_U32(TARGET_US,
			      ktime_to_us(ns_to_ktime(b->cparams.target)));
		PUT_TSTAT_U32(INTERVAL_US,
			      ktime_to_us(ns_to_ktime(b->cparams.interval)));

		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);

		PUT_TSTAT_U32(PEAK_DELAY_US,
			      ktime_to_us(ns_to_ktime(b->peak_delay)));
		PUT_TSTAT_U32(AVG_DELAY_US,
			      ktime_to_us(ns_to_ktime(b->avge_delay)));
		PUT_TSTAT_U32(BASE_DELAY_US,
			      ktime_to_us(ns_to_ktime(b->base_delay)));

		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);

		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
					    b->decaying_flow_count);
		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);

		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
		nla_nest_end(d->skb, ts);
	}

#undef PUT_TSTAT_U32
#undef PUT_TSTAT_U64

	nla_nest_end(d->skb, tstats);
	return nla_nest_end(d->skb, stats);

nla_put_failure:
	nla_nest_cancel(d->skb, stats);
	return -1;
}

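/* Minimal Qdisc_class_ops implementation: CAKE creates all of its flow
 * queues internally, so classes cannot be added, removed or given child
 * qdiscs; they are only exposed read-only for dumping and filtering.
 */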
static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long cake_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
			       u32 classid)
{
	return 0;
}

static void cake_unbind(struct Qdisc *q, unsigned long cl)
{
}

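/* Return the tcf block for the qdisc itself (cl == 0) so that tc filters can
 * be attached and used to override the tin/flow classification.
 */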
static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
					struct netlink_ext_ack *extack)
{
	struct cake_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}

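/* Class handles map 1:1 onto flow queues; the minor number encodes
 * (tin order index * CAKE_QUEUES + flow index + 1), as produced by
 * cake_walk() and decoded in cake_dump_class_stats().
 */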
static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
			   struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

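/* Per-flow statistics: generic queue length/backlog/drops plus a
 * TCA_STATS_APP nest carrying the DRR deficit and the Cobalt/BLUE state
 * (drop state, count, drop probability and timers).
 */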
static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				 struct gnet_dump *d)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	const struct cake_flow *flow = NULL;
	struct gnet_stats_queue qs = { 0 };
	struct nlattr *stats;
	u32 idx = cl - 1;

	if (idx < CAKE_QUEUES * q->tin_cnt) {
		const struct cake_tin_data *b =
			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
		const struct sk_buff *skb;

		flow = &b->flows[idx % CAKE_QUEUES];

		if (flow->head) {
			sch_tree_lock(sch);
			skb = flow->head;
			while (skb) {
				qs.qlen++;
				skb = skb->next;
			}
			sch_tree_unlock(sch);
		}
		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
		qs.drops = flow->dropped;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	if (flow) {
		ktime_t now = ktime_get();

		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
		if (!stats)
			return -1;

#define PUT_STAT_U32(attr, data) do {				       \
		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
			goto nla_put_failure;			       \
	} while (0)
#define PUT_STAT_S32(attr, data) do {				       \
		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
			goto nla_put_failure;			       \
	} while (0)

		PUT_STAT_S32(DEFICIT, flow->deficit);
		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
		if (flow->cvars.p_drop) {
			PUT_STAT_S32(BLUE_TIMER_US,
				     ktime_to_us(
					     ktime_sub(now,
						       flow->cvars.blue_timer)));
		}
		if (flow->cvars.dropping) {
			PUT_STAT_S32(DROP_NEXT_US,
				     ktime_to_us(
					     ktime_sub(now,
						       flow->cvars.drop_next)));
		}

		if (nla_nest_end(d->skb, stats) < 0)
			return -1;
	}

	return 0;

nla_put_failure:
	nla_nest_cancel(d->skb, stats);
	return -1;
}

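/* Walk all flow queues in all tins, reporting each active flow under the
 * classid (tin order index * CAKE_QUEUES + flow index + 1); empty flows are
 * counted but skipped.
 */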
static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	unsigned int i, j;

	if (arg->stop)
		return;

	for (i = 0; i < q->tin_cnt; i++) {
		struct cake_tin_data *b = &q->tins[q->tin_order[i]];

		for (j = 0; j < CAKE_QUEUES; j++) {
			if (list_empty(&b->flows[j].flowchain) ||
			    arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
				arg->stop = 1;
				break;
			}
			arg->count++;
		}
	}
}

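/* Read-only class interface: flows can be walked and dumped, but not
 * individually configured.
 */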
static const struct Qdisc_class_ops cake_class_ops = {
	.leaf		=	cake_leaf,
	.find		=	cake_find,
	.tcf_block	=	cake_tcf_block,
	.bind_tcf	=	cake_bind,
	.unbind_tcf	=	cake_unbind,
	.dump		=	cake_dump_class,
	.dump_stats	=	cake_dump_class_stats,
	.walk		=	cake_walk,
};

static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
	.cl_ops		=	&cake_class_ops,
	.id		=	"cake",
	.priv_size	=	sizeof(struct cake_sched_data),
	.enqueue	=	cake_enqueue,
	.dequeue	=	cake_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	cake_init,
	.reset		=	cake_reset,
	.destroy	=	cake_destroy,
	.change		=	cake_change,
	.dump		=	cake_dump,
	.dump_stats	=	cake_dump_stats,
	.owner		=	THIS_MODULE,
};

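/* Illustrative userspace usage (iproute2 commands, not part of this file):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv3 nat
 *   tc -s qdisc show dev eth0
 *
 * The first command reaches cake_init()/cake_change() via the "init" and
 * "change" ops above; the second is served by cake_dump() and
 * cake_dump_stats().
 */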
static int __init cake_module_init(void)
{
	return register_qdisc(&cake_qdisc_ops);
}

static void __exit cake_module_exit(void)
{
	unregister_qdisc(&cake_qdisc_ops);
}

module_init(cake_module_init)
module_exit(cake_module_exit)
MODULE_AUTHOR("Jonathan Morton");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("The CAKE shaper.");