net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25 #include <linux/reciprocal_div.h>
  26 #include <linux/rbtree.h>
  27
  28 #include <net/netlink.h>
  29 #include <net/pkt_sched.h>
  30 #include <net/inet_ecn.h>
  31
  32 #define VERSION "1.3"
  33
  34 /*      Network Emulation Queuing algorithm.
  35         ====================================
  36
  37         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
  38                  Network Emulation Tool
  39                  [2] Luigi Rizzo, DummyNet for FreeBSD
  40
  41          ----------------------------------------------------------------
  42
  43          This started out as a simple way to delay outgoing packets to
  44          test TCP but has grown to include most of the functionality
  45          of a full blown network emulator like NISTnet. It can delay
  46          packets and add random jitter (and correlation). The random
  47          distribution can be loaded from a table as well to provide
  48          normal, Pareto, or experimental curves. Packet loss,
  49          duplication, and reordering can also be emulated.
  50
  51          This qdisc does not do classification that can be handled in
  52          layering other disciplines.  It does not need to do bandwidth
  53          control either since that can be handled by using token
  54          bucket or other rate control.
  55
  56      Correlated Loss Generator models
  57
  58         Added generation of correlated loss according to the
  59         "Gilbert-Elliot" model, a 4-state markov model.
  60
  61         References:
  62         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  63         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  64         and intuitive loss model for packet networks and its implementation
  65         in the Netem module in the Linux kernel", available in [1]
  66
  67         Authors: Stefano Salsano <stefano.salsano at uniroma2.it
  68                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  69 */
  70
  71 struct disttable {
  72         u32  size;
  73         s16 table[0];
  74 };
  75
  76 struct netem_sched_data {
  77         /* internal t(ime)fifo qdisc uses t_root and sch->limit */
  78         struct rb_root t_root;
  79
  80         /* optional qdisc for classful handling (NULL at netem init) */
  81         struct Qdisc    *qdisc;
  82
  83         struct qdisc_watchdog watchdog;
  84
  85         s64 latency;
  86         s64 jitter;
  87
  88         u32 loss;
  89         u32 ecn;
  90         u32 limit;
  91         u32 counter;
  92         u32 gap;
  93         u32 duplicate;
  94         u32 reorder;
  95         u32 corrupt;
  96         u64 rate;
  97         s32 packet_overhead;
  98         u32 cell_size;
  99         struct reciprocal_value cell_size_reciprocal;
 100         s32 cell_overhead;
 101
 102         struct crndstate {
 103                 u32 last;
 104                 u32 rho;
 105         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 106
 107         struct disttable *delay_dist;
 108
 109         enum  {
 110                 CLG_RANDOM,
 111                 CLG_4_STATES,
 112                 CLG_GILB_ELL,
 113         } loss_model;
 114
 115         enum {
 116                 TX_IN_GAP_PERIOD = 1,
 117                 TX_IN_BURST_PERIOD,
 118                 LOST_IN_GAP_PERIOD,
 119                 LOST_IN_BURST_PERIOD,
 120         } _4_state_model;
 121
 122         enum {
 123                 GOOD_STATE = 1,
 124                 BAD_STATE,
 125         } GE_state_model;
 126
 127         /* Correlated Loss Generation models */
 128         struct clgstate {
 129                 /* state of the Markov chain */
 130                 u8 state;
 131
 132                 /* 4-states and Gilbert-Elliot models */
 133                 u32 a1; /* p13 for 4-states or p for GE */
 134                 u32 a2; /* p31 for 4-states or r for GE */
 135                 u32 a3; /* p32 for 4-states or h for GE */
 136                 u32 a4; /* p14 for 4-states or 1-k for GE */
 137                 u32 a5; /* p23 used only in 4-states */
 138         } clg;
 139
 140         struct tc_netem_slot slot_config;
 141         struct slotstate {
 142                 u64 slot_next;
 143                 s32 packets_left;
 144                 s32 bytes_left;
 145         } slot;
 146
 147         struct disttable *slot_dist;
 148 };
 149
 150 /* Time stamp put into socket buffer control block
 151  * Only valid when skbs are in our internal t(ime)fifo queue.
 152  *
 153  * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
 154  * and skb->next & skb->prev are scratch space for a qdisc,
 155  * we save skb->tstamp value in skb->cb[] before destroying it.
 156  */
 157 struct netem_skb_cb {
 158         u64             time_to_send;
 159 };
 160
 161 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 162 {
 163         /* we assume we can use skb next/prev/tstamp as storage for rb_node */
 164         qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 165         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 166 }
 167
 168 /* init_crandom - initialize correlated random number generator
 169  * Use entropy source for initial seed.
 170  */
 171 static void init_crandom(struct crndstate *state, unsigned long rho)
 172 {
 173         state->rho = rho;
 174         state->last = prandom_u32();
 175 }
 176
 177 /* get_crandom - correlated random number generator
 178  * Next number depends on last value.
 179  * rho is scaled to avoid floating point.
 180  */
 181 static u32 get_crandom(struct crndstate *state)
 182 {
 183         u64 value, rho;
 184         unsigned long answer;
 185
 186         if (!state || state->rho == 0)  /* no correlation */
 187                 return prandom_u32();
 188
 189         value = prandom_u32();
 190         rho = (u64)state->rho + 1;
 191         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 192         state->last = answer;
 193         return answer;
 194 }
 195
 196 /* loss_4state - 4-state model loss generator
 197  * Generates losses according to the 4-state Markov chain adopted in
 198  * the GI (General and Intuitive) loss model.
 199  */
 200 static bool loss_4state(struct netem_sched_data *q)
 201 {
 202         struct clgstate *clg = &q->clg;
 203         u32 rnd = prandom_u32();
 204
 205         /*
 206          * Makes a comparison between rnd and the transition
 207          * probabilities outgoing from the current state, then decides the
 208          * next state and if the next packet has to be transmitted or lost.
 209          * The four states correspond to:
 210          *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
 211          *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
 212          *   LOST_IN_GAP_PERIOD => lost packets within a burst period
 213          *   TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
 214          */
 215         switch (clg->state) {
 216         case TX_IN_GAP_PERIOD:
 217                 if (rnd < clg->a4) {
 218                         clg->state = LOST_IN_BURST_PERIOD;
 219                         return true;
 220                 } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
 221                         clg->state = LOST_IN_GAP_PERIOD;
 222                         return true;
 223                 } else if (clg->a1 + clg->a4 < rnd) {
 224                         clg->state = TX_IN_GAP_PERIOD;
 225                 }
 226
 227                 break;
 228         case TX_IN_BURST_PERIOD:
 229                 if (rnd < clg->a5) {
 230                         clg->state = LOST_IN_GAP_PERIOD;
 231                         return true;
 232                 } else {
 233                         clg->state = TX_IN_BURST_PERIOD;
 234                 }
 235
 236                 break;
 237         case LOST_IN_GAP_PERIOD:
 238                 if (rnd < clg->a3)
 239                         clg->state = TX_IN_BURST_PERIOD;
 240                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 241                         clg->state = TX_IN_GAP_PERIOD;
 242                 } else if (clg->a2 + clg->a3 < rnd) {
 243                         clg->state = LOST_IN_GAP_PERIOD;
 244                         return true;
 245                 }
 246                 break;
 247         case LOST_IN_BURST_PERIOD:
 248                 clg->state = TX_IN_GAP_PERIOD;
 249                 break;
 250         }
 251
 252         return false;
 253 }
 254
 255 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 256  * Generates losses according to the Gilbert-Elliot loss model or
 257  * its special cases  (Gilbert or Simple Gilbert)
 258  *
 259  * Makes a comparison between random number and the transition
 260  * probabilities outgoing from the current state, then decides the
 261  * next state. A second random number is extracted and the comparison
 262  * with the loss probability of the current state decides if the next
 263  * packet will be transmitted or lost.
 264  */
 265 static bool loss_gilb_ell(struct netem_sched_data *q)
 266 {
 267         struct clgstate *clg = &q->clg;
 268
 269         switch (clg->state) {
 270         case GOOD_STATE:
 271                 if (prandom_u32() < clg->a1)
 272                         clg->state = BAD_STATE;
 273                 if (prandom_u32() < clg->a4)
 274                         return true;
 275                 break;
 276         case BAD_STATE:
 277                 if (prandom_u32() < clg->a2)
 278                         clg->state = GOOD_STATE;
 279                 if (prandom_u32() > clg->a3)
 280                         return true;
 281         }
 282
 283         return false;
 284 }
 285
 286 static bool loss_event(struct netem_sched_data *q)
 287 {
 288         switch (q->loss_model) {
 289         case CLG_RANDOM:
 290                 /* Random packet drop 0 => none, ~0 => all */
 291                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 292
 293         case CLG_4_STATES:
 294                 /* 4state loss model algorithm (used also for GI model)
 295                 * Extracts a value from the markov 4 state loss generator,
 296                 * if it is 1 drops a packet and if needed writes the event in
 297                 * the kernel logs
 298                 */
 299                 return loss_4state(q);
 300
 301         case CLG_GILB_ELL:
 302                 /* Gilbert-Elliot loss model algorithm
 303                 * Extracts a value from the Gilbert-Elliot loss generator,
 304                 * if it is 1 drops a packet and if needed writes the event in
 305                 * the kernel logs
 306                 */
 307                 return loss_gilb_ell(q);
 308         }
 309
 310         return false;   /* not reached */
 311 }
 312
 313
 314 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 315  * std deviation sigma.  Uses table lookup to approximate the desired
 316  * distribution, and a uniformly-distributed pseudo-random source.
 317  */
 318 static s64 tabledist(s64 mu, s32 sigma,
 319                      struct crndstate *state,
 320                      const struct disttable *dist)
 321 {
 322         s64 x;
 323         long t;
 324         u32 rnd;
 325
 326         if (sigma == 0)
 327                 return mu;
 328
 329         rnd = get_crandom(state);
 330
 331         /* default uniform distribution */
 332         if (dist == NULL)
 333                 return ((rnd % (2 * sigma)) + mu) - sigma;
 334
 335         t = dist->table[rnd % dist->size];
 336         x = (sigma % NETEM_DIST_SCALE) * t;
 337         if (x >= 0)
 338                 x += NETEM_DIST_SCALE/2;
 339         else
 340                 x -= NETEM_DIST_SCALE/2;
 341
 342         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 343 }
 344
 345 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
 346 {
 347         len += q->packet_overhead;
 348
 349         if (q->cell_size) {
 350                 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 351
 352                 if (len > cells * q->cell_size) /* extra cell needed for remainder */
 353                         cells++;
 354                 len = cells * (q->cell_size + q->cell_overhead);
 355         }
 356
 357         return div64_u64(len * NSEC_PER_SEC, q->rate);
 358 }
 359
 360 static void tfifo_reset(struct Qdisc *sch)
 361 {
 362         struct netem_sched_data *q = qdisc_priv(sch);
 363         struct rb_node *p = rb_first(&q->t_root);
 364
 365         while (p) {
 366                 struct sk_buff *skb = rb_to_skb(p);
 367
 368                 p = rb_next(p);
 369                 rb_erase(&skb->rbnode, &q->t_root);
 370                 rtnl_kfree_skbs(skb, skb);
 371         }
 372 }
 373
 374 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 375 {
 376         struct netem_sched_data *q = qdisc_priv(sch);
 377         u64 tnext = netem_skb_cb(nskb)->time_to_send;
 378         struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 379
 380         while (*p) {
 381                 struct sk_buff *skb;
 382
 383                 parent = *p;
 384                 skb = rb_to_skb(parent);
 385                 if (tnext >= netem_skb_cb(skb)->time_to_send)
 386                         p = &parent->rb_right;
 387                 else
 388                         p = &parent->rb_left;
 389         }
 390         rb_link_node(&nskb->rbnode, parent, p);
 391         rb_insert_color(&nskb->rbnode, &q->t_root);
 392         sch->q.qlen++;
 393 }
 394
 395 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
 396  * when we statistically choose to corrupt one, we instead segment it, returning
 397  * the first packet to be corrupted, and re-enqueue the remaining frames
 398  */
 399 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
 400                                      struct sk_buff **to_free)
 401 {
 402         struct sk_buff *segs;
 403         netdev_features_t features = netif_skb_features(skb);
 404
 405         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 406
 407         if (IS_ERR_OR_NULL(segs)) {
 408                 qdisc_drop(skb, sch, to_free);
 409                 return NULL;
 410         }
 411         consume_skb(skb);
 412         return segs;
 413 }
 414
 415 static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
 416 {
 417         skb->next = qh->head;
 418
 419         if (!qh->head)
 420                 qh->tail = skb;
 421         qh->head = skb;
 422         qh->qlen++;
 423 }
 424
 425 /*
 426  * Insert one skb into qdisc.
 427  * Note: parent depends on return value to account for queue length.
 428  *      NET_XMIT_DROP: queue length didn't change.
 429  *      NET_XMIT_SUCCESS: one skb was queued.
 430  */
 431 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 432                          struct sk_buff **to_free)
 433 {
 434         struct netem_sched_data *q = qdisc_priv(sch);
 435         /* We don't fill cb now as skb_unshare() may invalidate it */
 436         struct netem_skb_cb *cb;
 437         struct sk_buff *skb2;
 438         struct sk_buff *segs = NULL;
 439         unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
 440         int nb = 0;
 441         int count = 1;
 442         int rc = NET_XMIT_SUCCESS;
 443
 444         /* Random duplication */
 445         if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 446                 ++count;
 447
 448         /* Drop packet? */
 449         if (loss_event(q)) {
 450                 if (q->ecn && INET_ECN_set_ce(skb))
 451                         qdisc_qstats_drop(sch); /* mark packet */
 452                 else
 453                         --count;
 454         }
 455         if (count == 0) {
 456                 qdisc_qstats_drop(sch);
 457                 __qdisc_drop(skb, to_free);
 458                 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 459         }
 460
 461         /* If a delay is expected, orphan the skb. (orphaning usually takes
 462          * place at TX completion time, so _before_ the link transit delay)
 463          */
 464         if (q->latency || q->jitter || q->rate)
 465                 skb_orphan_partial(skb);
 466
 467         /*
 468          * If we need to duplicate packet, then re-insert at top of the
 469          * qdisc tree, since parent queuer expects that only one
 470          * skb will be queued.
 471          */
 472         if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 473                 struct Qdisc *rootq = qdisc_root(sch);
 474                 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 475
 476                 q->duplicate = 0;
 477                 rootq->enqueue(skb2, rootq, to_free);
 478                 q->duplicate = dupsave;
 479         }
 480
 481         /*
 482          * Randomized packet corruption.
 483          * Make copy if needed since we are modifying
 484          * If packet is going to be hardware checksummed, then
 485          * do it now in software before we mangle it.
 486          */
 487         if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 488                 if (skb_is_gso(skb)) {
 489                         segs = netem_segment(skb, sch, to_free);
 490                         if (!segs)
 491                                 return NET_XMIT_DROP;
 492                 } else {
 493                         segs = skb;
 494                 }
 495
 496                 skb = segs;
 497                 segs = segs->next;
 498
 499                 skb = skb_unshare(skb, GFP_ATOMIC);
 500                 if (unlikely(!skb)) {
 501                         qdisc_qstats_drop(sch);
 502                         goto finish_segs;
 503                 }
 504                 if (skb->ip_summed == CHECKSUM_PARTIAL &&
 505                     skb_checksum_help(skb)) {
 506                         qdisc_drop(skb, sch, to_free);
 507                         goto finish_segs;
 508                 }
 509
 510                 skb->data[prandom_u32() % skb_headlen(skb)] ^=
 511                         1<<(prandom_u32() % 8);
 512         }
 513
 514         if (unlikely(sch->q.qlen >= sch->limit))
 515                 return qdisc_drop_all(skb, sch, to_free);
 516
 517         qdisc_qstats_backlog_inc(sch, skb);
 518
 519         cb = netem_skb_cb(skb);
 520         if (q->gap == 0 ||              /* not doing reordering */
 521             q->counter < q->gap - 1 ||  /* inside last reordering gap */
 522             q->reorder < get_crandom(&q->reorder_cor)) {
 523                 u64 now;
 524                 s64 delay;
 525
 526                 delay = tabledist(q->latency, q->jitter,
 527                                   &q->delay_cor, q->delay_dist);
 528
 529                 now = ktime_get_ns();
 530
 531                 if (q->rate) {
 532                         struct netem_skb_cb *last = NULL;
 533
 534                         if (sch->q.tail)
 535                                 last = netem_skb_cb(sch->q.tail);
 536                         if (q->t_root.rb_node) {
 537                                 struct sk_buff *t_skb;
 538                                 struct netem_skb_cb *t_last;
 539
 540                                 t_skb = skb_rb_last(&q->t_root);
 541                                 t_last = netem_skb_cb(t_skb);
 542                                 if (!last ||
 543                                     t_last->time_to_send > last->time_to_send) {
 544                                         last = t_last;
 545                                 }
 546                         }
 547
 548                         if (last) {
 549                                 /*
 550                                  * Last packet in queue is reference point (now),
 551                                  * calculate this time bonus and subtract
 552                                  * from delay.
 553                                  */
 554                                 delay -= last->time_to_send - now;
 555                                 delay = max_t(s64, 0, delay);
 556                                 now = last->time_to_send;
 557                         }
 558
 559                         delay += packet_time_ns(qdisc_pkt_len(skb), q);
 560                 }
 561
 562                 cb->time_to_send = now + delay;
 563                 ++q->counter;
 564                 tfifo_enqueue(skb, sch);
 565         } else {
 566                 /*
 567                  * Do re-ordering by putting one out of N packets at the front
 568                  * of the queue.
 569                  */
 570                 cb->time_to_send = ktime_get_ns();
 571                 q->counter = 0;
 572
 573                 netem_enqueue_skb_head(&sch->q, skb);
 574                 sch->qstats.requeues++;
 575         }
 576
 577 finish_segs:
 578         if (segs) {
 579                 while (segs) {
 580                         skb2 = segs->next;
 581                         segs->next = NULL;
 582                         qdisc_skb_cb(segs)->pkt_len = segs->len;
 583                         last_len = segs->len;
 584                         rc = qdisc_enqueue(segs, sch, to_free);
 585                         if (rc != NET_XMIT_SUCCESS) {
 586                                 if (net_xmit_drop_count(rc))
 587                                         qdisc_qstats_drop(sch);
 588                         } else {
 589                                 nb++;
 590                                 len += last_len;
 591                         }
 592                         segs = skb2;
 593                 }
 594                 sch->q.qlen += nb;
 595                 if (nb > 1)
 596                         qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
 597         }
 598         return NET_XMIT_SUCCESS;
 599 }
 600
 601 /* Delay the next round with a new future slot with a
 602  * correct number of bytes and packets.
 603  */
 604
 605 static void get_slot_next(struct netem_sched_data *q, u64 now)
 606 {
 607         s64 next_delay;
 608
 609         if (!q->slot_dist)
 610                 next_delay = q->slot_config.min_delay +
 611                                 (prandom_u32() *
 612                                  (q->slot_config.max_delay -
 613                                   q->slot_config.min_delay) >> 32);
 614         else
 615                 next_delay = tabledist(q->slot_config.dist_delay,
 616                                        (s32)(q->slot_config.dist_jitter),
 617                                        NULL, q->slot_dist);
 618
 619         q->slot.slot_next = now + next_delay;
 620         q->slot.packets_left = q->slot_config.max_packets;
 621         q->slot.bytes_left = q->slot_config.max_bytes;
 622 }
 623
 624 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 625 {
 626         struct netem_sched_data *q = qdisc_priv(sch);
 627         struct sk_buff *skb;
 628         struct rb_node *p;
 629
 630 tfifo_dequeue:
 631         skb = __qdisc_dequeue_head(&sch->q);
 632         if (skb) {
 633                 qdisc_qstats_backlog_dec(sch, skb);
 634 deliver:
 635                 qdisc_bstats_update(sch, skb);
 636                 return skb;
 637         }
 638         p = rb_first(&q->t_root);
 639         if (p) {
 640                 u64 time_to_send;
 641                 u64 now = ktime_get_ns();
 642
 643                 skb = rb_to_skb(p);
 644
 645                 /* if more time remaining? */
 646                 time_to_send = netem_skb_cb(skb)->time_to_send;
 647                 if (q->slot.slot_next && q->slot.slot_next < time_to_send)
 648                         get_slot_next(q, now);
 649
 650                 if (time_to_send <= now &&  q->slot.slot_next <= now) {
 651                         rb_erase(p, &q->t_root);
 652                         sch->q.qlen--;
 653                         qdisc_qstats_backlog_dec(sch, skb);
 654                         skb->next = NULL;
 655                         skb->prev = NULL;
 656                         /* skb->dev shares skb->rbnode area,
 657                          * we need to restore its value.
 658                          */
 659                         skb->dev = qdisc_dev(sch);
 660
 661 #ifdef CONFIG_NET_CLS_ACT
 662                         /*
 663                          * If it's at ingress let's pretend the delay is
 664                          * from the network (tstamp will be updated).
 665                          */
 666                         if (skb->tc_redirected && skb->tc_from_ingress)
 667                                 skb->tstamp = 0;
 668 #endif
 669
 670                         if (q->slot.slot_next) {
 671                                 q->slot.packets_left--;
 672                                 q->slot.bytes_left -= qdisc_pkt_len(skb);
 673                                 if (q->slot.packets_left <= 0 ||
 674                                     q->slot.bytes_left <= 0)
 675                                         get_slot_next(q, now);
 676                         }
 677
 678                         if (q->qdisc) {
 679                                 unsigned int pkt_len = qdisc_pkt_len(skb);
 680                                 struct sk_buff *to_free = NULL;
 681                                 int err;
 682
 683                                 err = qdisc_enqueue(skb, q->qdisc, &to_free);
 684                                 kfree_skb_list(to_free);
 685                                 if (err != NET_XMIT_SUCCESS &&
 686                                     net_xmit_drop_count(err)) {
 687                                         qdisc_qstats_drop(sch);
 688                                         qdisc_tree_reduce_backlog(sch, 1,
 689                                                                   pkt_len);
 690                                 }
 691                                 goto tfifo_dequeue;
 692                         }
 693                         goto deliver;
 694                 }
 695
 696                 if (q->qdisc) {
 697                         skb = q->qdisc->ops->dequeue(q->qdisc);
 698                         if (skb)
 699                                 goto deliver;
 700                 }
 701
 702                 qdisc_watchdog_schedule_ns(&q->watchdog,
 703                                            max(time_to_send,
 704                                                q->slot.slot_next));
 705         }
 706
 707         if (q->qdisc) {
 708                 skb = q->qdisc->ops->dequeue(q->qdisc);
 709                 if (skb)
 710                         goto deliver;
 711         }
 712         return NULL;
 713 }
 714
 715 static void netem_reset(struct Qdisc *sch)
 716 {
 717         struct netem_sched_data *q = qdisc_priv(sch);
 718
 719         qdisc_reset_queue(sch);
 720         tfifo_reset(sch);
 721         if (q->qdisc)
 722                 qdisc_reset(q->qdisc);
 723         qdisc_watchdog_cancel(&q->watchdog);
 724 }
 725
 726 static void dist_free(struct disttable *d)
 727 {
 728         kvfree(d);
 729 }
 730
 731 /*
 732  * Distribution data is a variable size payload containing
 733  * signed 16 bit values.
 734  */
 735
 736 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
 737                           const struct nlattr *attr)
 738 {
 739         size_t n = nla_len(attr)/sizeof(__s16);
 740         const __s16 *data = nla_data(attr);
 741         spinlock_t *root_lock;
 742         struct disttable *d;
 743         int i;
 744
 745         if (n > NETEM_DIST_MAX)
 746                 return -EINVAL;
 747
 748         d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
 749         if (!d)
 750                 return -ENOMEM;
 751
 752         d->size = n;
 753         for (i = 0; i < n; i++)
 754                 d->table[i] = data[i];
 755
 756         root_lock = qdisc_root_sleeping_lock(sch);
 757
 758         spin_lock_bh(root_lock);
 759         swap(*tbl, d);
 760         spin_unlock_bh(root_lock);
 761
 762         dist_free(d);
 763         return 0;
 764 }
 765
 766 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
 767 {
 768         const struct tc_netem_slot *c = nla_data(attr);
 769
 770         q->slot_config = *c;
 771         if (q->slot_config.max_packets == 0)
 772                 q->slot_config.max_packets = INT_MAX;
 773         if (q->slot_config.max_bytes == 0)
 774                 q->slot_config.max_bytes = INT_MAX;
 775         q->slot.packets_left = q->slot_config.max_packets;
 776         q->slot.bytes_left = q->slot_config.max_bytes;
 777         if (q->slot_config.min_delay | q->slot_config.max_delay |
 778             q->slot_config.dist_jitter)
 779                 q->slot.slot_next = ktime_get_ns();
 780         else
 781                 q->slot.slot_next = 0;
 782 }
 783
 784 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
 785 {
 786         const struct tc_netem_corr *c = nla_data(attr);
 787
 788         init_crandom(&q->delay_cor, c->delay_corr);
 789         init_crandom(&q->loss_cor, c->loss_corr);
 790         init_crandom(&q->dup_cor, c->dup_corr);
 791 }
 792
 793 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
 794 {
 795         const struct tc_netem_reorder *r = nla_data(attr);
 796
 797         q->reorder = r->probability;
 798         init_crandom(&q->reorder_cor, r->correlation);
 799 }
 800
 801 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
 802 {
 803         const struct tc_netem_corrupt *r = nla_data(attr);
 804
 805         q->corrupt = r->probability;
 806         init_crandom(&q->corrupt_cor, r->correlation);
 807 }
 808
 809 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
 810 {
 811         const struct tc_netem_rate *r = nla_data(attr);
 812
 813         q->rate = r->rate;
 814         q->packet_overhead = r->packet_overhead;
 815         q->cell_size = r->cell_size;
 816         q->cell_overhead = r->cell_overhead;
 817         if (q->cell_size)
 818                 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 819         else
 820                 q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
 821 }
 822
 823 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
 824 {
 825         const struct nlattr *la;
 826         int rem;
 827
 828         nla_for_each_nested(la, attr, rem) {
 829                 u16 type = nla_type(la);
 830
 831                 switch (type) {
 832                 case NETEM_LOSS_GI: {
 833                         const struct tc_netem_gimodel *gi = nla_data(la);
 834
 835                         if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 836                                 pr_info("netem: incorrect gi model size\n");
 837                                 return -EINVAL;
 838                         }
 839
 840                         q->loss_model = CLG_4_STATES;
 841
 842                         q->clg.state = TX_IN_GAP_PERIOD;
 843                         q->clg.a1 = gi->p13;
 844                         q->clg.a2 = gi->p31;
 845                         q->clg.a3 = gi->p32;
 846                         q->clg.a4 = gi->p14;
 847                         q->clg.a5 = gi->p23;
 848                         break;
 849                 }
 850
 851                 case NETEM_LOSS_GE: {
 852                         const struct tc_netem_gemodel *ge = nla_data(la);
 853
 854                         if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 855                                 pr_info("netem: incorrect ge model size\n");
 856                                 return -EINVAL;
 857                         }
 858
 859                         q->loss_model = CLG_GILB_ELL;
 860                         q->clg.state = GOOD_STATE;
 861                         q->clg.a1 = ge->p;
 862                         q->clg.a2 = ge->r;
 863                         q->clg.a3 = ge->h;
 864                         q->clg.a4 = ge->k1;
 865                         break;
 866                 }
 867
 868                 default:
 869                         pr_info("netem: unknown loss type %u\n", type);
 870                         return -EINVAL;
 871                 }
 872         }
 873
 874         return 0;
 875 }
 876
 877 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 878         [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 879         [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 880         [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 881         [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 882         [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 883         [TCA_NETEM_ECN]         = { .type = NLA_U32 },
 884         [TCA_NETEM_RATE64]      = { .type = NLA_U64 },
 885         [TCA_NETEM_LATENCY64]   = { .type = NLA_S64 },
 886         [TCA_NETEM_JITTER64]    = { .type = NLA_S64 },
 887         [TCA_NETEM_SLOT]        = { .len = sizeof(struct tc_netem_slot) },
 888 };
 889
 890 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 891                       const struct nla_policy *policy, int len)
 892 {
 893         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 894
 895         if (nested_len < 0) {
 896                 pr_info("netem: invalid attributes len %d\n", nested_len);
 897                 return -EINVAL;
 898         }
 899
 900         if (nested_len >= nla_attr_size(0))
 901                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 902                                  nested_len, policy, NULL);
 903
 904         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 905         return 0;
 906 }
 907
 908 /* Parse netlink message to set options */
 909 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 910                         struct netlink_ext_ack *extack)
 911 {
 912         struct netem_sched_data *q = qdisc_priv(sch);
 913         struct nlattr *tb[TCA_NETEM_MAX + 1];
 914         struct tc_netem_qopt *qopt;
 915         struct clgstate old_clg;
 916         int old_loss_model = CLG_RANDOM;
 917         int ret;
 918
 919         if (opt == NULL)
 920                 return -EINVAL;
 921
 922         qopt = nla_data(opt);
 923         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 924         if (ret < 0)
 925                 return ret;
 926
 927         /* backup q->clg and q->loss_model */
 928         old_clg = q->clg;
 929         old_loss_model = q->loss_model;
 930
 931         if (tb[TCA_NETEM_LOSS]) {
 932                 ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
 933                 if (ret) {
 934                         q->loss_model = old_loss_model;
 935                         return ret;
 936                 }
 937         } else {
 938                 q->loss_model = CLG_RANDOM;
 939         }
 940
 941         if (tb[TCA_NETEM_DELAY_DIST]) {
 942                 ret = get_dist_table(sch, &q->delay_dist,
 943                                      tb[TCA_NETEM_DELAY_DIST]);
 944                 if (ret)
 945                         goto get_table_failure;
 946         }
 947
 948         if (tb[TCA_NETEM_SLOT_DIST]) {
 949                 ret = get_dist_table(sch, &q->slot_dist,
 950                                      tb[TCA_NETEM_SLOT_DIST]);
 951                 if (ret)
 952                         goto get_table_failure;
 953         }
 954
 955         sch->limit = qopt->limit;
 956
 957         q->latency = PSCHED_TICKS2NS(qopt->latency);
 958         q->jitter = PSCHED_TICKS2NS(qopt->jitter);
 959         q->limit = qopt->limit;
 960         q->gap = qopt->gap;
 961         q->counter = 0;
 962         q->loss = qopt->loss;
 963         q->duplicate = qopt->duplicate;
 964
 965         /* for compatibility with earlier versions.
 966          * if gap is set, need to assume 100% probability
 967          */
 968         if (q->gap)
 969                 q->reorder = ~0;
 970
 971         if (tb[TCA_NETEM_CORR])
 972                 get_correlation(q, tb[TCA_NETEM_CORR]);
 973
 974         if (tb[TCA_NETEM_REORDER])
 975                 get_reorder(q, tb[TCA_NETEM_REORDER]);
 976
 977         if (tb[TCA_NETEM_CORRUPT])
 978                 get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
 979
 980         if (tb[TCA_NETEM_RATE])
 981                 get_rate(q, tb[TCA_NETEM_RATE]);
 982
 983         if (tb[TCA_NETEM_RATE64])
 984                 q->rate = max_t(u64, q->rate,
 985                                 nla_get_u64(tb[TCA_NETEM_RATE64]));
 986
 987         if (tb[TCA_NETEM_LATENCY64])
 988                 q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
 989
 990         if (tb[TCA_NETEM_JITTER64])
 991                 q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
 992
 993         if (tb[TCA_NETEM_ECN])
 994                 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 995
 996         if (tb[TCA_NETEM_SLOT])
 997                 get_slot(q, tb[TCA_NETEM_SLOT]);
 998
 999         return ret;
1000
1001 get_table_failure:
1002         /* recover clg and loss_model, in case of
1003          * q->clg and q->loss_model were modified
1004          * in get_loss_clg()
1005          */
1006         q->clg = old_clg;
1007         q->loss_model = old_loss_model;
1008         return ret;
1009 }
1010
1011 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1012                       struct netlink_ext_ack *extack)
1013 {
1014         struct netem_sched_data *q = qdisc_priv(sch);
1015         int ret;
1016
1017         qdisc_watchdog_init(&q->watchdog, sch);
1018
1019         if (!opt)
1020                 return -EINVAL;
1021
1022         q->loss_model = CLG_RANDOM;
1023         ret = netem_change(sch, opt, extack);
1024         if (ret)
1025                 pr_info("netem: change failed\n");
1026         return ret;
1027 }
1028
1029 static void netem_destroy(struct Qdisc *sch)
1030 {
1031         struct netem_sched_data *q = qdisc_priv(sch);
1032
1033         qdisc_watchdog_cancel(&q->watchdog);
1034         if (q->qdisc)
1035                 qdisc_destroy(q->qdisc);
1036         dist_free(q->delay_dist);
1037         dist_free(q->slot_dist);
1038 }
1039
1040 static int dump_loss_model(const struct netem_sched_data *q,
1041                            struct sk_buff *skb)
1042 {
1043         struct nlattr *nest;
1044
1045         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
1046         if (nest == NULL)
1047                 goto nla_put_failure;
1048
1049         switch (q->loss_model) {
1050         case CLG_RANDOM:
1051                 /* legacy loss model */
1052                 nla_nest_cancel(skb, nest);
1053                 return 0;       /* no data */
1054
1055         case CLG_4_STATES: {
1056                 struct tc_netem_gimodel gi = {
1057                         .p13 = q->clg.a1,
1058                         .p31 = q->clg.a2,
1059                         .p32 = q->clg.a3,
1060                         .p14 = q->clg.a4,
1061                         .p23 = q->clg.a5,
1062                 };
1063
1064                 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1065                         goto nla_put_failure;
1066                 break;
1067         }
1068         case CLG_GILB_ELL: {
1069                 struct tc_netem_gemodel ge = {
1070                         .p = q->clg.a1,
1071                         .r = q->clg.a2,
1072                         .h = q->clg.a3,
1073                         .k1 = q->clg.a4,
1074                 };
1075
1076                 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1077                         goto nla_put_failure;
1078                 break;
1079         }
1080         }
1081
1082         nla_nest_end(skb, nest);
1083         return 0;
1084
1085 nla_put_failure:
1086         nla_nest_cancel(skb, nest);
1087         return -1;
1088 }
1089
1090 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1091 {
1092         const struct netem_sched_data *q = qdisc_priv(sch);
1093         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1094         struct tc_netem_qopt qopt;
1095         struct tc_netem_corr cor;
1096         struct tc_netem_reorder reorder;
1097         struct tc_netem_corrupt corrupt;
1098         struct tc_netem_rate rate;
1099         struct tc_netem_slot slot;
1100
1101         qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
1102                              UINT_MAX);
1103         qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
1104                             UINT_MAX);
1105         qopt.limit = q->limit;
1106         qopt.loss = q->loss;
1107         qopt.gap = q->gap;
1108         qopt.duplicate = q->duplicate;
1109         if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1110                 goto nla_put_failure;
1111
1112         if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1113                 goto nla_put_failure;
1114
1115         if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1116                 goto nla_put_failure;
1117
1118         cor.delay_corr = q->delay_cor.rho;
1119         cor.loss_corr = q->loss_cor.rho;
1120         cor.dup_corr = q->dup_cor.rho;
1121         if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1122                 goto nla_put_failure;
1123
1124         reorder.probability = q->reorder;
1125         reorder.correlation = q->reorder_cor.rho;
1126         if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1127                 goto nla_put_failure;
1128
1129         corrupt.probability = q->corrupt;
1130         corrupt.correlation = q->corrupt_cor.rho;
1131         if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1132                 goto nla_put_failure;
1133
1134         if (q->rate >= (1ULL << 32)) {
1135                 if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1136                                       TCA_NETEM_PAD))
1137                         goto nla_put_failure;
1138                 rate.rate = ~0U;
1139         } else {
1140                 rate.rate = q->rate;
1141         }
1142         rate.packet_overhead = q->packet_overhead;
1143         rate.cell_size = q->cell_size;
1144         rate.cell_overhead = q->cell_overhead;
1145         if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1146                 goto nla_put_failure;
1147
1148         if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1149                 goto nla_put_failure;
1150
1151         if (dump_loss_model(q, skb) != 0)
1152                 goto nla_put_failure;
1153
1154         if (q->slot_config.min_delay | q->slot_config.max_delay |
1155             q->slot_config.dist_jitter) {
1156                 slot = q->slot_config;
1157                 if (slot.max_packets == INT_MAX)
1158                         slot.max_packets = 0;
1159                 if (slot.max_bytes == INT_MAX)
1160                         slot.max_bytes = 0;
1161                 if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1162                         goto nla_put_failure;
1163         }
1164
1165         return nla_nest_end(skb, nla);
1166
1167 nla_put_failure:
1168         nlmsg_trim(skb, nla);
1169         return -1;
1170 }
1171
1172 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1173                           struct sk_buff *skb, struct tcmsg *tcm)
1174 {
1175         struct netem_sched_data *q = qdisc_priv(sch);
1176
1177         if (cl != 1 || !q->qdisc)       /* only one class */
1178                 return -ENOENT;
1179
1180         tcm->tcm_handle |= TC_H_MIN(1);
1181         tcm->tcm_info = q->qdisc->handle;
1182
1183         return 0;
1184 }
1185
1186 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1187                      struct Qdisc **old, struct netlink_ext_ack *extack)
1188 {
1189         struct netem_sched_data *q = qdisc_priv(sch);
1190
1191         *old = qdisc_replace(sch, new, &q->qdisc);
1192         return 0;
1193 }
1194
1195 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1196 {
1197         struct netem_sched_data *q = qdisc_priv(sch);
1198         return q->qdisc;
1199 }
1200
1201 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1202 {
1203         return 1;
1204 }
1205
1206 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1207 {
1208         if (!walker->stop) {
1209                 if (walker->count >= walker->skip)
1210                         if (walker->fn(sch, 1, walker) < 0) {
1211                                 walker->stop = 1;
1212                                 return;
1213                         }
1214                 walker->count++;
1215         }
1216 }
1217
1218 static const struct Qdisc_class_ops netem_class_ops = {
1219         .graft          =       netem_graft,
1220         .leaf           =       netem_leaf,
1221         .find           =       netem_find,
1222         .walk           =       netem_walk,
1223         .dump           =       netem_dump_class,
1224 };
1225
1226 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1227         .id             =       "netem",
1228         .cl_ops         =       &netem_class_ops,
1229         .priv_size      =       sizeof(struct netem_sched_data),
1230         .enqueue        =       netem_enqueue,
1231         .dequeue        =       netem_dequeue,
1232         .peek           =       qdisc_peek_dequeued,
1233         .init           =       netem_init,
1234         .reset          =       netem_reset,
1235         .destroy        =       netem_destroy,
1236         .change         =       netem_change,
1237         .dump           =       netem_dump,
1238         .owner          =       THIS_MODULE,
1239 };
1240
1241
1242 static int __init netem_module_init(void)
1243 {
1244         pr_info("netem: version " VERSION "\n");
1245         return register_qdisc(&netem_qdisc_ops);
1246 }
1247 static void __exit netem_module_exit(void)
1248 {
1249         unregister_qdisc(&netem_qdisc_ops);
1250 }
1251 module_init(netem_module_init)
1252 module_exit(netem_module_exit)
1253 MODULE_LICENSE("GPL");