net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25 #include <linux/reciprocal_div.h>
  26 #include <linux/rbtree.h>
  27
  28 #include <net/netlink.h>
  29 #include <net/pkt_sched.h>
  30 #include <net/inet_ecn.h>
  31
  32 #define VERSION "1.3"
  33
  34 /*      Network Emulation Queuing algorithm.
  35         ====================================
  36
  37         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
  38                  Network Emulation Tool
  39                  [2] Luigi Rizzo, DummyNet for FreeBSD
  40
  41          ----------------------------------------------------------------
  42
  43          This started out as a simple way to delay outgoing packets to
  44          test TCP but has grown to include most of the functionality
  45          of a full blown network emulator like NISTnet. It can delay
  46          packets and add random jitter (and correlation). The random
  47          distribution can be loaded from a table as well to provide
  48          normal, Pareto, or experimental curves. Packet loss,
  49          duplication, and reordering can also be emulated.
  50
  51          This qdisc does not do classification that can be handled in
  52          layering other disciplines.  It does not need to do bandwidth
  53          control either since that can be handled by using token
  54          bucket or other rate control.
  55
  56      Correlated Loss Generator models
  57
  58         Added generation of correlated loss according to the
  59         "Gilbert-Elliot" model, a 4-state markov model.
  60
  61         References:
  62         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  63         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  64         and intuitive loss model for packet networks and its implementation
  65         in the Netem module in the Linux kernel", available in [1]
  66
  67         Authors: Stefano Salsano <stefano.salsano at uniroma2.it
  68                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  69 */
  70
  71 struct netem_sched_data {
  72         /* internal t(ime)fifo qdisc uses t_root and sch->limit */
  73         struct rb_root t_root;
  74
  75         /* optional qdisc for classful handling (NULL at netem init) */
  76         struct Qdisc    *qdisc;
  77
  78         struct qdisc_watchdog watchdog;
  79
  80         s64 latency;
  81         s64 jitter;
  82
  83         u32 loss;
  84         u32 ecn;
  85         u32 limit;
  86         u32 counter;
  87         u32 gap;
  88         u32 duplicate;
  89         u32 reorder;
  90         u32 corrupt;
  91         u64 rate;
  92         s32 packet_overhead;
  93         u32 cell_size;
  94         struct reciprocal_value cell_size_reciprocal;
  95         s32 cell_overhead;
  96
  97         struct crndstate {
  98                 u32 last;
  99                 u32 rho;
 100         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 101
 102         struct disttable {
 103                 u32  size;
 104                 s16 table[0];
 105         } *delay_dist;
 106
 107         enum  {
 108                 CLG_RANDOM,
 109                 CLG_4_STATES,
 110                 CLG_GILB_ELL,
 111         } loss_model;
 112
 113         enum {
 114                 TX_IN_GAP_PERIOD = 1,
 115                 TX_IN_BURST_PERIOD,
 116                 LOST_IN_GAP_PERIOD,
 117                 LOST_IN_BURST_PERIOD,
 118         } _4_state_model;
 119
 120         enum {
 121                 GOOD_STATE = 1,
 122                 BAD_STATE,
 123         } GE_state_model;
 124
 125         /* Correlated Loss Generation models */
 126         struct clgstate {
 127                 /* state of the Markov chain */
 128                 u8 state;
 129
 130                 /* 4-states and Gilbert-Elliot models */
 131                 u32 a1; /* p13 for 4-states or p for GE */
 132                 u32 a2; /* p31 for 4-states or r for GE */
 133                 u32 a3; /* p32 for 4-states or h for GE */
 134                 u32 a4; /* p14 for 4-states or 1-k for GE */
 135                 u32 a5; /* p23 used only in 4-states */
 136         } clg;
 137
 138         struct tc_netem_slot slot_config;
 139         struct slotstate {
 140                 u64 slot_next;
 141                 s32 packets_left;
 142                 s32 bytes_left;
 143         } slot;
 144
 145 };
 146
 147 /* Time stamp put into socket buffer control block
 148  * Only valid when skbs are in our internal t(ime)fifo queue.
 149  *
 150  * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
 151  * and skb->next & skb->prev are scratch space for a qdisc,
 152  * we save skb->tstamp value in skb->cb[] before destroying it.
 153  */
 154 struct netem_skb_cb {
 155         u64             time_to_send;
 156 };
 157
 158 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 159 {
 160         /* we assume we can use skb next/prev/tstamp as storage for rb_node */
 161         qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 162         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 163 }
 164
 165 /* init_crandom - initialize correlated random number generator
 166  * Use entropy source for initial seed.
 167  */
 168 static void init_crandom(struct crndstate *state, unsigned long rho)
 169 {
 170         state->rho = rho;
 171         state->last = prandom_u32();
 172 }
 173
 174 /* get_crandom - correlated random number generator
 175  * Next number depends on last value.
 176  * rho is scaled to avoid floating point.
 177  */
 178 static u32 get_crandom(struct crndstate *state)
 179 {
 180         u64 value, rho;
 181         unsigned long answer;
 182
 183         if (state->rho == 0)    /* no correlation */
 184                 return prandom_u32();
 185
 186         value = prandom_u32();
 187         rho = (u64)state->rho + 1;
 188         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 189         state->last = answer;
 190         return answer;
 191 }
 192
 193 /* loss_4state - 4-state model loss generator
 194  * Generates losses according to the 4-state Markov chain adopted in
 195  * the GI (General and Intuitive) loss model.
 196  */
 197 static bool loss_4state(struct netem_sched_data *q)
 198 {
 199         struct clgstate *clg = &q->clg;
 200         u32 rnd = prandom_u32();
 201
 202         /*
 203          * Makes a comparison between rnd and the transition
 204          * probabilities outgoing from the current state, then decides the
 205          * next state and if the next packet has to be transmitted or lost.
 206          * The four states correspond to:
 207          *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
 208          *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
 209          *   LOST_IN_GAP_PERIOD => lost packets within a burst period
 210          *   TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
 211          */
 212         switch (clg->state) {
 213         case TX_IN_GAP_PERIOD:
 214                 if (rnd < clg->a4) {
 215                         clg->state = LOST_IN_BURST_PERIOD;
 216                         return true;
 217                 } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
 218                         clg->state = LOST_IN_GAP_PERIOD;
 219                         return true;
 220                 } else if (clg->a1 + clg->a4 < rnd) {
 221                         clg->state = TX_IN_GAP_PERIOD;
 222                 }
 223
 224                 break;
 225         case TX_IN_BURST_PERIOD:
 226                 if (rnd < clg->a5) {
 227                         clg->state = LOST_IN_GAP_PERIOD;
 228                         return true;
 229                 } else {
 230                         clg->state = TX_IN_BURST_PERIOD;
 231                 }
 232
 233                 break;
 234         case LOST_IN_GAP_PERIOD:
 235                 if (rnd < clg->a3)
 236                         clg->state = TX_IN_BURST_PERIOD;
 237                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 238                         clg->state = TX_IN_GAP_PERIOD;
 239                 } else if (clg->a2 + clg->a3 < rnd) {
 240                         clg->state = LOST_IN_GAP_PERIOD;
 241                         return true;
 242                 }
 243                 break;
 244         case LOST_IN_BURST_PERIOD:
 245                 clg->state = TX_IN_GAP_PERIOD;
 246                 break;
 247         }
 248
 249         return false;
 250 }
 251
 252 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 253  * Generates losses according to the Gilbert-Elliot loss model or
 254  * its special cases  (Gilbert or Simple Gilbert)
 255  *
 256  * Makes a comparison between random number and the transition
 257  * probabilities outgoing from the current state, then decides the
 258  * next state. A second random number is extracted and the comparison
 259  * with the loss probability of the current state decides if the next
 260  * packet will be transmitted or lost.
 261  */
 262 static bool loss_gilb_ell(struct netem_sched_data *q)
 263 {
 264         struct clgstate *clg = &q->clg;
 265
 266         switch (clg->state) {
 267         case GOOD_STATE:
 268                 if (prandom_u32() < clg->a1)
 269                         clg->state = BAD_STATE;
 270                 if (prandom_u32() < clg->a4)
 271                         return true;
 272                 break;
 273         case BAD_STATE:
 274                 if (prandom_u32() < clg->a2)
 275                         clg->state = GOOD_STATE;
 276                 if (prandom_u32() > clg->a3)
 277                         return true;
 278         }
 279
 280         return false;
 281 }
 282
 283 static bool loss_event(struct netem_sched_data *q)
 284 {
 285         switch (q->loss_model) {
 286         case CLG_RANDOM:
 287                 /* Random packet drop 0 => none, ~0 => all */
 288                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 289
 290         case CLG_4_STATES:
 291                 /* 4state loss model algorithm (used also for GI model)
 292                 * Extracts a value from the markov 4 state loss generator,
 293                 * if it is 1 drops a packet and if needed writes the event in
 294                 * the kernel logs
 295                 */
 296                 return loss_4state(q);
 297
 298         case CLG_GILB_ELL:
 299                 /* Gilbert-Elliot loss model algorithm
 300                 * Extracts a value from the Gilbert-Elliot loss generator,
 301                 * if it is 1 drops a packet and if needed writes the event in
 302                 * the kernel logs
 303                 */
 304                 return loss_gilb_ell(q);
 305         }
 306
 307         return false;   /* not reached */
 308 }
 309
 310
 311 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 312  * std deviation sigma.  Uses table lookup to approximate the desired
 313  * distribution, and a uniformly-distributed pseudo-random source.
 314  */
 315 static s64 tabledist(s64 mu, s64 sigma,
 316                      struct crndstate *state,
 317                          const struct disttable *dist)
 318 {
 319         s64 x;
 320         long t;
 321         u32 rnd;
 322
 323         if (sigma == 0)
 324                 return mu;
 325
 326         rnd = get_crandom(state);
 327
 328         /* default uniform distribution */
 329         if (dist == NULL)
 330                 return (rnd % (2*sigma)) - sigma + mu;
 331
 332         t = dist->table[rnd % dist->size];
 333         x = (sigma % NETEM_DIST_SCALE) * t;
 334         if (x >= 0)
 335                 x += NETEM_DIST_SCALE/2;
 336         else
 337                 x -= NETEM_DIST_SCALE/2;
 338
 339         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 340 }
 341
 342 static u64 packet_len_2_sched_time(unsigned int len,
 343                                    struct netem_sched_data *q)
 344 {
 345         u64 offset;
 346         len += q->packet_overhead;
 347
 348         if (q->cell_size) {
 349                 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 350
 351                 if (len > cells * q->cell_size) /* extra cell needed for remainder */
 352                         cells++;
 353                 len = cells * (q->cell_size + q->cell_overhead);
 354         }
 355         offset = (u64)len * NSEC_PER_SEC;
 356         do_div(offset, q->rate);
 357         return offset;
 358 }
 359
 360 static void tfifo_reset(struct Qdisc *sch)
 361 {
 362         struct netem_sched_data *q = qdisc_priv(sch);
 363         struct rb_node *p = rb_first(&q->t_root);
 364
 365         while (p) {
 366                 struct sk_buff *skb = rb_to_skb(p);
 367
 368                 p = rb_next(p);
 369                 rb_erase(&skb->rbnode, &q->t_root);
 370                 rtnl_kfree_skbs(skb, skb);
 371         }
 372 }
 373
 374 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 375 {
 376         struct netem_sched_data *q = qdisc_priv(sch);
 377         u64 tnext = netem_skb_cb(nskb)->time_to_send;
 378         struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 379
 380         while (*p) {
 381                 struct sk_buff *skb;
 382
 383                 parent = *p;
 384                 skb = rb_to_skb(parent);
 385                 if (tnext >= netem_skb_cb(skb)->time_to_send)
 386                         p = &parent->rb_right;
 387                 else
 388                         p = &parent->rb_left;
 389         }
 390         rb_link_node(&nskb->rbnode, parent, p);
 391         rb_insert_color(&nskb->rbnode, &q->t_root);
 392         sch->q.qlen++;
 393 }
 394
 395 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
 396  * when we statistically choose to corrupt one, we instead segment it, returning
 397  * the first packet to be corrupted, and re-enqueue the remaining frames
 398  */
 399 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
 400                                      struct sk_buff **to_free)
 401 {
 402         struct sk_buff *segs;
 403         netdev_features_t features = netif_skb_features(skb);
 404
 405         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 406
 407         if (IS_ERR_OR_NULL(segs)) {
 408                 qdisc_drop(skb, sch, to_free);
 409                 return NULL;
 410         }
 411         consume_skb(skb);
 412         return segs;
 413 }
 414
 415 static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
 416 {
 417         skb->next = qh->head;
 418
 419         if (!qh->head)
 420                 qh->tail = skb;
 421         qh->head = skb;
 422         qh->qlen++;
 423 }
 424
 425 /*
 426  * Insert one skb into qdisc.
 427  * Note: parent depends on return value to account for queue length.
 428  *      NET_XMIT_DROP: queue length didn't change.
 429  *      NET_XMIT_SUCCESS: one skb was queued.
 430  */
 431 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 432                          struct sk_buff **to_free)
 433 {
 434         struct netem_sched_data *q = qdisc_priv(sch);
 435         /* We don't fill cb now as skb_unshare() may invalidate it */
 436         struct netem_skb_cb *cb;
 437         struct sk_buff *skb2;
 438         struct sk_buff *segs = NULL;
 439         unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
 440         int nb = 0;
 441         int count = 1;
 442         int rc = NET_XMIT_SUCCESS;
 443
 444         /* Random duplication */
 445         if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 446                 ++count;
 447
 448         /* Drop packet? */
 449         if (loss_event(q)) {
 450                 if (q->ecn && INET_ECN_set_ce(skb))
 451                         qdisc_qstats_drop(sch); /* mark packet */
 452                 else
 453                         --count;
 454         }
 455         if (count == 0) {
 456                 qdisc_qstats_drop(sch);
 457                 __qdisc_drop(skb, to_free);
 458                 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 459         }
 460
 461         /* If a delay is expected, orphan the skb. (orphaning usually takes
 462          * place at TX completion time, so _before_ the link transit delay)
 463          */
 464         if (q->latency || q->jitter || q->rate)
 465                 skb_orphan_partial(skb);
 466
 467         /*
 468          * If we need to duplicate packet, then re-insert at top of the
 469          * qdisc tree, since parent queuer expects that only one
 470          * skb will be queued.
 471          */
 472         if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 473                 struct Qdisc *rootq = qdisc_root(sch);
 474                 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 475
 476                 q->duplicate = 0;
 477                 rootq->enqueue(skb2, rootq, to_free);
 478                 q->duplicate = dupsave;
 479         }
 480
 481         /*
 482          * Randomized packet corruption.
 483          * Make copy if needed since we are modifying
 484          * If packet is going to be hardware checksummed, then
 485          * do it now in software before we mangle it.
 486          */
 487         if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 488                 if (skb_is_gso(skb)) {
 489                         segs = netem_segment(skb, sch, to_free);
 490                         if (!segs)
 491                                 return NET_XMIT_DROP;
 492                 } else {
 493                         segs = skb;
 494                 }
 495
 496                 skb = segs;
 497                 segs = segs->next;
 498
 499                 skb = skb_unshare(skb, GFP_ATOMIC);
 500                 if (unlikely(!skb)) {
 501                         qdisc_qstats_drop(sch);
 502                         goto finish_segs;
 503                 }
 504                 if (skb->ip_summed == CHECKSUM_PARTIAL &&
 505                     skb_checksum_help(skb)) {
 506                         qdisc_drop(skb, sch, to_free);
 507                         goto finish_segs;
 508                 }
 509
 510                 skb->data[prandom_u32() % skb_headlen(skb)] ^=
 511                         1<<(prandom_u32() % 8);
 512         }
 513
 514         if (unlikely(sch->q.qlen >= sch->limit))
 515                 return qdisc_drop(skb, sch, to_free);
 516
 517         qdisc_qstats_backlog_inc(sch, skb);
 518
 519         cb = netem_skb_cb(skb);
 520         if (q->gap == 0 ||              /* not doing reordering */
 521             q->counter < q->gap - 1 ||  /* inside last reordering gap */
 522             q->reorder < get_crandom(&q->reorder_cor)) {
 523                 u64 now;
 524                 s64 delay;
 525
 526                 delay = tabledist(q->latency, q->jitter,
 527                                   &q->delay_cor, q->delay_dist);
 528
 529                 now = ktime_get_ns();
 530
 531                 if (q->rate) {
 532                         struct netem_skb_cb *last = NULL;
 533
 534                         if (sch->q.tail)
 535                                 last = netem_skb_cb(sch->q.tail);
 536                         if (q->t_root.rb_node) {
 537                                 struct sk_buff *t_skb;
 538                                 struct netem_skb_cb *t_last;
 539
 540                                 t_skb = skb_rb_last(&q->t_root);
 541                                 t_last = netem_skb_cb(t_skb);
 542                                 if (!last ||
 543                                     t_last->time_to_send > last->time_to_send) {
 544                                         last = t_last;
 545                                 }
 546                         }
 547
 548                         if (last) {
 549                                 /*
 550                                  * Last packet in queue is reference point (now),
 551                                  * calculate this time bonus and subtract
 552                                  * from delay.
 553                                  */
 554                                 delay -= last->time_to_send - now;
 555                                 delay = max_t(s64, 0, delay);
 556                                 now = last->time_to_send;
 557                         }
 558
 559                         delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
 560                 }
 561
 562                 cb->time_to_send = now + delay;
 563                 ++q->counter;
 564                 tfifo_enqueue(skb, sch);
 565         } else {
 566                 /*
 567                  * Do re-ordering by putting one out of N packets at the front
 568                  * of the queue.
 569                  */
 570                 cb->time_to_send = ktime_get_ns();
 571                 q->counter = 0;
 572
 573                 netem_enqueue_skb_head(&sch->q, skb);
 574                 sch->qstats.requeues++;
 575         }
 576
 577 finish_segs:
 578         if (segs) {
 579                 while (segs) {
 580                         skb2 = segs->next;
 581                         segs->next = NULL;
 582                         qdisc_skb_cb(segs)->pkt_len = segs->len;
 583                         last_len = segs->len;
 584                         rc = qdisc_enqueue(segs, sch, to_free);
 585                         if (rc != NET_XMIT_SUCCESS) {
 586                                 if (net_xmit_drop_count(rc))
 587                                         qdisc_qstats_drop(sch);
 588                         } else {
 589                                 nb++;
 590                                 len += last_len;
 591                         }
 592                         segs = skb2;
 593                 }
 594                 sch->q.qlen += nb;
 595                 if (nb > 1)
 596                         qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
 597         }
 598         return NET_XMIT_SUCCESS;
 599 }
 600
 601 /* Delay the next round with a new future slot with a
 602  * correct number of bytes and packets.
 603  */
 604
 605 static void get_slot_next(struct netem_sched_data *q, u64 now)
 606 {
 607         q->slot.slot_next = now + q->slot_config.min_delay +
 608                 (prandom_u32() *
 609                         (q->slot_config.max_delay -
 610                                 q->slot_config.min_delay) >> 32);
 611         q->slot.packets_left = q->slot_config.max_packets;
 612         q->slot.bytes_left = q->slot_config.max_bytes;
 613 }
 614
 615 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 616 {
 617         struct netem_sched_data *q = qdisc_priv(sch);
 618         struct sk_buff *skb;
 619         struct rb_node *p;
 620
 621 tfifo_dequeue:
 622         skb = __qdisc_dequeue_head(&sch->q);
 623         if (skb) {
 624                 qdisc_qstats_backlog_dec(sch, skb);
 625 deliver:
 626                 qdisc_bstats_update(sch, skb);
 627                 return skb;
 628         }
 629         p = rb_first(&q->t_root);
 630         if (p) {
 631                 u64 time_to_send;
 632                 u64 now = ktime_get_ns();
 633
 634                 skb = rb_to_skb(p);
 635
 636                 /* if more time remaining? */
 637                 time_to_send = netem_skb_cb(skb)->time_to_send;
 638                 if (q->slot.slot_next && q->slot.slot_next < time_to_send)
 639                         get_slot_next(q, now);
 640
 641                 if (time_to_send <= now &&  q->slot.slot_next <= now) {
 642                         rb_erase(p, &q->t_root);
 643                         sch->q.qlen--;
 644                         qdisc_qstats_backlog_dec(sch, skb);
 645                         skb->next = NULL;
 646                         skb->prev = NULL;
 647                         /* skb->dev shares skb->rbnode area,
 648                          * we need to restore its value.
 649                          */
 650                         skb->dev = qdisc_dev(sch);
 651
 652 #ifdef CONFIG_NET_CLS_ACT
 653                         /*
 654                          * If it's at ingress let's pretend the delay is
 655                          * from the network (tstamp will be updated).
 656                          */
 657                         if (skb->tc_redirected && skb->tc_from_ingress)
 658                                 skb->tstamp = 0;
 659 #endif
 660
 661                         if (q->slot.slot_next) {
 662                                 q->slot.packets_left--;
 663                                 q->slot.bytes_left -= qdisc_pkt_len(skb);
 664                                 if (q->slot.packets_left <= 0 ||
 665                                     q->slot.bytes_left <= 0)
 666                                         get_slot_next(q, now);
 667                         }
 668
 669                         if (q->qdisc) {
 670                                 unsigned int pkt_len = qdisc_pkt_len(skb);
 671                                 struct sk_buff *to_free = NULL;
 672                                 int err;
 673
 674                                 err = qdisc_enqueue(skb, q->qdisc, &to_free);
 675                                 kfree_skb_list(to_free);
 676                                 if (err != NET_XMIT_SUCCESS &&
 677                                     net_xmit_drop_count(err)) {
 678                                         qdisc_qstats_drop(sch);
 679                                         qdisc_tree_reduce_backlog(sch, 1,
 680                                                                   pkt_len);
 681                                 }
 682                                 goto tfifo_dequeue;
 683                         }
 684                         goto deliver;
 685                 }
 686
 687                 if (q->qdisc) {
 688                         skb = q->qdisc->ops->dequeue(q->qdisc);
 689                         if (skb)
 690                                 goto deliver;
 691                 }
 692
 693                 qdisc_watchdog_schedule_ns(&q->watchdog,
 694                                            max(time_to_send,
 695                                                q->slot.slot_next));
 696         }
 697
 698         if (q->qdisc) {
 699                 skb = q->qdisc->ops->dequeue(q->qdisc);
 700                 if (skb)
 701                         goto deliver;
 702         }
 703         return NULL;
 704 }
 705
 706 static void netem_reset(struct Qdisc *sch)
 707 {
 708         struct netem_sched_data *q = qdisc_priv(sch);
 709
 710         qdisc_reset_queue(sch);
 711         tfifo_reset(sch);
 712         if (q->qdisc)
 713                 qdisc_reset(q->qdisc);
 714         qdisc_watchdog_cancel(&q->watchdog);
 715 }
 716
 717 static void dist_free(struct disttable *d)
 718 {
 719         kvfree(d);
 720 }
 721
 722 /*
 723  * Distribution data is a variable size payload containing
 724  * signed 16 bit values.
 725  */
 726
 727 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 728 {
 729         struct netem_sched_data *q = qdisc_priv(sch);
 730         size_t n = nla_len(attr)/sizeof(__s16);
 731         const __s16 *data = nla_data(attr);
 732         spinlock_t *root_lock;
 733         struct disttable *d;
 734         int i;
 735
 736         if (n > NETEM_DIST_MAX)
 737                 return -EINVAL;
 738
 739         d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
 740         if (!d)
 741                 return -ENOMEM;
 742
 743         d->size = n;
 744         for (i = 0; i < n; i++)
 745                 d->table[i] = data[i];
 746
 747         root_lock = qdisc_root_sleeping_lock(sch);
 748
 749         spin_lock_bh(root_lock);
 750         swap(q->delay_dist, d);
 751         spin_unlock_bh(root_lock);
 752
 753         dist_free(d);
 754         return 0;
 755 }
 756
 757 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
 758 {
 759         const struct tc_netem_slot *c = nla_data(attr);
 760
 761         q->slot_config = *c;
 762         if (q->slot_config.max_packets == 0)
 763                 q->slot_config.max_packets = INT_MAX;
 764         if (q->slot_config.max_bytes == 0)
 765                 q->slot_config.max_bytes = INT_MAX;
 766         q->slot.packets_left = q->slot_config.max_packets;
 767         q->slot.bytes_left = q->slot_config.max_bytes;
 768         if (q->slot_config.min_delay | q->slot_config.max_delay)
 769                 q->slot.slot_next = ktime_get_ns();
 770         else
 771                 q->slot.slot_next = 0;
 772 }
 773
 774 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
 775 {
 776         const struct tc_netem_corr *c = nla_data(attr);
 777
 778         init_crandom(&q->delay_cor, c->delay_corr);
 779         init_crandom(&q->loss_cor, c->loss_corr);
 780         init_crandom(&q->dup_cor, c->dup_corr);
 781 }
 782
 783 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
 784 {
 785         const struct tc_netem_reorder *r = nla_data(attr);
 786
 787         q->reorder = r->probability;
 788         init_crandom(&q->reorder_cor, r->correlation);
 789 }
 790
 791 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
 792 {
 793         const struct tc_netem_corrupt *r = nla_data(attr);
 794
 795         q->corrupt = r->probability;
 796         init_crandom(&q->corrupt_cor, r->correlation);
 797 }
 798
 799 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
 800 {
 801         const struct tc_netem_rate *r = nla_data(attr);
 802
 803         q->rate = r->rate;
 804         q->packet_overhead = r->packet_overhead;
 805         q->cell_size = r->cell_size;
 806         q->cell_overhead = r->cell_overhead;
 807         if (q->cell_size)
 808                 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 809         else
 810                 q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
 811 }
 812
 813 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
 814 {
 815         const struct nlattr *la;
 816         int rem;
 817
 818         nla_for_each_nested(la, attr, rem) {
 819                 u16 type = nla_type(la);
 820
 821                 switch (type) {
 822                 case NETEM_LOSS_GI: {
 823                         const struct tc_netem_gimodel *gi = nla_data(la);
 824
 825                         if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 826                                 pr_info("netem: incorrect gi model size\n");
 827                                 return -EINVAL;
 828                         }
 829
 830                         q->loss_model = CLG_4_STATES;
 831
 832                         q->clg.state = TX_IN_GAP_PERIOD;
 833                         q->clg.a1 = gi->p13;
 834                         q->clg.a2 = gi->p31;
 835                         q->clg.a3 = gi->p32;
 836                         q->clg.a4 = gi->p14;
 837                         q->clg.a5 = gi->p23;
 838                         break;
 839                 }
 840
 841                 case NETEM_LOSS_GE: {
 842                         const struct tc_netem_gemodel *ge = nla_data(la);
 843
 844                         if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 845                                 pr_info("netem: incorrect ge model size\n");
 846                                 return -EINVAL;
 847                         }
 848
 849                         q->loss_model = CLG_GILB_ELL;
 850                         q->clg.state = GOOD_STATE;
 851                         q->clg.a1 = ge->p;
 852                         q->clg.a2 = ge->r;
 853                         q->clg.a3 = ge->h;
 854                         q->clg.a4 = ge->k1;
 855                         break;
 856                 }
 857
 858                 default:
 859                         pr_info("netem: unknown loss type %u\n", type);
 860                         return -EINVAL;
 861                 }
 862         }
 863
 864         return 0;
 865 }
 866
 867 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 868         [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 869         [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 870         [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 871         [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 872         [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 873         [TCA_NETEM_ECN]         = { .type = NLA_U32 },
 874         [TCA_NETEM_RATE64]      = { .type = NLA_U64 },
 875         [TCA_NETEM_LATENCY64]   = { .type = NLA_S64 },
 876         [TCA_NETEM_JITTER64]    = { .type = NLA_S64 },
 877         [TCA_NETEM_SLOT]        = { .len = sizeof(struct tc_netem_slot) },
 878 };
 879
 880 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 881                       const struct nla_policy *policy, int len)
 882 {
 883         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 884
 885         if (nested_len < 0) {
 886                 pr_info("netem: invalid attributes len %d\n", nested_len);
 887                 return -EINVAL;
 888         }
 889
 890         if (nested_len >= nla_attr_size(0))
 891                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 892                                  nested_len, policy, NULL);
 893
 894         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 895         return 0;
 896 }
 897
 898 /* Parse netlink message to set options */
 899 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 900 {
 901         struct netem_sched_data *q = qdisc_priv(sch);
 902         struct nlattr *tb[TCA_NETEM_MAX + 1];
 903         struct tc_netem_qopt *qopt;
 904         struct clgstate old_clg;
 905         int old_loss_model = CLG_RANDOM;
 906         int ret;
 907
 908         if (opt == NULL)
 909                 return -EINVAL;
 910
 911         qopt = nla_data(opt);
 912         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 913         if (ret < 0)
 914                 return ret;
 915
 916         /* backup q->clg and q->loss_model */
 917         old_clg = q->clg;
 918         old_loss_model = q->loss_model;
 919
 920         if (tb[TCA_NETEM_LOSS]) {
 921                 ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
 922                 if (ret) {
 923                         q->loss_model = old_loss_model;
 924                         return ret;
 925                 }
 926         } else {
 927                 q->loss_model = CLG_RANDOM;
 928         }
 929
 930         if (tb[TCA_NETEM_DELAY_DIST]) {
 931                 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
 932                 if (ret) {
 933                         /* recover clg and loss_model, in case of
 934                          * q->clg and q->loss_model were modified
 935                          * in get_loss_clg()
 936                          */
 937                         q->clg = old_clg;
 938                         q->loss_model = old_loss_model;
 939                         return ret;
 940                 }
 941         }
 942
 943         sch->limit = qopt->limit;
 944
 945         q->latency = PSCHED_TICKS2NS(qopt->latency);
 946         q->jitter = PSCHED_TICKS2NS(qopt->jitter);
 947         q->limit = qopt->limit;
 948         q->gap = qopt->gap;
 949         q->counter = 0;
 950         q->loss = qopt->loss;
 951         q->duplicate = qopt->duplicate;
 952
 953         /* for compatibility with earlier versions.
 954          * if gap is set, need to assume 100% probability
 955          */
 956         if (q->gap)
 957                 q->reorder = ~0;
 958
 959         if (tb[TCA_NETEM_CORR])
 960                 get_correlation(q, tb[TCA_NETEM_CORR]);
 961
 962         if (tb[TCA_NETEM_REORDER])
 963                 get_reorder(q, tb[TCA_NETEM_REORDER]);
 964
 965         if (tb[TCA_NETEM_CORRUPT])
 966                 get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
 967
 968         if (tb[TCA_NETEM_RATE])
 969                 get_rate(q, tb[TCA_NETEM_RATE]);
 970
 971         if (tb[TCA_NETEM_RATE64])
 972                 q->rate = max_t(u64, q->rate,
 973                                 nla_get_u64(tb[TCA_NETEM_RATE64]));
 974
 975         if (tb[TCA_NETEM_LATENCY64])
 976                 q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
 977
 978         if (tb[TCA_NETEM_JITTER64])
 979                 q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
 980
 981         if (tb[TCA_NETEM_ECN])
 982                 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 983
 984         if (tb[TCA_NETEM_SLOT])
 985                 get_slot(q, tb[TCA_NETEM_SLOT]);
 986
 987         return ret;
 988 }
 989
 990 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 991 {
 992         struct netem_sched_data *q = qdisc_priv(sch);
 993         int ret;
 994
 995         qdisc_watchdog_init(&q->watchdog, sch);
 996
 997         if (!opt)
 998                 return -EINVAL;
 999
1000         q->loss_model = CLG_RANDOM;
1001         ret = netem_change(sch, opt);
1002         if (ret)
1003                 pr_info("netem: change failed\n");
1004         return ret;
1005 }
1006
1007 static void netem_destroy(struct Qdisc *sch)
1008 {
1009         struct netem_sched_data *q = qdisc_priv(sch);
1010
1011         qdisc_watchdog_cancel(&q->watchdog);
1012         if (q->qdisc)
1013                 qdisc_destroy(q->qdisc);
1014         dist_free(q->delay_dist);
1015 }
1016
1017 static int dump_loss_model(const struct netem_sched_data *q,
1018                            struct sk_buff *skb)
1019 {
1020         struct nlattr *nest;
1021
1022         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
1023         if (nest == NULL)
1024                 goto nla_put_failure;
1025
1026         switch (q->loss_model) {
1027         case CLG_RANDOM:
1028                 /* legacy loss model */
1029                 nla_nest_cancel(skb, nest);
1030                 return 0;       /* no data */
1031
1032         case CLG_4_STATES: {
1033                 struct tc_netem_gimodel gi = {
1034                         .p13 = q->clg.a1,
1035                         .p31 = q->clg.a2,
1036                         .p32 = q->clg.a3,
1037                         .p14 = q->clg.a4,
1038                         .p23 = q->clg.a5,
1039                 };
1040
1041                 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1042                         goto nla_put_failure;
1043                 break;
1044         }
1045         case CLG_GILB_ELL: {
1046                 struct tc_netem_gemodel ge = {
1047                         .p = q->clg.a1,
1048                         .r = q->clg.a2,
1049                         .h = q->clg.a3,
1050                         .k1 = q->clg.a4,
1051                 };
1052
1053                 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1054                         goto nla_put_failure;
1055                 break;
1056         }
1057         }
1058
1059         nla_nest_end(skb, nest);
1060         return 0;
1061
1062 nla_put_failure:
1063         nla_nest_cancel(skb, nest);
1064         return -1;
1065 }
1066
1067 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1068 {
1069         const struct netem_sched_data *q = qdisc_priv(sch);
1070         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1071         struct tc_netem_qopt qopt;
1072         struct tc_netem_corr cor;
1073         struct tc_netem_reorder reorder;
1074         struct tc_netem_corrupt corrupt;
1075         struct tc_netem_rate rate;
1076         struct tc_netem_slot slot;
1077
1078         qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
1079                              UINT_MAX);
1080         qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
1081                             UINT_MAX);
1082         qopt.limit = q->limit;
1083         qopt.loss = q->loss;
1084         qopt.gap = q->gap;
1085         qopt.duplicate = q->duplicate;
1086         if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1087                 goto nla_put_failure;
1088
1089         if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1090                 goto nla_put_failure;
1091
1092         if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1093                 goto nla_put_failure;
1094
1095         cor.delay_corr = q->delay_cor.rho;
1096         cor.loss_corr = q->loss_cor.rho;
1097         cor.dup_corr = q->dup_cor.rho;
1098         if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1099                 goto nla_put_failure;
1100
1101         reorder.probability = q->reorder;
1102         reorder.correlation = q->reorder_cor.rho;
1103         if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1104                 goto nla_put_failure;
1105
1106         corrupt.probability = q->corrupt;
1107         corrupt.correlation = q->corrupt_cor.rho;
1108         if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1109                 goto nla_put_failure;
1110
1111         if (q->rate >= (1ULL << 32)) {
1112                 if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1113                                       TCA_NETEM_PAD))
1114                         goto nla_put_failure;
1115                 rate.rate = ~0U;
1116         } else {
1117                 rate.rate = q->rate;
1118         }
1119         rate.packet_overhead = q->packet_overhead;
1120         rate.cell_size = q->cell_size;
1121         rate.cell_overhead = q->cell_overhead;
1122         if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1123                 goto nla_put_failure;
1124
1125         if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1126                 goto nla_put_failure;
1127
1128         if (dump_loss_model(q, skb) != 0)
1129                 goto nla_put_failure;
1130
1131         if (q->slot_config.min_delay | q->slot_config.max_delay) {
1132                 slot = q->slot_config;
1133                 if (slot.max_packets == INT_MAX)
1134                         slot.max_packets = 0;
1135                 if (slot.max_bytes == INT_MAX)
1136                         slot.max_bytes = 0;
1137                 if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1138                         goto nla_put_failure;
1139         }
1140
1141         return nla_nest_end(skb, nla);
1142
1143 nla_put_failure:
1144         nlmsg_trim(skb, nla);
1145         return -1;
1146 }
1147
1148 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1149                           struct sk_buff *skb, struct tcmsg *tcm)
1150 {
1151         struct netem_sched_data *q = qdisc_priv(sch);
1152
1153         if (cl != 1 || !q->qdisc)       /* only one class */
1154                 return -ENOENT;
1155
1156         tcm->tcm_handle |= TC_H_MIN(1);
1157         tcm->tcm_info = q->qdisc->handle;
1158
1159         return 0;
1160 }
1161
1162 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1163                      struct Qdisc **old)
1164 {
1165         struct netem_sched_data *q = qdisc_priv(sch);
1166
1167         *old = qdisc_replace(sch, new, &q->qdisc);
1168         return 0;
1169 }
1170
1171 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1172 {
1173         struct netem_sched_data *q = qdisc_priv(sch);
1174         return q->qdisc;
1175 }
1176
1177 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1178 {
1179         return 1;
1180 }
1181
1182 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1183 {
1184         if (!walker->stop) {
1185                 if (walker->count >= walker->skip)
1186                         if (walker->fn(sch, 1, walker) < 0) {
1187                                 walker->stop = 1;
1188                                 return;
1189                         }
1190                 walker->count++;
1191         }
1192 }
1193
1194 static const struct Qdisc_class_ops netem_class_ops = {
1195         .graft          =       netem_graft,
1196         .leaf           =       netem_leaf,
1197         .find           =       netem_find,
1198         .walk           =       netem_walk,
1199         .dump           =       netem_dump_class,
1200 };
1201
1202 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1203         .id             =       "netem",
1204         .cl_ops         =       &netem_class_ops,
1205         .priv_size      =       sizeof(struct netem_sched_data),
1206         .enqueue        =       netem_enqueue,
1207         .dequeue        =       netem_dequeue,
1208         .peek           =       qdisc_peek_dequeued,
1209         .init           =       netem_init,
1210         .reset          =       netem_reset,
1211         .destroy        =       netem_destroy,
1212         .change         =       netem_change,
1213         .dump           =       netem_dump,
1214         .owner          =       THIS_MODULE,
1215 };
1216
1217
1218 static int __init netem_module_init(void)
1219 {
1220         pr_info("netem: version " VERSION "\n");
1221         return register_qdisc(&netem_qdisc_ops);
1222 }
1223 static void __exit netem_module_exit(void)
1224 {
1225         unregister_qdisc(&netem_qdisc_ops);
1226 }
1227 module_init(netem_module_init)
1228 module_exit(netem_module_exit)
1229 MODULE_LICENSE("GPL");