// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
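
	For illustration (numbers not in the original text): with
	R = 125000 bytes/sec (1 Mbit/s) and B = 10000 bytes, the bucket
	holds at most B/R = 80 msec of credit.  A 1500 byte packet costs
	S/R = 12 msec, so after a long idle period up to six such packets
	can go out back-to-back before the flow is held to rate R.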

	Actually, QoS requires two TBF to be applied to a data stream.
	One of them controls steady state burst size, another
	one with rate P (peak rate) and depth M (equal to link MTU)
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)
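
	For example (illustrative numbers, not from the original): with a
	backlog of L = 100000 bytes, R = 125000 bytes/sec, B = 10000 bytes,
	P = 1250000 bytes/sec and M = 1500 bytes:

	lat = max((100000-10000)/125000, (100000-1500)/1250000)
	    = max(0.72 sec, 0.079 sec) = 0.72 sec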

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.

	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)
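
	(Both limits follow from the same bound: at most one full bucket of
	credit can be spent per timer tick, so rate <= depth*HZ.  With
	HZ=100, B = 12500 bytes gives R_crit = 1.25 Mbytes/sec = 10 Mbit/s,
	and M = 1500 bytes gives P_crit = 150 Kbytes/sec, as above.)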

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/
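
/* Example configuration (illustrative only, not part of the original file):
 *
 *   tc qdisc add dev eth0 root handle 1: tbf rate 1mbit burst 10kb limit 20kb
 *
 * attaches a TBF whose default child is a 20 Kbyte bfifo.  Replacing that
 * child, e.g. with
 *
 *   tc qdisc replace dev eth0 parent 1:1 handle 10: pfifo limit 100
 *
 * makes the "limit" parameter above ineffective, as noted in the comment.
 */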

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;	/* Largest packet accepted without segmenting: bytes */
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;		/* Peak rate bucket depth: ns */
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is:
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
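	/* For instance (illustrative, not taken from the original):
	 * 8,000,000 ns at rate_bytes_ps = 125000 (1 Mbit/s) converts to
	 * 1000 bytes, before the linklayer and overhead adjustments below.
	 */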
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		/* 48 payload bytes are carried per 53 byte ATM cell */
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	nb = 0;
	skb_list_walk_safe(segs, segs, nskb) {
		skb_mark_not_on_list(segs);
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		len += segs->len;
		ret = qdisc_enqueue(segs, q->qdisc, to_free);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
	}
	sch->q.qlen += nb;
	/* The original skb was accounted upstream as one packet of prev_len
	 * bytes; it has been replaced by nb packets totalling len bytes.
	 */
	if (nb > 1)
		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = qdisc_pkt_len(skb);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) &&
		    skb_gso_validate_mac_len(skb, q->max_size))
			return tbf_segment(skb, sch, to_free);
		return qdisc_drop(skb, sch, to_free);
	}
	ret = qdisc_enqueue(skb, q->qdisc, to_free);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->qstats.backlog += len;
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		/* toks/ptoks hold credit in nanoseconds; a negative value
		 * means the head packet does not conform yet.
		 */
		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but, however, this is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */
		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->qstats.backlog = 0;
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]		= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST]		= { .type = NLA_U32 },
	[TCA_TBF_PBURST]	= { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
					  NULL);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB],
					      NULL));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB],
					      NULL));

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu !\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
					 extack);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}

		/* child is fifo, no need to check for noop_qdisc */
		qdisc_hash_add(child, true);
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_flush_backlog(q->qdisc);
		qdisc_put(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	err = 0;
done:
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	if (!opt)
		return -EINVAL;

	q->t_c = ktime_get_ns();

	return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.find		=	tbf_find,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}

module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");