[linux-2.6-microblaze.git] / block / blk-iocost.c
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of crutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide variety of devices well enough.  Default
42  * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
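 *
 * As a rough illustration (not the exact implementation), the linear model
 * charges an IO approximately
 *
 *   abs_cost = (sequential ? seq_base : rand_base) + nr_4k_pages * page_cost
 *
 * so, for example, an 8k random read pays one random base cost plus two
 * page costs.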
45  *
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
49  * 2. Control Strategy
50  *
51  * The device virtual time (vtime) is used as the primary control metric.
52  * The control strategy is composed of the following three parts.
53  *
54  * 2-1. Vtime Distribution
55  *
56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
57  * calculated.  Please consider the following hierarchy where the numbers
58  * inside parentheses denote the configured weights.
59  *
60  *           root
61  *         /       \
62  *      A (w:100)  B (w:300)
63  *      /       \
64  *  A0 (w:100)  A1 (w:100)
65  *
66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
68  * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
69  * 12.5% each.  The distribution mechanism only cares about these flattened
70  * shares.  They're called hweights (hierarchical weights) and always add
71  * up to 1 (WEIGHT_ONE).
72  *
73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75  * against the device vtime - an IO which takes 10ms on the underlying
76  * device is considered to take 80ms on A0.
77  *
78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
80  * the vtime consumed by past IOs and can issue a new IO iff doing so
81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
82  * suspended until the vtime has progressed enough to cover it.
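 *
 * A minimal sketch of that admission logic (illustrative only, not the
 * actual code paths below):
 *
 *   cost    = abs_cost * WEIGHT_ONE / hweight_inuse;
 *   vbudget = vnow - iocg->vtime;
 *   if (cost <= vbudget)
 *           issue the bio and add cost to iocg->vtime;
 *   else
 *           wait (or accrue debt) until vnow advances far enough;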
83  *
84  * 2-2. Vrate Adjustment
85  *
86  * It's unrealistic to expect the cost model to be perfect.  There are too
87  * many devices and even on the same device the overall performance
88  * fluctuates depending on numerous factors such as IO mixture and device
89  * internal garbage collection.  The controller needs to adapt dynamically.
90  *
91  * This is achieved by adjusting the overall IO rate according to how busy
92  * the device is.  If the device becomes overloaded, we're sending down too
93  * many IOs and should generally slow down.  If there are waiting issuers
94  * but the device isn't saturated, we're issuing too few and should
95  * generally speed up.
96  *
97  * To slow down, we lower the vrate - the rate at which the device vtime
98  * passes compared to the wall clock.  For example, if the vtime is running
99  * at the vrate of 75%, all cgroups added up would only be able to issue
100  * 750ms worth of IOs per second, and vice-versa for speeding up.
101  *
102  * Device busyness is determined using two criteria - rq wait and
103  * completion latencies.
104  *
105  * When a device gets saturated, the on-device and then the request queues
106  * fill up and a bio which is ready to be issued has to wait for a request
107  * to become available.  When this delay becomes noticeable, it's a clear
108  * indication that the device is saturated and we lower the vrate.  This
109  * saturation signal is fairly conservative as it only triggers when both
110  * hardware and software queues are filled up, and is used as the default
111  * busy signal.
112  *
113  * As devices can have deep queues and be unfair in how the queued commands
114  * are executed, solely depending on rq wait may not result in satisfactory
115  * control quality.  For a better control quality, completion latency QoS
116  * parameters can be configured so that the device is considered saturated
117  * if N'th percentile completion latency rises above the set point.
118  *
119  * The completion latency requirements are a function of both the
120  * underlying device characteristics and the desired IO latency quality of
121  * service.  There is an inherent trade-off - the tighter the latency QoS,
122  * the higher the bandwidth lossage.  Latency QoS is disabled by default
123  * and can be set through /sys/fs/cgroup/io.cost.qos.
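 *
 * For example (illustrative values; see Documentation/admin-guide/
 * cgroup-v2.rst for the exact io.cost.qos syntax), a line along the lines
 * of
 *
 *   8:16 enable=1 ctrl=user rpct=95.00 rlat=25000 wpct=95.00 wlat=25000
 *
 * would consider the device saturated when the 95th percentile read or
 * write completion latency exceeds 25ms.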
124  *
125  * 2-3. Work Conservation
126  *
127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
128  * periodically while B is sending out enough parallel IOs to saturate the
129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
130  * cost per second, i.e., 10% of the device capacity.  The naive
131  * distribution of half and half would lead to 60% utilization of the
132  * device, a significant reduction in the total amount of work done
133  * compared to free-for-all competition.  This is too high a cost to pay
134  * for IO control.
135  *
136  * To conserve the total amount of work done, we keep track of how much
137  * each active cgroup is actually using and yield part of its weight if
138  * there are other cgroups which can make use of it.  In the above case,
139  * A's weight will be lowered so that it hovers above the actual usage and
140  * B would be able to use the rest.
141  *
142  * As we don't want to penalize a cgroup for donating its weight, the
143  * surplus weight adjustment factors in a margin and has an immediate
144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
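 *
 * Concretely (see the SURPLUS_* constants below), an active cgroup is
 * treated as having surplus when usage% * 125% + 2% still falls short of
 * its hweight_inuse% by more than 3%, and its inuse weight is then lowered
 * so that it hovers just above that scaled usage.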
145  *
146  * Note that adjusting down surplus weights has the same effects as
147  * accelerating vtime for other cgroups and work conservation can also be
148  * implemented by adjusting vrate dynamically.  However, squaring who can
149  * donate and should take back how much requires hweight propagations
150  * anyway making it easier to implement and understand as a separate
151  * mechanism.
152  *
153  * 3. Monitoring
154  *
155  * Instead of debugfs or other clumsy monitoring mechanisms, this
156  * controller uses a drgn based monitoring script -
157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
158  * https://github.com/osandov/drgn.  The output looks like the following.
159  *
160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161  *                 active      weight      hweight% inflt% dbt  delay usages%
162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
164  *
165  * - per        : Timer period
166  * - cur_per    : Internal wall and device vtime clock
167  * - vrate      : Device virtual time rate against wall clock
168  * - weight     : Surplus-adjusted and configured weights
169  * - hweight    : Surplus-adjusted and configured hierarchical weights
170  * - inflt      : The percentage of in-flight IO cost at the end of last period
171  * - delay      : Deferred issuer delay induction level and duration
172  * - usages     : Usage history
173  */
174
175 #include <linux/kernel.h>
176 #include <linux/module.h>
177 #include <linux/timer.h>
178 #include <linux/time64.h>
179 #include <linux/parser.h>
180 #include <linux/sched/signal.h>
181 #include <linux/blk-cgroup.h>
182 #include <asm/local.h>
183 #include <asm/local64.h>
184 #include "blk-rq-qos.h"
185 #include "blk-stat.h"
186 #include "blk-wbt.h"
187
188 #ifdef CONFIG_TRACEPOINTS
189
190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191 #define TRACE_IOCG_PATH_LEN 1024
192 static DEFINE_SPINLOCK(trace_iocg_path_lock);
193 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194
195 #define TRACE_IOCG_PATH(type, iocg, ...)                                        \
196         do {                                                                    \
197                 unsigned long flags;                                            \
198                 if (trace_iocost_##type##_enabled()) {                          \
199                         spin_lock_irqsave(&trace_iocg_path_lock, flags);        \
200                         cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,      \
201                                     trace_iocg_path, TRACE_IOCG_PATH_LEN);      \
202                         trace_iocost_##type(iocg, trace_iocg_path,              \
203                                               ##__VA_ARGS__);                   \
204                         spin_unlock_irqrestore(&trace_iocg_path_lock, flags);   \
205                 }                                                               \
206         } while (0)
207
208 #else   /* CONFIG_TRACEPOINTS */
209 #define TRACE_IOCG_PATH(type, iocg, ...)        do { } while (0)
210 #endif  /* CONFIG_TRACEPOINTS */
211
212 enum {
213         MILLION                 = 1000000,
214
215         /* timer period is calculated from latency requirements, bound it */
216         MIN_PERIOD              = USEC_PER_MSEC,
217         MAX_PERIOD              = USEC_PER_SEC,
218
219         /*
220          * A cgroup's vtime can run 50% behind the device vtime, which
221          * serves as its IO credit buffer.  Surplus weight adjustment is
222          * immediately canceled if the vtime margin runs below 10%.
223          */
224         MARGIN_MIN_PCT          = 10,
225         MARGIN_MAX_PCT          = 50,
226
227         /* Have some play in timer operations */
228         TIMER_SLACK_PCT         = 1,
229
230         /*
231          * vtime can wrap well within a reasonable uptime when vrate is
232          * consistently raised.  Don't trust recorded cgroup vtime if the
233          * period counter indicates that it's older than 5mins.
234          */
235         VTIME_VALID_DUR         = 300 * USEC_PER_SEC,
236
237         /*
238          * Remember the past three non-zero usages and use the max for
239          * surplus calculation.  Three slots guarantee that we remember one
240          * full period usage from the last active stretch even after
241          * partial deactivation and re-activation periods.  Don't start
242          * giving away weight before collecting two data points to prevent
243          * hweight adjustments based on one partial activation period.
244          */
245         NR_USAGE_SLOTS          = 3,
246         MIN_VALID_USAGES        = 2,
247
248         /* 1/64k is granular enough and can easily be handled w/ u32 */
249         WEIGHT_ONE              = 1 << 16,
250
251         /*
252          * As vtime is used to calculate the cost of each IO, it needs to
253          * be fairly high precision.  For example, it should be able to
254          * represent the cost of a single page worth of discard with
255          * sufficient accuracy.  At the same time, it should be able to
256          * represent reasonably long enough durations to be useful and
257          * convenient during operation.
258          *
259          * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
260          * granularity and days of wrap-around time even at extreme vrates.
261          */
262         VTIME_PER_SEC_SHIFT     = 37,
263         VTIME_PER_SEC           = 1LLU << VTIME_PER_SEC_SHIFT,
264         VTIME_PER_USEC          = VTIME_PER_SEC / USEC_PER_SEC,
265         VTIME_PER_NSEC          = VTIME_PER_SEC / NSEC_PER_SEC,
266
267         /* bound vrate adjustments within two orders of magnitude */
268         VRATE_MIN_PPM           = 10000,        /* 1% */
269         VRATE_MAX_PPM           = 100000000,    /* 10000% */
270
271         VRATE_MIN               = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
272         VRATE_CLAMP_ADJ_PCT     = 4,
273
274         /* if IOs end up waiting for requests, issue less */
275         RQ_WAIT_BUSY_PCT        = 5,
276
277         /* unbusy hysteresis */
278         UNBUSY_THR_PCT          = 75,
279
280         /* don't let cmds which take a very long time pin lagging for too long */
281         MAX_LAGGING_PERIODS     = 10,
282
283         /*
284          * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
285          * donate the surplus.
286          */
287         SURPLUS_SCALE_PCT       = 125,                  /* * 125% */
288         SURPLUS_SCALE_ABS       = WEIGHT_ONE / 50,      /* + 2% */
289         SURPLUS_MIN_ADJ_DELTA   = WEIGHT_ONE / 33,      /* 3% */
290
291         /* switch iff the conditions are met for longer than this */
292         AUTOP_CYCLE_NSEC        = 10LLU * NSEC_PER_SEC,
293
294         /*
295          * Count IO size in 4k pages.  The 12bit shift helps keeping
296          * size-proportional components of cost calculation in closer
297          * numbers of digits to per-IO cost components.
298          */
299         IOC_PAGE_SHIFT          = 12,
300         IOC_PAGE_SIZE           = 1 << IOC_PAGE_SHIFT,
301         IOC_SECT_TO_PAGE_SHIFT  = IOC_PAGE_SHIFT - SECTOR_SHIFT,
302
303         /* if apart further than 16M, consider randio for linear model */
304         LCOEF_RANDIO_PAGES      = 4096,
305 };
306
307 enum ioc_running {
308         IOC_IDLE,
309         IOC_RUNNING,
310         IOC_STOP,
311 };
312
313 /* io.cost.qos controls including per-dev enable of the whole controller */
314 enum {
315         QOS_ENABLE,
316         QOS_CTRL,
317         NR_QOS_CTRL_PARAMS,
318 };
319
320 /* io.cost.qos params */
321 enum {
322         QOS_RPPM,
323         QOS_RLAT,
324         QOS_WPPM,
325         QOS_WLAT,
326         QOS_MIN,
327         QOS_MAX,
328         NR_QOS_PARAMS,
329 };
330
331 /* io.cost.model controls */
332 enum {
333         COST_CTRL,
334         COST_MODEL,
335         NR_COST_CTRL_PARAMS,
336 };
337
338 /* builtin linear cost model coefficients */
339 enum {
340         I_LCOEF_RBPS,
341         I_LCOEF_RSEQIOPS,
342         I_LCOEF_RRANDIOPS,
343         I_LCOEF_WBPS,
344         I_LCOEF_WSEQIOPS,
345         I_LCOEF_WRANDIOPS,
346         NR_I_LCOEFS,
347 };
348
349 enum {
350         LCOEF_RPAGE,
351         LCOEF_RSEQIO,
352         LCOEF_RRANDIO,
353         LCOEF_WPAGE,
354         LCOEF_WSEQIO,
355         LCOEF_WRANDIO,
356         NR_LCOEFS,
357 };
358
359 enum {
360         AUTOP_INVALID,
361         AUTOP_HDD,
362         AUTOP_SSD_QD1,
363         AUTOP_SSD_DFL,
364         AUTOP_SSD_FAST,
365 };
366
367 struct ioc_gq;
368
369 struct ioc_params {
370         u32                             qos[NR_QOS_PARAMS];
371         u64                             i_lcoefs[NR_I_LCOEFS];
372         u64                             lcoefs[NR_LCOEFS];
373         u32                             too_fast_vrate_pct;
374         u32                             too_slow_vrate_pct;
375 };
376
377 struct ioc_margins {
378         s64                             min;
379         s64                             max;
380 };
381
382 struct ioc_missed {
383         local_t                         nr_met;
384         local_t                         nr_missed;
385         u32                             last_met;
386         u32                             last_missed;
387 };
388
389 struct ioc_pcpu_stat {
390         struct ioc_missed               missed[2];
391
392         local64_t                       rq_wait_ns;
393         u64                             last_rq_wait_ns;
394 };
395
396 /* per device */
397 struct ioc {
398         struct rq_qos                   rqos;
399
400         bool                            enabled;
401
402         struct ioc_params               params;
403         struct ioc_margins              margins;
404         u32                             period_us;
405         u32                             timer_slack_ns;
406         u64                             vrate_min;
407         u64                             vrate_max;
408
409         spinlock_t                      lock;
410         struct timer_list               timer;
411         struct list_head                active_iocgs;   /* active cgroups */
412         struct ioc_pcpu_stat __percpu   *pcpu_stat;
413
414         enum ioc_running                running;
415         atomic64_t                      vtime_rate;
416
417         seqcount_spinlock_t             period_seqcount;
418         u64                             period_at;      /* wallclock starttime */
419         u64                             period_at_vtime; /* vtime starttime */
420
421         atomic64_t                      cur_period;     /* inc'd each period */
422         int                             busy_level;     /* saturation history */
423
424         bool                            weights_updated;
425         atomic_t                        hweight_gen;    /* for lazy hweights */
426
427         u64                             autop_too_fast_at;
428         u64                             autop_too_slow_at;
429         int                             autop_idx;
430         bool                            user_qos_params:1;
431         bool                            user_cost_model:1;
432 };
433
434 struct iocg_pcpu_stat {
435         local64_t                       abs_vusage;
436 };
437
438 struct iocg_stat {
439         u64                             usage_us;
440 };
441
442 /* per device-cgroup pair */
443 struct ioc_gq {
444         struct blkg_policy_data         pd;
445         struct ioc                      *ioc;
446
447         /*
448          * An iocg can get its weight from two sources - an explicit
449          * per-device-cgroup configuration or the default weight of the
450          * cgroup.  `cfg_weight` is the explicit per-device-cgroup
451          * configuration.  `weight` is the effective weight considering both
452          * sources.
453          *
454          * When an idle cgroup becomes active its `active` goes from 0 to
455          * `weight`.  `inuse` is the surplus adjusted active weight.
456          * `active` and `inuse` are used to calculate `hweight_active` and
457          * `hweight_inuse`.
458          *
459          * `last_inuse` remembers `inuse` while an iocg is idle to persist
460          * surplus adjustments.
461          */
462         u32                             cfg_weight;
463         u32                             weight;
464         u32                             active;
465         u32                             inuse;
466         u32                             last_inuse;
467
468         sector_t                        cursor;         /* to detect randio */
469
470         /*
471          * `vtime` is this iocg's vtime cursor which progresses as IOs are
472          * issued.  If lagging behind device vtime, the delta represents
473          * the currently available IO budget.  If running ahead, the
474          * overage.
475          *
476          * `done_vtime` is the same but progressed on completion rather
477          * than issue.  The delta behind `vtime` represents the cost of
478          * currently in-flight IOs.
479          */
480         atomic64_t                      vtime;
481         atomic64_t                      done_vtime;
482         u64                             abs_vdebt;
483
484         /*
485          * The period this iocg was last active in.  Used for deactivation
486          * and invalidating `vtime`.
487          */
488         atomic64_t                      active_period;
489         struct list_head                active_list;
490
491         /* see __propagate_weights() and current_hweight() for details */
492         u64                             child_active_sum;
493         u64                             child_inuse_sum;
494         int                             hweight_gen;
495         u32                             hweight_active;
496         u32                             hweight_inuse;
497         bool                            has_surplus;
498
499         struct list_head                walk_list;
500
501         struct wait_queue_head          waitq;
502         struct hrtimer                  waitq_timer;
503         struct hrtimer                  delay_timer;
504
505         /* timestamp at the latest activation */
506         u64                             activated_at;
507
508         /* statistics */
509         struct iocg_pcpu_stat __percpu  *pcpu_stat;
510         struct iocg_stat                local_stat;
511         struct iocg_stat                desc_stat;
512         struct iocg_stat                last_stat;
513         u64                             last_stat_abs_vusage;
514
515         /* usage is recorded as fractions of WEIGHT_ONE */
516         u32                             usage_delta_us;
517         int                             usage_idx;
518         u32                             usages[NR_USAGE_SLOTS];
519
520         /* this iocg's depth in the hierarchy and ancestors including self */
521         int                             level;
522         struct ioc_gq                   *ancestors[];
523 };
524
525 /* per cgroup */
526 struct ioc_cgrp {
527         struct blkcg_policy_data        cpd;
528         unsigned int                    dfl_weight;
529 };
530
531 struct ioc_now {
532         u64                             now_ns;
533         u64                             now;
534         u64                             vnow;
535         u64                             vrate;
536 };
537
538 struct iocg_wait {
539         struct wait_queue_entry         wait;
540         struct bio                      *bio;
541         u64                             abs_cost;
542         bool                            committed;
543 };
544
545 struct iocg_wake_ctx {
546         struct ioc_gq                   *iocg;
547         u32                             hw_inuse;
548         s64                             vbudget;
549 };
550
551 static const struct ioc_params autop[] = {
552         [AUTOP_HDD] = {
553                 .qos                            = {
554                         [QOS_RLAT]              =        250000, /* 250ms */
555                         [QOS_WLAT]              =        250000,
556                         [QOS_MIN]               = VRATE_MIN_PPM,
557                         [QOS_MAX]               = VRATE_MAX_PPM,
558                 },
559                 .i_lcoefs                       = {
560                         [I_LCOEF_RBPS]          =     174019176,
561                         [I_LCOEF_RSEQIOPS]      =         41708,
562                         [I_LCOEF_RRANDIOPS]     =           370,
563                         [I_LCOEF_WBPS]          =     178075866,
564                         [I_LCOEF_WSEQIOPS]      =         42705,
565                         [I_LCOEF_WRANDIOPS]     =           378,
566                 },
567         },
568         [AUTOP_SSD_QD1] = {
569                 .qos                            = {
570                         [QOS_RLAT]              =         25000, /* 25ms */
571                         [QOS_WLAT]              =         25000,
572                         [QOS_MIN]               = VRATE_MIN_PPM,
573                         [QOS_MAX]               = VRATE_MAX_PPM,
574                 },
575                 .i_lcoefs                       = {
576                         [I_LCOEF_RBPS]          =     245855193,
577                         [I_LCOEF_RSEQIOPS]      =         61575,
578                         [I_LCOEF_RRANDIOPS]     =          6946,
579                         [I_LCOEF_WBPS]          =     141365009,
580                         [I_LCOEF_WSEQIOPS]      =         33716,
581                         [I_LCOEF_WRANDIOPS]     =         26796,
582                 },
583         },
584         [AUTOP_SSD_DFL] = {
585                 .qos                            = {
586                         [QOS_RLAT]              =         25000, /* 25ms */
587                         [QOS_WLAT]              =         25000,
588                         [QOS_MIN]               = VRATE_MIN_PPM,
589                         [QOS_MAX]               = VRATE_MAX_PPM,
590                 },
591                 .i_lcoefs                       = {
592                         [I_LCOEF_RBPS]          =     488636629,
593                         [I_LCOEF_RSEQIOPS]      =          8932,
594                         [I_LCOEF_RRANDIOPS]     =          8518,
595                         [I_LCOEF_WBPS]          =     427891549,
596                         [I_LCOEF_WSEQIOPS]      =         28755,
597                         [I_LCOEF_WRANDIOPS]     =         21940,
598                 },
599                 .too_fast_vrate_pct             =           500,
600         },
601         [AUTOP_SSD_FAST] = {
602                 .qos                            = {
603                         [QOS_RLAT]              =          5000, /* 5ms */
604                         [QOS_WLAT]              =          5000,
605                         [QOS_MIN]               = VRATE_MIN_PPM,
606                         [QOS_MAX]               = VRATE_MAX_PPM,
607                 },
608                 .i_lcoefs                       = {
609                         [I_LCOEF_RBPS]          =    3102524156LLU,
610                         [I_LCOEF_RSEQIOPS]      =        724816,
611                         [I_LCOEF_RRANDIOPS]     =        778122,
612                         [I_LCOEF_WBPS]          =    1742780862LLU,
613                         [I_LCOEF_WSEQIOPS]      =        425702,
614                         [I_LCOEF_WRANDIOPS]     =        443193,
615                 },
616                 .too_slow_vrate_pct             =            10,
617         },
618 };
619
620 /*
621  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
622  * vtime credit shortage and down on device saturation.
623  */
624 static u32 vrate_adj_pct[] =
625         { 0, 0, 0, 0,
626           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
627           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
628           4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
629
630 static struct blkcg_policy blkcg_policy_iocost;
631
632 /* accessors and helpers */
633 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
634 {
635         return container_of(rqos, struct ioc, rqos);
636 }
637
638 static struct ioc *q_to_ioc(struct request_queue *q)
639 {
640         return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
641 }
642
643 static const char *q_name(struct request_queue *q)
644 {
645         if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
646                 return kobject_name(q->kobj.parent);
647         else
648                 return "<unknown>";
649 }
650
651 static const char __maybe_unused *ioc_name(struct ioc *ioc)
652 {
653         return q_name(ioc->rqos.q);
654 }
655
656 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
657 {
658         return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
659 }
660
661 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
662 {
663         return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
664 }
665
666 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
667 {
668         return pd_to_blkg(&iocg->pd);
669 }
670
671 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
672 {
673         return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
674                             struct ioc_cgrp, cpd);
675 }
676
677 /*
678  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
679  * weight, the more expensive each IO.  Must round up.
680  */
681 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
682 {
683         return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
684 }
685
686 /*
687  * The inverse of abs_cost_to_cost().  Must round up.
688  */
689 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
690 {
691         return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
692 }
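
/*
 * Illustrative sketch only (not part of the kernel code): how the two
 * helpers above scale costs.  The helper name and numbers are made up for
 * the example.
 */
#if 0
static u64 iocost_scaling_example(void)
{
        u64 abs_cost = 1000;                    /* model-estimated device time */
        u32 hw_inuse = WEIGHT_ONE / 4;          /* 25% hierarchical share */

        /* charged vtime is scaled up 4x: 1000 * WEIGHT_ONE / (WEIGHT_ONE / 4) */
        u64 cost = abs_cost_to_cost(abs_cost, hw_inuse);        /* == 4000 */

        /* and cost_to_abs_cost() undoes the scaling, rounding up */
        return cost_to_abs_cost(cost, hw_inuse);                /* == 1000 */
}
#endif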
693
694 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
695                             u64 abs_cost, u64 cost)
696 {
697         struct iocg_pcpu_stat *gcs;
698
699         bio->bi_iocost_cost = cost;
700         atomic64_add(cost, &iocg->vtime);
701
702         gcs = get_cpu_ptr(iocg->pcpu_stat);
703         local64_add(abs_cost, &gcs->abs_vusage);
704         put_cpu_ptr(gcs);
705 }
706
707 static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
708 {
709         if (lock_ioc) {
710                 spin_lock_irqsave(&iocg->ioc->lock, *flags);
711                 spin_lock(&iocg->waitq.lock);
712         } else {
713                 spin_lock_irqsave(&iocg->waitq.lock, *flags);
714         }
715 }
716
717 static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
718 {
719         if (unlock_ioc) {
720                 spin_unlock(&iocg->waitq.lock);
721                 spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
722         } else {
723                 spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
724         }
725 }
726
727 #define CREATE_TRACE_POINTS
728 #include <trace/events/iocost.h>
729
730 static void ioc_refresh_margins(struct ioc *ioc)
731 {
732         struct ioc_margins *margins = &ioc->margins;
733         u32 period_us = ioc->period_us;
734         u64 vrate = atomic64_read(&ioc->vtime_rate);
735
736         margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
737         margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate;
738 }
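
/*
 * Example for ioc_refresh_margins() above: with a 50ms period and the
 * default vrate of VTIME_PER_USEC, margins.min corresponds to 5ms and
 * margins.max to 25ms worth of device vtime.
 */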
739
740 /* latency QoS params changed, update period_us and all the dependent params */
741 static void ioc_refresh_period_us(struct ioc *ioc)
742 {
743         u32 ppm, lat, multi, period_us;
744
745         lockdep_assert_held(&ioc->lock);
746
747         /* pick the higher latency target */
748         if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
749                 ppm = ioc->params.qos[QOS_RPPM];
750                 lat = ioc->params.qos[QOS_RLAT];
751         } else {
752                 ppm = ioc->params.qos[QOS_WPPM];
753                 lat = ioc->params.qos[QOS_WLAT];
754         }
755
756         /*
757          * We want the period to be long enough to contain a healthy number
758          * of IOs while short enough for granular control.  Define it as a
759          * multiple of the latency target.  Ideally, the multiplier should
760          * be scaled according to the percentile so that it would nominally
761          * contain a certain number of requests.  Let's be simpler and
762          * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
763          */
764         if (ppm)
765                 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
766         else
767                 multi = 2;
768         period_us = multi * lat;
769         period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
770
771         /* calculate dependent params */
772         ioc->period_us = period_us;
773         ioc->timer_slack_ns = div64_u64(
774                 (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
775                 100);
776         ioc_refresh_margins(ioc);
777 }
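
/*
 * Worked example for ioc_refresh_period_us() above: with a 95th percentile
 * target (ppm = 950000) and a 25ms latency target, multi becomes
 * max((1000000 - 950000) / 50000, 2) = 2, yielding a 50ms period which is
 * then clamped to [MIN_PERIOD, MAX_PERIOD].
 */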
778
779 static int ioc_autop_idx(struct ioc *ioc)
780 {
781         int idx = ioc->autop_idx;
782         const struct ioc_params *p = &autop[idx];
783         u32 vrate_pct;
784         u64 now_ns;
785
786         /* rotational? */
787         if (!blk_queue_nonrot(ioc->rqos.q))
788                 return AUTOP_HDD;
789
790         /* handle SATA SSDs w/ broken NCQ */
791         if (blk_queue_depth(ioc->rqos.q) == 1)
792                 return AUTOP_SSD_QD1;
793
794         /* use one of the normal ssd sets */
795         if (idx < AUTOP_SSD_DFL)
796                 return AUTOP_SSD_DFL;
797
798         /* if user is overriding anything, maintain what was there */
799         if (ioc->user_qos_params || ioc->user_cost_model)
800                 return idx;
801
802         /* step up/down based on the vrate */
803         vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
804                               VTIME_PER_USEC);
805         now_ns = ktime_get_ns();
806
807         if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
808                 if (!ioc->autop_too_fast_at)
809                         ioc->autop_too_fast_at = now_ns;
810                 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
811                         return idx + 1;
812         } else {
813                 ioc->autop_too_fast_at = 0;
814         }
815
816         if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
817                 if (!ioc->autop_too_slow_at)
818                         ioc->autop_too_slow_at = now_ns;
819                 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
820                         return idx - 1;
821         } else {
822                 ioc->autop_too_slow_at = 0;
823         }
824
825         return idx;
826 }
827
828 /*
829  * Take the following as input
830  *
831  *  @bps        maximum sequential throughput
832  *  @seqiops    maximum sequential 4k iops
833  *  @randiops   maximum random 4k iops
834  *
835  * and calculate the linear model cost coefficients.
836  *
837  *  *@page      per-page cost           1s / (@bps / 4096)
838  *  *@seqio     base cost of a seq IO   max((1s / @seqiops) - *@page, 0)
839  *  *@randio    base cost of a rand IO  max((1s / @randiops) - *@page, 0)
840  */
841 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
842                         u64 *page, u64 *seqio, u64 *randio)
843 {
844         u64 v;
845
846         *page = *seqio = *randio = 0;
847
848         if (bps)
849                 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
850                                            DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
851
852         if (seqiops) {
853                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
854                 if (v > *page)
855                         *seqio = v - *page;
856         }
857
858         if (randiops) {
859                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
860                 if (v > *page)
861                         *randio = v - *page;
862         }
863 }
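
/*
 * Usage sketch with made-up device numbers (for illustration only; the
 * helper name is hypothetical): a device doing 250MB/s sequentially, 40k
 * sequential 4k IOPS and 500 random 4k IOPS would get coefficients roughly
 * as annotated below.
 */
#if 0
static void calc_lcoefs_example(void)
{
        u64 page, seqio, randio;

        calc_lcoefs(250ULL << 20, 40000, 500, &page, &seqio, &randio);
        /*
         * page   ~= VTIME_PER_SEC / (250MB/s / 4k)     cost of one 4k page
         * seqio  ~= VTIME_PER_SEC / 40000 - page       seq IO base cost
         * randio ~= VTIME_PER_SEC / 500 - page         rand IO base cost
         */
}
#endif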
864
865 static void ioc_refresh_lcoefs(struct ioc *ioc)
866 {
867         u64 *u = ioc->params.i_lcoefs;
868         u64 *c = ioc->params.lcoefs;
869
870         calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
871                     &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
872         calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
873                     &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
874 }
875
876 static bool ioc_refresh_params(struct ioc *ioc, bool force)
877 {
878         const struct ioc_params *p;
879         int idx;
880
881         lockdep_assert_held(&ioc->lock);
882
883         idx = ioc_autop_idx(ioc);
884         p = &autop[idx];
885
886         if (idx == ioc->autop_idx && !force)
887                 return false;
888
889         if (idx != ioc->autop_idx)
890                 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
891
892         ioc->autop_idx = idx;
893         ioc->autop_too_fast_at = 0;
894         ioc->autop_too_slow_at = 0;
895
896         if (!ioc->user_qos_params)
897                 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
898         if (!ioc->user_cost_model)
899                 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
900
901         ioc_refresh_period_us(ioc);
902         ioc_refresh_lcoefs(ioc);
903
904         ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
905                                             VTIME_PER_USEC, MILLION);
906         ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
907                                    VTIME_PER_USEC, MILLION);
908
909         return true;
910 }
911
912 /* take a snapshot of the current [v]time and vrate */
913 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
914 {
915         unsigned seq;
916
917         now->now_ns = ktime_get();
918         now->now = ktime_to_us(now->now_ns);
919         now->vrate = atomic64_read(&ioc->vtime_rate);
920
921         /*
922          * The current vtime is
923          *
924          *   vtime at period start + (wallclock time since the start) * vrate
925          *
926          * As a consistent snapshot of `period_at_vtime` and `period_at` is
927          * needed, they're seqcount protected.
928          */
929         do {
930                 seq = read_seqcount_begin(&ioc->period_seqcount);
931                 now->vnow = ioc->period_at_vtime +
932                         (now->now - ioc->period_at) * now->vrate;
933         } while (read_seqcount_retry(&ioc->period_seqcount, seq));
934 }
935
936 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
937 {
938         WARN_ON_ONCE(ioc->running != IOC_RUNNING);
939
940         write_seqcount_begin(&ioc->period_seqcount);
941         ioc->period_at = now->now;
942         ioc->period_at_vtime = now->vnow;
943         write_seqcount_end(&ioc->period_seqcount);
944
945         ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
946         add_timer(&ioc->timer);
947 }
948
949 /*
950  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
951  * weight sums and propagate upwards accordingly.
952  */
953 static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
954 {
955         struct ioc *ioc = iocg->ioc;
956         int lvl;
957
958         lockdep_assert_held(&ioc->lock);
959
960         inuse = clamp_t(u32, inuse, 1, active);
961
962         if (active == iocg->active && inuse == iocg->inuse)
963                 return;
964
965         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
966                 struct ioc_gq *parent = iocg->ancestors[lvl];
967                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
968                 u32 parent_active = 0, parent_inuse = 0;
969
970                 /* update the level sums */
971                 parent->child_active_sum += (s32)(active - child->active);
972                 parent->child_inuse_sum += (s32)(inuse - child->inuse);
973                 /* apply the updates */
974                 child->active = active;
975                 child->inuse = inuse;
976
977                 /*
978                  * The delta between the inuse and active sums is the amount
979                  * of weight being given away.  Parent's inuse and active
980                  * should reflect the ratio.
981                  */
982                 if (parent->child_active_sum) {
983                         parent_active = parent->weight;
984                         parent_inuse = DIV64_U64_ROUND_UP(
985                                 parent_active * parent->child_inuse_sum,
986                                 parent->child_active_sum);
987                 }
988
989                 /* do we need to keep walking up? */
990                 if (parent_active == parent->active &&
991                     parent_inuse == parent->inuse)
992                         break;
993
994                 active = parent_active;
995                 inuse = parent_inuse;
996         }
997
998         ioc->weights_updated = true;
999 }
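
/*
 * Worked example for the propagation above: if a parent's children sum to
 * child_active_sum = 200 and child_inuse_sum = 150, the parent advertises
 * parent->weight * 150 / 200 as its own inuse, i.e. the 25% donated by its
 * children is passed further up the tree.
 */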
1000
1001 static void commit_weights(struct ioc *ioc)
1002 {
1003         lockdep_assert_held(&ioc->lock);
1004
1005         if (ioc->weights_updated) {
1006                 /* paired with rmb in current_hweight(), see there */
1007                 smp_wmb();
1008                 atomic_inc(&ioc->hweight_gen);
1009                 ioc->weights_updated = false;
1010         }
1011 }
1012
1013 static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
1014 {
1015         __propagate_weights(iocg, active, inuse);
1016         commit_weights(iocg->ioc);
1017 }
1018
1019 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1020 {
1021         struct ioc *ioc = iocg->ioc;
1022         int lvl;
1023         u32 hwa, hwi;
1024         int ioc_gen;
1025
1026         /* hot path - if uptodate, use cached */
1027         ioc_gen = atomic_read(&ioc->hweight_gen);
1028         if (ioc_gen == iocg->hweight_gen)
1029                 goto out;
1030
1031         /*
1032          * Paired with wmb in commit_weights(). If we saw the updated
1033          * hweight_gen, all the weight updates from __propagate_weights() are
1034          * visible too.
1035          *
1036          * We can race with weight updates during calculation and get it
1037          * wrong.  However, hweight_gen would have changed and a future
1038          * reader will recalculate and we're guaranteed to discard the
1039          * wrong result soon.
1040          */
1041         smp_rmb();
1042
1043         hwa = hwi = WEIGHT_ONE;
1044         for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1045                 struct ioc_gq *parent = iocg->ancestors[lvl];
1046                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1047                 u64 active_sum = READ_ONCE(parent->child_active_sum);
1048                 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
1049                 u32 active = READ_ONCE(child->active);
1050                 u32 inuse = READ_ONCE(child->inuse);
1051
1052                 /* we can race with deactivations and either may read as zero */
1053                 if (!active_sum || !inuse_sum)
1054                         continue;
1055
1056                 active_sum = max_t(u64, active, active_sum);
1057                 hwa = div64_u64((u64)hwa * active, active_sum);
1058
1059                 inuse_sum = max_t(u64, inuse, inuse_sum);
1060                 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
1061         }
1062
1063         iocg->hweight_active = max_t(u32, hwa, 1);
1064         iocg->hweight_inuse = max_t(u32, hwi, 1);
1065         iocg->hweight_gen = ioc_gen;
1066 out:
1067         if (hw_activep)
1068                 *hw_activep = iocg->hweight_active;
1069         if (hw_inusep)
1070                 *hw_inusep = iocg->hweight_inuse;
1071 }
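
/*
 * Worked example matching the hierarchy in the comment at the top of the
 * file: with A=100 vs B=300 active at the root and A0=100 vs A1=100 under
 * A, A0's hweight_active is WEIGHT_ONE * 100/400 * 100/200, i.e. 12.5%.
 */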
1072
1073 static void weight_updated(struct ioc_gq *iocg)
1074 {
1075         struct ioc *ioc = iocg->ioc;
1076         struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1077         struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1078         u32 weight;
1079
1080         lockdep_assert_held(&ioc->lock);
1081
1082         weight = iocg->cfg_weight ?: iocc->dfl_weight;
1083         if (weight != iocg->weight && iocg->active)
1084                 propagate_weights(iocg, weight,
1085                                   DIV64_U64_ROUND_UP((u64)iocg->inuse * weight,
1086                                                      iocg->weight));
1087         iocg->weight = weight;
1088 }
1089
1090 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1091 {
1092         struct ioc *ioc = iocg->ioc;
1093         u64 last_period, cur_period, max_period_delta;
1094         u64 vtime, vmin;
1095         int i;
1096
1097         /*
1098          * If we seem to be already active, just update the stamp to tell the
1099          * timer that we're still active.  We don't mind occasional races.
1100          */
1101         if (!list_empty(&iocg->active_list)) {
1102                 ioc_now(ioc, now);
1103                 cur_period = atomic64_read(&ioc->cur_period);
1104                 if (atomic64_read(&iocg->active_period) != cur_period)
1105                         atomic64_set(&iocg->active_period, cur_period);
1106                 return true;
1107         }
1108
1109         /* racy check on internal node IOs, treat as root level IOs */
1110         if (iocg->child_active_sum)
1111                 return false;
1112
1113         spin_lock_irq(&ioc->lock);
1114
1115         ioc_now(ioc, now);
1116
1117         /* update period */
1118         cur_period = atomic64_read(&ioc->cur_period);
1119         last_period = atomic64_read(&iocg->active_period);
1120         atomic64_set(&iocg->active_period, cur_period);
1121
1122         /* already activated or breaking leaf-only constraint? */
1123         if (!list_empty(&iocg->active_list))
1124                 goto succeed_unlock;
1125         for (i = iocg->level - 1; i > 0; i--)
1126                 if (!list_empty(&iocg->ancestors[i]->active_list))
1127                         goto fail_unlock;
1128
1129         if (iocg->child_active_sum)
1130                 goto fail_unlock;
1131
1132         /*
1133          * vtime may wrap when vrate is raised substantially due to
1134          * underestimated IO costs.  Look at the period and ignore its
1135          * vtime if the iocg has been idle for too long.  Also, cap the
1136          * budget it can start with to the margin.
1137          */
1138         max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1139         vtime = atomic64_read(&iocg->vtime);
1140         vmin = now->vnow - ioc->margins.max;
1141
1142         if (last_period + max_period_delta < cur_period ||
1143             time_before64(vtime, vmin)) {
1144                 atomic64_add(vmin - vtime, &iocg->vtime);
1145                 atomic64_add(vmin - vtime, &iocg->done_vtime);
1146                 vtime = vmin;
1147         }
1148
1149         /*
1150          * Activate, propagate weight and start period timer if not
1151          * running.  Reset hweight_gen to avoid accidental match from
1152          * wrapping.
1153          */
1154         iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1155         list_add(&iocg->active_list, &ioc->active_iocgs);
1156         propagate_weights(iocg, iocg->weight,
1157                           iocg->last_inuse ?: iocg->weight);
1158
1159         TRACE_IOCG_PATH(iocg_activate, iocg, now,
1160                         last_period, cur_period, vtime);
1161
1162         iocg->activated_at = now->now;
1163
1164         if (ioc->running == IOC_IDLE) {
1165                 ioc->running = IOC_RUNNING;
1166                 ioc_start_period(ioc, now);
1167         }
1168
1169 succeed_unlock:
1170         spin_unlock_irq(&ioc->lock);
1171         return true;
1172
1173 fail_unlock:
1174         spin_unlock_irq(&ioc->lock);
1175         return false;
1176 }
1177
1178 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1179 {
1180         struct ioc *ioc = iocg->ioc;
1181         struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1182         u64 vtime = atomic64_read(&iocg->vtime);
1183         u64 delta_ns, expires, oexpires;
1184         u32 hw_inuse;
1185
1186         lockdep_assert_held(&iocg->waitq.lock);
1187
1188         /* debt-adjust vtime */
1189         current_hweight(iocg, NULL, &hw_inuse);
1190         vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1191
1192         /*
1193          * Clear or maintain depending on the overage. Non-zero vdebt is what
1194          * guarantees that @iocg is online and future iocg_kick_delay() will
1195          * clear use_delay. Don't leave it on when there's no vdebt.
1196          */
1197         if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1198                 blkcg_clear_delay(blkg);
1199                 return false;
1200         }
1201         if (!atomic_read(&blkg->use_delay) &&
1202             time_before_eq64(vtime, now->vnow + ioc->margins.max))
1203                 return false;
1204
1205         /* use delay */
1206         delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1207                                       now->vrate) * NSEC_PER_USEC;
1208         blkcg_set_delay(blkg, delta_ns);
1209         expires = now->now_ns + delta_ns;
1210
1211         /* if already active and close enough, don't bother */
1212         oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1213         if (hrtimer_is_queued(&iocg->delay_timer) &&
1214             abs(oexpires - expires) <= ioc->timer_slack_ns)
1215                 return true;
1216
1217         hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1218                                ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1219         return true;
1220 }
1221
1222 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1223 {
1224         struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1225         struct ioc_now now;
1226         unsigned long flags;
1227
1228         spin_lock_irqsave(&iocg->waitq.lock, flags);
1229         ioc_now(iocg->ioc, &now);
1230         iocg_kick_delay(iocg, &now);
1231         spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1232
1233         return HRTIMER_NORESTART;
1234 }
1235
1236 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1237                         int flags, void *key)
1238 {
1239         struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1240         struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1241         u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1242
1243         ctx->vbudget -= cost;
1244
1245         if (ctx->vbudget < 0)
1246                 return -1;
1247
1248         iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1249
1250         /*
1251          * autoremove_wake_function() removes the wait entry only when it
1252          * actually changed the task state.  We want the wait always
1253          * removed.  Remove explicitly and use default_wake_function().
1254          */
1255         list_del_init(&wq_entry->entry);
1256         wait->committed = true;
1257
1258         default_wake_function(wq_entry, mode, flags, key);
1259         return 0;
1260 }
1261
1262 /*
1263  * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1264  * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1265  * addition to iocg->waitq.lock.
1266  */
1267 static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1268                             struct ioc_now *now)
1269 {
1270         struct ioc *ioc = iocg->ioc;
1271         struct iocg_wake_ctx ctx = { .iocg = iocg };
1272         u64 vshortage, expires, oexpires;
1273         s64 vbudget;
1274         u32 hw_inuse;
1275
1276         lockdep_assert_held(&iocg->waitq.lock);
1277
1278         current_hweight(iocg, NULL, &hw_inuse);
1279         vbudget = now->vnow - atomic64_read(&iocg->vtime);
1280
1281         /* pay off debt */
1282         if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1283                 u64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1284                 u64 delta = min_t(u64, vbudget, vdebt);
1285                 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1286                                     iocg->abs_vdebt);
1287
1288                 lockdep_assert_held(&ioc->lock);
1289
1290                 atomic64_add(delta, &iocg->vtime);
1291                 atomic64_add(delta, &iocg->done_vtime);
1292                 iocg->abs_vdebt -= abs_delta;
1293                 vbudget -= vdebt;
1294
1295                 iocg_kick_delay(iocg, now);
1296         }
1297
1298         /*
1299          * Debt can still be outstanding if we haven't paid all yet or the
1300          * caller raced and called without @pay_debt. Shouldn't wake up waiters
1301          * under debt. Make sure @vbudget reflects the outstanding amount and is
1302          * not positive.
1303          */
1304         if (iocg->abs_vdebt) {
1305                 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1306                 vbudget = min_t(s64, 0, vbudget - vdebt);
1307         }
1308
1309         /*
1310          * Wake up the ones which are due and see how much vtime we'll need
1311          * for the next one.
1312          */
1313         ctx.hw_inuse = hw_inuse;
1314         ctx.vbudget = vbudget;
1315         __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1316         if (!waitqueue_active(&iocg->waitq))
1317                 return;
1318         if (WARN_ON_ONCE(ctx.vbudget >= 0))
1319                 return;
1320
1321         /* determine next wakeup, add a timer margin to guarantee chunking */
1322         vshortage = -ctx.vbudget;
1323         expires = now->now_ns +
1324                 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1325         expires += ioc->timer_slack_ns;
1326
1327         /* if already active and close enough, don't bother */
1328         oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1329         if (hrtimer_is_queued(&iocg->waitq_timer) &&
1330             abs(oexpires - expires) <= ioc->timer_slack_ns)
1331                 return;
1332
1333         hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1334                                ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1335 }
1336
1337 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1338 {
1339         struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1340         bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1341         struct ioc_now now;
1342         unsigned long flags;
1343
1344         ioc_now(iocg->ioc, &now);
1345
1346         iocg_lock(iocg, pay_debt, &flags);
1347         iocg_kick_waitq(iocg, pay_debt, &now);
1348         iocg_unlock(iocg, pay_debt, &flags);
1349
1350         return HRTIMER_NORESTART;
1351 }
1352
1353 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1354 {
1355         u32 nr_met[2] = { };
1356         u32 nr_missed[2] = { };
1357         u64 rq_wait_ns = 0;
1358         int cpu, rw;
1359
1360         for_each_online_cpu(cpu) {
1361                 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1362                 u64 this_rq_wait_ns;
1363
1364                 for (rw = READ; rw <= WRITE; rw++) {
1365                         u32 this_met = local_read(&stat->missed[rw].nr_met);
1366                         u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1367
1368                         nr_met[rw] += this_met - stat->missed[rw].last_met;
1369                         nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1370                         stat->missed[rw].last_met = this_met;
1371                         stat->missed[rw].last_missed = this_missed;
1372                 }
1373
1374                 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1375                 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1376                 stat->last_rq_wait_ns = this_rq_wait_ns;
1377         }
1378
1379         for (rw = READ; rw <= WRITE; rw++) {
1380                 if (nr_met[rw] + nr_missed[rw])
1381                         missed_ppm_ar[rw] =
1382                                 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1383                                                    nr_met[rw] + nr_missed[rw]);
1384                 else
1385                         missed_ppm_ar[rw] = 0;
1386         }
1387
1388         *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1389                                    ioc->period_us * NSEC_PER_USEC);
1390 }
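
/*
 * Worked example (illustrative numbers): if a period saw nr_met[READ] == 900
 * and nr_missed[READ] == 100, then
 *
 *	missed_ppm_ar[READ] = 100 * MILLION / (900 + 100) = 100000	(10%)
 *
 * and, with period_us == 50000 and 25ms of rq wait accumulated across CPUs,
 *
 *	rq_wait_pct = 25000000 * 100 / (50000 * 1000) = 50
 *
 * i.e. request allocation waits consumed about half of one period.
 */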
1391
1392 /* was iocg idle this period? */
1393 static bool iocg_is_idle(struct ioc_gq *iocg)
1394 {
1395         struct ioc *ioc = iocg->ioc;
1396
1397         /* did something get issued this period? */
1398         if (atomic64_read(&iocg->active_period) ==
1399             atomic64_read(&ioc->cur_period))
1400                 return false;
1401
1402         /* is something in flight? */
1403         if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1404                 return false;
1405
1406         return true;
1407 }
1408
1409 /*
1410  * Call this function on the target leaf @iocg's to build a pre-order traversal
1411  * list of all the ancestors in @inner_walk. The inner nodes are linked through
1412  * ->walk_list and the caller is responsible for dissolving the list after use.
1413  */
1414 static void iocg_build_inner_walk(struct ioc_gq *iocg,
1415                                   struct list_head *inner_walk)
1416 {
1417         int lvl;
1418
1419         WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1420
1421         /* find the first ancestor which hasn't been visited yet */
1422         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1423                 if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1424                         break;
1425         }
1426
1427         /* walk down and visit the inner nodes to get pre-order traversal */
1428         while (++lvl <= iocg->level - 1) {
1429                 struct ioc_gq *inner = iocg->ancestors[lvl];
1430
1431                 /* record traversal order */
1432                 list_add_tail(&inner->walk_list, inner_walk);
1433         }
1434 }
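
/*
 * Example (illustrative): take a leaf L at level 3 with ancestors R (level 0),
 * P1 (level 1) and P2 (level 2), none visited yet.  The first loop runs lvl
 * down to -1 and the second loop then appends R, P1 and P2 in that order,
 * i.e. pre-order.  Calling this again for a sibling leaf under P2 appends
 * nothing as P2 is already on @inner_walk.
 */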
1435
1436 /* collect per-cpu counters and propagate the deltas to the parent */
1437 static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
1438 {
1439         struct iocg_stat new_stat;
1440         u64 abs_vusage = 0;
1441         u64 vusage_delta;
1442         int cpu;
1443
1444         lockdep_assert_held(&iocg->ioc->lock);
1445
1446         /* collect per-cpu counters */
1447         for_each_possible_cpu(cpu) {
1448                 abs_vusage += local64_read(
1449                                 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1450         }
1451         vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1452         iocg->last_stat_abs_vusage = abs_vusage;
1453
1454         iocg->usage_delta_us = div64_u64(vusage_delta, now->vrate);
1455         iocg->local_stat.usage_us += iocg->usage_delta_us;
1456
1457         new_stat.usage_us =
1458                 iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
1459
1460         /* propagate the deltas to the parent */
1461         if (iocg->level > 0) {
1462                 struct iocg_stat *parent_stat =
1463                         &iocg->ancestors[iocg->level - 1]->desc_stat;
1464
1465                 parent_stat->usage_us +=
1466                         new_stat.usage_us - iocg->last_stat.usage_us;
1467         }
1468
1469         iocg->last_stat = new_stat;
1470 }
1471
1472 /* get stat counters ready for reading on all active iocgs */
1473 static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1474 {
1475         LIST_HEAD(inner_walk);
1476         struct ioc_gq *iocg, *tiocg;
1477
1478         /* flush leaves and build inner node walk list */
1479         list_for_each_entry(iocg, target_iocgs, active_list) {
1480                 iocg_flush_stat_one(iocg, now);
1481                 iocg_build_inner_walk(iocg, &inner_walk);
1482         }
1483
1484         /* keep flushing upwards by walking the inner list backwards */
1485         list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1486                 iocg_flush_stat_one(iocg, now);
1487                 list_del_init(&iocg->walk_list);
1488         }
1489 }
1490
1491 /* returns usage with margin added if surplus is large enough */
1492 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1493 {
1494         /* add margin */
1495         usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1496         usage += SURPLUS_SCALE_ABS;
1497
1498         /* don't bother if the surplus is too small */
1499         if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1500                 return 0;
1501
1502         return usage;
1503 }
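
/*
 * Worked example (illustrative; assumes SURPLUS_SCALE_PCT of 125 and
 * SURPLUS_SCALE_ABS / SURPLUS_MIN_ADJ_DELTA of roughly 2% / 3% of
 * WEIGHT_ONE): an iocg which used 40% of its hweight gets
 *
 *	usage = 40% * 125 / 100 + 2% = 52%
 *
 * With hw_inuse at 80%, 52% + 3% < 80% so 52% is returned and the rest may
 * be donated.  With hw_inuse at 54%, the surplus is considered too small
 * and 0 is returned.
 */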
1504
1505 static void ioc_timer_fn(struct timer_list *timer)
1506 {
1507         struct ioc *ioc = container_of(timer, struct ioc, timer);
1508         struct ioc_gq *iocg, *tiocg;
1509         struct ioc_now now;
1510         int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1511         u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1512         u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1513         u32 missed_ppm[2], rq_wait_pct;
1514         u64 period_vtime;
1515         int prev_busy_level, i;
1516
1517         /* how were the latencies during the period? */
1518         ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1519
1520         /* take care of active iocgs */
1521         spin_lock_irq(&ioc->lock);
1522
1523         ioc_now(ioc, &now);
1524
1525         period_vtime = now.vnow - ioc->period_at_vtime;
1526         if (WARN_ON_ONCE(!period_vtime)) {
1527                 spin_unlock_irq(&ioc->lock);
1528                 return;
1529         }
1530
1531         iocg_flush_stat(&ioc->active_iocgs, &now);
1532
1533         /*
1534          * Waiters determine the sleep durations based on the vrate they
1535          * saw at the time of sleep.  If vrate has increased, some waiters
1536          * could be sleeping for too long.  Wake up tardy waiters which
1537          * should have woken up in the last period and expire idle iocgs.
1538          */
1539         list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1540                 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1541                     !iocg_is_idle(iocg))
1542                         continue;
1543
1544                 spin_lock(&iocg->waitq.lock);
1545
1546                 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1547                         /* might be oversleeping due to vtime / hweight changes, kick */
1548                         iocg_kick_waitq(iocg, true, &now);
1549                 } else if (iocg_is_idle(iocg)) {
1550                         /* no waiter and idle, deactivate */
1551                         iocg->last_inuse = iocg->inuse;
1552                         __propagate_weights(iocg, 0, 0);
1553                         list_del_init(&iocg->active_list);
1554                 }
1555
1556                 spin_unlock(&iocg->waitq.lock);
1557         }
1558         commit_weights(ioc);
1559
1560         /* calc usages and see whether some weights need to be moved around */
1561         list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1562                 u64 vdone, vtime, usage_us, vmin;
1563                 u32 hw_active, hw_inuse, usage;
1564                 int uidx;
1565
1566                 /*
1567                  * Collect unused budget and wind vtime closer to vnow to prevent
1568                  * iocgs from accumulating a large amount of budget.
1569                  */
1570                 vdone = atomic64_read(&iocg->done_vtime);
1571                 vtime = atomic64_read(&iocg->vtime);
1572                 current_hweight(iocg, &hw_active, &hw_inuse);
1573
1574                 /*
1575                  * Latency QoS detection doesn't account for IOs which are
1576                  * in-flight for longer than a period.  Detect them by
1577                  * comparing vdone against period start.  If lagging behind
1578                  * comparing vdone against the period start.  If there are IOs
1579                  * lagging from past periods, don't increase vrate.
1580                 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1581                     !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1582                     time_after64(vtime, vdone) &&
1583                     time_after64(vtime, now.vnow -
1584                                  MAX_LAGGING_PERIODS * period_vtime) &&
1585                     time_before64(vdone, now.vnow - period_vtime))
1586                         nr_lagging++;
1587
1588                 /*
1589                  * Determine absolute usage factoring in pending and in-flight
1590                  * IOs to avoid stalls and high-latency completions appearing as
1591                  * idle.
1592                  */
1593                 usage_us = iocg->usage_delta_us;
1594                 if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow))
1595                         usage_us += DIV64_U64_ROUND_UP(
1596                                 cost_to_abs_cost(now.vnow - vtime, hw_inuse),
1597                                 now.vrate);
1598                 if (vdone != vtime) {
1599                         u64 inflight_us = DIV64_U64_ROUND_UP(
1600                                 cost_to_abs_cost(vtime - vdone, hw_inuse),
1601                                 now.vrate);
1602                         usage_us = max(usage_us, inflight_us);
1603                 }
1604
1605                 /* convert to hweight based usage ratio and record */
1606                 uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1607
1608                 if (time_after64(vtime, now.vnow - ioc->margins.min)) {
1609                         iocg->usage_idx = uidx;
1610                         iocg->usages[uidx] = WEIGHT_ONE;
1611                 } else if (usage_us) {
1612                         u64 started_at, dur;
1613
1614                         if (time_after64(iocg->activated_at, ioc->period_at))
1615                                 started_at = iocg->activated_at;
1616                         else
1617                                 started_at = ioc->period_at;
1618
1619                         dur = max_t(u64, now.now - started_at, 1);
1620                         usage = clamp_t(u32,
1621                                 DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur),
1622                                 1, WEIGHT_ONE);
1623
1624                         iocg->usage_idx = uidx;
1625                         iocg->usages[uidx] = usage;
1626                 } else {
1627                         usage = 0;
1628                 }
1629
1630                 /* see whether there's surplus vtime */
1631                 vmin = now.vnow - ioc->margins.max;
1632
1633                 iocg->has_surplus = false;
1634
1635                 if (!waitqueue_active(&iocg->waitq) &&
1636                     time_before64(vtime, vmin)) {
1637                         u64 delta = vmin - vtime;
1638
1639                         /* throw away surplus vtime */
1640                         atomic64_add(delta, &iocg->vtime);
1641                         atomic64_add(delta, &iocg->done_vtime);
1642                         /* if usage is sufficiently low, maybe it can donate */
1643                         if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1644                                 iocg->has_surplus = true;
1645                                 nr_surpluses++;
1646                         }
1647                 } else if (hw_inuse < hw_active) {
1648                         u32 new_hwi, new_inuse;
1649
1650                         /* was donating but might need to take back some */
1651                         if (waitqueue_active(&iocg->waitq)) {
1652                                 new_hwi = hw_active;
1653                         } else {
1654                                 new_hwi = max(hw_inuse,
1655                                               usage * SURPLUS_SCALE_PCT / 100 +
1656                                               SURPLUS_SCALE_ABS);
1657                         }
1658
1659                         new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1660                                               hw_inuse);
1661                         new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1662
1663                         if (new_inuse > iocg->inuse) {
1664                                 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1665                                                 iocg->inuse, new_inuse,
1666                                                 hw_inuse, new_hwi);
1667                                 __propagate_weights(iocg, iocg->weight,
1668                                                     new_inuse);
1669                         }
1670                 } else {
1671                         /* genuinely out of vtime */
1672                         nr_shortages++;
1673                 }
1674         }
1675
1676         if (!nr_shortages || !nr_surpluses)
1677                 goto skip_surplus_transfers;
1678
1679         /* there are both shortages and surpluses, transfer surpluses */
1680         list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1681                 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1682                 int nr_valid = 0;
1683
1684                 if (!iocg->has_surplus)
1685                         continue;
1686
1687                 /* base the decision on max historical usage */
1688                 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1689                         if (iocg->usages[i]) {
1690                                 usage = max(usage, iocg->usages[i]);
1691                                 nr_valid++;
1692                         }
1693                 }
1694                 if (nr_valid < MIN_VALID_USAGES)
1695                         continue;
1696
1697                 current_hweight(iocg, &hw_active, &hw_inuse);
1698                 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1699                 if (!new_hwi)
1700                         continue;
1701
1702                 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1703                                                hw_inuse);
1704                 if (new_inuse < iocg->inuse) {
1705                         TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1706                                         iocg->inuse, new_inuse,
1707                                         hw_inuse, new_hwi);
1708                         __propagate_weights(iocg, iocg->weight, new_inuse);
1709                 }
1710         }
1711 skip_surplus_transfers:
1712         commit_weights(ioc);
1713
1714         /*
1715          * If q is getting clogged or we're missing too much, we're issuing
1716          * too much IO and should lower vtime rate.  If we're not missing
1717          * QoS targets but are seeing shortages without surpluses, we're being
1718          * too stingy and should increase vtime rate.
1719          */
1720         prev_busy_level = ioc->busy_level;
1721         if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1722             missed_ppm[READ] > ppm_rthr ||
1723             missed_ppm[WRITE] > ppm_wthr) {
1724                 /* clearly missing QoS targets, slow down vrate */
1725                 ioc->busy_level = max(ioc->busy_level, 0);
1726                 ioc->busy_level++;
1727         } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1728                    missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1729                    missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1730                 /* QoS targets are being met with >25% margin */
1731                 if (nr_shortages) {
1732                         /*
1733                          * We're throttling while the device has spare
1734                          * capacity.  If vrate was being slowed down, stop.
1735                          */
1736                         ioc->busy_level = min(ioc->busy_level, 0);
1737
1738                         /*
1739                          * If there are IOs spanning multiple periods, wait
1740                          * them out before pushing the device harder.  If
1741                          * there are surpluses, let redistribution work it
1742                          * out first.
1743                          */
1744                         if (!nr_lagging && !nr_surpluses)
1745                                 ioc->busy_level--;
1746                 } else {
1747                         /*
1748                          * Nobody is being throttled and the users aren't
1749                          * issuing enough IOs to saturate the device.  We
1750                          * simply don't know how close the device is to
1751                          * saturation.  Coast.
1752                          */
1753                         ioc->busy_level = 0;
1754                 }
1755         } else {
1756                 /* inside the hysteresis margin, we're good */
1757                 ioc->busy_level = 0;
1758         }
1759
1760         ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1761
1762         if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1763                 u64 vrate = atomic64_read(&ioc->vtime_rate);
1764                 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1765
1766                 /* rq_wait signal is always reliable, ignore user vrate_min */
1767                 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1768                         vrate_min = VRATE_MIN;
1769
1770                 /*
1771                  * If vrate is out of bounds, apply clamp gradually as the
1772                  * bounds can change abruptly.  Otherwise, apply busy_level
1773                  * based adjustment.
1774                  */
1775                 if (vrate < vrate_min) {
1776                         vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1777                                           100);
1778                         vrate = min(vrate, vrate_min);
1779                 } else if (vrate > vrate_max) {
1780                         vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1781                                           100);
1782                         vrate = max(vrate, vrate_max);
1783                 } else {
1784                         int idx = min_t(int, abs(ioc->busy_level),
1785                                         ARRAY_SIZE(vrate_adj_pct) - 1);
1786                         u32 adj_pct = vrate_adj_pct[idx];
1787
1788                         if (ioc->busy_level > 0)
1789                                 adj_pct = 100 - adj_pct;
1790                         else
1791                                 adj_pct = 100 + adj_pct;
1792
1793                         vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1794                                       vrate_min, vrate_max);
1795                 }
1796
1797                 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1798                                            nr_lagging, nr_shortages,
1799                                            nr_surpluses);
1800
1801                 atomic64_set(&ioc->vtime_rate, vrate);
1802                 ioc_refresh_margins(ioc);
1803         } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1804                 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1805                                            missed_ppm, rq_wait_pct, nr_lagging,
1806                                            nr_shortages, nr_surpluses);
1807         }
1808
1809         ioc_refresh_params(ioc, false);
1810
1811         /*
1812          * This period is done.  Move on to the next one.  If nothing's
1813          * going on with the device, stop the timer.
1814          */
1815         atomic64_inc(&ioc->cur_period);
1816
1817         if (ioc->running != IOC_STOP) {
1818                 if (!list_empty(&ioc->active_iocgs)) {
1819                         ioc_start_period(ioc, &now);
1820                 } else {
1821                         ioc->busy_level = 0;
1822                         ioc->running = IOC_IDLE;
1823                 }
1824         }
1825
1826         spin_unlock_irq(&ioc->lock);
1827 }
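
/*
 * Illustrative sketch of the vrate adjustment above (example numbers, not
 * the actual table entries): with busy_level == 3 and, say,
 * vrate_adj_pct[3] == 5, the device looks overcommitted and
 *
 *	vrate = clamp(DIV64_U64_ROUND_UP(vrate * 95, 100), vrate_min, vrate_max);
 *
 * shaves ~5% off the rate.  busy_level == -3 with the same table entry
 * scales by 105% instead.  Larger |busy_level| values index further into
 * vrate_adj_pct, so consecutive busy or idle periods ramp the adjustment.
 */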
1828
1829 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1830                                     bool is_merge, u64 *costp)
1831 {
1832         struct ioc *ioc = iocg->ioc;
1833         u64 coef_seqio, coef_randio, coef_page;
1834         u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1835         u64 seek_pages = 0;
1836         u64 cost = 0;
1837
1838         switch (bio_op(bio)) {
1839         case REQ_OP_READ:
1840                 coef_seqio      = ioc->params.lcoefs[LCOEF_RSEQIO];
1841                 coef_randio     = ioc->params.lcoefs[LCOEF_RRANDIO];
1842                 coef_page       = ioc->params.lcoefs[LCOEF_RPAGE];
1843                 break;
1844         case REQ_OP_WRITE:
1845                 coef_seqio      = ioc->params.lcoefs[LCOEF_WSEQIO];
1846                 coef_randio     = ioc->params.lcoefs[LCOEF_WRANDIO];
1847                 coef_page       = ioc->params.lcoefs[LCOEF_WPAGE];
1848                 break;
1849         default:
1850                 goto out;
1851         }
1852
1853         if (iocg->cursor) {
1854                 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1855                 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1856         }
1857
1858         if (!is_merge) {
1859                 if (seek_pages > LCOEF_RANDIO_PAGES) {
1860                         cost += coef_randio;
1861                 } else {
1862                         cost += coef_seqio;
1863                 }
1864         }
1865         cost += pages * coef_page;
1866 out:
1867         *costp = cost;
1868 }
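
/*
 * Worked example (illustrative, assuming 4KB pages): a 256KB read which is
 * not a merge and starts far from iocg->cursor (seek_pages >
 * LCOEF_RANDIO_PAGES) covers 256KB / 4KB = 64 pages and costs
 *
 *	cost = lcoefs[LCOEF_RRANDIO] + 64 * lcoefs[LCOEF_RPAGE];
 *
 * The same IO issued right at the cursor uses LCOEF_RSEQIO instead, and a
 * merge into an existing request pays only the per-page term.
 */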
1869
1870 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1871 {
1872         u64 cost;
1873
1874         calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1875         return cost;
1876 }
1877
1878 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
1879                                          u64 *costp)
1880 {
1881         unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
1882
1883         switch (req_op(rq)) {
1884         case REQ_OP_READ:
1885                 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
1886                 break;
1887         case REQ_OP_WRITE:
1888                 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
1889                 break;
1890         default:
1891                 *costp = 0;
1892         }
1893 }
1894
1895 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
1896 {
1897         u64 cost;
1898
1899         calc_size_vtime_cost_builtin(rq, ioc, &cost);
1900         return cost;
1901 }
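
/*
 * Illustrative: the size-only cost charged at completion time.  For a 1MB
 * write (256 pages at 4KB), calc_size_vtime_cost() returns
 * 256 * lcoefs[LCOEF_WPAGE]; ioc_rqos_done() below divides that by
 * VTIME_PER_NSEC and deducts it from the on-queue time before checking the
 * latency target.
 */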
1902
1903 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1904 {
1905         struct blkcg_gq *blkg = bio->bi_blkg;
1906         struct ioc *ioc = rqos_to_ioc(rqos);
1907         struct ioc_gq *iocg = blkg_to_iocg(blkg);
1908         struct ioc_now now;
1909         struct iocg_wait wait;
1910         u32 hw_active, hw_inuse;
1911         u64 abs_cost, cost, vtime;
1912         bool use_debt, ioc_locked;
1913         unsigned long flags;
1914
1915         /* bypass IOs if disabled or for root cgroup */
1916         if (!ioc->enabled || !iocg->level)
1917                 return;
1918
1919         /* always activate so that even 0 cost IOs get protected to some level */
1920         if (!iocg_activate(iocg, &now))
1921                 return;
1922
1923         /* calculate the absolute vtime cost */
1924         abs_cost = calc_vtime_cost(bio, iocg, false);
1925         if (!abs_cost)
1926                 return;
1927
1928         iocg->cursor = bio_end_sector(bio);
1929
1930         vtime = atomic64_read(&iocg->vtime);
1931         current_hweight(iocg, &hw_active, &hw_inuse);
1932
1933         if (hw_inuse < hw_active &&
1934             time_after_eq64(vtime + ioc->margins.min, now.vnow)) {
1935                 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1936                                 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1937                 spin_lock_irq(&ioc->lock);
1938                 propagate_weights(iocg, iocg->weight, iocg->weight);
1939                 spin_unlock_irq(&ioc->lock);
1940                 current_hweight(iocg, &hw_active, &hw_inuse);
1941         }
1942
1943         cost = abs_cost_to_cost(abs_cost, hw_inuse);
1944
1945         /*
1946          * If no one's waiting and within budget, issue right away.  The
1947          * tests are racy but the races aren't systemic - we only miss once
1948          * in a while which is fine.
1949          */
1950         if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1951             time_before_eq64(vtime + cost, now.vnow)) {
1952                 iocg_commit_bio(iocg, bio, abs_cost, cost);
1953                 return;
1954         }
1955
1956         /*
1957          * We're over budget. This can be handled in two ways. IOs which may
1958          * cause priority inversions are charged to @iocg (abs_vdebt) as
1959          * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
1960          * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
1961          * whether debt handling is needed and acquire locks accordingly.
1962          */
1963         use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
1964         ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
1965
1966         iocg_lock(iocg, ioc_locked, &flags);
1967
1968         /*
1969          * @iocg must stay activated for debt and waitq handling. Deactivation
1970          * is synchronized against both ioc->lock and waitq.lock and we won't
1971          * get deactivated as long as we're waiting or has debt, so we're good
1972          * get deactivated as long as we're waiting or have debt, so we're good
1973          * if we're activated here. In the unlikely case that we aren't, just
1974          */
1975         if (unlikely(list_empty(&iocg->active_list))) {
1976                 iocg_unlock(iocg, ioc_locked, &flags);
1977                 iocg_commit_bio(iocg, bio, abs_cost, cost);
1978                 return;
1979         }
1980
1981         /*
1982          * We're over budget. If @bio has to be issued regardless, remember
1983          * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1984          * off the debt before waking more IOs.
1985          *
1986          * This way, the debt is continuously paid off each period with the
1987          * actual budget available to the cgroup. If we just wound vtime, we
1988          * would incorrectly use the current hw_inuse for the entire amount
1989          * which, for example, can lead to the cgroup staying blocked for a
1990          * long time even with substantially raised hw_inuse.
1991          *
1992          * An iocg with vdebt should stay online so that the timer can keep
1993          * deducting its vdebt and [de]activating the use_delay mechanism
1994          * accordingly. We don't want to race against the timer trying to
1995          * clear them and leave @iocg inactive with a dangling use_delay that
1996          * heavily penalizes the cgroup and its descendants.
1997          */
1998         if (use_debt) {
1999                 iocg->abs_vdebt += abs_cost;
2000                 if (iocg_kick_delay(iocg, &now))
2001                         blkcg_schedule_throttle(rqos->q,
2002                                         (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2003                 iocg_unlock(iocg, ioc_locked, &flags);
2004                 return;
2005         }
2006
2007         /*
2008          * Append self to the waitq and schedule the wakeup timer if we're
2009          * the first waiter.  The timer duration is calculated based on the
2010          * current vrate.  vtime and hweight changes can make it too short
2011          * or too long.  Each wait entry records the absolute cost it's
2012          * waiting for to allow re-evaluation using a custom wait entry.
2013          *
2014          * If too short, the timer simply reschedules itself.  If too long,
2015          * the period timer will notice and trigger wakeups.
2016          *
2017          * All waiters are on iocg->waitq and the wait states are
2018          * synchronized using waitq.lock.
2019          */
2020         init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2021         wait.wait.private = current;
2022         wait.bio = bio;
2023         wait.abs_cost = abs_cost;
2024         wait.committed = false; /* will be set true by waker */
2025
2026         __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2027         iocg_kick_waitq(iocg, ioc_locked, &now);
2028
2029         iocg_unlock(iocg, ioc_locked, &flags);
2030
2031         while (true) {
2032                 set_current_state(TASK_UNINTERRUPTIBLE);
2033                 if (wait.committed)
2034                         break;
2035                 io_schedule();
2036         }
2037
2038         /* waker already committed us, proceed */
2039         finish_wait(&iocg->waitq, &wait.wait);
2040 }
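
/*
 * Illustrative numbers for the charging above: abs_cost is in device time
 * regardless of the issuer, while the vtime actually charged scales with
 * hweight_inuse.  A cgroup holding 25% of the hierarchical inuse weight
 * issuing an IO whose abs_cost corresponds to 1ms sees
 *
 *	cost = abs_cost_to_cost(abs_cost, hw_inuse)	(~4ms worth of vtime)
 *
 * so its vtime budget drains four times faster than the device clock and it
 * hits the waitq (or debt) path correspondingly sooner.
 */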
2041
2042 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2043                            struct bio *bio)
2044 {
2045         struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2046         struct ioc *ioc = iocg->ioc;
2047         sector_t bio_end = bio_end_sector(bio);
2048         struct ioc_now now;
2049         u32 hw_inuse;
2050         u64 abs_cost, cost;
2051         unsigned long flags;
2052
2053         /* bypass if disabled or for root cgroup */
2054         if (!ioc->enabled || !iocg->level)
2055                 return;
2056
2057         abs_cost = calc_vtime_cost(bio, iocg, true);
2058         if (!abs_cost)
2059                 return;
2060
2061         ioc_now(ioc, &now);
2062         current_hweight(iocg, NULL, &hw_inuse);
2063         cost = abs_cost_to_cost(abs_cost, hw_inuse);
2064
2065         /* update cursor if backmerging into the request at the cursor */
2066         if (blk_rq_pos(rq) < bio_end &&
2067             blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2068                 iocg->cursor = bio_end;
2069
2070         /*
2071          * Charge if there's enough vtime budget and the existing request has
2072          * cost assigned.
2073          */
2074         if (rq->bio && rq->bio->bi_iocost_cost &&
2075             time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
2076                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2077                 return;
2078         }
2079
2080         /*
2081          * Otherwise, account it as debt if @iocg is online, which it should
2082          * be for the vast majority of cases. See debt handling in
2083          * ioc_rqos_throttle() for details.
2084          */
2085         spin_lock_irqsave(&iocg->waitq.lock, flags);
2086         if (likely(!list_empty(&iocg->active_list))) {
2087                 iocg->abs_vdebt += abs_cost;
2088                 iocg_kick_delay(iocg, &now);
2089         } else {
2090                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2091         }
2092         spin_unlock_irqrestore(&iocg->waitq.lock, flags);
2093 }
2094
2095 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2096 {
2097         struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2098
2099         if (iocg && bio->bi_iocost_cost)
2100                 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2101 }
2102
2103 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2104 {
2105         struct ioc *ioc = rqos_to_ioc(rqos);
2106         struct ioc_pcpu_stat *ccs;
2107         u64 on_q_ns, rq_wait_ns, size_nsec;
2108         int pidx, rw;
2109
2110         if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2111                 return;
2112
2113         switch (req_op(rq) & REQ_OP_MASK) {
2114         case REQ_OP_READ:
2115                 pidx = QOS_RLAT;
2116                 rw = READ;
2117                 break;
2118         case REQ_OP_WRITE:
2119                 pidx = QOS_WLAT;
2120                 rw = WRITE;
2121                 break;
2122         default:
2123                 return;
2124         }
2125
2126         on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2127         rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2128         size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2129
2130         ccs = get_cpu_ptr(ioc->pcpu_stat);
2131
2132         if (on_q_ns <= size_nsec ||
2133             on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2134                 local_inc(&ccs->missed[rw].nr_met);
2135         else
2136                 local_inc(&ccs->missed[rw].nr_missed);
2137
2138         local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2139
2140         put_cpu_ptr(ccs);
2141 }
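
/*
 * Worked example (illustrative): say an IO spent 6ms from allocation to
 * completion (on_q_ns), its size cost converts to 1ms (size_nsec) and
 * qos[QOS_RLAT] is 5000us.  6ms - 1ms == 5ms <= 5ms, so it counts towards
 * nr_met; at 6.5ms on queue it would land in nr_missed instead.
 */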
2142
2143 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2144 {
2145         struct ioc *ioc = rqos_to_ioc(rqos);
2146
2147         spin_lock_irq(&ioc->lock);
2148         ioc_refresh_params(ioc, false);
2149         spin_unlock_irq(&ioc->lock);
2150 }
2151
2152 static void ioc_rqos_exit(struct rq_qos *rqos)
2153 {
2154         struct ioc *ioc = rqos_to_ioc(rqos);
2155
2156         blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2157
2158         spin_lock_irq(&ioc->lock);
2159         ioc->running = IOC_STOP;
2160         spin_unlock_irq(&ioc->lock);
2161
2162         del_timer_sync(&ioc->timer);
2163         free_percpu(ioc->pcpu_stat);
2164         kfree(ioc);
2165 }
2166
2167 static struct rq_qos_ops ioc_rqos_ops = {
2168         .throttle = ioc_rqos_throttle,
2169         .merge = ioc_rqos_merge,
2170         .done_bio = ioc_rqos_done_bio,
2171         .done = ioc_rqos_done,
2172         .queue_depth_changed = ioc_rqos_queue_depth_changed,
2173         .exit = ioc_rqos_exit,
2174 };
2175
2176 static int blk_iocost_init(struct request_queue *q)
2177 {
2178         struct ioc *ioc;
2179         struct rq_qos *rqos;
2180         int i, cpu, ret;
2181
2182         ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2183         if (!ioc)
2184                 return -ENOMEM;
2185
2186         ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2187         if (!ioc->pcpu_stat) {
2188                 kfree(ioc);
2189                 return -ENOMEM;
2190         }
2191
2192         for_each_possible_cpu(cpu) {
2193                 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2194
2195                 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2196                         local_set(&ccs->missed[i].nr_met, 0);
2197                         local_set(&ccs->missed[i].nr_missed, 0);
2198                 }
2199                 local64_set(&ccs->rq_wait_ns, 0);
2200         }
2201
2202         rqos = &ioc->rqos;
2203         rqos->id = RQ_QOS_COST;
2204         rqos->ops = &ioc_rqos_ops;
2205         rqos->q = q;
2206
2207         spin_lock_init(&ioc->lock);
2208         timer_setup(&ioc->timer, ioc_timer_fn, 0);
2209         INIT_LIST_HEAD(&ioc->active_iocgs);
2210
2211         ioc->running = IOC_IDLE;
2212         atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2213         seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2214         ioc->period_at = ktime_to_us(ktime_get());
2215         atomic64_set(&ioc->cur_period, 0);
2216         atomic_set(&ioc->hweight_gen, 0);
2217
2218         spin_lock_irq(&ioc->lock);
2219         ioc->autop_idx = AUTOP_INVALID;
2220         ioc_refresh_params(ioc, true);
2221         spin_unlock_irq(&ioc->lock);
2222
2223         rq_qos_add(q, rqos);
2224         ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2225         if (ret) {
2226                 rq_qos_del(q, rqos);
2227                 free_percpu(ioc->pcpu_stat);
2228                 kfree(ioc);
2229                 return ret;
2230         }
2231         return 0;
2232 }
2233
2234 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2235 {
2236         struct ioc_cgrp *iocc;
2237
2238         iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2239         if (!iocc)
2240                 return NULL;
2241
2242         iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2243         return &iocc->cpd;
2244 }
2245
2246 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2247 {
2248         kfree(container_of(cpd, struct ioc_cgrp, cpd));
2249 }
2250
2251 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2252                                              struct blkcg *blkcg)
2253 {
2254         int levels = blkcg->css.cgroup->level + 1;
2255         struct ioc_gq *iocg;
2256
2257         iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2258         if (!iocg)
2259                 return NULL;
2260
2261         iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2262         if (!iocg->pcpu_stat) {
2263                 kfree(iocg);
2264                 return NULL;
2265         }
2266
2267         return &iocg->pd;
2268 }
2269
2270 static void ioc_pd_init(struct blkg_policy_data *pd)
2271 {
2272         struct ioc_gq *iocg = pd_to_iocg(pd);
2273         struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2274         struct ioc *ioc = q_to_ioc(blkg->q);
2275         struct ioc_now now;
2276         struct blkcg_gq *tblkg;
2277         unsigned long flags;
2278
2279         ioc_now(ioc, &now);
2280
2281         iocg->ioc = ioc;
2282         atomic64_set(&iocg->vtime, now.vnow);
2283         atomic64_set(&iocg->done_vtime, now.vnow);
2284         atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2285         INIT_LIST_HEAD(&iocg->active_list);
2286         INIT_LIST_HEAD(&iocg->walk_list);
2287         iocg->hweight_active = WEIGHT_ONE;
2288         iocg->hweight_inuse = WEIGHT_ONE;
2289
2290         init_waitqueue_head(&iocg->waitq);
2291         hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2292         iocg->waitq_timer.function = iocg_waitq_timer_fn;
2293         hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2294         iocg->delay_timer.function = iocg_delay_timer_fn;
2295
2296         iocg->level = blkg->blkcg->css.cgroup->level;
2297
2298         for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2299                 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2300                 iocg->ancestors[tiocg->level] = tiocg;
2301         }
2302
2303         spin_lock_irqsave(&ioc->lock, flags);
2304         weight_updated(iocg);
2305         spin_unlock_irqrestore(&ioc->lock, flags);
2306 }
2307
2308 static void ioc_pd_free(struct blkg_policy_data *pd)
2309 {
2310         struct ioc_gq *iocg = pd_to_iocg(pd);
2311         struct ioc *ioc = iocg->ioc;
2312         unsigned long flags;
2313
2314         if (ioc) {
2315                 spin_lock_irqsave(&ioc->lock, flags);
2316
2317                 if (!list_empty(&iocg->active_list)) {
2318                         propagate_weights(iocg, 0, 0);
2319                         list_del_init(&iocg->active_list);
2320                 }
2321
2322                 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
2323
2324                 spin_unlock_irqrestore(&ioc->lock, flags);
2325
2326                 hrtimer_cancel(&iocg->waitq_timer);
2327                 hrtimer_cancel(&iocg->delay_timer);
2328         }
2329         free_percpu(iocg->pcpu_stat);
2330         kfree(iocg);
2331 }
2332
2333 static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
2334 {
2335         struct ioc_gq *iocg = pd_to_iocg(pd);
2336         struct ioc *ioc = iocg->ioc;
2337         size_t pos = 0;
2338
2339         if (!ioc->enabled)
2340                 return 0;
2341
2342         if (iocg->level == 0) {
2343                 unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
2344                         atomic64_read(&ioc->vtime_rate) * 10000,
2345                         VTIME_PER_USEC);
2346                 pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
2347                                   vp10k / 100, vp10k % 100);
2348         }
2349
2350         pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
2351                          iocg->last_stat.usage_us);
2352
2353         return pos;
2354 }
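
/*
 * Illustrative: with vtime_rate at 1.5 * VTIME_PER_USEC, vp10k comes out as
 * 15000 and the level-0 iocg's stat output gains " cost.vrate=150.00";
 * every iocg additionally reports its accumulated " cost.usage=<usecs>".
 */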
2355
2356 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2357                              int off)
2358 {
2359         const char *dname = blkg_dev_name(pd->blkg);
2360         struct ioc_gq *iocg = pd_to_iocg(pd);
2361
2362         if (dname && iocg->cfg_weight)
2363                 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
2364         return 0;
2365 }
2366
2367
2368 static int ioc_weight_show(struct seq_file *sf, void *v)
2369 {
2370         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2371         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2372
2373         seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
2374         blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2375                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
2376         return 0;
2377 }
2378
2379 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2380                                 size_t nbytes, loff_t off)
2381 {
2382         struct blkcg *blkcg = css_to_blkcg(of_css(of));
2383         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2384         struct blkg_conf_ctx ctx;
2385         struct ioc_gq *iocg;
2386         u32 v;
2387         int ret;
2388
2389         if (!strchr(buf, ':')) {
2390                 struct blkcg_gq *blkg;
2391
2392                 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2393                         return -EINVAL;
2394
2395                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2396                         return -EINVAL;
2397
2398                 spin_lock(&blkcg->lock);
2399                 iocc->dfl_weight = v * WEIGHT_ONE;
2400                 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2401                         struct ioc_gq *iocg = blkg_to_iocg(blkg);
2402
2403                         if (iocg) {
2404                                 spin_lock_irq(&iocg->ioc->lock);
2405                                 weight_updated(iocg);
2406                                 spin_unlock_irq(&iocg->ioc->lock);
2407                         }
2408                 }
2409                 spin_unlock(&blkcg->lock);
2410
2411                 return nbytes;
2412         }
2413
2414         ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2415         if (ret)
2416                 return ret;
2417
2418         iocg = blkg_to_iocg(ctx.blkg);
2419
2420         if (!strncmp(ctx.body, "default", 7)) {
2421                 v = 0;
2422         } else {
2423                 if (!sscanf(ctx.body, "%u", &v))
2424                         goto einval;
2425                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2426                         goto einval;
2427         }
2428
2429         spin_lock(&iocg->ioc->lock);
2430         iocg->cfg_weight = v * WEIGHT_ONE;
2431         weight_updated(iocg);
2432         spin_unlock(&iocg->ioc->lock);
2433
2434         blkg_conf_finish(&ctx);
2435         return nbytes;
2436
2437 einval:
2438         blkg_conf_finish(&ctx);
2439         return -EINVAL;
2440 }
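
/*
 * Input format accepted above (device numbers are illustrative): a line
 * without a ':' sets the cgroup default, e.g. "default 100" or just "100",
 * while "MAJ:MIN 200" overrides and "MAJ:MIN default" clears the per-device
 * weight.  Values must be within [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX] and
 * are stored scaled by WEIGHT_ONE.
 */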
2441
2442 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2443                           int off)
2444 {
2445         const char *dname = blkg_dev_name(pd->blkg);
2446         struct ioc *ioc = pd_to_iocg(pd)->ioc;
2447
2448         if (!dname)
2449                 return 0;
2450
2451         seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2452                    dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2453                    ioc->params.qos[QOS_RPPM] / 10000,
2454                    ioc->params.qos[QOS_RPPM] % 10000 / 100,
2455                    ioc->params.qos[QOS_RLAT],
2456                    ioc->params.qos[QOS_WPPM] / 10000,
2457                    ioc->params.qos[QOS_WPPM] % 10000 / 100,
2458                    ioc->params.qos[QOS_WLAT],
2459                    ioc->params.qos[QOS_MIN] / 10000,
2460                    ioc->params.qos[QOS_MIN] % 10000 / 100,
2461                    ioc->params.qos[QOS_MAX] / 10000,
2462                    ioc->params.qos[QOS_MAX] % 10000 / 100);
2463         return 0;
2464 }
2465
2466 static int ioc_qos_show(struct seq_file *sf, void *v)
2467 {
2468         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2469
2470         blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2471                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
2472         return 0;
2473 }
2474
2475 static const match_table_t qos_ctrl_tokens = {
2476         { QOS_ENABLE,           "enable=%u"     },
2477         { QOS_CTRL,             "ctrl=%s"       },
2478         { NR_QOS_CTRL_PARAMS,   NULL            },
2479 };
2480
2481 static const match_table_t qos_tokens = {
2482         { QOS_RPPM,             "rpct=%s"       },
2483         { QOS_RLAT,             "rlat=%u"       },
2484         { QOS_WPPM,             "wpct=%s"       },
2485         { QOS_WLAT,             "wlat=%u"       },
2486         { QOS_MIN,              "min=%s"        },
2487         { QOS_MAX,              "max=%s"        },
2488         { NR_QOS_PARAMS,        NULL            },
2489 };
2490
2491 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2492                              size_t nbytes, loff_t off)
2493 {
2494         struct gendisk *disk;
2495         struct ioc *ioc;
2496         u32 qos[NR_QOS_PARAMS];
2497         bool enable, user;
2498         char *p;
2499         int ret;
2500
2501         disk = blkcg_conf_get_disk(&input);
2502         if (IS_ERR(disk))
2503                 return PTR_ERR(disk);
2504
2505         ioc = q_to_ioc(disk->queue);
2506         if (!ioc) {
2507                 ret = blk_iocost_init(disk->queue);
2508                 if (ret)
2509                         goto err;
2510                 ioc = q_to_ioc(disk->queue);
2511         }
2512
2513         spin_lock_irq(&ioc->lock);
2514         memcpy(qos, ioc->params.qos, sizeof(qos));
2515         enable = ioc->enabled;
2516         user = ioc->user_qos_params;
2517         spin_unlock_irq(&ioc->lock);
2518
2519         while ((p = strsep(&input, " \t\n"))) {
2520                 substring_t args[MAX_OPT_ARGS];
2521                 char buf[32];
2522                 int tok;
2523                 s64 v;
2524
2525                 if (!*p)
2526                         continue;
2527
2528                 switch (match_token(p, qos_ctrl_tokens, args)) {
2529                 case QOS_ENABLE:
2530                         match_u64(&args[0], &v);
2531                         enable = v;
2532                         continue;
2533                 case QOS_CTRL:
2534                         match_strlcpy(buf, &args[0], sizeof(buf));
2535                         if (!strcmp(buf, "auto"))
2536                                 user = false;
2537                         else if (!strcmp(buf, "user"))
2538                                 user = true;
2539                         else
2540                                 goto einval;
2541                         continue;
2542                 }
2543
2544                 tok = match_token(p, qos_tokens, args);
2545                 switch (tok) {
2546                 case QOS_RPPM:
2547                 case QOS_WPPM:
2548                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2549                             sizeof(buf))
2550                                 goto einval;
2551                         if (cgroup_parse_float(buf, 2, &v))
2552                                 goto einval;
2553                         if (v < 0 || v > 10000)
2554                                 goto einval;
2555                         qos[tok] = v * 100;
2556                         break;
2557                 case QOS_RLAT:
2558                 case QOS_WLAT:
2559                         if (match_u64(&args[0], &v))
2560                                 goto einval;
2561                         qos[tok] = v;
2562                         break;
2563                 case QOS_MIN:
2564                 case QOS_MAX:
2565                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2566                             sizeof(buf))
2567                                 goto einval;
2568                         if (cgroup_parse_float(buf, 2, &v))
2569                                 goto einval;
2570                         if (v < 0)
2571                                 goto einval;
2572                         qos[tok] = clamp_t(s64, v * 100,
2573                                            VRATE_MIN_PPM, VRATE_MAX_PPM);
2574                         break;
2575                 default:
2576                         goto einval;
2577                 }
2578                 user = true;
2579         }
2580
2581         if (qos[QOS_MIN] > qos[QOS_MAX])
2582                 goto einval;
2583
2584         spin_lock_irq(&ioc->lock);
2585
2586         if (enable) {
2587                 blk_stat_enable_accounting(ioc->rqos.q);
2588                 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2589                 ioc->enabled = true;
2590         } else {
2591                 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2592                 ioc->enabled = false;
2593         }
2594
2595         if (user) {
2596                 memcpy(ioc->params.qos, qos, sizeof(qos));
2597                 ioc->user_qos_params = true;
2598         } else {
2599                 ioc->user_qos_params = false;
2600         }
2601
2602         ioc_refresh_params(ioc, true);
2603         spin_unlock_irq(&ioc->lock);
2604
2605         put_disk_and_module(disk);
2606         return nbytes;
2607 einval:
2608         ret = -EINVAL;
2609 err:
2610         put_disk_and_module(disk);
2611         return ret;
2612 }
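
/*
 * Example io.cost.qos input accepted by the parser above (device and
 * numbers are illustrative):
 *
 *	8:16 enable=1 ctrl=user rpct=95.00 rlat=5000 wpct=95.00 wlat=5000 min=50.00 max=150.00
 *
 * enables the controller on 8:16 with a 5ms latency target for the 95th
 * percentile of reads and writes and vrate clamped to [50%, 150%].
 * Percentages take two decimal places and are stored as parts per million.
 */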
2613
2614 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2615                                  struct blkg_policy_data *pd, int off)
2616 {
2617         const char *dname = blkg_dev_name(pd->blkg);
2618         struct ioc *ioc = pd_to_iocg(pd)->ioc;
2619         u64 *u = ioc->params.i_lcoefs;
2620
2621         if (!dname)
2622                 return 0;
2623
2624         seq_printf(sf, "%s ctrl=%s model=linear "
2625                    "rbps=%llu rseqiops=%llu rrandiops=%llu "
2626                    "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2627                    dname, ioc->user_cost_model ? "user" : "auto",
2628                    u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2629                    u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2630         return 0;
2631 }
2632
2633 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2634 {
2635         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2636
2637         blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2638                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
2639         return 0;
2640 }
2641
2642 static const match_table_t cost_ctrl_tokens = {
2643         { COST_CTRL,            "ctrl=%s"       },
2644         { COST_MODEL,           "model=%s"      },
2645         { NR_COST_CTRL_PARAMS,  NULL            },
2646 };
2647
2648 static const match_table_t i_lcoef_tokens = {
2649         { I_LCOEF_RBPS,         "rbps=%u"       },
2650         { I_LCOEF_RSEQIOPS,     "rseqiops=%u"   },
2651         { I_LCOEF_RRANDIOPS,    "rrandiops=%u"  },
2652         { I_LCOEF_WBPS,         "wbps=%u"       },
2653         { I_LCOEF_WSEQIOPS,     "wseqiops=%u"   },
2654         { I_LCOEF_WRANDIOPS,    "wrandiops=%u"  },
2655         { NR_I_LCOEFS,          NULL            },
2656 };
2657
2658 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2659                                     size_t nbytes, loff_t off)
2660 {
2661         struct gendisk *disk;
2662         struct ioc *ioc;
2663         u64 u[NR_I_LCOEFS];
2664         bool user;
2665         char *p;
2666         int ret;
2667
2668         disk = blkcg_conf_get_disk(&input);
2669         if (IS_ERR(disk))
2670                 return PTR_ERR(disk);
2671
2672         ioc = q_to_ioc(disk->queue);
2673         if (!ioc) {
2674                 ret = blk_iocost_init(disk->queue);
2675                 if (ret)
2676                         goto err;
2677                 ioc = q_to_ioc(disk->queue);
2678         }
2679
2680         spin_lock_irq(&ioc->lock);
2681         memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2682         user = ioc->user_cost_model;
2683         spin_unlock_irq(&ioc->lock);
2684
2685         while ((p = strsep(&input, " \t\n"))) {
2686                 substring_t args[MAX_OPT_ARGS];
2687                 char buf[32];
2688                 int tok;
2689                 u64 v;
2690
2691                 if (!*p)
2692                         continue;
2693
2694                 switch (match_token(p, cost_ctrl_tokens, args)) {
2695                 case COST_CTRL:
2696                         match_strlcpy(buf, &args[0], sizeof(buf));
2697                         if (!strcmp(buf, "auto"))
2698                                 user = false;
2699                         else if (!strcmp(buf, "user"))
2700                                 user = true;
2701                         else
2702                                 goto einval;
2703                         continue;
2704                 case COST_MODEL:
2705                         match_strlcpy(buf, &args[0], sizeof(buf));
2706                         if (strcmp(buf, "linear"))
2707                                 goto einval;
2708                         continue;
2709                 }
2710
2711                 tok = match_token(p, i_lcoef_tokens, args);
2712                 if (tok == NR_I_LCOEFS)
2713                         goto einval;
2714                 if (match_u64(&args[0], &v))
2715                         goto einval;
2716                 u[tok] = v;
2717                 user = true;
2718         }
2719
2720         spin_lock_irq(&ioc->lock);
2721         if (user) {
2722                 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2723                 ioc->user_cost_model = true;
2724         } else {
2725                 ioc->user_cost_model = false;
2726         }
2727         ioc_refresh_params(ioc, true);
2728         spin_unlock_irq(&ioc->lock);
2729
2730         put_disk_and_module(disk);
2731         return nbytes;
2732
2733 einval:
2734         ret = -EINVAL;
2735 err:
2736         put_disk_and_module(disk);
2737         return ret;
2738 }
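
/*
 * Example io.cost.model input accepted above (made-up coefficients):
 *
 *	8:16 ctrl=user model=linear rbps=500000000 rseqiops=50000 rrandiops=10000 wbps=400000000 wseqiops=40000 wrandiops=8000
 *
 * switches 8:16 to a user-supplied linear model described by peak read/write
 * bytes-per-second and sequential/random IOs-per-second; ioc_refresh_params()
 * then derives from these the internal lcoefs used by calc_vtime_cost_builtin().
 */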
2739
2740 static struct cftype ioc_files[] = {
2741         {
2742                 .name = "weight",
2743                 .flags = CFTYPE_NOT_ON_ROOT,
2744                 .seq_show = ioc_weight_show,
2745                 .write = ioc_weight_write,
2746         },
2747         {
2748                 .name = "cost.qos",
2749                 .flags = CFTYPE_ONLY_ON_ROOT,
2750                 .seq_show = ioc_qos_show,
2751                 .write = ioc_qos_write,
2752         },
2753         {
2754                 .name = "cost.model",
2755                 .flags = CFTYPE_ONLY_ON_ROOT,
2756                 .seq_show = ioc_cost_model_show,
2757                 .write = ioc_cost_model_write,
2758         },
2759         {}
2760 };
2761
2762 static struct blkcg_policy blkcg_policy_iocost = {
2763         .dfl_cftypes    = ioc_files,
2764         .cpd_alloc_fn   = ioc_cpd_alloc,
2765         .cpd_free_fn    = ioc_cpd_free,
2766         .pd_alloc_fn    = ioc_pd_alloc,
2767         .pd_init_fn     = ioc_pd_init,
2768         .pd_free_fn     = ioc_pd_free,
2769         .pd_stat_fn     = ioc_pd_stat,
2770 };
2771
2772 static int __init ioc_init(void)
2773 {
2774         return blkcg_policy_register(&blkcg_policy_iocost);
2775 }
2776
2777 static void __exit ioc_exit(void)
2778 {
2779         return blkcg_policy_unregister(&blkcg_policy_iocost);
2780 }
2781
2782 module_init(ioc_init);
2783 module_exit(ioc_exit);