block/blk-iocost.c

   1 /* SPDX-License-Identifier: GPL-2.0
   2  *
   3  * IO cost model based controller.
   4  *
   5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
   6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
   7  * Copyright (C) 2019 Facebook
   8  *
   9  * One challenge of controlling IO resources is the lack of trivially
  10  * observable cost metric.  This is distinguished from CPU and memory where
  11  * wallclock time and the number of bytes can serve as accurate enough
  12  * approximations.
  13  *
  14  * Bandwidth and iops are the most commonly used metrics for IO devices but
  15  * depending on the type and specifics of the device, different IO patterns
  16  * easily lead to multiple orders of magnitude variations rendering them
  17  * useless for the purpose of IO capacity distribution.  While on-device
  18  * time, with a lot of clutches, could serve as a useful approximation for
  19  * non-queued rotational devices, this is no longer viable with modern
  20  * devices, even the rotational ones.
  21  *
  22  * While there is no cost metric we can trivially observe, it isn't a
  23  * complete mystery.  For example, on a rotational device, seek cost
  24  * dominates while a contiguous transfer contributes a smaller amount
  25  * proportional to the size.  If we can characterize at least the relative
  26  * costs of these different types of IOs, it should be possible to
  27  * implement a reasonable work-conserving proportional IO resource
  28  * distribution.
  29  *
  30  * 1. IO Cost Model
  31  *
  32  * IO cost model estimates the cost of an IO given its basic parameters and
  33  * history (e.g. the end sector of the last IO).  The cost is measured in
  34  * device time.  If a given IO is estimated to cost 10ms, the device should
  35  * be able to process ~100 of those IOs in a second.
  36  *
  37  * Currently, there's only one builtin cost model - linear.  Each IO is
  38  * classified as sequential or random and given a base cost accordingly.
  39  * On top of that, a size cost proportional to the length of the IO is
  40  * added.  While simple, this model captures the operational
  41  * characteristics of a wide varienty of devices well enough.  Default
  42  * parameters for several different classes of devices are provided and the
  43  * parameters can be configured from userspace via
  44  * /sys/fs/cgroup/io.cost.model.
  45  *
  46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47  * device-specific coefficients.
  48  *
  49  * 2. Control Strategy
  50  *
  51  * The device virtual time (vtime) is used as the primary control metric.
  52  * The control strategy is composed of the following three parts.
  53  *
  54  * 2-1. Vtime Distribution
  55  *
  56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57  * calculated.  Please consider the following hierarchy where the numbers
  58  * inside parentheses denote the configured weights.
  59  *
  60  *           root
  61  *         /       \
  62  *      A (w:100)  B (w:300)
  63  *      /       \
  64  *  A0 (w:100)  A1 (w:100)
  65  *
  66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
  68  * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
  69  * 12.5% each.  The distribution mechanism only cares about these flattened
  70  * shares.  They're called hweights (hierarchical weights) and always add
  71  * upto 1 (WEIGHT_ONE).
  72  *
  73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75  * against the device vtime - an IO which takes 10ms on the underlying
  76  * device is considered to take 80ms on A0.
  77  *
  78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
  79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
  80  * the vtime consumed by past IOs and can issue a new IO if doing so
  81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
  82  * suspended until the vtime has progressed enough to cover it.
  83  *
  84  * 2-2. Vrate Adjustment
  85  *
  86  * It's unrealistic to expect the cost model to be perfect.  There are too
  87  * many devices and even on the same device the overall performance
  88  * fluctuates depending on numerous factors such as IO mixture and device
  89  * internal garbage collection.  The controller needs to adapt dynamically.
  90  *
  91  * This is achieved by adjusting the overall IO rate according to how busy
  92  * the device is.  If the device becomes overloaded, we're sending down too
  93  * many IOs and should generally slow down.  If there are waiting issuers
  94  * but the device isn't saturated, we're issuing too few and should
  95  * generally speed up.
  96  *
  97  * To slow down, we lower the vrate - the rate at which the device vtime
  98  * passes compared to the wall clock.  For example, if the vtime is running
  99  * at the vrate of 75%, all cgroups added up would only be able to issue
 100  * 750ms worth of IOs per second, and vice-versa for speeding up.
 101  *
 102  * Device business is determined using two criteria - rq wait and
 103  * completion latencies.
 104  *
 105  * When a device gets saturated, the on-device and then the request queues
 106  * fill up and a bio which is ready to be issued has to wait for a request
 107  * to become available.  When this delay becomes noticeable, it's a clear
 108  * indication that the device is saturated and we lower the vrate.  This
 109  * saturation signal is fairly conservative as it only triggers when both
 110  * hardware and software queues are filled up, and is used as the default
 111  * busy signal.
 112  *
 113  * As devices can have deep queues and be unfair in how the queued commands
 114  * are executed, solely depending on rq wait may not result in satisfactory
 115  * control quality.  For a better control quality, completion latency QoS
 116  * parameters can be configured so that the device is considered saturated
 117  * if N'th percentile completion latency rises above the set point.
 118  *
 119  * The completion latency requirements are a function of both the
 120  * underlying device characteristics and the desired IO latency quality of
 121  * service.  There is an inherent trade-off - the tighter the latency QoS,
 122  * the higher the bandwidth lossage.  Latency QoS is disabled by default
 123  * and can be set through /sys/fs/cgroup/io.cost.qos.
 124  *
 125  * 2-3. Work Conservation
 126  *
 127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
 128  * periodically while B is sending out enough parallel IOs to saturate the
 129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
 130  * cost per second, i.e., 10% of the device capacity.  The naive
 131  * distribution of half and half would lead to 60% utilization of the
 132  * device, a significant reduction in the total amount of work done
 133  * compared to free-for-all competition.  This is too high a cost to pay
 134  * for IO control.
 135  *
 136  * To conserve the total amount of work done, we keep track of how much
 137  * each active cgroup is actually using and yield part of its weight if
 138  * there are other cgroups which can make use of it.  In the above case,
 139  * A's weight will be lowered so that it hovers above the actual usage and
 140  * B would be able to use the rest.
 141  *
 142  * As we don't want to penalize a cgroup for donating its weight, the
 143  * surplus weight adjustment factors in a margin and has an immediate
 144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
 145  *
 146  * Note that adjusting down surplus weights has the same effects as
 147  * accelerating vtime for other cgroups and work conservation can also be
 148  * implemented by adjusting vrate dynamically.  However, squaring who can
 149  * donate and should take back how much requires hweight propagations
 150  * anyway making it easier to implement and understand as a separate
 151  * mechanism.
 152  *
 153  * 3. Monitoring
 154  *
 155  * Instead of debugfs or other clumsy monitoring mechanisms, this
 156  * controller uses a drgn based monitoring script -
 157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 158  * https://github.com/osandov/drgn.  The output looks like the following.
 159  *
 160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 161  *                 active      weight      hweight% inflt% dbt  delay usages%
 162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 164  *
 165  * - per        : Timer period
 166  * - cur_per    : Internal wall and device vtime clock
 167  * - vrate      : Device virtual time rate against wall clock
 168  * - weight     : Surplus-adjusted and configured weights
 169  * - hweight    : Surplus-adjusted and configured hierarchical weights
 170  * - inflt      : The percentage of in-flight IO cost at the end of last period
 171  * - del_ms     : Deferred issuer delay induction level and duration
 172  * - usages     : Usage history
 173  */
 174
 175 #include <linux/kernel.h>
 176 #include <linux/module.h>
 177 #include <linux/timer.h>
 178 #include <linux/time64.h>
 179 #include <linux/parser.h>
 180 #include <linux/sched/signal.h>
 181 #include <asm/local.h>
 182 #include <asm/local64.h>
 183 #include "blk-rq-qos.h"
 184 #include "blk-stat.h"
 185 #include "blk-wbt.h"
 186 #include "blk-cgroup.h"
 187
 188 #ifdef CONFIG_TRACEPOINTS
 189
 190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
 191 #define TRACE_IOCG_PATH_LEN 1024
 192 static DEFINE_SPINLOCK(trace_iocg_path_lock);
 193 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
 194
 195 #define TRACE_IOCG_PATH(type, iocg, ...)                                        \
 196         do {                                                                    \
 197                 unsigned long flags;                                            \
 198                 if (trace_iocost_##type##_enabled()) {                          \
 199                         spin_lock_irqsave(&trace_iocg_path_lock, flags);        \
 200                         cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,      \
 201                                     trace_iocg_path, TRACE_IOCG_PATH_LEN);      \
 202                         trace_iocost_##type(iocg, trace_iocg_path,              \
 203                                               ##__VA_ARGS__);                   \
 204                         spin_unlock_irqrestore(&trace_iocg_path_lock, flags);   \
 205                 }                                                               \
 206         } while (0)
 207
 208 #else   /* CONFIG_TRACE_POINTS */
 209 #define TRACE_IOCG_PATH(type, iocg, ...)        do { } while (0)
 210 #endif  /* CONFIG_TRACE_POINTS */
 211
 212 enum {
 213         MILLION                 = 1000000,
 214
 215         /* timer period is calculated from latency requirements, bound it */
 216         MIN_PERIOD              = USEC_PER_MSEC,
 217         MAX_PERIOD              = USEC_PER_SEC,
 218
 219         /*
 220          * iocg->vtime is targeted at 50% behind the device vtime, which
 221          * serves as its IO credit buffer.  Surplus weight adjustment is
 222          * immediately canceled if the vtime margin runs below 10%.
 223          */
 224         MARGIN_MIN_PCT          = 10,
 225         MARGIN_LOW_PCT          = 20,
 226         MARGIN_TARGET_PCT       = 50,
 227
 228         INUSE_ADJ_STEP_PCT      = 25,
 229
 230         /* Have some play in timer operations */
 231         TIMER_SLACK_PCT         = 1,
 232
 233         /* 1/64k is granular enough and can easily be handled w/ u32 */
 234         WEIGHT_ONE              = 1 << 16,
 235 };
 236
 237 enum {
 238         /*
 239          * As vtime is used to calculate the cost of each IO, it needs to
 240          * be fairly high precision.  For example, it should be able to
 241          * represent the cost of a single page worth of discard with
 242          * suffificient accuracy.  At the same time, it should be able to
 243          * represent reasonably long enough durations to be useful and
 244          * convenient during operation.
 245          *
 246          * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
 247          * granularity and days of wrap-around time even at extreme vrates.
 248          */
 249         VTIME_PER_SEC_SHIFT     = 37,
 250         VTIME_PER_SEC           = 1LLU << VTIME_PER_SEC_SHIFT,
 251         VTIME_PER_USEC          = VTIME_PER_SEC / USEC_PER_SEC,
 252         VTIME_PER_NSEC          = VTIME_PER_SEC / NSEC_PER_SEC,
 253
 254         /* bound vrate adjustments within two orders of magnitude */
 255         VRATE_MIN_PPM           = 10000,        /* 1% */
 256         VRATE_MAX_PPM           = 100000000,    /* 10000% */
 257
 258         VRATE_MIN               = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
 259         VRATE_CLAMP_ADJ_PCT     = 4,
 260
 261         /* switch iff the conditions are met for longer than this */
 262         AUTOP_CYCLE_NSEC        = 10LLU * NSEC_PER_SEC,
 263 };
 264
 265 enum {
 266         /* if IOs end up waiting for requests, issue less */
 267         RQ_WAIT_BUSY_PCT        = 5,
 268
 269         /* unbusy hysterisis */
 270         UNBUSY_THR_PCT          = 75,
 271
 272         /*
 273          * The effect of delay is indirect and non-linear and a huge amount of
 274          * future debt can accumulate abruptly while unthrottled. Linearly scale
 275          * up delay as debt is going up and then let it decay exponentially.
 276          * This gives us quick ramp ups while delay is accumulating and long
 277          * tails which can help reducing the frequency of debt explosions on
 278          * unthrottle. The parameters are experimentally determined.
 279          *
 280          * The delay mechanism provides adequate protection and behavior in many
 281          * cases. However, this is far from ideal and falls shorts on both
 282          * fronts. The debtors are often throttled too harshly costing a
 283          * significant level of fairness and possibly total work while the
 284          * protection against their impacts on the system can be choppy and
 285          * unreliable.
 286          *
 287          * The shortcoming primarily stems from the fact that, unlike for page
 288          * cache, the kernel doesn't have well-defined back-pressure propagation
 289          * mechanism and policies for anonymous memory. Fully addressing this
 290          * issue will likely require substantial improvements in the area.
 291          */
 292         MIN_DELAY_THR_PCT       = 500,
 293         MAX_DELAY_THR_PCT       = 25000,
 294         MIN_DELAY               = 250,
 295         MAX_DELAY               = 250 * USEC_PER_MSEC,
 296
 297         /* halve debts if avg usage over 100ms is under 50% */
 298         DFGV_USAGE_PCT          = 50,
 299         DFGV_PERIOD             = 100 * USEC_PER_MSEC,
 300
 301         /* don't let cmds which take a very long time pin lagging for too long */
 302         MAX_LAGGING_PERIODS     = 10,
 303
 304         /*
 305          * Count IO size in 4k pages.  The 12bit shift helps keeping
 306          * size-proportional components of cost calculation in closer
 307          * numbers of digits to per-IO cost components.
 308          */
 309         IOC_PAGE_SHIFT          = 12,
 310         IOC_PAGE_SIZE           = 1 << IOC_PAGE_SHIFT,
 311         IOC_SECT_TO_PAGE_SHIFT  = IOC_PAGE_SHIFT - SECTOR_SHIFT,
 312
 313         /* if apart further than 16M, consider randio for linear model */
 314         LCOEF_RANDIO_PAGES      = 4096,
 315 };
 316
 317 enum ioc_running {
 318         IOC_IDLE,
 319         IOC_RUNNING,
 320         IOC_STOP,
 321 };
 322
 323 /* io.cost.qos controls including per-dev enable of the whole controller */
 324 enum {
 325         QOS_ENABLE,
 326         QOS_CTRL,
 327         NR_QOS_CTRL_PARAMS,
 328 };
 329
 330 /* io.cost.qos params */
 331 enum {
 332         QOS_RPPM,
 333         QOS_RLAT,
 334         QOS_WPPM,
 335         QOS_WLAT,
 336         QOS_MIN,
 337         QOS_MAX,
 338         NR_QOS_PARAMS,
 339 };
 340
 341 /* io.cost.model controls */
 342 enum {
 343         COST_CTRL,
 344         COST_MODEL,
 345         NR_COST_CTRL_PARAMS,
 346 };
 347
 348 /* builtin linear cost model coefficients */
 349 enum {
 350         I_LCOEF_RBPS,
 351         I_LCOEF_RSEQIOPS,
 352         I_LCOEF_RRANDIOPS,
 353         I_LCOEF_WBPS,
 354         I_LCOEF_WSEQIOPS,
 355         I_LCOEF_WRANDIOPS,
 356         NR_I_LCOEFS,
 357 };
 358
 359 enum {
 360         LCOEF_RPAGE,
 361         LCOEF_RSEQIO,
 362         LCOEF_RRANDIO,
 363         LCOEF_WPAGE,
 364         LCOEF_WSEQIO,
 365         LCOEF_WRANDIO,
 366         NR_LCOEFS,
 367 };
 368
 369 enum {
 370         AUTOP_INVALID,
 371         AUTOP_HDD,
 372         AUTOP_SSD_QD1,
 373         AUTOP_SSD_DFL,
 374         AUTOP_SSD_FAST,
 375 };
 376
 377 struct ioc_params {
 378         u32                             qos[NR_QOS_PARAMS];
 379         u64                             i_lcoefs[NR_I_LCOEFS];
 380         u64                             lcoefs[NR_LCOEFS];
 381         u32                             too_fast_vrate_pct;
 382         u32                             too_slow_vrate_pct;
 383 };
 384
 385 struct ioc_margins {
 386         s64                             min;
 387         s64                             low;
 388         s64                             target;
 389 };
 390
 391 struct ioc_missed {
 392         local_t                         nr_met;
 393         local_t                         nr_missed;
 394         u32                             last_met;
 395         u32                             last_missed;
 396 };
 397
 398 struct ioc_pcpu_stat {
 399         struct ioc_missed               missed[2];
 400
 401         local64_t                       rq_wait_ns;
 402         u64                             last_rq_wait_ns;
 403 };
 404
 405 /* per device */
 406 struct ioc {
 407         struct rq_qos                   rqos;
 408
 409         bool                            enabled;
 410
 411         struct ioc_params               params;
 412         struct ioc_margins              margins;
 413         u32                             period_us;
 414         u32                             timer_slack_ns;
 415         u64                             vrate_min;
 416         u64                             vrate_max;
 417
 418         spinlock_t                      lock;
 419         struct timer_list               timer;
 420         struct list_head                active_iocgs;   /* active cgroups */
 421         struct ioc_pcpu_stat __percpu   *pcpu_stat;
 422
 423         enum ioc_running                running;
 424         atomic64_t                      vtime_rate;
 425         u64                             vtime_base_rate;
 426         s64                             vtime_err;
 427
 428         seqcount_spinlock_t             period_seqcount;
 429         u64                             period_at;      /* wallclock starttime */
 430         u64                             period_at_vtime; /* vtime starttime */
 431
 432         atomic64_t                      cur_period;     /* inc'd each period */
 433         int                             busy_level;     /* saturation history */
 434
 435         bool                            weights_updated;
 436         atomic_t                        hweight_gen;    /* for lazy hweights */
 437
 438         /* debt forgivness */
 439         u64                             dfgv_period_at;
 440         u64                             dfgv_period_rem;
 441         u64                             dfgv_usage_us_sum;
 442
 443         u64                             autop_too_fast_at;
 444         u64                             autop_too_slow_at;
 445         int                             autop_idx;
 446         bool                            user_qos_params:1;
 447         bool                            user_cost_model:1;
 448 };
 449
 450 struct iocg_pcpu_stat {
 451         local64_t                       abs_vusage;
 452 };
 453
 454 struct iocg_stat {
 455         u64                             usage_us;
 456         u64                             wait_us;
 457         u64                             indebt_us;
 458         u64                             indelay_us;
 459 };
 460
 461 /* per device-cgroup pair */
 462 struct ioc_gq {
 463         struct blkg_policy_data         pd;
 464         struct ioc                      *ioc;
 465
 466         /*
 467          * A iocg can get its weight from two sources - an explicit
 468          * per-device-cgroup configuration or the default weight of the
 469          * cgroup.  `cfg_weight` is the explicit per-device-cgroup
 470          * configuration.  `weight` is the effective considering both
 471          * sources.
 472          *
 473          * When an idle cgroup becomes active its `active` goes from 0 to
 474          * `weight`.  `inuse` is the surplus adjusted active weight.
 475          * `active` and `inuse` are used to calculate `hweight_active` and
 476          * `hweight_inuse`.
 477          *
 478          * `last_inuse` remembers `inuse` while an iocg is idle to persist
 479          * surplus adjustments.
 480          *
 481          * `inuse` may be adjusted dynamically during period. `saved_*` are used
 482          * to determine and track adjustments.
 483          */
 484         u32                             cfg_weight;
 485         u32                             weight;
 486         u32                             active;
 487         u32                             inuse;
 488
 489         u32                             last_inuse;
 490         s64                             saved_margin;
 491
 492         sector_t                        cursor;         /* to detect randio */
 493
 494         /*
 495          * `vtime` is this iocg's vtime cursor which progresses as IOs are
 496          * issued.  If lagging behind device vtime, the delta represents
 497          * the currently available IO budget.  If running ahead, the
 498          * overage.
 499          *
 500          * `vtime_done` is the same but progressed on completion rather
 501          * than issue.  The delta behind `vtime` represents the cost of
 502          * currently in-flight IOs.
 503          */
 504         atomic64_t                      vtime;
 505         atomic64_t                      done_vtime;
 506         u64                             abs_vdebt;
 507
 508         /* current delay in effect and when it started */
 509         u64                             delay;
 510         u64                             delay_at;
 511
 512         /*
 513          * The period this iocg was last active in.  Used for deactivation
 514          * and invalidating `vtime`.
 515          */
 516         atomic64_t                      active_period;
 517         struct list_head                active_list;
 518
 519         /* see __propagate_weights() and current_hweight() for details */
 520         u64                             child_active_sum;
 521         u64                             child_inuse_sum;
 522         u64                             child_adjusted_sum;
 523         int                             hweight_gen;
 524         u32                             hweight_active;
 525         u32                             hweight_inuse;
 526         u32                             hweight_donating;
 527         u32                             hweight_after_donation;
 528
 529         struct list_head                walk_list;
 530         struct list_head                surplus_list;
 531
 532         struct wait_queue_head          waitq;
 533         struct hrtimer                  waitq_timer;
 534
 535         /* timestamp at the latest activation */
 536         u64                             activated_at;
 537
 538         /* statistics */
 539         struct iocg_pcpu_stat __percpu  *pcpu_stat;
 540         struct iocg_stat                stat;
 541         struct iocg_stat                last_stat;
 542         u64                             last_stat_abs_vusage;
 543         u64                             usage_delta_us;
 544         u64                             wait_since;
 545         u64                             indebt_since;
 546         u64                             indelay_since;
 547
 548         /* this iocg's depth in the hierarchy and ancestors including self */
 549         int                             level;
 550         struct ioc_gq                   *ancestors[];
 551 };
 552
 553 /* per cgroup */
 554 struct ioc_cgrp {
 555         struct blkcg_policy_data        cpd;
 556         unsigned int                    dfl_weight;
 557 };
 558
 559 struct ioc_now {
 560         u64                             now_ns;
 561         u64                             now;
 562         u64                             vnow;
 563 };
 564
 565 struct iocg_wait {
 566         struct wait_queue_entry         wait;
 567         struct bio                      *bio;
 568         u64                             abs_cost;
 569         bool                            committed;
 570 };
 571
 572 struct iocg_wake_ctx {
 573         struct ioc_gq                   *iocg;
 574         u32                             hw_inuse;
 575         s64                             vbudget;
 576 };
 577
 578 static const struct ioc_params autop[] = {
 579         [AUTOP_HDD] = {
 580                 .qos                            = {
 581                         [QOS_RLAT]              =        250000, /* 250ms */
 582                         [QOS_WLAT]              =        250000,
 583                         [QOS_MIN]               = VRATE_MIN_PPM,
 584                         [QOS_MAX]               = VRATE_MAX_PPM,
 585                 },
 586                 .i_lcoefs                       = {
 587                         [I_LCOEF_RBPS]          =     174019176,
 588                         [I_LCOEF_RSEQIOPS]      =         41708,
 589                         [I_LCOEF_RRANDIOPS]     =           370,
 590                         [I_LCOEF_WBPS]          =     178075866,
 591                         [I_LCOEF_WSEQIOPS]      =         42705,
 592                         [I_LCOEF_WRANDIOPS]     =           378,
 593                 },
 594         },
 595         [AUTOP_SSD_QD1] = {
 596                 .qos                            = {
 597                         [QOS_RLAT]              =         25000, /* 25ms */
 598                         [QOS_WLAT]              =         25000,
 599                         [QOS_MIN]               = VRATE_MIN_PPM,
 600                         [QOS_MAX]               = VRATE_MAX_PPM,
 601                 },
 602                 .i_lcoefs                       = {
 603                         [I_LCOEF_RBPS]          =     245855193,
 604                         [I_LCOEF_RSEQIOPS]      =         61575,
 605                         [I_LCOEF_RRANDIOPS]     =          6946,
 606                         [I_LCOEF_WBPS]          =     141365009,
 607                         [I_LCOEF_WSEQIOPS]      =         33716,
 608                         [I_LCOEF_WRANDIOPS]     =         26796,
 609                 },
 610         },
 611         [AUTOP_SSD_DFL] = {
 612                 .qos                            = {
 613                         [QOS_RLAT]              =         25000, /* 25ms */
 614                         [QOS_WLAT]              =         25000,
 615                         [QOS_MIN]               = VRATE_MIN_PPM,
 616                         [QOS_MAX]               = VRATE_MAX_PPM,
 617                 },
 618                 .i_lcoefs                       = {
 619                         [I_LCOEF_RBPS]          =     488636629,
 620                         [I_LCOEF_RSEQIOPS]      =          8932,
 621                         [I_LCOEF_RRANDIOPS]     =          8518,
 622                         [I_LCOEF_WBPS]          =     427891549,
 623                         [I_LCOEF_WSEQIOPS]      =         28755,
 624                         [I_LCOEF_WRANDIOPS]     =         21940,
 625                 },
 626                 .too_fast_vrate_pct             =           500,
 627         },
 628         [AUTOP_SSD_FAST] = {
 629                 .qos                            = {
 630                         [QOS_RLAT]              =          5000, /* 5ms */
 631                         [QOS_WLAT]              =          5000,
 632                         [QOS_MIN]               = VRATE_MIN_PPM,
 633                         [QOS_MAX]               = VRATE_MAX_PPM,
 634                 },
 635                 .i_lcoefs                       = {
 636                         [I_LCOEF_RBPS]          =    3102524156LLU,
 637                         [I_LCOEF_RSEQIOPS]      =        724816,
 638                         [I_LCOEF_RRANDIOPS]     =        778122,
 639                         [I_LCOEF_WBPS]          =    1742780862LLU,
 640                         [I_LCOEF_WSEQIOPS]      =        425702,
 641                         [I_LCOEF_WRANDIOPS]     =        443193,
 642                 },
 643                 .too_slow_vrate_pct             =            10,
 644         },
 645 };
 646
 647 /*
 648  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 649  * vtime credit shortage and down on device saturation.
 650  */
 651 static u32 vrate_adj_pct[] =
 652         { 0, 0, 0, 0,
 653           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 654           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 655           4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
 656
 657 static struct blkcg_policy blkcg_policy_iocost;
 658
 659 /* accessors and helpers */
 660 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
 661 {
 662         return container_of(rqos, struct ioc, rqos);
 663 }
 664
 665 static struct ioc *q_to_ioc(struct request_queue *q)
 666 {
 667         return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
 668 }
 669
 670 static const char __maybe_unused *ioc_name(struct ioc *ioc)
 671 {
 672         struct gendisk *disk = ioc->rqos.disk;
 673
 674         if (!disk)
 675                 return "<unknown>";
 676         return disk->disk_name;
 677 }
 678
 679 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
 680 {
 681         return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
 682 }
 683
 684 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
 685 {
 686         return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
 687 }
 688
 689 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
 690 {
 691         return pd_to_blkg(&iocg->pd);
 692 }
 693
 694 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
 695 {
 696         return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
 697                             struct ioc_cgrp, cpd);
 698 }
 699
 700 /*
 701  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 702  * weight, the more expensive each IO.  Must round up.
 703  */
 704 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
 705 {
 706         return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
 707 }
 708
 709 /*
 710  * The inverse of abs_cost_to_cost().  Must round up.
 711  */
 712 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
 713 {
 714         return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
 715 }
 716
 717 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
 718                             u64 abs_cost, u64 cost)
 719 {
 720         struct iocg_pcpu_stat *gcs;
 721
 722         bio->bi_iocost_cost = cost;
 723         atomic64_add(cost, &iocg->vtime);
 724
 725         gcs = get_cpu_ptr(iocg->pcpu_stat);
 726         local64_add(abs_cost, &gcs->abs_vusage);
 727         put_cpu_ptr(gcs);
 728 }
 729
 730 static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
 731 {
 732         if (lock_ioc) {
 733                 spin_lock_irqsave(&iocg->ioc->lock, *flags);
 734                 spin_lock(&iocg->waitq.lock);
 735         } else {
 736                 spin_lock_irqsave(&iocg->waitq.lock, *flags);
 737         }
 738 }
 739
 740 static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
 741 {
 742         if (unlock_ioc) {
 743                 spin_unlock(&iocg->waitq.lock);
 744                 spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
 745         } else {
 746                 spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
 747         }
 748 }
 749
 750 #define CREATE_TRACE_POINTS
 751 #include <trace/events/iocost.h>
 752
 753 static void ioc_refresh_margins(struct ioc *ioc)
 754 {
 755         struct ioc_margins *margins = &ioc->margins;
 756         u32 period_us = ioc->period_us;
 757         u64 vrate = ioc->vtime_base_rate;
 758
 759         margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
 760         margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
 761         margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
 762 }
 763
 764 /* latency Qos params changed, update period_us and all the dependent params */
 765 static void ioc_refresh_period_us(struct ioc *ioc)
 766 {
 767         u32 ppm, lat, multi, period_us;
 768
 769         lockdep_assert_held(&ioc->lock);
 770
 771         /* pick the higher latency target */
 772         if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
 773                 ppm = ioc->params.qos[QOS_RPPM];
 774                 lat = ioc->params.qos[QOS_RLAT];
 775         } else {
 776                 ppm = ioc->params.qos[QOS_WPPM];
 777                 lat = ioc->params.qos[QOS_WLAT];
 778         }
 779
 780         /*
 781          * We want the period to be long enough to contain a healthy number
 782          * of IOs while short enough for granular control.  Define it as a
 783          * multiple of the latency target.  Ideally, the multiplier should
 784          * be scaled according to the percentile so that it would nominally
 785          * contain a certain number of requests.  Let's be simpler and
 786          * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
 787          */
 788         if (ppm)
 789                 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
 790         else
 791                 multi = 2;
 792         period_us = multi * lat;
 793         period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
 794
 795         /* calculate dependent params */
 796         ioc->period_us = period_us;
 797         ioc->timer_slack_ns = div64_u64(
 798                 (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
 799                 100);
 800         ioc_refresh_margins(ioc);
 801 }
 802
 803 /*
 804  *  ioc->rqos.disk isn't initialized when this function is called from
 805  *  the init path.
 806  */
 807 static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
 808 {
 809         int idx = ioc->autop_idx;
 810         const struct ioc_params *p = &autop[idx];
 811         u32 vrate_pct;
 812         u64 now_ns;
 813
 814         /* rotational? */
 815         if (!blk_queue_nonrot(disk->queue))
 816                 return AUTOP_HDD;
 817
 818         /* handle SATA SSDs w/ broken NCQ */
 819         if (blk_queue_depth(disk->queue) == 1)
 820                 return AUTOP_SSD_QD1;
 821
 822         /* use one of the normal ssd sets */
 823         if (idx < AUTOP_SSD_DFL)
 824                 return AUTOP_SSD_DFL;
 825
 826         /* if user is overriding anything, maintain what was there */
 827         if (ioc->user_qos_params || ioc->user_cost_model)
 828                 return idx;
 829
 830         /* step up/down based on the vrate */
 831         vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
 832         now_ns = blk_time_get_ns();
 833
 834         if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
 835                 if (!ioc->autop_too_fast_at)
 836                         ioc->autop_too_fast_at = now_ns;
 837                 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
 838                         return idx + 1;
 839         } else {
 840                 ioc->autop_too_fast_at = 0;
 841         }
 842
 843         if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
 844                 if (!ioc->autop_too_slow_at)
 845                         ioc->autop_too_slow_at = now_ns;
 846                 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
 847                         return idx - 1;
 848         } else {
 849                 ioc->autop_too_slow_at = 0;
 850         }
 851
 852         return idx;
 853 }
 854
 855 /*
 856  * Take the followings as input
 857  *
 858  *  @bps        maximum sequential throughput
 859  *  @seqiops    maximum sequential 4k iops
 860  *  @randiops   maximum random 4k iops
 861  *
 862  * and calculate the linear model cost coefficients.
 863  *
 864  *  *@page      per-page cost           1s / (@bps / 4096)
 865  *  *@seqio     base cost of a seq IO   max((1s / @seqiops) - *@page, 0)
 866  *  @randiops   base cost of a rand IO  max((1s / @randiops) - *@page, 0)
 867  */
 868 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
 869                         u64 *page, u64 *seqio, u64 *randio)
 870 {
 871         u64 v;
 872
 873         *page = *seqio = *randio = 0;
 874
 875         if (bps) {
 876                 u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
 877
 878                 if (bps_pages)
 879                         *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
 880                 else
 881                         *page = 1;
 882         }
 883
 884         if (seqiops) {
 885                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
 886                 if (v > *page)
 887                         *seqio = v - *page;
 888         }
 889
 890         if (randiops) {
 891                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
 892                 if (v > *page)
 893                         *randio = v - *page;
 894         }
 895 }
 896
 897 static void ioc_refresh_lcoefs(struct ioc *ioc)
 898 {
 899         u64 *u = ioc->params.i_lcoefs;
 900         u64 *c = ioc->params.lcoefs;
 901
 902         calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
 903                     &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
 904         calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
 905                     &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
 906 }
 907
 908 /*
 909  * struct gendisk is required as an argument because ioc->rqos.disk
 910  * is not properly initialized when called from the init path.
 911  */
 912 static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
 913                                     struct gendisk *disk)
 914 {
 915         const struct ioc_params *p;
 916         int idx;
 917
 918         lockdep_assert_held(&ioc->lock);
 919
 920         idx = ioc_autop_idx(ioc, disk);
 921         p = &autop[idx];
 922
 923         if (idx == ioc->autop_idx && !force)
 924                 return false;
 925
 926         if (idx != ioc->autop_idx) {
 927                 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
 928                 ioc->vtime_base_rate = VTIME_PER_USEC;
 929         }
 930
 931         ioc->autop_idx = idx;
 932         ioc->autop_too_fast_at = 0;
 933         ioc->autop_too_slow_at = 0;
 934
 935         if (!ioc->user_qos_params)
 936                 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
 937         if (!ioc->user_cost_model)
 938                 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
 939
 940         ioc_refresh_period_us(ioc);
 941         ioc_refresh_lcoefs(ioc);
 942
 943         ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
 944                                             VTIME_PER_USEC, MILLION);
 945         ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
 946                                             VTIME_PER_USEC, MILLION);
 947
 948         return true;
 949 }
 950
 951 static bool ioc_refresh_params(struct ioc *ioc, bool force)
 952 {
 953         return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk);
 954 }
 955
 956 /*
 957  * When an iocg accumulates too much vtime or gets deactivated, we throw away
 958  * some vtime, which lowers the overall device utilization. As the exact amount
 959  * which is being thrown away is known, we can compensate by accelerating the
 960  * vrate accordingly so that the extra vtime generated in the current period
 961  * matches what got lost.
 962  */
 963 static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
 964 {
 965         s64 pleft = ioc->period_at + ioc->period_us - now->now;
 966         s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
 967         s64 vcomp, vcomp_min, vcomp_max;
 968
 969         lockdep_assert_held(&ioc->lock);
 970
 971         /* we need some time left in this period */
 972         if (pleft <= 0)
 973                 goto done;
 974
 975         /*
 976          * Calculate how much vrate should be adjusted to offset the error.
 977          * Limit the amount of adjustment and deduct the adjusted amount from
 978          * the error.
 979          */
 980         vcomp = -div64_s64(ioc->vtime_err, pleft);
 981         vcomp_min = -(ioc->vtime_base_rate >> 1);
 982         vcomp_max = ioc->vtime_base_rate;
 983         vcomp = clamp(vcomp, vcomp_min, vcomp_max);
 984
 985         ioc->vtime_err += vcomp * pleft;
 986
 987         atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
 988 done:
 989         /* bound how much error can accumulate */
 990         ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
 991 }
 992
 993 static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
 994                                   int nr_lagging, int nr_shortages,
 995                                   int prev_busy_level, u32 *missed_ppm)
 996 {
 997         u64 vrate = ioc->vtime_base_rate;
 998         u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
 999
1000         if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
1001                 if (ioc->busy_level != prev_busy_level || nr_lagging)
1002                         trace_iocost_ioc_vrate_adj(ioc, vrate,
1003                                                    missed_ppm, rq_wait_pct,
1004                                                    nr_lagging, nr_shortages);
1005
1006                 return;
1007         }
1008
1009         /*
1010          * If vrate is out of bounds, apply clamp gradually as the
1011          * bounds can change abruptly.  Otherwise, apply busy_level
1012          * based adjustment.
1013          */
1014         if (vrate < vrate_min) {
1015                 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
1016                 vrate = min(vrate, vrate_min);
1017         } else if (vrate > vrate_max) {
1018                 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
1019                 vrate = max(vrate, vrate_max);
1020         } else {
1021                 int idx = min_t(int, abs(ioc->busy_level),
1022                                 ARRAY_SIZE(vrate_adj_pct) - 1);
1023                 u32 adj_pct = vrate_adj_pct[idx];
1024
1025                 if (ioc->busy_level > 0)
1026                         adj_pct = 100 - adj_pct;
1027                 else
1028                         adj_pct = 100 + adj_pct;
1029
1030                 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1031                               vrate_min, vrate_max);
1032         }
1033
1034         trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1035                                    nr_lagging, nr_shortages);
1036
1037         ioc->vtime_base_rate = vrate;
1038         ioc_refresh_margins(ioc);
1039 }
1040
1041 /* take a snapshot of the current [v]time and vrate */
1042 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
1043 {
1044         unsigned seq;
1045         u64 vrate;
1046
1047         now->now_ns = blk_time_get_ns();
1048         now->now = ktime_to_us(now->now_ns);
1049         vrate = atomic64_read(&ioc->vtime_rate);
1050
1051         /*
1052          * The current vtime is
1053          *
1054          *   vtime at period start + (wallclock time since the start) * vrate
1055          *
1056          * As a consistent snapshot of `period_at_vtime` and `period_at` is
1057          * needed, they're seqcount protected.
1058          */
1059         do {
1060                 seq = read_seqcount_begin(&ioc->period_seqcount);
1061                 now->vnow = ioc->period_at_vtime +
1062                         (now->now - ioc->period_at) * vrate;
1063         } while (read_seqcount_retry(&ioc->period_seqcount, seq));
1064 }
1065
1066 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
1067 {
1068         WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1069
1070         write_seqcount_begin(&ioc->period_seqcount);
1071         ioc->period_at = now->now;
1072         ioc->period_at_vtime = now->vnow;
1073         write_seqcount_end(&ioc->period_seqcount);
1074
1075         ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1076         add_timer(&ioc->timer);
1077 }
1078
1079 /*
1080  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
1081  * weight sums and propagate upwards accordingly. If @save, the current margin
1082  * is saved to be used as reference for later inuse in-period adjustments.
1083  */
1084 static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1085                                 bool save, struct ioc_now *now)
1086 {
1087         struct ioc *ioc = iocg->ioc;
1088         int lvl;
1089
1090         lockdep_assert_held(&ioc->lock);
1091
1092         /*
1093          * For an active leaf node, its inuse shouldn't be zero or exceed
1094          * @active. An active internal node's inuse is solely determined by the
1095          * inuse to active ratio of its children regardless of @inuse.
1096          */
1097         if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
1098                 inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
1099                                            iocg->child_active_sum);
1100         } else {
1101                 inuse = clamp_t(u32, inuse, 1, active);
1102         }
1103
1104         iocg->last_inuse = iocg->inuse;
1105         if (save)
1106                 iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1107
1108         if (active == iocg->active && inuse == iocg->inuse)
1109                 return;
1110
1111         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1112                 struct ioc_gq *parent = iocg->ancestors[lvl];
1113                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1114                 u32 parent_active = 0, parent_inuse = 0;
1115
1116                 /* update the level sums */
1117                 parent->child_active_sum += (s32)(active - child->active);
1118                 parent->child_inuse_sum += (s32)(inuse - child->inuse);
1119                 /* apply the updates */
1120                 child->active = active;
1121                 child->inuse = inuse;
1122
1123                 /*
1124                  * The delta between inuse and active sums indicates that
1125                  * much of weight is being given away.  Parent's inuse
1126                  * and active should reflect the ratio.
1127                  */
1128                 if (parent->child_active_sum) {
1129                         parent_active = parent->weight;
1130                         parent_inuse = DIV64_U64_ROUND_UP(
1131                                 parent_active * parent->child_inuse_sum,
1132                                 parent->child_active_sum);
1133                 }
1134
1135                 /* do we need to keep walking up? */
1136                 if (parent_active == parent->active &&
1137                     parent_inuse == parent->inuse)
1138                         break;
1139
1140                 active = parent_active;
1141                 inuse = parent_inuse;
1142         }
1143
1144         ioc->weights_updated = true;
1145 }
1146
1147 static void commit_weights(struct ioc *ioc)
1148 {
1149         lockdep_assert_held(&ioc->lock);
1150
1151         if (ioc->weights_updated) {
1152                 /* paired with rmb in current_hweight(), see there */
1153                 smp_wmb();
1154                 atomic_inc(&ioc->hweight_gen);
1155                 ioc->weights_updated = false;
1156         }
1157 }
1158
1159 static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1160                               bool save, struct ioc_now *now)
1161 {
1162         __propagate_weights(iocg, active, inuse, save, now);
1163         commit_weights(iocg->ioc);
1164 }
1165
1166 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1167 {
1168         struct ioc *ioc = iocg->ioc;
1169         int lvl;
1170         u32 hwa, hwi;
1171         int ioc_gen;
1172
1173         /* hot path - if uptodate, use cached */
1174         ioc_gen = atomic_read(&ioc->hweight_gen);
1175         if (ioc_gen == iocg->hweight_gen)
1176                 goto out;
1177
1178         /*
1179          * Paired with wmb in commit_weights(). If we saw the updated
1180          * hweight_gen, all the weight updates from __propagate_weights() are
1181          * visible too.
1182          *
1183          * We can race with weight updates during calculation and get it
1184          * wrong.  However, hweight_gen would have changed and a future
1185          * reader will recalculate and we're guaranteed to discard the
1186          * wrong result soon.
1187          */
1188         smp_rmb();
1189
1190         hwa = hwi = WEIGHT_ONE;
1191         for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1192                 struct ioc_gq *parent = iocg->ancestors[lvl];
1193                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1194                 u64 active_sum = READ_ONCE(parent->child_active_sum);
1195                 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
1196                 u32 active = READ_ONCE(child->active);
1197                 u32 inuse = READ_ONCE(child->inuse);
1198
1199                 /* we can race with deactivations and either may read as zero */
1200                 if (!active_sum || !inuse_sum)
1201                         continue;
1202
1203                 active_sum = max_t(u64, active, active_sum);
1204                 hwa = div64_u64((u64)hwa * active, active_sum);
1205
1206                 inuse_sum = max_t(u64, inuse, inuse_sum);
1207                 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
1208         }
1209
1210         iocg->hweight_active = max_t(u32, hwa, 1);
1211         iocg->hweight_inuse = max_t(u32, hwi, 1);
1212         iocg->hweight_gen = ioc_gen;
1213 out:
1214         if (hw_activep)
1215                 *hw_activep = iocg->hweight_active;
1216         if (hw_inusep)
1217                 *hw_inusep = iocg->hweight_inuse;
1218 }
1219
1220 /*
1221  * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1222  * other weights stay unchanged.
1223  */
1224 static u32 current_hweight_max(struct ioc_gq *iocg)
1225 {
1226         u32 hwm = WEIGHT_ONE;
1227         u32 inuse = iocg->active;
1228         u64 child_inuse_sum;
1229         int lvl;
1230
1231         lockdep_assert_held(&iocg->ioc->lock);
1232
1233         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1234                 struct ioc_gq *parent = iocg->ancestors[lvl];
1235                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1236
1237                 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1238                 hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1239                 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1240                                            parent->child_active_sum);
1241         }
1242
1243         return max_t(u32, hwm, 1);
1244 }
1245
1246 static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
1247 {
1248         struct ioc *ioc = iocg->ioc;
1249         struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1250         struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1251         u32 weight;
1252
1253         lockdep_assert_held(&ioc->lock);
1254
1255         weight = iocg->cfg_weight ?: iocc->dfl_weight;
1256         if (weight != iocg->weight && iocg->active)
1257                 propagate_weights(iocg, weight, iocg->inuse, true, now);
1258         iocg->weight = weight;
1259 }
1260
1261 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1262 {
1263         struct ioc *ioc = iocg->ioc;
1264         u64 __maybe_unused last_period, cur_period;
1265         u64 vtime, vtarget;
1266         int i;
1267
1268         /*
1269          * If seem to be already active, just update the stamp to tell the
1270          * timer that we're still active.  We don't mind occassional races.
1271          */
1272         if (!list_empty(&iocg->active_list)) {
1273                 ioc_now(ioc, now);
1274                 cur_period = atomic64_read(&ioc->cur_period);
1275                 if (atomic64_read(&iocg->active_period) != cur_period)
1276                         atomic64_set(&iocg->active_period, cur_period);
1277                 return true;
1278         }
1279
1280         /* racy check on internal node IOs, treat as root level IOs */
1281         if (iocg->child_active_sum)
1282                 return false;
1283
1284         spin_lock_irq(&ioc->lock);
1285
1286         ioc_now(ioc, now);
1287
1288         /* update period */
1289         cur_period = atomic64_read(&ioc->cur_period);
1290         last_period = atomic64_read(&iocg->active_period);
1291         atomic64_set(&iocg->active_period, cur_period);
1292
1293         /* already activated or breaking leaf-only constraint? */
1294         if (!list_empty(&iocg->active_list))
1295                 goto succeed_unlock;
1296         for (i = iocg->level - 1; i > 0; i--)
1297                 if (!list_empty(&iocg->ancestors[i]->active_list))
1298                         goto fail_unlock;
1299
1300         if (iocg->child_active_sum)
1301                 goto fail_unlock;
1302
1303         /*
1304          * Always start with the target budget. On deactivation, we throw away
1305          * anything above it.
1306          */
1307         vtarget = now->vnow - ioc->margins.target;
1308         vtime = atomic64_read(&iocg->vtime);
1309
1310         atomic64_add(vtarget - vtime, &iocg->vtime);
1311         atomic64_add(vtarget - vtime, &iocg->done_vtime);
1312         vtime = vtarget;
1313
1314         /*
1315          * Activate, propagate weight and start period timer if not
1316          * running.  Reset hweight_gen to avoid accidental match from
1317          * wrapping.
1318          */
1319         iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1320         list_add(&iocg->active_list, &ioc->active_iocgs);
1321
1322         propagate_weights(iocg, iocg->weight,
1323                           iocg->last_inuse ?: iocg->weight, true, now);
1324
1325         TRACE_IOCG_PATH(iocg_activate, iocg, now,
1326                         last_period, cur_period, vtime);
1327
1328         iocg->activated_at = now->now;
1329
1330         if (ioc->running == IOC_IDLE) {
1331                 ioc->running = IOC_RUNNING;
1332                 ioc->dfgv_period_at = now->now;
1333                 ioc->dfgv_period_rem = 0;
1334                 ioc_start_period(ioc, now);
1335         }
1336
1337 succeed_unlock:
1338         spin_unlock_irq(&ioc->lock);
1339         return true;
1340
1341 fail_unlock:
1342         spin_unlock_irq(&ioc->lock);
1343         return false;
1344 }
1345
1346 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1347 {
1348         struct ioc *ioc = iocg->ioc;
1349         struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1350         u64 tdelta, delay, new_delay;
1351         s64 vover, vover_pct;
1352         u32 hwa;
1353
1354         lockdep_assert_held(&iocg->waitq.lock);
1355
1356         /*
1357          * If the delay is set by another CPU, we may be in the past. No need to
1358          * change anything if so. This avoids decay calculation underflow.
1359          */
1360         if (time_before64(now->now, iocg->delay_at))
1361                 return false;
1362
1363         /* calculate the current delay in effect - 1/2 every second */
1364         tdelta = now->now - iocg->delay_at;
1365         if (iocg->delay)
1366                 delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
1367         else
1368                 delay = 0;
1369
1370         /* calculate the new delay from the debt amount */
1371         current_hweight(iocg, &hwa, NULL);
1372         vover = atomic64_read(&iocg->vtime) +
1373                 abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
1374         vover_pct = div64_s64(100 * vover,
1375                               ioc->period_us * ioc->vtime_base_rate);
1376
1377         if (vover_pct <= MIN_DELAY_THR_PCT)
1378                 new_delay = 0;
1379         else if (vover_pct >= MAX_DELAY_THR_PCT)
1380                 new_delay = MAX_DELAY;
1381         else
1382                 new_delay = MIN_DELAY +
1383                         div_u64((MAX_DELAY - MIN_DELAY) *
1384                                 (vover_pct - MIN_DELAY_THR_PCT),
1385                                 MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1386
1387         /* pick the higher one and apply */
1388         if (new_delay > delay) {
1389                 iocg->delay = new_delay;
1390                 iocg->delay_at = now->now;
1391                 delay = new_delay;
1392         }
1393
1394         if (delay >= MIN_DELAY) {
1395                 if (!iocg->indelay_since)
1396                         iocg->indelay_since = now->now;
1397                 blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1398                 return true;
1399         } else {
1400                 if (iocg->indelay_since) {
1401                         iocg->stat.indelay_us += now->now - iocg->indelay_since;
1402                         iocg->indelay_since = 0;
1403                 }
1404                 iocg->delay = 0;
1405                 blkcg_clear_delay(blkg);
1406                 return false;
1407         }
1408 }
1409
1410 static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1411                             struct ioc_now *now)
1412 {
1413         struct iocg_pcpu_stat *gcs;
1414
1415         lockdep_assert_held(&iocg->ioc->lock);
1416         lockdep_assert_held(&iocg->waitq.lock);
1417         WARN_ON_ONCE(list_empty(&iocg->active_list));
1418
1419         /*
1420          * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1421          * inuse donating all of it share to others until its debt is paid off.
1422          */
1423         if (!iocg->abs_vdebt && abs_cost) {
1424                 iocg->indebt_since = now->now;
1425                 propagate_weights(iocg, iocg->active, 0, false, now);
1426         }
1427
1428         iocg->abs_vdebt += abs_cost;
1429
1430         gcs = get_cpu_ptr(iocg->pcpu_stat);
1431         local64_add(abs_cost, &gcs->abs_vusage);
1432         put_cpu_ptr(gcs);
1433 }
1434
1435 static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1436                           struct ioc_now *now)
1437 {
1438         lockdep_assert_held(&iocg->ioc->lock);
1439         lockdep_assert_held(&iocg->waitq.lock);
1440
1441         /* make sure that nobody messed with @iocg */
1442         WARN_ON_ONCE(list_empty(&iocg->active_list));
1443         WARN_ON_ONCE(iocg->inuse > 1);
1444
1445         iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1446
1447         /* if debt is paid in full, restore inuse */
1448         if (!iocg->abs_vdebt) {
1449                 iocg->stat.indebt_us += now->now - iocg->indebt_since;
1450                 iocg->indebt_since = 0;
1451
1452                 propagate_weights(iocg, iocg->active, iocg->last_inuse,
1453                                   false, now);
1454         }
1455 }
1456
1457 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1458                         int flags, void *key)
1459 {
1460         struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1461         struct iocg_wake_ctx *ctx = key;
1462         u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1463
1464         ctx->vbudget -= cost;
1465
1466         if (ctx->vbudget < 0)
1467                 return -1;
1468
1469         iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1470         wait->committed = true;
1471
1472         /*
1473          * autoremove_wake_function() removes the wait entry only when it
1474          * actually changed the task state. We want the wait always removed.
1475          * Remove explicitly and use default_wake_function(). Note that the
1476          * order of operations is important as finish_wait() tests whether
1477          * @wq_entry is removed without grabbing the lock.
1478          */
1479         default_wake_function(wq_entry, mode, flags, key);
1480         list_del_init_careful(&wq_entry->entry);
1481         return 0;
1482 }
1483
1484 /*
1485  * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1486  * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1487  * addition to iocg->waitq.lock.
1488  */
1489 static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1490                             struct ioc_now *now)
1491 {
1492         struct ioc *ioc = iocg->ioc;
1493         struct iocg_wake_ctx ctx = { .iocg = iocg };
1494         u64 vshortage, expires, oexpires;
1495         s64 vbudget;
1496         u32 hwa;
1497
1498         lockdep_assert_held(&iocg->waitq.lock);
1499
1500         current_hweight(iocg, &hwa, NULL);
1501         vbudget = now->vnow - atomic64_read(&iocg->vtime);
1502
1503         /* pay off debt */
1504         if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1505                 u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1506                 u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1507                 u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
1508
1509                 lockdep_assert_held(&ioc->lock);
1510
1511                 atomic64_add(vpay, &iocg->vtime);
1512                 atomic64_add(vpay, &iocg->done_vtime);
1513                 iocg_pay_debt(iocg, abs_vpay, now);
1514                 vbudget -= vpay;
1515         }
1516
1517         if (iocg->abs_vdebt || iocg->delay)
1518                 iocg_kick_delay(iocg, now);
1519
1520         /*
1521          * Debt can still be outstanding if we haven't paid all yet or the
1522          * caller raced and called without @pay_debt. Shouldn't wake up waiters
1523          * under debt. Make sure @vbudget reflects the outstanding amount and is
1524          * not positive.
1525          */
1526         if (iocg->abs_vdebt) {
1527                 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
1528                 vbudget = min_t(s64, 0, vbudget - vdebt);
1529         }
1530
1531         /*
1532          * Wake up the ones which are due and see how much vtime we'll need for
1533          * the next one. As paying off debt restores hw_inuse, it must be read
1534          * after the above debt payment.
1535          */
1536         ctx.vbudget = vbudget;
1537         current_hweight(iocg, NULL, &ctx.hw_inuse);
1538
1539         __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1540
1541         if (!waitqueue_active(&iocg->waitq)) {
1542                 if (iocg->wait_since) {
1543                         iocg->stat.wait_us += now->now - iocg->wait_since;
1544                         iocg->wait_since = 0;
1545                 }
1546                 return;
1547         }
1548
1549         if (!iocg->wait_since)
1550                 iocg->wait_since = now->now;
1551
1552         if (WARN_ON_ONCE(ctx.vbudget >= 0))
1553                 return;
1554
1555         /* determine next wakeup, add a timer margin to guarantee chunking */
1556         vshortage = -ctx.vbudget;
1557         expires = now->now_ns +
1558                 DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1559                 NSEC_PER_USEC;
1560         expires += ioc->timer_slack_ns;
1561
1562         /* if already active and close enough, don't bother */
1563         oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1564         if (hrtimer_is_queued(&iocg->waitq_timer) &&
1565             abs(oexpires - expires) <= ioc->timer_slack_ns)
1566                 return;
1567
1568         hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1569                                ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1570 }
1571
1572 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1573 {
1574         struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1575         bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1576         struct ioc_now now;
1577         unsigned long flags;
1578
1579         ioc_now(iocg->ioc, &now);
1580
1581         iocg_lock(iocg, pay_debt, &flags);
1582         iocg_kick_waitq(iocg, pay_debt, &now);
1583         iocg_unlock(iocg, pay_debt, &flags);
1584
1585         return HRTIMER_NORESTART;
1586 }
1587
1588 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1589 {
1590         u32 nr_met[2] = { };
1591         u32 nr_missed[2] = { };
1592         u64 rq_wait_ns = 0;
1593         int cpu, rw;
1594
1595         for_each_online_cpu(cpu) {
1596                 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1597                 u64 this_rq_wait_ns;
1598
1599                 for (rw = READ; rw <= WRITE; rw++) {
1600                         u32 this_met = local_read(&stat->missed[rw].nr_met);
1601                         u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1602
1603                         nr_met[rw] += this_met - stat->missed[rw].last_met;
1604                         nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1605                         stat->missed[rw].last_met = this_met;
1606                         stat->missed[rw].last_missed = this_missed;
1607                 }
1608
1609                 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1610                 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1611                 stat->last_rq_wait_ns = this_rq_wait_ns;
1612         }
1613
1614         for (rw = READ; rw <= WRITE; rw++) {
1615                 if (nr_met[rw] + nr_missed[rw])
1616                         missed_ppm_ar[rw] =
1617                                 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1618                                                    nr_met[rw] + nr_missed[rw]);
1619                 else
1620                         missed_ppm_ar[rw] = 0;
1621         }
1622
1623         *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1624                                    ioc->period_us * NSEC_PER_USEC);
1625 }
1626
1627 /* was iocg idle this period? */
1628 static bool iocg_is_idle(struct ioc_gq *iocg)
1629 {
1630         struct ioc *ioc = iocg->ioc;
1631
1632         /* did something get issued this period? */
1633         if (atomic64_read(&iocg->active_period) ==
1634             atomic64_read(&ioc->cur_period))
1635                 return false;
1636
1637         /* is something in flight? */
1638         if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1639                 return false;
1640
1641         return true;
1642 }
1643
1644 /*
1645  * Call this function on the target leaf @iocg's to build pre-order traversal
1646  * list of all the ancestors in @inner_walk. The inner nodes are linked through
1647  * ->walk_list and the caller is responsible for dissolving the list after use.
1648  */
1649 static void iocg_build_inner_walk(struct ioc_gq *iocg,
1650                                   struct list_head *inner_walk)
1651 {
1652         int lvl;
1653
1654         WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1655
1656         /* find the first ancestor which hasn't been visited yet */
1657         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1658                 if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1659                         break;
1660         }
1661
1662         /* walk down and visit the inner nodes to get pre-order traversal */
1663         while (++lvl <= iocg->level - 1) {
1664                 struct ioc_gq *inner = iocg->ancestors[lvl];
1665
1666                 /* record traversal order */
1667                 list_add_tail(&inner->walk_list, inner_walk);
1668         }
1669 }
1670
1671 /* propagate the deltas to the parent */
1672 static void iocg_flush_stat_upward(struct ioc_gq *iocg)
1673 {
1674         if (iocg->level > 0) {
1675                 struct iocg_stat *parent_stat =
1676                         &iocg->ancestors[iocg->level - 1]->stat;
1677
1678                 parent_stat->usage_us +=
1679                         iocg->stat.usage_us - iocg->last_stat.usage_us;
1680                 parent_stat->wait_us +=
1681                         iocg->stat.wait_us - iocg->last_stat.wait_us;
1682                 parent_stat->indebt_us +=
1683                         iocg->stat.indebt_us - iocg->last_stat.indebt_us;
1684                 parent_stat->indelay_us +=
1685                         iocg->stat.indelay_us - iocg->last_stat.indelay_us;
1686         }
1687
1688         iocg->last_stat = iocg->stat;
1689 }
1690
1691 /* collect per-cpu counters and propagate the deltas to the parent */
1692 static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
1693 {
1694         struct ioc *ioc = iocg->ioc;
1695         u64 abs_vusage = 0;
1696         u64 vusage_delta;
1697         int cpu;
1698
1699         lockdep_assert_held(&iocg->ioc->lock);
1700
1701         /* collect per-cpu counters */
1702         for_each_possible_cpu(cpu) {
1703                 abs_vusage += local64_read(
1704                                 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1705         }
1706         vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1707         iocg->last_stat_abs_vusage = abs_vusage;
1708
1709         iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
1710         iocg->stat.usage_us += iocg->usage_delta_us;
1711
1712         iocg_flush_stat_upward(iocg);
1713 }
1714
1715 /* get stat counters ready for reading on all active iocgs */
1716 static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1717 {
1718         LIST_HEAD(inner_walk);
1719         struct ioc_gq *iocg, *tiocg;
1720
1721         /* flush leaves and build inner node walk list */
1722         list_for_each_entry(iocg, target_iocgs, active_list) {
1723                 iocg_flush_stat_leaf(iocg, now);
1724                 iocg_build_inner_walk(iocg, &inner_walk);
1725         }
1726
1727         /* keep flushing upwards by walking the inner list backwards */
1728         list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1729                 iocg_flush_stat_upward(iocg);
1730                 list_del_init(&iocg->walk_list);
1731         }
1732 }
1733
1734 /*
1735  * Determine what @iocg's hweight_inuse should be after donating unused
1736  * capacity. @hwm is the upper bound and used to signal no donation. This
1737  * function also throws away @iocg's excess budget.
1738  */
1739 static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1740                                   u32 usage, struct ioc_now *now)
1741 {
1742         struct ioc *ioc = iocg->ioc;
1743         u64 vtime = atomic64_read(&iocg->vtime);
1744         s64 excess, delta, target, new_hwi;
1745
1746         /* debt handling owns inuse for debtors */
1747         if (iocg->abs_vdebt)
1748                 return 1;
1749
1750         /* see whether minimum margin requirement is met */
1751         if (waitqueue_active(&iocg->waitq) ||
1752             time_after64(vtime, now->vnow - ioc->margins.min))
1753                 return hwm;
1754
1755         /* throw away excess above target */
1756         excess = now->vnow - vtime - ioc->margins.target;
1757         if (excess > 0) {
1758                 atomic64_add(excess, &iocg->vtime);
1759                 atomic64_add(excess, &iocg->done_vtime);
1760                 vtime += excess;
1761                 ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
1762         }
1763
1764         /*
1765          * Let's say the distance between iocg's and device's vtimes as a
1766          * fraction of period duration is delta. Assuming that the iocg will
1767          * consume the usage determined above, we want to determine new_hwi so
1768          * that delta equals MARGIN_TARGET at the end of the next period.
1769          *
1770          * We need to execute usage worth of IOs while spending the sum of the
1771          * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1772          * (delta):
1773          *
1774          *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
1775          *
1776          * Therefore, the new_hwi is:
1777          *
1778          *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
1779          */
1780         delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1781                           now->vnow - ioc->period_at_vtime);
1782         target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1783         new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
1784
1785         return clamp_t(s64, new_hwi, 1, hwm);
1786 }
1787
1788 /*
1789  * For work-conservation, an iocg which isn't using all of its share should
1790  * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1791  * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1792  *
1793  * #1 is mathematically simpler but has the drawback of requiring synchronous
1794  * global hweight_inuse updates when idle iocg's get activated or inuse weights
1795  * change due to donation snapbacks as it has the possibility of grossly
1796  * overshooting what's allowed by the model and vrate.
1797  *
1798  * #2 is inherently safe with local operations. The donating iocg can easily
1799  * snap back to higher weights when needed without worrying about impacts on
1800  * other nodes as the impacts will be inherently correct. This also makes idle
1801  * iocg activations safe. The only effect activations have is decreasing
1802  * hweight_inuse of others, the right solution to which is for those iocgs to
1803  * snap back to higher weights.
1804  *
1805  * So, we go with #2. The challenge is calculating how each donating iocg's
1806  * inuse should be adjusted to achieve the target donation amounts. This is done
1807  * using Andy's method described in the following pdf.
1808  *
1809  *   https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1810  *
1811  * Given the weights and target after-donation hweight_inuse values, Andy's
1812  * method determines how the proportional distribution should look like at each
1813  * sibling level to maintain the relative relationship between all non-donating
1814  * pairs. To roughly summarize, it divides the tree into donating and
1815  * non-donating parts, calculates global donation rate which is used to
1816  * determine the target hweight_inuse for each node, and then derives per-level
1817  * proportions.
1818  *
1819  * The following pdf shows that global distribution calculated this way can be
1820  * achieved by scaling inuse weights of donating leaves and propagating the
1821  * adjustments upwards proportionally.
1822  *
1823  *   https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1824  *
1825  * Combining the above two, we can determine how each leaf iocg's inuse should
1826  * be adjusted to achieve the target donation.
1827  *
1828  *   https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1829  *
1830  * The inline comments use symbols from the last pdf.
1831  *
1832  *   b is the sum of the absolute budgets in the subtree. 1 for the root node.
1833  *   f is the sum of the absolute budgets of non-donating nodes in the subtree.
1834  *   t is the sum of the absolute budgets of donating nodes in the subtree.
1835  *   w is the weight of the node. w = w_f + w_t
1836  *   w_f is the non-donating portion of w. w_f = w * f / b
1837  *   w_b is the donating portion of w. w_t = w * t / b
1838  *   s is the sum of all sibling weights. s = Sum(w) for siblings
1839  *   s_f and s_t are the non-donating and donating portions of s.
1840  *
1841  * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1842  * w_pt is the donating portion of the parent's weight and w'_pt the same value
1843  * after adjustments. Subscript r denotes the root node's values.
1844  */
1845 static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1846 {
1847         LIST_HEAD(over_hwa);
1848         LIST_HEAD(inner_walk);
1849         struct ioc_gq *iocg, *tiocg, *root_iocg;
1850         u32 after_sum, over_sum, over_target, gamma;
1851
1852         /*
1853          * It's pretty unlikely but possible for the total sum of
1854          * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1855          * confuse the following calculations. If such condition is detected,
1856          * scale down everyone over its full share equally to keep the sum below
1857          * WEIGHT_ONE.
1858          */
1859         after_sum = 0;
1860         over_sum = 0;
1861         list_for_each_entry(iocg, surpluses, surplus_list) {
1862                 u32 hwa;
1863
1864                 current_hweight(iocg, &hwa, NULL);
1865                 after_sum += iocg->hweight_after_donation;
1866
1867                 if (iocg->hweight_after_donation > hwa) {
1868                         over_sum += iocg->hweight_after_donation;
1869                         list_add(&iocg->walk_list, &over_hwa);
1870                 }
1871         }
1872
1873         if (after_sum >= WEIGHT_ONE) {
1874                 /*
1875                  * The delta should be deducted from the over_sum, calculate
1876                  * target over_sum value.
1877                  */
1878                 u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1879                 WARN_ON_ONCE(over_sum <= over_delta);
1880                 over_target = over_sum - over_delta;
1881         } else {
1882                 over_target = 0;
1883         }
1884
1885         list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1886                 if (over_target)
1887                         iocg->hweight_after_donation =
1888                                 div_u64((u64)iocg->hweight_after_donation *
1889                                         over_target, over_sum);
1890                 list_del_init(&iocg->walk_list);
1891         }
1892
1893         /*
1894          * Build pre-order inner node walk list and prepare for donation
1895          * adjustment calculations.
1896          */
1897         list_for_each_entry(iocg, surpluses, surplus_list) {
1898                 iocg_build_inner_walk(iocg, &inner_walk);
1899         }
1900
1901         root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1902         WARN_ON_ONCE(root_iocg->level > 0);
1903
1904         list_for_each_entry(iocg, &inner_walk, walk_list) {
1905                 iocg->child_adjusted_sum = 0;
1906                 iocg->hweight_donating = 0;
1907                 iocg->hweight_after_donation = 0;
1908         }
1909
1910         /*
1911          * Propagate the donating budget (b_t) and after donation budget (b'_t)
1912          * up the hierarchy.
1913          */
1914         list_for_each_entry(iocg, surpluses, surplus_list) {
1915                 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1916
1917                 parent->hweight_donating += iocg->hweight_donating;
1918                 parent->hweight_after_donation += iocg->hweight_after_donation;
1919         }
1920
1921         list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1922                 if (iocg->level > 0) {
1923                         struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1924
1925                         parent->hweight_donating += iocg->hweight_donating;
1926                         parent->hweight_after_donation += iocg->hweight_after_donation;
1927                 }
1928         }
1929
1930         /*
1931          * Calculate inner hwa's (b) and make sure the donation values are
1932          * within the accepted ranges as we're doing low res calculations with
1933          * roundups.
1934          */
1935         list_for_each_entry(iocg, &inner_walk, walk_list) {
1936                 if (iocg->level) {
1937                         struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1938
1939                         iocg->hweight_active = DIV64_U64_ROUND_UP(
1940                                 (u64)parent->hweight_active * iocg->active,
1941                                 parent->child_active_sum);
1942
1943                 }
1944
1945                 iocg->hweight_donating = min(iocg->hweight_donating,
1946                                              iocg->hweight_active);
1947                 iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1948                                                    iocg->hweight_donating - 1);
1949                 if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1950                                  iocg->hweight_donating <= 1 ||
1951                                  iocg->hweight_after_donation == 0)) {
1952                         pr_warn("iocg: invalid donation weights in ");
1953                         pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1954                         pr_cont(": active=%u donating=%u after=%u\n",
1955                                 iocg->hweight_active, iocg->hweight_donating,
1956                                 iocg->hweight_after_donation);
1957                 }
1958         }
1959
1960         /*
1961          * Calculate the global donation rate (gamma) - the rate to adjust
1962          * non-donating budgets by.
1963          *
1964          * No need to use 64bit multiplication here as the first operand is
1965          * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1966          *
1967          * We know that there are beneficiary nodes and the sum of the donating
1968          * hweights can't be whole; however, due to the round-ups during hweight
1969          * calculations, root_iocg->hweight_donating might still end up equal to
1970          * or greater than whole. Limit the range when calculating the divider.
1971          *
1972          * gamma = (1 - t_r') / (1 - t_r)
1973          */
1974         gamma = DIV_ROUND_UP(
1975                 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1976                 WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
1977
1978         /*
1979          * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1980          * nodes.
1981          */
1982         list_for_each_entry(iocg, &inner_walk, walk_list) {
1983                 struct ioc_gq *parent;
1984                 u32 inuse, wpt, wptp;
1985                 u64 st, sf;
1986
1987                 if (iocg->level == 0) {
1988                         /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1989                         iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1990                                 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1991                                 WEIGHT_ONE - iocg->hweight_after_donation);
1992                         continue;
1993                 }
1994
1995                 parent = iocg->ancestors[iocg->level - 1];
1996
1997                 /* b' = gamma * b_f + b_t' */
1998                 iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1999                         (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
2000                         WEIGHT_ONE) + iocg->hweight_after_donation;
2001
2002                 /* w' = s' * b' / b'_p */
2003                 inuse = DIV64_U64_ROUND_UP(
2004                         (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
2005                         parent->hweight_inuse);
2006
2007                 /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
2008                 st = DIV64_U64_ROUND_UP(
2009                         iocg->child_active_sum * iocg->hweight_donating,
2010                         iocg->hweight_active);
2011                 sf = iocg->child_active_sum - st;
2012                 wpt = DIV64_U64_ROUND_UP(
2013                         (u64)iocg->active * iocg->hweight_donating,
2014                         iocg->hweight_active);
2015                 wptp = DIV64_U64_ROUND_UP(
2016                         (u64)inuse * iocg->hweight_after_donation,
2017                         iocg->hweight_inuse);
2018
2019                 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
2020         }
2021
2022         /*
2023          * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
2024          * we can finally determine leaf adjustments.
2025          */
2026         list_for_each_entry(iocg, surpluses, surplus_list) {
2027                 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
2028                 u32 inuse;
2029
2030                 /*
2031                  * In-debt iocgs participated in the donation calculation with
2032                  * the minimum target hweight_inuse. Configuring inuse
2033                  * accordingly would work fine but debt handling expects
2034                  * @iocg->inuse stay at the minimum and we don't wanna
2035                  * interfere.
2036                  */
2037                 if (iocg->abs_vdebt) {
2038                         WARN_ON_ONCE(iocg->inuse > 1);
2039                         continue;
2040                 }
2041
2042                 /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
2043                 inuse = DIV64_U64_ROUND_UP(
2044                         parent->child_adjusted_sum * iocg->hweight_after_donation,
2045                         parent->hweight_inuse);
2046
2047                 TRACE_IOCG_PATH(inuse_transfer, iocg, now,
2048                                 iocg->inuse, inuse,
2049                                 iocg->hweight_inuse,
2050                                 iocg->hweight_after_donation);
2051
2052                 __propagate_weights(iocg, iocg->active, inuse, true, now);
2053         }
2054
2055         /* walk list should be dissolved after use */
2056         list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
2057                 list_del_init(&iocg->walk_list);
2058 }
2059
2060 /*
2061  * A low weight iocg can amass a large amount of debt, for example, when
2062  * anonymous memory gets reclaimed aggressively. If the system has a lot of
2063  * memory paired with a slow IO device, the debt can span multiple seconds or
2064  * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2065  * up blocked paying its debt while the IO device is idle.
2066  *
2067  * The following protects against such cases. If the device has been
2068  * sufficiently idle for a while, the debts are halved and delays are
2069  * recalculated.
2070  */
2071 static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
2072                               struct ioc_now *now)
2073 {
2074         struct ioc_gq *iocg;
2075         u64 dur, usage_pct, nr_cycles;
2076
2077         /* if no debtor, reset the cycle */
2078         if (!nr_debtors) {
2079                 ioc->dfgv_period_at = now->now;
2080                 ioc->dfgv_period_rem = 0;
2081                 ioc->dfgv_usage_us_sum = 0;
2082                 return;
2083         }
2084
2085         /*
2086          * Debtors can pass through a lot of writes choking the device and we
2087          * don't want to be forgiving debts while the device is struggling from
2088          * write bursts. If we're missing latency targets, consider the device
2089          * fully utilized.
2090          */
2091         if (ioc->busy_level > 0)
2092                 usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2093
2094         ioc->dfgv_usage_us_sum += usage_us_sum;
2095         if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2096                 return;
2097
2098         /*
2099          * At least DFGV_PERIOD has passed since the last period. Calculate the
2100          * average usage and reset the period counters.
2101          */
2102         dur = now->now - ioc->dfgv_period_at;
2103         usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2104
2105         ioc->dfgv_period_at = now->now;
2106         ioc->dfgv_usage_us_sum = 0;
2107
2108         /* if was too busy, reset everything */
2109         if (usage_pct > DFGV_USAGE_PCT) {
2110                 ioc->dfgv_period_rem = 0;
2111                 return;
2112         }
2113
2114         /*
2115          * Usage is lower than threshold. Let's forgive some debts. Debt
2116          * forgiveness runs off of the usual ioc timer but its period usually
2117          * doesn't match ioc's. Compensate the difference by performing the
2118          * reduction as many times as would fit in the duration since the last
2119          * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2120          * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
2121          * reductions is doubled.
2122          */
2123         nr_cycles = dur + ioc->dfgv_period_rem;
2124         ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
2125
2126         list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2127                 u64 __maybe_unused old_debt, __maybe_unused old_delay;
2128
2129                 if (!iocg->abs_vdebt && !iocg->delay)
2130                         continue;
2131
2132                 spin_lock(&iocg->waitq.lock);
2133
2134                 old_debt = iocg->abs_vdebt;
2135                 old_delay = iocg->delay;
2136
2137                 if (iocg->abs_vdebt)
2138                         iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2139                 if (iocg->delay)
2140                         iocg->delay = iocg->delay >> nr_cycles ?: 1;
2141
2142                 iocg_kick_waitq(iocg, true, now);
2143
2144                 TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2145                                 old_debt, iocg->abs_vdebt,
2146                                 old_delay, iocg->delay);
2147
2148                 spin_unlock(&iocg->waitq.lock);
2149         }
2150 }
2151
2152 /*
2153  * Check the active iocgs' state to avoid oversleeping and deactive
2154  * idle iocgs.
2155  *
2156  * Since waiters determine the sleep durations based on the vrate
2157  * they saw at the time of sleep, if vrate has increased, some
2158  * waiters could be sleeping for too long. Wake up tardy waiters
2159  * which should have woken up in the last period and expire idle
2160  * iocgs.
2161  */
2162 static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
2163 {
2164         int nr_debtors = 0;
2165         struct ioc_gq *iocg, *tiocg;
2166
2167         list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
2168                 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2169                     !iocg->delay && !iocg_is_idle(iocg))
2170                         continue;
2171
2172                 spin_lock(&iocg->waitq.lock);
2173
2174                 /* flush wait and indebt stat deltas */
2175                 if (iocg->wait_since) {
2176                         iocg->stat.wait_us += now->now - iocg->wait_since;
2177                         iocg->wait_since = now->now;
2178                 }
2179                 if (iocg->indebt_since) {
2180                         iocg->stat.indebt_us +=
2181                                 now->now - iocg->indebt_since;
2182                         iocg->indebt_since = now->now;
2183                 }
2184                 if (iocg->indelay_since) {
2185                         iocg->stat.indelay_us +=
2186                                 now->now - iocg->indelay_since;
2187                         iocg->indelay_since = now->now;
2188                 }
2189
2190                 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2191                     iocg->delay) {
2192                         /* might be oversleeping vtime / hweight changes, kick */
2193                         iocg_kick_waitq(iocg, true, now);
2194                         if (iocg->abs_vdebt || iocg->delay)
2195                                 nr_debtors++;
2196                 } else if (iocg_is_idle(iocg)) {
2197                         /* no waiter and idle, deactivate */
2198                         u64 vtime = atomic64_read(&iocg->vtime);
2199                         s64 excess;
2200
2201                         /*
2202                          * @iocg has been inactive for a full duration and will
2203                          * have a high budget. Account anything above target as
2204                          * error and throw away. On reactivation, it'll start
2205                          * with the target budget.
2206                          */
2207                         excess = now->vnow - vtime - ioc->margins.target;
2208                         if (excess > 0) {
2209                                 u32 old_hwi;
2210
2211                                 current_hweight(iocg, NULL, &old_hwi);
2212                                 ioc->vtime_err -= div64_u64(excess * old_hwi,
2213                                                             WEIGHT_ONE);
2214                         }
2215
2216                         TRACE_IOCG_PATH(iocg_idle, iocg, now,
2217                                         atomic64_read(&iocg->active_period),
2218                                         atomic64_read(&ioc->cur_period), vtime);
2219                         __propagate_weights(iocg, 0, 0, false, now);
2220                         list_del_init(&iocg->active_list);
2221                 }
2222
2223                 spin_unlock(&iocg->waitq.lock);
2224         }
2225
2226         commit_weights(ioc);
2227         return nr_debtors;
2228 }
2229
2230 static void ioc_timer_fn(struct timer_list *timer)
2231 {
2232         struct ioc *ioc = container_of(timer, struct ioc, timer);
2233         struct ioc_gq *iocg, *tiocg;
2234         struct ioc_now now;
2235         LIST_HEAD(surpluses);
2236         int nr_debtors, nr_shortages = 0, nr_lagging = 0;
2237         u64 usage_us_sum = 0;
2238         u32 ppm_rthr;
2239         u32 ppm_wthr;
2240         u32 missed_ppm[2], rq_wait_pct;
2241         u64 period_vtime;
2242         int prev_busy_level;
2243
2244         /* how were the latencies during the period? */
2245         ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2246
2247         /* take care of active iocgs */
2248         spin_lock_irq(&ioc->lock);
2249
2250         ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2251         ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2252         ioc_now(ioc, &now);
2253
2254         period_vtime = now.vnow - ioc->period_at_vtime;
2255         if (WARN_ON_ONCE(!period_vtime)) {
2256                 spin_unlock_irq(&ioc->lock);
2257                 return;
2258         }
2259
2260         nr_debtors = ioc_check_iocgs(ioc, &now);
2261
2262         /*
2263          * Wait and indebt stat are flushed above and the donation calculation
2264          * below needs updated usage stat. Let's bring stat up-to-date.
2265          */
2266         iocg_flush_stat(&ioc->active_iocgs, &now);
2267
2268         /* calc usage and see whether some weights need to be moved around */
2269         list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2270                 u64 vdone, vtime, usage_us;
2271                 u32 hw_active, hw_inuse;
2272
2273                 /*
2274                  * Collect unused and wind vtime closer to vnow to prevent
2275                  * iocgs from accumulating a large amount of budget.
2276                  */
2277                 vdone = atomic64_read(&iocg->done_vtime);
2278                 vtime = atomic64_read(&iocg->vtime);
2279                 current_hweight(iocg, &hw_active, &hw_inuse);
2280
2281                 /*
2282                  * Latency QoS detection doesn't account for IOs which are
2283                  * in-flight for longer than a period.  Detect them by
2284                  * comparing vdone against period start.  If lagging behind
2285                  * IOs from past periods, don't increase vrate.
2286                  */
2287                 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2288                     !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
2289                     time_after64(vtime, vdone) &&
2290                     time_after64(vtime, now.vnow -
2291                                  MAX_LAGGING_PERIODS * period_vtime) &&
2292                     time_before64(vdone, now.vnow - period_vtime))
2293                         nr_lagging++;
2294
2295                 /*
2296                  * Determine absolute usage factoring in in-flight IOs to avoid
2297                  * high-latency completions appearing as idle.
2298                  */
2299                 usage_us = iocg->usage_delta_us;
2300                 usage_us_sum += usage_us;
2301
2302                 /* see whether there's surplus vtime */
2303                 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2304                 if (hw_inuse < hw_active ||
2305                     (!waitqueue_active(&iocg->waitq) &&
2306                      time_before64(vtime, now.vnow - ioc->margins.low))) {
2307                         u32 hwa, old_hwi, hwm, new_hwi, usage;
2308                         u64 usage_dur;
2309
2310                         if (vdone != vtime) {
2311                                 u64 inflight_us = DIV64_U64_ROUND_UP(
2312                                         cost_to_abs_cost(vtime - vdone, hw_inuse),
2313                                         ioc->vtime_base_rate);
2314
2315                                 usage_us = max(usage_us, inflight_us);
2316                         }
2317
2318                         /* convert to hweight based usage ratio */
2319                         if (time_after64(iocg->activated_at, ioc->period_at))
2320                                 usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2321                         else
2322                                 usage_dur = max_t(u64, now.now - ioc->period_at, 1);
2323
2324                         usage = clamp_t(u32,
2325                                 DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2326                                                    usage_dur),
2327                                 1, WEIGHT_ONE);
2328
2329                         /*
2330                          * Already donating or accumulated enough to start.
2331                          * Determine the donation amount.
2332                          */
2333                         current_hweight(iocg, &hwa, &old_hwi);
2334                         hwm = current_hweight_max(iocg);
2335                         new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2336                                                          usage, &now);
2337                         /*
2338                          * Donation calculation assumes hweight_after_donation
2339                          * to be positive, a condition that a donor w/ hwa < 2
2340                          * can't meet. Don't bother with donation if hwa is
2341                          * below 2. It's not gonna make a meaningful difference
2342                          * anyway.
2343                          */
2344                         if (new_hwi < hwm && hwa >= 2) {
2345                                 iocg->hweight_donating = hwa;
2346                                 iocg->hweight_after_donation = new_hwi;
2347                                 list_add(&iocg->surplus_list, &surpluses);
2348                         } else if (!iocg->abs_vdebt) {
2349                                 /*
2350                                  * @iocg doesn't have enough to donate. Reset
2351                                  * its inuse to active.
2352                                  *
2353                                  * Don't reset debtors as their inuse's are
2354                                  * owned by debt handling. This shouldn't affect
2355                                  * donation calculuation in any meaningful way
2356                                  * as @iocg doesn't have a meaningful amount of
2357                                  * share anyway.
2358                                  */
2359                                 TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2360                                                 iocg->inuse, iocg->active,
2361                                                 iocg->hweight_inuse, new_hwi);
2362
2363                                 __propagate_weights(iocg, iocg->active,
2364                                                     iocg->active, true, &now);
2365                                 nr_shortages++;
2366                         }
2367                 } else {
2368                         /* genuinely short on vtime */
2369                         nr_shortages++;
2370                 }
2371         }
2372
2373         if (!list_empty(&surpluses) && nr_shortages)
2374                 transfer_surpluses(&surpluses, &now);
2375
2376         commit_weights(ioc);
2377
2378         /* surplus list should be dissolved after use */
2379         list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2380                 list_del_init(&iocg->surplus_list);
2381
2382         /*
2383          * If q is getting clogged or we're missing too much, we're issuing
2384          * too much IO and should lower vtime rate.  If we're not missing
2385          * and experiencing shortages but not surpluses, we're too stingy
2386          * and should increase vtime rate.
2387          */
2388         prev_busy_level = ioc->busy_level;
2389         if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2390             missed_ppm[READ] > ppm_rthr ||
2391             missed_ppm[WRITE] > ppm_wthr) {
2392                 /* clearly missing QoS targets, slow down vrate */
2393                 ioc->busy_level = max(ioc->busy_level, 0);
2394                 ioc->busy_level++;
2395         } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
2396                    missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2397                    missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
2398                 /* QoS targets are being met with >25% margin */
2399                 if (nr_shortages) {
2400                         /*
2401                          * We're throttling while the device has spare
2402                          * capacity.  If vrate was being slowed down, stop.
2403                          */
2404                         ioc->busy_level = min(ioc->busy_level, 0);
2405
2406                         /*
2407                          * If there are IOs spanning multiple periods, wait
2408                          * them out before pushing the device harder.
2409                          */
2410                         if (!nr_lagging)
2411                                 ioc->busy_level--;
2412                 } else {
2413                         /*
2414                          * Nobody is being throttled and the users aren't
2415                          * issuing enough IOs to saturate the device.  We
2416                          * simply don't know how close the device is to
2417                          * saturation.  Coast.
2418                          */
2419                         ioc->busy_level = 0;
2420                 }
2421         } else {
2422                 /* inside the hysterisis margin, we're good */
2423                 ioc->busy_level = 0;
2424         }
2425
2426         ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2427
2428         ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
2429                               prev_busy_level, missed_ppm);
2430
2431         ioc_refresh_params(ioc, false);
2432
2433         ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2434
2435         /*
2436          * This period is done.  Move onto the next one.  If nothing's
2437          * going on with the device, stop the timer.
2438          */
2439         atomic64_inc(&ioc->cur_period);
2440
2441         if (ioc->running != IOC_STOP) {
2442                 if (!list_empty(&ioc->active_iocgs)) {
2443                         ioc_start_period(ioc, &now);
2444                 } else {
2445                         ioc->busy_level = 0;
2446                         ioc->vtime_err = 0;
2447                         ioc->running = IOC_IDLE;
2448                 }
2449
2450                 ioc_refresh_vrate(ioc, &now);
2451         }
2452
2453         spin_unlock_irq(&ioc->lock);
2454 }
2455
2456 static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2457                                       u64 abs_cost, struct ioc_now *now)
2458 {
2459         struct ioc *ioc = iocg->ioc;
2460         struct ioc_margins *margins = &ioc->margins;
2461         u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
2462         u32 hwi, adj_step;
2463         s64 margin;
2464         u64 cost, new_inuse;
2465         unsigned long flags;
2466
2467         current_hweight(iocg, NULL, &hwi);
2468         old_hwi = hwi;
2469         cost = abs_cost_to_cost(abs_cost, hwi);
2470         margin = now->vnow - vtime - cost;
2471
2472         /* debt handling owns inuse for debtors */
2473         if (iocg->abs_vdebt)
2474                 return cost;
2475
2476         /*
2477          * We only increase inuse during period and do so if the margin has
2478          * deteriorated since the previous adjustment.
2479          */
2480         if (margin >= iocg->saved_margin || margin >= margins->low ||
2481             iocg->inuse == iocg->active)
2482                 return cost;
2483
2484         spin_lock_irqsave(&ioc->lock, flags);
2485
2486         /* we own inuse only when @iocg is in the normal active state */
2487         if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
2488                 spin_unlock_irqrestore(&ioc->lock, flags);
2489                 return cost;
2490         }
2491
2492         /*
2493          * Bump up inuse till @abs_cost fits in the existing budget.
2494          * adj_step must be determined after acquiring ioc->lock - we might
2495          * have raced and lost to another thread for activation and could
2496          * be reading 0 iocg->active before ioc->lock which will lead to
2497          * infinite loop.
2498          */
2499         new_inuse = iocg->inuse;
2500         adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
2501         do {
2502                 new_inuse = new_inuse + adj_step;
2503                 propagate_weights(iocg, iocg->active, new_inuse, true, now);
2504                 current_hweight(iocg, NULL, &hwi);
2505                 cost = abs_cost_to_cost(abs_cost, hwi);
2506         } while (time_after64(vtime + cost, now->vnow) &&
2507                  iocg->inuse != iocg->active);
2508
2509         spin_unlock_irqrestore(&ioc->lock, flags);
2510
2511         TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2512                         old_inuse, iocg->inuse, old_hwi, hwi);
2513
2514         return cost;
2515 }
2516
2517 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2518                                     bool is_merge, u64 *costp)
2519 {
2520         struct ioc *ioc = iocg->ioc;
2521         u64 coef_seqio, coef_randio, coef_page;
2522         u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2523         u64 seek_pages = 0;
2524         u64 cost = 0;
2525
2526         /* Can't calculate cost for empty bio */
2527         if (!bio->bi_iter.bi_size)
2528                 goto out;
2529
2530         switch (bio_op(bio)) {
2531         case REQ_OP_READ:
2532                 coef_seqio      = ioc->params.lcoefs[LCOEF_RSEQIO];
2533                 coef_randio     = ioc->params.lcoefs[LCOEF_RRANDIO];
2534                 coef_page       = ioc->params.lcoefs[LCOEF_RPAGE];
2535                 break;
2536         case REQ_OP_WRITE:
2537                 coef_seqio      = ioc->params.lcoefs[LCOEF_WSEQIO];
2538                 coef_randio     = ioc->params.lcoefs[LCOEF_WRANDIO];
2539                 coef_page       = ioc->params.lcoefs[LCOEF_WPAGE];
2540                 break;
2541         default:
2542                 goto out;
2543         }
2544
2545         if (iocg->cursor) {
2546                 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2547                 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2548         }
2549
2550         if (!is_merge) {
2551                 if (seek_pages > LCOEF_RANDIO_PAGES) {
2552                         cost += coef_randio;
2553                 } else {
2554                         cost += coef_seqio;
2555                 }
2556         }
2557         cost += pages * coef_page;
2558 out:
2559         *costp = cost;
2560 }
2561
2562 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2563 {
2564         u64 cost;
2565
2566         calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2567         return cost;
2568 }
2569
2570 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2571                                          u64 *costp)
2572 {
2573         unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2574
2575         switch (req_op(rq)) {
2576         case REQ_OP_READ:
2577                 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2578                 break;
2579         case REQ_OP_WRITE:
2580                 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2581                 break;
2582         default:
2583                 *costp = 0;
2584         }
2585 }
2586
2587 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2588 {
2589         u64 cost;
2590
2591         calc_size_vtime_cost_builtin(rq, ioc, &cost);
2592         return cost;
2593 }
2594
2595 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2596 {
2597         struct blkcg_gq *blkg = bio->bi_blkg;
2598         struct ioc *ioc = rqos_to_ioc(rqos);
2599         struct ioc_gq *iocg = blkg_to_iocg(blkg);
2600         struct ioc_now now;
2601         struct iocg_wait wait;
2602         u64 abs_cost, cost, vtime;
2603         bool use_debt, ioc_locked;
2604         unsigned long flags;
2605
2606         /* bypass IOs if disabled, still initializing, or for root cgroup */
2607         if (!ioc->enabled || !iocg || !iocg->level)
2608                 return;
2609
2610         /* calculate the absolute vtime cost */
2611         abs_cost = calc_vtime_cost(bio, iocg, false);
2612         if (!abs_cost)
2613                 return;
2614
2615         if (!iocg_activate(iocg, &now))
2616                 return;
2617
2618         iocg->cursor = bio_end_sector(bio);
2619         vtime = atomic64_read(&iocg->vtime);
2620         cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2621
2622         /*
2623          * If no one's waiting and within budget, issue right away.  The
2624          * tests are racy but the races aren't systemic - we only miss once
2625          * in a while which is fine.
2626          */
2627         if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2628             time_before_eq64(vtime + cost, now.vnow)) {
2629                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2630                 return;
2631         }
2632
2633         /*
2634          * We're over budget. This can be handled in two ways. IOs which may
2635          * cause priority inversions are punted to @ioc->aux_iocg and charged as
2636          * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2637          * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2638          * whether debt handling is needed and acquire locks accordingly.
2639          */
2640         use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2641         ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
2642 retry_lock:
2643         iocg_lock(iocg, ioc_locked, &flags);
2644
2645         /*
2646          * @iocg must stay activated for debt and waitq handling. Deactivation
2647          * is synchronized against both ioc->lock and waitq.lock and we won't
2648          * get deactivated as long as we're waiting or has debt, so we're good
2649          * if we're activated here. In the unlikely cases that we aren't, just
2650          * issue the IO.
2651          */
2652         if (unlikely(list_empty(&iocg->active_list))) {
2653                 iocg_unlock(iocg, ioc_locked, &flags);
2654                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2655                 return;
2656         }
2657
2658         /*
2659          * We're over budget. If @bio has to be issued regardless, remember
2660          * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2661          * off the debt before waking more IOs.
2662          *
2663          * This way, the debt is continuously paid off each period with the
2664          * actual budget available to the cgroup. If we just wound vtime, we
2665          * would incorrectly use the current hw_inuse for the entire amount
2666          * which, for example, can lead to the cgroup staying blocked for a
2667          * long time even with substantially raised hw_inuse.
2668          *
2669          * An iocg with vdebt should stay online so that the timer can keep
2670          * deducting its vdebt and [de]activate use_delay mechanism
2671          * accordingly. We don't want to race against the timer trying to
2672          * clear them and leave @iocg inactive w/ dangling use_delay heavily
2673          * penalizing the cgroup and its descendants.
2674          */
2675         if (use_debt) {
2676                 iocg_incur_debt(iocg, abs_cost, &now);
2677                 if (iocg_kick_delay(iocg, &now))
2678                         blkcg_schedule_throttle(rqos->disk,
2679                                         (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2680                 iocg_unlock(iocg, ioc_locked, &flags);
2681                 return;
2682         }
2683
2684         /* guarantee that iocgs w/ waiters have maximum inuse */
2685         if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
2686                 if (!ioc_locked) {
2687                         iocg_unlock(iocg, false, &flags);
2688                         ioc_locked = true;
2689                         goto retry_lock;
2690                 }
2691                 propagate_weights(iocg, iocg->active, iocg->active, true,
2692                                   &now);
2693         }
2694
2695         /*
2696          * Append self to the waitq and schedule the wakeup timer if we're
2697          * the first waiter.  The timer duration is calculated based on the
2698          * current vrate.  vtime and hweight changes can make it too short
2699          * or too long.  Each wait entry records the absolute cost it's
2700          * waiting for to allow re-evaluation using a custom wait entry.
2701          *
2702          * If too short, the timer simply reschedules itself.  If too long,
2703          * the period timer will notice and trigger wakeups.
2704          *
2705          * All waiters are on iocg->waitq and the wait states are
2706          * synchronized using waitq.lock.
2707          */
2708         init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2709         wait.wait.private = current;
2710         wait.bio = bio;
2711         wait.abs_cost = abs_cost;
2712         wait.committed = false; /* will be set true by waker */
2713
2714         __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2715         iocg_kick_waitq(iocg, ioc_locked, &now);
2716
2717         iocg_unlock(iocg, ioc_locked, &flags);
2718
2719         while (true) {
2720                 set_current_state(TASK_UNINTERRUPTIBLE);
2721                 if (wait.committed)
2722                         break;
2723                 io_schedule();
2724         }
2725
2726         /* waker already committed us, proceed */
2727         finish_wait(&iocg->waitq, &wait.wait);
2728 }
2729
2730 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2731                            struct bio *bio)
2732 {
2733         struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2734         struct ioc *ioc = rqos_to_ioc(rqos);
2735         sector_t bio_end = bio_end_sector(bio);
2736         struct ioc_now now;
2737         u64 vtime, abs_cost, cost;
2738         unsigned long flags;
2739
2740         /* bypass if disabled, still initializing, or for root cgroup */
2741         if (!ioc->enabled || !iocg || !iocg->level)
2742                 return;
2743
2744         abs_cost = calc_vtime_cost(bio, iocg, true);
2745         if (!abs_cost)
2746                 return;
2747
2748         ioc_now(ioc, &now);
2749
2750         vtime = atomic64_read(&iocg->vtime);
2751         cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2752
2753         /* update cursor if backmerging into the request at the cursor */
2754         if (blk_rq_pos(rq) < bio_end &&
2755             blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2756                 iocg->cursor = bio_end;
2757
2758         /*
2759          * Charge if there's enough vtime budget and the existing request has
2760          * cost assigned.
2761          */
2762         if (rq->bio && rq->bio->bi_iocost_cost &&
2763             time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
2764                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2765                 return;
2766         }
2767
2768         /*
2769          * Otherwise, account it as debt if @iocg is online, which it should
2770          * be for the vast majority of cases. See debt handling in
2771          * ioc_rqos_throttle() for details.
2772          */
2773         spin_lock_irqsave(&ioc->lock, flags);
2774         spin_lock(&iocg->waitq.lock);
2775
2776         if (likely(!list_empty(&iocg->active_list))) {
2777                 iocg_incur_debt(iocg, abs_cost, &now);
2778                 if (iocg_kick_delay(iocg, &now))
2779                         blkcg_schedule_throttle(rqos->disk,
2780                                         (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2781         } else {
2782                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2783         }
2784
2785         spin_unlock(&iocg->waitq.lock);
2786         spin_unlock_irqrestore(&ioc->lock, flags);
2787 }
2788
2789 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2790 {
2791         struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2792
2793         if (iocg && bio->bi_iocost_cost)
2794                 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2795 }
2796
2797 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2798 {
2799         struct ioc *ioc = rqos_to_ioc(rqos);
2800         struct ioc_pcpu_stat *ccs;
2801         u64 on_q_ns, rq_wait_ns, size_nsec;
2802         int pidx, rw;
2803
2804         if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2805                 return;
2806
2807         switch (req_op(rq)) {
2808         case REQ_OP_READ:
2809                 pidx = QOS_RLAT;
2810                 rw = READ;
2811                 break;
2812         case REQ_OP_WRITE:
2813                 pidx = QOS_WLAT;
2814                 rw = WRITE;
2815                 break;
2816         default:
2817                 return;
2818         }
2819
2820         on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
2821         rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2822         size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2823
2824         ccs = get_cpu_ptr(ioc->pcpu_stat);
2825
2826         if (on_q_ns <= size_nsec ||
2827             on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2828                 local_inc(&ccs->missed[rw].nr_met);
2829         else
2830                 local_inc(&ccs->missed[rw].nr_missed);
2831
2832         local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2833
2834         put_cpu_ptr(ccs);
2835 }
2836
2837 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2838 {
2839         struct ioc *ioc = rqos_to_ioc(rqos);
2840
2841         spin_lock_irq(&ioc->lock);
2842         ioc_refresh_params(ioc, false);
2843         spin_unlock_irq(&ioc->lock);
2844 }
2845
2846 static void ioc_rqos_exit(struct rq_qos *rqos)
2847 {
2848         struct ioc *ioc = rqos_to_ioc(rqos);
2849
2850         blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);
2851
2852         spin_lock_irq(&ioc->lock);
2853         ioc->running = IOC_STOP;
2854         spin_unlock_irq(&ioc->lock);
2855
2856         timer_shutdown_sync(&ioc->timer);
2857         free_percpu(ioc->pcpu_stat);
2858         kfree(ioc);
2859 }
2860
2861 static const struct rq_qos_ops ioc_rqos_ops = {
2862         .throttle = ioc_rqos_throttle,
2863         .merge = ioc_rqos_merge,
2864         .done_bio = ioc_rqos_done_bio,
2865         .done = ioc_rqos_done,
2866         .queue_depth_changed = ioc_rqos_queue_depth_changed,
2867         .exit = ioc_rqos_exit,
2868 };
2869
2870 static int blk_iocost_init(struct gendisk *disk)
2871 {
2872         struct ioc *ioc;
2873         int i, cpu, ret;
2874
2875         ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2876         if (!ioc)
2877                 return -ENOMEM;
2878
2879         ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2880         if (!ioc->pcpu_stat) {
2881                 kfree(ioc);
2882                 return -ENOMEM;
2883         }
2884
2885         for_each_possible_cpu(cpu) {
2886                 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2887
2888                 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2889                         local_set(&ccs->missed[i].nr_met, 0);
2890                         local_set(&ccs->missed[i].nr_missed, 0);
2891                 }
2892                 local64_set(&ccs->rq_wait_ns, 0);
2893         }
2894
2895         spin_lock_init(&ioc->lock);
2896         timer_setup(&ioc->timer, ioc_timer_fn, 0);
2897         INIT_LIST_HEAD(&ioc->active_iocgs);
2898
2899         ioc->running = IOC_IDLE;
2900         ioc->vtime_base_rate = VTIME_PER_USEC;
2901         atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2902         seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2903         ioc->period_at = ktime_to_us(blk_time_get());
2904         atomic64_set(&ioc->cur_period, 0);
2905         atomic_set(&ioc->hweight_gen, 0);
2906
2907         spin_lock_irq(&ioc->lock);
2908         ioc->autop_idx = AUTOP_INVALID;
2909         ioc_refresh_params_disk(ioc, true, disk);
2910         spin_unlock_irq(&ioc->lock);
2911
2912         /*
2913          * rqos must be added before activation to allow ioc_pd_init() to
2914          * lookup the ioc from q. This means that the rqos methods may get
2915          * called before policy activation completion, can't assume that the
2916          * target bio has an iocg associated and need to test for NULL iocg.
2917          */
2918         ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
2919         if (ret)
2920                 goto err_free_ioc;
2921
2922         ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
2923         if (ret)
2924                 goto err_del_qos;
2925         return 0;
2926
2927 err_del_qos:
2928         rq_qos_del(&ioc->rqos);
2929 err_free_ioc:
2930         free_percpu(ioc->pcpu_stat);
2931         kfree(ioc);
2932         return ret;
2933 }
2934
2935 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2936 {
2937         struct ioc_cgrp *iocc;
2938
2939         iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2940         if (!iocc)
2941                 return NULL;
2942
2943         iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2944         return &iocc->cpd;
2945 }
2946
2947 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2948 {
2949         kfree(container_of(cpd, struct ioc_cgrp, cpd));
2950 }
2951
2952 static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
2953                 struct blkcg *blkcg, gfp_t gfp)
2954 {
2955         int levels = blkcg->css.cgroup->level + 1;
2956         struct ioc_gq *iocg;
2957
2958         iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp,
2959                             disk->node_id);
2960         if (!iocg)
2961                 return NULL;
2962
2963         iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2964         if (!iocg->pcpu_stat) {
2965                 kfree(iocg);
2966                 return NULL;
2967         }
2968
2969         return &iocg->pd;
2970 }
2971
2972 static void ioc_pd_init(struct blkg_policy_data *pd)
2973 {
2974         struct ioc_gq *iocg = pd_to_iocg(pd);
2975         struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2976         struct ioc *ioc = q_to_ioc(blkg->q);
2977         struct ioc_now now;
2978         struct blkcg_gq *tblkg;
2979         unsigned long flags;
2980
2981         ioc_now(ioc, &now);
2982
2983         iocg->ioc = ioc;
2984         atomic64_set(&iocg->vtime, now.vnow);
2985         atomic64_set(&iocg->done_vtime, now.vnow);
2986         atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2987         INIT_LIST_HEAD(&iocg->active_list);
2988         INIT_LIST_HEAD(&iocg->walk_list);
2989         INIT_LIST_HEAD(&iocg->surplus_list);
2990         iocg->hweight_active = WEIGHT_ONE;
2991         iocg->hweight_inuse = WEIGHT_ONE;
2992
2993         init_waitqueue_head(&iocg->waitq);
2994         hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2995         iocg->waitq_timer.function = iocg_waitq_timer_fn;
2996
2997         iocg->level = blkg->blkcg->css.cgroup->level;
2998
2999         for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
3000                 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
3001                 iocg->ancestors[tiocg->level] = tiocg;
3002         }
3003
3004         spin_lock_irqsave(&ioc->lock, flags);
3005         weight_updated(iocg, &now);
3006         spin_unlock_irqrestore(&ioc->lock, flags);
3007 }
3008
3009 static void ioc_pd_free(struct blkg_policy_data *pd)
3010 {
3011         struct ioc_gq *iocg = pd_to_iocg(pd);
3012         struct ioc *ioc = iocg->ioc;
3013         unsigned long flags;
3014
3015         if (ioc) {
3016                 spin_lock_irqsave(&ioc->lock, flags);
3017
3018                 if (!list_empty(&iocg->active_list)) {
3019                         struct ioc_now now;
3020
3021                         ioc_now(ioc, &now);
3022                         propagate_weights(iocg, 0, 0, false, &now);
3023                         list_del_init(&iocg->active_list);
3024                 }
3025
3026                 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
3027                 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
3028
3029                 spin_unlock_irqrestore(&ioc->lock, flags);
3030
3031                 hrtimer_cancel(&iocg->waitq_timer);
3032         }
3033         free_percpu(iocg->pcpu_stat);
3034         kfree(iocg);
3035 }
3036
3037 static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
3038 {
3039         struct ioc_gq *iocg = pd_to_iocg(pd);
3040         struct ioc *ioc = iocg->ioc;
3041
3042         if (!ioc->enabled)
3043                 return;
3044
3045         if (iocg->level == 0) {
3046                 unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
3047                         ioc->vtime_base_rate * 10000,
3048                         VTIME_PER_USEC);
3049                 seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
3050         }
3051
3052         seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
3053
3054         if (blkcg_debug_stats)
3055                 seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
3056                         iocg->last_stat.wait_us,
3057                         iocg->last_stat.indebt_us,
3058                         iocg->last_stat.indelay_us);
3059 }
3060
3061 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3062                              int off)
3063 {
3064         const char *dname = blkg_dev_name(pd->blkg);
3065         struct ioc_gq *iocg = pd_to_iocg(pd);
3066
3067         if (dname && iocg->cfg_weight)
3068                 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
3069         return 0;
3070 }
3071
3072
3073 static int ioc_weight_show(struct seq_file *sf, void *v)
3074 {
3075         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3076         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3077
3078         seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
3079         blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
3080                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
3081         return 0;
3082 }
3083
3084 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3085                                 size_t nbytes, loff_t off)
3086 {
3087         struct blkcg *blkcg = css_to_blkcg(of_css(of));
3088         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3089         struct blkg_conf_ctx ctx;
3090         struct ioc_now now;
3091         struct ioc_gq *iocg;
3092         u32 v;
3093         int ret;
3094
3095         if (!strchr(buf, ':')) {
3096                 struct blkcg_gq *blkg;
3097
3098                 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3099                         return -EINVAL;
3100
3101                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3102                         return -EINVAL;
3103
3104                 spin_lock_irq(&blkcg->lock);
3105                 iocc->dfl_weight = v * WEIGHT_ONE;
3106                 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3107                         struct ioc_gq *iocg = blkg_to_iocg(blkg);
3108
3109                         if (iocg) {
3110                                 spin_lock(&iocg->ioc->lock);
3111                                 ioc_now(iocg->ioc, &now);
3112                                 weight_updated(iocg, &now);
3113                                 spin_unlock(&iocg->ioc->lock);
3114                         }
3115                 }
3116                 spin_unlock_irq(&blkcg->lock);
3117
3118                 return nbytes;
3119         }
3120
3121         blkg_conf_init(&ctx, buf);
3122
3123         ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
3124         if (ret)
3125                 goto err;
3126
3127         iocg = blkg_to_iocg(ctx.blkg);
3128
3129         if (!strncmp(ctx.body, "default", 7)) {
3130                 v = 0;
3131         } else {
3132                 if (!sscanf(ctx.body, "%u", &v))
3133                         goto einval;
3134                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3135                         goto einval;
3136         }
3137
3138         spin_lock(&iocg->ioc->lock);
3139         iocg->cfg_weight = v * WEIGHT_ONE;
3140         ioc_now(iocg->ioc, &now);
3141         weight_updated(iocg, &now);
3142         spin_unlock(&iocg->ioc->lock);
3143
3144         blkg_conf_exit(&ctx);
3145         return nbytes;
3146
3147 einval:
3148         ret = -EINVAL;
3149 err:
3150         blkg_conf_exit(&ctx);
3151         return ret;
3152 }
3153
3154 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3155                           int off)
3156 {
3157         const char *dname = blkg_dev_name(pd->blkg);
3158         struct ioc *ioc = pd_to_iocg(pd)->ioc;
3159
3160         if (!dname)
3161                 return 0;
3162
3163         spin_lock_irq(&ioc->lock);
3164         seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3165                    dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3166                    ioc->params.qos[QOS_RPPM] / 10000,
3167                    ioc->params.qos[QOS_RPPM] % 10000 / 100,
3168                    ioc->params.qos[QOS_RLAT],
3169                    ioc->params.qos[QOS_WPPM] / 10000,
3170                    ioc->params.qos[QOS_WPPM] % 10000 / 100,
3171                    ioc->params.qos[QOS_WLAT],
3172                    ioc->params.qos[QOS_MIN] / 10000,
3173                    ioc->params.qos[QOS_MIN] % 10000 / 100,
3174                    ioc->params.qos[QOS_MAX] / 10000,
3175                    ioc->params.qos[QOS_MAX] % 10000 / 100);
3176         spin_unlock_irq(&ioc->lock);
3177         return 0;
3178 }
3179
3180 static int ioc_qos_show(struct seq_file *sf, void *v)
3181 {
3182         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3183
3184         blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3185                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
3186         return 0;
3187 }
3188
3189 static const match_table_t qos_ctrl_tokens = {
3190         { QOS_ENABLE,           "enable=%u"     },
3191         { QOS_CTRL,             "ctrl=%s"       },
3192         { NR_QOS_CTRL_PARAMS,   NULL            },
3193 };
3194
3195 static const match_table_t qos_tokens = {
3196         { QOS_RPPM,             "rpct=%s"       },
3197         { QOS_RLAT,             "rlat=%u"       },
3198         { QOS_WPPM,             "wpct=%s"       },
3199         { QOS_WLAT,             "wlat=%u"       },
3200         { QOS_MIN,              "min=%s"        },
3201         { QOS_MAX,              "max=%s"        },
3202         { NR_QOS_PARAMS,        NULL            },
3203 };
3204
3205 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3206                              size_t nbytes, loff_t off)
3207 {
3208         struct blkg_conf_ctx ctx;
3209         struct gendisk *disk;
3210         struct ioc *ioc;
3211         u32 qos[NR_QOS_PARAMS];
3212         bool enable, user;
3213         char *body, *p;
3214         int ret;
3215
3216         blkg_conf_init(&ctx, input);
3217
3218         ret = blkg_conf_open_bdev(&ctx);
3219         if (ret)
3220                 goto err;
3221
3222         body = ctx.body;
3223         disk = ctx.bdev->bd_disk;
3224         if (!queue_is_mq(disk->queue)) {
3225                 ret = -EOPNOTSUPP;
3226                 goto err;
3227         }
3228
3229         ioc = q_to_ioc(disk->queue);
3230         if (!ioc) {
3231                 ret = blk_iocost_init(disk);
3232                 if (ret)
3233                         goto err;
3234                 ioc = q_to_ioc(disk->queue);
3235         }
3236
3237         blk_mq_freeze_queue(disk->queue);
3238         blk_mq_quiesce_queue(disk->queue);
3239
3240         spin_lock_irq(&ioc->lock);
3241         memcpy(qos, ioc->params.qos, sizeof(qos));
3242         enable = ioc->enabled;
3243         user = ioc->user_qos_params;
3244
3245         while ((p = strsep(&body, " \t\n"))) {
3246                 substring_t args[MAX_OPT_ARGS];
3247                 char buf[32];
3248                 int tok;
3249                 s64 v;
3250
3251                 if (!*p)
3252                         continue;
3253
3254                 switch (match_token(p, qos_ctrl_tokens, args)) {
3255                 case QOS_ENABLE:
3256                         if (match_u64(&args[0], &v))
3257                                 goto einval;
3258                         enable = v;
3259                         continue;
3260                 case QOS_CTRL:
3261                         match_strlcpy(buf, &args[0], sizeof(buf));
3262                         if (!strcmp(buf, "auto"))
3263                                 user = false;
3264                         else if (!strcmp(buf, "user"))
3265                                 user = true;
3266                         else
3267                                 goto einval;
3268                         continue;
3269                 }
3270
3271                 tok = match_token(p, qos_tokens, args);
3272                 switch (tok) {
3273                 case QOS_RPPM:
3274                 case QOS_WPPM:
3275                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3276                             sizeof(buf))
3277                                 goto einval;
3278                         if (cgroup_parse_float(buf, 2, &v))
3279                                 goto einval;
3280                         if (v < 0 || v > 10000)
3281                                 goto einval;
3282                         qos[tok] = v * 100;
3283                         break;
3284                 case QOS_RLAT:
3285                 case QOS_WLAT:
3286                         if (match_u64(&args[0], &v))
3287                                 goto einval;
3288                         qos[tok] = v;
3289                         break;
3290                 case QOS_MIN:
3291                 case QOS_MAX:
3292                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3293                             sizeof(buf))
3294                                 goto einval;
3295                         if (cgroup_parse_float(buf, 2, &v))
3296                                 goto einval;
3297                         if (v < 0)
3298                                 goto einval;
3299                         qos[tok] = clamp_t(s64, v * 100,
3300                                            VRATE_MIN_PPM, VRATE_MAX_PPM);
3301                         break;
3302                 default:
3303                         goto einval;
3304                 }
3305                 user = true;
3306         }
3307
3308         if (qos[QOS_MIN] > qos[QOS_MAX])
3309                 goto einval;
3310
3311         if (enable && !ioc->enabled) {
3312                 blk_stat_enable_accounting(disk->queue);
3313                 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
3314                 ioc->enabled = true;
3315         } else if (!enable && ioc->enabled) {
3316                 blk_stat_disable_accounting(disk->queue);
3317                 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
3318                 ioc->enabled = false;
3319         }
3320
3321         if (user) {
3322                 memcpy(ioc->params.qos, qos, sizeof(qos));
3323                 ioc->user_qos_params = true;
3324         } else {
3325                 ioc->user_qos_params = false;
3326         }
3327
3328         ioc_refresh_params(ioc, true);
3329         spin_unlock_irq(&ioc->lock);
3330
3331         if (enable)
3332                 wbt_disable_default(disk);
3333         else
3334                 wbt_enable_default(disk);
3335
3336         blk_mq_unquiesce_queue(disk->queue);
3337         blk_mq_unfreeze_queue(disk->queue);
3338
3339         blkg_conf_exit(&ctx);
3340         return nbytes;
3341 einval:
3342         spin_unlock_irq(&ioc->lock);
3343
3344         blk_mq_unquiesce_queue(disk->queue);
3345         blk_mq_unfreeze_queue(disk->queue);
3346
3347         ret = -EINVAL;
3348 err:
3349         blkg_conf_exit(&ctx);
3350         return ret;
3351 }
3352
3353 static u64 ioc_cost_model_prfill(struct seq_file *sf,
3354                                  struct blkg_policy_data *pd, int off)
3355 {
3356         const char *dname = blkg_dev_name(pd->blkg);
3357         struct ioc *ioc = pd_to_iocg(pd)->ioc;
3358         u64 *u = ioc->params.i_lcoefs;
3359
3360         if (!dname)
3361                 return 0;
3362
3363         spin_lock_irq(&ioc->lock);
3364         seq_printf(sf, "%s ctrl=%s model=linear "
3365                    "rbps=%llu rseqiops=%llu rrandiops=%llu "
3366                    "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3367                    dname, ioc->user_cost_model ? "user" : "auto",
3368                    u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3369                    u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
3370         spin_unlock_irq(&ioc->lock);
3371         return 0;
3372 }
3373
3374 static int ioc_cost_model_show(struct seq_file *sf, void *v)
3375 {
3376         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3377
3378         blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3379                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
3380         return 0;
3381 }
3382
3383 static const match_table_t cost_ctrl_tokens = {
3384         { COST_CTRL,            "ctrl=%s"       },
3385         { COST_MODEL,           "model=%s"      },
3386         { NR_COST_CTRL_PARAMS,  NULL            },
3387 };
3388
3389 static const match_table_t i_lcoef_tokens = {
3390         { I_LCOEF_RBPS,         "rbps=%u"       },
3391         { I_LCOEF_RSEQIOPS,     "rseqiops=%u"   },
3392         { I_LCOEF_RRANDIOPS,    "rrandiops=%u"  },
3393         { I_LCOEF_WBPS,         "wbps=%u"       },
3394         { I_LCOEF_WSEQIOPS,     "wseqiops=%u"   },
3395         { I_LCOEF_WRANDIOPS,    "wrandiops=%u"  },
3396         { NR_I_LCOEFS,          NULL            },
3397 };
3398
3399 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3400                                     size_t nbytes, loff_t off)
3401 {
3402         struct blkg_conf_ctx ctx;
3403         struct request_queue *q;
3404         struct ioc *ioc;
3405         u64 u[NR_I_LCOEFS];
3406         bool user;
3407         char *body, *p;
3408         int ret;
3409
3410         blkg_conf_init(&ctx, input);
3411
3412         ret = blkg_conf_open_bdev(&ctx);
3413         if (ret)
3414                 goto err;
3415
3416         body = ctx.body;
3417         q = bdev_get_queue(ctx.bdev);
3418         if (!queue_is_mq(q)) {
3419                 ret = -EOPNOTSUPP;
3420                 goto err;
3421         }
3422
3423         ioc = q_to_ioc(q);
3424         if (!ioc) {
3425                 ret = blk_iocost_init(ctx.bdev->bd_disk);
3426                 if (ret)
3427                         goto err;
3428                 ioc = q_to_ioc(q);
3429         }
3430
3431         blk_mq_freeze_queue(q);
3432         blk_mq_quiesce_queue(q);
3433
3434         spin_lock_irq(&ioc->lock);
3435         memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3436         user = ioc->user_cost_model;
3437
3438         while ((p = strsep(&body, " \t\n"))) {
3439                 substring_t args[MAX_OPT_ARGS];
3440                 char buf[32];
3441                 int tok;
3442                 u64 v;
3443
3444                 if (!*p)
3445                         continue;
3446
3447                 switch (match_token(p, cost_ctrl_tokens, args)) {
3448                 case COST_CTRL:
3449                         match_strlcpy(buf, &args[0], sizeof(buf));
3450                         if (!strcmp(buf, "auto"))
3451                                 user = false;
3452                         else if (!strcmp(buf, "user"))
3453                                 user = true;
3454                         else
3455                                 goto einval;
3456                         continue;
3457                 case COST_MODEL:
3458                         match_strlcpy(buf, &args[0], sizeof(buf));
3459                         if (strcmp(buf, "linear"))
3460                                 goto einval;
3461                         continue;
3462                 }
3463
3464                 tok = match_token(p, i_lcoef_tokens, args);
3465                 if (tok == NR_I_LCOEFS)
3466                         goto einval;
3467                 if (match_u64(&args[0], &v))
3468                         goto einval;
3469                 u[tok] = v;
3470                 user = true;
3471         }
3472
3473         if (user) {
3474                 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3475                 ioc->user_cost_model = true;
3476         } else {
3477                 ioc->user_cost_model = false;
3478         }
3479         ioc_refresh_params(ioc, true);
3480         spin_unlock_irq(&ioc->lock);
3481
3482         blk_mq_unquiesce_queue(q);
3483         blk_mq_unfreeze_queue(q);
3484
3485         blkg_conf_exit(&ctx);
3486         return nbytes;
3487
3488 einval:
3489         spin_unlock_irq(&ioc->lock);
3490
3491         blk_mq_unquiesce_queue(q);
3492         blk_mq_unfreeze_queue(q);
3493
3494         ret = -EINVAL;
3495 err:
3496         blkg_conf_exit(&ctx);
3497         return ret;
3498 }
3499
3500 static struct cftype ioc_files[] = {
3501         {
3502                 .name = "weight",
3503                 .flags = CFTYPE_NOT_ON_ROOT,
3504                 .seq_show = ioc_weight_show,
3505                 .write = ioc_weight_write,
3506         },
3507         {
3508                 .name = "cost.qos",
3509                 .flags = CFTYPE_ONLY_ON_ROOT,
3510                 .seq_show = ioc_qos_show,
3511                 .write = ioc_qos_write,
3512         },
3513         {
3514                 .name = "cost.model",
3515                 .flags = CFTYPE_ONLY_ON_ROOT,
3516                 .seq_show = ioc_cost_model_show,
3517                 .write = ioc_cost_model_write,
3518         },
3519         {}
3520 };
3521
3522 static struct blkcg_policy blkcg_policy_iocost = {
3523         .dfl_cftypes    = ioc_files,
3524         .cpd_alloc_fn   = ioc_cpd_alloc,
3525         .cpd_free_fn    = ioc_cpd_free,
3526         .pd_alloc_fn    = ioc_pd_alloc,
3527         .pd_init_fn     = ioc_pd_init,
3528         .pd_free_fn     = ioc_pd_free,
3529         .pd_stat_fn     = ioc_pd_stat,
3530 };
3531
3532 static int __init ioc_init(void)
3533 {
3534         return blkcg_policy_register(&blkcg_policy_iocost);
3535 }
3536
3537 static void __exit ioc_exit(void)
3538 {
3539         blkcg_policy_unregister(&blkcg_policy_iocost);
3540 }
3541
3542 module_init(ioc_init);
3543 module_exit(ioc_exit);