block/blk-iocost.c
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of a trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of crutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * The IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide variety of devices well enough.  Default
42  * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
45  *
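    * As a rough sketch (the precise calculation is done by the cost
    * computation further down in this file), the linear model charges
    * something like
    *
    *   cost ~= per_page_coef * nr_4k_pages + {seq|rand}_base_coef
    *
    * per IO, with sequential vs random decided by how far the IO starts
    * from where the previous one ended.
    *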
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
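    * A model override written to io.cost.model takes a line along the
    * lines of the following - see Documentation/admin-guide/cgroup-v2.rst
    * for the authoritative format:
    *
    *   8:16 ctrl=user model=linear rbps=488636629 rseqiops=8932
    *        rrandiops=8518 wbps=427891549 wseqiops=28755 wrandiops=21940
    *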
49  * 2. Control Strategy
50  *
51  * The device virtual time (vtime) is used as the primary control metric.
52  * The control strategy is composed of the following three parts.
53  *
54  * 2-1. Vtime Distribution
55  *
56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
57  * calculated.  Please consider the following hierarchy where the numbers
58  * inside parentheses denote the configured weights.
59  *
60  *           root
61  *         /       \
62  *      A (w:100)  B (w:300)
63  *      /       \
64  *  A0 (w:100)  A1 (w:100)
65  *
66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
68  * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
69  * 12.5% each.  The distribution mechanism only cares about these flattened
70  * shares.  They're called hweights (hierarchical weights) and always add
71  * up to 1 (HWEIGHT_WHOLE).
72  *
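    * Concretely, a leaf's hweight is the product of its share at each
    * level.  With everyone above active, hweight(A0) =
    * 100/(100+300) * 100/(100+100) = 25% * 50% = 12.5%, i.e.
    * HWEIGHT_WHOLE / 8.
    *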
73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75  * against the device vtime - an IO which takes 10ms on the underlying
76  * device is considered to take 80ms on A0.
77  *
78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
80  * the vtime consumed by past IOs and can issue a new IO iff doing so
81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
82  * suspended until the vtime has progressed enough to cover it.
83  *
84  * 2-2. Vrate Adjustment
85  *
86  * It's unrealistic to expect the cost model to be perfect.  There are too
87  * many devices and even on the same device the overall performance
88  * fluctuates depending on numerous factors such as IO mixture and device
89  * internal garbage collection.  The controller needs to adapt dynamically.
90  *
91  * This is achieved by adjusting the overall IO rate according to how busy
92  * the device is.  If the device becomes overloaded, we're sending down too
93  * many IOs and should generally slow down.  If there are waiting issuers
94  * but the device isn't saturated, we're issuing too few and should
95  * generally speed up.
96  *
97  * To slow down, we lower the vrate - the rate at which the device vtime
98  * passes compared to the wall clock.  For example, if the vtime is running
99  * at the vrate of 75%, all cgroups added up would only be able to issue
100  * 750ms worth of IOs per second, and vice-versa for speeding up.
101  *
102  * Device busyness is determined using two criteria - rq wait and
103  * completion latencies.
104  *
105  * When a device gets saturated, the on-device and then the request queues
106  * fill up and a bio which is ready to be issued has to wait for a request
107  * to become available.  When this delay becomes noticeable, it's a clear
108  * indication that the device is saturated and we lower the vrate.  This
109  * saturation signal is fairly conservative as it only triggers when both
110  * hardware and software queues are filled up, and is used as the default
111  * busy signal.
112  *
113  * As devices can have deep queues and be unfair in how the queued commands
114  * are executed, solely depending on rq wait may not result in satisfactory
115  * control quality.  For a better control quality, completion latency QoS
116  * parameters can be configured so that the device is considered saturated
117  * if N'th percentile completion latency rises above the set point.
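     * For example, with a read QoS target of 10ms at the 95th percentile,
     * the device is considered saturated whenever more than 5% of the
     * reads completed in a period take longer than 10ms.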
118  *
119  * The completion latency requirements are a function of both the
120  * underlying device characteristics and the desired IO latency quality of
121  * service.  There is an inherent trade-off - the tighter the latency QoS,
122  * the higher the bandwidth loss.  Latency QoS is disabled by default
123  * and can be set through /sys/fs/cgroup/io.cost.qos.
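     * A QoS line written to it may look something like the following -
     * again, see Documentation/admin-guide/cgroup-v2.rst for the
     * authoritative format:
     *
     *   8:16 enable=1 ctrl=auto rpct=95.00 rlat=75000 wpct=95.00
     *        wlat=150000 min=50.00 max=150.00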
124  *
125  * 2-3. Work Conservation
126  *
127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
128  * periodically while B is sending out enough parallel IOs to saturate the
129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
130  * cost per second, i.e., 10% of the device capacity.  The naive
131  * distribution of half and half would lead to 60% utilization of the
132  * device, a significant reduction in the total amount of work done
133  * compared to free-for-all competition.  This is too high a cost to pay
134  * for IO control.
135  *
136  * To conserve the total amount of work done, we keep track of how much
137  * each active cgroup is actually using and yield part of its weight if
138  * there are other cgroups which can make use of it.  In the above case,
139  * A's weight will be lowered so that it hovers above the actual usage and
140  * B would be able to use the rest.
141  *
142  * As we don't want to penalize a cgroup for donating its weight, the
143  * surplus weight adjustment factors in a margin and has an immediate
144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
145  *
146  * Note that adjusting down surplus weights has the same effects as
147  * accelerating vtime for other cgroups and work conservation can also be
148  * implemented by adjusting vrate dynamically.  However, working out who
149  * can donate how much and who should take it back requires hweight
150  * propagation anyway, which makes it easier to implement and understand
151  * as a separate mechanism.
152  *
153  * 3. Monitoring
154  *
155  * Instead of debugfs or other clumsy monitoring mechanisms, this
156  * controller uses a drgn based monitoring script -
157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
158  * https://github.com/osandov/drgn.  The output looks like the following.
159  *
160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161  *                 active      weight      hweight% inflt% dbt  delay usages%
162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
164  *
165  * - per        : Timer period
166  * - cur_per    : Internal wall and device vtime clock
167  * - vrate      : Device virtual time rate against wall clock
168  * - weight     : Surplus-adjusted and configured weights
169  * - hweight    : Surplus-adjusted and configured hierarchical weights
170  * - inflt      : The percentage of in-flight IO cost at the end of last period
171  * - delay      : Deferred issuer delay induction level and duration
172  * - usages     : Usage history
173  */
174
175 #include <linux/kernel.h>
176 #include <linux/module.h>
177 #include <linux/timer.h>
178 #include <linux/time64.h>
179 #include <linux/parser.h>
180 #include <linux/sched/signal.h>
181 #include <linux/blk-cgroup.h>
182 #include "blk-rq-qos.h"
183 #include "blk-stat.h"
184 #include "blk-wbt.h"
185
186 #ifdef CONFIG_TRACEPOINTS
187
188 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
189 #define TRACE_IOCG_PATH_LEN 1024
190 static DEFINE_SPINLOCK(trace_iocg_path_lock);
191 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
192
193 #define TRACE_IOCG_PATH(type, iocg, ...)                                        \
194         do {                                                                    \
195                 unsigned long flags;                                            \
196                 if (trace_iocost_##type##_enabled()) {                          \
197                         spin_lock_irqsave(&trace_iocg_path_lock, flags);        \
198                         cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,      \
199                                     trace_iocg_path, TRACE_IOCG_PATH_LEN);      \
200                         trace_iocost_##type(iocg, trace_iocg_path,              \
201                                               ##__VA_ARGS__);                   \
202                         spin_unlock_irqrestore(&trace_iocg_path_lock, flags);   \
203                 }                                                               \
204         } while (0)
205
206 #else   /* CONFIG_TRACEPOINTS */
207 #define TRACE_IOCG_PATH(type, iocg, ...)        do { } while (0)
208 #endif  /* CONFIG_TRACEPOINTS */
209
210 enum {
211         MILLION                 = 1000000,
212
213         /* timer period is calculated from latency requirements, bound it */
214         MIN_PERIOD              = USEC_PER_MSEC,
215         MAX_PERIOD              = USEC_PER_SEC,
216
217         /*
218          * A cgroup's vtime can run 50% behind the device vtime, which
219          * serves as its IO credit buffer.  Surplus weight adjustment is
220          * immediately canceled if the vtime margin runs below 10%.
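             *
             * e.g. with a 10ms period, an active cgroup can bank up to 5ms
             * worth of vtime as budget, and a donated weight snaps back
             * once less than 1ms worth of that margin remains.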
221          */
222         MARGIN_PCT              = 50,
223         INUSE_MARGIN_PCT        = 10,
224
225         /* Have some play in waitq timer operations */
226         WAITQ_TIMER_MARGIN_PCT  = 5,
227
228         /*
229          * vtime can wrap well within a reasonable uptime when vrate is
230          * consistently raised.  Don't trust recorded cgroup vtime if the
231          * period counter indicates that it's older than 5mins.
232          */
233         VTIME_VALID_DUR         = 300 * USEC_PER_SEC,
234
235         /*
236          * Remember the past three non-zero usages and use the max for
237          * surplus calculation.  Three slots guarantee that we remember one
238          * full period usage from the last active stretch even after
239          * partial deactivation and re-activation periods.  Don't start
240          * giving away weight before collecting two data points to prevent
241          * hweight adjustments based on one partial activation period.
242          */
243         NR_USAGE_SLOTS          = 3,
244         MIN_VALID_USAGES        = 2,
245
246         /* 1/64k is granular enough and can easily be handled w/ u32 */
247         HWEIGHT_WHOLE           = 1 << 16,
248
249         /*
250          * As vtime is used to calculate the cost of each IO, it needs to
251          * be fairly high precision.  For example, it should be able to
252          * represent the cost of a single page worth of discard with
253          * sufficient accuracy.  At the same time, it should be able to
254          * represent reasonably long enough durations to be useful and
255          * convenient during operation.
256          *
257          * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
258          * granularity and days of wrap-around time even at extreme vrates.
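             *
             * (One vtime unit is 2^-37s, about 7.3 picoseconds, and 2^64
             * vtime units span 2^27 seconds - on the order of years at
             * 100% vrate and still weeks at the 10000% maximum.)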
259          */
260         VTIME_PER_SEC_SHIFT     = 37,
261         VTIME_PER_SEC           = 1LLU << VTIME_PER_SEC_SHIFT,
262         VTIME_PER_USEC          = VTIME_PER_SEC / USEC_PER_SEC,
263         VTIME_PER_NSEC          = VTIME_PER_SEC / NSEC_PER_SEC,
264
265         /* bound vrate adjustments within two orders of magnitude */
266         VRATE_MIN_PPM           = 10000,        /* 1% */
267         VRATE_MAX_PPM           = 100000000,    /* 10000% */
268
269         VRATE_MIN               = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
270         VRATE_CLAMP_ADJ_PCT     = 4,
271
272         /* if IOs end up waiting for requests, issue less */
273         RQ_WAIT_BUSY_PCT        = 5,
274
275         /* unbusy hysteresis */
276         UNBUSY_THR_PCT          = 75,
277
278         /* don't let cmds which take a very long time pin lagging for too long */
279         MAX_LAGGING_PERIODS     = 10,
280
281         /*
282          * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
283          * donate the surplus.
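             *
             * e.g. a cgroup using 40% of the device while its
             * hweight_inuse is 60%: 40% * 1.25 + 2% = 52%, which is more
             * than 3% below 60%, so its inuse weight is lowered toward
             * 52%.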
284          */
285         SURPLUS_SCALE_PCT       = 125,                  /* * 125% */
286         SURPLUS_SCALE_ABS       = HWEIGHT_WHOLE / 50,   /* + 2% */
287         SURPLUS_MIN_ADJ_DELTA   = HWEIGHT_WHOLE / 33,   /* 3% */
288
289         /* switch iff the conditions are met for longer than this */
290         AUTOP_CYCLE_NSEC        = 10LLU * NSEC_PER_SEC,
291
292         /*
293          * Count IO size in 4k pages.  The 12-bit shift helps keep the
294          * size-proportional components of the cost calculation within a
295          * similar number of digits as the per-IO cost components.
296          */
297         IOC_PAGE_SHIFT          = 12,
298         IOC_PAGE_SIZE           = 1 << IOC_PAGE_SHIFT,
299         IOC_SECT_TO_PAGE_SHIFT  = IOC_PAGE_SHIFT - SECTOR_SHIFT,
300
301         /* if further apart than 16M, consider randio for the linear model */
302         LCOEF_RANDIO_PAGES      = 4096,
303 };
304
305 enum ioc_running {
306         IOC_IDLE,
307         IOC_RUNNING,
308         IOC_STOP,
309 };
310
311 /* io.cost.qos controls including per-dev enable of the whole controller */
312 enum {
313         QOS_ENABLE,
314         QOS_CTRL,
315         NR_QOS_CTRL_PARAMS,
316 };
317
318 /* io.cost.qos params */
319 enum {
320         QOS_RPPM,
321         QOS_RLAT,
322         QOS_WPPM,
323         QOS_WLAT,
324         QOS_MIN,
325         QOS_MAX,
326         NR_QOS_PARAMS,
327 };
328
329 /* io.cost.model controls */
330 enum {
331         COST_CTRL,
332         COST_MODEL,
333         NR_COST_CTRL_PARAMS,
334 };
335
336 /* builtin linear cost model coefficients */
337 enum {
338         I_LCOEF_RBPS,
339         I_LCOEF_RSEQIOPS,
340         I_LCOEF_RRANDIOPS,
341         I_LCOEF_WBPS,
342         I_LCOEF_WSEQIOPS,
343         I_LCOEF_WRANDIOPS,
344         NR_I_LCOEFS,
345 };
346
347 enum {
348         LCOEF_RPAGE,
349         LCOEF_RSEQIO,
350         LCOEF_RRANDIO,
351         LCOEF_WPAGE,
352         LCOEF_WSEQIO,
353         LCOEF_WRANDIO,
354         NR_LCOEFS,
355 };
356
357 enum {
358         AUTOP_INVALID,
359         AUTOP_HDD,
360         AUTOP_SSD_QD1,
361         AUTOP_SSD_DFL,
362         AUTOP_SSD_FAST,
363 };
364
365 struct ioc_gq;
366
367 struct ioc_params {
368         u32                             qos[NR_QOS_PARAMS];
369         u64                             i_lcoefs[NR_I_LCOEFS];
370         u64                             lcoefs[NR_LCOEFS];
371         u32                             too_fast_vrate_pct;
372         u32                             too_slow_vrate_pct;
373 };
374
375 struct ioc_missed {
376         u32                             nr_met;
377         u32                             nr_missed;
378         u32                             last_met;
379         u32                             last_missed;
380 };
381
382 struct ioc_pcpu_stat {
383         struct ioc_missed               missed[2];
384
385         u64                             rq_wait_ns;
386         u64                             last_rq_wait_ns;
387 };
388
389 /* per device */
390 struct ioc {
391         struct rq_qos                   rqos;
392
393         bool                            enabled;
394
395         struct ioc_params               params;
396         u32                             period_us;
397         u32                             margin_us;
398         u64                             vrate_min;
399         u64                             vrate_max;
400
401         spinlock_t                      lock;
402         struct timer_list               timer;
403         struct list_head                active_iocgs;   /* active cgroups */
404         struct ioc_pcpu_stat __percpu   *pcpu_stat;
405
406         enum ioc_running                running;
407         atomic64_t                      vtime_rate;
408
409         seqcount_t                      period_seqcount;
410         u32                             period_at;      /* wallclock starttime */
411         u64                             period_at_vtime; /* vtime starttime */
412
413         atomic64_t                      cur_period;     /* inc'd each period */
414         int                             busy_level;     /* saturation history */
415
416         u64                             inuse_margin_vtime;
417         bool                            weights_updated;
418         atomic_t                        hweight_gen;    /* for lazy hweights */
419
420         u64                             autop_too_fast_at;
421         u64                             autop_too_slow_at;
422         int                             autop_idx;
423         bool                            user_qos_params:1;
424         bool                            user_cost_model:1;
425 };
426
427 /* per device-cgroup pair */
428 struct ioc_gq {
429         struct blkg_policy_data         pd;
430         struct ioc                      *ioc;
431
432         /*
433          * An iocg can get its weight from two sources - an explicit
434          * per-device-cgroup configuration or the default weight of the
435          * cgroup.  `cfg_weight` is the explicit per-device-cgroup
436          * configuration.  `weight` is the effective weight considering both
437          * sources.
438          *
439          * When an idle cgroup becomes active its `active` goes from 0 to
440          * `weight`.  `inuse` is the surplus adjusted active weight.
441          * `active` and `inuse` are used to calculate `hweight_active` and
442          * `hweight_inuse`.
443          *
444          * `last_inuse` remembers `inuse` while an iocg is idle to persist
445          * surplus adjustments.
446          */
447         u32                             cfg_weight;
448         u32                             weight;
449         u32                             active;
450         u32                             inuse;
451         u32                             last_inuse;
452
453         sector_t                        cursor;         /* to detect randio */
454
455         /*
456          * `vtime` is this iocg's vtime cursor which progresses as IOs are
457          * issued.  If lagging behind device vtime, the delta represents
458          * the currently available IO budget.  If running ahead, the
459          * overage.
460          *
461          * `vtime_done` is the same but progressed on completion rather
462          * than issue.  The delta behind `vtime` represents the cost of
463          * currently in-flight IOs.
464          *
465          * `last_vtime` is used to remember `vtime` at the end of the last
466          * period to calculate utilization.
467          */
468         atomic64_t                      vtime;
469         atomic64_t                      done_vtime;
470         u64                             abs_vdebt;
471         u64                             last_vtime;
472
473         /*
474          * The period this iocg was last active in.  Used for deactivation
475          * and invalidating `vtime`.
476          */
477         atomic64_t                      active_period;
478         struct list_head                active_list;
479
480         /* see __propagate_active_weight() and current_hweight() for details */
481         u64                             child_active_sum;
482         u64                             child_inuse_sum;
483         int                             hweight_gen;
484         u32                             hweight_active;
485         u32                             hweight_inuse;
486         bool                            has_surplus;
487
488         struct wait_queue_head          waitq;
489         struct hrtimer                  waitq_timer;
490         struct hrtimer                  delay_timer;
491
492         /* usage is recorded as fractions of HWEIGHT_WHOLE */
493         int                             usage_idx;
494         u32                             usages[NR_USAGE_SLOTS];
495
496         /* this iocg's depth in the hierarchy and ancestors including self */
497         int                             level;
498         struct ioc_gq                   *ancestors[];
499 };
500
501 /* per cgroup */
502 struct ioc_cgrp {
503         struct blkcg_policy_data        cpd;
504         unsigned int                    dfl_weight;
505 };
506
507 struct ioc_now {
508         u64                             now_ns;
509         u32                             now;
510         u64                             vnow;
511         u64                             vrate;
512 };
513
514 struct iocg_wait {
515         struct wait_queue_entry         wait;
516         struct bio                      *bio;
517         u64                             abs_cost;
518         bool                            committed;
519 };
520
521 struct iocg_wake_ctx {
522         struct ioc_gq                   *iocg;
523         u32                             hw_inuse;
524         s64                             vbudget;
525 };
526
527 static const struct ioc_params autop[] = {
528         [AUTOP_HDD] = {
529                 .qos                            = {
530                         [QOS_RLAT]              =        250000, /* 250ms */
531                         [QOS_WLAT]              =        250000,
532                         [QOS_MIN]               = VRATE_MIN_PPM,
533                         [QOS_MAX]               = VRATE_MAX_PPM,
534                 },
535                 .i_lcoefs                       = {
536                         [I_LCOEF_RBPS]          =     174019176,
537                         [I_LCOEF_RSEQIOPS]      =         41708,
538                         [I_LCOEF_RRANDIOPS]     =           370,
539                         [I_LCOEF_WBPS]          =     178075866,
540                         [I_LCOEF_WSEQIOPS]      =         42705,
541                         [I_LCOEF_WRANDIOPS]     =           378,
542                 },
543         },
544         [AUTOP_SSD_QD1] = {
545                 .qos                            = {
546                         [QOS_RLAT]              =         25000, /* 25ms */
547                         [QOS_WLAT]              =         25000,
548                         [QOS_MIN]               = VRATE_MIN_PPM,
549                         [QOS_MAX]               = VRATE_MAX_PPM,
550                 },
551                 .i_lcoefs                       = {
552                         [I_LCOEF_RBPS]          =     245855193,
553                         [I_LCOEF_RSEQIOPS]      =         61575,
554                         [I_LCOEF_RRANDIOPS]     =          6946,
555                         [I_LCOEF_WBPS]          =     141365009,
556                         [I_LCOEF_WSEQIOPS]      =         33716,
557                         [I_LCOEF_WRANDIOPS]     =         26796,
558                 },
559         },
560         [AUTOP_SSD_DFL] = {
561                 .qos                            = {
562                         [QOS_RLAT]              =         25000, /* 25ms */
563                         [QOS_WLAT]              =         25000,
564                         [QOS_MIN]               = VRATE_MIN_PPM,
565                         [QOS_MAX]               = VRATE_MAX_PPM,
566                 },
567                 .i_lcoefs                       = {
568                         [I_LCOEF_RBPS]          =     488636629,
569                         [I_LCOEF_RSEQIOPS]      =          8932,
570                         [I_LCOEF_RRANDIOPS]     =          8518,
571                         [I_LCOEF_WBPS]          =     427891549,
572                         [I_LCOEF_WSEQIOPS]      =         28755,
573                         [I_LCOEF_WRANDIOPS]     =         21940,
574                 },
575                 .too_fast_vrate_pct             =           500,
576         },
577         [AUTOP_SSD_FAST] = {
578                 .qos                            = {
579                         [QOS_RLAT]              =          5000, /* 5ms */
580                         [QOS_WLAT]              =          5000,
581                         [QOS_MIN]               = VRATE_MIN_PPM,
582                         [QOS_MAX]               = VRATE_MAX_PPM,
583                 },
584                 .i_lcoefs                       = {
585                         [I_LCOEF_RBPS]          =    3102524156LLU,
586                         [I_LCOEF_RSEQIOPS]      =        724816,
587                         [I_LCOEF_RRANDIOPS]     =        778122,
588                         [I_LCOEF_WBPS]          =    1742780862LLU,
589                         [I_LCOEF_WSEQIOPS]      =        425702,
590                         [I_LCOEF_WRANDIOPS]     =        443193,
591                 },
592                 .too_slow_vrate_pct             =            10,
593         },
594 };
595
596 /*
597  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
598  * vtime credit shortage and down on device saturation.
599  */
600 static u32 vrate_adj_pct[] =
601         { 0, 0, 0, 0,
602           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
603           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
604           4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
605
606 static struct blkcg_policy blkcg_policy_iocost;
607
608 /* accessors and helpers */
609 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
610 {
611         return container_of(rqos, struct ioc, rqos);
612 }
613
614 static struct ioc *q_to_ioc(struct request_queue *q)
615 {
616         return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
617 }
618
619 static const char *q_name(struct request_queue *q)
620 {
621         if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
622                 return kobject_name(q->kobj.parent);
623         else
624                 return "<unknown>";
625 }
626
627 static const char __maybe_unused *ioc_name(struct ioc *ioc)
628 {
629         return q_name(ioc->rqos.q);
630 }
631
632 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
633 {
634         return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
635 }
636
637 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
638 {
639         return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
640 }
641
642 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
643 {
644         return pd_to_blkg(&iocg->pd);
645 }
646
647 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
648 {
649         return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
650                             struct ioc_cgrp, cpd);
651 }
652
653 /*
654  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
655  * weight, the more expensive each IO.  Must round up.
656  */
657 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
658 {
659         return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
660 }
661
662 /*
663  * The inverse of abs_cost_to_cost().  Must round up.
664  */
665 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
666 {
667         return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
668 }
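
    /*
     * For illustration: with hw_inuse at 12.5% (HWEIGHT_WHOLE / 8),
     * abs_cost_to_cost() inflates an absolute cost worth 10ms of device
     * time into 80ms worth of cgroup vtime, and cost_to_abs_cost() maps it
     * back, modulo the round-ups.
     */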
669
670 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
671 {
672         bio->bi_iocost_cost = cost;
673         atomic64_add(cost, &iocg->vtime);
674 }
675
676 #define CREATE_TRACE_POINTS
677 #include <trace/events/iocost.h>
678
679 /* latency QoS params changed, update period_us and all the dependent params */
680 static void ioc_refresh_period_us(struct ioc *ioc)
681 {
682         u32 ppm, lat, multi, period_us;
683
684         lockdep_assert_held(&ioc->lock);
685
686         /* pick the higher latency target */
687         if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
688                 ppm = ioc->params.qos[QOS_RPPM];
689                 lat = ioc->params.qos[QOS_RLAT];
690         } else {
691                 ppm = ioc->params.qos[QOS_WPPM];
692                 lat = ioc->params.qos[QOS_WLAT];
693         }
694
695         /*
696          * We want the period to be long enough to contain a healthy number
697          * of IOs while short enough for granular control.  Define it as a
698          * multiple of the latency target.  Ideally, the multiplier should
699          * be scaled according to the percentile so that it would nominally
700          * contain a certain number of requests.  Let's be simpler and
701          * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
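             * For example, a 90th percentile target (ppm = 900000) gives
             * multi = max((1000000 - 900000) / 50000, 2) = 2, while a 50th
             * percentile target gives multi = 10.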
702          */
703         if (ppm)
704                 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
705         else
706                 multi = 2;
707         period_us = multi * lat;
708         period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
709
710         /* calculate dependent params */
711         ioc->period_us = period_us;
712         ioc->margin_us = period_us * MARGIN_PCT / 100;
713         ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
714                         period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
715 }
716
717 static int ioc_autop_idx(struct ioc *ioc)
718 {
719         int idx = ioc->autop_idx;
720         const struct ioc_params *p = &autop[idx];
721         u32 vrate_pct;
722         u64 now_ns;
723
724         /* rotational? */
725         if (!blk_queue_nonrot(ioc->rqos.q))
726                 return AUTOP_HDD;
727
728         /* handle SATA SSDs w/ broken NCQ */
729         if (blk_queue_depth(ioc->rqos.q) == 1)
730                 return AUTOP_SSD_QD1;
731
732         /* use one of the normal ssd sets */
733         if (idx < AUTOP_SSD_DFL)
734                 return AUTOP_SSD_DFL;
735
736         /* if user is overriding anything, maintain what was there */
737         if (ioc->user_qos_params || ioc->user_cost_model)
738                 return idx;
739
740         /* step up/down based on the vrate */
741         vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
742                               VTIME_PER_USEC);
743         now_ns = ktime_get_ns();
744
745         if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
746                 if (!ioc->autop_too_fast_at)
747                         ioc->autop_too_fast_at = now_ns;
748                 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
749                         return idx + 1;
750         } else {
751                 ioc->autop_too_fast_at = 0;
752         }
753
754         if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
755                 if (!ioc->autop_too_slow_at)
756                         ioc->autop_too_slow_at = now_ns;
757                 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
758                         return idx - 1;
759         } else {
760                 ioc->autop_too_slow_at = 0;
761         }
762
763         return idx;
764 }
765
766 /*
767  * Take the following as input
768  *
769  *  @bps        maximum sequential throughput
770  *  @seqiops    maximum sequential 4k iops
771  *  @randiops   maximum random 4k iops
772  *
773  * and calculate the linear model cost coefficients.
774  *
775  *  *@page      per-page cost           1s / (@bps / 4096)
776  *  *@seqio     base cost of a seq IO   max((1s / @seqiops) - *@page, 0)
777  *  *@randio    base cost of a rand IO  max((1s / @randiops) - *@page, 0)
778  */
779 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
780                         u64 *page, u64 *seqio, u64 *randio)
781 {
782         u64 v;
783
784         *page = *seqio = *randio = 0;
785
786         if (bps)
787                 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
788                                            DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
789
790         if (seqiops) {
791                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
792                 if (v > *page)
793                         *seqio = v - *page;
794         }
795
796         if (randiops) {
797                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
798                 if (v > *page)
799                         *randio = v - *page;
800         }
801 }
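
    /*
     * For example, feeding calc_lcoefs() the AUTOP_SSD_DFL read numbers
     * above (rbps ~489MB/s) gives a per-page read cost of roughly
     * VTIME_PER_SEC / (488636629 / 4096) ~= 1.15e6 vtime, i.e. ~8.4us of
     * device time per 4k page at 100% vrate.
     */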
802
803 static void ioc_refresh_lcoefs(struct ioc *ioc)
804 {
805         u64 *u = ioc->params.i_lcoefs;
806         u64 *c = ioc->params.lcoefs;
807
808         calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
809                     &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
810         calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
811                     &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
812 }
813
814 static bool ioc_refresh_params(struct ioc *ioc, bool force)
815 {
816         const struct ioc_params *p;
817         int idx;
818
819         lockdep_assert_held(&ioc->lock);
820
821         idx = ioc_autop_idx(ioc);
822         p = &autop[idx];
823
824         if (idx == ioc->autop_idx && !force)
825                 return false;
826
827         if (idx != ioc->autop_idx)
828                 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
829
830         ioc->autop_idx = idx;
831         ioc->autop_too_fast_at = 0;
832         ioc->autop_too_slow_at = 0;
833
834         if (!ioc->user_qos_params)
835                 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
836         if (!ioc->user_cost_model)
837                 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
838
839         ioc_refresh_period_us(ioc);
840         ioc_refresh_lcoefs(ioc);
841
842         ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
843                                             VTIME_PER_USEC, MILLION);
844         ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
845                                    VTIME_PER_USEC, MILLION);
846
847         return true;
848 }
849
850 /* take a snapshot of the current [v]time and vrate */
851 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
852 {
853         unsigned seq;
854
855         now->now_ns = ktime_get();
856         now->now = ktime_to_us(now->now_ns);
857         now->vrate = atomic64_read(&ioc->vtime_rate);
858
859         /*
860          * The current vtime is
861          *
862          *   vtime at period start + (wallclock time since the start) * vrate
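             *
             * e.g. 500us into a period at 100% vrate (vrate ==
             * VTIME_PER_USEC), vnow has advanced by 500 * VTIME_PER_USEC
             * beyond period_at_vtime.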
863          *
864          * As a consistent snapshot of `period_at_vtime` and `period_at` is
865          * needed, they're seqcount protected.
866          */
867         do {
868                 seq = read_seqcount_begin(&ioc->period_seqcount);
869                 now->vnow = ioc->period_at_vtime +
870                         (now->now - ioc->period_at) * now->vrate;
871         } while (read_seqcount_retry(&ioc->period_seqcount, seq));
872 }
873
874 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
875 {
876         lockdep_assert_held(&ioc->lock);
877         WARN_ON_ONCE(ioc->running != IOC_RUNNING);
878
879         write_seqcount_begin(&ioc->period_seqcount);
880         ioc->period_at = now->now;
881         ioc->period_at_vtime = now->vnow;
882         write_seqcount_end(&ioc->period_seqcount);
883
884         ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
885         add_timer(&ioc->timer);
886 }
887
888 /*
889  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
890  * weight sums and propagate upwards accordingly.
891  */
892 static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
893 {
894         struct ioc *ioc = iocg->ioc;
895         int lvl;
896
897         lockdep_assert_held(&ioc->lock);
898
899         inuse = min(active, inuse);
900
901         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
902                 struct ioc_gq *parent = iocg->ancestors[lvl];
903                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
904                 u32 parent_active = 0, parent_inuse = 0;
905
906                 /* update the level sums */
907                 parent->child_active_sum += (s32)(active - child->active);
908                 parent->child_inuse_sum += (s32)(inuse - child->inuse);
909                 /* apply the updates */
910                 child->active = active;
911                 child->inuse = inuse;
912
913                 /*
914                  * The delta between the inuse and active sums indicates how
915                  * much of the weight is being given away.  Parent's inuse
916                  * and active should reflect the ratio.
917                  */
918                 if (parent->child_active_sum) {
919                         parent_active = parent->weight;
920                         parent_inuse = DIV64_U64_ROUND_UP(
921                                 parent_active * parent->child_inuse_sum,
922                                 parent->child_active_sum);
923                 }
924
925                 /* do we need to keep walking up? */
926                 if (parent_active == parent->active &&
927                     parent_inuse == parent->inuse)
928                         break;
929
930                 active = parent_active;
931                 inuse = parent_inuse;
932         }
933
934         ioc->weights_updated = true;
935 }
936
937 static void commit_active_weights(struct ioc *ioc)
938 {
939         lockdep_assert_held(&ioc->lock);
940
941         if (ioc->weights_updated) {
942                 /* paired with rmb in current_hweight(), see there */
943                 smp_wmb();
944                 atomic_inc(&ioc->hweight_gen);
945                 ioc->weights_updated = false;
946         }
947 }
948
949 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
950 {
951         __propagate_active_weight(iocg, active, inuse);
952         commit_active_weights(iocg->ioc);
953 }
954
955 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
956 {
957         struct ioc *ioc = iocg->ioc;
958         int lvl;
959         u32 hwa, hwi;
960         int ioc_gen;
961
962         /* hot path - if uptodate, use cached */
963         ioc_gen = atomic_read(&ioc->hweight_gen);
964         if (ioc_gen == iocg->hweight_gen)
965                 goto out;
966
967         /*
968          * Paired with wmb in commit_active_weights().  If we saw the
969          * updated hweight_gen, all the weight updates from
970          * __propagate_active_weight() are visible too.
971          *
972          * We can race with weight updates during calculation and get it
973          * wrong.  However, hweight_gen would have changed and a future
974          * reader will recalculate and we're guaranteed to discard the
975          * wrong result soon.
976          */
977         smp_rmb();
978
979         hwa = hwi = HWEIGHT_WHOLE;
980         for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
981                 struct ioc_gq *parent = iocg->ancestors[lvl];
982                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
983                 u32 active_sum = READ_ONCE(parent->child_active_sum);
984                 u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
985                 u32 active = READ_ONCE(child->active);
986                 u32 inuse = READ_ONCE(child->inuse);
987
988                 /* we can race with deactivations and either may read as zero */
989                 if (!active_sum || !inuse_sum)
990                         continue;
991
992                 active_sum = max(active, active_sum);
993                 hwa = hwa * active / active_sum;        /* max 16bits * 10000 */
994
995                 inuse_sum = max(inuse, inuse_sum);
996                 hwi = hwi * inuse / inuse_sum;          /* max 16bits * 10000 */
997         }
998
999         iocg->hweight_active = max_t(u32, hwa, 1);
1000         iocg->hweight_inuse = max_t(u32, hwi, 1);
1001         iocg->hweight_gen = ioc_gen;
1002 out:
1003         if (hw_activep)
1004                 *hw_activep = iocg->hweight_active;
1005         if (hw_inusep)
1006                 *hw_inusep = iocg->hweight_inuse;
1007 }
1008
1009 static void weight_updated(struct ioc_gq *iocg)
1010 {
1011         struct ioc *ioc = iocg->ioc;
1012         struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1013         struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1014         u32 weight;
1015
1016         lockdep_assert_held(&ioc->lock);
1017
1018         weight = iocg->cfg_weight ?: iocc->dfl_weight;
1019         if (weight != iocg->weight && iocg->active)
1020                 propagate_active_weight(iocg, weight,
1021                         DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1022         iocg->weight = weight;
1023 }
1024
1025 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1026 {
1027         struct ioc *ioc = iocg->ioc;
1028         u64 last_period, cur_period, max_period_delta;
1029         u64 vtime, vmargin, vmin;
1030         int i;
1031
1032         /*
1033          * If we already seem to be active, just update the stamp to tell the
1034          * timer that we're still active.  We don't mind occasional races.
1035          */
1036         if (!list_empty(&iocg->active_list)) {
1037                 ioc_now(ioc, now);
1038                 cur_period = atomic64_read(&ioc->cur_period);
1039                 if (atomic64_read(&iocg->active_period) != cur_period)
1040                         atomic64_set(&iocg->active_period, cur_period);
1041                 return true;
1042         }
1043
1044         /* racy check on internal node IOs, treat as root level IOs */
1045         if (iocg->child_active_sum)
1046                 return false;
1047
1048         spin_lock_irq(&ioc->lock);
1049
1050         ioc_now(ioc, now);
1051
1052         /* update period */
1053         cur_period = atomic64_read(&ioc->cur_period);
1054         last_period = atomic64_read(&iocg->active_period);
1055         atomic64_set(&iocg->active_period, cur_period);
1056
1057         /* already activated or breaking leaf-only constraint? */
1058         if (!list_empty(&iocg->active_list))
1059                 goto succeed_unlock;
1060         for (i = iocg->level - 1; i > 0; i--)
1061                 if (!list_empty(&iocg->ancestors[i]->active_list))
1062                         goto fail_unlock;
1063
1064         if (iocg->child_active_sum)
1065                 goto fail_unlock;
1066
1067         /*
1068          * vtime may wrap when vrate is raised substantially due to
1069          * underestimated IO costs.  Look at the period and ignore its
1070          * vtime if the iocg has been idle for too long.  Also, cap the
1071          * budget it can start with to the margin.
1072          */
1073         max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1074         vtime = atomic64_read(&iocg->vtime);
1075         vmargin = ioc->margin_us * now->vrate;
1076         vmin = now->vnow - vmargin;
1077
1078         if (last_period + max_period_delta < cur_period ||
1079             time_before64(vtime, vmin)) {
1080                 atomic64_add(vmin - vtime, &iocg->vtime);
1081                 atomic64_add(vmin - vtime, &iocg->done_vtime);
1082                 vtime = vmin;
1083         }
1084
1085         /*
1086          * Activate, propagate weight and start period timer if not
1087          * running.  Reset hweight_gen to avoid accidental match from
1088          * wrapping.
1089          */
1090         iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1091         list_add(&iocg->active_list, &ioc->active_iocgs);
1092         propagate_active_weight(iocg, iocg->weight,
1093                                 iocg->last_inuse ?: iocg->weight);
1094
1095         TRACE_IOCG_PATH(iocg_activate, iocg, now,
1096                         last_period, cur_period, vtime);
1097
1098         iocg->last_vtime = vtime;
1099
1100         if (ioc->running == IOC_IDLE) {
1101                 ioc->running = IOC_RUNNING;
1102                 ioc_start_period(ioc, now);
1103         }
1104
1105 succeed_unlock:
1106         spin_unlock_irq(&ioc->lock);
1107         return true;
1108
1109 fail_unlock:
1110         spin_unlock_irq(&ioc->lock);
1111         return false;
1112 }
1113
1114 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1115                         int flags, void *key)
1116 {
1117         struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1118         struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1119         u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1120
1121         ctx->vbudget -= cost;
1122
1123         if (ctx->vbudget < 0)
1124                 return -1;
1125
1126         iocg_commit_bio(ctx->iocg, wait->bio, cost);
1127
1128         /*
1129          * autoremove_wake_function() removes the wait entry only when it
1130          * actually changed the task state.  We want the wait always
1131          * removed.  Remove explicitly and use default_wake_function().
1132          */
1133         list_del_init(&wq_entry->entry);
1134         wait->committed = true;
1135
1136         default_wake_function(wq_entry, mode, flags, key);
1137         return 0;
1138 }
1139
1140 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1141 {
1142         struct ioc *ioc = iocg->ioc;
1143         struct iocg_wake_ctx ctx = { .iocg = iocg };
1144         u64 margin_ns = (u64)(ioc->period_us *
1145                               WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1146         u64 vdebt, vshortage, expires, oexpires;
1147         s64 vbudget;
1148         u32 hw_inuse;
1149
1150         lockdep_assert_held(&iocg->waitq.lock);
1151
1152         current_hweight(iocg, NULL, &hw_inuse);
1153         vbudget = now->vnow - atomic64_read(&iocg->vtime);
1154
1155         /* pay off debt */
1156         vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1157         if (vdebt && vbudget > 0) {
1158                 u64 delta = min_t(u64, vbudget, vdebt);
1159                 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1160                                     iocg->abs_vdebt);
1161
1162                 atomic64_add(delta, &iocg->vtime);
1163                 atomic64_add(delta, &iocg->done_vtime);
1164                 iocg->abs_vdebt -= abs_delta;
1165         }
1166
1167         /*
1168          * Wake up the ones which are due and see how much vtime we'll need
1169          * for the next one.
1170          */
1171         ctx.hw_inuse = hw_inuse;
1172         ctx.vbudget = vbudget - vdebt;
1173         __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1174         if (!waitqueue_active(&iocg->waitq))
1175                 return;
1176         if (WARN_ON_ONCE(ctx.vbudget >= 0))
1177                 return;
1178
1179         /* determine next wakeup, add a quarter margin to guarantee chunking */
1180         vshortage = -ctx.vbudget;
1181         expires = now->now_ns +
1182                 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1183         expires += margin_ns / 4;
1184
1185         /* if already active and close enough, don't bother */
1186         oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1187         if (hrtimer_is_queued(&iocg->waitq_timer) &&
1188             abs(oexpires - expires) <= margin_ns / 4)
1189                 return;
1190
1191         hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1192                                margin_ns / 4, HRTIMER_MODE_ABS);
1193 }
1194
1195 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1196 {
1197         struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1198         struct ioc_now now;
1199         unsigned long flags;
1200
1201         ioc_now(iocg->ioc, &now);
1202
1203         spin_lock_irqsave(&iocg->waitq.lock, flags);
1204         iocg_kick_waitq(iocg, &now);
1205         spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1206
1207         return HRTIMER_NORESTART;
1208 }
1209
1210 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1211 {
1212         struct ioc *ioc = iocg->ioc;
1213         struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1214         u64 vtime = atomic64_read(&iocg->vtime);
1215         u64 vmargin = ioc->margin_us * now->vrate;
1216         u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1217         u64 delta_ns, expires, oexpires;
1218         u32 hw_inuse;
1219
1220         lockdep_assert_held(&iocg->waitq.lock);
1221
1222         /* debt-adjust vtime */
1223         current_hweight(iocg, NULL, &hw_inuse);
1224         vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1225
1226         /*
1227          * Clear or maintain depending on the overage. Non-zero vdebt is what
1228          * guarantees that @iocg is online and future iocg_kick_delay() will
1229          * clear use_delay. Don't leave it on when there's no vdebt.
1230          */
1231         if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1232                 blkcg_clear_delay(blkg);
1233                 return false;
1234         }
1235         if (!atomic_read(&blkg->use_delay) &&
1236             time_before_eq64(vtime, now->vnow + vmargin))
1237                 return false;
1238
1239         /* use delay */
1240         delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1241                                       now->vrate) * NSEC_PER_USEC;
1242         blkcg_set_delay(blkg, delta_ns);
1243         expires = now->now_ns + delta_ns;
1244
1245         /* if already active and close enough, don't bother */
1246         oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1247         if (hrtimer_is_queued(&iocg->delay_timer) &&
1248             abs(oexpires - expires) <= margin_ns / 4)
1249                 return true;
1250
1251         hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1252                                margin_ns / 4, HRTIMER_MODE_ABS);
1253         return true;
1254 }
1255
1256 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1257 {
1258         struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1259         struct ioc_now now;
1260         unsigned long flags;
1261
1262         spin_lock_irqsave(&iocg->waitq.lock, flags);
1263         ioc_now(iocg->ioc, &now);
1264         iocg_kick_delay(iocg, &now);
1265         spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1266
1267         return HRTIMER_NORESTART;
1268 }
1269
1270 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1271 {
1272         u32 nr_met[2] = { };
1273         u32 nr_missed[2] = { };
1274         u64 rq_wait_ns = 0;
1275         int cpu, rw;
1276
1277         for_each_online_cpu(cpu) {
1278                 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1279                 u64 this_rq_wait_ns;
1280
1281                 for (rw = READ; rw <= WRITE; rw++) {
1282                         u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1283                         u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1284
1285                         nr_met[rw] += this_met - stat->missed[rw].last_met;
1286                         nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1287                         stat->missed[rw].last_met = this_met;
1288                         stat->missed[rw].last_missed = this_missed;
1289                 }
1290
1291                 this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1292                 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1293                 stat->last_rq_wait_ns = this_rq_wait_ns;
1294         }
1295
1296         for (rw = READ; rw <= WRITE; rw++) {
1297                 if (nr_met[rw] + nr_missed[rw])
1298                         missed_ppm_ar[rw] =
1299                                 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1300                                                    nr_met[rw] + nr_missed[rw]);
1301                 else
1302                         missed_ppm_ar[rw] = 0;
1303         }
1304
1305         *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1306                                    ioc->period_us * NSEC_PER_USEC);
1307 }
1308
1309 /* was iocg idle this period? */
1310 static bool iocg_is_idle(struct ioc_gq *iocg)
1311 {
1312         struct ioc *ioc = iocg->ioc;
1313
1314         /* did something get issued this period? */
1315         if (atomic64_read(&iocg->active_period) ==
1316             atomic64_read(&ioc->cur_period))
1317                 return false;
1318
1319         /* is something in flight? */
1320         if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1321                 return false;
1322
1323         return true;
1324 }
1325
1326 /* returns usage with margin added if surplus is large enough */
1327 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1328 {
1329         /* add margin */
1330         usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1331         usage += SURPLUS_SCALE_ABS;
1332
1333         /* don't bother if the surplus is too small */
1334         if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1335                 return 0;
1336
1337         return usage;
1338 }
1339
1340 static void ioc_timer_fn(struct timer_list *timer)
1341 {
1342         struct ioc *ioc = container_of(timer, struct ioc, timer);
1343         struct ioc_gq *iocg, *tiocg;
1344         struct ioc_now now;
1345         int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1346         u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1347         u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1348         u32 missed_ppm[2], rq_wait_pct;
1349         u64 period_vtime;
1350         int prev_busy_level, i;
1351
1352         /* how were the latencies during the period? */
1353         ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1354
1355         /* take care of active iocgs */
1356         spin_lock_irq(&ioc->lock);
1357
1358         ioc_now(ioc, &now);
1359
1360         period_vtime = now.vnow - ioc->period_at_vtime;
1361         if (WARN_ON_ONCE(!period_vtime)) {
1362                 spin_unlock_irq(&ioc->lock);
1363                 return;
1364         }
1365
1366         /*
1367          * Waiters determine the sleep durations based on the vrate they
1368          * saw at the time of sleep.  If vrate has increased, some waiters
1369          * could be sleeping for too long.  Wake up tardy waiters which
1370          * should have woken up in the last period and expire idle iocgs.
1371          */
1372         list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1373                 if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt &&
1374                     !iocg_is_idle(iocg))
1375                         continue;
1376
1377                 spin_lock(&iocg->waitq.lock);
1378
1379                 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1380                         /* might be oversleeping due to vtime / hweight changes, kick */
1381                         iocg_kick_waitq(iocg, &now);
1382                         iocg_kick_delay(iocg, &now);
1383                 } else if (iocg_is_idle(iocg)) {
1384                         /* no waiter and idle, deactivate */
1385                         iocg->last_inuse = iocg->inuse;
1386                         __propagate_active_weight(iocg, 0, 0);
1387                         list_del_init(&iocg->active_list);
1388                 }
1389
1390                 spin_unlock(&iocg->waitq.lock);
1391         }
1392         commit_active_weights(ioc);
1393
1394         /* calc usages and see whether some weights need to be moved around */
1395         list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1396                 u64 vdone, vtime, vusage, vmargin, vmin;
1397                 u32 hw_active, hw_inuse, usage;
1398
1399                 /*
1400                  * Collect unused and wind vtime closer to vnow to prevent
1401                  * iocgs from accumulating a large amount of budget.
1402                  */
1403                 vdone = atomic64_read(&iocg->done_vtime);
1404                 vtime = atomic64_read(&iocg->vtime);
1405                 current_hweight(iocg, &hw_active, &hw_inuse);
1406
1407                 /*
1408                  * Latency QoS detection doesn't account for IOs which are
1409                  * in-flight for longer than a period.  Detect them by
1410                  * comparing vdone against period start.  If lagging behind
1411                  * IOs from past periods, don't increase vrate.
1412                  */
1413                 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1414                     !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1415                     time_after64(vtime, vdone) &&
1416                     time_after64(vtime, now.vnow -
1417                                  MAX_LAGGING_PERIODS * period_vtime) &&
1418                     time_before64(vdone, now.vnow - period_vtime))
1419                         nr_lagging++;
1420
1421                 if (waitqueue_active(&iocg->waitq))
1422                         vusage = now.vnow - iocg->last_vtime;
1423                 else if (time_before64(iocg->last_vtime, vtime))
1424                         vusage = vtime - iocg->last_vtime;
1425                 else
1426                         vusage = 0;
1427
1428                 iocg->last_vtime += vusage;
1429                 /*
1430                  * Factor in in-flight vtime into vusage to avoid
1431                  * high-latency completions appearing as idle.  This should
1432                  * be done after the above ->last_vtime adjustment.
1433                  */
1434                 vusage = max(vusage, vtime - vdone);
1435
1436                 /* calculate hweight based usage ratio and record */
1437                 if (vusage) {
1438                         usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1439                                                    period_vtime);
1440                         iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1441                         iocg->usages[iocg->usage_idx] = usage;
1442                 } else {
1443                         usage = 0;
1444                 }
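
                /*
                 * Example (illustrative): with hw_inuse at 50% of
                 * HWEIGHT_WHOLE, a cgroup that consumed half of this
                 * period's vtime records usage of ~25%, i.e. the share
                 * of total device capacity it actually used.
                 */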
1445
1446                 /* see whether there's surplus vtime */
1447                 vmargin = ioc->margin_us * now.vrate;
1448                 vmin = now.vnow - vmargin;
1449
1450                 iocg->has_surplus = false;
1451
1452                 if (!waitqueue_active(&iocg->waitq) &&
1453                     time_before64(vtime, vmin)) {
1454                         u64 delta = vmin - vtime;
1455
1456                         /* throw away surplus vtime */
1457                         atomic64_add(delta, &iocg->vtime);
1458                         atomic64_add(delta, &iocg->done_vtime);
1459                         iocg->last_vtime += delta;
1460                         /* if usage is sufficiently low, maybe it can donate */
1461                         if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1462                                 iocg->has_surplus = true;
1463                                 nr_surpluses++;
1464                         }
1465                 } else if (hw_inuse < hw_active) {
1466                         u32 new_hwi, new_inuse;
1467
1468                         /* was donating but might need to take back some */
1469                         if (waitqueue_active(&iocg->waitq)) {
1470                                 new_hwi = hw_active;
1471                         } else {
1472                                 new_hwi = max(hw_inuse,
1473                                               usage * SURPLUS_SCALE_PCT / 100 +
1474                                               SURPLUS_SCALE_ABS);
1475                         }
1476
1477                         new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1478                                               hw_inuse);
1479                         new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1480
1481                         if (new_inuse > iocg->inuse) {
1482                                 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1483                                                 iocg->inuse, new_inuse,
1484                                                 hw_inuse, new_hwi);
1485                                 __propagate_active_weight(iocg, iocg->weight,
1486                                                           new_inuse);
1487                         }
1488                 } else {
1489                         /* genuinely out of vtime */
1490                         nr_shortages++;
1491                 }
1492         }
1493
1494         if (!nr_shortages || !nr_surpluses)
1495                 goto skip_surplus_transfers;
1496
1497         /* there are both shortages and surpluses, transfer surpluses */
1498         list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1499                 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1500                 int nr_valid = 0;
1501
1502                 if (!iocg->has_surplus)
1503                         continue;
1504
1505                 /* base the decision on max historical usage */
1506                 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1507                         if (iocg->usages[i]) {
1508                                 usage = max(usage, iocg->usages[i]);
1509                                 nr_valid++;
1510                         }
1511                 }
1512                 if (nr_valid < MIN_VALID_USAGES)
1513                         continue;
1514
1515                 current_hweight(iocg, &hw_active, &hw_inuse);
1516                 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1517                 if (!new_hwi)
1518                         continue;
1519
1520                 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1521                                                hw_inuse);
1522                 if (new_inuse < iocg->inuse) {
1523                         TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1524                                         iocg->inuse, new_inuse,
1525                                         hw_inuse, new_hwi);
1526                         __propagate_active_weight(iocg, iocg->weight, new_inuse);
1527                 }
1528         }
1529 skip_surplus_transfers:
1530         commit_active_weights(ioc);
1531
1532         /*
1533          * If q is getting clogged or we're missing too much, we're issuing
1534          * too much IO and should lower the vtime rate.  If we're not
1535          * missing the targets and seeing shortages but no surpluses, we're
1536          * being too stingy and should raise the vtime rate.
1537          */
1538         prev_busy_level = ioc->busy_level;
1539         if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1540             missed_ppm[READ] > ppm_rthr ||
1541             missed_ppm[WRITE] > ppm_wthr) {
1542                 /* clearly missing QoS targets, slow down vrate */
1543                 ioc->busy_level = max(ioc->busy_level, 0);
1544                 ioc->busy_level++;
1545         } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1546                    missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1547                    missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1548                 /* QoS targets are being met with >25% margin */
1549                 if (nr_shortages) {
1550                         /*
1551                          * We're throttling while the device has spare
1552                          * capacity.  If vrate was being slowed down, stop.
1553                          */
1554                         ioc->busy_level = min(ioc->busy_level, 0);
1555
1556                         /*
1557                          * If there are IOs spanning multiple periods, wait
1558                          * them out before pushing the device harder.  If
1559                          * there are surpluses, let redistribution work it
1560                          * out first.
1561                          */
1562                         if (!nr_lagging && !nr_surpluses)
1563                                 ioc->busy_level--;
1564                 } else {
1565                         /*
1566                          * Nobody is being throttled and the users aren't
1567                          * issuing enough IOs to saturate the device.  We
1568                          * simply don't know how close the device is to
1569                          * saturation.  Coast.
1570                          */
1571                         ioc->busy_level = 0;
1572                 }
1573         } else {
1574                 /* inside the hysteresis margin, we're good */
1575                 ioc->busy_level = 0;
1576         }
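
        /*
         * Illustrative trace (made-up numbers): three consecutive periods
         * that miss the QoS targets push busy_level from 0 to +3; a later
         * period that meets the targets with margin while still seeing
         * shortages first pulls it back to 0 and, absent lagging IOs and
         * surpluses, below 0, which the adjustment block below translates
         * into upward pressure on vrate.
         */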
1577
1578         ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1579
1580         if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1581                 u64 vrate = atomic64_read(&ioc->vtime_rate);
1582                 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1583
1584                 /* rq_wait signal is always reliable, ignore user vrate_min */
1585                 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1586                         vrate_min = VRATE_MIN;
1587
1588                 /*
1589                  * If vrate is out of bounds, apply clamp gradually as the
1590                  * bounds can change abruptly.  Otherwise, apply busy_level
1591                  * based adjustment.
1592                  */
1593                 if (vrate < vrate_min) {
1594                         vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1595                                           100);
1596                         vrate = min(vrate, vrate_min);
1597                 } else if (vrate > vrate_max) {
1598                         vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1599                                           100);
1600                         vrate = max(vrate, vrate_max);
1601                 } else {
1602                         int idx = min_t(int, abs(ioc->busy_level),
1603                                         ARRAY_SIZE(vrate_adj_pct) - 1);
1604                         u32 adj_pct = vrate_adj_pct[idx];
1605
1606                         if (ioc->busy_level > 0)
1607                                 adj_pct = 100 - adj_pct;
1608                         else
1609                                 adj_pct = 100 + adj_pct;
1610
1611                         vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1612                                       vrate_min, vrate_max);
1613                 }
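
                /*
                 * Example (illustrative): with busy_level at -4 and a
                 * vrate_adj_pct entry of, say, 1, vrate is bumped to
                 * 101% of its current value; at +4 it would instead be
                 * trimmed to 99%.  Either way the result stays within
                 * [vrate_min, vrate_max].
                 */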
1614
1615                 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1616                                            nr_lagging, nr_shortages,
1617                                            nr_surpluses);
1618
1619                 atomic64_set(&ioc->vtime_rate, vrate);
1620                 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1621                         ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1622         } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1623                 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1624                                            missed_ppm, rq_wait_pct, nr_lagging,
1625                                            nr_shortages, nr_surpluses);
1626         }
1627
1628         ioc_refresh_params(ioc, false);
1629
1630         /*
1631          * This period is done.  Move onto the next one.  If nothing's
1632          * going on with the device, stop the timer.
1633          */
1634         atomic64_inc(&ioc->cur_period);
1635
1636         if (ioc->running != IOC_STOP) {
1637                 if (!list_empty(&ioc->active_iocgs)) {
1638                         ioc_start_period(ioc, &now);
1639                 } else {
1640                         ioc->busy_level = 0;
1641                         ioc->running = IOC_IDLE;
1642                 }
1643         }
1644
1645         spin_unlock_irq(&ioc->lock);
1646 }
1647
1648 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1649                                     bool is_merge, u64 *costp)
1650 {
1651         struct ioc *ioc = iocg->ioc;
1652         u64 coef_seqio, coef_randio, coef_page;
1653         u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1654         u64 seek_pages = 0;
1655         u64 cost = 0;
1656
1657         switch (bio_op(bio)) {
1658         case REQ_OP_READ:
1659                 coef_seqio      = ioc->params.lcoefs[LCOEF_RSEQIO];
1660                 coef_randio     = ioc->params.lcoefs[LCOEF_RRANDIO];
1661                 coef_page       = ioc->params.lcoefs[LCOEF_RPAGE];
1662                 break;
1663         case REQ_OP_WRITE:
1664                 coef_seqio      = ioc->params.lcoefs[LCOEF_WSEQIO];
1665                 coef_randio     = ioc->params.lcoefs[LCOEF_WRANDIO];
1666                 coef_page       = ioc->params.lcoefs[LCOEF_WPAGE];
1667                 break;
1668         default:
1669                 goto out;
1670         }
1671
1672         if (iocg->cursor) {
1673                 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1674                 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1675         }
1676
1677         if (!is_merge) {
1678                 if (seek_pages > LCOEF_RANDIO_PAGES) {
1679                         cost += coef_randio;
1680                 } else {
1681                         cost += coef_seqio;
1682                 }
1683         }
1684         cost += pages * coef_page;
1685 out:
1686         *costp = cost;
1687 }
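
/*
 * Worked example (illustrative, assuming IOC_SECT_TO_PAGE_SHIFT of 3,
 * i.e. 4KB cost-model pages): a non-merge 64KB read whose start sector
 * is far from the cursor (seek_pages > LCOEF_RANDIO_PAGES) costs
 * coef_randio + 16 * coef_page, since 128 sectors >> 3 is 16 pages.  The
 * same IO issued sequentially pays coef_seqio instead, and a merge pays
 * only the per-page component.
 */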
1688
1689 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1690 {
1691         u64 cost;
1692
1693         calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1694         return cost;
1695 }
1696
1697 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
1698                                          u64 *costp)
1699 {
1700         unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
1701
1702         switch (req_op(rq)) {
1703         case REQ_OP_READ:
1704                 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
1705                 break;
1706         case REQ_OP_WRITE:
1707                 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
1708                 break;
1709         default:
1710                 *costp = 0;
1711         }
1712 }
1713
1714 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
1715 {
1716         u64 cost;
1717
1718         calc_size_vtime_cost_builtin(rq, ioc, &cost);
1719         return cost;
1720 }
1721
1722 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1723 {
1724         struct blkcg_gq *blkg = bio->bi_blkg;
1725         struct ioc *ioc = rqos_to_ioc(rqos);
1726         struct ioc_gq *iocg = blkg_to_iocg(blkg);
1727         struct ioc_now now;
1728         struct iocg_wait wait;
1729         u32 hw_active, hw_inuse;
1730         u64 abs_cost, cost, vtime;
1731
1732         /* bypass IOs if disabled or for root cgroup */
1733         if (!ioc->enabled || !iocg->level)
1734                 return;
1735
1736         /* always activate so that even 0 cost IOs get protected to some level */
1737         if (!iocg_activate(iocg, &now))
1738                 return;
1739
1740         /* calculate the absolute vtime cost */
1741         abs_cost = calc_vtime_cost(bio, iocg, false);
1742         if (!abs_cost)
1743                 return;
1744
1745         iocg->cursor = bio_end_sector(bio);
1746
1747         vtime = atomic64_read(&iocg->vtime);
1748         current_hweight(iocg, &hw_active, &hw_inuse);
1749
1750         if (hw_inuse < hw_active &&
1751             time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1752                 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1753                                 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1754                 spin_lock_irq(&ioc->lock);
1755                 propagate_active_weight(iocg, iocg->weight, iocg->weight);
1756                 spin_unlock_irq(&ioc->lock);
1757                 current_hweight(iocg, &hw_active, &hw_inuse);
1758         }
1759
1760         cost = abs_cost_to_cost(abs_cost, hw_inuse);
1761
1762         /*
1763          * If no one's waiting and within budget, issue right away.  The
1764          * tests are racy but the races aren't systemic - we only miss once
1765          * in a while which is fine.
1766          */
1767         if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1768             time_before_eq64(vtime + cost, now.vnow)) {
1769                 iocg_commit_bio(iocg, bio, cost);
1770                 return;
1771         }
1772
1773         /*
1774          * We activated above but w/o any synchronization. Deactivation is
1775          * synchronized with waitq.lock and we won't get deactivated as long
1776          * as we're waiting or have debt, so we're good if we're activated
1777          * here. In the unlikely case that we aren't, just issue the IO.
1778          */
1779         spin_lock_irq(&iocg->waitq.lock);
1780
1781         if (unlikely(list_empty(&iocg->active_list))) {
1782                 spin_unlock_irq(&iocg->waitq.lock);
1783                 iocg_commit_bio(iocg, bio, cost);
1784                 return;
1785         }
1786
1787         /*
1788          * We're over budget. If @bio has to be issued regardless, remember
1789          * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1790          * off the debt before waking more IOs.
1791          *
1792          * This way, the debt is continuously paid off each period with the
1793          * actual budget available to the cgroup. If we just wound vtime, we
1794          * would incorrectly use the current hw_inuse for the entire amount
1795          * which, for example, can lead to the cgroup staying blocked for a
1796          * long time even with substantially raised hw_inuse.
1797          *
1798          * An iocg with vdebt should stay online so that the timer can keep
1799          * deducting its vdebt and [de]activating the use_delay mechanism
1800          * accordingly. We don't want to race against the timer trying to
1801          * clear them and leave @iocg inactive w/ dangling use_delay heavily
1802          * penalizing the cgroup and its descendants.
1803          */
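        /*
         * Example (illustrative): a swap-out bio is issued as the root
         * blkg and therefore cannot wait here; its abs_cost is added to
         * abs_vdebt below, later periods pay that debt off out of the
         * cgroup's real budget, and use_delay throttles the cgroup in
         * the meantime.
         */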
1804         if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1805                 iocg->abs_vdebt += abs_cost;
1806                 if (iocg_kick_delay(iocg, &now))
1807                         blkcg_schedule_throttle(rqos->q,
1808                                         (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1809                 spin_unlock_irq(&iocg->waitq.lock);
1810                 return;
1811         }
1812
1813         /*
1814          * Append self to the waitq and schedule the wakeup timer if we're
1815          * the first waiter.  The timer duration is calculated based on the
1816          * current vrate.  vtime and hweight changes can make it too short
1817          * or too long.  Each wait entry records the absolute cost it's
1818          * waiting for to allow re-evaluation using a custom wait entry.
1819          *
1820          * If too short, the timer simply reschedules itself.  If too long,
1821          * the period timer will notice and trigger wakeups.
1822          *
1823          * All waiters are on iocg->waitq and the wait states are
1824          * synchronized using waitq.lock.
1825          */
1826         init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1827         wait.wait.private = current;
1828         wait.bio = bio;
1829         wait.abs_cost = abs_cost;
1830         wait.committed = false; /* will be set true by waker */
1831
1832         __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1833         iocg_kick_waitq(iocg, &now);
1834
1835         spin_unlock_irq(&iocg->waitq.lock);
1836
1837         while (true) {
1838                 set_current_state(TASK_UNINTERRUPTIBLE);
1839                 if (wait.committed)
1840                         break;
1841                 io_schedule();
1842         }
1843
1844         /* waker already committed us, proceed */
1845         finish_wait(&iocg->waitq, &wait.wait);
1846 }
1847
1848 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1849                            struct bio *bio)
1850 {
1851         struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1852         struct ioc *ioc = iocg->ioc;
1853         sector_t bio_end = bio_end_sector(bio);
1854         struct ioc_now now;
1855         u32 hw_inuse;
1856         u64 abs_cost, cost;
1857         unsigned long flags;
1858
1859         /* bypass if disabled or for root cgroup */
1860         if (!ioc->enabled || !iocg->level)
1861                 return;
1862
1863         abs_cost = calc_vtime_cost(bio, iocg, true);
1864         if (!abs_cost)
1865                 return;
1866
1867         ioc_now(ioc, &now);
1868         current_hweight(iocg, NULL, &hw_inuse);
1869         cost = abs_cost_to_cost(abs_cost, hw_inuse);
1870
1871         /* update cursor if backmerging into the request at the cursor */
1872         if (blk_rq_pos(rq) < bio_end &&
1873             blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1874                 iocg->cursor = bio_end;
1875
1876         /*
1877          * Charge if there's enough vtime budget and the existing request has
1878          * cost assigned.
1879          */
1880         if (rq->bio && rq->bio->bi_iocost_cost &&
1881             time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1882                 iocg_commit_bio(iocg, bio, cost);
1883                 return;
1884         }
1885
1886         /*
1887          * Otherwise, account it as debt if @iocg is online, which it should
1888          * be for the vast majority of cases. See debt handling in
1889          * ioc_rqos_throttle() for details.
1890          */
1891         spin_lock_irqsave(&iocg->waitq.lock, flags);
1892         if (likely(!list_empty(&iocg->active_list))) {
1893                 iocg->abs_vdebt += abs_cost;
1894                 iocg_kick_delay(iocg, &now);
1895         } else {
1896                 iocg_commit_bio(iocg, bio, cost);
1897         }
1898         spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1899 }
1900
1901 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1902 {
1903         struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1904
1905         if (iocg && bio->bi_iocost_cost)
1906                 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1907 }
1908
1909 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1910 {
1911         struct ioc *ioc = rqos_to_ioc(rqos);
1912         u64 on_q_ns, rq_wait_ns, size_nsec;
1913         int pidx, rw;
1914
1915         if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1916                 return;
1917
1918         switch (req_op(rq) & REQ_OP_MASK) {
1919         case REQ_OP_READ:
1920                 pidx = QOS_RLAT;
1921                 rw = READ;
1922                 break;
1923         case REQ_OP_WRITE:
1924                 pidx = QOS_WLAT;
1925                 rw = WRITE;
1926                 break;
1927         default:
1928                 return;
1929         }
1930
1931         on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1932         rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1933         size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
1934
1935         if (on_q_ns <= size_nsec ||
1936             on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1937                 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1938         else
1939                 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1940
1941         this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1942 }
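
/*
 * Worked example (illustrative): a read that spent 3ms between
 * allocation and completion with a 1ms size cost is judged on the
 * remaining 2ms; against a QOS_RLAT target of 2500us it counts as met,
 * whereas charging the full 3ms would have counted it as missed.
 */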
1943
1944 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1945 {
1946         struct ioc *ioc = rqos_to_ioc(rqos);
1947
1948         spin_lock_irq(&ioc->lock);
1949         ioc_refresh_params(ioc, false);
1950         spin_unlock_irq(&ioc->lock);
1951 }
1952
1953 static void ioc_rqos_exit(struct rq_qos *rqos)
1954 {
1955         struct ioc *ioc = rqos_to_ioc(rqos);
1956
1957         blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1958
1959         spin_lock_irq(&ioc->lock);
1960         ioc->running = IOC_STOP;
1961         spin_unlock_irq(&ioc->lock);
1962
1963         del_timer_sync(&ioc->timer);
1964         free_percpu(ioc->pcpu_stat);
1965         kfree(ioc);
1966 }
1967
1968 static struct rq_qos_ops ioc_rqos_ops = {
1969         .throttle = ioc_rqos_throttle,
1970         .merge = ioc_rqos_merge,
1971         .done_bio = ioc_rqos_done_bio,
1972         .done = ioc_rqos_done,
1973         .queue_depth_changed = ioc_rqos_queue_depth_changed,
1974         .exit = ioc_rqos_exit,
1975 };
1976
1977 static int blk_iocost_init(struct request_queue *q)
1978 {
1979         struct ioc *ioc;
1980         struct rq_qos *rqos;
1981         int ret;
1982
1983         ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1984         if (!ioc)
1985                 return -ENOMEM;
1986
1987         ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1988         if (!ioc->pcpu_stat) {
1989                 kfree(ioc);
1990                 return -ENOMEM;
1991         }
1992
1993         rqos = &ioc->rqos;
1994         rqos->id = RQ_QOS_COST;
1995         rqos->ops = &ioc_rqos_ops;
1996         rqos->q = q;
1997
1998         spin_lock_init(&ioc->lock);
1999         timer_setup(&ioc->timer, ioc_timer_fn, 0);
2000         INIT_LIST_HEAD(&ioc->active_iocgs);
2001
2002         ioc->running = IOC_IDLE;
2003         atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2004         seqcount_init(&ioc->period_seqcount);
2005         ioc->period_at = ktime_to_us(ktime_get());
2006         atomic64_set(&ioc->cur_period, 0);
2007         atomic_set(&ioc->hweight_gen, 0);
2008
2009         spin_lock_irq(&ioc->lock);
2010         ioc->autop_idx = AUTOP_INVALID;
2011         ioc_refresh_params(ioc, true);
2012         spin_unlock_irq(&ioc->lock);
2013
2014         rq_qos_add(q, rqos);
2015         ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2016         if (ret) {
2017                 rq_qos_del(q, rqos);
2018                 free_percpu(ioc->pcpu_stat);
2019                 kfree(ioc);
2020                 return ret;
2021         }
2022         return 0;
2023 }
2024
2025 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2026 {
2027         struct ioc_cgrp *iocc;
2028
2029         iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2030         if (!iocc)
2031                 return NULL;
2032
2033         iocc->dfl_weight = CGROUP_WEIGHT_DFL;
2034         return &iocc->cpd;
2035 }
2036
2037 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2038 {
2039         kfree(container_of(cpd, struct ioc_cgrp, cpd));
2040 }
2041
2042 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2043                                              struct blkcg *blkcg)
2044 {
2045         int levels = blkcg->css.cgroup->level + 1;
2046         struct ioc_gq *iocg;
2047
2048         iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
2049                             gfp, q->node);
2050         if (!iocg)
2051                 return NULL;
2052
2053         return &iocg->pd;
2054 }
2055
2056 static void ioc_pd_init(struct blkg_policy_data *pd)
2057 {
2058         struct ioc_gq *iocg = pd_to_iocg(pd);
2059         struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2060         struct ioc *ioc = q_to_ioc(blkg->q);
2061         struct ioc_now now;
2062         struct blkcg_gq *tblkg;
2063         unsigned long flags;
2064
2065         ioc_now(ioc, &now);
2066
2067         iocg->ioc = ioc;
2068         atomic64_set(&iocg->vtime, now.vnow);
2069         atomic64_set(&iocg->done_vtime, now.vnow);
2070         atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2071         INIT_LIST_HEAD(&iocg->active_list);
2072         iocg->hweight_active = HWEIGHT_WHOLE;
2073         iocg->hweight_inuse = HWEIGHT_WHOLE;
2074
2075         init_waitqueue_head(&iocg->waitq);
2076         hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2077         iocg->waitq_timer.function = iocg_waitq_timer_fn;
2078         hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2079         iocg->delay_timer.function = iocg_delay_timer_fn;
2080
2081         iocg->level = blkg->blkcg->css.cgroup->level;
2082
2083         for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2084                 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2085                 iocg->ancestors[tiocg->level] = tiocg;
2086         }
2087
2088         spin_lock_irqsave(&ioc->lock, flags);
2089         weight_updated(iocg);
2090         spin_unlock_irqrestore(&ioc->lock, flags);
2091 }
2092
2093 static void ioc_pd_free(struct blkg_policy_data *pd)
2094 {
2095         struct ioc_gq *iocg = pd_to_iocg(pd);
2096         struct ioc *ioc = iocg->ioc;
2097
2098         if (ioc) {
2099                 spin_lock(&ioc->lock);
2100                 if (!list_empty(&iocg->active_list)) {
2101                         propagate_active_weight(iocg, 0, 0);
2102                         list_del_init(&iocg->active_list);
2103                 }
2104                 spin_unlock(&ioc->lock);
2105
2106                 hrtimer_cancel(&iocg->waitq_timer);
2107                 hrtimer_cancel(&iocg->delay_timer);
2108         }
2109         kfree(iocg);
2110 }
2111
2112 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2113                              int off)
2114 {
2115         const char *dname = blkg_dev_name(pd->blkg);
2116         struct ioc_gq *iocg = pd_to_iocg(pd);
2117
2118         if (dname && iocg->cfg_weight)
2119                 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2120         return 0;
2121 }
2122
2123
2124 static int ioc_weight_show(struct seq_file *sf, void *v)
2125 {
2126         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2127         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2128
2129         seq_printf(sf, "default %u\n", iocc->dfl_weight);
2130         blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2131                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
2132         return 0;
2133 }
2134
2135 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2136                                 size_t nbytes, loff_t off)
2137 {
2138         struct blkcg *blkcg = css_to_blkcg(of_css(of));
2139         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2140         struct blkg_conf_ctx ctx;
2141         struct ioc_gq *iocg;
2142         u32 v;
2143         int ret;
2144
2145         if (!strchr(buf, ':')) {
2146                 struct blkcg_gq *blkg;
2147
2148                 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2149                         return -EINVAL;
2150
2151                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2152                         return -EINVAL;
2153
2154                 spin_lock(&blkcg->lock);
2155                 iocc->dfl_weight = v;
2156                 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2157                         struct ioc_gq *iocg = blkg_to_iocg(blkg);
2158
2159                         if (iocg) {
2160                                 spin_lock_irq(&iocg->ioc->lock);
2161                                 weight_updated(iocg);
2162                                 spin_unlock_irq(&iocg->ioc->lock);
2163                         }
2164                 }
2165                 spin_unlock(&blkcg->lock);
2166
2167                 return nbytes;
2168         }
2169
2170         ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2171         if (ret)
2172                 return ret;
2173
2174         iocg = blkg_to_iocg(ctx.blkg);
2175
2176         if (!strncmp(ctx.body, "default", 7)) {
2177                 v = 0;
2178         } else {
2179                 if (!sscanf(ctx.body, "%u", &v))
2180                         goto einval;
2181                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2182                         goto einval;
2183         }
2184
2185         spin_lock(&iocg->ioc->lock);
2186         iocg->cfg_weight = v;
2187         weight_updated(iocg);
2188         spin_unlock(&iocg->ioc->lock);
2189
2190         blkg_conf_finish(&ctx);
2191         return nbytes;
2192
2193 einval:
2194         blkg_conf_finish(&ctx);
2195         return -EINVAL;
2196 }
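
/*
 * Example usage (illustrative, per the parsing above): writing
 * "default 100" (or just "100") to io.weight sets the cgroup-wide
 * default weight, "MAJ:MIN 200" sets a per-device weight, and
 * "MAJ:MIN default" clears the per-device setting back to the default.
 */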
2197
2198 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2199                           int off)
2200 {
2201         const char *dname = blkg_dev_name(pd->blkg);
2202         struct ioc *ioc = pd_to_iocg(pd)->ioc;
2203
2204         if (!dname)
2205                 return 0;
2206
2207         seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2208                    dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2209                    ioc->params.qos[QOS_RPPM] / 10000,
2210                    ioc->params.qos[QOS_RPPM] % 10000 / 100,
2211                    ioc->params.qos[QOS_RLAT],
2212                    ioc->params.qos[QOS_WPPM] / 10000,
2213                    ioc->params.qos[QOS_WPPM] % 10000 / 100,
2214                    ioc->params.qos[QOS_WLAT],
2215                    ioc->params.qos[QOS_MIN] / 10000,
2216                    ioc->params.qos[QOS_MIN] % 10000 / 100,
2217                    ioc->params.qos[QOS_MAX] / 10000,
2218                    ioc->params.qos[QOS_MAX] % 10000 / 100);
2219         return 0;
2220 }
2221
2222 static int ioc_qos_show(struct seq_file *sf, void *v)
2223 {
2224         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2225
2226         blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2227                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
2228         return 0;
2229 }
2230
2231 static const match_table_t qos_ctrl_tokens = {
2232         { QOS_ENABLE,           "enable=%u"     },
2233         { QOS_CTRL,             "ctrl=%s"       },
2234         { NR_QOS_CTRL_PARAMS,   NULL            },
2235 };
2236
2237 static const match_table_t qos_tokens = {
2238         { QOS_RPPM,             "rpct=%s"       },
2239         { QOS_RLAT,             "rlat=%u"       },
2240         { QOS_WPPM,             "wpct=%s"       },
2241         { QOS_WLAT,             "wlat=%u"       },
2242         { QOS_MIN,              "min=%s"        },
2243         { QOS_MAX,              "max=%s"        },
2244         { NR_QOS_PARAMS,        NULL            },
2245 };
2246
2247 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2248                              size_t nbytes, loff_t off)
2249 {
2250         struct gendisk *disk;
2251         struct ioc *ioc;
2252         u32 qos[NR_QOS_PARAMS];
2253         bool enable, user;
2254         char *p;
2255         int ret;
2256
2257         disk = blkcg_conf_get_disk(&input);
2258         if (IS_ERR(disk))
2259                 return PTR_ERR(disk);
2260
2261         ioc = q_to_ioc(disk->queue);
2262         if (!ioc) {
2263                 ret = blk_iocost_init(disk->queue);
2264                 if (ret)
2265                         goto err;
2266                 ioc = q_to_ioc(disk->queue);
2267         }
2268
2269         spin_lock_irq(&ioc->lock);
2270         memcpy(qos, ioc->params.qos, sizeof(qos));
2271         enable = ioc->enabled;
2272         user = ioc->user_qos_params;
2273         spin_unlock_irq(&ioc->lock);
2274
2275         while ((p = strsep(&input, " \t\n"))) {
2276                 substring_t args[MAX_OPT_ARGS];
2277                 char buf[32];
2278                 int tok;
2279                 s64 v;
2280
2281                 if (!*p)
2282                         continue;
2283
2284                 switch (match_token(p, qos_ctrl_tokens, args)) {
2285                 case QOS_ENABLE:
2286                         match_u64(&args[0], &v);
2287                         enable = v;
2288                         continue;
2289                 case QOS_CTRL:
2290                         match_strlcpy(buf, &args[0], sizeof(buf));
2291                         if (!strcmp(buf, "auto"))
2292                                 user = false;
2293                         else if (!strcmp(buf, "user"))
2294                                 user = true;
2295                         else
2296                                 goto einval;
2297                         continue;
2298                 }
2299
2300                 tok = match_token(p, qos_tokens, args);
2301                 switch (tok) {
2302                 case QOS_RPPM:
2303                 case QOS_WPPM:
2304                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2305                             sizeof(buf))
2306                                 goto einval;
2307                         if (cgroup_parse_float(buf, 2, &v))
2308                                 goto einval;
2309                         if (v < 0 || v > 10000)
2310                                 goto einval;
2311                         qos[tok] = v * 100;
2312                         break;
2313                 case QOS_RLAT:
2314                 case QOS_WLAT:
2315                         if (match_u64(&args[0], &v))
2316                                 goto einval;
2317                         qos[tok] = v;
2318                         break;
2319                 case QOS_MIN:
2320                 case QOS_MAX:
2321                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2322                             sizeof(buf))
2323                                 goto einval;
2324                         if (cgroup_parse_float(buf, 2, &v))
2325                                 goto einval;
2326                         if (v < 0)
2327                                 goto einval;
2328                         qos[tok] = clamp_t(s64, v * 100,
2329                                            VRATE_MIN_PPM, VRATE_MAX_PPM);
2330                         break;
2331                 default:
2332                         goto einval;
2333                 }
2334                 user = true;
2335         }
2336
2337         if (qos[QOS_MIN] > qos[QOS_MAX])
2338                 goto einval;
2339
2340         spin_lock_irq(&ioc->lock);
2341
2342         if (enable) {
2343                 blk_stat_enable_accounting(ioc->rqos.q);
2344                 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2345                 ioc->enabled = true;
2346         } else {
2347                 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2348                 ioc->enabled = false;
2349         }
2350
2351         if (user) {
2352                 memcpy(ioc->params.qos, qos, sizeof(qos));
2353                 ioc->user_qos_params = true;
2354         } else {
2355                 ioc->user_qos_params = false;
2356         }
2357
2358         ioc_refresh_params(ioc, true);
2359         spin_unlock_irq(&ioc->lock);
2360
2361         put_disk_and_module(disk);
2362         return nbytes;
2363 einval:
2364         ret = -EINVAL;
2365 err:
2366         put_disk_and_module(disk);
2367         return ret;
2368 }
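
/*
 * Example usage (illustrative values): a line such as
 *
 *   MAJ:MIN enable=1 ctrl=user rpct=95.00 rlat=5000 wpct=95.00 wlat=5000
 *           min=50.00 max=150.00
 *
 * written to io.cost.qos enables the controller on that device and pins
 * the QoS parameters: rpct/wpct/min/max are two-decimal percentages and
 * rlat/wlat are in microseconds, mirroring the format printed by
 * ioc_qos_prfill() above.
 */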
2369
2370 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2371                                  struct blkg_policy_data *pd, int off)
2372 {
2373         const char *dname = blkg_dev_name(pd->blkg);
2374         struct ioc *ioc = pd_to_iocg(pd)->ioc;
2375         u64 *u = ioc->params.i_lcoefs;
2376
2377         if (!dname)
2378                 return 0;
2379
2380         seq_printf(sf, "%s ctrl=%s model=linear "
2381                    "rbps=%llu rseqiops=%llu rrandiops=%llu "
2382                    "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2383                    dname, ioc->user_cost_model ? "user" : "auto",
2384                    u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2385                    u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2386         return 0;
2387 }
2388
2389 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2390 {
2391         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2392
2393         blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2394                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
2395         return 0;
2396 }
2397
2398 static const match_table_t cost_ctrl_tokens = {
2399         { COST_CTRL,            "ctrl=%s"       },
2400         { COST_MODEL,           "model=%s"      },
2401         { NR_COST_CTRL_PARAMS,  NULL            },
2402 };
2403
2404 static const match_table_t i_lcoef_tokens = {
2405         { I_LCOEF_RBPS,         "rbps=%u"       },
2406         { I_LCOEF_RSEQIOPS,     "rseqiops=%u"   },
2407         { I_LCOEF_RRANDIOPS,    "rrandiops=%u"  },
2408         { I_LCOEF_WBPS,         "wbps=%u"       },
2409         { I_LCOEF_WSEQIOPS,     "wseqiops=%u"   },
2410         { I_LCOEF_WRANDIOPS,    "wrandiops=%u"  },
2411         { NR_I_LCOEFS,          NULL            },
2412 };
2413
2414 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2415                                     size_t nbytes, loff_t off)
2416 {
2417         struct gendisk *disk;
2418         struct ioc *ioc;
2419         u64 u[NR_I_LCOEFS];
2420         bool user;
2421         char *p;
2422         int ret;
2423
2424         disk = blkcg_conf_get_disk(&input);
2425         if (IS_ERR(disk))
2426                 return PTR_ERR(disk);
2427
2428         ioc = q_to_ioc(disk->queue);
2429         if (!ioc) {
2430                 ret = blk_iocost_init(disk->queue);
2431                 if (ret)
2432                         goto err;
2433                 ioc = q_to_ioc(disk->queue);
2434         }
2435
2436         spin_lock_irq(&ioc->lock);
2437         memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2438         user = ioc->user_cost_model;
2439         spin_unlock_irq(&ioc->lock);
2440
2441         while ((p = strsep(&input, " \t\n"))) {
2442                 substring_t args[MAX_OPT_ARGS];
2443                 char buf[32];
2444                 int tok;
2445                 u64 v;
2446
2447                 if (!*p)
2448                         continue;
2449
2450                 switch (match_token(p, cost_ctrl_tokens, args)) {
2451                 case COST_CTRL:
2452                         match_strlcpy(buf, &args[0], sizeof(buf));
2453                         if (!strcmp(buf, "auto"))
2454                                 user = false;
2455                         else if (!strcmp(buf, "user"))
2456                                 user = true;
2457                         else
2458                                 goto einval;
2459                         continue;
2460                 case COST_MODEL:
2461                         match_strlcpy(buf, &args[0], sizeof(buf));
2462                         if (strcmp(buf, "linear"))
2463                                 goto einval;
2464                         continue;
2465                 }
2466
2467                 tok = match_token(p, i_lcoef_tokens, args);
2468                 if (tok == NR_I_LCOEFS)
2469                         goto einval;
2470                 if (match_u64(&args[0], &v))
2471                         goto einval;
2472                 u[tok] = v;
2473                 user = true;
2474         }
2475
2476         spin_lock_irq(&ioc->lock);
2477         if (user) {
2478                 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2479                 ioc->user_cost_model = true;
2480         } else {
2481                 ioc->user_cost_model = false;
2482         }
2483         ioc_refresh_params(ioc, true);
2484         spin_unlock_irq(&ioc->lock);
2485
2486         put_disk_and_module(disk);
2487         return nbytes;
2488
2489 einval:
2490         ret = -EINVAL;
2491 err:
2492         put_disk_and_module(disk);
2493         return ret;
2494 }
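
/*
 * Example usage (illustrative, made-up coefficients): writing
 *
 *   MAJ:MIN ctrl=user model=linear rbps=250000000 rseqiops=100000
 *           rrandiops=50000 wbps=150000000 wseqiops=80000 wrandiops=20000
 *
 * to io.cost.model switches the device to these user-supplied linear
 * model coefficients; "ctrl=auto" reverts to the builtin parameters.
 */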
2495
2496 static struct cftype ioc_files[] = {
2497         {
2498                 .name = "weight",
2499                 .flags = CFTYPE_NOT_ON_ROOT,
2500                 .seq_show = ioc_weight_show,
2501                 .write = ioc_weight_write,
2502         },
2503         {
2504                 .name = "cost.qos",
2505                 .flags = CFTYPE_ONLY_ON_ROOT,
2506                 .seq_show = ioc_qos_show,
2507                 .write = ioc_qos_write,
2508         },
2509         {
2510                 .name = "cost.model",
2511                 .flags = CFTYPE_ONLY_ON_ROOT,
2512                 .seq_show = ioc_cost_model_show,
2513                 .write = ioc_cost_model_write,
2514         },
2515         {}
2516 };
2517
2518 static struct blkcg_policy blkcg_policy_iocost = {
2519         .dfl_cftypes    = ioc_files,
2520         .cpd_alloc_fn   = ioc_cpd_alloc,
2521         .cpd_free_fn    = ioc_cpd_free,
2522         .pd_alloc_fn    = ioc_pd_alloc,
2523         .pd_init_fn     = ioc_pd_init,
2524         .pd_free_fn     = ioc_pd_free,
2525 };
2526
2527 static int __init ioc_init(void)
2528 {
2529         return blkcg_policy_register(&blkcg_policy_iocost);
2530 }
2531
2532 static void __exit ioc_exit(void)
2533 {
2534         return blkcg_policy_unregister(&blkcg_policy_iocost);
2535 }
2536
2537 module_init(ioc_init);
2538 module_exit(ioc_exit);