1 /* SPDX-License-Identifier: GPL-2.0
3 * IO cost model based controller.
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
9 * One challenge of controlling IO resources is the lack of a trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to variations of multiple orders of magnitude, rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
18 * time, with a lot of crutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
41 * characteristics of a wide variety of devices well enough. Default
42 * parameters for several different classes of devices are provided and the
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
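 *
 * As a rough sketch of the linear model (the names here are illustrative,
 * not the actual variables used below), the cost charged for one IO is
 * approximately:
 *
 *   cost = (is_seq ? seq_base_cost : rand_base_cost) + nr_4k_pages * page_cost
 *
 * with separate coefficient sets for reads and writes (see
 * calc_vtime_cost_builtin() below).
 *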
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
54 * The device virtual time (vtime) is used as the primary control metric.
55 * The control strategy is composed of the following three parts.
57 * 2-1. Vtime Distribution
59 * When a cgroup becomes active in terms of IOs, its hierarchical share is
60 * calculated. Please consider the following hierarchy where the numbers
61 * inside parentheses denote the configured weights.
67 * A0 (w:100) A1 (w:100)
69 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
70 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
71 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
72 * 12.5% each. The distribution mechanism only cares about these flattened
73 * shares. They're called hweights (hierarchical weights) and always add
74 * up to 1 (HWEIGHT_WHOLE).
76 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
77 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
78 * against the device vtime - an IO which takes 10ms on the underlying
79 * device is considered to take 80ms on A0.
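 *
 * As a worked example of the flattened shares above, with A weighted 100,
 * B weighted 300 and A0/A1 weighted 100 each:
 *
 *   hweight(A0) = 100/(100+300) * 100/(100+100) = 0.25 * 0.5 = 12.5%
 *
 * which is the 12.5% share quoted above, stored internally as a fraction
 * of HWEIGHT_WHOLE.
 *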
81 * This constitutes the basis of IO capacity distribution. Each cgroup's
82 * vtime is running at a rate determined by its hweight. A cgroup tracks
83 * the vtime consumed by past IOs and can issue a new IO iff doing so
84 * wouldn't outrun the current device vtime. Otherwise, the IO is
85 * suspended until the vtime has progressed enough to cover it.
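 *
 * Ignoring in-flight adjustments, the issue-time decision is therefore
 * roughly the following (a simplified sketch of the check in
 * ioc_rqos_throttle() below):
 *
 *	if (vtime + cost <= vnow)
 *		issue immediately;
 *	else
 *		wait on iocg->waitq until vtime catches up;
 *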
87 * 2-2. Vrate Adjustment
89 * It's unrealistic to expect the cost model to be perfect. There are too
90 * many devices and even on the same device the overall performance
91 * fluctuates depending on numerous factors such as IO mixture and device
92 * internal garbage collection. The controller needs to adapt dynamically.
94 * This is achieved by adjusting the overall IO rate according to how busy
95 * the device is. If the device becomes overloaded, we're sending down too
96 * many IOs and should generally slow down. If there are waiting issuers
97 * but the device isn't saturated, we're issuing too few and should
100 * To slow down, we lower the vrate - the rate at which the device vtime
101 * passes compared to the wall clock. For example, if the vtime is running
102 * at the vrate of 75%, all cgroups added up would only be able to issue
103 * 750ms worth of IOs per second, and vice-versa for speeding up.
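 *
 * In terms of the bookkeeping below, the device vtime advances as
 *
 *   vnow = period_at_vtime + (wallclock now - period_at) * vrate
 *
 * so lowering vrate uniformly shrinks every cgroup's budget (see
 * ioc_now()).
 *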
105 * Device busyness is determined using two criteria - rq wait and
106 * completion latencies.
108 * When a device gets saturated, the on-device and then the request queues
109 * fill up and a bio which is ready to be issued has to wait for a request
110 * to become available. When this delay becomes noticeable, it's a clear
111 * indication that the device is saturated and we lower the vrate. This
112 * saturation signal is fairly conservative as it only triggers when both
113 * hardware and software queues are filled up, and is used as the default
116 * As devices can have deep queues and be unfair in how the queued commands
117 * are executed, solely depending on rq wait may not result in satisfactory
118 * control quality. For a better control quality, completion latency QoS
119 * parameters can be configured so that the device is considered saturated
120 * if N'th percentile completion latency rises above the set point.
122 * The completion latency requirements are a function of both the
123 * underlying device characteristics and the desired IO latency quality of
124 * service. There is an inherent trade-off - the tighter the latency QoS,
125 * the higher the bandwidth lossage. Latency QoS is disabled by default
126 * and can be set through /sys/fs/cgroup/io.cost.qos.
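 *
 * A hypothetical io.cost.qos line (the device number and values are made
 * up; the key=value keys match ioc_qos_write()/ioc_qos_prfill() below):
 *
 *   8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00 wlat=20000 min=50.00 max=150.00
 *
 * i.e. treat the device as saturated if the 95th percentile read latency
 * exceeds 10ms or the 95th percentile write latency exceeds 20ms, and keep
 * vrate within [50%, 150%].
 *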
128 * 2-3. Work Conservation
130 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
131 * periodically while B is sending out enough parallel IOs to saturate the
132 * device on its own. Let's say A's usage amounts to 100ms worth of IO
133 * cost per second, i.e., 10% of the device capacity. The naive
134 * distribution of half and half would lead to 60% utilization of the
135 * device, a significant reduction in the total amount of work done
136 * compared to free-for-all competition. This is too high a cost to pay
139 * To conserve the total amount of work done, we keep track of how much
140 * each active cgroup is actually using and yield part of its weight if
141 * there are other cgroups which can make use of it. In the above case,
142 * A's weight will be lowered so that it hovers above the actual usage and
143 * B would be able to use the rest.
145 * As we don't want to penalize a cgroup for donating its weight, the
146 * surplus weight adjustment factors in a margin and has an immediate
147 * snapback mechanism in case the cgroup needs more IO vtime for itself.
149 * Note that adjusting down surplus weights has the same effects as
150 * accelerating vtime for other cgroups and work conservation can also be
151 * implemented by adjusting vrate dynamically. However, squaring who can
152 * donate and should take back how much requires hweight propagations
153 * anyway, making it easier to implement and understand as a separate
158 * Instead of debugfs or other clumsy monitoring mechanisms, this
159 * controller uses a drgn based monitoring script -
160 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
161 * https://github.com/osandov/drgn. The output looks like the following.
163 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
164 * active weight hweight% inflt% del_ms usages%
165 * test/a * 50/ 50 33.33/ 33.33 27.65 0*041 033:033:033
166 * test/b * 100/ 100 66.67/ 66.67 17.56 0*000 066:079:077
168 * - per : Timer period
169 * - cur_per : Internal wall and device vtime clock
170 * - vrate : Device virtual time rate against wall clock
171 * - weight : Surplus-adjusted and configured weights
172 * - hweight : Surplus-adjusted and configured hierarchical weights
173 * - inflt : The percentage of in-flight IO cost at the end of last period
174 * - del_ms : Deferred issuer delay induction level and duration
175 * - usages : Usage history
178 #include <linux/kernel.h>
179 #include <linux/module.h>
180 #include <linux/timer.h>
181 #include <linux/time64.h>
182 #include <linux/parser.h>
183 #include <linux/sched/signal.h>
184 #include <linux/blk-cgroup.h>
185 #include "blk-rq-qos.h"
186 #include "blk-stat.h"
189 #ifdef CONFIG_TRACEPOINTS
191 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
192 #define TRACE_IOCG_PATH_LEN 1024
193 static DEFINE_SPINLOCK(trace_iocg_path_lock);
194 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
196 #define TRACE_IOCG_PATH(type, iocg, ...) \
198 unsigned long flags; \
199 if (trace_iocost_##type##_enabled()) { \
200 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
201 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
202 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
203 trace_iocost_##type(iocg, trace_iocg_path, \
205 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
209 #else /* CONFIG_TRACEPOINTS */
210 #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
211 #endif /* CONFIG_TRACEPOINTS */
216 /* timer period is calculated from latency requirements, bound it */
217 MIN_PERIOD = USEC_PER_MSEC,
218 MAX_PERIOD = USEC_PER_SEC,
221 * A cgroup's vtime can run 50% behind the device vtime, which
222 * serves as its IO credit buffer. Surplus weight adjustment is
223 * immediately canceled if the vtime margin runs below 10%.
226 INUSE_MARGIN_PCT = 10,
228 /* Have some play in waitq timer operations */
229 WAITQ_TIMER_MARGIN_PCT = 5,
232 * vtime can wrap well within a reasonable uptime when vrate is
233 * consistently raised. Don't trust recorded cgroup vtime if the
234 * period counter indicates that it's older than 5mins.
236 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
239 * Remember the past three non-zero usages and use the max for
240 * surplus calculation. Three slots guarantee that we remember one
241 * full period usage from the last active stretch even after
242 * partial deactivation and re-activation periods. Don't start
243 * giving away weight before collecting two data points to prevent
244 * hweight adjustments based on one partial activation period.
247 MIN_VALID_USAGES = 2,
249 /* 1/64k is granular enough and can easily be handled w/ u32 */
250 HWEIGHT_WHOLE = 1 << 16,
253 * As vtime is used to calculate the cost of each IO, it needs to
254 * be fairly high precision. For example, it should be able to
255 * represent the cost of a single page worth of discard with
256 * sufficient accuracy. At the same time, it should be able to
257 * represent reasonably long enough durations to be useful and
258 * convenient during operation.
260 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
261 * granularity and days of wrap-around time even at extreme vrates.
263 VTIME_PER_SEC_SHIFT = 37,
264 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
265 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
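/*
 * For reference (not used by the code): 2^37 / 10^6 =~ 137,439, so one
 * microsecond of device time corresponds to roughly 137k vtime units and
 * a single vtime unit is about 1/137 of a nanosecond.
 */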
267 /* bound vrate adjustments within two orders of magnitude */
268 VRATE_MIN_PPM = 10000, /* 1% */
269 VRATE_MAX_PPM = 100000000, /* 10000% */
271 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
272 VRATE_CLAMP_ADJ_PCT = 4,
274 /* if IOs end up waiting for requests, issue less */
275 RQ_WAIT_BUSY_PCT = 5,
277 /* unbusy hysteresis */
280 /* don't let cmds which take a very long time pin lagging for too long */
281 MAX_LAGGING_PERIODS = 10,
284 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
285 * donate the surplus.
287 SURPLUS_SCALE_PCT = 125, /* * 125% */
288 SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
289 SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
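/*
 * Worked example of the rule above, in percent of HWEIGHT_WHOLE: with
 * usage at 20% and hweight_inuse at 50%, the scaled usage is
 * 20% * 1.25 + 2% = 27%; as 27% + 3% is still below 50%, the iocg is
 * considered to have surplus and its inuse weight may be lowered toward
 * 27% (see surplus_adjusted_hweight_inuse()).
 */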
291 /* switch iff the conditions are met for longer than this */
292 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
295 * Count IO size in 4k pages. The 12-bit shift helps keep the
296 * size-proportional components of the cost calculation within a
297 * similar number of digits as the per-IO cost components.
300 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
301 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
303 /* if further apart than 16M, consider it randio for the linear model */
304 LCOEF_RANDIO_PAGES = 4096,
313 /* io.cost.qos controls including per-dev enable of the whole controller */
320 /* io.cost.qos params */
331 /* io.cost.model controls */
338 /* builtin linear cost model coefficients */
370 u32 qos[NR_QOS_PARAMS];
371 u64 i_lcoefs[NR_I_LCOEFS];
372 u64 lcoefs[NR_LCOEFS];
373 u32 too_fast_vrate_pct;
374 u32 too_slow_vrate_pct;
384 struct ioc_pcpu_stat {
385 struct ioc_missed missed[2];
397 struct ioc_params params;
404 struct timer_list timer;
405 struct list_head active_iocgs; /* active cgroups */
406 struct ioc_pcpu_stat __percpu *pcpu_stat;
408 enum ioc_running running;
409 atomic64_t vtime_rate;
411 seqcount_t period_seqcount;
412 u32 period_at; /* wallclock starttime */
413 u64 period_at_vtime; /* vtime starttime */
415 atomic64_t cur_period; /* inc'd each period */
416 int busy_level; /* saturation history */
418 u64 inuse_margin_vtime;
419 bool weights_updated;
420 atomic_t hweight_gen; /* for lazy hweights */
422 u64 autop_too_fast_at;
423 u64 autop_too_slow_at;
425 bool user_qos_params:1;
426 bool user_cost_model:1;
429 /* per device-cgroup pair */
431 struct blkg_policy_data pd;
435 * An iocg can get its weight from two sources - an explicit
436 * per-device-cgroup configuration or the default weight of the
437 * cgroup. `cfg_weight` is the explicit per-device-cgroup
438 * configuration. `weight` is the effective considering both
441 * When an idle cgroup becomes active its `active` goes from 0 to
442 * `weight`. `inuse` is the surplus adjusted active weight.
443 * `active` and `inuse` are used to calculate `hweight_active` and
446 * `last_inuse` remembers `inuse` while an iocg is idle to persist
447 * surplus adjustments.
455 sector_t cursor; /* to detect randio */
458 * `vtime` is this iocg's vtime cursor which progresses as IOs are
459 * issued. If lagging behind device vtime, the delta represents
460 * the currently available IO budget. If running ahead, the
463 * `vtime_done` is the same but progressed on completion rather
464 * than issue. The delta behind `vtime` represents the cost of
465 * currently in-flight IOs.
467 * `last_vtime` is used to remember `vtime` at the end of the last
468 * period to calculate utilization.
471 atomic64_t done_vtime;
475 * The period this iocg was last active in. Used for deactivation
476 * and invalidating `vtime`.
478 atomic64_t active_period;
479 struct list_head active_list;
481 /* see __propagate_active_weight() and current_hweight() for details */
482 u64 child_active_sum;
489 struct wait_queue_head waitq;
490 struct hrtimer waitq_timer;
491 struct hrtimer delay_timer;
493 /* usage is recorded as fractions of HWEIGHT_WHOLE */
495 u32 usages[NR_USAGE_SLOTS];
497 /* this iocg's depth in the hierarchy and ancestors including self */
499 struct ioc_gq *ancestors[];
504 struct blkcg_policy_data cpd;
505 unsigned int dfl_weight;
516 struct wait_queue_entry wait;
522 struct iocg_wake_ctx {
528 static const struct ioc_params autop[] = {
531 [QOS_RLAT] = 50000, /* 50ms */
533 [QOS_MIN] = VRATE_MIN_PPM,
534 [QOS_MAX] = VRATE_MAX_PPM,
537 [I_LCOEF_RBPS] = 174019176,
538 [I_LCOEF_RSEQIOPS] = 41708,
539 [I_LCOEF_RRANDIOPS] = 370,
540 [I_LCOEF_WBPS] = 178075866,
541 [I_LCOEF_WSEQIOPS] = 42705,
542 [I_LCOEF_WRANDIOPS] = 378,
547 [QOS_RLAT] = 25000, /* 25ms */
549 [QOS_MIN] = VRATE_MIN_PPM,
550 [QOS_MAX] = VRATE_MAX_PPM,
553 [I_LCOEF_RBPS] = 245855193,
554 [I_LCOEF_RSEQIOPS] = 61575,
555 [I_LCOEF_RRANDIOPS] = 6946,
556 [I_LCOEF_WBPS] = 141365009,
557 [I_LCOEF_WSEQIOPS] = 33716,
558 [I_LCOEF_WRANDIOPS] = 26796,
563 [QOS_RLAT] = 25000, /* 25ms */
565 [QOS_MIN] = VRATE_MIN_PPM,
566 [QOS_MAX] = VRATE_MAX_PPM,
569 [I_LCOEF_RBPS] = 488636629,
570 [I_LCOEF_RSEQIOPS] = 8932,
571 [I_LCOEF_RRANDIOPS] = 8518,
572 [I_LCOEF_WBPS] = 427891549,
573 [I_LCOEF_WSEQIOPS] = 28755,
574 [I_LCOEF_WRANDIOPS] = 21940,
576 .too_fast_vrate_pct = 500,
580 [QOS_RLAT] = 5000, /* 5ms */
582 [QOS_MIN] = VRATE_MIN_PPM,
583 [QOS_MAX] = VRATE_MAX_PPM,
586 [I_LCOEF_RBPS] = 3102524156LLU,
587 [I_LCOEF_RSEQIOPS] = 724816,
588 [I_LCOEF_RRANDIOPS] = 778122,
589 [I_LCOEF_WBPS] = 1742780862LLU,
590 [I_LCOEF_WSEQIOPS] = 425702,
591 [I_LCOEF_WRANDIOPS] = 443193,
593 .too_slow_vrate_pct = 10,
598 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
599 * vtime credit shortage and down on device saturation.
601 static u32 vrate_adj_pct[] =
603 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
604 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
605 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
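/*
 * For example, a sustained busy_level of +40 indexes an 8% step and the
 * vrate is multiplied by 92% that period, while a busy_level of -40 would
 * multiply it by 108% (see the busy_level handling in ioc_timer_fn()).
 */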
607 static struct blkcg_policy blkcg_policy_iocost;
609 /* accessors and helpers */
610 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
612 return container_of(rqos, struct ioc, rqos);
615 static struct ioc *q_to_ioc(struct request_queue *q)
617 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
620 static const char *q_name(struct request_queue *q)
622 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
623 return kobject_name(q->kobj.parent);
628 static const char __maybe_unused *ioc_name(struct ioc *ioc)
630 return q_name(ioc->rqos.q);
633 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
635 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
638 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
640 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
643 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
645 return pd_to_blkg(&iocg->pd);
648 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
650 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
651 struct ioc_cgrp, cpd);
655 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
656 * weight, the more expensive each IO.
658 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
660 return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
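/*
 * For example, with hw_inuse at HWEIGHT_WHOLE / 4 (a 25% hierarchical
 * share), an absolute cost of 1000 is charged as 4000 - the smaller the
 * share, the faster a cgroup's vtime budget is consumed.
 */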
663 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
665 bio->bi_iocost_cost = cost;
666 atomic64_add(cost, &iocg->vtime);
669 #define CREATE_TRACE_POINTS
670 #include <trace/events/iocost.h>
672 /* latency QoS params changed, update period_us and all the dependent params */
673 static void ioc_refresh_period_us(struct ioc *ioc)
675 u32 ppm, lat, multi, period_us;
677 lockdep_assert_held(&ioc->lock);
679 /* pick the higher latency target */
680 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
681 ppm = ioc->params.qos[QOS_RPPM];
682 lat = ioc->params.qos[QOS_RLAT];
684 ppm = ioc->params.qos[QOS_WPPM];
685 lat = ioc->params.qos[QOS_WLAT];
689 * We want the period to be long enough to contain a healthy number
690 * of IOs while short enough for granular control. Define it as a
691 * multiple of the latency target. Ideally, the multiplier should
692 * be scaled according to the percentile so that it would nominally
693 * contain a certain number of requests. Let's be simpler and
694 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
697 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
700 period_us = multi * lat;
701 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
703 /* calculate dependent params */
704 ioc->period_us = period_us;
705 ioc->margin_us = period_us * MARGIN_PCT / 100;
706 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
707 period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
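/*
 * For example, with a read target of rpct=95.00 (ppm = 950,000) and
 * rlat=25000us picked as the higher latency target above,
 * multi = max((1,000,000 - 950,000) / 50,000, 2) = 2 and
 * period_us = 2 * 25,000 = 50,000, i.e. a 50ms period after clamping to
 * [MIN_PERIOD, MAX_PERIOD].
 */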
710 static int ioc_autop_idx(struct ioc *ioc)
712 int idx = ioc->autop_idx;
713 const struct ioc_params *p = &autop[idx];
718 if (!blk_queue_nonrot(ioc->rqos.q))
721 /* handle SATA SSDs w/ broken NCQ */
722 if (blk_queue_depth(ioc->rqos.q) == 1)
723 return AUTOP_SSD_QD1;
725 /* use one of the normal ssd sets */
726 if (idx < AUTOP_SSD_DFL)
727 return AUTOP_SSD_DFL;
729 /* if user is overriding anything, maintain what was there */
730 if (ioc->user_qos_params || ioc->user_cost_model)
733 /* step up/down based on the vrate */
734 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
736 now_ns = ktime_get_ns();
738 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
739 if (!ioc->autop_too_fast_at)
740 ioc->autop_too_fast_at = now_ns;
741 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
744 ioc->autop_too_fast_at = 0;
747 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
748 if (!ioc->autop_too_slow_at)
749 ioc->autop_too_slow_at = now_ns;
750 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
753 ioc->autop_too_slow_at = 0;
760 * Take the following as input
762 * @bps maximum sequential throughput
763 * @seqiops maximum sequential 4k iops
764 * @randiops maximum random 4k iops
766 * and calculate the linear model cost coefficients.
768 * *@page per-page cost 1s / (@bps / 4096)
769 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
770 * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
772 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
773 u64 *page, u64 *seqio, u64 *randio)
777 *page = *seqio = *randio = 0;
780 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
781 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
784 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
790 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
796 static void ioc_refresh_lcoefs(struct ioc *ioc)
798 u64 *u = ioc->params.i_lcoefs;
799 u64 *c = ioc->params.lcoefs;
801 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
802 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
803 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
804 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
807 static bool ioc_refresh_params(struct ioc *ioc, bool force)
809 const struct ioc_params *p;
812 lockdep_assert_held(&ioc->lock);
814 idx = ioc_autop_idx(ioc);
817 if (idx == ioc->autop_idx && !force)
820 if (idx != ioc->autop_idx)
821 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
823 ioc->autop_idx = idx;
824 ioc->autop_too_fast_at = 0;
825 ioc->autop_too_slow_at = 0;
827 if (!ioc->user_qos_params)
828 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
829 if (!ioc->user_cost_model)
830 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
832 ioc_refresh_period_us(ioc);
833 ioc_refresh_lcoefs(ioc);
835 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
836 VTIME_PER_USEC, MILLION);
837 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
838 VTIME_PER_USEC, MILLION);
843 /* take a snapshot of the current [v]time and vrate */
844 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
848 now->now_ns = ktime_get();
849 now->now = ktime_to_us(now->now_ns);
850 now->vrate = atomic64_read(&ioc->vtime_rate);
853 * The current vtime is
855 * vtime at period start + (wallclock time since the start) * vrate
857 * As a consistent snapshot of `period_at_vtime` and `period_at` is
858 * needed, they're seqcount protected.
861 seq = read_seqcount_begin(&ioc->period_seqcount);
862 now->vnow = ioc->period_at_vtime +
863 (now->now - ioc->period_at) * now->vrate;
864 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
867 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
869 lockdep_assert_held(&ioc->lock);
870 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
872 write_seqcount_begin(&ioc->period_seqcount);
873 ioc->period_at = now->now;
874 ioc->period_at_vtime = now->vnow;
875 write_seqcount_end(&ioc->period_seqcount);
877 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
878 add_timer(&ioc->timer);
882 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
883 * weight sums and propagate upwards accordingly.
885 static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
887 struct ioc *ioc = iocg->ioc;
890 lockdep_assert_held(&ioc->lock);
892 inuse = min(active, inuse);
894 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
895 struct ioc_gq *parent = iocg->ancestors[lvl];
896 struct ioc_gq *child = iocg->ancestors[lvl + 1];
897 u32 parent_active = 0, parent_inuse = 0;
899 /* update the level sums */
900 parent->child_active_sum += (s32)(active - child->active);
901 parent->child_inuse_sum += (s32)(inuse - child->inuse);
902 /* apply the updates */
903 child->active = active;
904 child->inuse = inuse;
907 * The delta between inuse and active sums indicates that
908 * that much weight is being given away. Parent's inuse
909 * and active should reflect the ratio.
911 if (parent->child_active_sum) {
912 parent_active = parent->weight;
913 parent_inuse = DIV64_U64_ROUND_UP(
914 parent_active * parent->child_inuse_sum,
915 parent->child_active_sum);
918 /* do we need to keep walking up? */
919 if (parent_active == parent->active &&
920 parent_inuse == parent->inuse)
923 active = parent_active;
924 inuse = parent_inuse;
927 ioc->weights_updated = true;
930 static void commit_active_weights(struct ioc *ioc)
932 lockdep_assert_held(&ioc->lock);
934 if (ioc->weights_updated) {
935 /* paired with rmb in current_hweight(), see there */
937 atomic_inc(&ioc->hweight_gen);
938 ioc->weights_updated = false;
942 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
944 __propagate_active_weight(iocg, active, inuse);
945 commit_active_weights(iocg->ioc);
948 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
950 struct ioc *ioc = iocg->ioc;
955 /* hot path - if uptodate, use cached */
956 ioc_gen = atomic_read(&ioc->hweight_gen);
957 if (ioc_gen == iocg->hweight_gen)
961 * Paired with wmb in commit_active_weights(). If we saw the
962 * updated hweight_gen, all the weight updates from
963 * __propagate_active_weight() are visible too.
965 * We can race with weight updates during calculation and get it
966 * wrong. However, hweight_gen would have changed and a future
967 * reader will recalculate and we're guaranteed to discard the
972 hwa = hwi = HWEIGHT_WHOLE;
973 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
974 struct ioc_gq *parent = iocg->ancestors[lvl];
975 struct ioc_gq *child = iocg->ancestors[lvl + 1];
976 u32 active_sum = READ_ONCE(parent->child_active_sum);
977 u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
978 u32 active = READ_ONCE(child->active);
979 u32 inuse = READ_ONCE(child->inuse);
981 /* we can race with deactivations and either may read as zero */
982 if (!active_sum || !inuse_sum)
985 active_sum = max(active, active_sum);
986 hwa = hwa * active / active_sum; /* max 16bits * 10000 */
988 inuse_sum = max(inuse, inuse_sum);
989 hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
992 iocg->hweight_active = max_t(u32, hwa, 1);
993 iocg->hweight_inuse = max_t(u32, hwi, 1);
994 iocg->hweight_gen = ioc_gen;
997 *hw_activep = iocg->hweight_active;
999 *hw_inusep = iocg->hweight_inuse;
1002 static void weight_updated(struct ioc_gq *iocg)
1004 struct ioc *ioc = iocg->ioc;
1005 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1006 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1009 lockdep_assert_held(&ioc->lock);
1011 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1012 if (weight != iocg->weight && iocg->active)
1013 propagate_active_weight(iocg, weight,
1014 DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1015 iocg->weight = weight;
1018 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1020 struct ioc *ioc = iocg->ioc;
1021 u64 last_period, cur_period, max_period_delta;
1022 u64 vtime, vmargin, vmin;
1026 * If we seem to be already active, just update the stamp to tell the
1027 * timer that we're still active. We don't mind occasional races.
1029 if (!list_empty(&iocg->active_list)) {
1031 cur_period = atomic64_read(&ioc->cur_period);
1032 if (atomic64_read(&iocg->active_period) != cur_period)
1033 atomic64_set(&iocg->active_period, cur_period);
1037 /* racy check on internal node IOs, treat as root level IOs */
1038 if (iocg->child_active_sum)
1041 spin_lock_irq(&ioc->lock);
1046 cur_period = atomic64_read(&ioc->cur_period);
1047 last_period = atomic64_read(&iocg->active_period);
1048 atomic64_set(&iocg->active_period, cur_period);
1050 /* already activated or breaking leaf-only constraint? */
1051 for (i = iocg->level; i > 0; i--)
1052 if (!list_empty(&iocg->active_list))
1054 if (iocg->child_active_sum)
1058 * vtime may wrap when vrate is raised substantially due to
1059 * underestimated IO costs. Look at the period and ignore its
1060 * vtime if the iocg has been idle for too long. Also, cap the
1061 * budget it can start with to the margin.
1063 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1064 vtime = atomic64_read(&iocg->vtime);
1065 vmargin = ioc->margin_us * now->vrate;
1066 vmin = now->vnow - vmargin;
1068 if (last_period + max_period_delta < cur_period ||
1069 time_before64(vtime, vmin)) {
1070 atomic64_add(vmin - vtime, &iocg->vtime);
1071 atomic64_add(vmin - vtime, &iocg->done_vtime);
1076 * Activate, propagate weight and start period timer if not
1077 * running. Reset hweight_gen to avoid accidental match from
1080 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1081 list_add(&iocg->active_list, &ioc->active_iocgs);
1082 propagate_active_weight(iocg, iocg->weight,
1083 iocg->last_inuse ?: iocg->weight);
1085 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1086 last_period, cur_period, vtime);
1088 iocg->last_vtime = vtime;
1090 if (ioc->running == IOC_IDLE) {
1091 ioc->running = IOC_RUNNING;
1092 ioc_start_period(ioc, now);
1095 spin_unlock_irq(&ioc->lock);
1099 spin_unlock_irq(&ioc->lock);
1103 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1104 int flags, void *key)
1106 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1107 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1108 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1110 ctx->vbudget -= cost;
1112 if (ctx->vbudget < 0)
1115 iocg_commit_bio(ctx->iocg, wait->bio, cost);
1118 * autoremove_wake_function() removes the wait entry only when it
1119 * actually changed the task state. We want the wait always
1120 * removed. Remove explicitly and use default_wake_function().
1122 list_del_init(&wq_entry->entry);
1123 wait->committed = true;
1125 default_wake_function(wq_entry, mode, flags, key);
1129 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1131 struct ioc *ioc = iocg->ioc;
1132 struct iocg_wake_ctx ctx = { .iocg = iocg };
1133 u64 margin_ns = (u64)(ioc->period_us *
1134 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1135 u64 vshortage, expires, oexpires;
1137 lockdep_assert_held(&iocg->waitq.lock);
1140 * Wake up the ones which are due and see how much vtime we'll need
1143 current_hweight(iocg, NULL, &ctx.hw_inuse);
1144 ctx.vbudget = now->vnow - atomic64_read(&iocg->vtime);
1145 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1146 if (!waitqueue_active(&iocg->waitq))
1148 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1151 /* determine next wakeup, add a quarter margin to guarantee chunking */
1152 vshortage = -ctx.vbudget;
1153 expires = now->now_ns +
1154 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1155 expires += margin_ns / 4;
1157 /* if already active and close enough, don't bother */
1158 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1159 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1160 abs(oexpires - expires) <= margin_ns / 4)
1163 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1164 margin_ns / 4, HRTIMER_MODE_ABS);
1167 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1169 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1171 unsigned long flags;
1173 ioc_now(iocg->ioc, &now);
1175 spin_lock_irqsave(&iocg->waitq.lock, flags);
1176 iocg_kick_waitq(iocg, &now);
1177 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1179 return HRTIMER_NORESTART;
1182 static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1184 struct ioc *ioc = iocg->ioc;
1185 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1186 u64 vtime = atomic64_read(&iocg->vtime);
1187 u64 vmargin = ioc->margin_us * now->vrate;
1188 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1189 u64 expires, oexpires;
1191 /* clear or maintain depending on the overage */
1192 if (time_before_eq64(vtime, now->vnow)) {
1193 blkcg_clear_delay(blkg);
1196 if (!atomic_read(&blkg->use_delay) &&
1197 time_before_eq64(vtime, now->vnow + vmargin))
1202 u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1204 blkcg_add_delay(blkg, now->now_ns, cost_ns);
1206 blkcg_use_delay(blkg);
1208 expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1209 now->vrate) * NSEC_PER_USEC;
1211 /* if already active and close enough, don't bother */
1212 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1213 if (hrtimer_is_queued(&iocg->delay_timer) &&
1214 abs(oexpires - expires) <= margin_ns / 4)
1217 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1218 margin_ns / 4, HRTIMER_MODE_ABS);
1221 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1223 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1226 ioc_now(iocg->ioc, &now);
1227 iocg_kick_delay(iocg, &now, 0);
1229 return HRTIMER_NORESTART;
1232 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1234 u32 nr_met[2] = { };
1235 u32 nr_missed[2] = { };
1239 for_each_online_cpu(cpu) {
1240 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1241 u64 this_rq_wait_ns;
1243 for (rw = READ; rw <= WRITE; rw++) {
1244 u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1245 u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1247 nr_met[rw] += this_met - stat->missed[rw].last_met;
1248 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1249 stat->missed[rw].last_met = this_met;
1250 stat->missed[rw].last_missed = this_missed;
1253 this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1254 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1255 stat->last_rq_wait_ns = this_rq_wait_ns;
1258 for (rw = READ; rw <= WRITE; rw++) {
1259 if (nr_met[rw] + nr_missed[rw])
1261 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1262 nr_met[rw] + nr_missed[rw]);
1264 missed_ppm_ar[rw] = 0;
1267 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1268 ioc->period_us * NSEC_PER_USEC);
1271 /* was iocg idle this period? */
1272 static bool iocg_is_idle(struct ioc_gq *iocg)
1274 struct ioc *ioc = iocg->ioc;
1276 /* did something get issued this period? */
1277 if (atomic64_read(&iocg->active_period) ==
1278 atomic64_read(&ioc->cur_period))
1281 /* is something in flight? */
1282 if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime))
1288 /* returns usage with margin added if surplus is large enough */
1289 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1292 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1293 usage += SURPLUS_SCALE_ABS;
1295 /* don't bother if the surplus is too small */
1296 if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1302 static void ioc_timer_fn(struct timer_list *timer)
1304 struct ioc *ioc = container_of(timer, struct ioc, timer);
1305 struct ioc_gq *iocg, *tiocg;
1307 int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1308 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1309 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1310 u32 missed_ppm[2], rq_wait_pct;
1314 /* how were the latencies during the period? */
1315 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1317 /* take care of active iocgs */
1318 spin_lock_irq(&ioc->lock);
1322 period_vtime = now.vnow - ioc->period_at_vtime;
1323 if (WARN_ON_ONCE(!period_vtime)) {
1324 spin_unlock_irq(&ioc->lock);
1329 * Waiters determine the sleep durations based on the vrate they
1330 * saw at the time of sleep. If vrate has increased, some waiters
1331 * could be sleeping for too long. Wake up tardy waiters which
1332 * should have woken up in the last period and expire idle iocgs.
1334 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1335 if (!waitqueue_active(&iocg->waitq) && !iocg_is_idle(iocg))
1338 spin_lock(&iocg->waitq.lock);
1340 if (waitqueue_active(&iocg->waitq)) {
1341 /* might be oversleeping vtime / hweight changes, kick */
1342 iocg_kick_waitq(iocg, &now);
1343 iocg_kick_delay(iocg, &now, 0);
1344 } else if (iocg_is_idle(iocg)) {
1345 /* no waiter and idle, deactivate */
1346 iocg->last_inuse = iocg->inuse;
1347 __propagate_active_weight(iocg, 0, 0);
1348 list_del_init(&iocg->active_list);
1351 spin_unlock(&iocg->waitq.lock);
1353 commit_active_weights(ioc);
1355 /* calc usages and see whether some weights need to be moved around */
1356 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1357 u64 vdone, vtime, vusage, vmargin, vmin;
1358 u32 hw_active, hw_inuse, usage;
1361 * Collect unused and wind vtime closer to vnow to prevent
1362 * iocgs from accumulating a large amount of budget.
1364 vdone = atomic64_read(&iocg->done_vtime);
1365 vtime = atomic64_read(&iocg->vtime);
1366 current_hweight(iocg, &hw_active, &hw_inuse);
1369 * Latency QoS detection doesn't account for IOs which are
1370 * in-flight for longer than a period. Detect them by
1371 * comparing vdone against period start. If lagging behind
1372 * IOs from past periods, don't increase vrate.
1374 if (!atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1375 time_after64(vtime, vdone) &&
1376 time_after64(vtime, now.vnow -
1377 MAX_LAGGING_PERIODS * period_vtime) &&
1378 time_before64(vdone, now.vnow - period_vtime))
1381 if (waitqueue_active(&iocg->waitq))
1382 vusage = now.vnow - iocg->last_vtime;
1383 else if (time_before64(iocg->last_vtime, vtime))
1384 vusage = vtime - iocg->last_vtime;
1388 iocg->last_vtime += vusage;
1390 * Factor in in-flight vtime into vusage to avoid
1391 * high-latency completions appearing as idle. This should
1392 * be done after the above ->last_vtime adjustment.
1394 vusage = max(vusage, vtime - vdone);
1396 /* calculate hweight based usage ratio and record */
1398 usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1400 iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1401 iocg->usages[iocg->usage_idx] = usage;
1406 /* see whether there's surplus vtime */
1407 vmargin = ioc->margin_us * now.vrate;
1408 vmin = now.vnow - vmargin;
1410 iocg->has_surplus = false;
1412 if (!waitqueue_active(&iocg->waitq) &&
1413 time_before64(vtime, vmin)) {
1414 u64 delta = vmin - vtime;
1416 /* throw away surplus vtime */
1417 atomic64_add(delta, &iocg->vtime);
1418 atomic64_add(delta, &iocg->done_vtime);
1419 iocg->last_vtime += delta;
1420 /* if usage is sufficiently low, maybe it can donate */
1421 if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1422 iocg->has_surplus = true;
1425 } else if (hw_inuse < hw_active) {
1426 u32 new_hwi, new_inuse;
1428 /* was donating but might need to take back some */
1429 if (waitqueue_active(&iocg->waitq)) {
1430 new_hwi = hw_active;
1432 new_hwi = max(hw_inuse,
1433 usage * SURPLUS_SCALE_PCT / 100 +
1437 new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1439 new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1441 if (new_inuse > iocg->inuse) {
1442 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1443 iocg->inuse, new_inuse,
1445 __propagate_active_weight(iocg, iocg->weight,
1449 /* genuinely out of vtime */
1454 if (!nr_shortages || !nr_surpluses)
1455 goto skip_surplus_transfers;
1457 /* there are both shortages and surpluses, transfer surpluses */
1458 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1459 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1462 if (!iocg->has_surplus)
1465 /* base the decision on max historical usage */
1466 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1467 if (iocg->usages[i]) {
1468 usage = max(usage, iocg->usages[i]);
1472 if (nr_valid < MIN_VALID_USAGES)
1475 current_hweight(iocg, &hw_active, &hw_inuse);
1476 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1480 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1482 if (new_inuse < iocg->inuse) {
1483 TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1484 iocg->inuse, new_inuse,
1486 __propagate_active_weight(iocg, iocg->weight, new_inuse);
1489 skip_surplus_transfers:
1490 commit_active_weights(ioc);
1493 * If q is getting clogged or we're missing too much, we're issuing
1494 * too much IO and should lower vtime rate. If we're not missing
1495 * and experiencing shortages but not surpluses, we're too stingy
1496 * and should increase vtime rate.
1498 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1499 missed_ppm[READ] > ppm_rthr ||
1500 missed_ppm[WRITE] > ppm_wthr) {
1501 ioc->busy_level = max(ioc->busy_level, 0);
1503 } else if (nr_lagging) {
1504 ioc->busy_level = max(ioc->busy_level, 0);
1505 } else if (nr_shortages && !nr_surpluses &&
1506 rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1507 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1508 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1509 ioc->busy_level = min(ioc->busy_level, 0);
1512 ioc->busy_level = 0;
1515 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1517 if (ioc->busy_level) {
1518 u64 vrate = atomic64_read(&ioc->vtime_rate);
1519 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1521 /* rq_wait signal is always reliable, ignore user vrate_min */
1522 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1523 vrate_min = VRATE_MIN;
1526 * If vrate is out of bounds, apply clamp gradually as the
1527 * bounds can change abruptly. Otherwise, apply busy_level
1530 if (vrate < vrate_min) {
1531 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1533 vrate = min(vrate, vrate_min);
1534 } else if (vrate > vrate_max) {
1535 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1537 vrate = max(vrate, vrate_max);
1539 int idx = min_t(int, abs(ioc->busy_level),
1540 ARRAY_SIZE(vrate_adj_pct) - 1);
1541 u32 adj_pct = vrate_adj_pct[idx];
1543 if (ioc->busy_level > 0)
1544 adj_pct = 100 - adj_pct;
1546 adj_pct = 100 + adj_pct;
1548 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1549 vrate_min, vrate_max);
1552 trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct,
1553 nr_lagging, nr_shortages,
1556 atomic64_set(&ioc->vtime_rate, vrate);
1557 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1558 ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1561 ioc_refresh_params(ioc, false);
1564 * This period is done. Move onto the next one. If nothing's
1565 * going on with the device, stop the timer.
1567 atomic64_inc(&ioc->cur_period);
1569 if (ioc->running != IOC_STOP) {
1570 if (!list_empty(&ioc->active_iocgs)) {
1571 ioc_start_period(ioc, &now);
1573 ioc->busy_level = 0;
1574 ioc->running = IOC_IDLE;
1578 spin_unlock_irq(&ioc->lock);
1581 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1582 bool is_merge, u64 *costp)
1584 struct ioc *ioc = iocg->ioc;
1585 u64 coef_seqio, coef_randio, coef_page;
1586 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1590 switch (bio_op(bio)) {
1592 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
1593 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
1594 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
1597 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
1598 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
1599 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
1606 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1607 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1611 if (seek_pages > LCOEF_RANDIO_PAGES) {
1612 cost += coef_randio;
1617 cost += pages * coef_page;
1622 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1626 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1630 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1632 struct blkcg_gq *blkg = bio->bi_blkg;
1633 struct ioc *ioc = rqos_to_ioc(rqos);
1634 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1636 struct iocg_wait wait;
1637 u32 hw_active, hw_inuse;
1638 u64 abs_cost, cost, vtime;
1640 /* bypass IOs if disabled or for root cgroup */
1641 if (!ioc->enabled || !iocg->level)
1644 /* always activate so that even 0 cost IOs get protected to some level */
1645 if (!iocg_activate(iocg, &now))
1648 /* calculate the absolute vtime cost */
1649 abs_cost = calc_vtime_cost(bio, iocg, false);
1653 iocg->cursor = bio_end_sector(bio);
1655 vtime = atomic64_read(&iocg->vtime);
1656 current_hweight(iocg, &hw_active, &hw_inuse);
1658 if (hw_inuse < hw_active &&
1659 time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1660 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1661 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1662 spin_lock_irq(&ioc->lock);
1663 propagate_active_weight(iocg, iocg->weight, iocg->weight);
1664 spin_unlock_irq(&ioc->lock);
1665 current_hweight(iocg, &hw_active, &hw_inuse);
1668 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1671 * If no one's waiting and within budget, issue right away. The
1672 * tests are racy but the races aren't systemic - we only miss once
1673 * in a while which is fine.
1675 if (!waitqueue_active(&iocg->waitq) &&
1676 time_before_eq64(vtime + cost, now.vnow)) {
1677 iocg_commit_bio(iocg, bio, cost);
1681 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1682 iocg_commit_bio(iocg, bio, cost);
1683 iocg_kick_delay(iocg, &now, cost);
1688 * Append self to the waitq and schedule the wakeup timer if we're
1689 * the first waiter. The timer duration is calculated based on the
1690 * current vrate. vtime and hweight changes can make it too short
1691 * or too long. Each wait entry records the absolute cost it's
1692 * waiting for to allow re-evaluation using a custom wait entry.
1694 * If too short, the timer simply reschedules itself. If too long,
1695 * the period timer will notice and trigger wakeups.
1697 * All waiters are on iocg->waitq and the wait states are
1698 * synchronized using waitq.lock.
1700 spin_lock_irq(&iocg->waitq.lock);
1703 * We activated above but w/o any synchronization. Deactivation is
1704 * synchronized with waitq.lock and we won't get deactivated as
1705 * long as we're waiting, so we're good if we're activated here.
1706 * In the unlikely case that we are deactivated, just issue the IO.
1708 if (unlikely(list_empty(&iocg->active_list))) {
1709 spin_unlock_irq(&iocg->waitq.lock);
1710 iocg_commit_bio(iocg, bio, cost);
1714 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1715 wait.wait.private = current;
1717 wait.abs_cost = abs_cost;
1718 wait.committed = false; /* will be set true by waker */
1720 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1721 iocg_kick_waitq(iocg, &now);
1723 spin_unlock_irq(&iocg->waitq.lock);
1726 set_current_state(TASK_UNINTERRUPTIBLE);
1732 /* waker already committed us, proceed */
1733 finish_wait(&iocg->waitq, &wait.wait);
1736 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1739 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1740 sector_t bio_end = bio_end_sector(bio);
1744 /* add iff the existing request has cost assigned */
1745 if (!rq->bio || !rq->bio->bi_iocost_cost)
1748 abs_cost = calc_vtime_cost(bio, iocg, true);
1752 /* update cursor if backmerging into the request at the cursor */
1753 if (blk_rq_pos(rq) < bio_end &&
1754 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1755 iocg->cursor = bio_end;
1757 current_hweight(iocg, NULL, &hw_inuse);
1758 cost = div64_u64(abs_cost * HWEIGHT_WHOLE, hw_inuse);
1759 bio->bi_iocost_cost = cost;
1761 atomic64_add(cost, &iocg->vtime);
1764 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1766 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1768 if (iocg && bio->bi_iocost_cost)
1769 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1772 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1774 struct ioc *ioc = rqos_to_ioc(rqos);
1775 u64 on_q_ns, rq_wait_ns;
1778 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1781 switch (req_op(rq) & REQ_OP_MASK) {
1794 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1795 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1797 if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1798 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1800 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1802 this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1805 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1807 struct ioc *ioc = rqos_to_ioc(rqos);
1809 spin_lock_irq(&ioc->lock);
1810 ioc_refresh_params(ioc, false);
1811 spin_unlock_irq(&ioc->lock);
1814 static void ioc_rqos_exit(struct rq_qos *rqos)
1816 struct ioc *ioc = rqos_to_ioc(rqos);
1818 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1820 spin_lock_irq(&ioc->lock);
1821 ioc->running = IOC_STOP;
1822 spin_unlock_irq(&ioc->lock);
1824 del_timer_sync(&ioc->timer);
1825 free_percpu(ioc->pcpu_stat);
1829 static struct rq_qos_ops ioc_rqos_ops = {
1830 .throttle = ioc_rqos_throttle,
1831 .merge = ioc_rqos_merge,
1832 .done_bio = ioc_rqos_done_bio,
1833 .done = ioc_rqos_done,
1834 .queue_depth_changed = ioc_rqos_queue_depth_changed,
1835 .exit = ioc_rqos_exit,
1838 static int blk_iocost_init(struct request_queue *q)
1841 struct rq_qos *rqos;
1844 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1848 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1849 if (!ioc->pcpu_stat) {
1855 rqos->id = RQ_QOS_COST;
1856 rqos->ops = &ioc_rqos_ops;
1859 spin_lock_init(&ioc->lock);
1860 timer_setup(&ioc->timer, ioc_timer_fn, 0);
1861 INIT_LIST_HEAD(&ioc->active_iocgs);
1863 ioc->running = IOC_IDLE;
1864 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1865 seqcount_init(&ioc->period_seqcount);
1866 ioc->period_at = ktime_to_us(ktime_get());
1867 atomic64_set(&ioc->cur_period, 0);
1868 atomic_set(&ioc->hweight_gen, 0);
1870 spin_lock_irq(&ioc->lock);
1871 ioc->autop_idx = AUTOP_INVALID;
1872 ioc_refresh_params(ioc, true);
1873 spin_unlock_irq(&ioc->lock);
1875 rq_qos_add(q, rqos);
1876 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1878 rq_qos_del(q, rqos);
1879 free_percpu(ioc->pcpu_stat);
1886 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
1888 struct ioc_cgrp *iocc;
1890 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
if (!iocc)
return NULL;
1891 iocc->dfl_weight = CGROUP_WEIGHT_DFL;
1896 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
1898 kfree(container_of(cpd, struct ioc_cgrp, cpd));
1901 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
1902 struct blkcg *blkcg)
1904 int levels = blkcg->css.cgroup->level + 1;
1905 struct ioc_gq *iocg;
1907 iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
1915 static void ioc_pd_init(struct blkg_policy_data *pd)
1917 struct ioc_gq *iocg = pd_to_iocg(pd);
1918 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
1919 struct ioc *ioc = q_to_ioc(blkg->q);
1921 struct blkcg_gq *tblkg;
1922 unsigned long flags;
1927 atomic64_set(&iocg->vtime, now.vnow);
1928 atomic64_set(&iocg->done_vtime, now.vnow);
1929 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
1930 INIT_LIST_HEAD(&iocg->active_list);
1931 iocg->hweight_active = HWEIGHT_WHOLE;
1932 iocg->hweight_inuse = HWEIGHT_WHOLE;
1934 init_waitqueue_head(&iocg->waitq);
1935 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1936 iocg->waitq_timer.function = iocg_waitq_timer_fn;
1937 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1938 iocg->delay_timer.function = iocg_delay_timer_fn;
1940 iocg->level = blkg->blkcg->css.cgroup->level;
1942 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
1943 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
1944 iocg->ancestors[tiocg->level] = tiocg;
1947 spin_lock_irqsave(&ioc->lock, flags);
1948 weight_updated(iocg);
1949 spin_unlock_irqrestore(&ioc->lock, flags);
1952 static void ioc_pd_free(struct blkg_policy_data *pd)
1954 struct ioc_gq *iocg = pd_to_iocg(pd);
1955 struct ioc *ioc = iocg->ioc;
1958 hrtimer_cancel(&iocg->waitq_timer);
1959 hrtimer_cancel(&iocg->delay_timer);
1961 spin_lock(&ioc->lock);
1962 if (!list_empty(&iocg->active_list)) {
1963 propagate_active_weight(iocg, 0, 0);
1964 list_del_init(&iocg->active_list);
1966 spin_unlock(&ioc->lock);
1971 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
1974 const char *dname = blkg_dev_name(pd->blkg);
1975 struct ioc_gq *iocg = pd_to_iocg(pd);
1977 if (dname && iocg->cfg_weight)
1978 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
1983 static int ioc_weight_show(struct seq_file *sf, void *v)
1985 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1986 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
1988 seq_printf(sf, "default %u\n", iocc->dfl_weight);
1989 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
1990 &blkcg_policy_iocost, seq_cft(sf)->private, false);
1994 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
1995 size_t nbytes, loff_t off)
1997 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1998 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
1999 struct blkg_conf_ctx ctx;
2000 struct ioc_gq *iocg;
2004 if (!strchr(buf, ':')) {
2005 struct blkcg_gq *blkg;
2007 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2010 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2013 spin_lock(&blkcg->lock);
2014 iocc->dfl_weight = v;
2015 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2016 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2019 spin_lock_irq(&iocg->ioc->lock);
2020 weight_updated(iocg);
2021 spin_unlock_irq(&iocg->ioc->lock);
2024 spin_unlock(&blkcg->lock);
2029 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2033 iocg = blkg_to_iocg(ctx.blkg);
2035 if (!strncmp(ctx.body, "default", 7)) {
2038 if (!sscanf(ctx.body, "%u", &v))
2040 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2044 spin_lock_irq(&iocg->ioc->lock);
2045 iocg->cfg_weight = v;
2046 weight_updated(iocg);
2047 spin_unlock_irq(&iocg->ioc->lock);
2049 blkg_conf_finish(&ctx);
2053 blkg_conf_finish(&ctx);
2057 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2060 const char *dname = blkg_dev_name(pd->blkg);
2061 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2066 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2067 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2068 ioc->params.qos[QOS_RPPM] / 10000,
2069 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2070 ioc->params.qos[QOS_RLAT],
2071 ioc->params.qos[QOS_WPPM] / 10000,
2072 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2073 ioc->params.qos[QOS_WLAT],
2074 ioc->params.qos[QOS_MIN] / 10000,
2075 ioc->params.qos[QOS_MIN] % 10000 / 100,
2076 ioc->params.qos[QOS_MAX] / 10000,
2077 ioc->params.qos[QOS_MAX] % 10000 / 100);
2081 static int ioc_qos_show(struct seq_file *sf, void *v)
2083 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2085 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2086 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2090 static const match_table_t qos_ctrl_tokens = {
2091 { QOS_ENABLE, "enable=%u" },
2092 { QOS_CTRL, "ctrl=%s" },
2093 { NR_QOS_CTRL_PARAMS, NULL },
2096 static const match_table_t qos_tokens = {
2097 { QOS_RPPM, "rpct=%s" },
2098 { QOS_RLAT, "rlat=%u" },
2099 { QOS_WPPM, "wpct=%s" },
2100 { QOS_WLAT, "wlat=%u" },
2101 { QOS_MIN, "min=%s" },
2102 { QOS_MAX, "max=%s" },
2103 { NR_QOS_PARAMS, NULL },
2106 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2107 size_t nbytes, loff_t off)
2109 struct gendisk *disk;
2111 u32 qos[NR_QOS_PARAMS];
2116 disk = blkcg_conf_get_disk(&input);
2118 return PTR_ERR(disk);
2120 ioc = q_to_ioc(disk->queue);
2122 ret = blk_iocost_init(disk->queue);
2125 ioc = q_to_ioc(disk->queue);
2128 spin_lock_irq(&ioc->lock);
2129 memcpy(qos, ioc->params.qos, sizeof(qos));
2130 enable = ioc->enabled;
2131 user = ioc->user_qos_params;
2132 spin_unlock_irq(&ioc->lock);
2134 while ((p = strsep(&input, " \t\n"))) {
2135 substring_t args[MAX_OPT_ARGS];
2143 switch (match_token(p, qos_ctrl_tokens, args)) {
2145 match_u64(&args[0], &v);
2149 match_strlcpy(buf, &args[0], sizeof(buf));
2150 if (!strcmp(buf, "auto"))
2152 else if (!strcmp(buf, "user"))
2159 tok = match_token(p, qos_tokens, args);
2163 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2166 if (cgroup_parse_float(buf, 2, &v))
2168 if (v < 0 || v > 10000)
2174 if (match_u64(&args[0], &v))
2180 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2183 if (cgroup_parse_float(buf, 2, &v))
2187 qos[tok] = clamp_t(s64, v * 100,
2188 VRATE_MIN_PPM, VRATE_MAX_PPM);
2196 if (qos[QOS_MIN] > qos[QOS_MAX])
2199 spin_lock_irq(&ioc->lock);
2202 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2203 ioc->enabled = true;
2205 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2206 ioc->enabled = false;
2210 memcpy(ioc->params.qos, qos, sizeof(qos));
2211 ioc->user_qos_params = true;
2213 ioc->user_qos_params = false;
2216 ioc_refresh_params(ioc, true);
2217 spin_unlock_irq(&ioc->lock);
2219 put_disk_and_module(disk);
2224 put_disk_and_module(disk);
2228 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2229 struct blkg_policy_data *pd, int off)
2231 const char *dname = blkg_dev_name(pd->blkg);
2232 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2233 u64 *u = ioc->params.i_lcoefs;
2238 seq_printf(sf, "%s ctrl=%s model=linear "
2239 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2240 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2241 dname, ioc->user_cost_model ? "user" : "auto",
2242 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2243 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
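/*
 * A hypothetical line produced by the format above (device number made
 * up, coefficients borrowed from one of the builtin autop tables):
 *
 *   8:0 ctrl=auto model=linear rbps=488636629 rseqiops=8932 rrandiops=8518
 *     wbps=427891549 wseqiops=28755 wrandiops=21940
 */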
2247 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2249 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2251 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2252 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2256 static const match_table_t cost_ctrl_tokens = {
2257 { COST_CTRL, "ctrl=%s" },
2258 { COST_MODEL, "model=%s" },
2259 { NR_COST_CTRL_PARAMS, NULL },
2262 static const match_table_t i_lcoef_tokens = {
2263 { I_LCOEF_RBPS, "rbps=%u" },
2264 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2265 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2266 { I_LCOEF_WBPS, "wbps=%u" },
2267 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2268 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2269 { NR_I_LCOEFS, NULL },
2272 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2273 size_t nbytes, loff_t off)
2275 struct gendisk *disk;
2282 disk = blkcg_conf_get_disk(&input);
2284 return PTR_ERR(disk);
2286 ioc = q_to_ioc(disk->queue);
2288 ret = blk_iocost_init(disk->queue);
2291 ioc = q_to_ioc(disk->queue);
2294 spin_lock_irq(&ioc->lock);
2295 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2296 user = ioc->user_cost_model;
2297 spin_unlock_irq(&ioc->lock);
2299 while ((p = strsep(&input, " \t\n"))) {
2300 substring_t args[MAX_OPT_ARGS];
2308 switch (match_token(p, cost_ctrl_tokens, args)) {
2310 match_strlcpy(buf, &args[0], sizeof(buf));
2311 if (!strcmp(buf, "auto"))
2313 else if (!strcmp(buf, "user"))
2319 match_strlcpy(buf, &args[0], sizeof(buf));
2320 if (strcmp(buf, "linear"))
2325 tok = match_token(p, i_lcoef_tokens, args);
2326 if (tok == NR_I_LCOEFS)
2328 if (match_u64(&args[0], &v))
2334 spin_lock_irq(&ioc->lock);
2336 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2337 ioc->user_cost_model = true;
2339 ioc->user_cost_model = false;
2341 ioc_refresh_params(ioc, true);
2342 spin_unlock_irq(&ioc->lock);
2344 put_disk_and_module(disk);
2350 put_disk_and_module(disk);
2354 static struct cftype ioc_files[] = {
2357 .flags = CFTYPE_NOT_ON_ROOT,
2358 .seq_show = ioc_weight_show,
2359 .write = ioc_weight_write,
2363 .flags = CFTYPE_ONLY_ON_ROOT,
2364 .seq_show = ioc_qos_show,
2365 .write = ioc_qos_write,
2368 .name = "cost.model",
2369 .flags = CFTYPE_ONLY_ON_ROOT,
2370 .seq_show = ioc_cost_model_show,
2371 .write = ioc_cost_model_write,
2376 static struct blkcg_policy blkcg_policy_iocost = {
2377 .dfl_cftypes = ioc_files,
2378 .cpd_alloc_fn = ioc_cpd_alloc,
2379 .cpd_free_fn = ioc_cpd_free,
2380 .pd_alloc_fn = ioc_pd_alloc,
2381 .pd_init_fn = ioc_pd_init,
2382 .pd_free_fn = ioc_pd_free,
2385 static int __init ioc_init(void)
2387 return blkcg_policy_register(&blkcg_policy_iocost);
2390 static void __exit ioc_exit(void)
2392 return blkcg_policy_unregister(&blkcg_policy_iocost);
2395 module_init(ioc_init);
2396 module_exit(ioc_exit);