drm/etnaviv: Implement mmap as GEM object function
[linux-2.6-microblaze.git] / drivers / md / dm-ps-historical-service-time.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Historical Service Time
4  *
5  *  Keeps a time-weighted exponential moving average of the historical
6  *  service time. Estimates future service time based on the historical
7  *  service time and the number of outstanding requests.
8  *
9  *  Marks paths stale if they have not finished within hst *
10  *  num_paths. If a path is stale and unused, we will send a single
11  *  request to probe in case the path has improved. This situation
12  *  generally arises if the path is so much worse than others that it
13  *  will never have the best estimated service time, or if the entire
14  *  multipath device is unused. If a path is stale and in use, limit the
15  *  number of requests it can receive with the assumption that the path
16  *  has become degraded.
17  *
18  *  To avoid repeatedly calculating exponents for time weighting, times
19  *  are split into HST_WEIGHT_COUNT buckets each (1 << HST_BUCKET_SHIFT)
20  *  ns, and the weighting is pre-calculated.
21  *
22  */
23
24 #include "dm.h"
25 #include "dm-path-selector.h"
26
27 #include <linux/blkdev.h>
28 #include <linux/slab.h>
29 #include <linux/module.h>
30
31
32 #define DM_MSG_PREFIX   "multipath historical-service-time"
33 #define HST_MIN_IO 1
34 #define HST_VERSION "0.1.1"
35
36 #define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
37 #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
38 #define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
39 #define HST_FIXED_95 972
40 #define HST_MAX_INFLIGHT HST_FIXED_1
41 #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
42 #define HST_WEIGHT_COUNT 64ULL
43
/* Per-device selector state shared by all paths of one multipath device. */
struct selector {
        struct list_head valid_paths;   /* usable paths; best-pick is rotated to the tail */
        struct list_head failed_paths;  /* paths currently out of service */
        int valid_count;                /* number of entries on valid_paths */
        spinlock_t lock;                /* protects both lists and valid_count */

        /* Pre-computed EMA weights, one per time bucket (see hst_set_weights()). */
        unsigned int weights[HST_WEIGHT_COUNT];
        /* Minimum ratio for two service-time estimates to count as different. */
        unsigned int threshold_multiplier;
};
53
/* Per-path state; lock protects the timing/estimate fields below. */
struct path_info {
        struct list_head list;          /* entry on selector valid/failed list */
        struct dm_path *path;
        unsigned int repeat_count;      /* I/Os before switching paths (table arg) */

        spinlock_t lock;

        u64 historical_service_time; /* Fixed point */

        u64 stale_after;        /* time after which this estimate is considered stale */
        u64 last_finish;        /* completion stamp of the most recent request */

        u64 outstanding;        /* in-flight request count */
};
68
69 /**
70  * fixed_power - compute: x^n, in O(log n) time
71  *
72  * @x:         base of the power
73  * @frac_bits: fractional bits of @x
74  * @n:         power to raise @x to.
75  *
76  * By exploiting the relation between the definition of the natural power
77  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
78  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
79  * (where: n_i \elem {0, 1}, the binary vector representing n),
80  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
81  * of course trivially computable in O(log_2 n), the length of our binary
82  * vector.
83  *
84  * (see: kernel/sched/loadavg.c)
85  */
86 static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
87 {
88         unsigned long result = 1UL << frac_bits;
89
90         if (n) {
91                 for (;;) {
92                         if (n & 1) {
93                                 result *= x;
94                                 result += 1UL << (frac_bits - 1);
95                                 result >>= frac_bits;
96                         }
97                         n >>= 1;
98                         if (!n)
99                                 break;
100                         x *= x;
101                         x += 1UL << (frac_bits - 1);
102                         x >>= frac_bits;
103                 }
104         }
105
106         return result;
107 }
108
109 /*
110  * Calculate the next value of an exponential moving average
111  * a_1 = a_0 * e + a * (1 - e)
112  *
113  * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
114  * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
115  * @weight: [0, HST_FIXED_1]
116  *
117  * Note:
118  *   To account for multiple periods in the same calculation,
119  *   a_n = a_0 * e^n + a * (1 - e^n),
120  *   so call fixed_ema(last, next, pow(weight, N))
121  */
122 static u64 fixed_ema(u64 last, u64 next, u64 weight)
123 {
124         last *= weight;
125         last += next * (HST_FIXED_1 - weight);
126         last += 1ULL << (HST_FIXED_SHIFT - 1);
127         return last >> HST_FIXED_SHIFT;
128 }
129
130 static struct selector *alloc_selector(void)
131 {
132         struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
133
134         if (s) {
135                 INIT_LIST_HEAD(&s->valid_paths);
136                 INIT_LIST_HEAD(&s->failed_paths);
137                 spin_lock_init(&s->lock);
138                 s->valid_count = 0;
139         }
140
141         return s;
142 }
143
144 /*
145  * Get the weight for a given time span.
146  */
147 static u64 hst_weight(struct path_selector *ps, u64 delta)
148 {
149         struct selector *s = ps->context;
150         int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
151                            HST_WEIGHT_COUNT - 1);
152
153         return s->weights[bucket];
154 }
155
156 /*
157  * Set up the weights array.
158  *
159  * weights[len-1] = 0
160  * weights[n] = base ^ (n + 1)
161  */
162 static void hst_set_weights(struct path_selector *ps, unsigned int base)
163 {
164         struct selector *s = ps->context;
165         int i;
166
167         if (base >= HST_FIXED_1)
168                 return;
169
170         for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
171                 s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
172         s->weights[HST_WEIGHT_COUNT - 1] = 0;
173 }
174
175 static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
176 {
177         struct selector *s;
178         unsigned int base_weight = HST_FIXED_95;
179         unsigned int threshold_multiplier = 0;
180         char dummy;
181
182         /*
183          * Arguments: [<base_weight> [<threshold_multiplier>]]
184          *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
185          *                  value of 0 will completely ignore any history.
186          *                  If not given, default (HST_FIXED_95) is used.
187          *   <threshold_multiplier>: Minimum threshold multiplier for paths to
188          *                  be considered different. That is, a path is
189          *                  considered different iff (p1 > N * p2) where p1
190          *                  is the path with higher service time. A threshold
191          *                  of 1 or 0 has no effect. Defaults to 0.
192          */
193         if (argc > 2)
194                 return -EINVAL;
195
196         if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
197              base_weight >= HST_FIXED_1)) {
198                 return -EINVAL;
199         }
200
201         if (argc > 1 && (sscanf(argv[1], "%u%c",
202                                 &threshold_multiplier, &dummy) != 1)) {
203                 return -EINVAL;
204         }
205
206         s = alloc_selector();
207         if (!s)
208                 return -ENOMEM;
209
210         ps->context = s;
211
212         hst_set_weights(ps, base_weight);
213         s->threshold_multiplier = threshold_multiplier;
214         return 0;
215 }
216
217 static void free_paths(struct list_head *paths)
218 {
219         struct path_info *pi, *next;
220
221         list_for_each_entry_safe(pi, next, paths, list) {
222                 list_del(&pi->list);
223                 kfree(pi);
224         }
225 }
226
227 static void hst_destroy(struct path_selector *ps)
228 {
229         struct selector *s = ps->context;
230
231         free_paths(&s->valid_paths);
232         free_paths(&s->failed_paths);
233         kfree(s);
234         ps->context = NULL;
235 }
236
237 static int hst_status(struct path_selector *ps, struct dm_path *path,
238                      status_type_t type, char *result, unsigned int maxlen)
239 {
240         unsigned int sz = 0;
241         struct path_info *pi;
242
243         if (!path) {
244                 struct selector *s = ps->context;
245
246                 DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
247         } else {
248                 pi = path->pscontext;
249
250                 switch (type) {
251                 case STATUSTYPE_INFO:
252                         DMEMIT("%llu %llu %llu ", pi->historical_service_time,
253                                pi->outstanding, pi->stale_after);
254                         break;
255                 case STATUSTYPE_TABLE:
256                         DMEMIT("0 ");
257                         break;
258                 }
259         }
260
261         return sz;
262 }
263
264 static int hst_add_path(struct path_selector *ps, struct dm_path *path,
265                        int argc, char **argv, char **error)
266 {
267         struct selector *s = ps->context;
268         struct path_info *pi;
269         unsigned int repeat_count = HST_MIN_IO;
270         char dummy;
271         unsigned long flags;
272
273         /*
274          * Arguments: [<repeat_count>]
275          *   <repeat_count>: The number of I/Os before switching path.
276          *                   If not given, default (HST_MIN_IO) is used.
277          */
278         if (argc > 1) {
279                 *error = "historical-service-time ps: incorrect number of arguments";
280                 return -EINVAL;
281         }
282
283         if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
284                 *error = "historical-service-time ps: invalid repeat count";
285                 return -EINVAL;
286         }
287
288         /* allocate the path */
289         pi = kmalloc(sizeof(*pi), GFP_KERNEL);
290         if (!pi) {
291                 *error = "historical-service-time ps: Error allocating path context";
292                 return -ENOMEM;
293         }
294
295         pi->path = path;
296         pi->repeat_count = repeat_count;
297
298         pi->historical_service_time = HST_FIXED_1;
299
300         spin_lock_init(&pi->lock);
301         pi->outstanding = 0;
302
303         pi->stale_after = 0;
304         pi->last_finish = 0;
305
306         path->pscontext = pi;
307
308         spin_lock_irqsave(&s->lock, flags);
309         list_add_tail(&pi->list, &s->valid_paths);
310         s->valid_count++;
311         spin_unlock_irqrestore(&s->lock, flags);
312
313         return 0;
314 }
315
316 static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
317 {
318         struct selector *s = ps->context;
319         struct path_info *pi = path->pscontext;
320         unsigned long flags;
321
322         spin_lock_irqsave(&s->lock, flags);
323         list_move(&pi->list, &s->failed_paths);
324         s->valid_count--;
325         spin_unlock_irqrestore(&s->lock, flags);
326 }
327
328 static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
329 {
330         struct selector *s = ps->context;
331         struct path_info *pi = path->pscontext;
332         unsigned long flags;
333
334         spin_lock_irqsave(&s->lock, flags);
335         list_move_tail(&pi->list, &s->valid_paths);
336         s->valid_count++;
337         spin_unlock_irqrestore(&s->lock, flags);
338
339         return 0;
340 }
341
/*
 * Snapshot a path's comparison inputs under its lock so hst_compare()
 * sees a mutually consistent (estimate, outstanding, stale_after) triple.
 */
static void hst_fill_compare(struct path_info *pi, u64 *hst,
                             u64 *out, u64 *stale)
{
        unsigned long flags;

        spin_lock_irqsave(&pi->lock, flags);
        *hst = pi->historical_service_time;
        *out = pi->outstanding;
        *stale = pi->stale_after;
        spin_unlock_irqrestore(&pi->lock, flags);
}
353
354 /*
355  * Compare the estimated service time of 2 paths, pi1 and pi2,
356  * for the incoming I/O.
357  *
358  * Returns:
359  * < 0 : pi1 is better
360  * 0   : no difference between pi1 and pi2
361  * > 0 : pi2 is better
362  *
363  */
364 static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
365                              u64 time_now, struct path_selector *ps)
366 {
367         struct selector *s = ps->context;
368         u64 hst1, hst2;
369         long long out1, out2, stale1, stale2;
370         int pi2_better, over_threshold;
371
372         hst_fill_compare(pi1, &hst1, &out1, &stale1);
373         hst_fill_compare(pi2, &hst2, &out2, &stale2);
374
375         /* Check here if estimated latency for two paths are too similar.
376          * If this is the case, we skip extra calculation and just compare
377          * outstanding requests. In this case, any unloaded paths will
378          * be preferred.
379          */
380         if (hst1 > hst2)
381                 over_threshold = hst1 > (s->threshold_multiplier * hst2);
382         else
383                 over_threshold = hst2 > (s->threshold_multiplier * hst1);
384
385         if (!over_threshold)
386                 return out1 - out2;
387
388         /*
389          * If an unloaded path is stale, choose it. If both paths are unloaded,
390          * choose path that is the most stale.
391          * (If one path is loaded, choose the other)
392          */
393         if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
394             (!out1 && !out2))
395                 return (!out2 * stale1) - (!out1 * stale2);
396
397         /* Compare estimated service time. If outstanding is the same, we
398          * don't need to multiply
399          */
400         if (out1 == out2) {
401                 pi2_better = hst1 > hst2;
402         } else {
403                 /* Potential overflow with out >= 1024 */
404                 if (unlikely(out1 >= HST_MAX_INFLIGHT ||
405                              out2 >= HST_MAX_INFLIGHT)) {
406                         /* If over 1023 in-flights, we may overflow if hst
407                          * is at max. (With this shift we still overflow at
408                          * 1048576 in-flights, which is high enough).
409                          */
410                         hst1 >>= HST_FIXED_SHIFT;
411                         hst2 >>= HST_FIXED_SHIFT;
412                 }
413                 pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
414         }
415
416         /* In the case that the 'winner' is stale, limit to equal usage. */
417         if (pi2_better) {
418                 if (stale2 < time_now)
419                         return out1 - out2;
420                 return 1;
421         }
422         if (stale1 < time_now)
423                 return out1 - out2;
424         return -1;
425 }
426
427 static struct dm_path *hst_select_path(struct path_selector *ps,
428                                        size_t nr_bytes)
429 {
430         struct selector *s = ps->context;
431         struct path_info *pi = NULL, *best = NULL;
432         u64 time_now = sched_clock();
433         struct dm_path *ret = NULL;
434         unsigned long flags;
435
436         spin_lock_irqsave(&s->lock, flags);
437         if (list_empty(&s->valid_paths))
438                 goto out;
439
440         list_for_each_entry(pi, &s->valid_paths, list) {
441                 if (!best || (hst_compare(pi, best, time_now, ps) < 0))
442                         best = pi;
443         }
444
445         if (!best)
446                 goto out;
447
448         /* Move last used path to end (least preferred in case of ties) */
449         list_move_tail(&best->list, &s->valid_paths);
450
451         ret = best->path;
452
453 out:
454         spin_unlock_irqrestore(&s->lock, flags);
455         return ret;
456 }
457
458 static int hst_start_io(struct path_selector *ps, struct dm_path *path,
459                         size_t nr_bytes)
460 {
461         struct path_info *pi = path->pscontext;
462         unsigned long flags;
463
464         spin_lock_irqsave(&pi->lock, flags);
465         pi->outstanding++;
466         spin_unlock_irqrestore(&pi->lock, flags);
467
468         return 0;
469 }
470
471 static u64 path_service_time(struct path_info *pi, u64 start_time)
472 {
473         u64 sched_now = ktime_get_ns();
474
475         /* if a previous disk request has finished after this IO was
476          * sent to the hardware, pretend the submission happened
477          * serially.
478          */
479         if (time_after64(pi->last_finish, start_time))
480                 start_time = pi->last_finish;
481
482         pi->last_finish = sched_now;
483         if (time_before64(sched_now, start_time))
484                 return 0;
485
486         return sched_now - start_time;
487 }
488
/*
 * Completion hook: fold this request's service time into the path's
 * moving average and refresh its staleness deadline.
 */
static int hst_end_io(struct path_selector *ps, struct dm_path *path,
                      size_t nr_bytes, u64 start_time)
{
        struct path_info *pi = path->pscontext;
        struct selector *s = ps->context;
        unsigned long flags;
        u64 st;

        spin_lock_irqsave(&pi->lock, flags);

        st = path_service_time(pi, start_time);
        pi->outstanding--;
        /* EMA update in HST_FIXED_SHIFT fixed point: the sample is
         * clamped to HST_FIXED_MAX so later fixed-point math stays in
         * range, and weighted by how long the request took (hst_weight
         * gives older/longer samples less history). */
        pi->historical_service_time =
                fixed_ema(pi->historical_service_time,
                          min(st * HST_FIXED_1, HST_FIXED_MAX),
                          hst_weight(ps, st));

        /*
         * On request end, mark path as fresh. If a path hasn't
         * finished any requests within the fresh period, the estimated
         * service time is considered too optimistic and we limit the
         * maximum requests on that path.
         */
        /* NOTE(review): s->valid_count is read here without s->lock —
         * presumably a tolerated benign race; confirm. */
        pi->stale_after = pi->last_finish +
                (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));

        spin_unlock_irqrestore(&pi->lock, flags);

        return 0;
}
519
/* dm-mpath registration record for the "historical-service-time" selector. */
static struct path_selector_type hst_ps = {
        .name           = "historical-service-time",
        .module         = THIS_MODULE,
        .table_args     = 1,    /* optional per-path <repeat_count> */
        .info_args      = 3,    /* hst, outstanding, stale_after */
        .create         = hst_create,
        .destroy        = hst_destroy,
        .status         = hst_status,
        .add_path       = hst_add_path,
        .fail_path      = hst_fail_path,
        .reinstate_path = hst_reinstate_path,
        .select_path    = hst_select_path,
        .start_io       = hst_start_io,
        .end_io         = hst_end_io,
};
535
536 static int __init dm_hst_init(void)
537 {
538         int r = dm_register_path_selector(&hst_ps);
539
540         if (r < 0)
541                 DMERR("register failed %d", r);
542
543         DMINFO("version " HST_VERSION " loaded");
544
545         return r;
546 }
547
548 static void __exit dm_hst_exit(void)
549 {
550         int r = dm_unregister_path_selector(&hst_ps);
551
552         if (r < 0)
553                 DMERR("unregister failed %d", r);
554 }
555
/* Module entry/exit hooks and metadata. */
module_init(dm_hst_init);
module_exit(dm_hst_exit);

MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
MODULE_LICENSE("GPL");