dm: rename multipath path selector source files to have "dm-ps" prefix
author	Mike Snitzer <snitzer@redhat.com>
Tue, 10 Nov 2020 18:41:53 +0000 (13:41 -0500)
committer	Mike Snitzer <snitzer@redhat.com>
Fri, 4 Dec 2020 23:04:35 +0000 (18:04 -0500)
The additional "dm-ps" prefix helps clarify that these source files
implement path selectors.

Required updating the Makefile to still build the modules _without_ the
"dm-ps" prefix, preserving dm-multipath's ability to autoload path
selector modules by name. While at it, cleaned up some DM whitespace in
the Makefile.
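
For example, with this change the historical-service-time selector is
wired up as follows (excerpted from the Makefile hunk below):

  dm-historical-service-time-y  += dm-ps-historical-service-time.o
  obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o

The object is built from the renamed source file, but the resulting
module keeps its old name, dm-historical-service-time.ko, which is the
name dm-multipath requests when it autoloads a path selector.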

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
drivers/md/Makefile
drivers/md/dm-historical-service-time.c [deleted file]
drivers/md/dm-io-affinity.c [deleted file]
drivers/md/dm-ps-historical-service-time.c [new file with mode: 0644]
drivers/md/dm-ps-io-affinity.c [new file with mode: 0644]
drivers/md/dm-ps-queue-length.c [new file with mode: 0644]
drivers/md/dm-ps-round-robin.c [new file with mode: 0644]
drivers/md/dm-ps-service-time.c [new file with mode: 0644]
drivers/md/dm-queue-length.c [deleted file]
drivers/md/dm-round-robin.c [deleted file]
drivers/md/dm-service-time.c [deleted file]

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 4f95f33..ef7ddc2 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -7,23 +7,28 @@ dm-mod-y      += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
                   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \
                   dm-rq.o
 dm-multipath-y += dm-path-selector.o dm-mpath.o
+dm-historical-service-time-y += dm-ps-historical-service-time.o
+dm-io-affinity-y += dm-ps-io-affinity.o
+dm-queue-length-y += dm-ps-queue-length.o
+dm-round-robin-y += dm-ps-round-robin.o
+dm-service-time-y += dm-ps-service-time.o
 dm-snapshot-y  += dm-snap.o dm-exception-store.o dm-snap-transient.o \
                    dm-snap-persistent.o
 dm-mirror-y    += dm-raid1.o
-dm-log-userspace-y \
-               += dm-log-userspace-base.o dm-log-userspace-transfer.o
+dm-log-userspace-y += dm-log-userspace-base.o dm-log-userspace-transfer.o
 dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o
 dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
 dm-cache-y     += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
                    dm-cache-background-tracker.o
-dm-cache-smq-y   += dm-cache-policy-smq.o
+dm-cache-smq-y += dm-cache-policy-smq.o
 dm-ebs-y       += dm-ebs-target.o
 dm-era-y       += dm-era-target.o
 dm-clone-y     += dm-clone-target.o dm-clone-metadata.o
 dm-verity-y    += dm-verity-target.o
+dm-zoned-y     += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
+
 md-mod-y       += md.o md-bitmap.o
 raid456-y      += raid5.o raid5-cache.o raid5-ppl.o
-dm-zoned-y     += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
 linear-y       += md-linear.o
 multipath-y    += md-multipath.o
 faulty-y       += md-faulty.o
@@ -62,12 +67,12 @@ obj-$(CONFIG_DM_MULTIPATH_HST)      += dm-historical-service-time.o
 obj-$(CONFIG_DM_MULTIPATH_IOA) += dm-io-affinity.o
 obj-$(CONFIG_DM_SWITCH)                += dm-switch.o
 obj-$(CONFIG_DM_SNAPSHOT)      += dm-snapshot.o
-obj-$(CONFIG_DM_PERSISTENT_DATA)       += persistent-data/
+obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
 obj-$(CONFIG_DM_MIRROR)                += dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)          += dm-zero.o
-obj-$(CONFIG_DM_RAID)  += dm-raid.o
-obj-$(CONFIG_DM_THIN_PROVISIONING)     += dm-thin-pool.o
+obj-$(CONFIG_DM_RAID)          += dm-raid.o
+obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)                += dm-verity.o
 obj-$(CONFIG_DM_CACHE)         += dm-cache.o
 obj-$(CONFIG_DM_CACHE_SMQ)     += dm-cache-smq.o
diff --git a/drivers/md/dm-historical-service-time.c b/drivers/md/dm-historical-service-time.c
deleted file mode 100644
index 186f91e..0000000
--- a/drivers/md/dm-historical-service-time.c
+++ /dev/null
@@ -1,561 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Historical Service Time
- *
- *  Keeps a time-weighted exponential moving average of the historical
- *  service time. Estimates future service time based on the historical
- *  service time and the number of outstanding requests.
- *
- *  Marks paths stale if they have not finished within hst *
- *  num_paths. If a path is stale and unused, we will send a single
- *  request to probe in case the path has improved. This situation
- *  generally arises if the path is so much worse than others that it
- *  will never have the best estimated service time, or if the entire
- *  multipath device is unused. If a path is stale and in use, limit the
- *  number of requests it can receive with the assumption that the path
- *  has become degraded.
- *
- *  To avoid repeatedly calculating exponents for time weighting, times
- *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
- *  ns, and the weighting is pre-calculated.
- *
- */
-
-#include "dm.h"
-#include "dm-path-selector.h"
-
-#include <linux/blkdev.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-
-
-#define DM_MSG_PREFIX  "multipath historical-service-time"
-#define HST_MIN_IO 1
-#define HST_VERSION "0.1.1"
-
-#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
-#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
-#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
-#define HST_FIXED_95 972
-#define HST_MAX_INFLIGHT HST_FIXED_1
-#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
-#define HST_WEIGHT_COUNT 64ULL
-
-struct selector {
-       struct list_head valid_paths;
-       struct list_head failed_paths;
-       int valid_count;
-       spinlock_t lock;
-
-       unsigned int weights[HST_WEIGHT_COUNT];
-       unsigned int threshold_multiplier;
-};
-
-struct path_info {
-       struct list_head list;
-       struct dm_path *path;
-       unsigned int repeat_count;
-
-       spinlock_t lock;
-
-       u64 historical_service_time; /* Fixed point */
-
-       u64 stale_after;
-       u64 last_finish;
-
-       u64 outstanding;
-};
-
-/**
- * fixed_power - compute: x^n, in O(log n) time
- *
- * @x:         base of the power
- * @frac_bits: fractional bits of @x
- * @n:         power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- *
- * (see: kernel/sched/loadavg.c)
- */
-static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
-{
-       unsigned long result = 1UL << frac_bits;
-
-       if (n) {
-               for (;;) {
-                       if (n & 1) {
-                               result *= x;
-                               result += 1UL << (frac_bits - 1);
-                               result >>= frac_bits;
-                       }
-                       n >>= 1;
-                       if (!n)
-                               break;
-                       x *= x;
-                       x += 1UL << (frac_bits - 1);
-                       x >>= frac_bits;
-               }
-       }
-
-       return result;
-}
-
-/*
- * Calculate the next value of an exponential moving average
- * a_1 = a_0 * e + a * (1 - e)
- *
- * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
- * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
- * @weight: [0, HST_FIXED_1]
- *
- * Note:
- *   To account for multiple periods in the same calculation,
- *   a_n = a_0 * e^n + a * (1 - e^n),
- *   so call fixed_ema(last, next, pow(weight, N))
- */
-static u64 fixed_ema(u64 last, u64 next, u64 weight)
-{
-       last *= weight;
-       last += next * (HST_FIXED_1 - weight);
-       last += 1ULL << (HST_FIXED_SHIFT - 1);
-       return last >> HST_FIXED_SHIFT;
-}
-
-static struct selector *alloc_selector(void)
-{
-       struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
-       if (s) {
-               INIT_LIST_HEAD(&s->valid_paths);
-               INIT_LIST_HEAD(&s->failed_paths);
-               spin_lock_init(&s->lock);
-               s->valid_count = 0;
-       }
-
-       return s;
-}
-
-/*
- * Get the weight for a given time span.
- */
-static u64 hst_weight(struct path_selector *ps, u64 delta)
-{
-       struct selector *s = ps->context;
-       int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
-                          HST_WEIGHT_COUNT - 1);
-
-       return s->weights[bucket];
-}
-
-/*
- * Set up the weights array.
- *
- * weights[len-1] = 0
- * weights[n] = base ^ (n + 1)
- */
-static void hst_set_weights(struct path_selector *ps, unsigned int base)
-{
-       struct selector *s = ps->context;
-       int i;
-
-       if (base >= HST_FIXED_1)
-               return;
-
-       for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
-               s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
-       s->weights[HST_WEIGHT_COUNT - 1] = 0;
-}
-
-static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
-{
-       struct selector *s;
-       unsigned int base_weight = HST_FIXED_95;
-       unsigned int threshold_multiplier = 0;
-       char dummy;
-
-       /*
-        * Arguments: [<base_weight> [<threshold_multiplier>]]
-        *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
-        *                  value of 0 will completely ignore any history.
-        *                  If not given, default (HST_FIXED_95) is used.
-        *   <threshold_multiplier>: Minimum threshold multiplier for paths to
-        *                  be considered different. That is, a path is
-        *                  considered different iff (p1 > N * p2) where p1
-        *                  is the path with higher service time. A threshold
-        *                  of 1 or 0 has no effect. Defaults to 0.
-        */
-       if (argc > 2)
-               return -EINVAL;
-
-       if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
-            base_weight >= HST_FIXED_1)) {
-               return -EINVAL;
-       }
-
-       if (argc > 1 && (sscanf(argv[1], "%u%c",
-                               &threshold_multiplier, &dummy) != 1)) {
-               return -EINVAL;
-       }
-
-       s = alloc_selector();
-       if (!s)
-               return -ENOMEM;
-
-       ps->context = s;
-
-       hst_set_weights(ps, base_weight);
-       s->threshold_multiplier = threshold_multiplier;
-       return 0;
-}
-
-static void free_paths(struct list_head *paths)
-{
-       struct path_info *pi, *next;
-
-       list_for_each_entry_safe(pi, next, paths, list) {
-               list_del(&pi->list);
-               kfree(pi);
-       }
-}
-
-static void hst_destroy(struct path_selector *ps)
-{
-       struct selector *s = ps->context;
-
-       free_paths(&s->valid_paths);
-       free_paths(&s->failed_paths);
-       kfree(s);
-       ps->context = NULL;
-}
-
-static int hst_status(struct path_selector *ps, struct dm_path *path,
-                    status_type_t type, char *result, unsigned int maxlen)
-{
-       unsigned int sz = 0;
-       struct path_info *pi;
-
-       if (!path) {
-               struct selector *s = ps->context;
-
-               DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
-       } else {
-               pi = path->pscontext;
-
-               switch (type) {
-               case STATUSTYPE_INFO:
-                       DMEMIT("%llu %llu %llu ", pi->historical_service_time,
-                              pi->outstanding, pi->stale_after);
-                       break;
-               case STATUSTYPE_TABLE:
-                       DMEMIT("0 ");
-                       break;
-               }
-       }
-
-       return sz;
-}
-
-static int hst_add_path(struct path_selector *ps, struct dm_path *path,
-                      int argc, char **argv, char **error)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi;
-       unsigned int repeat_count = HST_MIN_IO;
-       char dummy;
-       unsigned long flags;
-
-       /*
-        * Arguments: [<repeat_count>]
-        *   <repeat_count>: The number of I/Os before switching path.
-        *                   If not given, default (HST_MIN_IO) is used.
-        */
-       if (argc > 1) {
-               *error = "historical-service-time ps: incorrect number of arguments";
-               return -EINVAL;
-       }
-
-       if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
-               *error = "historical-service-time ps: invalid repeat count";
-               return -EINVAL;
-       }
-
-       /* allocate the path */
-       pi = kmalloc(sizeof(*pi), GFP_KERNEL);
-       if (!pi) {
-               *error = "historical-service-time ps: Error allocating path context";
-               return -ENOMEM;
-       }
-
-       pi->path = path;
-       pi->repeat_count = repeat_count;
-
-       pi->historical_service_time = HST_FIXED_1;
-
-       spin_lock_init(&pi->lock);
-       pi->outstanding = 0;
-
-       pi->stale_after = 0;
-       pi->last_finish = 0;
-
-       path->pscontext = pi;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_add_tail(&pi->list, &s->valid_paths);
-       s->valid_count++;
-       spin_unlock_irqrestore(&s->lock, flags);
-
-       return 0;
-}
-
-static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = path->pscontext;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_move(&pi->list, &s->failed_paths);
-       s->valid_count--;
-       spin_unlock_irqrestore(&s->lock, flags);
-}
-
-static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = path->pscontext;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_move_tail(&pi->list, &s->valid_paths);
-       s->valid_count++;
-       spin_unlock_irqrestore(&s->lock, flags);
-
-       return 0;
-}
-
-static void hst_fill_compare(struct path_info *pi, u64 *hst,
-                            u64 *out, u64 *stale)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&pi->lock, flags);
-       *hst = pi->historical_service_time;
-       *out = pi->outstanding;
-       *stale = pi->stale_after;
-       spin_unlock_irqrestore(&pi->lock, flags);
-}
-
-/*
- * Compare the estimated service time of 2 paths, pi1 and pi2,
- * for the incoming I/O.
- *
- * Returns:
- * < 0 : pi1 is better
- * 0   : no difference between pi1 and pi2
- * > 0 : pi2 is better
- *
- */
-static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
-                            u64 time_now, struct path_selector *ps)
-{
-       struct selector *s = ps->context;
-       u64 hst1, hst2;
-       long long out1, out2, stale1, stale2;
-       int pi2_better, over_threshold;
-
-       hst_fill_compare(pi1, &hst1, &out1, &stale1);
-       hst_fill_compare(pi2, &hst2, &out2, &stale2);
-
-       /* Check here if estimated latency for two paths are too similar.
-        * If this is the case, we skip extra calculation and just compare
-        * outstanding requests. In this case, any unloaded paths will
-        * be preferred.
-        */
-       if (hst1 > hst2)
-               over_threshold = hst1 > (s->threshold_multiplier * hst2);
-       else
-               over_threshold = hst2 > (s->threshold_multiplier * hst1);
-
-       if (!over_threshold)
-               return out1 - out2;
-
-       /*
-        * If an unloaded path is stale, choose it. If both paths are unloaded,
-        * choose path that is the most stale.
-        * (If one path is loaded, choose the other)
-        */
-       if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
-           (!out1 && !out2))
-               return (!out2 * stale1) - (!out1 * stale2);
-
-       /* Compare estimated service time. If outstanding is the same, we
-        * don't need to multiply
-        */
-       if (out1 == out2) {
-               pi2_better = hst1 > hst2;
-       } else {
-               /* Potential overflow with out >= 1024 */
-               if (unlikely(out1 >= HST_MAX_INFLIGHT ||
-                            out2 >= HST_MAX_INFLIGHT)) {
-                       /* If over 1023 in-flights, we may overflow if hst
-                        * is at max. (With this shift we still overflow at
-                        * 1048576 in-flights, which is high enough).
-                        */
-                       hst1 >>= HST_FIXED_SHIFT;
-                       hst2 >>= HST_FIXED_SHIFT;
-               }
-               pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
-       }
-
-       /* In the case that the 'winner' is stale, limit to equal usage. */
-       if (pi2_better) {
-               if (stale2 < time_now)
-                       return out1 - out2;
-               return 1;
-       }
-       if (stale1 < time_now)
-               return out1 - out2;
-       return -1;
-}
-
-static struct dm_path *hst_select_path(struct path_selector *ps,
-                                      size_t nr_bytes)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = NULL, *best = NULL;
-       u64 time_now = sched_clock();
-       struct dm_path *ret = NULL;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->lock, flags);
-       if (list_empty(&s->valid_paths))
-               goto out;
-
-       list_for_each_entry(pi, &s->valid_paths, list) {
-               if (!best || (hst_compare(pi, best, time_now, ps) < 0))
-                       best = pi;
-       }
-
-       if (!best)
-               goto out;
-
-       /* Move last used path to end (least preferred in case of ties) */
-       list_move_tail(&best->list, &s->valid_paths);
-
-       ret = best->path;
-
-out:
-       spin_unlock_irqrestore(&s->lock, flags);
-       return ret;
-}
-
-static int hst_start_io(struct path_selector *ps, struct dm_path *path,
-                       size_t nr_bytes)
-{
-       struct path_info *pi = path->pscontext;
-       unsigned long flags;
-
-       spin_lock_irqsave(&pi->lock, flags);
-       pi->outstanding++;
-       spin_unlock_irqrestore(&pi->lock, flags);
-
-       return 0;
-}
-
-static u64 path_service_time(struct path_info *pi, u64 start_time)
-{
-       u64 sched_now = ktime_get_ns();
-
-       /* if a previous disk request has finished after this IO was
-        * sent to the hardware, pretend the submission happened
-        * serially.
-        */
-       if (time_after64(pi->last_finish, start_time))
-               start_time = pi->last_finish;
-
-       pi->last_finish = sched_now;
-       if (time_before64(sched_now, start_time))
-               return 0;
-
-       return sched_now - start_time;
-}
-
-static int hst_end_io(struct path_selector *ps, struct dm_path *path,
-                     size_t nr_bytes, u64 start_time)
-{
-       struct path_info *pi = path->pscontext;
-       struct selector *s = ps->context;
-       unsigned long flags;
-       u64 st;
-
-       spin_lock_irqsave(&pi->lock, flags);
-
-       st = path_service_time(pi, start_time);
-       pi->outstanding--;
-       pi->historical_service_time =
-               fixed_ema(pi->historical_service_time,
-                         min(st * HST_FIXED_1, HST_FIXED_MAX),
-                         hst_weight(ps, st));
-
-       /*
-        * On request end, mark path as fresh. If a path hasn't
-        * finished any requests within the fresh period, the estimated
-        * service time is considered too optimistic and we limit the
-        * maximum requests on that path.
-        */
-       pi->stale_after = pi->last_finish +
-               (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
-
-       spin_unlock_irqrestore(&pi->lock, flags);
-
-       return 0;
-}
-
-static struct path_selector_type hst_ps = {
-       .name           = "historical-service-time",
-       .module         = THIS_MODULE,
-       .table_args     = 1,
-       .info_args      = 3,
-       .create         = hst_create,
-       .destroy        = hst_destroy,
-       .status         = hst_status,
-       .add_path       = hst_add_path,
-       .fail_path      = hst_fail_path,
-       .reinstate_path = hst_reinstate_path,
-       .select_path    = hst_select_path,
-       .start_io       = hst_start_io,
-       .end_io         = hst_end_io,
-};
-
-static int __init dm_hst_init(void)
-{
-       int r = dm_register_path_selector(&hst_ps);
-
-       if (r < 0)
-               DMERR("register failed %d", r);
-
-       DMINFO("version " HST_VERSION " loaded");
-
-       return r;
-}
-
-static void __exit dm_hst_exit(void)
-{
-       int r = dm_unregister_path_selector(&hst_ps);
-
-       if (r < 0)
-               DMERR("unregister failed %d", r);
-}
-
-module_init(dm_hst_init);
-module_exit(dm_hst_exit);
-
-MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
-MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-io-affinity.c b/drivers/md/dm-io-affinity.c
deleted file mode 100644
index 077655c..0000000
--- a/drivers/md/dm-io-affinity.c
+++ /dev/null
@@ -1,272 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2020 Oracle Corporation
- *
- * Module Author: Mike Christie
- */
-#include "dm-path-selector.h"
-
-#include <linux/device-mapper.h>
-#include <linux/module.h>
-
-#define DM_MSG_PREFIX "multipath io-affinity"
-
-struct path_info {
-       struct dm_path *path;
-       cpumask_var_t cpumask;
-       refcount_t refcount;
-       bool failed;
-};
-
-struct selector {
-       struct path_info **path_map;
-       cpumask_var_t path_mask;
-       atomic_t map_misses;
-};
-
-static void ioa_free_path(struct selector *s, unsigned int cpu)
-{
-       struct path_info *pi = s->path_map[cpu];
-
-       if (!pi)
-               return;
-
-       if (refcount_dec_and_test(&pi->refcount)) {
-               cpumask_clear_cpu(cpu, s->path_mask);
-               free_cpumask_var(pi->cpumask);
-               kfree(pi);
-
-               s->path_map[cpu] = NULL;
-       }
-}
-
-static int ioa_add_path(struct path_selector *ps, struct dm_path *path,
-                       int argc, char **argv, char **error)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = NULL;
-       unsigned int cpu;
-       int ret;
-
-       if (argc != 1) {
-               *error = "io-affinity ps: invalid number of arguments";
-               return -EINVAL;
-       }
-
-       pi = kzalloc(sizeof(*pi), GFP_KERNEL);
-       if (!pi) {
-               *error = "io-affinity ps: Error allocating path context";
-               return -ENOMEM;
-       }
-
-       pi->path = path;
-       path->pscontext = pi;
-       refcount_set(&pi->refcount, 1);
-
-       if (!zalloc_cpumask_var(&pi->cpumask, GFP_KERNEL)) {
-               *error = "io-affinity ps: Error allocating cpumask context";
-               ret = -ENOMEM;
-               goto free_pi;
-       }
-
-       ret = cpumask_parse(argv[0], pi->cpumask);
-       if (ret) {
-               *error = "io-affinity ps: invalid cpumask";
-               ret = -EINVAL;
-               goto free_mask;
-       }
-
-       for_each_cpu(cpu, pi->cpumask) {
-               if (cpu >= nr_cpu_ids) {
-                       DMWARN_LIMIT("Ignoring mapping for CPU %u. Max CPU is %u",
-                                    cpu, nr_cpu_ids);
-                       break;
-               }
-
-               if (s->path_map[cpu]) {
-                       DMWARN("CPU mapping for %u exists. Ignoring.", cpu);
-                       continue;
-               }
-
-               cpumask_set_cpu(cpu, s->path_mask);
-               s->path_map[cpu] = pi;
-               refcount_inc(&pi->refcount);
-               continue;
-       }
-
-       if (refcount_dec_and_test(&pi->refcount)) {
-               *error = "io-affinity ps: No new/valid CPU mapping found";
-               ret = -EINVAL;
-               goto free_mask;
-       }
-
-       return 0;
-
-free_mask:
-       free_cpumask_var(pi->cpumask);
-free_pi:
-       kfree(pi);
-       return ret;
-}
-
-static int ioa_create(struct path_selector *ps, unsigned argc, char **argv)
-{
-       struct selector *s;
-
-       s = kmalloc(sizeof(*s), GFP_KERNEL);
-       if (!s)
-               return -ENOMEM;
-
-       s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *),
-                             GFP_KERNEL);
-       if (!s->path_map)
-               goto free_selector;
-
-       if (!zalloc_cpumask_var(&s->path_mask, GFP_KERNEL))
-               goto free_map;
-
-       atomic_set(&s->map_misses, 0);
-       ps->context = s;
-       return 0;
-
-free_map:
-       kfree(s->path_map);
-free_selector:
-       kfree(s);
-       return -ENOMEM;
-}
-
-static void ioa_destroy(struct path_selector *ps)
-{
-       struct selector *s = ps->context;
-       unsigned cpu;
-
-       for_each_cpu(cpu, s->path_mask)
-               ioa_free_path(s, cpu);
-
-       free_cpumask_var(s->path_mask);
-       kfree(s->path_map);
-       kfree(s);
-
-       ps->context = NULL;
-}
-
-static int ioa_status(struct path_selector *ps, struct dm_path *path,
-                     status_type_t type, char *result, unsigned int maxlen)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi;
-       int sz = 0;
-
-       if (!path) {
-               DMEMIT("0 ");
-               return sz;
-       }
-
-       switch(type) {
-       case STATUSTYPE_INFO:
-               DMEMIT("%d ", atomic_read(&s->map_misses));
-               break;
-       case STATUSTYPE_TABLE:
-               pi = path->pscontext;
-               DMEMIT("%*pb ", cpumask_pr_args(pi->cpumask));
-               break;
-       }
-
-       return sz;
-}
-
-static void ioa_fail_path(struct path_selector *ps, struct dm_path *p)
-{
-       struct path_info *pi = p->pscontext;
-
-       pi->failed = true;
-}
-
-static int ioa_reinstate_path(struct path_selector *ps, struct dm_path *p)
-{
-       struct path_info *pi = p->pscontext;
-
-       pi->failed = false;
-       return 0;
-}
-
-static struct dm_path *ioa_select_path(struct path_selector *ps,
-                                      size_t nr_bytes)
-{
-       unsigned int cpu, node;
-       struct selector *s = ps->context;
-       const struct cpumask *cpumask;
-       struct path_info *pi;
-       int i;
-
-       cpu = get_cpu();
-
-       pi = s->path_map[cpu];
-       if (pi && !pi->failed)
-               goto done;
-
-       /*
-        * Perf is not optimal, but we at least try the local node then just
-        * try not to fail.
-        */
-       if (!pi)
-               atomic_inc(&s->map_misses);
-
-       node = cpu_to_node(cpu);
-       cpumask = cpumask_of_node(node);
-       for_each_cpu(i, cpumask) {
-               pi = s->path_map[i];
-               if (pi && !pi->failed)
-                       goto done;
-       }
-
-       for_each_cpu(i, s->path_mask) {
-               pi = s->path_map[i];
-               if (pi && !pi->failed)
-                       goto done;
-       }
-       pi = NULL;
-
-done:
-       put_cpu();
-       return pi ? pi->path : NULL;
-}
-
-static struct path_selector_type ioa_ps = {
-       .name           = "io-affinity",
-       .module         = THIS_MODULE,
-       .table_args     = 1,
-       .info_args      = 1,
-       .create         = ioa_create,
-       .destroy        = ioa_destroy,
-       .status         = ioa_status,
-       .add_path       = ioa_add_path,
-       .fail_path      = ioa_fail_path,
-       .reinstate_path = ioa_reinstate_path,
-       .select_path    = ioa_select_path,
-};
-
-static int __init dm_ioa_init(void)
-{
-       int ret = dm_register_path_selector(&ioa_ps);
-
-       if (ret < 0)
-               DMERR("register failed %d", ret);
-       return ret;
-}
-
-static void __exit dm_ioa_exit(void)
-{
-       int ret = dm_unregister_path_selector(&ioa_ps);
-
-       if (ret < 0)
-               DMERR("unregister failed %d", ret);
-}
-
-module_init(dm_ioa_init);
-module_exit(dm_ioa_exit);
-
-MODULE_DESCRIPTION(DM_NAME " multipath path selector that selects paths based on the CPU IO is being executed on");
-MODULE_AUTHOR("Mike Christie <michael.christie@oracle.com>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c
new file mode 100644
index 0000000..186f91e
--- /dev/null
+++ b/drivers/md/dm-ps-historical-service-time.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Historical Service Time
+ *
+ *  Keeps a time-weighted exponential moving average of the historical
+ *  service time. Estimates future service time based on the historical
+ *  service time and the number of outstanding requests.
+ *
+ *  Marks paths stale if they have not finished within hst *
+ *  num_paths. If a path is stale and unused, we will send a single
+ *  request to probe in case the path has improved. This situation
+ *  generally arises if the path is so much worse than others that it
+ *  will never have the best estimated service time, or if the entire
+ *  multipath device is unused. If a path is stale and in use, limit the
+ *  number of requests it can receive with the assumption that the path
+ *  has become degraded.
+ *
+ *  To avoid repeatedly calculating exponents for time weighting, times
+ *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
+ *  ns, and the weighting is pre-calculated.
+ *
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+
+#define DM_MSG_PREFIX  "multipath historical-service-time"
+#define HST_MIN_IO 1
+#define HST_VERSION "0.1.1"
+
+#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
+#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
+#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
+#define HST_FIXED_95 972
+#define HST_MAX_INFLIGHT HST_FIXED_1
+#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
+#define HST_WEIGHT_COUNT 64ULL
+
+struct selector {
+       struct list_head valid_paths;
+       struct list_head failed_paths;
+       int valid_count;
+       spinlock_t lock;
+
+       unsigned int weights[HST_WEIGHT_COUNT];
+       unsigned int threshold_multiplier;
+};
+
+struct path_info {
+       struct list_head list;
+       struct dm_path *path;
+       unsigned int repeat_count;
+
+       spinlock_t lock;
+
+       u64 historical_service_time; /* Fixed point */
+
+       u64 stale_after;
+       u64 last_finish;
+
+       u64 outstanding;
+};
+
+/**
+ * fixed_power - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ *
+ * (see: kernel/sched/loadavg.c)
+ */
+static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
+{
+       unsigned long result = 1UL << frac_bits;
+
+       if (n) {
+               for (;;) {
+                       if (n & 1) {
+                               result *= x;
+                               result += 1UL << (frac_bits - 1);
+                               result >>= frac_bits;
+                       }
+                       n >>= 1;
+                       if (!n)
+                               break;
+                       x *= x;
+                       x += 1UL << (frac_bits - 1);
+                       x >>= frac_bits;
+               }
+       }
+
+       return result;
+}
+
+/*
+ * Calculate the next value of an exponential moving average
+ * a_1 = a_0 * e + a * (1 - e)
+ *
+ * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
+ * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
+ * @weight: [0, HST_FIXED_1]
+ *
+ * Note:
+ *   To account for multiple periods in the same calculation,
+ *   a_n = a_0 * e^n + a * (1 - e^n),
+ *   so call fixed_ema(last, next, pow(weight, N))
+ */
+static u64 fixed_ema(u64 last, u64 next, u64 weight)
+{
+       last *= weight;
+       last += next * (HST_FIXED_1 - weight);
+       last += 1ULL << (HST_FIXED_SHIFT - 1);
+       return last >> HST_FIXED_SHIFT;
+}
+
+static struct selector *alloc_selector(void)
+{
+       struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+       if (s) {
+               INIT_LIST_HEAD(&s->valid_paths);
+               INIT_LIST_HEAD(&s->failed_paths);
+               spin_lock_init(&s->lock);
+               s->valid_count = 0;
+       }
+
+       return s;
+}
+
+/*
+ * Get the weight for a given time span.
+ */
+static u64 hst_weight(struct path_selector *ps, u64 delta)
+{
+       struct selector *s = ps->context;
+       int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
+                          HST_WEIGHT_COUNT - 1);
+
+       return s->weights[bucket];
+}
+
+/*
+ * Set up the weights array.
+ *
+ * weights[len-1] = 0
+ * weights[n] = base ^ (n + 1)
+ */
+static void hst_set_weights(struct path_selector *ps, unsigned int base)
+{
+       struct selector *s = ps->context;
+       int i;
+
+       if (base >= HST_FIXED_1)
+               return;
+
+       for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
+               s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
+       s->weights[HST_WEIGHT_COUNT - 1] = 0;
+}
+
+static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
+{
+       struct selector *s;
+       unsigned int base_weight = HST_FIXED_95;
+       unsigned int threshold_multiplier = 0;
+       char dummy;
+
+       /*
+        * Arguments: [<base_weight> [<threshold_multiplier>]]
+        *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
+        *                  value of 0 will completely ignore any history.
+        *                  If not given, default (HST_FIXED_95) is used.
+        *   <threshold_multiplier>: Minimum threshold multiplier for paths to
+        *                  be considered different. That is, a path is
+        *                  considered different iff (p1 > N * p2) where p1
+        *                  is the path with higher service time. A threshold
+        *                  of 1 or 0 has no effect. Defaults to 0.
+        */
+       if (argc > 2)
+               return -EINVAL;
+
+       if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
+            base_weight >= HST_FIXED_1)) {
+               return -EINVAL;
+       }
+
+       if (argc > 1 && (sscanf(argv[1], "%u%c",
+                               &threshold_multiplier, &dummy) != 1)) {
+               return -EINVAL;
+       }
+
+       s = alloc_selector();
+       if (!s)
+               return -ENOMEM;
+
+       ps->context = s;
+
+       hst_set_weights(ps, base_weight);
+       s->threshold_multiplier = threshold_multiplier;
+       return 0;
+}
+
+static void free_paths(struct list_head *paths)
+{
+       struct path_info *pi, *next;
+
+       list_for_each_entry_safe(pi, next, paths, list) {
+               list_del(&pi->list);
+               kfree(pi);
+       }
+}
+
+static void hst_destroy(struct path_selector *ps)
+{
+       struct selector *s = ps->context;
+
+       free_paths(&s->valid_paths);
+       free_paths(&s->failed_paths);
+       kfree(s);
+       ps->context = NULL;
+}
+
+static int hst_status(struct path_selector *ps, struct dm_path *path,
+                    status_type_t type, char *result, unsigned int maxlen)
+{
+       unsigned int sz = 0;
+       struct path_info *pi;
+
+       if (!path) {
+               struct selector *s = ps->context;
+
+               DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
+       } else {
+               pi = path->pscontext;
+
+               switch (type) {
+               case STATUSTYPE_INFO:
+                       DMEMIT("%llu %llu %llu ", pi->historical_service_time,
+                              pi->outstanding, pi->stale_after);
+                       break;
+               case STATUSTYPE_TABLE:
+                       DMEMIT("0 ");
+                       break;
+               }
+       }
+
+       return sz;
+}
+
+static int hst_add_path(struct path_selector *ps, struct dm_path *path,
+                      int argc, char **argv, char **error)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi;
+       unsigned int repeat_count = HST_MIN_IO;
+       char dummy;
+       unsigned long flags;
+
+       /*
+        * Arguments: [<repeat_count>]
+        *   <repeat_count>: The number of I/Os before switching path.
+        *                   If not given, default (HST_MIN_IO) is used.
+        */
+       if (argc > 1) {
+               *error = "historical-service-time ps: incorrect number of arguments";
+               return -EINVAL;
+       }
+
+       if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
+               *error = "historical-service-time ps: invalid repeat count";
+               return -EINVAL;
+       }
+
+       /* allocate the path */
+       pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+       if (!pi) {
+               *error = "historical-service-time ps: Error allocating path context";
+               return -ENOMEM;
+       }
+
+       pi->path = path;
+       pi->repeat_count = repeat_count;
+
+       pi->historical_service_time = HST_FIXED_1;
+
+       spin_lock_init(&pi->lock);
+       pi->outstanding = 0;
+
+       pi->stale_after = 0;
+       pi->last_finish = 0;
+
+       path->pscontext = pi;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_add_tail(&pi->list, &s->valid_paths);
+       s->valid_count++;
+       spin_unlock_irqrestore(&s->lock, flags);
+
+       return 0;
+}
+
+static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = path->pscontext;
+       unsigned long flags;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_move(&pi->list, &s->failed_paths);
+       s->valid_count--;
+       spin_unlock_irqrestore(&s->lock, flags);
+}
+
+static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = path->pscontext;
+       unsigned long flags;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_move_tail(&pi->list, &s->valid_paths);
+       s->valid_count++;
+       spin_unlock_irqrestore(&s->lock, flags);
+
+       return 0;
+}
+
+static void hst_fill_compare(struct path_info *pi, u64 *hst,
+                            u64 *out, u64 *stale)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&pi->lock, flags);
+       *hst = pi->historical_service_time;
+       *out = pi->outstanding;
+       *stale = pi->stale_after;
+       spin_unlock_irqrestore(&pi->lock, flags);
+}
+
+/*
+ * Compare the estimated service time of 2 paths, pi1 and pi2,
+ * for the incoming I/O.
+ *
+ * Returns:
+ * < 0 : pi1 is better
+ * 0   : no difference between pi1 and pi2
+ * > 0 : pi2 is better
+ *
+ */
+static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
+                            u64 time_now, struct path_selector *ps)
+{
+       struct selector *s = ps->context;
+       u64 hst1, hst2;
+       long long out1, out2, stale1, stale2;
+       int pi2_better, over_threshold;
+
+       hst_fill_compare(pi1, &hst1, &out1, &stale1);
+       hst_fill_compare(pi2, &hst2, &out2, &stale2);
+
+       /* Check here if estimated latency for two paths are too similar.
+        * If this is the case, we skip extra calculation and just compare
+        * outstanding requests. In this case, any unloaded paths will
+        * be preferred.
+        */
+       if (hst1 > hst2)
+               over_threshold = hst1 > (s->threshold_multiplier * hst2);
+       else
+               over_threshold = hst2 > (s->threshold_multiplier * hst1);
+
+       if (!over_threshold)
+               return out1 - out2;
+
+       /*
+        * If an unloaded path is stale, choose it. If both paths are unloaded,
+        * choose path that is the most stale.
+        * (If one path is loaded, choose the other)
+        */
+       if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
+           (!out1 && !out2))
+               return (!out2 * stale1) - (!out1 * stale2);
+
+       /* Compare estimated service time. If outstanding is the same, we
+        * don't need to multiply
+        */
+       if (out1 == out2) {
+               pi2_better = hst1 > hst2;
+       } else {
+               /* Potential overflow with out >= 1024 */
+               if (unlikely(out1 >= HST_MAX_INFLIGHT ||
+                            out2 >= HST_MAX_INFLIGHT)) {
+                       /* If over 1023 in-flights, we may overflow if hst
+                        * is at max. (With this shift we still overflow at
+                        * 1048576 in-flights, which is high enough).
+                        */
+                       hst1 >>= HST_FIXED_SHIFT;
+                       hst2 >>= HST_FIXED_SHIFT;
+               }
+               pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
+       }
+
+       /* In the case that the 'winner' is stale, limit to equal usage. */
+       if (pi2_better) {
+               if (stale2 < time_now)
+                       return out1 - out2;
+               return 1;
+       }
+       if (stale1 < time_now)
+               return out1 - out2;
+       return -1;
+}
+
+static struct dm_path *hst_select_path(struct path_selector *ps,
+                                      size_t nr_bytes)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = NULL, *best = NULL;
+       u64 time_now = sched_clock();
+       struct dm_path *ret = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&s->lock, flags);
+       if (list_empty(&s->valid_paths))
+               goto out;
+
+       list_for_each_entry(pi, &s->valid_paths, list) {
+               if (!best || (hst_compare(pi, best, time_now, ps) < 0))
+                       best = pi;
+       }
+
+       if (!best)
+               goto out;
+
+       /* Move last used path to end (least preferred in case of ties) */
+       list_move_tail(&best->list, &s->valid_paths);
+
+       ret = best->path;
+
+out:
+       spin_unlock_irqrestore(&s->lock, flags);
+       return ret;
+}
+
+static int hst_start_io(struct path_selector *ps, struct dm_path *path,
+                       size_t nr_bytes)
+{
+       struct path_info *pi = path->pscontext;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pi->lock, flags);
+       pi->outstanding++;
+       spin_unlock_irqrestore(&pi->lock, flags);
+
+       return 0;
+}
+
+static u64 path_service_time(struct path_info *pi, u64 start_time)
+{
+       u64 sched_now = ktime_get_ns();
+
+       /* if a previous disk request has finished after this IO was
+        * sent to the hardware, pretend the submission happened
+        * serially.
+        */
+       if (time_after64(pi->last_finish, start_time))
+               start_time = pi->last_finish;
+
+       pi->last_finish = sched_now;
+       if (time_before64(sched_now, start_time))
+               return 0;
+
+       return sched_now - start_time;
+}
+
+static int hst_end_io(struct path_selector *ps, struct dm_path *path,
+                     size_t nr_bytes, u64 start_time)
+{
+       struct path_info *pi = path->pscontext;
+       struct selector *s = ps->context;
+       unsigned long flags;
+       u64 st;
+
+       spin_lock_irqsave(&pi->lock, flags);
+
+       st = path_service_time(pi, start_time);
+       pi->outstanding--;
+       pi->historical_service_time =
+               fixed_ema(pi->historical_service_time,
+                         min(st * HST_FIXED_1, HST_FIXED_MAX),
+                         hst_weight(ps, st));
+
+       /*
+        * On request end, mark path as fresh. If a path hasn't
+        * finished any requests within the fresh period, the estimated
+        * service time is considered too optimistic and we limit the
+        * maximum requests on that path.
+        */
+       pi->stale_after = pi->last_finish +
+               (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
+
+       spin_unlock_irqrestore(&pi->lock, flags);
+
+       return 0;
+}
+
+static struct path_selector_type hst_ps = {
+       .name           = "historical-service-time",
+       .module         = THIS_MODULE,
+       .table_args     = 1,
+       .info_args      = 3,
+       .create         = hst_create,
+       .destroy        = hst_destroy,
+       .status         = hst_status,
+       .add_path       = hst_add_path,
+       .fail_path      = hst_fail_path,
+       .reinstate_path = hst_reinstate_path,
+       .select_path    = hst_select_path,
+       .start_io       = hst_start_io,
+       .end_io         = hst_end_io,
+};
+
+static int __init dm_hst_init(void)
+{
+       int r = dm_register_path_selector(&hst_ps);
+
+       if (r < 0)
+               DMERR("register failed %d", r);
+
+       DMINFO("version " HST_VERSION " loaded");
+
+       return r;
+}
+
+static void __exit dm_hst_exit(void)
+{
+       int r = dm_unregister_path_selector(&hst_ps);
+
+       if (r < 0)
+               DMERR("unregister failed %d", r);
+}
+
+module_init(dm_hst_init);
+module_exit(dm_hst_exit);
+
+MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
+MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
+MODULE_LICENSE("GPL");
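
To make the Q10 fixed-point arithmetic above concrete, here is a
standalone userspace sketch (not part of the patch) of the EMA update
performed by fixed_ema(); the constants mirror HST_FIXED_SHIFT,
HST_FIXED_1 and HST_FIXED_95:

  /*
   * Standalone illustration (userspace) of the Q10 fixed-point EMA used by
   * fixed_ema() in dm-ps-historical-service-time.c.  With the default base
   * weight HST_FIXED_95 (972 ~= 0.949), each completion moves the average
   * roughly 5% toward the newly measured service time.
   */
  #include <stdio.h>

  #define FIXED_SHIFT 10
  #define FIXED_1     (1ULL << FIXED_SHIFT)       /* 1.0 in Q10 */
  #define FIXED_95    972ULL                      /* ~0.949 in Q10 */

  static unsigned long long fixed_ema(unsigned long long last,
                                      unsigned long long next,
                                      unsigned long long weight)
  {
          last *= weight;
          last += next * (FIXED_1 - weight);
          last += 1ULL << (FIXED_SHIFT - 1);      /* round to nearest */
          return last >> FIXED_SHIFT;
  }

  int main(void)
  {
          /* last = 2.0, next = 4.0 (both Q10) -> 2152/1024 ~= 2.10 */
          unsigned long long v = fixed_ema(2 * FIXED_1, 4 * FIXED_1, FIXED_95);

          printf("%llu/1024 = %.4f\n", v, (double)v / FIXED_1);
          return 0;
  }

In hst_end_io() the new sample is the measured service time scaled by
HST_FIXED_1 and the weight comes from the pre-computed weights[] table,
so larger service times fall into higher buckets and discount the
accumulated history more strongly.
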
diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c
new file mode 100644
index 0000000..077655c
--- /dev/null
+++ b/drivers/md/dm-ps-io-affinity.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Oracle Corporation
+ *
+ * Module Author: Mike Christie
+ */
+#include "dm-path-selector.h"
+
+#include <linux/device-mapper.h>
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX "multipath io-affinity"
+
+struct path_info {
+       struct dm_path *path;
+       cpumask_var_t cpumask;
+       refcount_t refcount;
+       bool failed;
+};
+
+struct selector {
+       struct path_info **path_map;
+       cpumask_var_t path_mask;
+       atomic_t map_misses;
+};
+
+static void ioa_free_path(struct selector *s, unsigned int cpu)
+{
+       struct path_info *pi = s->path_map[cpu];
+
+       if (!pi)
+               return;
+
+       if (refcount_dec_and_test(&pi->refcount)) {
+               cpumask_clear_cpu(cpu, s->path_mask);
+               free_cpumask_var(pi->cpumask);
+               kfree(pi);
+
+               s->path_map[cpu] = NULL;
+       }
+}
+
+static int ioa_add_path(struct path_selector *ps, struct dm_path *path,
+                       int argc, char **argv, char **error)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = NULL;
+       unsigned int cpu;
+       int ret;
+
+       if (argc != 1) {
+               *error = "io-affinity ps: invalid number of arguments";
+               return -EINVAL;
+       }
+
+       pi = kzalloc(sizeof(*pi), GFP_KERNEL);
+       if (!pi) {
+               *error = "io-affinity ps: Error allocating path context";
+               return -ENOMEM;
+       }
+
+       pi->path = path;
+       path->pscontext = pi;
+       refcount_set(&pi->refcount, 1);
+
+       if (!zalloc_cpumask_var(&pi->cpumask, GFP_KERNEL)) {
+               *error = "io-affinity ps: Error allocating cpumask context";
+               ret = -ENOMEM;
+               goto free_pi;
+       }
+
+       ret = cpumask_parse(argv[0], pi->cpumask);
+       if (ret) {
+               *error = "io-affinity ps: invalid cpumask";
+               ret = -EINVAL;
+               goto free_mask;
+       }
+
+       for_each_cpu(cpu, pi->cpumask) {
+               if (cpu >= nr_cpu_ids) {
+                       DMWARN_LIMIT("Ignoring mapping for CPU %u. Max CPU is %u",
+                                    cpu, nr_cpu_ids);
+                       break;
+               }
+
+               if (s->path_map[cpu]) {
+                       DMWARN("CPU mapping for %u exists. Ignoring.", cpu);
+                       continue;
+               }
+
+               cpumask_set_cpu(cpu, s->path_mask);
+               s->path_map[cpu] = pi;
+               refcount_inc(&pi->refcount);
+               continue;
+       }
+
+       if (refcount_dec_and_test(&pi->refcount)) {
+               *error = "io-affinity ps: No new/valid CPU mapping found";
+               ret = -EINVAL;
+               goto free_mask;
+       }
+
+       return 0;
+
+free_mask:
+       free_cpumask_var(pi->cpumask);
+free_pi:
+       kfree(pi);
+       return ret;
+}
+
+static int ioa_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+       struct selector *s;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (!s)
+               return -ENOMEM;
+
+       s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *),
+                             GFP_KERNEL);
+       if (!s->path_map)
+               goto free_selector;
+
+       if (!zalloc_cpumask_var(&s->path_mask, GFP_KERNEL))
+               goto free_map;
+
+       atomic_set(&s->map_misses, 0);
+       ps->context = s;
+       return 0;
+
+free_map:
+       kfree(s->path_map);
+free_selector:
+       kfree(s);
+       return -ENOMEM;
+}
+
+static void ioa_destroy(struct path_selector *ps)
+{
+       struct selector *s = ps->context;
+       unsigned cpu;
+
+       for_each_cpu(cpu, s->path_mask)
+               ioa_free_path(s, cpu);
+
+       free_cpumask_var(s->path_mask);
+       kfree(s->path_map);
+       kfree(s);
+
+       ps->context = NULL;
+}
+
+static int ioa_status(struct path_selector *ps, struct dm_path *path,
+                     status_type_t type, char *result, unsigned int maxlen)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi;
+       int sz = 0;
+
+       if (!path) {
+               DMEMIT("0 ");
+               return sz;
+       }
+
+       switch(type) {
+       case STATUSTYPE_INFO:
+               DMEMIT("%d ", atomic_read(&s->map_misses));
+               break;
+       case STATUSTYPE_TABLE:
+               pi = path->pscontext;
+               DMEMIT("%*pb ", cpumask_pr_args(pi->cpumask));
+               break;
+       }
+
+       return sz;
+}
+
+static void ioa_fail_path(struct path_selector *ps, struct dm_path *p)
+{
+       struct path_info *pi = p->pscontext;
+
+       pi->failed = true;
+}
+
+static int ioa_reinstate_path(struct path_selector *ps, struct dm_path *p)
+{
+       struct path_info *pi = p->pscontext;
+
+       pi->failed = false;
+       return 0;
+}
+
+static struct dm_path *ioa_select_path(struct path_selector *ps,
+                                      size_t nr_bytes)
+{
+       unsigned int cpu, node;
+       struct selector *s = ps->context;
+       const struct cpumask *cpumask;
+       struct path_info *pi;
+       int i;
+
+       cpu = get_cpu();
+
+       pi = s->path_map[cpu];
+       if (pi && !pi->failed)
+               goto done;
+
+       /*
+        * Perf is not optimal, but we at least try the local node then just
+        * try not to fail.
+        */
+       if (!pi)
+               atomic_inc(&s->map_misses);
+
+       node = cpu_to_node(cpu);
+       cpumask = cpumask_of_node(node);
+       for_each_cpu(i, cpumask) {
+               pi = s->path_map[i];
+               if (pi && !pi->failed)
+                       goto done;
+       }
+
+       for_each_cpu(i, s->path_mask) {
+               pi = s->path_map[i];
+               if (pi && !pi->failed)
+                       goto done;
+       }
+       pi = NULL;
+
+done:
+       put_cpu();
+       return pi ? pi->path : NULL;
+}
+
+static struct path_selector_type ioa_ps = {
+       .name           = "io-affinity",
+       .module         = THIS_MODULE,
+       .table_args     = 1,
+       .info_args      = 1,
+       .create         = ioa_create,
+       .destroy        = ioa_destroy,
+       .status         = ioa_status,
+       .add_path       = ioa_add_path,
+       .fail_path      = ioa_fail_path,
+       .reinstate_path = ioa_reinstate_path,
+       .select_path    = ioa_select_path,
+};
+
+static int __init dm_ioa_init(void)
+{
+       int ret = dm_register_path_selector(&ioa_ps);
+
+       if (ret < 0)
+               DMERR("register failed %d", ret);
+       return ret;
+}
+
+static void __exit dm_ioa_exit(void)
+{
+       int ret = dm_unregister_path_selector(&ioa_ps);
+
+       if (ret < 0)
+               DMERR("unregister failed %d", ret);
+}
+
+module_init(dm_ioa_init);
+module_exit(dm_ioa_exit);
+
+MODULE_DESCRIPTION(DM_NAME " multipath path selector that selects paths based on the CPU IO is being executed on");
+MODULE_AUTHOR("Mike Christie <michael.christie@oracle.com>");
+MODULE_LICENSE("GPL");
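
The io-affinity selector above takes one cpumask argument per path and
hands it to cpumask_parse(); assuming the usual hex-mask format (as in
/proc/irq/*/smp_affinity), a mask of "3" maps CPUs 0 and 1 to that path.
A hypothetical userspace sketch that decodes a mask the same way,
limited to a single 64-bit chunk for brevity:

  /*
   * Hypothetical decoder for the io-affinity per-path argument: prints which
   * CPUs a hex mask (e.g. "3" -> CPUs 0 and 1) would map to the path.
   * Illustration only; handles one 64-bit chunk, unlike cpumask_parse().
   */
  #include <stdio.h>
  #include <stdlib.h>

  int main(int argc, char **argv)
  {
          unsigned long long mask;
          unsigned int cpu;

          if (argc != 2) {
                  fprintf(stderr, "usage: %s <hex cpumask>\n", argv[0]);
                  return 1;
          }

          mask = strtoull(argv[1], NULL, 16);
          for (cpu = 0; cpu < 64; cpu++)
                  if (mask & (1ULL << cpu))
                          printf("CPU %u -> this path\n", cpu);

          return 0;
  }

At I/O time, ioa_select_path() first tries the path mapped to the
submitting CPU, then any mapped CPU on the same NUMA node, and finally
any mapped CPU at all.
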
diff --git a/drivers/md/dm-ps-queue-length.c b/drivers/md/dm-ps-queue-length.c
new file mode 100644
index 0000000..5fd018d
--- /dev/null
+++ b/drivers/md/dm-ps-queue-length.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright (C) 2004-2005 IBM Corp.  All Rights Reserved.
+ * Copyright (C) 2006-2009 NEC Corporation.
+ *
+ * dm-queue-length.c
+ *
+ * Module Author: Stefan Bader, IBM
+ * Modified by: Kiyoshi Ueda, NEC
+ *
+ * This file is released under the GPL.
+ *
+ * queue-length path selector - choose a path with the least number of
+ * in-flight I/Os.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+
+#define DM_MSG_PREFIX  "multipath queue-length"
+#define QL_MIN_IO      1
+#define QL_VERSION     "0.2.0"
+
+struct selector {
+       struct list_head        valid_paths;
+       struct list_head        failed_paths;
+       spinlock_t lock;
+};
+
+struct path_info {
+       struct list_head        list;
+       struct dm_path          *path;
+       unsigned                repeat_count;
+       atomic_t                qlen;   /* the number of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+       struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+       if (s) {
+               INIT_LIST_HEAD(&s->valid_paths);
+               INIT_LIST_HEAD(&s->failed_paths);
+               spin_lock_init(&s->lock);
+       }
+
+       return s;
+}
+
+static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+       struct selector *s = alloc_selector();
+
+       if (!s)
+               return -ENOMEM;
+
+       ps->context = s;
+       return 0;
+}
+
+static void ql_free_paths(struct list_head *paths)
+{
+       struct path_info *pi, *next;
+
+       list_for_each_entry_safe(pi, next, paths, list) {
+               list_del(&pi->list);
+               kfree(pi);
+       }
+}
+
+static void ql_destroy(struct path_selector *ps)
+{
+       struct selector *s = ps->context;
+
+       ql_free_paths(&s->valid_paths);
+       ql_free_paths(&s->failed_paths);
+       kfree(s);
+       ps->context = NULL;
+}
+
+static int ql_status(struct path_selector *ps, struct dm_path *path,
+                    status_type_t type, char *result, unsigned maxlen)
+{
+       unsigned sz = 0;
+       struct path_info *pi;
+
+       /* When called with NULL path, return selector status/args. */
+       if (!path)
+               DMEMIT("0 ");
+       else {
+               pi = path->pscontext;
+
+               switch (type) {
+               case STATUSTYPE_INFO:
+                       DMEMIT("%d ", atomic_read(&pi->qlen));
+                       break;
+               case STATUSTYPE_TABLE:
+                       DMEMIT("%u ", pi->repeat_count);
+                       break;
+               }
+       }
+
+       return sz;
+}
+
+static int ql_add_path(struct path_selector *ps, struct dm_path *path,
+                      int argc, char **argv, char **error)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi;
+       unsigned repeat_count = QL_MIN_IO;
+       char dummy;
+       unsigned long flags;
+
+       /*
+        * Arguments: [<repeat_count>]
+        *      <repeat_count>: The number of I/Os before switching path.
+        *                      If not given, default (QL_MIN_IO) is used.
+        */
+       if (argc > 1) {
+               *error = "queue-length ps: incorrect number of arguments";
+               return -EINVAL;
+       }
+
+       if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
+               *error = "queue-length ps: invalid repeat count";
+               return -EINVAL;
+       }
+
+       if (repeat_count > 1) {
+               DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+               repeat_count = 1;
+       }
+
+       /* Allocate the path information structure */
+       pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+       if (!pi) {
+               *error = "queue-length ps: Error allocating path information";
+               return -ENOMEM;
+       }
+
+       pi->path = path;
+       pi->repeat_count = repeat_count;
+       atomic_set(&pi->qlen, 0);
+
+       path->pscontext = pi;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_add_tail(&pi->list, &s->valid_paths);
+       spin_unlock_irqrestore(&s->lock, flags);
+
+       return 0;
+}
+
+static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = path->pscontext;
+       unsigned long flags;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_move(&pi->list, &s->failed_paths);
+       spin_unlock_irqrestore(&s->lock, flags);
+}
+
+static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = path->pscontext;
+       unsigned long flags;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_move_tail(&pi->list, &s->valid_paths);
+       spin_unlock_irqrestore(&s->lock, flags);
+
+       return 0;
+}
+
+/*
+ * Select a path having the minimum number of in-flight I/Os
+ */
+static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = NULL, *best = NULL;
+       struct dm_path *ret = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&s->lock, flags);
+       if (list_empty(&s->valid_paths))
+               goto out;
+
+       list_for_each_entry(pi, &s->valid_paths, list) {
+               if (!best ||
+                   (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
+                       best = pi;
+
+               if (!atomic_read(&best->qlen))
+                       break;
+       }
+
+       if (!best)
+               goto out;
+
+       /* Move most recently used to least preferred to evenly balance. */
+       list_move_tail(&best->list, &s->valid_paths);
+
+       ret = best->path;
+out:
+       spin_unlock_irqrestore(&s->lock, flags);
+       return ret;
+}
+
+static int ql_start_io(struct path_selector *ps, struct dm_path *path,
+                      size_t nr_bytes)
+{
+       struct path_info *pi = path->pscontext;
+
+       atomic_inc(&pi->qlen);
+
+       return 0;
+}
+
+static int ql_end_io(struct path_selector *ps, struct dm_path *path,
+                    size_t nr_bytes, u64 start_time)
+{
+       struct path_info *pi = path->pscontext;
+
+       atomic_dec(&pi->qlen);
+
+       return 0;
+}
+
+static struct path_selector_type ql_ps = {
+       .name           = "queue-length",
+       .module         = THIS_MODULE,
+       .table_args     = 1,
+       .info_args      = 1,
+       .create         = ql_create,
+       .destroy        = ql_destroy,
+       .status         = ql_status,
+       .add_path       = ql_add_path,
+       .fail_path      = ql_fail_path,
+       .reinstate_path = ql_reinstate_path,
+       .select_path    = ql_select_path,
+       .start_io       = ql_start_io,
+       .end_io         = ql_end_io,
+};
+
+static int __init dm_ql_init(void)
+{
+       int r = dm_register_path_selector(&ql_ps);
+
+       if (r < 0)
+               DMERR("register failed %d", r);
+
+       DMINFO("version " QL_VERSION " loaded");
+
+       return r;
+}
+
+static void __exit dm_ql_exit(void)
+{
+       int r = dm_unregister_path_selector(&ql_ps);
+
+       if (r < 0)
+               DMERR("unregister failed %d", r);
+}
+
+module_init(dm_ql_init);
+module_exit(dm_ql_exit);
+
+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
+MODULE_DESCRIPTION(
+       "(C) Copyright IBM Corp. 2004,2005   All Rights Reserved.\n"
+       DM_NAME " path selector to balance the number of in-flight I/Os"
+);
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-ps-round-robin.c b/drivers/md/dm-ps-round-robin.c
new file mode 100644 (file)
index 0000000..bdbb7e6
--- /dev/null
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2003 Sistina Software.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen
+ *
+ * This file is released under the GPL.
+ *
+ * Round-robin path selector.
+ */
+
+#include <linux/device-mapper.h>
+
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX "multipath round-robin"
+#define RR_MIN_IO     1
+#define RR_VERSION    "1.2.0"
+
+/*-----------------------------------------------------------------
+ * Path-handling code, paths are held in lists
+ *---------------------------------------------------------------*/
+struct path_info {
+       struct list_head list;
+       struct dm_path *path;
+       unsigned repeat_count;
+};
+
+static void free_paths(struct list_head *paths)
+{
+       struct path_info *pi, *next;
+
+       list_for_each_entry_safe(pi, next, paths, list) {
+               list_del(&pi->list);
+               kfree(pi);
+       }
+}
+
+/*-----------------------------------------------------------------
+ * Round-robin selector
+ *---------------------------------------------------------------*/
+
+struct selector {
+       struct list_head valid_paths;
+       struct list_head invalid_paths;
+       spinlock_t lock;
+};
+
+static struct selector *alloc_selector(void)
+{
+       struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+       if (s) {
+               INIT_LIST_HEAD(&s->valid_paths);
+               INIT_LIST_HEAD(&s->invalid_paths);
+               spin_lock_init(&s->lock);
+       }
+
+       return s;
+}
+
+static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+       struct selector *s;
+
+       s = alloc_selector();
+       if (!s)
+               return -ENOMEM;
+
+       ps->context = s;
+       return 0;
+}
+
+static void rr_destroy(struct path_selector *ps)
+{
+       struct selector *s = ps->context;
+
+       free_paths(&s->valid_paths);
+       free_paths(&s->invalid_paths);
+       kfree(s);
+       ps->context = NULL;
+}
+
+static int rr_status(struct path_selector *ps, struct dm_path *path,
+                    status_type_t type, char *result, unsigned int maxlen)
+{
+       struct path_info *pi;
+       int sz = 0;
+
+       if (!path)
+               DMEMIT("0 ");
+       else {
+               switch(type) {
+               case STATUSTYPE_INFO:
+                       break;
+               case STATUSTYPE_TABLE:
+                       pi = path->pscontext;
+                       DMEMIT("%u ", pi->repeat_count);
+                       break;
+               }
+       }
+
+       return sz;
+}
+
+/*
+ * Called during initialisation to register each path with an
+ * optional repeat_count.
+ */
+static int rr_add_path(struct path_selector *ps, struct dm_path *path,
+                      int argc, char **argv, char **error)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi;
+       unsigned repeat_count = RR_MIN_IO;
+       char dummy;
+       unsigned long flags;
+
+       if (argc > 1) {
+               *error = "round-robin ps: incorrect number of arguments";
+               return -EINVAL;
+       }
+
+       /* First path argument is number of I/Os before switching path */
+       if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
+               *error = "round-robin ps: invalid repeat count";
+               return -EINVAL;
+       }
+
+       if (repeat_count > 1) {
+               DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+               repeat_count = 1;
+       }
+
+       /* allocate the path */
+       pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+       if (!pi) {
+               *error = "round-robin ps: Error allocating path context";
+               return -ENOMEM;
+       }
+
+       pi->path = path;
+       pi->repeat_count = repeat_count;
+
+       path->pscontext = pi;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_add_tail(&pi->list, &s->valid_paths);
+       spin_unlock_irqrestore(&s->lock, flags);
+
+       return 0;
+}
+
+static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
+{
+       unsigned long flags;
+       struct selector *s = ps->context;
+       struct path_info *pi = p->pscontext;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_move(&pi->list, &s->invalid_paths);
+       spin_unlock_irqrestore(&s->lock, flags);
+}
+
+static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
+{
+       unsigned long flags;
+       struct selector *s = ps->context;
+       struct path_info *pi = p->pscontext;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_move(&pi->list, &s->valid_paths);
+       spin_unlock_irqrestore(&s->lock, flags);
+
+       return 0;
+}
+
+static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
+{
+       unsigned long flags;
+       struct selector *s = ps->context;
+       struct path_info *pi = NULL;
+
+       spin_lock_irqsave(&s->lock, flags);
+       if (!list_empty(&s->valid_paths)) {
+               pi = list_entry(s->valid_paths.next, struct path_info, list);
+               list_move_tail(&pi->list, &s->valid_paths);
+       }
+       spin_unlock_irqrestore(&s->lock, flags);
+
+       return pi ? pi->path : NULL;
+}
+
+static struct path_selector_type rr_ps = {
+       .name = "round-robin",
+       .module = THIS_MODULE,
+       .table_args = 1,
+       .info_args = 0,
+       .create = rr_create,
+       .destroy = rr_destroy,
+       .status = rr_status,
+       .add_path = rr_add_path,
+       .fail_path = rr_fail_path,
+       .reinstate_path = rr_reinstate_path,
+       .select_path = rr_select_path,
+};
+
+static int __init dm_rr_init(void)
+{
+       int r = dm_register_path_selector(&rr_ps);
+
+       if (r < 0)
+               DMERR("register failed %d", r);
+
+       DMINFO("version " RR_VERSION " loaded");
+
+       return r;
+}
+
+static void __exit dm_rr_exit(void)
+{
+       int r = dm_unregister_path_selector(&rr_ps);
+
+       if (r < 0)
+               DMERR("unregister failed %d", r);
+}
+
+module_init(dm_rr_init);
+module_exit(dm_rr_exit);
+
+MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector");
+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-ps-service-time.c b/drivers/md/dm-ps-service-time.c
new file mode 100644 (file)
index 0000000..9cfda66
--- /dev/null
@@ -0,0 +1,362 @@
+/*
+ * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
+ *
+ * Module Author: Kiyoshi Ueda
+ *
+ * This file is released under the GPL.
+ *
+ * Throughput oriented path selector.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX  "multipath service-time"
+#define ST_MIN_IO      1
+#define ST_MAX_RELATIVE_THROUGHPUT     100
+#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT       7
+#define ST_MAX_INFLIGHT_SIZE   ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
+#define ST_VERSION     "0.3.0"
+
+struct selector {
+       struct list_head valid_paths;
+       struct list_head failed_paths;
+       spinlock_t lock;
+};
+
+struct path_info {
+       struct list_head list;
+       struct dm_path *path;
+       unsigned repeat_count;
+       unsigned relative_throughput;
+       atomic_t in_flight_size;        /* Total size of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+       struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+       if (s) {
+               INIT_LIST_HEAD(&s->valid_paths);
+               INIT_LIST_HEAD(&s->failed_paths);
+               spin_lock_init(&s->lock);
+       }
+
+       return s;
+}
+
+static int st_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+       struct selector *s = alloc_selector();
+
+       if (!s)
+               return -ENOMEM;
+
+       ps->context = s;
+       return 0;
+}
+
+static void free_paths(struct list_head *paths)
+{
+       struct path_info *pi, *next;
+
+       list_for_each_entry_safe(pi, next, paths, list) {
+               list_del(&pi->list);
+               kfree(pi);
+       }
+}
+
+static void st_destroy(struct path_selector *ps)
+{
+       struct selector *s = ps->context;
+
+       free_paths(&s->valid_paths);
+       free_paths(&s->failed_paths);
+       kfree(s);
+       ps->context = NULL;
+}
+
+static int st_status(struct path_selector *ps, struct dm_path *path,
+                    status_type_t type, char *result, unsigned maxlen)
+{
+       unsigned sz = 0;
+       struct path_info *pi;
+
+       if (!path)
+               DMEMIT("0 ");
+       else {
+               pi = path->pscontext;
+
+               switch (type) {
+               case STATUSTYPE_INFO:
+                       DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
+                              pi->relative_throughput);
+                       break;
+               case STATUSTYPE_TABLE:
+                       DMEMIT("%u %u ", pi->repeat_count,
+                              pi->relative_throughput);
+                       break;
+               }
+       }
+
+       return sz;
+}
+
+static int st_add_path(struct path_selector *ps, struct dm_path *path,
+                      int argc, char **argv, char **error)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi;
+       unsigned repeat_count = ST_MIN_IO;
+       unsigned relative_throughput = 1;
+       char dummy;
+       unsigned long flags;
+
+       /*
+        * Arguments: [<repeat_count> [<relative_throughput>]]
+        *      <repeat_count>: The number of I/Os before switching path.
+        *                      If not given, default (ST_MIN_IO) is used.
+        *      <relative_throughput>: The relative throughput value of
+        *                      the path among all paths in the path-group.
+        *                      The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
+        *                      If not given, minimum value '1' is used.
+        *                      If '0' is given, the path isn't selected while
+        *                      other paths having a positive value are
+        *                      available.
+        */
+       if (argc > 2) {
+               *error = "service-time ps: incorrect number of arguments";
+               return -EINVAL;
+       }
+
+       if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
+               *error = "service-time ps: invalid repeat count";
+               return -EINVAL;
+       }
+
+       if (repeat_count > 1) {
+               DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+               repeat_count = 1;
+       }
+
+       if ((argc == 2) &&
+           (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
+            relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
+               *error = "service-time ps: invalid relative_throughput value";
+               return -EINVAL;
+       }
+
+       /* allocate the path */
+       pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+       if (!pi) {
+               *error = "service-time ps: Error allocating path context";
+               return -ENOMEM;
+       }
+
+       pi->path = path;
+       pi->repeat_count = repeat_count;
+       pi->relative_throughput = relative_throughput;
+       atomic_set(&pi->in_flight_size, 0);
+
+       path->pscontext = pi;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_add_tail(&pi->list, &s->valid_paths);
+       spin_unlock_irqrestore(&s->lock, flags);
+
+       return 0;
+}
+
+static void st_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = path->pscontext;
+       unsigned long flags;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_move(&pi->list, &s->failed_paths);
+       spin_unlock_irqrestore(&s->lock, flags);
+}
+
+static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = path->pscontext;
+       unsigned long flags;
+
+       spin_lock_irqsave(&s->lock, flags);
+       list_move_tail(&pi->list, &s->valid_paths);
+       spin_unlock_irqrestore(&s->lock, flags);
+
+       return 0;
+}
+
+/*
+ * Compare the estimated service time of 2 paths, pi1 and pi2,
+ * for the incoming I/O.
+ *
+ * Returns:
+ * < 0 : pi1 is better
+ * 0   : no difference between pi1 and pi2
+ * > 0 : pi2 is better
+ *
+ * Description:
+ * Basically, the service time is estimated by:
+ *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
+ * To reduce the calculation, some optimizations are made.
+ * (See comments inline)
+ */
+static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
+                          size_t incoming)
+{
+       size_t sz1, sz2, st1, st2;
+
+       sz1 = atomic_read(&pi1->in_flight_size);
+       sz2 = atomic_read(&pi2->in_flight_size);
+
+       /*
+        * Case 1: Both have same throughput value. Choose less loaded path.
+        */
+       if (pi1->relative_throughput == pi2->relative_throughput)
+               return sz1 - sz2;
+
+       /*
+        * Case 2a: Both have same load. Choose higher throughput path.
+        * Case 2b: One path has no throughput value. Choose the other one.
+        */
+       if (sz1 == sz2 ||
+           !pi1->relative_throughput || !pi2->relative_throughput)
+               return pi2->relative_throughput - pi1->relative_throughput;
+
+       /*
+        * Case 3: Calculate service time. Choose faster path.
+        *         Service time using pi1:
+        *             st1 = (sz1 + incoming) / pi1->relative_throughput
+        *         Service time using pi2:
+        *             st2 = (sz2 + incoming) / pi2->relative_throughput
+        *
+        *         To avoid the division, transform the expression to use
+        *         multiplication.
+        *         Because ->relative_throughput > 0 here, if st1 < st2,
+        *         the two expressions below are equivalent:
+        *             (sz1 + incoming) / pi1->relative_throughput <
+        *                 (sz2 + incoming) / pi2->relative_throughput
+        *             (sz1 + incoming) * pi2->relative_throughput <
+        *                 (sz2 + incoming) * pi1->relative_throughput
+        *         So use the latter one.
+        */
+       sz1 += incoming;
+       sz2 += incoming;
+       if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
+                    sz2 >= ST_MAX_INFLIGHT_SIZE)) {
+               /*
+                * The sizes may be too big for the multiplication by
+                * pi->relative_throughput and could overflow.
+                * To avoid the overflow and mis-selection, shift both down.
+                */
+               sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+               sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+       }
+       st1 = sz1 * pi2->relative_throughput;
+       st2 = sz2 * pi1->relative_throughput;
+       if (st1 != st2)
+               return st1 - st2;
+
+       /*
+        * Case 4: Service time is equal. Choose higher throughput path.
+        */
+       return pi2->relative_throughput - pi1->relative_throughput;
+}
+
+static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = NULL, *best = NULL;
+       struct dm_path *ret = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&s->lock, flags);
+       if (list_empty(&s->valid_paths))
+               goto out;
+
+       list_for_each_entry(pi, &s->valid_paths, list)
+               if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
+                       best = pi;
+
+       if (!best)
+               goto out;
+
+       /* Move most recently used to least preferred to evenly balance. */
+       list_move_tail(&best->list, &s->valid_paths);
+
+       ret = best->path;
+out:
+       spin_unlock_irqrestore(&s->lock, flags);
+       return ret;
+}
+
+static int st_start_io(struct path_selector *ps, struct dm_path *path,
+                      size_t nr_bytes)
+{
+       struct path_info *pi = path->pscontext;
+
+       atomic_add(nr_bytes, &pi->in_flight_size);
+
+       return 0;
+}
+
+static int st_end_io(struct path_selector *ps, struct dm_path *path,
+                    size_t nr_bytes, u64 start_time)
+{
+       struct path_info *pi = path->pscontext;
+
+       atomic_sub(nr_bytes, &pi->in_flight_size);
+
+       return 0;
+}
+
+static struct path_selector_type st_ps = {
+       .name           = "service-time",
+       .module         = THIS_MODULE,
+       .table_args     = 2,
+       .info_args      = 2,
+       .create         = st_create,
+       .destroy        = st_destroy,
+       .status         = st_status,
+       .add_path       = st_add_path,
+       .fail_path      = st_fail_path,
+       .reinstate_path = st_reinstate_path,
+       .select_path    = st_select_path,
+       .start_io       = st_start_io,
+       .end_io         = st_end_io,
+};
+
+static int __init dm_st_init(void)
+{
+       int r = dm_register_path_selector(&st_ps);
+
+       if (r < 0)
+               DMERR("register failed %d", r);
+
+       DMINFO("version " ST_VERSION " loaded");
+
+       return r;
+}
+
+static void __exit dm_st_exit(void)
+{
+       int r = dm_unregister_path_selector(&st_ps);
+
+       if (r < 0)
+               DMERR("unregister failed %d", r);
+}
+
+module_init(dm_st_init);
+module_exit(dm_st_exit);
+
+MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
deleted file mode 100644 (file)
index 5fd018d..0000000
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Copyright (C) 2004-2005 IBM Corp.  All Rights Reserved.
- * Copyright (C) 2006-2009 NEC Corporation.
- *
- * dm-queue-length.c
- *
- * Module Author: Stefan Bader, IBM
- * Modified by: Kiyoshi Ueda, NEC
- *
- * This file is released under the GPL.
- *
- * queue-length path selector - choose a path with the least number of
- * in-flight I/Os.
- */
-
-#include "dm.h"
-#include "dm-path-selector.h"
-
-#include <linux/slab.h>
-#include <linux/ctype.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/atomic.h>
-
-#define DM_MSG_PREFIX  "multipath queue-length"
-#define QL_MIN_IO      1
-#define QL_VERSION     "0.2.0"
-
-struct selector {
-       struct list_head        valid_paths;
-       struct list_head        failed_paths;
-       spinlock_t lock;
-};
-
-struct path_info {
-       struct list_head        list;
-       struct dm_path          *path;
-       unsigned                repeat_count;
-       atomic_t                qlen;   /* the number of in-flight I/Os */
-};
-
-static struct selector *alloc_selector(void)
-{
-       struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
-       if (s) {
-               INIT_LIST_HEAD(&s->valid_paths);
-               INIT_LIST_HEAD(&s->failed_paths);
-               spin_lock_init(&s->lock);
-       }
-
-       return s;
-}
-
-static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
-{
-       struct selector *s = alloc_selector();
-
-       if (!s)
-               return -ENOMEM;
-
-       ps->context = s;
-       return 0;
-}
-
-static void ql_free_paths(struct list_head *paths)
-{
-       struct path_info *pi, *next;
-
-       list_for_each_entry_safe(pi, next, paths, list) {
-               list_del(&pi->list);
-               kfree(pi);
-       }
-}
-
-static void ql_destroy(struct path_selector *ps)
-{
-       struct selector *s = ps->context;
-
-       ql_free_paths(&s->valid_paths);
-       ql_free_paths(&s->failed_paths);
-       kfree(s);
-       ps->context = NULL;
-}
-
-static int ql_status(struct path_selector *ps, struct dm_path *path,
-                    status_type_t type, char *result, unsigned maxlen)
-{
-       unsigned sz = 0;
-       struct path_info *pi;
-
-       /* When called with NULL path, return selector status/args. */
-       if (!path)
-               DMEMIT("0 ");
-       else {
-               pi = path->pscontext;
-
-               switch (type) {
-               case STATUSTYPE_INFO:
-                       DMEMIT("%d ", atomic_read(&pi->qlen));
-                       break;
-               case STATUSTYPE_TABLE:
-                       DMEMIT("%u ", pi->repeat_count);
-                       break;
-               }
-       }
-
-       return sz;
-}
-
-static int ql_add_path(struct path_selector *ps, struct dm_path *path,
-                      int argc, char **argv, char **error)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi;
-       unsigned repeat_count = QL_MIN_IO;
-       char dummy;
-       unsigned long flags;
-
-       /*
-        * Arguments: [<repeat_count>]
-        *      <repeat_count>: The number of I/Os before switching path.
-        *                      If not given, default (QL_MIN_IO) is used.
-        */
-       if (argc > 1) {
-               *error = "queue-length ps: incorrect number of arguments";
-               return -EINVAL;
-       }
-
-       if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
-               *error = "queue-length ps: invalid repeat count";
-               return -EINVAL;
-       }
-
-       if (repeat_count > 1) {
-               DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
-               repeat_count = 1;
-       }
-
-       /* Allocate the path information structure */
-       pi = kmalloc(sizeof(*pi), GFP_KERNEL);
-       if (!pi) {
-               *error = "queue-length ps: Error allocating path information";
-               return -ENOMEM;
-       }
-
-       pi->path = path;
-       pi->repeat_count = repeat_count;
-       atomic_set(&pi->qlen, 0);
-
-       path->pscontext = pi;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_add_tail(&pi->list, &s->valid_paths);
-       spin_unlock_irqrestore(&s->lock, flags);
-
-       return 0;
-}
-
-static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = path->pscontext;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_move(&pi->list, &s->failed_paths);
-       spin_unlock_irqrestore(&s->lock, flags);
-}
-
-static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = path->pscontext;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_move_tail(&pi->list, &s->valid_paths);
-       spin_unlock_irqrestore(&s->lock, flags);
-
-       return 0;
-}
-
-/*
- * Select a path having the minimum number of in-flight I/Os
- */
-static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = NULL, *best = NULL;
-       struct dm_path *ret = NULL;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->lock, flags);
-       if (list_empty(&s->valid_paths))
-               goto out;
-
-       list_for_each_entry(pi, &s->valid_paths, list) {
-               if (!best ||
-                   (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
-                       best = pi;
-
-               if (!atomic_read(&best->qlen))
-                       break;
-       }
-
-       if (!best)
-               goto out;
-
-       /* Move most recently used to least preferred to evenly balance. */
-       list_move_tail(&best->list, &s->valid_paths);
-
-       ret = best->path;
-out:
-       spin_unlock_irqrestore(&s->lock, flags);
-       return ret;
-}
-
-static int ql_start_io(struct path_selector *ps, struct dm_path *path,
-                      size_t nr_bytes)
-{
-       struct path_info *pi = path->pscontext;
-
-       atomic_inc(&pi->qlen);
-
-       return 0;
-}
-
-static int ql_end_io(struct path_selector *ps, struct dm_path *path,
-                    size_t nr_bytes, u64 start_time)
-{
-       struct path_info *pi = path->pscontext;
-
-       atomic_dec(&pi->qlen);
-
-       return 0;
-}
-
-static struct path_selector_type ql_ps = {
-       .name           = "queue-length",
-       .module         = THIS_MODULE,
-       .table_args     = 1,
-       .info_args      = 1,
-       .create         = ql_create,
-       .destroy        = ql_destroy,
-       .status         = ql_status,
-       .add_path       = ql_add_path,
-       .fail_path      = ql_fail_path,
-       .reinstate_path = ql_reinstate_path,
-       .select_path    = ql_select_path,
-       .start_io       = ql_start_io,
-       .end_io         = ql_end_io,
-};
-
-static int __init dm_ql_init(void)
-{
-       int r = dm_register_path_selector(&ql_ps);
-
-       if (r < 0)
-               DMERR("register failed %d", r);
-
-       DMINFO("version " QL_VERSION " loaded");
-
-       return r;
-}
-
-static void __exit dm_ql_exit(void)
-{
-       int r = dm_unregister_path_selector(&ql_ps);
-
-       if (r < 0)
-               DMERR("unregister failed %d", r);
-}
-
-module_init(dm_ql_init);
-module_exit(dm_ql_exit);
-
-MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
-MODULE_DESCRIPTION(
-       "(C) Copyright IBM Corp. 2004,2005   All Rights Reserved.\n"
-       DM_NAME " path selector to balance the number of in-flight I/Os"
-);
-MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
deleted file mode 100644 (file)
index bdbb7e6..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Copyright (C) 2003 Sistina Software.
- * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
- *
- * Module Author: Heinz Mauelshagen
- *
- * This file is released under the GPL.
- *
- * Round-robin path selector.
- */
-
-#include <linux/device-mapper.h>
-
-#include "dm-path-selector.h"
-
-#include <linux/slab.h>
-#include <linux/module.h>
-
-#define DM_MSG_PREFIX "multipath round-robin"
-#define RR_MIN_IO     1
-#define RR_VERSION    "1.2.0"
-
-/*-----------------------------------------------------------------
- * Path-handling code, paths are held in lists
- *---------------------------------------------------------------*/
-struct path_info {
-       struct list_head list;
-       struct dm_path *path;
-       unsigned repeat_count;
-};
-
-static void free_paths(struct list_head *paths)
-{
-       struct path_info *pi, *next;
-
-       list_for_each_entry_safe(pi, next, paths, list) {
-               list_del(&pi->list);
-               kfree(pi);
-       }
-}
-
-/*-----------------------------------------------------------------
- * Round-robin selector
- *---------------------------------------------------------------*/
-
-struct selector {
-       struct list_head valid_paths;
-       struct list_head invalid_paths;
-       spinlock_t lock;
-};
-
-static struct selector *alloc_selector(void)
-{
-       struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
-       if (s) {
-               INIT_LIST_HEAD(&s->valid_paths);
-               INIT_LIST_HEAD(&s->invalid_paths);
-               spin_lock_init(&s->lock);
-       }
-
-       return s;
-}
-
-static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
-{
-       struct selector *s;
-
-       s = alloc_selector();
-       if (!s)
-               return -ENOMEM;
-
-       ps->context = s;
-       return 0;
-}
-
-static void rr_destroy(struct path_selector *ps)
-{
-       struct selector *s = ps->context;
-
-       free_paths(&s->valid_paths);
-       free_paths(&s->invalid_paths);
-       kfree(s);
-       ps->context = NULL;
-}
-
-static int rr_status(struct path_selector *ps, struct dm_path *path,
-                    status_type_t type, char *result, unsigned int maxlen)
-{
-       struct path_info *pi;
-       int sz = 0;
-
-       if (!path)
-               DMEMIT("0 ");
-       else {
-               switch(type) {
-               case STATUSTYPE_INFO:
-                       break;
-               case STATUSTYPE_TABLE:
-                       pi = path->pscontext;
-                       DMEMIT("%u ", pi->repeat_count);
-                       break;
-               }
-       }
-
-       return sz;
-}
-
-/*
- * Called during initialisation to register each path with an
- * optional repeat_count.
- */
-static int rr_add_path(struct path_selector *ps, struct dm_path *path,
-                      int argc, char **argv, char **error)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi;
-       unsigned repeat_count = RR_MIN_IO;
-       char dummy;
-       unsigned long flags;
-
-       if (argc > 1) {
-               *error = "round-robin ps: incorrect number of arguments";
-               return -EINVAL;
-       }
-
-       /* First path argument is number of I/Os before switching path */
-       if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
-               *error = "round-robin ps: invalid repeat count";
-               return -EINVAL;
-       }
-
-       if (repeat_count > 1) {
-               DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
-               repeat_count = 1;
-       }
-
-       /* allocate the path */
-       pi = kmalloc(sizeof(*pi), GFP_KERNEL);
-       if (!pi) {
-               *error = "round-robin ps: Error allocating path context";
-               return -ENOMEM;
-       }
-
-       pi->path = path;
-       pi->repeat_count = repeat_count;
-
-       path->pscontext = pi;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_add_tail(&pi->list, &s->valid_paths);
-       spin_unlock_irqrestore(&s->lock, flags);
-
-       return 0;
-}
-
-static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
-{
-       unsigned long flags;
-       struct selector *s = ps->context;
-       struct path_info *pi = p->pscontext;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_move(&pi->list, &s->invalid_paths);
-       spin_unlock_irqrestore(&s->lock, flags);
-}
-
-static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
-{
-       unsigned long flags;
-       struct selector *s = ps->context;
-       struct path_info *pi = p->pscontext;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_move(&pi->list, &s->valid_paths);
-       spin_unlock_irqrestore(&s->lock, flags);
-
-       return 0;
-}
-
-static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
-{
-       unsigned long flags;
-       struct selector *s = ps->context;
-       struct path_info *pi = NULL;
-
-       spin_lock_irqsave(&s->lock, flags);
-       if (!list_empty(&s->valid_paths)) {
-               pi = list_entry(s->valid_paths.next, struct path_info, list);
-               list_move_tail(&pi->list, &s->valid_paths);
-       }
-       spin_unlock_irqrestore(&s->lock, flags);
-
-       return pi ? pi->path : NULL;
-}
-
-static struct path_selector_type rr_ps = {
-       .name = "round-robin",
-       .module = THIS_MODULE,
-       .table_args = 1,
-       .info_args = 0,
-       .create = rr_create,
-       .destroy = rr_destroy,
-       .status = rr_status,
-       .add_path = rr_add_path,
-       .fail_path = rr_fail_path,
-       .reinstate_path = rr_reinstate_path,
-       .select_path = rr_select_path,
-};
-
-static int __init dm_rr_init(void)
-{
-       int r = dm_register_path_selector(&rr_ps);
-
-       if (r < 0)
-               DMERR("register failed %d", r);
-
-       DMINFO("version " RR_VERSION " loaded");
-
-       return r;
-}
-
-static void __exit dm_rr_exit(void)
-{
-       int r = dm_unregister_path_selector(&rr_ps);
-
-       if (r < 0)
-               DMERR("unregister failed %d", r);
-}
-
-module_init(dm_rr_init);
-module_exit(dm_rr_exit);
-
-MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector");
-MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
deleted file mode 100644 (file)
index 9cfda66..0000000
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
- *
- * Module Author: Kiyoshi Ueda
- *
- * This file is released under the GPL.
- *
- * Throughput oriented path selector.
- */
-
-#include "dm.h"
-#include "dm-path-selector.h"
-
-#include <linux/slab.h>
-#include <linux/module.h>
-
-#define DM_MSG_PREFIX  "multipath service-time"
-#define ST_MIN_IO      1
-#define ST_MAX_RELATIVE_THROUGHPUT     100
-#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT       7
-#define ST_MAX_INFLIGHT_SIZE   ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
-#define ST_VERSION     "0.3.0"
-
-struct selector {
-       struct list_head valid_paths;
-       struct list_head failed_paths;
-       spinlock_t lock;
-};
-
-struct path_info {
-       struct list_head list;
-       struct dm_path *path;
-       unsigned repeat_count;
-       unsigned relative_throughput;
-       atomic_t in_flight_size;        /* Total size of in-flight I/Os */
-};
-
-static struct selector *alloc_selector(void)
-{
-       struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
-       if (s) {
-               INIT_LIST_HEAD(&s->valid_paths);
-               INIT_LIST_HEAD(&s->failed_paths);
-               spin_lock_init(&s->lock);
-       }
-
-       return s;
-}
-
-static int st_create(struct path_selector *ps, unsigned argc, char **argv)
-{
-       struct selector *s = alloc_selector();
-
-       if (!s)
-               return -ENOMEM;
-
-       ps->context = s;
-       return 0;
-}
-
-static void free_paths(struct list_head *paths)
-{
-       struct path_info *pi, *next;
-
-       list_for_each_entry_safe(pi, next, paths, list) {
-               list_del(&pi->list);
-               kfree(pi);
-       }
-}
-
-static void st_destroy(struct path_selector *ps)
-{
-       struct selector *s = ps->context;
-
-       free_paths(&s->valid_paths);
-       free_paths(&s->failed_paths);
-       kfree(s);
-       ps->context = NULL;
-}
-
-static int st_status(struct path_selector *ps, struct dm_path *path,
-                    status_type_t type, char *result, unsigned maxlen)
-{
-       unsigned sz = 0;
-       struct path_info *pi;
-
-       if (!path)
-               DMEMIT("0 ");
-       else {
-               pi = path->pscontext;
-
-               switch (type) {
-               case STATUSTYPE_INFO:
-                       DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
-                              pi->relative_throughput);
-                       break;
-               case STATUSTYPE_TABLE:
-                       DMEMIT("%u %u ", pi->repeat_count,
-                              pi->relative_throughput);
-                       break;
-               }
-       }
-
-       return sz;
-}
-
-static int st_add_path(struct path_selector *ps, struct dm_path *path,
-                      int argc, char **argv, char **error)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi;
-       unsigned repeat_count = ST_MIN_IO;
-       unsigned relative_throughput = 1;
-       char dummy;
-       unsigned long flags;
-
-       /*
-        * Arguments: [<repeat_count> [<relative_throughput>]]
-        *      <repeat_count>: The number of I/Os before switching path.
-        *                      If not given, default (ST_MIN_IO) is used.
-        *      <relative_throughput>: The relative throughput value of
-        *                      the path among all paths in the path-group.
-        *                      The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
-        *                      If not given, minimum value '1' is used.
-        *                      If '0' is given, the path isn't selected while
-        *                      other paths having a positive value are
-        *                      available.
-        */
-       if (argc > 2) {
-               *error = "service-time ps: incorrect number of arguments";
-               return -EINVAL;
-       }
-
-       if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
-               *error = "service-time ps: invalid repeat count";
-               return -EINVAL;
-       }
-
-       if (repeat_count > 1) {
-               DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
-               repeat_count = 1;
-       }
-
-       if ((argc == 2) &&
-           (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
-            relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
-               *error = "service-time ps: invalid relative_throughput value";
-               return -EINVAL;
-       }
-
-       /* allocate the path */
-       pi = kmalloc(sizeof(*pi), GFP_KERNEL);
-       if (!pi) {
-               *error = "service-time ps: Error allocating path context";
-               return -ENOMEM;
-       }
-
-       pi->path = path;
-       pi->repeat_count = repeat_count;
-       pi->relative_throughput = relative_throughput;
-       atomic_set(&pi->in_flight_size, 0);
-
-       path->pscontext = pi;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_add_tail(&pi->list, &s->valid_paths);
-       spin_unlock_irqrestore(&s->lock, flags);
-
-       return 0;
-}
-
-static void st_fail_path(struct path_selector *ps, struct dm_path *path)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = path->pscontext;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_move(&pi->list, &s->failed_paths);
-       spin_unlock_irqrestore(&s->lock, flags);
-}
-
-static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = path->pscontext;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->lock, flags);
-       list_move_tail(&pi->list, &s->valid_paths);
-       spin_unlock_irqrestore(&s->lock, flags);
-
-       return 0;
-}
-
-/*
- * Compare the estimated service time of 2 paths, pi1 and pi2,
- * for the incoming I/O.
- *
- * Returns:
- * < 0 : pi1 is better
- * 0   : no difference between pi1 and pi2
- * > 0 : pi2 is better
- *
- * Description:
- * Basically, the service time is estimated by:
- *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
- * To reduce the calculation, some optimizations are made.
- * (See comments inline)
- */
-static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
-                          size_t incoming)
-{
-       size_t sz1, sz2, st1, st2;
-
-       sz1 = atomic_read(&pi1->in_flight_size);
-       sz2 = atomic_read(&pi2->in_flight_size);
-
-       /*
-        * Case 1: Both have same throughput value. Choose less loaded path.
-        */
-       if (pi1->relative_throughput == pi2->relative_throughput)
-               return sz1 - sz2;
-
-       /*
-        * Case 2a: Both have same load. Choose higher throughput path.
-        * Case 2b: One path has no throughput value. Choose the other one.
-        */
-       if (sz1 == sz2 ||
-           !pi1->relative_throughput || !pi2->relative_throughput)
-               return pi2->relative_throughput - pi1->relative_throughput;
-
-       /*
-        * Case 3: Calculate service time. Choose faster path.
-        *         Service time using pi1:
-        *             st1 = (sz1 + incoming) / pi1->relative_throughput
-        *         Service time using pi2:
-        *             st2 = (sz2 + incoming) / pi2->relative_throughput
-        *
-        *         To avoid the division, transform the expression to use
-        *         multiplication.
-        *         Because ->relative_throughput > 0 here, if st1 < st2,
-        *         the two expressions below are equivalent:
-        *             (sz1 + incoming) / pi1->relative_throughput <
-        *                 (sz2 + incoming) / pi2->relative_throughput
-        *             (sz1 + incoming) * pi2->relative_throughput <
-        *                 (sz2 + incoming) * pi1->relative_throughput
-        *         So use the latter one.
-        */
-       sz1 += incoming;
-       sz2 += incoming;
-       if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
-                    sz2 >= ST_MAX_INFLIGHT_SIZE)) {
-               /*
-                * The sizes may be too big for the multiplication by
-                * pi->relative_throughput and could overflow.
-                * To avoid the overflow and mis-selection, shift both down.
-                */
-               sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
-               sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
-       }
-       st1 = sz1 * pi2->relative_throughput;
-       st2 = sz2 * pi1->relative_throughput;
-       if (st1 != st2)
-               return st1 - st2;
-
-       /*
-        * Case 4: Service time is equal. Choose higher throughput path.
-        */
-       return pi2->relative_throughput - pi1->relative_throughput;
-}
-
-static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes)
-{
-       struct selector *s = ps->context;
-       struct path_info *pi = NULL, *best = NULL;
-       struct dm_path *ret = NULL;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->lock, flags);
-       if (list_empty(&s->valid_paths))
-               goto out;
-
-       list_for_each_entry(pi, &s->valid_paths, list)
-               if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
-                       best = pi;
-
-       if (!best)
-               goto out;
-
-       /* Move most recently used to least preferred to evenly balance. */
-       list_move_tail(&best->list, &s->valid_paths);
-
-       ret = best->path;
-out:
-       spin_unlock_irqrestore(&s->lock, flags);
-       return ret;
-}
-
-static int st_start_io(struct path_selector *ps, struct dm_path *path,
-                      size_t nr_bytes)
-{
-       struct path_info *pi = path->pscontext;
-
-       atomic_add(nr_bytes, &pi->in_flight_size);
-
-       return 0;
-}
-
-static int st_end_io(struct path_selector *ps, struct dm_path *path,
-                    size_t nr_bytes, u64 start_time)
-{
-       struct path_info *pi = path->pscontext;
-
-       atomic_sub(nr_bytes, &pi->in_flight_size);
-
-       return 0;
-}
-
-static struct path_selector_type st_ps = {
-       .name           = "service-time",
-       .module         = THIS_MODULE,
-       .table_args     = 2,
-       .info_args      = 2,
-       .create         = st_create,
-       .destroy        = st_destroy,
-       .status         = st_status,
-       .add_path       = st_add_path,
-       .fail_path      = st_fail_path,
-       .reinstate_path = st_reinstate_path,
-       .select_path    = st_select_path,
-       .start_io       = st_start_io,
-       .end_io         = st_end_io,
-};
-
-static int __init dm_st_init(void)
-{
-       int r = dm_register_path_selector(&st_ps);
-
-       if (r < 0)
-               DMERR("register failed %d", r);
-
-       DMINFO("version " ST_VERSION " loaded");
-
-       return r;
-}
-
-static void __exit dm_st_exit(void)
-{
-       int r = dm_unregister_path_selector(&st_ps);
-
-       if (r < 0)
-               DMERR("unregister failed %d", r);
-}
-
-module_init(dm_st_init);
-module_exit(dm_st_exit);
-
-MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
-MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
-MODULE_LICENSE("GPL");