dm mpath: add IO affinity path selector
authorMike Christie <michael.christie@oracle.com>
Fri, 23 Oct 2020 00:27:50 +0000 (19:27 -0500)
committerMike Snitzer <snitzer@redhat.com>
Fri, 4 Dec 2020 23:04:35 +0000 (18:04 -0500)
This patch adds a path selector that selects paths based on a CPU to
path mapping the user passes in and what CPU we are executing on. The
primary user for this PS is where the app is optimized to use specific
CPUs so other PSs undo the apps handy work, and the storage and it's
transport are not a bottlneck.

For these io-affinity PS setups a path's transport/interconnect
perf is not going to flucuate a lot and there is no major differences
between paths, so QL/HST smarts do not help and RR always messes up
what the app is trying to do.

On a system with 16 cores, where you have a job per CPU:

fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128 --numjobs=16

and a dm-multipath device setup where each CPU is mapped to one path:

// When in mq mode I had to set dm_mq_nr_hw_queues=$NUM_PATHS.
// Bio mode also showed similar results.
16777216 multipath 0 0 1 1 io-affinity 0 16 1 8:16 1 8:32 2 8:64 4
8:48 8 8:80 10 8:96 20 8:112 40 8:128 80 8:144 100 8:160 200 8:176
400 8:192 800 8:208 1000 8:224 2000 8:240 4000 65:0 8000

we can see a IOPs increase of 25%.

The percent increase depends on the device and interconnect. For a
slower/medium speed path/device that can do around 180K IOPs a path
if you ran that fio command to it directly we saw a 25% increase like
above. Slower path'd devices that could do around 90K per path showed
maybe around a 2 - 5% increase. If you use something like null_blk or
scsi_debug which can multi-million IOPs and hack it up so each device
they export shows up as a path then you see 50%+ increases.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/dm-io-affinity.c [new file with mode: 0644]

index 1d68935..b7e2d96 100644 (file)
@@ -463,6 +463,15 @@ config DM_MULTIPATH_HST
 
          If unsure, say N.
 
+config DM_MULTIPATH_IOA
+       tristate "I/O Path Selector based on CPU submission"
+       depends on DM_MULTIPATH
+       help
+         This path selector selects the path based on the CPU the IO is
+         executed on and the CPU to path mapping setup at path addition time.
+
+         If unsure, say N.
+
 config DM_DELAY
        tristate "I/O delaying target"
        depends on BLK_DEV_DM
index 6d3e234..4f95f33 100644 (file)
@@ -59,6 +59,7 @@ obj-$(CONFIG_DM_MULTIPATH)    += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)  += dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)  += dm-service-time.o
 obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o
+obj-$(CONFIG_DM_MULTIPATH_IOA) += dm-io-affinity.o
 obj-$(CONFIG_DM_SWITCH)                += dm-switch.o
 obj-$(CONFIG_DM_SNAPSHOT)      += dm-snapshot.o
 obj-$(CONFIG_DM_PERSISTENT_DATA)       += persistent-data/
diff --git a/drivers/md/dm-io-affinity.c b/drivers/md/dm-io-affinity.c
new file mode 100644 (file)
index 0000000..077655c
--- /dev/null
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Oracle Corporation
+ *
+ * Module Author: Mike Christie
+ */
+#include "dm-path-selector.h"
+
+#include <linux/device-mapper.h>
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX "multipath io-affinity"
+
+struct path_info {
+       struct dm_path *path;
+       cpumask_var_t cpumask;
+       refcount_t refcount;
+       bool failed;
+};
+
+struct selector {
+       struct path_info **path_map;
+       cpumask_var_t path_mask;
+       atomic_t map_misses;
+};
+
+static void ioa_free_path(struct selector *s, unsigned int cpu)
+{
+       struct path_info *pi = s->path_map[cpu];
+
+       if (!pi)
+               return;
+
+       if (refcount_dec_and_test(&pi->refcount)) {
+               cpumask_clear_cpu(cpu, s->path_mask);
+               free_cpumask_var(pi->cpumask);
+               kfree(pi);
+
+               s->path_map[cpu] = NULL;
+       }
+}
+
+static int ioa_add_path(struct path_selector *ps, struct dm_path *path,
+                       int argc, char **argv, char **error)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi = NULL;
+       unsigned int cpu;
+       int ret;
+
+       if (argc != 1) {
+               *error = "io-affinity ps: invalid number of arguments";
+               return -EINVAL;
+       }
+
+       pi = kzalloc(sizeof(*pi), GFP_KERNEL);
+       if (!pi) {
+               *error = "io-affinity ps: Error allocating path context";
+               return -ENOMEM;
+       }
+
+       pi->path = path;
+       path->pscontext = pi;
+       refcount_set(&pi->refcount, 1);
+
+       if (!zalloc_cpumask_var(&pi->cpumask, GFP_KERNEL)) {
+               *error = "io-affinity ps: Error allocating cpumask context";
+               ret = -ENOMEM;
+               goto free_pi;
+       }
+
+       ret = cpumask_parse(argv[0], pi->cpumask);
+       if (ret) {
+               *error = "io-affinity ps: invalid cpumask";
+               ret = -EINVAL;
+               goto free_mask;
+       }
+
+       for_each_cpu(cpu, pi->cpumask) {
+               if (cpu >= nr_cpu_ids) {
+                       DMWARN_LIMIT("Ignoring mapping for CPU %u. Max CPU is %u",
+                                    cpu, nr_cpu_ids);
+                       break;
+               }
+
+               if (s->path_map[cpu]) {
+                       DMWARN("CPU mapping for %u exists. Ignoring.", cpu);
+                       continue;
+               }
+
+               cpumask_set_cpu(cpu, s->path_mask);
+               s->path_map[cpu] = pi;
+               refcount_inc(&pi->refcount);
+               continue;
+       }
+
+       if (refcount_dec_and_test(&pi->refcount)) {
+               *error = "io-affinity ps: No new/valid CPU mapping found";
+               ret = -EINVAL;
+               goto free_mask;
+       }
+
+       return 0;
+
+free_mask:
+       free_cpumask_var(pi->cpumask);
+free_pi:
+       kfree(pi);
+       return ret;
+}
+
+static int ioa_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+       struct selector *s;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (!s)
+               return -ENOMEM;
+
+       s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *),
+                             GFP_KERNEL);
+       if (!s->path_map)
+               goto free_selector;
+
+       if (!zalloc_cpumask_var(&s->path_mask, GFP_KERNEL))
+               goto free_map;
+
+       atomic_set(&s->map_misses, 0);
+       ps->context = s;
+       return 0;
+
+free_map:
+       kfree(s->path_map);
+free_selector:
+       kfree(s);
+       return -ENOMEM;
+}
+
+static void ioa_destroy(struct path_selector *ps)
+{
+       struct selector *s = ps->context;
+       unsigned cpu;
+
+       for_each_cpu(cpu, s->path_mask)
+               ioa_free_path(s, cpu);
+
+       free_cpumask_var(s->path_mask);
+       kfree(s->path_map);
+       kfree(s);
+
+       ps->context = NULL;
+}
+
+static int ioa_status(struct path_selector *ps, struct dm_path *path,
+                     status_type_t type, char *result, unsigned int maxlen)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi;
+       int sz = 0;
+
+       if (!path) {
+               DMEMIT("0 ");
+               return sz;
+       }
+
+       switch(type) {
+       case STATUSTYPE_INFO:
+               DMEMIT("%d ", atomic_read(&s->map_misses));
+               break;
+       case STATUSTYPE_TABLE:
+               pi = path->pscontext;
+               DMEMIT("%*pb ", cpumask_pr_args(pi->cpumask));
+               break;
+       }
+
+       return sz;
+}
+
+static void ioa_fail_path(struct path_selector *ps, struct dm_path *p)
+{
+       struct path_info *pi = p->pscontext;
+
+       pi->failed = true;
+}
+
+static int ioa_reinstate_path(struct path_selector *ps, struct dm_path *p)
+{
+       struct path_info *pi = p->pscontext;
+
+       pi->failed = false;
+       return 0;
+}
+
+static struct dm_path *ioa_select_path(struct path_selector *ps,
+                                      size_t nr_bytes)
+{
+       unsigned int cpu, node;
+       struct selector *s = ps->context;
+       const struct cpumask *cpumask;
+       struct path_info *pi;
+       int i;
+
+       cpu = get_cpu();
+
+       pi = s->path_map[cpu];
+       if (pi && !pi->failed)
+               goto done;
+
+       /*
+        * Perf is not optimal, but we at least try the local node then just
+        * try not to fail.
+        */
+       if (!pi)
+               atomic_inc(&s->map_misses);
+
+       node = cpu_to_node(cpu);
+       cpumask = cpumask_of_node(node);
+       for_each_cpu(i, cpumask) {
+               pi = s->path_map[i];
+               if (pi && !pi->failed)
+                       goto done;
+       }
+
+       for_each_cpu(i, s->path_mask) {
+               pi = s->path_map[i];
+               if (pi && !pi->failed)
+                       goto done;
+       }
+       pi = NULL;
+
+done:
+       put_cpu();
+       return pi ? pi->path : NULL;
+}
+
+static struct path_selector_type ioa_ps = {
+       .name           = "io-affinity",
+       .module         = THIS_MODULE,
+       .table_args     = 1,
+       .info_args      = 1,
+       .create         = ioa_create,
+       .destroy        = ioa_destroy,
+       .status         = ioa_status,
+       .add_path       = ioa_add_path,
+       .fail_path      = ioa_fail_path,
+       .reinstate_path = ioa_reinstate_path,
+       .select_path    = ioa_select_path,
+};
+
+static int __init dm_ioa_init(void)
+{
+       int ret = dm_register_path_selector(&ioa_ps);
+
+       if (ret < 0)
+               DMERR("register failed %d", ret);
+       return ret;
+}
+
+static void __exit dm_ioa_exit(void)
+{
+       int ret = dm_unregister_path_selector(&ioa_ps);
+
+       if (ret < 0)
+               DMERR("unregister failed %d", ret);
+}
+
+module_init(dm_ioa_init);
+module_exit(dm_ioa_exit);
+
+MODULE_DESCRIPTION(DM_NAME " multipath path selector that selects paths based on the CPU IO is being executed on");
+MODULE_AUTHOR("Mike Christie <michael.christie@oracle.com>");
+MODULE_LICENSE("GPL");