workqueue: Add multiple affinity scopes and interface to select them

author Tejun Heo <tj@kernel.org>

Tue, 8 Aug 2023 01:57:24 +0000 (15:57 -1000)

committer Tejun Heo <tj@kernel.org>

Tue, 8 Aug 2023 01:57:24 +0000 (15:57 -1000)
author Tejun Heo <tj@kernel.org>
Tue, 8 Aug 2023 01:57:24 +0000 (15:57 -1000)
committer Tejun Heo <tj@kernel.org>
Tue, 8 Aug 2023 01:57:24 +0000 (15:57 -1000)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt

index 2b89cbc..732c5c7 100644 (file)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7007,6 +7007,18 @@
                         The default value of this parameter is determined by
                         the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT.
  
+        workqueue.default_affinity_scope=
+                       Select the default affinity scope to use for unbound
+                       workqueues. Can be one of "cpu", "smt", "cache",
+                       "numa" and "system". Default is "cache". For more
+                       information, see the Affinity Scopes section in
+                       Documentation/core-api/workqueue.rst.
+
+                       This can be updated after boot through the matching
+                       file under /sys/module/workqueue/parameters.
+                       However, the changed default will only apply to
+                       unbound workqueues created afterwards.
+
         workqueue.debug_force_rr_cpu
                         Workqueue used to implicitly guarantee that work
                         items queued without explicit CPU specified are put
diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst

index c9e46ac..56af317 100644 (file)
--- a/Documentation/core-api/workqueue.rst
+++ b/Documentation/core-api/workqueue.rst
@@ -347,6 +347,51 @@ Guidelines
    level of locality in wq operations and work item execution.
  
  
+Affinity Scopes
+===============
+
+An unbound workqueue groups CPUs according to its affinity scope to improve
+cache locality. For example, if a workqueue is using the default affinity
+scope of "cache", it will group CPUs according to last level cache
+boundaries. A work item queued on the workqueue will be processed by a
+worker running on one of the CPUs which share the last level cache with the
+issuing CPU.
+
+Workqueue currently supports the following five affinity scopes.
+
+``cpu``
+  CPUs are not grouped. A work item issued on one CPU is processed by a
+  worker on the same CPU. This makes unbound workqueues behave as per-cpu
+  workqueues without concurrency management.
+
+``smt``
+  CPUs are grouped according to SMT boundaries. This usually means that the
+  logical threads of each physical CPU core are grouped together.
+
+``cache``
+  CPUs are grouped according to cache boundaries. Which specific cache
+  boundary is used is determined by the arch code. L3 is used in a lot of
+  cases. This is the default affinity scope.
+
+``numa``
+  CPUs are grouped according to NUMA boundaries.
+
+``system``
+  All CPUs are put in the same group. Workqueue makes no effort to process a
+  work item on a CPU close to the issuing CPU.
+
+The default affinity scope can be changed with the module parameter
+``workqueue.default_affinity_scope`` and a specific workqueue's affinity
+scope can be changed using ``apply_workqueue_attrs()``.
+
+If ``WQ_SYSFS`` is set, the workqueue will have the following affinity scope
+related interface files under its
+``/sys/devices/virtual/workqueue/WQ_NAME/`` directory.
+
+``affinity_scope``
+  Read to see the current affinity scope. Write to change.
+
+
  Examining Configuration
  =======================
  
@@ -358,6 +403,24 @@ configuration, worker pools and how workqueues map to the pools: ::
    ===============
    wq_unbound_cpumask=0000000f
  
+  CPU
+    nr_pods  4
+    pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008
+    pod_node [0]=0 [1]=0 [2]=1 [3]=1
+    cpu_pod  [0]=0 [1]=1 [2]=2 [3]=3
+
+  SMT
+    nr_pods  4
+    pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008
+    pod_node [0]=0 [1]=0 [2]=1 [3]=1
+    cpu_pod  [0]=0 [1]=1 [2]=2 [3]=3
+
+  CACHE (default)
+    nr_pods  2
+    pod_cpus [0]=00000003 [1]=0000000c
+    pod_node [0]=0 [1]=1
+    cpu_pod  [0]=0 [1]=0 [2]=1 [3]=1
+
    NUMA
      nr_pods  2
      pod_cpus [0]=00000003 [1]=0000000c
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h

index 180491e..568cfbc 100644 (file)
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -126,12 +126,15 @@ struct rcu_work {
  };
  
  enum wq_affn_scope {
+       WQ_AFFN_CPU,                    /* one pod per CPU */
+       WQ_AFFN_SMT,                    /* one pod per SMT */
+       WQ_AFFN_CACHE,                  /* one pod per LLC */
         WQ_AFFN_NUMA,                   /* one pod per NUMA node */
         WQ_AFFN_SYSTEM,                 /* one pod across the whole system */
  
         WQ_AFFN_NR_TYPES,
  
-       WQ_AFFN_DFL = WQ_AFFN_NUMA,
+       WQ_AFFN_DFL = WQ_AFFN_CACHE,
  };
  
  /**
diff --git a/kernel/workqueue.c b/kernel/workqueue.c

index a2cc043..8e3a499 100644 (file)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -338,6 +338,15 @@ struct wq_pod_type {
  };
  
  static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
+static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_DFL;
+
+static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
+       [WQ_AFFN_CPU]                   = "cpu",
+       [WQ_AFFN_SMT]                   = "smt",
+       [WQ_AFFN_CACHE]                 = "cache",
+       [WQ_AFFN_NUMA]                  = "numa",
+       [WQ_AFFN_SYSTEM]                = "system",
+};
  
  /*
   * Per-cpu work items which run for longer than the following threshold are
@@ -3664,7 +3673,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void)
                 goto fail;
  
         cpumask_copy(attrs->cpumask, cpu_possible_mask);
-       attrs->affn_scope = WQ_AFFN_DFL;
+       attrs->affn_scope = wq_affn_dfl;
         return attrs;
  fail:
         free_workqueue_attrs(attrs);
@@ -5777,19 +5786,55 @@ out_unlock:
         return ret;
  }
  
+static int parse_affn_scope(const char *val)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) {
+               if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i])))
+                       return i;
+       }
+       return -EINVAL;
+}
+
+static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
+{
+       int affn;
+
+       affn = parse_affn_scope(val);
+       if (affn < 0)
+               return affn;
+
+       wq_affn_dfl = affn;
+       return 0;
+}
+
+static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
+{
+       return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]);
+}
+
+static const struct kernel_param_ops wq_affn_dfl_ops = {
+       .set    = wq_affn_dfl_set,
+       .get    = wq_affn_dfl_get,
+};
+
+module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);
+
  #ifdef CONFIG_SYSFS
  /*
   * Workqueues with WQ_SYSFS flag set is visible to userland via
   * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
   * following attributes.
   *
- *  per_cpu    RO bool : whether the workqueue is per-cpu or unbound
- *  max_active RW int  : maximum number of in-flight work items
+ *  per_cpu            RO bool : whether the workqueue is per-cpu or unbound
+ *  max_active         RW int  : maximum number of in-flight work items
   *
   * Unbound workqueues have the following extra attributes.
   *
- *  nice       RW int  : nice value of the workers
- *  cpumask    RW mask : bitmask of allowed CPUs for the workers
+ *  nice               RW int  : nice value of the workers
+ *  cpumask            RW mask : bitmask of allowed CPUs for the workers
+ *  affinity_scope     RW str  : worker CPU affinity scope (cpu, smt, cache, numa, system)
   */
  struct wq_device {
         struct workqueue_struct         *wq;
@@ -5932,9 +5977,47 @@ out_unlock:
         return ret ?: count;
  }
  
+static ssize_t wq_affn_scope_show(struct device *dev,
+                                 struct device_attribute *attr, char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       int written;
+
+       mutex_lock(&wq->mutex);
+       written = scnprintf(buf, PAGE_SIZE, "%s\n",
+                           wq_affn_names[wq->unbound_attrs->affn_scope]);
+       mutex_unlock(&wq->mutex);
+
+       return written;
+}
+
+static ssize_t wq_affn_scope_store(struct device *dev,
+                                  struct device_attribute *attr,
+                                  const char *buf, size_t count)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       struct workqueue_attrs *attrs;
+       int affn, ret = -ENOMEM;
+
+       affn = parse_affn_scope(buf);
+       if (affn < 0)
+               return affn;
+
+       apply_wqattrs_lock();
+       attrs = wq_sysfs_prep_attrs(wq);
+       if (attrs) {
+               attrs->affn_scope = affn;
+               ret = apply_workqueue_attrs_locked(wq, attrs);
+       }
+       apply_wqattrs_unlock();
+       free_workqueue_attrs(attrs);
+       return ret ?: count;
+}
+
  static struct device_attribute wq_sysfs_unbound_attrs[] = {
         __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
         __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+       __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
         __ATTR_NULL,
  };
  
@@ -6537,6 +6620,20 @@ static void __init init_pod_type(struct wq_pod_type *pt,
         }
  }
  
+static bool __init cpus_dont_share(int cpu0, int cpu1)
+{
+       return false;
+}
+
+static bool __init cpus_share_smt(int cpu0, int cpu1)
+{
+#ifdef CONFIG_SCHED_SMT
+       return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
+#else
+       return false;
+#endif
+}
+
  static bool __init cpus_share_numa(int cpu0, int cpu1)
  {
         return cpu_to_node(cpu0) == cpu_to_node(cpu1);
@@ -6554,6 +6651,9 @@ void __init workqueue_init_topology(void)
         struct workqueue_struct *wq;
         int cpu;
  
+       init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
+       init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
+       init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
         init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
  
         mutex_lock(&wq_pool_mutex);
diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py

index ddd0bb4..43ab71a 100644 (file)
--- a/tools/workqueue/wq_dump.py
+++ b/tools/workqueue/wq_dump.py
@@ -78,11 +78,16 @@ worker_pool_idr         = prog['worker_pool_idr']
  workqueues              = prog['workqueues']
  wq_unbound_cpumask      = prog['wq_unbound_cpumask']
  wq_pod_types            = prog['wq_pod_types']
+wq_affn_dfl             = prog['wq_affn_dfl']
+wq_affn_names           = prog['wq_affn_names']
  
  WQ_UNBOUND              = prog['WQ_UNBOUND']
  WQ_ORDERED              = prog['__WQ_ORDERED']
  WQ_MEM_RECLAIM          = prog['WQ_MEM_RECLAIM']
  
+WQ_AFFN_CPU             = prog['WQ_AFFN_CPU']
+WQ_AFFN_SMT             = prog['WQ_AFFN_SMT']
+WQ_AFFN_CACHE           = prog['WQ_AFFN_CACHE']
  WQ_AFFN_NUMA            = prog['WQ_AFFN_NUMA']
  WQ_AFFN_SYSTEM          = prog['WQ_AFFN_SYSTEM']
  
@@ -109,12 +114,10 @@ def print_pod_type(pt):
          print(f' [{cpu}]={pt.cpu_pod[cpu].value_()}', end='')
      print('')
  
-print('')
-print('NUMA')
-print_pod_type(wq_pod_types[WQ_AFFN_NUMA])
-print('')
-print('SYSTEM')
-print_pod_type(wq_pod_types[WQ_AFFN_SYSTEM])
+for affn in [WQ_AFFN_CPU, WQ_AFFN_SMT, WQ_AFFN_CACHE, WQ_AFFN_NUMA, WQ_AFFN_SYSTEM]:
+    print('')
+    print(f'{wq_affn_names[affn].string_().decode().upper()}{" (default)" if affn == wq_affn_dfl else ""}')
+    print_pod_type(wq_pod_types[affn])
  
  print('')
  print('Worker Pools')
author	Tejun Heo <tj@kernel.org>
	Tue, 8 Aug 2023 01:57:24 +0000 (15:57 -1000)
committer	Tejun Heo <tj@kernel.org>
	Tue, 8 Aug 2023 01:57:24 +0000 (15:57 -1000)
Documentation/admin-guide/kernel-parameters.txt		patch \| blob \| history
Documentation/core-api/workqueue.rst		patch \| blob \| history
include/linux/workqueue.h		patch \| blob \| history
kernel/workqueue.c		patch \| blob \| history
tools/workqueue/wq_dump.py		patch \| blob \| history