cpu/hotplug: Cure the cpusets trainwreck

author Thomas Gleixner <tglx@linutronix.de>

Sat, 27 Mar 2021 21:01:36 +0000 (22:01 +0100)

committer Thomas Gleixner <tglx@linutronix.de>

Mon, 21 Jun 2021 08:31:06 +0000 (10:31 +0200)
author Thomas Gleixner <tglx@linutronix.de>
Sat, 27 Mar 2021 21:01:36 +0000 (22:01 +0100)
committer Thomas Gleixner <tglx@linutronix.de>
Mon, 21 Jun 2021 08:31:06 +0000 (10:31 +0200)
diff --git a/kernel/cpu.c b/kernel/cpu.c

index e538518..d2e1692 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -32,6 +32,7 @@
  #include <linux/relay.h>
  #include <linux/slab.h>
  #include <linux/percpu-rwsem.h>
+#include <linux/cpuset.h>
  
  #include <trace/events/power.h>
  #define CREATE_TRACE_POINTS
@@ -873,6 +874,52 @@ void __init cpuhp_threads_init(void)
         kthread_unpark(this_cpu_read(cpuhp_state.thread));
  }
  
+/*
+ *
+ * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
+ * protected region.
+ *
+ * The operation is still serialized against concurrent CPU hotplug via
+ * cpu_add_remove_lock, i.e. CPU map protection.  But it is _not_
+ * serialized against other hotplug related activity like adding or
+ * removing of state callbacks and state instances, which invoke either the
+ * startup or the teardown callback of the affected state.
+ *
+ * This is required for subsystems which are unfixable vs. CPU hotplug and
+ * evade lock inversion problems by scheduling work which has to be
+ * completed _before_ cpu_up()/_cpu_down() returns.
+ *
+ * Don't even think about adding anything to this for any new code or even
+ * drivers. Its only purpose is to keep existing lock order trainwrecks
+ * working.
+ *
+ * For cpu_down() there might be valid reasons to finish cleanups which are
+ * not required to be done under cpu_hotplug_lock, but that's a different
+ * story and would not be invoked via this.
+ */
+static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
+{
+       /*
+        * cpusets delegate hotplug operations to a worker to "solve" the
+        * lock order problems. Wait for the worker, but only if tasks are
+        * _not_ frozen (suspend, hibernate) as that would wait forever.
+        *
+        * The wait is required because otherwise the hotplug operation
+        * returns with inconsistent state, which could even be observed in
+        * user space when a new CPU is brought up. The CPU plug uevent
+        * would be delivered and user space reacting to it would fail to
+        * move tasks to the newly plugged CPU up to the point where the
+        * work has finished because up to that point the newly plugged CPU
+        * is not assignable in cpusets/cgroups. On unplug that's not
+        * necessarily a visible issue, but it is still inconsistent state,
+        * which is the real problem that needs to be "fixed". The wait
+        * can't prevent the transient state between scheduling the work
+        * and returning from waiting for it.
+        */
+       if (!tasks_frozen)
+               cpuset_wait_for_hotplug();
+}
+
  #ifdef CONFIG_HOTPLUG_CPU
  #ifndef arch_clear_mm_cpumask_cpu
  #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
@@ -1108,6 +1155,7 @@ out:
          */
         lockup_detector_cleanup();
         arch_smt_update();
+       cpu_up_down_serialize_trainwrecks(tasks_frozen);
         return ret;
  }
  
@@ -1302,6 +1350,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
  out:
         cpus_write_unlock();
         arch_smt_update();
+       cpu_up_down_serialize_trainwrecks(tasks_frozen);
         return ret;
  }
author	Thomas Gleixner <tglx@linutronix.de>
	Sat, 27 Mar 2021 21:01:36 +0000 (22:01 +0100)
committer	Thomas Gleixner <tglx@linutronix.de>
	Mon, 21 Jun 2021 08:31:06 +0000 (10:31 +0200)