diff --git a/kernel/cpu.c b/kernel/cpu.c
index bbad5e3..8f6affd 100644
@@ -17,6 +17,7 @@
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include <linux/rcupdate.h>
+#include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/bug.h>
 #include <linux/kthread.h>
  * @rollback:  Perform a rollback
  * @single:    Single callback invocation
  * @bringup:   Single callback bringup or teardown selector
- * @cpu:       CPU number
  * @node:      Remote CPU node; for multi-instance, do a
  *             single entry callback for install/remove
  * @last:      For multi-instance rollback, remember how far we got
  * @cb_state:  The state for a single callback (install/uninstall)
  * @result:    Result of the operation
+ * @ap_sync_state:     State for AP synchronization
  * @done_up:   Signal completion to the issuer of the task for cpu-up
  * @done_down: Signal completion to the issuer of the task for cpu-down
  */
@@ -76,6 +77,7 @@ struct cpuhp_cpu_state {
        struct hlist_node       *last;
        enum cpuhp_state        cb_state;
        int                     result;
+       atomic_t                ap_sync_state;
        struct completion       done_up;
        struct completion       done_down;
 #endif
@@ -276,6 +278,182 @@ static bool cpuhp_is_atomic_state(enum cpuhp_state state)
        return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
 }
 
+/* Synchronization state management */
+enum cpuhp_sync_state {
+       SYNC_STATE_DEAD,
+       SYNC_STATE_KICKED,
+       SYNC_STATE_SHOULD_DIE,
+       SYNC_STATE_ALIVE,
+       SYNC_STATE_SHOULD_ONLINE,
+       SYNC_STATE_ONLINE,
+};
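+
+/*
+ * Rough sketch of how the helpers below drive this state
+ * (CONFIG_HOTPLUG_CORE_SYNC_FULL / _DEAD select which transitions apply):
+ *
+ *  Bringup:  DEAD/KICKED/ALIVE -> KICKED         cpuhp_can_boot_ap()    (BP)
+ *            KICKED            -> ALIVE          cpuhp_ap_sync_alive()  (AP)
+ *            ALIVE             -> SHOULD_ONLINE  cpuhp_bp_sync_alive()  (BP)
+ *            SHOULD_ONLINE     -> ONLINE         cpuhp_online_idle()    (AP)
+ *
+ *  Teardown: ONLINE            -> SHOULD_DIE     cpuhp_bp_sync_dead()   (BP)
+ *            SHOULD_DIE        -> DEAD           cpuhp_ap_report_dead() (AP)
+ */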
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC
+/**
+ * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown
+ * @state:     The synchronization state to set
+ *
+ * No synchronization point. Just an update of the synchronization state; the
+ * implied full barrier ensures that the AP's changes are visible before the
+ * control CPU proceeds.
+ */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state)
+{
+       atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+       (void)atomic_xchg(st, state);
+}
+
+void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); }
+
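+/*
+ * Wait for the synchronization state of @cpu to reach @state and then
+ * switch it to @next_state. Spins via arch_cpuhp_sync_state_poll() for
+ * the first millisecond, then sleeps 1-2ms per iteration and gives up
+ * after ten seconds, leaving the state unchanged. Returns true on
+ * success, false on timeout.
+ */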
+static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state,
+                                     enum cpuhp_sync_state next_state)
+{
+       atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+       ktime_t now, end, start = ktime_get();
+       int sync;
+
+       end = start + 10ULL * NSEC_PER_SEC;
+
+       sync = atomic_read(st);
+       while (1) {
+               if (sync == state) {
+                       if (!atomic_try_cmpxchg(st, &sync, next_state))
+                               continue;
+                       return true;
+               }
+
+               now = ktime_get();
+               if (now > end) {
+                       /* Timeout. Leave the state unchanged */
+                       return false;
+               } else if (now - start < NSEC_PER_MSEC) {
+                       /* Poll for one millisecond */
+                       arch_cpuhp_sync_state_poll();
+               } else {
+                       usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE);
+               }
+               sync = atomic_read(st);
+       }
+       return true;
+}
+#else  /* CONFIG_HOTPLUG_CORE_SYNC */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD
+/**
+ * cpuhp_ap_report_dead - Update synchronization state to DEAD
+ *
+ * No synchronization point. Just an update of the synchronization state.
+ */
+void cpuhp_ap_report_dead(void)
+{
+       cpuhp_ap_update_sync_state(SYNC_STATE_DEAD);
+}
+
+void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { }
+
+/*
+ * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down
+ * because the AP cannot issue complete() at this stage.
+ */
+static void cpuhp_bp_sync_dead(unsigned int cpu)
+{
+       atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+       int sync = atomic_read(st);
+
+       do {
+               /* CPU can have reported dead already. Don't overwrite that! */
+               if (sync == SYNC_STATE_DEAD)
+                       break;
+       } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE));
+
+       if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) {
+               /* CPU reached dead state. Invoke the cleanup function */
+               arch_cpuhp_cleanup_dead_cpu(cpu);
+               return;
+       }
+
+       /* No further action possible. Emit message and give up. */
+       pr_err("CPU%u failed to report dead state\n", cpu);
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+static inline void cpuhp_bp_sync_dead(unsigned int cpu) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL
+/**
+ * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive
+ *
+ * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits
+ * for the BP to release it.
+ */
+void cpuhp_ap_sync_alive(void)
+{
+       atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+       cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE);
+
+       /* Wait for the control CPU to release it. */
+       while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE)
+               cpu_relax();
+}
+
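+/*
+ * Check whether the AP is in a state from which a (re)boot attempt makes
+ * sense (properly dead, a failed earlier kick, or stuck waiting in
+ * cpuhp_ap_sync_alive()) and atomically mark it KICKED. Returns false if
+ * the CPU is in a limbo state which cannot be recovered.
+ */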
+static bool cpuhp_can_boot_ap(unsigned int cpu)
+{
+       atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+       int sync = atomic_read(st);
+
+again:
+       switch (sync) {
+       case SYNC_STATE_DEAD:
+               /* CPU is properly dead */
+               break;
+       case SYNC_STATE_KICKED:
+               /* CPU did not come up in previous attempt */
+               break;
+       case SYNC_STATE_ALIVE:
+               /* CPU is stuck in cpuhp_ap_sync_alive(). */
+               break;
+       default:
+               /* CPU failed to report online or dead and is in limbo state. */
+               return false;
+       }
+
+       /* Prepare for booting */
+       if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED))
+               goto again;
+
+       return true;
+}
+
+void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { }
+
+/*
+ * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up
+ * because the AP cannot issue complete() so early in the bringup.
+ */
+static int cpuhp_bp_sync_alive(unsigned int cpu)
+{
+       int ret = 0;
+
+       if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL))
+               return 0;
+
+       if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) {
+               pr_err("CPU%u failed to report alive state\n", cpu);
+               ret = -EIO;
+       }
+
+       /* Let the architecture cleanup the kick alive mechanics. */
+       arch_cpuhp_cleanup_kick_cpu(cpu);
+       return ret;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */
+static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; }
+static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */
+
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 bool cpuhp_tasks_frozen;
@@ -413,7 +591,10 @@ static void lockdep_release_cpus_lock(void)
 void __weak arch_smt_update(void) { }
 
 #ifdef CONFIG_HOTPLUG_SMT
+
 enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+static unsigned int cpu_smt_max_threads __ro_after_init;
+unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX;
 
 void __init cpu_smt_disable(bool force)
 {
@@ -427,16 +608,33 @@ void __init cpu_smt_disable(bool force)
                pr_info("SMT: disabled\n");
                cpu_smt_control = CPU_SMT_DISABLED;
        }
+       cpu_smt_num_threads = 1;
 }
 
 /*
  * The decision whether SMT is supported can only be done after the full
  * CPU identification. Called from architecture code.
  */
-void __init cpu_smt_check_topology(void)
+void __init cpu_smt_set_num_threads(unsigned int num_threads,
+                                   unsigned int max_threads)
 {
-       if (!topology_smt_supported())
+       WARN_ON(!num_threads || (num_threads > max_threads));
+
+       if (max_threads == 1)
                cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+
+       cpu_smt_max_threads = max_threads;
+
+       /*
+        * If SMT has been disabled via the kernel command line or SMT is
+        * not supported, set cpu_smt_num_threads to 1 for consistency.
+        * If enabled, take the architecture requested number of threads
+        * If SMT is enabled, take the architecture-requested number of threads
+        */
+       if (cpu_smt_control != CPU_SMT_ENABLED)
+               cpu_smt_num_threads = 1;
+       else if (num_threads < cpu_smt_num_threads)
+               cpu_smt_num_threads = num_threads;
 }
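+
+/*
+ * Illustrative only: an architecture without partial SMT states would
+ * typically pass the same detected sibling count for both arguments,
+ * e.g. cpu_smt_set_num_threads(nr_siblings, nr_siblings), in the spot
+ * where it previously called cpu_smt_check_topology(). nr_siblings is a
+ * placeholder name, not a reference to real arch code.
+ */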
 
 static int __init smt_cmdline_disable(char *str)
@@ -446,9 +644,31 @@ static int __init smt_cmdline_disable(char *str)
 }
 early_param("nosmt", smt_cmdline_disable);
 
-static inline bool cpu_smt_allowed(unsigned int cpu)
+/*
+ * For architectures supporting partial SMT states, check if the thread is allowed.
+ * Otherwise this has already been checked through cpu_smt_max_threads when
+ * setting the SMT level.
+ */
+static inline bool cpu_smt_thread_allowed(unsigned int cpu)
+{
+#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
+       return topology_smt_thread_allowed(cpu);
+#else
+       return true;
+#endif
+}
+
+static inline bool cpu_bootable(unsigned int cpu)
 {
-       if (cpu_smt_control == CPU_SMT_ENABLED)
+       if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+               return true;
+
+       /* All CPUs are bootable if controls are not configured */
+       if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED)
+               return true;
+
+       /* All CPUs are bootable if CPU is not SMT capable */
+       if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
                return true;
 
        if (topology_is_primary_thread(cpu))
@@ -463,15 +683,16 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
        return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
 }
 
-/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */
+/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */
 bool cpu_smt_possible(void)
 {
        return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
                cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
 }
 EXPORT_SYMBOL_GPL(cpu_smt_possible);
+
 #else
-static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+static inline bool cpu_bootable(unsigned int cpu) { return true; }
 #endif
 
 static inline enum cpuhp_state
@@ -558,7 +779,7 @@ static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
        return ret;
 }
 
-static int bringup_wait_for_ap(unsigned int cpu)
+static int bringup_wait_for_ap_online(unsigned int cpu)
 {
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 
@@ -574,43 +795,99 @@ static int bringup_wait_for_ap(unsigned int cpu)
         * SMT soft disabling on X86 requires to bring the CPU out of the
         * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit.  The
         * CPU marked itself as booted_once in notify_cpu_starting() so the
-        * cpu_smt_allowed() check will now return false if this is not the
+        * cpu_bootable() check will now return false if this is not the
         * primary sibling.
         */
-       if (!cpu_smt_allowed(cpu))
+       if (!cpu_bootable(cpu))
                return -ECANCELED;
+       return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+static int cpuhp_kick_ap_alive(unsigned int cpu)
+{
+       if (!cpuhp_can_boot_ap(cpu))
+               return -EAGAIN;
+
+       return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu));
+}
+
+static int cpuhp_bringup_ap(unsigned int cpu)
+{
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+       int ret;
+
+       /*
+        * Some architectures have to walk the irq descriptors to
+        * setup the vector space for the cpu which comes online.
+        * Prevent irq alloc/free across the bringup.
+        */
+       irq_lock_sparse();
+
+       ret = cpuhp_bp_sync_alive(cpu);
+       if (ret)
+               goto out_unlock;
+
+       ret = bringup_wait_for_ap_online(cpu);
+       if (ret)
+               goto out_unlock;
+
+       irq_unlock_sparse();
 
        if (st->target <= CPUHP_AP_ONLINE_IDLE)
                return 0;
 
        return cpuhp_kick_ap(cpu, st, st->target);
-}
 
+out_unlock:
+       irq_unlock_sparse();
+       return ret;
+}
+#else
 static int bringup_cpu(unsigned int cpu)
 {
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct task_struct *idle = idle_thread_get(cpu);
        int ret;
 
-       /*
-        * Reset stale stack state from the last time this CPU was online.
-        */
-       scs_task_reset(idle);
-       kasan_unpoison_task_stack(idle);
+       if (!cpuhp_can_boot_ap(cpu))
+               return -EAGAIN;
 
        /*
         * Some architectures have to walk the irq descriptors to
         * setup the vector space for the cpu which comes online.
-        * Prevent irq alloc/free across the bringup.
+        *
+        * Prevent irq alloc/free across the bringup by acquiring the
+        * startup in cpuhp_online_idle(), which avoids intermediate
+        * synchronization points in the architecture code.
+        * intermediate synchronization points in the architecture code.
         */
        irq_lock_sparse();
 
-       /* Arch-specific enabling code. */
        ret = __cpu_up(cpu, idle);
-       irq_unlock_sparse();
        if (ret)
-               return ret;
-       return bringup_wait_for_ap(cpu);
+               goto out_unlock;
+
+       ret = cpuhp_bp_sync_alive(cpu);
+       if (ret)
+               goto out_unlock;
+
+       ret = bringup_wait_for_ap_online(cpu);
+       if (ret)
+               goto out_unlock;
+
+       irq_unlock_sparse();
+
+       if (st->target <= CPUHP_AP_ONLINE_IDLE)
+               return 0;
+
+       return cpuhp_kick_ap(cpu, st, st->target);
+
+out_unlock:
+       irq_unlock_sparse();
+       return ret;
 }
+#endif
 
 static int finish_cpu(unsigned int cpu)
 {
@@ -623,7 +900,7 @@ static int finish_cpu(unsigned int cpu)
         */
        if (mm != &init_mm)
                idle->active_mm = &init_mm;
-       mmdrop(mm);
+       mmdrop_lazy_tlb(mm);
        return 0;
 }
 
@@ -663,21 +940,51 @@ static bool cpuhp_next_state(bool bringup,
        return true;
 }
 
-static int cpuhp_invoke_callback_range(bool bringup,
-                                      unsigned int cpu,
-                                      struct cpuhp_cpu_state *st,
-                                      enum cpuhp_state target)
+static int __cpuhp_invoke_callback_range(bool bringup,
+                                        unsigned int cpu,
+                                        struct cpuhp_cpu_state *st,
+                                        enum cpuhp_state target,
+                                        bool nofail)
 {
        enum cpuhp_state state;
-       int err = 0;
+       int ret = 0;
 
        while (cpuhp_next_state(bringup, &state, st, target)) {
+               int err;
+
                err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
-               if (err)
+               if (!err)
+                       continue;
+
+               if (nofail) {
+                       pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
+                               cpu, bringup ? "UP" : "DOWN",
+                               cpuhp_get_step(st->state)->name,
+                               st->state, err);
+                       ret = -1;
+               } else {
+                       ret = err;
                        break;
+               }
        }
 
-       return err;
+       return ret;
+}
+
+static inline int cpuhp_invoke_callback_range(bool bringup,
+                                             unsigned int cpu,
+                                             struct cpuhp_cpu_state *st,
+                                             enum cpuhp_state target)
+{
+       return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false);
+}
+
+static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
+                                                     unsigned int cpu,
+                                                     struct cpuhp_cpu_state *st,
+                                                     enum cpuhp_state target)
+{
+       __cpuhp_invoke_callback_range(bringup, cpu, st, target, true);
 }
 
 static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
@@ -999,7 +1306,6 @@ static int take_cpu_down(void *_param)
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
        int err, cpu = smp_processor_id();
-       int ret;
 
        /* Ensure this CPU doesn't handle any more interrupts. */
        err = __cpu_disable();
@@ -1012,18 +1318,11 @@ static int take_cpu_down(void *_param)
         */
        WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
 
-       /* Invoke the former CPU_DYING callbacks */
-       ret = cpuhp_invoke_callback_range(false, cpu, st, target);
-
        /*
-        * DYING must not fail!
+        * Invoke the former CPU_DYING callbacks. DYING must not fail!
         */
-       WARN_ON_ONCE(ret);
+       cpuhp_invoke_callback_range_nofail(false, cpu, st, target);
 
-       /* Give up timekeeping duties */
-       tick_handover_do_timer();
-       /* Remove CPU from timer broadcasting */
-       tick_offline_cpu(cpu);
        /* Park the stopper thread */
        stop_machine_park(cpu);
        return 0;
@@ -1073,8 +1372,17 @@ static int takedown_cpu(unsigned int cpu)
        /* This actually kills the CPU. */
        __cpu_die(cpu);
 
+       cpuhp_bp_sync_dead(cpu);
+
        tick_cleanup_dead_cpu(cpu);
+
+       /*
+        * Callbacks must be re-integrated into the RCU state machine right away.
+        * Otherwise an RCU callback could block a further teardown function
+        * waiting for its completion.
+        */
        rcutree_migrate_callbacks(cpu);
+
        return 0;
 }
 
@@ -1090,10 +1398,11 @@ void cpuhp_report_idle_dead(void)
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 
        BUG_ON(st->state != CPUHP_AP_OFFLINE);
-       rcu_report_dead(smp_processor_id());
+       tick_assert_timekeeping_handover();
+       rcutree_report_cpu_dead();
        st->state = CPUHP_AP_IDLE_DEAD;
        /*
-        * We cannot call complete after rcu_report_dead() so we delegate it
+        * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
         * to an online cpu.
         */
        smp_call_function_single(cpumask_first(cpu_online_mask),
@@ -1189,8 +1498,22 @@ out:
        return ret;
 }
 
+struct cpu_down_work {
+       unsigned int            cpu;
+       enum cpuhp_state        target;
+};
+
+static long __cpu_down_maps_locked(void *arg)
+{
+       struct cpu_down_work *work = arg;
+
+       return _cpu_down(work->cpu, 0, work->target);
+}
+
 static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
 {
+       struct cpu_down_work work = { .cpu = cpu, .target = target, };
+
        /*
         * If the platform does not support hotplug, report it explicitly to
         * differentiate it from a transient offlining failure.
@@ -1199,7 +1522,18 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
                return -EOPNOTSUPP;
        if (cpu_hotplug_disabled)
                return -EBUSY;
-       return _cpu_down(cpu, 0, target);
+
+       /*
+        * Ensure that the control task does not run on the CPU which is
+        * about to be offlined, to prevent a deadlock against
+        * cfs_b->period_timer. Also keep at least one housekeeping CPU
+        * online to avoid generating an empty sched_domain span.
+        */
+       for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
+               if (cpu != work.cpu)
+                       return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+       }
+       return -EBUSY;
 }
 
 static int cpu_down(unsigned int cpu, enum cpuhp_state target)
@@ -1296,16 +1630,14 @@ void notify_cpu_starting(unsigned int cpu)
 {
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
-       int ret;
 
-       rcu_cpu_starting(cpu);  /* Enables RCU usage on this CPU. */
+       rcutree_report_cpu_starting(cpu);       /* Enables RCU usage on this CPU. */
        cpumask_set_cpu(cpu, &cpus_booted_once_mask);
-       ret = cpuhp_invoke_callback_range(true, cpu, st, target);
 
        /*
         * STARTING must not fail!
         */
-       WARN_ON_ONCE(ret);
+       cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
 }
 
 /*
@@ -1321,8 +1653,10 @@ void cpuhp_online_idle(enum cpuhp_state state)
        if (state != CPUHP_AP_ONLINE_IDLE)
                return;
 
+       cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE);
+
        /*
-        * Unpart the stopper thread before we start the idle loop (and start
+        * Unpark the stopper thread before we start the idle loop (and start
         * scheduling); this ensures the stopper task is always available.
         */
        stop_machine_unpark(smp_processor_id());
@@ -1359,6 +1693,12 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
                        ret = PTR_ERR(idle);
                        goto out;
                }
+
+               /*
+                * Reset stale stack state from the last time this CPU was online.
+                */
+               scs_task_reset(idle);
+               kasan_unpoison_task_stack(idle);
        }
 
        cpuhp_tasks_frozen = tasks_frozen;
@@ -1399,9 +1739,6 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
        if (!cpu_possible(cpu)) {
                pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
                       cpu);
-#if defined(CONFIG_IA64)
-               pr_err("please check additional_cpus= boot parameter\n");
-#endif
                return -EINVAL;
        }
 
@@ -1415,7 +1752,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
                err = -EBUSY;
                goto out;
        }
-       if (!cpu_smt_allowed(cpu)) {
+       if (!cpu_bootable(cpu)) {
                err = -EPERM;
                goto out;
        }
@@ -1478,18 +1815,106 @@ int bringup_hibernate_cpu(unsigned int sleep_cpu)
        return 0;
 }
 
-void bringup_nonboot_cpus(unsigned int setup_max_cpus)
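+/*
+ * Attempt to bring at most @ncpus CPUs from @mask up to the @target state.
+ * If a CPU fails the final online step, roll its hotplug state back to
+ * CPUHP_OFFLINE.
+ */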
+static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus,
+                                     enum cpuhp_state target)
 {
        unsigned int cpu;
 
-       for_each_present_cpu(cpu) {
-               if (num_online_cpus() >= setup_max_cpus)
+       for_each_cpu(cpu, mask) {
+               struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+               if (cpu_up(cpu, target) && can_rollback_cpu(st)) {
+                       /*
+                        * If this failed then cpu_up() might have only
+                        * rolled back to CPUHP_BP_KICK_AP for the final
+                        * online. Clean it up. NOOP if already rolled back.
+                        */
+                       WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE));
+               }
+
+               if (!--ncpus)
                        break;
-               if (!cpu_online(cpu))
-                       cpu_up(cpu, CPUHP_ONLINE);
        }
 }
 
+#ifdef CONFIG_HOTPLUG_PARALLEL
+static bool __cpuhp_parallel_bringup __ro_after_init = true;
+
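+/*
+ * Parallel bringup is enabled by default on architectures selecting
+ * CONFIG_HOTPLUG_PARALLEL, provided arch_cpuhp_init_parallel_bringup()
+ * agrees at boot; "cpuhp.parallel=0" on the kernel command line turns it
+ * off (see cpuhp_bringup_cpus_parallel() below).
+ */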
+static int __init parallel_bringup_parse_param(char *arg)
+{
+       return kstrtobool(arg, &__cpuhp_parallel_bringup);
+}
+early_param("cpuhp.parallel", parallel_bringup_parse_param);
+
+static inline bool cpuhp_smt_aware(void)
+{
+       return cpu_smt_max_threads > 1;
+}
+
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+       return cpu_primary_thread_mask;
+}
+
+/*
+ * On architectures which have enabled parallel bringup this invokes all BP
+ * prepare states for each of the to be onlined APs first. The last state
+ * sends the startup IPI to the APs. The APs proceed through the low level
+ * bringup code in parallel and then wait for the control CPU to release
+ * them one by one for the final onlining procedure.
+ *
+ * This avoids waiting for each AP to respond to the startup IPI in
+ * CPUHP_BRINGUP_CPU.
+ */
+static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus)
+{
+       const struct cpumask *mask = cpu_present_mask;
+
+       if (__cpuhp_parallel_bringup)
+               __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup();
+       if (!__cpuhp_parallel_bringup)
+               return false;
+
+       if (cpuhp_smt_aware()) {
+               const struct cpumask *pmask = cpuhp_get_primary_thread_mask();
+               static struct cpumask tmp_mask __initdata;
+
+               /*
+                * X86 requires, for various reasons, that the SMT siblings
+                * remain stopped while the primary thread does a microcode
+                * update. Bring the primary threads up first.
+                */
+               cpumask_and(&tmp_mask, mask, pmask);
+               cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP);
+               cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE);
+               /* Account for the online CPUs */
+               ncpus -= num_online_cpus();
+               if (!ncpus)
+                       return true;
+               /* Create the mask for secondary CPUs */
+               cpumask_andnot(&tmp_mask, mask, pmask);
+               mask = &tmp_mask;
+       }
+
+       /* Bring the not-yet started CPUs up */
+       cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP);
+       cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE);
+       return true;
+}
+#else
+static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; }
+#endif /* CONFIG_HOTPLUG_PARALLEL */
+
+void __init bringup_nonboot_cpus(unsigned int max_cpus)
+{
+       /* Try parallel bringup optimization if enabled */
+       if (cpuhp_bringup_cpus_parallel(max_cpus))
+               return;
+
+       /* Full per CPU serialized bringup */
+       cpuhp_bringup_mask(cpu_present_mask, max_cpus, CPUHP_ONLINE);
+}
+
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
 
@@ -1684,7 +2109,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
        [CPUHP_HRTIMERS_PREPARE] = {
                .name                   = "hrtimers:prepare",
                .startup.single         = hrtimers_prepare_cpu,
-               .teardown.single        = hrtimers_dead_cpu,
+               .teardown.single        = NULL,
        },
        [CPUHP_SMPCFD_PREPARE] = {
                .name                   = "smpcfd:prepare",
@@ -1696,11 +2121,6 @@ static struct cpuhp_step cpuhp_hp_states[] = {
                .startup.single         = relay_prepare_cpu,
                .teardown.single        = NULL,
        },
-       [CPUHP_SLAB_PREPARE] = {
-               .name                   = "slab:prepare",
-               .startup.single         = slab_prepare_cpu,
-               .teardown.single        = slab_dead_cpu,
-       },
        [CPUHP_RCUTREE_PREP] = {
                .name                   = "RCU/tree:prepare",
                .startup.single         = rcutree_prepare_cpu,
@@ -1716,13 +2136,38 @@ static struct cpuhp_step cpuhp_hp_states[] = {
                .startup.single         = timers_prepare_cpu,
                .teardown.single        = timers_dead_cpu,
        },
-       /* Kicks the plugged cpu into life */
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+       /*
+        * Kicks the AP alive. The AP will wait in cpuhp_ap_sync_alive()
+        * until the next step releases it.
+        */
+       [CPUHP_BP_KICK_AP] = {
+               .name                   = "cpu:kick_ap",
+               .startup.single         = cpuhp_kick_ap_alive,
+       },
+
+       /*
+        * Waits for the AP to reach cpuhp_ap_sync_alive() and then
+        * releases it for the complete bringup.
+        */
+       [CPUHP_BRINGUP_CPU] = {
+               .name                   = "cpu:bringup",
+               .startup.single         = cpuhp_bringup_ap,
+               .teardown.single        = finish_cpu,
+               .cant_stop              = true,
+       },
+#else
+       /*
+        * All-in-one CPU bringup state which includes the kick alive.
+        */
        [CPUHP_BRINGUP_CPU] = {
                .name                   = "cpu:bringup",
                .startup.single         = bringup_cpu,
                .teardown.single        = finish_cpu,
                .cant_stop              = true,
        },
+#endif
        /* Final state before CPU kills itself */
        [CPUHP_AP_IDLE_DEAD] = {
                .name                   = "idle:dead",
@@ -1751,6 +2196,16 @@ static struct cpuhp_step cpuhp_hp_states[] = {
                .startup.single         = NULL,
                .teardown.single        = smpcfd_dying_cpu,
        },
+       [CPUHP_AP_HRTIMERS_DYING] = {
+               .name                   = "hrtimers:dying",
+               .startup.single         = NULL,
+               .teardown.single        = hrtimers_cpu_dying,
+       },
+       [CPUHP_AP_TICK_DYING] = {
+               .name                   = "tick:dying",
+               .startup.single         = NULL,
+               .teardown.single        = tick_cpu_dying,
+       },
        /* Entry state on starting. Interrupts enabled from here on. Transient
         * state for synchronization */
        [CPUHP_AP_ONLINE] = {
@@ -2239,6 +2694,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
        for_each_online_cpu(cpu) {
                if (topology_is_primary_thread(cpu))
                        continue;
+               /*
+                * Disable can be called with CPU_SMT_ENABLED when changing
+                * from a higher to lower number of SMT threads per core.
+                */
+               if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+                       continue;
                ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
                if (ret)
                        break;
@@ -2273,6 +2734,8 @@ int cpuhp_smt_enable(void)
                /* Skip online CPUs and CPUs on offline nodes */
                if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
                        continue;
+               if (!cpu_smt_thread_allowed(cpu))
+                       continue;
                ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
                if (ret)
                        break;
@@ -2326,8 +2789,10 @@ static ssize_t target_store(struct device *dev, struct device_attribute *attr,
 
        if (st->state < target)
                ret = cpu_up(dev->id, target);
-       else
+       else if (st->state > target)
                ret = cpu_down(dev->id, target);
+       else if (WARN_ON(st->target != target))
+               st->target = target;
 out:
        unlock_device_hotplug();
        return ret ? ret : count;
@@ -2449,20 +2914,19 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
 
 #ifdef CONFIG_HOTPLUG_SMT
 
+static bool cpu_smt_num_threads_valid(unsigned int threads)
+{
+       if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC))
+               return threads >= 1 && threads <= cpu_smt_max_threads;
+       return threads == 1 || threads == cpu_smt_max_threads;
+}
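+
+/*
+ * For example, with cpu_smt_max_threads == 4 an architecture with
+ * CONFIG_SMT_NUM_THREADS_DYNAMIC accepts any value from 1 to 4 above,
+ * while all others only accept the all-or-nothing values 1 and 4.
+ */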
+
 static ssize_t
 __store_smt_control(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
 {
-       int ctrlval, ret;
-
-       if (sysfs_streq(buf, "on"))
-               ctrlval = CPU_SMT_ENABLED;
-       else if (sysfs_streq(buf, "off"))
-               ctrlval = CPU_SMT_DISABLED;
-       else if (sysfs_streq(buf, "forceoff"))
-               ctrlval = CPU_SMT_FORCE_DISABLED;
-       else
-               return -EINVAL;
+       int ctrlval, ret, num_threads, orig_threads;
+       bool force_off;
 
        if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
                return -EPERM;
@@ -2470,21 +2934,39 @@ __store_smt_control(struct device *dev, struct device_attribute *attr,
        if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
                return -ENODEV;
 
+       if (sysfs_streq(buf, "on")) {
+               ctrlval = CPU_SMT_ENABLED;
+               num_threads = cpu_smt_max_threads;
+       } else if (sysfs_streq(buf, "off")) {
+               ctrlval = CPU_SMT_DISABLED;
+               num_threads = 1;
+       } else if (sysfs_streq(buf, "forceoff")) {
+               ctrlval = CPU_SMT_FORCE_DISABLED;
+               num_threads = 1;
+       } else if (kstrtoint(buf, 10, &num_threads) == 0) {
+               if (num_threads == 1)
+                       ctrlval = CPU_SMT_DISABLED;
+               else if (cpu_smt_num_threads_valid(num_threads))
+                       ctrlval = CPU_SMT_ENABLED;
+               else
+                       return -EINVAL;
+       } else {
+               return -EINVAL;
+       }
+
        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;
 
-       if (ctrlval != cpu_smt_control) {
-               switch (ctrlval) {
-               case CPU_SMT_ENABLED:
-                       ret = cpuhp_smt_enable();
-                       break;
-               case CPU_SMT_DISABLED:
-               case CPU_SMT_FORCE_DISABLED:
-                       ret = cpuhp_smt_disable(ctrlval);
-                       break;
-               }
-       }
+       orig_threads = cpu_smt_num_threads;
+       cpu_smt_num_threads = num_threads;
+
+       force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED;
+
+       if (num_threads > orig_threads)
+               ret = cpuhp_smt_enable();
+       else if (num_threads < orig_threads || force_off)
+               ret = cpuhp_smt_disable(ctrlval);
 
        unlock_device_hotplug();
        return ret ? ret : count;
@@ -2512,7 +2994,18 @@ static ssize_t control_show(struct device *dev,
 {
        const char *state = smt_states[cpu_smt_control];
 
-       return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
+#ifdef CONFIG_HOTPLUG_SMT
+       /*
+        * If SMT is enabled but not all threads are enabled, show the
+        * number of enabled threads. Otherwise show the state name, which
+        * is "on" when all threads are enabled.
+        */
+       if (cpu_smt_control == CPU_SMT_ENABLED &&
+           cpu_smt_num_threads != cpu_smt_max_threads)
+               return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
+#endif
+
+       return sysfs_emit(buf, "%s\n", state);
 }
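+
+/*
+ * Net effect on the sysfs control file (an illustration, assuming the
+ * existing "on"/"off"/"forceoff" state names): with four threads per core
+ * and only two enabled, a read yields "2"; once all four are enabled it
+ * yields "on" again. Writes accept the state names as before, or a thread
+ * count validated by cpu_smt_num_threads_valid().
+ */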
 
 static ssize_t control_store(struct device *dev, struct device_attribute *attr,
@@ -2525,7 +3018,7 @@ static DEVICE_ATTR_RW(control);
 static ssize_t active_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
 {
-       return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
+       return sysfs_emit(buf, "%d\n", sched_smt_active());
 }
 static DEVICE_ATTR_RO(active);
 
@@ -2543,22 +3036,33 @@ static const struct attribute_group cpuhp_smt_attr_group = {
 
 static int __init cpu_smt_sysfs_init(void)
 {
-       return sysfs_create_group(&cpu_subsys.dev_root->kobj,
-                                 &cpuhp_smt_attr_group);
+       struct device *dev_root;
+       int ret = -ENODEV;
+
+       dev_root = bus_get_dev_root(&cpu_subsys);
+       if (dev_root) {
+               ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group);
+               put_device(dev_root);
+       }
+       return ret;
 }
 
 static int __init cpuhp_sysfs_init(void)
 {
+       struct device *dev_root;
        int cpu, ret;
 
        ret = cpu_smt_sysfs_init();
        if (ret)
                return ret;
 
-       ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
-                                &cpuhp_cpu_root_attr_group);
-       if (ret)
-               return ret;
+       dev_root = bus_get_dev_root(&cpu_subsys);
+       if (dev_root) {
+               ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group);
+               put_device(dev_root);
+               if (ret)
+                       return ret;
+       }
 
        for_each_possible_cpu(cpu) {
                struct device *dev = get_cpu_device(cpu);
@@ -2603,10 +3107,10 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
 EXPORT_SYMBOL(cpu_all_bits);
 
 #ifdef CONFIG_INIT_ALL_POSSIBLE
-struct cpumask __cpu_possible_mask __read_mostly
+struct cpumask __cpu_possible_mask __ro_after_init
        = {CPU_BITS_ALL};
 #else
-struct cpumask __cpu_possible_mask __read_mostly;
+struct cpumask __cpu_possible_mask __ro_after_init;
 #endif
 EXPORT_SYMBOL(__cpu_possible_mask);
 
@@ -2686,8 +3190,10 @@ void __init boot_cpu_hotplug_init(void)
 {
 #ifdef CONFIG_SMP
        cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
+       atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE);
 #endif
        this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+       this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
 }
 
 /*