rcu-tasks: Handle idle tasks for recently offlined CPUs

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 3925e32..414861d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -48,6 +48,7 @@ struct rcu_tasks_percpu {
  * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
  * @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
  * @cbs_gbl_lock: Lock protecting callback list.
+ * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
  * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
  * @gp_func: This flavor's grace-period-wait function.
  * @gp_state: Grace period's most recent state transition (debugging).
@@ -79,6 +80,7 @@ struct rcu_tasks_percpu {
 struct rcu_tasks {
        struct rcuwait cbs_wait;
        raw_spinlock_t cbs_gbl_lock;
+       struct mutex tasks_gp_mutex;
        int gp_state;
        int gp_sleep;
        int init_fract;
@@ -119,6 +121,7 @@ static struct rcu_tasks rt_name =                                                   \
 {                                                                                      \
        .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait),                                \
        .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock),                 \
+       .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex),                  \
        .gp_func = gp,                                                                  \
        .call_func = call,                                                              \
        .rtpcpu = &rt_name ## __percpu,                                                 \
@@ -323,17 +326,6 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
                irq_work_queue(&rtpcp->rtp_irq_work);
 }
 
-// Wait for a grace period for the specified flavor of Tasks RCU.
-static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
-{
-       /* Complain if the scheduler has not started.  */
-       RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
-                        "synchronize_rcu_tasks called too soon");
-
-       /* Wait for the grace period. */
-       wait_rcu_gp(rtp->call_func);
-}
-
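
For orientation, synchronize_rcu_tasks_generic() is not deleted here but relocated below the grace-period kthread (see a later hunk), so that it can fall back to driving a grace period directly before that kthread exists. For context, a hypothetical updater using the public wrappers built on call_rcu_tasks_generic() might look like the sketch below; struct foo, foo_free_cb(), and foo_retire() are illustrative names, not part of this patch:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
            struct rcu_head rh;
            /* ... payload ... */
    };

    static void foo_free_cb(struct rcu_head *rhp)
    {
            kfree(container_of(rhp, struct foo, rh));
    }

    static void foo_retire(struct foo *fp)
    {
            /* Queued through call_rcu_tasks_generic(); foo_free_cb() runs
             * only after a full Tasks RCU grace period has elapsed. */
            call_rcu_tasks(&fp->rh, foo_free_cb);
    }
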
 // RCU callback function for rcu_barrier_tasks_generic().
 static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
 {
@@ -439,6 +431,11 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
                        WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
                        pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
                }
+               for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+                       struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+                       WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+               }
                raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
        }
 
@@ -497,10 +494,41 @@ static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
        rcu_tasks_invoke_cbs(rtp, rtpcp);
 }
 
-/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
-static int __noreturn rcu_tasks_kthread(void *arg)
+// Wait for one grace period.
+static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
 {
        int needgpcb;
+
+       mutex_lock(&rtp->tasks_gp_mutex);
+
+       // If mid-boot, force a grace period; otherwise wait for callbacks.
+       if (unlikely(midboot)) {
+               needgpcb = 0x2;
+       } else {
+               set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+               rcuwait_wait_event(&rtp->cbs_wait,
+                                  (needgpcb = rcu_tasks_need_gpcb(rtp)),
+                                  TASK_IDLE);
+       }
+
+       if (needgpcb & 0x2) {
+               // Wait for one grace period.
+               set_tasks_gp_state(rtp, RTGS_WAIT_GP);
+               rtp->gp_start = jiffies;
+               rcu_seq_start(&rtp->tasks_gp_seq);
+               rtp->gp_func(rtp);
+               rcu_seq_end(&rtp->tasks_gp_seq);
+       }
+
+       // Invoke callbacks.
+       set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
+       rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+       mutex_unlock(&rtp->tasks_gp_mutex);
+}
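
The rcu_seq_start()/rcu_seq_end() pair above advances ->tasks_gp_seq using RCU's standard sequence-counter protocol, which is what allows other code to snapshot the counter and later check whether a full grace period has elapsed; the mid-boot branch forces the "needs a grace period" bit (0x2) because the synchronous caller is itself the one waiting. A minimal sketch of such a poll, using the rcu_seq_*() helpers from kernel/rcu/rcu.h (this helper itself is hypothetical, not part of the patch):

    /* Hypothetical poll of ->tasks_gp_seq via the rcu_seq_*() helpers in
     * kernel/rcu/rcu.h; not part of this patch. */
    static bool tasks_gp_done_since(struct rcu_tasks *rtp, unsigned long snap)
    {
            return rcu_seq_done(&rtp->tasks_gp_seq, snap);
    }

    /* Usage sketch:
     *     unsigned long snap = rcu_seq_snap(&rtp->tasks_gp_seq);
     *     ...
     *     if (tasks_gp_done_since(rtp, snap))
     *             ;  // A full grace period has elapsed since the snapshot.
     */
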
+
+// RCU-tasks kthread that detects grace periods and invokes callbacks.
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
        struct rcu_tasks *rtp = arg;
 
        /* Run on housekeeping CPUs by default.  Sysadm can move if desired. */
@@ -514,29 +542,28 @@ static int __noreturn rcu_tasks_kthread(void *arg)
         * This loop is terminated by the system going down.  ;-)
         */
        for (;;) {
-               set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+               // Wait for one grace period and invoke any callbacks
+               // that are ready.
+               rcu_tasks_one_gp(rtp, false);
 
-               /* If there were none, wait a bit and start over. */
-               rcuwait_wait_event(&rtp->cbs_wait,
-                                  (needgpcb = rcu_tasks_need_gpcb(rtp)),
-                                  TASK_IDLE);
-
-               if (needgpcb & 0x2) {
-                       // Wait for one grace period.
-                       set_tasks_gp_state(rtp, RTGS_WAIT_GP);
-                       rtp->gp_start = jiffies;
-                       rcu_seq_start(&rtp->tasks_gp_seq);
-                       rtp->gp_func(rtp);
-                       rcu_seq_end(&rtp->tasks_gp_seq);
-               }
+               // Paranoid sleep to keep this from entering a tight loop.
+               schedule_timeout_idle(rtp->gp_sleep);
+       }
+}
 
-               /* Invoke callbacks. */
-               set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
-               rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+// Wait for a grace period for the specified flavor of Tasks RCU.
+static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+{
+       /* Complain if the scheduler has not started.  */
+       RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+                        "synchronize_rcu_tasks called too soon");
 
-               /* Paranoid sleep to keep this from entering a tight loop */
-               schedule_timeout_idle(rtp->gp_sleep);
+       // If the grace-period kthread is running, use it.
+       if (READ_ONCE(rtp->kthread_ptr)) {
+               wait_rcu_gp(rtp->call_func);
+               return;
        }
+       rcu_tasks_one_gp(rtp, true);
 }
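
With this relocation, synchronize_rcu_tasks_generic() also works in the mid-boot dead zone: once the scheduler is running but before the grace-period kthread has been spawned, the updater drives the grace period itself via rcu_tasks_one_gp(rtp, true), serialized by the new tasks_gp_mutex. A hypothetical boot-time updater pattern (all names in the sketch are illustrative):

    static void release_trampoline(struct tramp *tp)        /* hypothetical */
    {
            unpublish_trampoline(tp);       /* No new tasks can enter tp. */
            synchronize_rcu_tasks();        /* Uses the kthread if spawned;
                                             * otherwise runs rcu_tasks_one_gp()
                                             * directly under tasks_gp_mutex. */
            free_trampoline(tp);            /* Safe: no task still executes in tp. */
    }
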
 
 /* Spawn RCU-tasks grace-period kthread. */
@@ -1165,9 +1192,6 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
 
 #ifdef CONFIG_TASKS_TRACE_RCU
 
-static atomic_t trc_n_readers_need_end;                // Number of waited-for readers.
-static DECLARE_WAIT_QUEUE_HEAD(trc_wait);      // List of holdout tasks.
-
 // Record outstanding IPIs to each CPU.  No point in sending two...
 static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
 
@@ -1181,30 +1205,55 @@ void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
 DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
                 "RCU Tasks Trace");
 
+/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
+static u8 rcu_ld_need_qs(struct task_struct *t)
+{
+       smp_mb(); // Enforce full grace-period ordering.
+       return smp_load_acquire(&t->trc_reader_special.b.need_qs);
+}
+
+/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
+static void rcu_st_need_qs(struct task_struct *t, u8 v)
+{
+       smp_store_release(&t->trc_reader_special.b.need_qs, v);
+       smp_mb(); // Enforce full grace-period ordering.
+}
+
 /*
- * This irq_work handler allows rcu_read_unlock_trace() to be invoked
- * while the scheduler locks are held.
+ * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
+ * the four-byte operand-size restriction of some platforms.
+ * Returns the old value, which is often ignored.
  */
-static void rcu_read_unlock_iw(struct irq_work *iwp)
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
 {
-       wake_up(&trc_wait);
+       union rcu_special ret;
+       union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
+       union rcu_special trs_new = trs_old;
+
+       if (trs_old.b.need_qs != old)
+               return trs_old.b.need_qs;
+       trs_new.b.need_qs = new;
+       ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+       return ret.b.need_qs;
 }
-static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
+EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
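
The helper above emulates a one-byte cmpxchg() by operating on the whole union rcu_special word, sidestepping architectures that only provide four-byte atomics. A standalone userspace sketch of the same idea, assuming GCC/Clang __atomic builtins (this is not kernel code and not part of the patch):

    #include <stdint.h>

    union word_bytes {
            uint32_t w;
            uint8_t  b[4];
    };

    static uint8_t cmpxchg_byte0(union word_bytes *p, uint8_t old, uint8_t new)
    {
            union word_bytes o, n;

            o.w = __atomic_load_n(&p->w, __ATOMIC_RELAXED);
            if (o.b[0] != old)
                    return o.b[0];          /* Early mismatch, as in the kernel helper. */
            n = o;
            n.b[0] = new;
            /* Word-sized CAS; on failure, o.w is refreshed with the value
             * actually found in memory, mirroring cmpxchg() semantics. */
            __atomic_compare_exchange_n(&p->w, &o.w, n.w, false,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
            return o.b[0];
    }
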
 
 /* If we are the last reader, wake up the grace-period kthread. */
 void rcu_read_unlock_trace_special(struct task_struct *t)
 {
-       int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
+       int nqs = (rcu_ld_need_qs(t) == (TRC_NEED_QS_CHECKED | TRC_NEED_QS));
 
-       if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
-           t->trc_reader_special.b.need_mb)
+       if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
                smp_mb(); // Pairs with update-side barriers.
        // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
-       if (nq)
-               WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
+       if (nqs) {
+               u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
+                                                      TRC_NEED_QS_CHECKED);
+
+               WARN_ONCE(result != (TRC_NEED_QS_CHECKED | TRC_NEED_QS),
+                         "%s: result = %d", __func__, result);
+       }
        WRITE_ONCE(t->trc_reader_nesting, 0);
-       if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
-               irq_work_queue(&rcu_tasks_trace_iw);
 }
 EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
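
For context, the function above is the slow path of rcu_read_unlock_trace(), taken only when the outermost unlock finds ->trc_reader_special nonzero (for example, when the grace-period kthread has set TRC_NEED_QS). A typical reader might look like the sketch below; gp, struct foo, and use_foo() are hypothetical stand-ins:

    #include <linux/rcupdate_trace.h>

    static void reader(void)
    {
            struct foo *p;

            rcu_read_lock_trace();
            p = rcu_dereference_check(gp, rcu_read_lock_trace_held());
            if (p)
                    use_foo(p);             /* Hypothetical consumer. */
            rcu_read_unlock_trace();        /* Takes the special slow path above
                                             * only if ->trc_reader_special is
                                             * nonzero, e.g. TRC_NEED_QS is set. */
    }
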
 
@@ -1229,31 +1278,29 @@ static void trc_del_holdout(struct task_struct *t)
 /* IPI handler to check task state. */
 static void trc_read_check_handler(void *t_in)
 {
+       int nesting;
        struct task_struct *t = current;
        struct task_struct *texp = t_in;
 
        // If the task is no longer running on this CPU, leave.
-       if (unlikely(texp != t)) {
+       if (unlikely(texp != t))
                goto reset_ipi; // Already on holdout list, so will check later.
-       }
 
        // If the task is not in a read-side critical section, and
        // if this is the last reader, awaken the grace-period kthread.
-       if (likely(!READ_ONCE(t->trc_reader_nesting))) {
-               WRITE_ONCE(t->trc_reader_checked, true);
+       nesting = READ_ONCE(t->trc_reader_nesting);
+       if (likely(!nesting)) {
+               rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
                goto reset_ipi;
        }
        // If we are racing with an rcu_read_unlock_trace(), try again later.
-       if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0))
+       if (unlikely(nesting < 0))
                goto reset_ipi;
-       WRITE_ONCE(t->trc_reader_checked, true);
 
        // Get here if the task is in a read-side critical section.  Set
        // its state so that it will awaken the grace-period kthread upon
        // exit from that critical section.
-       atomic_inc(&trc_n_readers_need_end); // One more to wait on.
-       WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
-       WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+       rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
 
 reset_ipi:
        // Allow future IPIs to be sent on CPU and for task.
@@ -1264,48 +1311,49 @@ reset_ipi:
 }
 
 /* Callback function for scheduler to check locked-down task.  */
-static int trc_inspect_reader(struct task_struct *t, void *arg)
+static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
 {
+       struct list_head *bhp = bhp_in;
        int cpu = task_cpu(t);
        int nesting;
        bool ofl = cpu_is_offline(cpu);
 
-       if (task_curr(t)) {
-               WARN_ON_ONCE(ofl && !is_idle_task(t));
-
+       if (task_curr(t) && !ofl) {
                // If no chance of heavyweight readers, do it the hard way.
-               if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+               if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
                        return -EINVAL;
 
                // If heavyweight readers are enabled on the remote task,
                // we can inspect its state despite its currently running.
                // However, we cannot safely change its state.
                n_heavy_reader_attempts++;
-               if (!ofl && // Check for "running" idle tasks on offline CPUs.
-                   !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
+               // Check for "running" idle tasks on offline CPUs.
+               if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
                        return -EINVAL; // No quiescent state, do it the hard way.
                n_heavy_reader_updates++;
-               if (ofl)
-                       n_heavy_reader_ofl_updates++;
                nesting = 0;
        } else {
                // The task is not running, so C-language access is safe.
                nesting = t->trc_reader_nesting;
+               WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t));
+               if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
+                       n_heavy_reader_ofl_updates++;
        }
 
        // If not exiting a read-side critical section, mark as checked
        // so that the grace-period kthread will remove it from the
        // holdout list.
-       t->trc_reader_checked = nesting >= 0;
-       if (nesting <= 0)
+       if (nesting <= 0) {
+               if (!nesting)
+                       rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
                return nesting ? -EINVAL : 0;  // If in QS, done, otherwise try again later.
+       }
 
        // The task is in a read-side critical section, so set up its
        // state so that it will awaken the grace-period kthread upon exit
        // from that critical section.
-       atomic_inc(&trc_n_readers_need_end); // One more to wait on.
-       WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
-       WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+       if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
+               trc_add_holdout(t, bhp);
        return 0;
 }
 
@@ -1321,14 +1369,14 @@ static void trc_wait_for_one_reader(struct task_struct *t,
 
        // The current task had better be in a quiescent state.
        if (t == current) {
-               t->trc_reader_checked = true;
+               rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
                WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
                return;
        }
 
        // Attempt to nail down the task for inspection.
        get_task_struct(t);
-       if (!task_call_func(t, trc_inspect_reader, NULL)) {
+       if (!task_call_func(t, trc_inspect_reader, bhp)) {
                put_task_struct(t);
                return;
        }
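
task_call_func() (formerly try_invoke_on_locked_down_task()) runs the supplied function with the target task pinned so that its scheduling state cannot change during the inspection; the new bhp argument lets trc_inspect_reader() queue holdouts directly. A minimal sketch of the calling pattern with a stand-in inspection policy (inspect_cb() and try_inspect() are hypothetical, not from this patch):

    static int inspect_cb(struct task_struct *t, void *arg)
    {
            /* t is pinned by task_call_func(), so its state is stable here. */
            return task_curr(t) ? -EBUSY : 0;       /* Illustrative policy only. */
    }

    static bool try_inspect(struct task_struct *t)
    {
            int ret;

            get_task_struct(t);                     /* Keep t from being freed. */
            ret = task_call_func(t, inspect_cb, NULL);
            put_task_struct(t);
            return !ret;                            /* 0 means the inspection succeeded. */
    }
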
@@ -1371,9 +1419,6 @@ static void rcu_tasks_trace_pregp_step(void)
 {
        int cpu;
 
-       // Allow for fast-acting IPIs.
-       atomic_set(&trc_n_readers_need_end, 1);
-
        // There shouldn't be any old IPIs, but...
        for_each_possible_cpu(cpu)
                WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
@@ -1392,8 +1437,7 @@ static void rcu_tasks_trace_pertask(struct task_struct *t,
        if (unlikely(t == NULL))
                return;
 
-       WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
-       WRITE_ONCE(t->trc_reader_checked, false);
+       rcu_st_need_qs(t, 0);
        t->trc_ipi_to_cpu = -1;
        trc_wait_for_one_reader(t, hop);
 }
@@ -1406,7 +1450,7 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
 {
        int cpu;
 
-       for_each_possible_cpu(cpu)
+       for_each_online_cpu(cpu)
                rcu_tasks_trace_pertask(idle_task(cpu), hop);
 
        // Re-enable CPU hotplug now that the tasklist scan has completed.
@@ -1415,7 +1459,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
        // Wait for late-stage exiting tasks to finish exiting.
        // These might have passed the call to exit_tasks_rcu_finish().
        synchronize_rcu();
-       // Any tasks that exit after this point will set ->trc_reader_checked.
+       // Any tasks that exit after this point will set
+       // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
 }
 
 /* Communicate task state back to the RCU tasks trace stall warning request. */
@@ -1433,7 +1478,7 @@ static int trc_check_slow_task(struct task_struct *t, void *arg)
                return false; // It is running, so decline to inspect it.
        trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
        trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
-       trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs);
+       trc_rdrp->needqs = rcu_ld_need_qs(t);
        return true;
 }
 
@@ -1487,12 +1532,12 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
        list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
                // If safe and needed, try to check the current task.
                if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
-                   !READ_ONCE(t->trc_reader_checked))
+                   !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
                        trc_wait_for_one_reader(t, hop);
 
                // If check succeeded, remove this task from the list.
                if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
-                   READ_ONCE(t->trc_reader_checked))
+                   rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
                        trc_del_holdout(t);
                else if (needreport)
                        show_stalled_task_trace(t, firstreport);
@@ -1516,10 +1561,6 @@ static void rcu_tasks_trace_empty_fn(void *unused)
 static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
 {
        int cpu;
-       bool firstreport;
-       struct task_struct *g, *t;
-       LIST_HEAD(holdouts);
-       long ret;
 
        // Wait for any lingering IPI handlers to complete.  Note that
        // if a CPU has gone offline or transitioned to userspace in the
@@ -1530,37 +1571,6 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
                if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
                        smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
 
-       // Remove the safety count.
-       smp_mb__before_atomic();  // Order vs. earlier atomics
-       atomic_dec(&trc_n_readers_need_end);
-       smp_mb__after_atomic();  // Order vs. later atomics
-
-       // Wait for readers.
-       set_tasks_gp_state(rtp, RTGS_WAIT_READERS);
-       for (;;) {
-               ret = wait_event_idle_exclusive_timeout(
-                               trc_wait,
-                               atomic_read(&trc_n_readers_need_end) == 0,
-                               READ_ONCE(rcu_task_stall_timeout));
-               if (ret)
-                       break;  // Count reached zero.
-               // Stall warning time, so make a list of the offenders.
-               rcu_read_lock();
-               for_each_process_thread(g, t)
-                       if (READ_ONCE(t->trc_reader_special.b.need_qs))
-                               trc_add_holdout(t, &holdouts);
-               rcu_read_unlock();
-               firstreport = true;
-               list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) {
-                       if (READ_ONCE(t->trc_reader_special.b.need_qs))
-                               show_stalled_task_trace(t, &firstreport);
-                       trc_del_holdout(t); // Release task_struct reference.
-               }
-               if (firstreport)
-                       pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n");
-               show_stalled_ipi_trace();
-               pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end));
-       }
        smp_mb(); // Caller's code must be ordered after wakeup.
                  // Pairs with pretty much every ordering primitive.
 }
@@ -1568,11 +1578,12 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
 /* Report any needed quiescent state for this exiting task. */
 static void exit_tasks_rcu_finish_trace(struct task_struct *t)
 {
-       WRITE_ONCE(t->trc_reader_checked, true);
+       rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
        WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
-       WRITE_ONCE(t->trc_reader_nesting, 0);
-       if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
+       if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS))
                rcu_read_unlock_trace_special(t);
+       else
+               WRITE_ONCE(t->trc_reader_nesting, 0);
 }
 
 /**
@@ -1659,7 +1670,7 @@ void show_rcu_tasks_trace_gp_kthread(void)
 {
        char buf[64];
 
-       sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end),
+       sprintf(buf, "h:%lu/%lu/%lu",
                data_race(n_heavy_reader_ofl_updates),
                data_race(n_heavy_reader_updates),
                data_race(n_heavy_reader_attempts));