Merge tag 'rcu.2021.11.01a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck...

[linux-2.6-microblaze.git] / kernel / rcu / tasks.h
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h

index 171bc84..7da3c81 100644 (file)
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -197,6 +197,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
          * This loop is terminated by the system going down.  ;-)
          */
         for (;;) {
+               set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
  
                 /* Pick up any new callbacks. */
                 raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
@@ -236,8 +237,6 @@ static int __noreturn rcu_tasks_kthread(void *arg)
                 }
                 /* Paranoid sleep to keep this from entering a tight loop */
                 schedule_timeout_idle(rtp->gp_sleep);
-
-               set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
         }
  }
  
@@ -369,7 +368,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
  ////////////////////////////////////////////////////////////////////////
  //
  // Simple variant of RCU whose quiescent states are voluntary context
-// switch, cond_resched_rcu_qs(), user-space execution, and idle.
+// switch, cond_resched_tasks_rcu_qs(), user-space execution, and idle.
  // As such, grace periods can take one good long time.  There are no
  // read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
  // because this implementation is intended to get the system into a safe
@@ -540,7 +539,7 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
   * period elapses, in other words after all currently executing RCU
   * read-side critical sections have completed. call_rcu_tasks() assumes
   * that the read-side critical sections end at a voluntary context
- * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
+ * switch (not a preemption!), cond_resched_tasks_rcu_qs(), entry into idle,
   * or transition to usermode execution.  As such, there are no read-side
   * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
   * this primitive is intended to determine that all tasks have passed
@@ -678,11 +677,11 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
   * period elapses, in other words after all currently executing RCU
   * read-side critical sections have completed. call_rcu_tasks_rude()
   * assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution.  As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-structure synchronization.
+ * cond_resched_tasks_rcu_qs(), or transition to usermode execution (as
+ * usermode execution is schedulable). As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-structure synchronization.
   *
   * See the description of call_rcu() for more detailed information on
   * memory ordering guarantees.
@@ -700,8 +699,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);
   * grace period has elapsed, in other words after all currently
   * executing rcu-tasks read-side critical sections have elapsed.  These
   * read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * cond_resched_tasks_rcu_qs(), userspace execution (which is a schedulable
+ * context), and (in theory, anyway) cond_resched().
   *
   * This is a very specialized primitive, intended only for a few uses in
   * tracing and other situations requiring manipulation of function preambles
@@ -758,7 +757,7 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
  // 2.  Protects code in the idle loop, exception entry/exit, and
  //     CPU-hotplug code paths, similar to the capabilities of SRCU.
  //
-// 3.  Avoids expensive read-side instruction, having overhead similar
+// 3.  Avoids expensive read-side instructions, having overhead similar
  //     to that of Preemptible RCU.
  //
  // There are of course downsides.  The grace-period code can send IPIs to
@@ -848,7 +847,7 @@ static void rcu_read_unlock_iw(struct irq_work *iwp)
  static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
  
  /* If we are the last reader, wake up the grace-period kthread. */
-void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
+void rcu_read_unlock_trace_special(struct task_struct *t)
  {
         int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
  
@@ -858,7 +857,7 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
         // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
         if (nq)
                 WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
-       WRITE_ONCE(t->trc_reader_nesting, nesting);
+       WRITE_ONCE(t->trc_reader_nesting, 0);
         if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
                 irq_work_queue(&rcu_tasks_trace_iw);
  }
@@ -890,32 +889,24 @@ static void trc_read_check_handler(void *t_in)
  
         // If the task is no longer running on this CPU, leave.
         if (unlikely(texp != t)) {
-               if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
-                       wake_up(&trc_wait);
                 goto reset_ipi; // Already on holdout list, so will check later.
         }
  
         // If the task is not in a read-side critical section, and
         // if this is the last reader, awaken the grace-period kthread.
         if (likely(!READ_ONCE(t->trc_reader_nesting))) {
-               if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
-                       wake_up(&trc_wait);
-               // Mark as checked after decrement to avoid false
-               // positives on the above WARN_ON_ONCE().
                 WRITE_ONCE(t->trc_reader_checked, true);
                 goto reset_ipi;
         }
         // If we are racing with an rcu_read_unlock_trace(), try again later.
-       if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) {
-               if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
-                       wake_up(&trc_wait);
+       if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0))
                 goto reset_ipi;
-       }
         WRITE_ONCE(t->trc_reader_checked, true);
  
         // Get here if the task is in a read-side critical section.  Set
         // its state so that it will awaken the grace-period kthread upon
         // exit from that critical section.
+       atomic_inc(&trc_n_readers_need_end); // One more to wait on.
         WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
         WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
  
@@ -931,7 +922,7 @@ reset_ipi:
  static int trc_inspect_reader(struct task_struct *t, void *arg)
  {
         int cpu = task_cpu(t);
-       bool in_qs = false;
+       int nesting;
         bool ofl = cpu_is_offline(cpu);
  
         if (task_curr(t)) {
@@ -951,18 +942,18 @@ static int trc_inspect_reader(struct task_struct *t, void *arg)
                 n_heavy_reader_updates++;
                 if (ofl)
                         n_heavy_reader_ofl_updates++;
-               in_qs = true;
+               nesting = 0;
         } else {
                 // The task is not running, so C-language access is safe.
-               in_qs = likely(!t->trc_reader_nesting);
+               nesting = t->trc_reader_nesting;
         }
  
-       // Mark as checked so that the grace-period kthread will
-       // remove it from the holdout list.
-       t->trc_reader_checked = true;
-
-       if (in_qs)
-               return 0;  // Already in quiescent state, done!!!
+       // If not exiting a read-side critical section, mark as checked
+       // so that the grace-period kthread will remove it from the
+       // holdout list.
+       t->trc_reader_checked = nesting >= 0;
+       if (nesting <= 0)
+               return nesting ? -EINVAL : 0;  // If in QS, done, otherwise try again later.
  
         // The task is in a read-side critical section, so set up its
         // state so that it will awaken the grace-period kthread upon exit
@@ -1000,7 +991,7 @@ static void trc_wait_for_one_reader(struct task_struct *t,
  
         // If this task is not yet on the holdout list, then we are in
         // an RCU read-side critical section.  Otherwise, the invocation of
-       // rcu_add_holdout() that added it to the list did the necessary
+       // trc_add_holdout() that added it to the list did the necessary
         // get_task_struct().  Either way, the task cannot be freed out
         // from under this code.
  
@@ -1015,21 +1006,17 @@ static void trc_wait_for_one_reader(struct task_struct *t,
                 if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
                         return;
  
-               atomic_inc(&trc_n_readers_need_end);
                 per_cpu(trc_ipi_to_cpu, cpu) = true;
                 t->trc_ipi_to_cpu = cpu;
                 rcu_tasks_trace.n_ipis++;
-               if (smp_call_function_single(cpu,
-                                            trc_read_check_handler, t, 0)) {
+               if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
                         // Just in case there is some other reason for
                         // failure than the target CPU being offline.
+                       WARN_ONCE(1, "%s():  smp_call_function_single() failed for CPU: %d\n",
+                                 __func__, cpu);
                         rcu_tasks_trace.n_ipis_fails++;
                         per_cpu(trc_ipi_to_cpu, cpu) = false;
-                       t->trc_ipi_to_cpu = cpu;
-                       if (atomic_dec_and_test(&trc_n_readers_need_end)) {
-                               WARN_ON_ONCE(1);
-                               wake_up(&trc_wait);
-                       }
+                       t->trc_ipi_to_cpu = -1;
                 }
         }
  }
@@ -1099,9 +1086,9 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
         cpu = task_cpu(t);
         pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
                  t->pid,
-                ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
+                ".I"[READ_ONCE(t->trc_ipi_to_cpu) >= 0],
                  ".i"[is_idle_task(t)],
-                ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
+                ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
                  READ_ONCE(t->trc_reader_nesting),
                  " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)],
                  cpu);
@@ -1144,20 +1131,34 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
         cpus_read_unlock();
  
         if (needreport) {
-               if (firstreport)
+               if (*firstreport)
                         pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n");
                 show_stalled_ipi_trace();
         }
  }
  
+static void rcu_tasks_trace_empty_fn(void *unused)
+{
+}
+
  /* Wait for grace period to complete and provide ordering. */
  static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
  {
+       int cpu;
         bool firstreport;
         struct task_struct *g, *t;
         LIST_HEAD(holdouts);
         long ret;
  
+       // Wait for any lingering IPI handlers to complete.  Note that
+       // if a CPU has gone offline or transitioned to userspace in the
+       // meantime, all IPI handlers should have been drained beforehand.
+       // Yes, this assumes that CPUs process IPIs in order.  If that ever
+       // changes, there will need to be a recheck and/or timed wait.
+       for_each_online_cpu(cpu)
+               if (smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))
+                       smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
+
         // Remove the safety count.
         smp_mb__before_atomic();  // Order vs. earlier atomics
         atomic_dec(&trc_n_readers_need_end);
@@ -1200,7 +1201,7 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
         WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
         WRITE_ONCE(t->trc_reader_nesting, 0);
         if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
-               rcu_read_unlock_trace_special(t, 0);
+               rcu_read_unlock_trace_special(t);
  }
  
  /**
@@ -1208,15 +1209,11 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
   * @rhp: structure to be used for queueing the RCU updates.
   * @func: actual callback function to be invoked after the grace period
   *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_tasks_trace()
- * assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution.  As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-structure synchronization.
+ * The callback function will be invoked some time after a trace rcu-tasks
+ * grace period elapses, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have completed. These
+ * read-side critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
   *
   * See the description of call_rcu() for more detailed information on
   * memory ordering guarantees.
@@ -1232,7 +1229,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
   *
   * Control will return to the caller some time after a trace rcu-tasks
   * grace period has elapsed, in other words after all currently executing
- * rcu-tasks read-side critical sections have elapsed.  These read-side
+ * trace rcu-tasks read-side critical sections have elapsed. These read-side
   * critical sections are delimited by calls to rcu_read_lock_trace()
   * and rcu_read_unlock_trace().
   *