Merge branches 'bitmaprange.2021.05.10c', 'doc.2021.05.10c', 'fixes.2021.05.13a'...
author		Paul E. McKenney <paulmck@kernel.org>
		Tue, 18 May 2021 17:56:19 +0000 (10:56 -0700)
committer	Paul E. McKenney <paulmck@kernel.org>
		Tue, 18 May 2021 17:56:19 +0000 (10:56 -0700)
bitmaprange.2021.05.10c: Allow "all" for bitmap ranges.
doc.2021.05.10c: Documentation updates.
fixes.2021.05.13a: Miscellaneous fixes.
kvfree_rcu.2021.05.10c: kvfree_rcu() updates.
mmdumpobj.2021.05.10c: mem_dump_obj() updates.
nocb.2021.05.12a: RCU NOCB CPU updates, including limited deoffloading.
srcu.2021.05.12a: SRCU updates.
tasks.2021.05.18a: Tasks-RCU updates.
torture.2021.05.10c: Torture-test updates.

44 files changed:
Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
Documentation/admin-guide/kernel-parameters.rst
Documentation/admin-guide/kernel-parameters.txt
include/linux/rcupdate.h
include/linux/rcutiny.h
include/linux/rcutree.h
include/linux/srcu.h
include/linux/srcutree.h
init/main.c
kernel/locking/lockdep.c
kernel/rcu/Kconfig.debug
kernel/rcu/rcu.h
kernel/rcu/rcutorture.c
kernel/rcu/refscale.c
kernel/rcu/srcutree.c
kernel/rcu/tasks.h
kernel/rcu/tiny.c
kernel/rcu/tree.c
kernel/rcu/tree.h
kernel/rcu/tree_plugin.h
kernel/rcu/tree_stall.h
kernel/rcu/update.c
lib/bitmap.c
lib/test_bitmap.c
mm/oom_kill.c
mm/slab.h
mm/slab_common.c
mm/slub.c
mm/util.c
tools/rcu/rcu-cbs.py [new file with mode: 0644]
tools/testing/selftests/rcutorture/bin/kvm-again.sh
tools/testing/selftests/rcutorture/bin/kvm-build.sh
tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh [new file with mode: 0755]
tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
tools/testing/selftests/rcutorture/bin/kvm-remote.sh [new file with mode: 0755]
tools/testing/selftests/rcutorture/bin/kvm.sh
tools/testing/selftests/rcutorture/bin/torture.sh
tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST [new file with mode: 0644]
tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot [new file with mode: 0644]
tools/testing/selftests/rcutorture/configs/rcuscale/TREE
tools/testing/selftests/rcutorture/configs/rcuscale/TREE54
tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
tools/testing/selftests/rcutorture/configs/refscale/PREEMPT

index a648b42..11cdab0 100644 (file)
@@ -21,7 +21,7 @@ Any code that happens after the end of a given RCU grace period is guaranteed
 to see the effects of all accesses prior to the beginning of that grace
 period that are within RCU read-side critical sections.
 Similarly, any code that happens before the beginning of a given RCU grace
-period is guaranteed to see the effects of all accesses following the end
+period is guaranteed to not see the effects of all accesses following the end
 of that grace period that are within RCU read-side critical sections.
 
 Note well that RCU-sched read-side critical sections include any region
@@ -339,14 +339,14 @@ The diagram below shows the path of ordering if the leftmost
 leftmost ``rcu_node`` structure offlines its last CPU and if the next
 ``rcu_node`` structure has no online CPUs).
 
-.. kernel-figure:: TreeRCU-gp-init-1.svg
+.. kernel-figure:: TreeRCU-gp-init-2.svg
 
 The final ``rcu_gp_init()`` pass through the ``rcu_node`` tree traverses
 breadth-first, setting each ``rcu_node`` structure's ``->gp_seq`` field
 to the newly advanced value from the ``rcu_state`` structure, as shown
 in the following diagram.
 
-.. kernel-figure:: TreeRCU-gp-init-1.svg
+.. kernel-figure:: TreeRCU-gp-init-3.svg
 
 This change will also cause each CPU's next call to
 ``__note_gp_changes()`` to notice that a new grace period has started,
index 3996b54..01ba293 100644 (file)
@@ -76,6 +76,11 @@ to change, such as less cores in the CPU list, then N and any ranges using N
 will also change.  Use the same on a small 4 core system, and "16-N" becomes
 "16-3" and now the same boot input will be flagged as invalid (start > end).
 
+The special case-tolerant group name "all" has a meaning of selecting all CPUs,
+so that "nohz_full=all" is the equivalent of "nohz_full=0-N".
+
+The semantics of "N" and "all" is supported on a level of bitmaps and holds for
+all users of bitmap_parse().
 
 This document may not be entirely up to date and comprehensive. The command
 "modinfo -p ${modulename}" shows a current list of all parameters of a loadable
index cb89dbd..4405fd3 100644 (file)
                        whole algorithm to behave better in low memory
                        condition.
 
+       rcutree.rcu_delay_page_cache_fill_msec= [KNL]
+                       Set the page-cache refill delay (in milliseconds)
+                       in response to low-memory conditions.  The range
+                       of permitted values is in the range 0:100000.
+
        rcutree.jiffies_till_first_fqs= [KNL]
                        Set delay from grace-period initialization to
                        first attempt to force quiescent states.
index 9455476..d9680b7 100644 (file)
@@ -315,7 +315,7 @@ static inline int rcu_read_lock_any_held(void)
 #define RCU_LOCKDEP_WARN(c, s)                                         \
        do {                                                            \
                static bool __section(".data.unlikely") __warned;       \
-               if (debug_lockdep_rcu_enabled() && !__warned && (c)) {  \
+               if ((c) && debug_lockdep_rcu_enabled() && !__warned) {  \
                        __warned = true;                                \
                        lockdep_rcu_suspicious(__FILE__, __LINE__, s);  \
                }                                                       \
@@ -363,6 +363,20 @@ static inline void rcu_preempt_sleep_check(void) { }
 #define rcu_check_sparse(p, space)
 #endif /* #else #ifdef __CHECKER__ */
 
+/**
+ * unrcu_pointer - mark a pointer as not being RCU protected
+ * @p: pointer needing to lose its __rcu property
+ *
+ * Converts @p from an __rcu pointer to a __kernel pointer.
+ * This allows an __rcu pointer to be used with xchg() and friends.
+ */
+#define unrcu_pointer(p)                                               \
+({                                                                     \
+       typeof(*p) *_________p1 = (typeof(*p) *__force)(p);             \
+       rcu_check_sparse(p, __rcu);                                     \
+       ((typeof(*p) __force __kernel *)(_________p1));                 \
+})
+
 #define __rcu_access_pointer(p, space) \
 ({ \
        typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
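
Illustration (not part of this patch): a minimal usage sketch for the new unrcu_pointer() helper; struct foo, foo_gp, and foo_exchange() are invented names. It lets the __rcu-annotated result of xchg() be handed back as an ordinary kernel pointer:

#include <linux/atomic.h>
#include <linux/rcupdate.h>

struct foo {
        int a;
};

static struct foo __rcu *foo_gp;

static struct foo *foo_exchange(struct foo *newp)
{
        /* xchg() hands back an __rcu-annotated value; unrcu_pointer()
         * strips the annotation so the old pointer can be used directly. */
        return unrcu_pointer(xchg(&foo_gp, RCU_INITIALIZER(newp)));
}
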
@@ -518,7 +532,12 @@ do {                                                                             \
  * @p: The pointer to read, prior to dereferencing
  * @c: The conditions under which the dereference will take place
  *
- * This is the RCU-bh counterpart to rcu_dereference_check().
+ * This is the RCU-bh counterpart to rcu_dereference_check().  However,
+ * please note that starting in v5.0 kernels, vanilla RCU grace periods
+ * wait for local_bh_disable() regions of code in addition to regions of
+ * code demarked by rcu_read_lock() and rcu_read_unlock().  This means
+ * that synchronize_rcu(), call_rcu, and friends all take not only
+ * rcu_read_lock() but also rcu_read_lock_bh() into account.
  */
 #define rcu_dereference_bh_check(p, c) \
        __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)
@@ -529,6 +548,11 @@ do {                                                                             \
  * @c: The conditions under which the dereference will take place
  *
  * This is the RCU-sched counterpart to rcu_dereference_check().
+ * However, please note that starting in v5.0 kernels, vanilla RCU grace
+ * periods wait for preempt_disable() regions of code in addition to
+ * regions of code demarked by rcu_read_lock() and rcu_read_unlock().
+ * This means that synchronize_rcu(), call_rcu, and friends all take not
+ * only rcu_read_lock() but also rcu_read_lock_sched() into account.
  */
 #define rcu_dereference_sched_check(p, c) \
        __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
@@ -620,6 +644,12 @@ do {                                                                             \
  * sections, invocation of the corresponding RCU callback is deferred
  * until after the all the other CPUs exit their critical sections.
  *
+ * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also
+ * wait for regions of code with preemption disabled, including regions of
+ * code with interrupts or softirqs disabled.  In pre-v5.0 kernels, which
+ * define synchronize_sched(), only code enclosed within rcu_read_lock()
+ * and rcu_read_unlock() are guaranteed to be waited for.
+ *
  * Note, however, that RCU callbacks are permitted to run concurrently
  * with new RCU read-side critical sections.  One way that this can happen
  * is via the following sequence of events: (1) CPU 0 enters an RCU
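
Illustration (not part of this patch): a short, hedged sketch of the consolidated-grace-period behavior documented in the hunk above; demo_gp and both helpers are invented. In v5.0 and later, a preempt-disabled region is itself a read-side critical section that synchronize_rcu() waits for:

#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

static int __rcu *demo_gp;

static int demo_read(void)
{
        int *p;
        int v;

        preempt_disable();                      /* acts as an RCU reader in v5.0+ */
        p = rcu_dereference_sched(demo_gp);
        v = p ? *p : -1;
        preempt_enable();
        return v;
}

static void demo_update(int *newp)
{
        int *old = rcu_dereference_protected(demo_gp, 1);

        rcu_assign_pointer(demo_gp, newp);
        synchronize_rcu();                      /* also waits for the preempt-disabled reader */
        kfree(old);
}
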
@@ -672,33 +702,12 @@ static __always_inline void rcu_read_lock(void)
 /**
  * rcu_read_unlock() - marks the end of an RCU read-side critical section.
  *
- * In most situations, rcu_read_unlock() is immune from deadlock.
- * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock()
- * is responsible for deboosting, which it does via rt_mutex_unlock().
- * Unfortunately, this function acquires the scheduler's runqueue and
- * priority-inheritance spinlocks.  This means that deadlock could result
- * if the caller of rcu_read_unlock() already holds one of these locks or
- * any lock that is ever acquired while holding them.
- *
- * That said, RCU readers are never priority boosted unless they were
- * preempted.  Therefore, one way to avoid deadlock is to make sure
- * that preemption never happens within any RCU read-side critical
- * section whose outermost rcu_read_unlock() is called with one of
- * rt_mutex_unlock()'s locks held.  Such preemption can be avoided in
- * a number of ways, for example, by invoking preempt_disable() before
- * critical section's outermost rcu_read_lock().
- *
- * Given that the set of locks acquired by rt_mutex_unlock() might change
- * at any time, a somewhat more future-proofed approach is to make sure
- * that that preemption never happens within any RCU read-side critical
- * section whose outermost rcu_read_unlock() is called with irqs disabled.
- * This approach relies on the fact that rt_mutex_unlock() currently only
- * acquires irq-disabled locks.
- *
- * The second of these two approaches is best in most situations,
- * however, the first approach can also be useful, at least to those
- * developers willing to keep abreast of the set of locks acquired by
- * rt_mutex_unlock().
+ * In almost all situations, rcu_read_unlock() is immune from deadlock.
+ * In recent kernels that have consolidated synchronize_sched() and
+ * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity
+ * also extends to the scheduler's runqueue and priority-inheritance
+ * spinlocks, courtesy of the quiescent-state deferral that is carried
+ * out when rcu_read_unlock() is invoked with interrupts disabled.
  *
  * See rcu_read_lock() for more information.
  */
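
Illustration (not part of this patch): a hedged sketch of the deadlock immunity described above; struct qnode and qnode_update() are hypothetical. The outermost rcu_read_unlock() may run while an irq-disabled lock is held, because the quiescent state (and any deboost) is deferred:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct qnode {
        raw_spinlock_t lock;
        int state;
};

static void qnode_update(struct qnode *q, int new_state)
{
        unsigned long flags;

        rcu_read_lock();
        raw_spin_lock_irqsave(&q->lock, flags);
        q->state = new_state;
        rcu_read_unlock();      /* deferred QS/deboost makes this safe */
        raw_spin_unlock_irqrestore(&q->lock, flags);
}
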
@@ -714,9 +723,11 @@ static inline void rcu_read_unlock(void)
 /**
  * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
  *
- * This is equivalent of rcu_read_lock(), but also disables softirqs.
- * Note that anything else that disables softirqs can also serve as
- * an RCU read-side critical section.
+ * This is equivalent to rcu_read_lock(), but also disables softirqs.
+ * Note that anything else that disables softirqs can also serve as an RCU
+ * read-side critical section.  However, please note that this equivalence
+ * applies only to v5.0 and later.  Before v5.0, rcu_read_lock() and
+ * rcu_read_lock_bh() were unrelated.
  *
  * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
  * must occur in the same context, for example, it is illegal to invoke
@@ -749,9 +760,12 @@ static inline void rcu_read_unlock_bh(void)
 /**
  * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
  *
- * This is equivalent of rcu_read_lock(), but disables preemption.
- * Read-side critical sections can also be introduced by anything else
- * that disables preemption, including local_irq_disable() and friends.
+ * This is equivalent to rcu_read_lock(), but also disables preemption.
+ * Read-side critical sections can also be introduced by anything else that
+ * disables preemption, including local_irq_disable() and friends.  However,
+ * please note that the equivalence to rcu_read_lock() applies only to
+ * v5.0 and later.  Before v5.0, rcu_read_lock() and rcu_read_lock_sched()
+ * were unrelated.
  *
  * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
  * must occur in the same context, for example, it is illegal to invoke
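
Illustration (not part of this patch): a correspondingly small, hedged sketch for the _bh flavor discussed above; bh_val and bh_read() are invented. In v5.0 and later, plain synchronize_rcu() waits for this reader, just as it waits for any softirq-disabled region:

#include <linux/rcupdate.h>

static int __rcu *bh_val;

static int bh_read(void)
{
        int *p;
        int v;

        rcu_read_lock_bh();             /* also disables softirqs */
        p = rcu_dereference_bh(bh_val);
        v = p ? *p : -1;
        rcu_read_unlock_bh();
        return v;
}
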
index 35e0be3..953e70f 100644 (file)
@@ -86,7 +86,6 @@ static inline void rcu_irq_enter(void) { }
 static inline void rcu_irq_exit_irqson(void) { }
 static inline void rcu_irq_enter_irqson(void) { }
 static inline void rcu_irq_exit(void) { }
-static inline void rcu_irq_exit_preempt(void) { }
 static inline void rcu_irq_exit_check_preempt(void) { }
 #define rcu_is_idle_cpu(cpu) \
        (is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq())
index b89b541..53209d6 100644 (file)
@@ -49,7 +49,6 @@ void rcu_idle_enter(void);
 void rcu_idle_exit(void);
 void rcu_irq_enter(void);
 void rcu_irq_exit(void);
-void rcu_irq_exit_preempt(void);
 void rcu_irq_enter_irqson(void);
 void rcu_irq_exit_irqson(void);
 bool rcu_is_idle_cpu(int cpu);
index a0895bb..e6011a9 100644 (file)
@@ -64,6 +64,12 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
 unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
 bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);
 
+#ifdef CONFIG_SRCU
+void srcu_init(void);
+#else /* #ifdef CONFIG_SRCU */
+static inline void srcu_init(void) { }
+#endif /* #else #ifdef CONFIG_SRCU */
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 /**
index 9cfcc8a..cb1f435 100644 (file)
@@ -82,9 +82,7 @@ struct srcu_struct {
                                                /*  callback for the barrier */
                                                /*  operation. */
        struct delayed_work work;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map dep_map;
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 };
 
 /* Values for state variable (bottom bits of ->srcu_gp_seq). */
index eb01e12..7b6f49c 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/profile.h>
 #include <linux/kfence.h>
 #include <linux/rcupdate.h>
+#include <linux/srcu.h>
 #include <linux/moduleparam.h>
 #include <linux/kallsyms.h>
 #include <linux/writeback.h>
@@ -979,6 +980,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
        tick_init();
        rcu_init_nohz();
        init_timers();
+       srcu_init();
        hrtimers_init();
        softirq_init();
        timekeeping_init();
index 48d736a..d6c3c98 100644 (file)
@@ -6393,6 +6393,7 @@ asmlinkage __visible void lockdep_sys_exit(void)
 void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 {
        struct task_struct *curr = current;
+       int dl = READ_ONCE(debug_locks);
 
        /* Note: the following can be executed concurrently, so be careful. */
        pr_warn("\n");
@@ -6402,11 +6403,12 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
        pr_warn("-----------------------------\n");
        pr_warn("%s:%d %s!\n", file, line, s);
        pr_warn("\nother info that might help us debug this:\n\n");
-       pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+       pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
               !rcu_lockdep_current_cpu_online()
                        ? "RCU used illegally from offline CPU!\n"
                        : "",
-              rcu_scheduler_active, debug_locks);
+              rcu_scheduler_active, dl,
+              dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
 
        /*
         * If a CPU is in the RCU-free window in idle (ie: in the section
index 1942c1f..4fd6499 100644 (file)
@@ -116,7 +116,7 @@ config RCU_EQS_DEBUG
 
 config RCU_STRICT_GRACE_PERIOD
        bool "Provide debug RCU implementation with short grace periods"
-       depends on DEBUG_KERNEL && RCU_EXPERT
+       depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4
        default n
        select PREEMPT_COUNT if PREEMPT=n
        help
index bf0827d..24b5f2c 100644 (file)
@@ -308,6 +308,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
        }
 }
 
+extern void rcu_init_geometry(void);
+
 /* Returns a pointer to the first leaf rcu_node structure. */
 #define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1])
 
@@ -422,12 +424,6 @@ do {                                                                       \
 
 #endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
 
-#ifdef CONFIG_SRCU
-void srcu_init(void);
-#else /* #ifdef CONFIG_SRCU */
-static inline void srcu_init(void) { }
-#endif /* #else #ifdef CONFIG_SRCU */
-
 #ifdef CONFIG_TINY_RCU
 /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
 static inline bool rcu_gp_is_normal(void) { return true; }
@@ -441,7 +437,11 @@ bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
 void rcu_expedite_gp(void);
 void rcu_unexpedite_gp(void);
 void rcupdate_announce_bootup_oddness(void);
+#ifdef CONFIG_TASKS_RCU_GENERIC
 void show_rcu_tasks_gp_kthreads(void);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void show_rcu_tasks_gp_kthreads(void) {}
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
 void rcu_request_urgent_qs_task(struct task_struct *t);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
@@ -519,6 +519,7 @@ static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
 static inline unsigned long
 srcu_batches_completed(struct srcu_struct *sp) { return 0; }
 static inline void rcu_force_quiescent_state(void) { }
+static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
 static inline void show_rcu_gp_kthreads(void) { }
 static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
 static inline void rcu_fwd_progress_check(unsigned long j) { }
@@ -527,6 +528,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
 unsigned long rcu_get_gp_seq(void);
 unsigned long rcu_exp_batches_completed(void);
 unsigned long srcu_batches_completed(struct srcu_struct *sp);
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
 void show_rcu_gp_kthreads(void);
 int rcu_get_gp_kthreads_prio(void);
 void rcu_fwd_progress_check(unsigned long j);
index 29d2f4c..ec69273 100644 (file)
@@ -245,12 +245,6 @@ static const char *rcu_torture_writer_state_getname(void)
        return rcu_torture_writer_state_names[i];
 }
 
-#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT)
-# define rcu_can_boost() 1
-#else
-# define rcu_can_boost() 0
-#endif
-
 #ifdef CONFIG_RCU_TRACE
 static u64 notrace rcu_trace_clock_local(void)
 {
@@ -331,6 +325,7 @@ struct rcu_torture_ops {
        void (*read_delay)(struct torture_random_state *rrsp,
                           struct rt_read_seg *rtrsp);
        void (*readunlock)(int idx);
+       int (*readlock_held)(void);
        unsigned long (*get_gp_seq)(void);
        unsigned long (*gp_diff)(unsigned long new, unsigned long old);
        void (*deferred_free)(struct rcu_torture *p);
@@ -345,6 +340,7 @@ struct rcu_torture_ops {
        void (*fqs)(void);
        void (*stats)(void);
        void (*gp_kthread_dbg)(void);
+       bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
        int (*stall_dur)(void);
        int irq_capable;
        int can_boost;
@@ -359,6 +355,11 @@ static struct rcu_torture_ops *cur_ops;
  * Definitions for rcu torture testing.
  */
 
+static int torture_readlock_not_held(void)
+{
+       return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
+}
+
 static int rcu_torture_read_lock(void) __acquires(RCU)
 {
        rcu_read_lock();
@@ -483,30 +484,32 @@ static void rcu_sync_torture_init(void)
 }
 
 static struct rcu_torture_ops rcu_ops = {
-       .ttype          = RCU_FLAVOR,
-       .init           = rcu_sync_torture_init,
-       .readlock       = rcu_torture_read_lock,
-       .read_delay     = rcu_read_delay,
-       .readunlock     = rcu_torture_read_unlock,
-       .get_gp_seq     = rcu_get_gp_seq,
-       .gp_diff        = rcu_seq_diff,
-       .deferred_free  = rcu_torture_deferred_free,
-       .sync           = synchronize_rcu,
-       .exp_sync       = synchronize_rcu_expedited,
-       .get_gp_state   = get_state_synchronize_rcu,
-       .start_gp_poll  = start_poll_synchronize_rcu,
-       .poll_gp_state  = poll_state_synchronize_rcu,
-       .cond_sync      = cond_synchronize_rcu,
-       .call           = call_rcu,
-       .cb_barrier     = rcu_barrier,
-       .fqs            = rcu_force_quiescent_state,
-       .stats          = NULL,
-       .gp_kthread_dbg = show_rcu_gp_kthreads,
-       .stall_dur      = rcu_jiffies_till_stall_check,
-       .irq_capable    = 1,
-       .can_boost      = rcu_can_boost(),
-       .extendables    = RCUTORTURE_MAX_EXTEND,
-       .name           = "rcu"
+       .ttype                  = RCU_FLAVOR,
+       .init                   = rcu_sync_torture_init,
+       .readlock               = rcu_torture_read_lock,
+       .read_delay             = rcu_read_delay,
+       .readunlock             = rcu_torture_read_unlock,
+       .readlock_held          = torture_readlock_not_held,
+       .get_gp_seq             = rcu_get_gp_seq,
+       .gp_diff                = rcu_seq_diff,
+       .deferred_free          = rcu_torture_deferred_free,
+       .sync                   = synchronize_rcu,
+       .exp_sync               = synchronize_rcu_expedited,
+       .get_gp_state           = get_state_synchronize_rcu,
+       .start_gp_poll          = start_poll_synchronize_rcu,
+       .poll_gp_state          = poll_state_synchronize_rcu,
+       .cond_sync              = cond_synchronize_rcu,
+       .call                   = call_rcu,
+       .cb_barrier             = rcu_barrier,
+       .fqs                    = rcu_force_quiescent_state,
+       .stats                  = NULL,
+       .gp_kthread_dbg         = show_rcu_gp_kthreads,
+       .check_boost_failed     = rcu_check_boost_fail,
+       .stall_dur              = rcu_jiffies_till_stall_check,
+       .irq_capable            = 1,
+       .can_boost              = IS_ENABLED(CONFIG_RCU_BOOST),
+       .extendables            = RCUTORTURE_MAX_EXTEND,
+       .name                   = "rcu"
 };
 
 /*
@@ -540,6 +543,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
        .readlock       = rcu_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = rcu_torture_read_unlock,
+       .readlock_held  = torture_readlock_not_held,
        .get_gp_seq     = rcu_no_completed,
        .deferred_free  = rcu_busted_torture_deferred_free,
        .sync           = synchronize_rcu_busted,
@@ -589,6 +593,11 @@ static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
        srcu_read_unlock(srcu_ctlp, idx);
 }
 
+static int torture_srcu_read_lock_held(void)
+{
+       return srcu_read_lock_held(srcu_ctlp);
+}
+
 static unsigned long srcu_torture_completed(void)
 {
        return srcu_batches_completed(srcu_ctlp);
@@ -646,6 +655,7 @@ static struct rcu_torture_ops srcu_ops = {
        .readlock       = srcu_torture_read_lock,
        .read_delay     = srcu_read_delay,
        .readunlock     = srcu_torture_read_unlock,
+       .readlock_held  = torture_srcu_read_lock_held,
        .get_gp_seq     = srcu_torture_completed,
        .deferred_free  = srcu_torture_deferred_free,
        .sync           = srcu_torture_synchronize,
@@ -681,6 +691,7 @@ static struct rcu_torture_ops srcud_ops = {
        .readlock       = srcu_torture_read_lock,
        .read_delay     = srcu_read_delay,
        .readunlock     = srcu_torture_read_unlock,
+       .readlock_held  = torture_srcu_read_lock_held,
        .get_gp_seq     = srcu_torture_completed,
        .deferred_free  = srcu_torture_deferred_free,
        .sync           = srcu_torture_synchronize,
@@ -700,6 +711,7 @@ static struct rcu_torture_ops busted_srcud_ops = {
        .readlock       = srcu_torture_read_lock,
        .read_delay     = rcu_read_delay,
        .readunlock     = srcu_torture_read_unlock,
+       .readlock_held  = torture_srcu_read_lock_held,
        .get_gp_seq     = srcu_torture_completed,
        .deferred_free  = srcu_torture_deferred_free,
        .sync           = srcu_torture_synchronize,
@@ -787,6 +799,7 @@ static struct rcu_torture_ops trivial_ops = {
        .readlock       = rcu_torture_read_lock_trivial,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = rcu_torture_read_unlock_trivial,
+       .readlock_held  = torture_readlock_not_held,
        .get_gp_seq     = rcu_no_completed,
        .sync           = synchronize_rcu_trivial,
        .exp_sync       = synchronize_rcu_trivial,
@@ -850,6 +863,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
        .readlock       = tasks_tracing_torture_read_lock,
        .read_delay     = srcu_read_delay,  /* just reuse srcu's version. */
        .readunlock     = tasks_tracing_torture_read_unlock,
+       .readlock_held  = rcu_read_lock_trace_held,
        .get_gp_seq     = rcu_no_completed,
        .deferred_free  = rcu_tasks_tracing_torture_deferred_free,
        .sync           = synchronize_rcu_tasks_trace,
@@ -871,32 +885,13 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
        return cur_ops->gp_diff(new, old);
 }
 
-static bool __maybe_unused torturing_tasks(void)
-{
-       return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops;
-}
-
 /*
  * RCU torture priority-boost testing.  Runs one real-time thread per
- * CPU for moderate bursts, repeatedly registering RCU callbacks and
- * spinning waiting for them to be invoked.  If a given callback takes
- * too long to be invoked, we assume that priority inversion has occurred.
+ * CPU for moderate bursts, repeatedly starting grace periods and waiting
+ * for them to complete.  If a given grace period takes too long, we assume
+ * that priority inversion has occurred.
  */
 
-struct rcu_boost_inflight {
-       struct rcu_head rcu;
-       int inflight;
-};
-
-static void rcu_torture_boost_cb(struct rcu_head *head)
-{
-       struct rcu_boost_inflight *rbip =
-               container_of(head, struct rcu_boost_inflight, rcu);
-
-       /* Ensure RCU-core accesses precede clearing ->inflight */
-       smp_store_release(&rbip->inflight, 0);
-}
-
 static int old_rt_runtime = -1;
 
 static void rcu_torture_disable_rt_throttle(void)
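
Illustration (not part of this patch): the rewritten boost test drives RCU's polled grace-period interface rather than posting callbacks. A stripped-down, hedged sketch of that start/poll pattern (the timeout handling here is invented) looks like this:

#include <linux/jiffies.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static bool demo_wait_one_gp(unsigned long timeout_jiffies)
{
        unsigned long cookie = start_poll_synchronize_rcu();    /* kick off a GP */
        unsigned long deadline = jiffies + timeout_jiffies;

        while (!poll_state_synchronize_rcu(cookie)) {   /* has that GP ended? */
                if (time_after(jiffies, deadline))
                        return false;                   /* GP is taking too long */
                schedule_timeout_uninterruptible(1);
        }
        return true;
}
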
@@ -923,49 +918,68 @@ static void rcu_torture_enable_rt_throttle(void)
        old_rt_runtime = -1;
 }
 
-static bool rcu_torture_boost_failed(unsigned long start, unsigned long end)
+static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
 {
+       int cpu;
        static int dbg_done;
-
-       if (end - start > test_boost_duration * HZ - HZ / 2) {
+       unsigned long end = jiffies;
+       bool gp_done;
+       unsigned long j;
+       static unsigned long last_persist;
+       unsigned long lp;
+       unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
+
+       if (end - *start > mininterval) {
+               // Recheck after checking time to avoid false positives.
+               smp_mb(); // Time check before grace-period check.
+               if (cur_ops->poll_gp_state(gp_state))
+                       return false; // passed, though perhaps just barely
+               if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) {
+                       // At most one persisted message per boost test.
+                       j = jiffies;
+                       lp = READ_ONCE(last_persist);
+                       if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
+                               pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+                       return false; // passed on a technicality
+               }
                VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
                n_rcu_torture_boost_failure++;
-               if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg)
+               if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
+                       pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
+                               current->rt_priority, gp_state, end - *start);
                        cur_ops->gp_kthread_dbg();
+                       // Recheck after print to flag grace period ending during splat.
+                       gp_done = cur_ops->poll_gp_state(gp_state);
+                       pr_info("Boost inversion: GP %lu %s.\n", gp_state,
+                               gp_done ? "ended already" : "still pending");
 
-               return true; /* failed */
+               }
+
+               return true; // failed
+       } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
+               *start = jiffies;
        }
 
-       return false; /* passed */
+       return false; // passed
 }
 
 static int rcu_torture_boost(void *arg)
 {
-       unsigned long call_rcu_time;
        unsigned long endtime;
+       unsigned long gp_state;
+       unsigned long gp_state_time;
        unsigned long oldstarttime;
-       struct rcu_boost_inflight rbi = { .inflight = 0 };
 
        VERBOSE_TOROUT_STRING("rcu_torture_boost started");
 
        /* Set real-time priority. */
        sched_set_fifo_low(current);
 
-       init_rcu_head_on_stack(&rbi.rcu);
        /* Each pass through the following loop does one boost-test cycle. */
        do {
                bool failed = false; // Test failed already in this test interval
-               bool firsttime = true;
+               bool gp_initiated = false;
 
-               /* Increment n_rcu_torture_boosts once per boost-test */
-               while (!kthread_should_stop()) {
-                       if (mutex_trylock(&boost_mutex)) {
-                               n_rcu_torture_boosts++;
-                               mutex_unlock(&boost_mutex);
-                               break;
-                       }
-                       schedule_timeout_uninterruptible(1);
-               }
                if (kthread_should_stop())
                        goto checkwait;
 
@@ -979,33 +993,33 @@ static int rcu_torture_boost(void *arg)
                                goto checkwait;
                }
 
-               /* Do one boost-test interval. */
+               // Do one boost-test interval.
                endtime = oldstarttime + test_boost_duration * HZ;
                while (time_before(jiffies, endtime)) {
-                       /* If we don't have a callback in flight, post one. */
-                       if (!smp_load_acquire(&rbi.inflight)) {
-                               /* RCU core before ->inflight = 1. */
-                               smp_store_release(&rbi.inflight, 1);
-                               cur_ops->call(&rbi.rcu, rcu_torture_boost_cb);
-                               /* Check if the boost test failed */
-                               if (!firsttime && !failed)
-                                       failed = rcu_torture_boost_failed(call_rcu_time, jiffies);
-                               call_rcu_time = jiffies;
-                               firsttime = false;
+                       // Has current GP gone too long?
+                       if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+                               failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
+                       // If we don't have a grace period in flight, start one.
+                       if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
+                               gp_state = cur_ops->start_gp_poll();
+                               gp_initiated = true;
+                               gp_state_time = jiffies;
                        }
-                       if (stutter_wait("rcu_torture_boost"))
+                       if (stutter_wait("rcu_torture_boost")) {
                                sched_set_fifo_low(current);
+                               // If the grace period already ended,
+                               // we don't know when that happened, so
+                               // start over.
+                               if (cur_ops->poll_gp_state(gp_state))
+                                       gp_initiated = false;
+                       }
                        if (torture_must_stop())
                                goto checkwait;
                }
 
-               /*
-                * If boost never happened, then inflight will always be 1, in
-                * this case the boost check would never happen in the above
-                * loop so do another one here.
-                */
-               if (!firsttime && !failed && smp_load_acquire(&rbi.inflight))
-                       rcu_torture_boost_failed(call_rcu_time, jiffies);
+               // In case the grace period extended beyond the end of the loop.
+               if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+                       rcu_torture_boost_failed(gp_state, &gp_state_time);
 
                /*
                 * Set the start time of the next test interval.
@@ -1014,11 +1028,12 @@ static int rcu_torture_boost(void *arg)
                 * interval.  Besides, we are running at RT priority,
                 * so delays should be relatively rare.
                 */
-               while (oldstarttime == boost_starttime &&
-                      !kthread_should_stop()) {
+               while (oldstarttime == boost_starttime && !kthread_should_stop()) {
                        if (mutex_trylock(&boost_mutex)) {
-                               boost_starttime = jiffies +
-                                                 test_boost_interval * HZ;
+                               if (oldstarttime == boost_starttime) {
+                                       boost_starttime = jiffies + test_boost_interval * HZ;
+                                       n_rcu_torture_boosts++;
+                               }
                                mutex_unlock(&boost_mutex);
                                break;
                        }
@@ -1030,15 +1045,11 @@ checkwait:      if (stutter_wait("rcu_torture_boost"))
                        sched_set_fifo_low(current);
        } while (!torture_must_stop());
 
-       while (smp_load_acquire(&rbi.inflight))
-               schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks.
-
        /* Clean up and exit. */
-       while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
+       while (!kthread_should_stop()) {
                torture_shutdown_absorb("rcu_torture_boost");
                schedule_timeout_uninterruptible(1);
        }
-       destroy_rcu_head_on_stack(&rbi.rcu);
        torture_kthread_stopping("rcu_torture_boost");
        return 0;
 }
@@ -1553,11 +1564,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
        started = cur_ops->get_gp_seq();
        ts = rcu_trace_clock_local();
        p = rcu_dereference_check(rcu_torture_current,
-                                 rcu_read_lock_bh_held() ||
-                                 rcu_read_lock_sched_held() ||
-                                 srcu_read_lock_held(srcu_ctlp) ||
-                                 rcu_read_lock_trace_held() ||
-                                 torturing_tasks());
+                                 !cur_ops->readlock_held || cur_ops->readlock_held());
        if (p == NULL) {
                /* Wait for rcu_torture_writer to get underway */
                rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
@@ -1861,48 +1868,49 @@ rcu_torture_stats(void *arg)
                torture_shutdown_absorb("rcu_torture_stats");
        } while (!torture_must_stop());
        torture_kthread_stopping("rcu_torture_stats");
-
-       {
-               struct rcu_head *rhp;
-               struct kmem_cache *kcp;
-               static int z;
-
-               kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
-               rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
-               pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
-               pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
-               mem_dump_obj(ZERO_SIZE_PTR);
-               pr_alert("mem_dump_obj(NULL):");
-               mem_dump_obj(NULL);
-               pr_alert("mem_dump_obj(%px):", &rhp);
-               mem_dump_obj(&rhp);
-               pr_alert("mem_dump_obj(%px):", rhp);
-               mem_dump_obj(rhp);
-               pr_alert("mem_dump_obj(%px):", &rhp->func);
-               mem_dump_obj(&rhp->func);
-               pr_alert("mem_dump_obj(%px):", &z);
-               mem_dump_obj(&z);
-               kmem_cache_free(kcp, rhp);
-               kmem_cache_destroy(kcp);
-               rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
-               pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
-               pr_alert("mem_dump_obj(kmalloc %px):", rhp);
-               mem_dump_obj(rhp);
-               pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
-               mem_dump_obj(&rhp->func);
-               kfree(rhp);
-               rhp = vmalloc(4096);
-               pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
-               pr_alert("mem_dump_obj(vmalloc %px):", rhp);
-               mem_dump_obj(rhp);
-               pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
-               mem_dump_obj(&rhp->func);
-               vfree(rhp);
-       }
-
        return 0;
 }
 
+/* Test mem_dump_obj() and friends.  */
+static void rcu_torture_mem_dump_obj(void)
+{
+       struct rcu_head *rhp;
+       struct kmem_cache *kcp;
+       static int z;
+
+       kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+       rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+       pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
+       pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
+       mem_dump_obj(ZERO_SIZE_PTR);
+       pr_alert("mem_dump_obj(NULL):");
+       mem_dump_obj(NULL);
+       pr_alert("mem_dump_obj(%px):", &rhp);
+       mem_dump_obj(&rhp);
+       pr_alert("mem_dump_obj(%px):", rhp);
+       mem_dump_obj(rhp);
+       pr_alert("mem_dump_obj(%px):", &rhp->func);
+       mem_dump_obj(&rhp->func);
+       pr_alert("mem_dump_obj(%px):", &z);
+       mem_dump_obj(&z);
+       kmem_cache_free(kcp, rhp);
+       kmem_cache_destroy(kcp);
+       rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+       pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+       pr_alert("mem_dump_obj(kmalloc %px):", rhp);
+       mem_dump_obj(rhp);
+       pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
+       mem_dump_obj(&rhp->func);
+       kfree(rhp);
+       rhp = vmalloc(4096);
+       pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+       pr_alert("mem_dump_obj(vmalloc %px):", rhp);
+       mem_dump_obj(rhp);
+       pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
+       mem_dump_obj(&rhp->func);
+       vfree(rhp);
+}
+
 static void
 rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
 {
@@ -2634,7 +2642,7 @@ static bool rcu_torture_can_boost(void)
 
        if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
                return false;
-       if (!cur_ops->call)
+       if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)
                return false;
 
        prio = rcu_get_gp_kthreads_prio();
@@ -2642,7 +2650,7 @@ static bool rcu_torture_can_boost(void)
                return false;
 
        if (prio < 2) {
-               if (boost_warn_once  == 1)
+               if (boost_warn_once == 1)
                        return false;
 
                pr_alert("%s: WARN: RCU kthread priority too low to test boosting.  Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME);
@@ -2818,6 +2826,8 @@ rcu_torture_cleanup(void)
        if (cur_ops->cleanup != NULL)
                cur_ops->cleanup();
 
+       rcu_torture_mem_dump_obj();
+
        rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
 
        if (err_segs_recorded) {
@@ -3120,6 +3130,21 @@ rcu_torture_init(void)
                if (firsterr < 0)
                        goto unwind;
                rcutor_hp = firsterr;
+
+               // Testing RCU priority boosting requires rcutorture do
+               // some serious abuse.  Counter this by running ksoftirqd
+               // at higher priority.
+               if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+                       for_each_online_cpu(cpu) {
+                               struct sched_param sp;
+                               struct task_struct *t;
+
+                               t = per_cpu(ksoftirqd, cpu);
+                               WARN_ON_ONCE(!t);
+                               sp.sched_priority = 2;
+                               sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+                       }
+               }
        }
        shutdown_jiffies = jiffies + shutdown_secs * HZ;
        firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
index 02dd976..313d454 100644 (file)
@@ -362,6 +362,111 @@ static struct ref_scale_ops rwsem_ops = {
        .name           = "rwsem"
 };
 
+// Definitions for global spinlock
+static DEFINE_SPINLOCK(test_lock);
+
+static void ref_lock_section(const int nloops)
+{
+       int i;
+
+       preempt_disable();
+       for (i = nloops; i >= 0; i--) {
+               spin_lock(&test_lock);
+               spin_unlock(&test_lock);
+       }
+       preempt_enable();
+}
+
+static void ref_lock_delay_section(const int nloops, const int udl, const int ndl)
+{
+       int i;
+
+       preempt_disable();
+       for (i = nloops; i >= 0; i--) {
+               spin_lock(&test_lock);
+               un_delay(udl, ndl);
+               spin_unlock(&test_lock);
+       }
+       preempt_enable();
+}
+
+static struct ref_scale_ops lock_ops = {
+       .readsection    = ref_lock_section,
+       .delaysection   = ref_lock_delay_section,
+       .name           = "lock"
+};
+
+// Definitions for global irq-save spinlock
+
+static void ref_lock_irq_section(const int nloops)
+{
+       unsigned long flags;
+       int i;
+
+       preempt_disable();
+       for (i = nloops; i >= 0; i--) {
+               spin_lock_irqsave(&test_lock, flags);
+               spin_unlock_irqrestore(&test_lock, flags);
+       }
+       preempt_enable();
+}
+
+static void ref_lock_irq_delay_section(const int nloops, const int udl, const int ndl)
+{
+       unsigned long flags;
+       int i;
+
+       preempt_disable();
+       for (i = nloops; i >= 0; i--) {
+               spin_lock_irqsave(&test_lock, flags);
+               un_delay(udl, ndl);
+               spin_unlock_irqrestore(&test_lock, flags);
+       }
+       preempt_enable();
+}
+
+static struct ref_scale_ops lock_irq_ops = {
+       .readsection    = ref_lock_irq_section,
+       .delaysection   = ref_lock_irq_delay_section,
+       .name           = "lock-irq"
+};
+
+// Definitions acquire-release.
+static DEFINE_PER_CPU(unsigned long, test_acqrel);
+
+static void ref_acqrel_section(const int nloops)
+{
+       unsigned long x;
+       int i;
+
+       preempt_disable();
+       for (i = nloops; i >= 0; i--) {
+               x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+               smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+       }
+       preempt_enable();
+}
+
+static void ref_acqrel_delay_section(const int nloops, const int udl, const int ndl)
+{
+       unsigned long x;
+       int i;
+
+       preempt_disable();
+       for (i = nloops; i >= 0; i--) {
+               x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+               un_delay(udl, ndl);
+               smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+       }
+       preempt_enable();
+}
+
+static struct ref_scale_ops acqrel_ops = {
+       .readsection    = ref_acqrel_section,
+       .delaysection   = ref_acqrel_delay_section,
+       .name           = "acqrel"
+};
+
 static void rcu_scale_one_reader(void)
 {
        if (readdelay <= 0)
@@ -653,8 +758,8 @@ ref_scale_init(void)
        long i;
        int firsterr = 0;
        static struct ref_scale_ops *scale_ops[] = {
-               &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
-               &refcnt_ops, &rwlock_ops, &rwsem_ops,
+               &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops,
+               &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,
        };
 
        if (!torture_init_begin(scale_type, verbose))
index 036ff54..6833d88 100644 (file)
@@ -80,7 +80,7 @@ do {                                                                  \
  * srcu_read_unlock() running against them.  So if the is_static parameter
  * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
  */
-static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
+static void init_srcu_struct_nodes(struct srcu_struct *ssp)
 {
        int cpu;
        int i;
@@ -90,6 +90,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
        struct srcu_node *snp;
        struct srcu_node *snp_first;
 
+       /* Initialize geometry if it has not already been initialized. */
+       rcu_init_geometry();
+
        /* Work out the overall tree geometry. */
        ssp->level[0] = &ssp->node[0];
        for (i = 1; i < rcu_num_lvls; i++)
@@ -148,14 +151,6 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
                timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
                sdp->ssp = ssp;
                sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
-               if (is_static)
-                       continue;
-
-               /* Dynamically allocated, better be no srcu_read_locks()! */
-               for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
-                       sdp->srcu_lock_count[i] = 0;
-                       sdp->srcu_unlock_count[i] = 0;
-               }
        }
 }
 
@@ -179,7 +174,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
                ssp->sda = alloc_percpu(struct srcu_data);
        if (!ssp->sda)
                return -ENOMEM;
-       init_srcu_struct_nodes(ssp, is_static);
+       init_srcu_struct_nodes(ssp);
        ssp->srcu_gp_seq_needed_exp = 0;
        ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
        smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
@@ -1000,6 +995,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
  * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
  * passed the same srcu_struct structure.
  *
+ * Implementation of these memory-ordering guarantees is similar to
+ * that of synchronize_rcu().
+ *
  * If SRCU is likely idle, expedite the first request.  This semantic
  * was provided by Classic SRCU, and is relied upon by its users, so TREE
  * SRCU must also provide it.  Note that detecting idleness is heuristic
@@ -1392,11 +1390,15 @@ void __init srcu_init(void)
 {
        struct srcu_struct *ssp;
 
+       /*
+        * Once that is set, call_srcu() can follow the normal path and
+        * queue delayed work. This must follow RCU workqueues creation
+        * and timers initialization.
+        */
        srcu_init_done = true;
        while (!list_empty(&srcu_boot_list)) {
                ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
                                      work.work.entry);
-               check_init_srcu_struct(ssp);
                list_del_init(&ssp->work.work.entry);
                queue_work(rcu_gp_wq, &ssp->work.work);
        }
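
Illustration (not part of this patch): a hedged sketch of the early-boot case the srcu_init() comment above is about; all names here are invented. A statically defined srcu_struct may receive call_srcu() invocations before srcu_init() runs, and they are simply deferred via srcu_boot_list:

#include <linux/printk.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(early_srcu);
static struct rcu_head early_rh;

static void early_cb(struct rcu_head *rhp)
{
        pr_info("early SRCU callback ran\n");
}

static void post_early_srcu_callback(void)
{
        /* Legal even before srcu_init(): until then the srcu_struct sits
         * on srcu_boot_list and its work is queued once srcu_init() runs. */
        call_srcu(&early_srcu, &early_rh, early_cb);
}
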
index da906b7..03a118d 100644 (file)
@@ -377,6 +377,46 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 // Finally, this implementation does not support high call_rcu_tasks()
 // rates from multiple CPUs.  If this is required, per-CPU callback lists
 // will be needed.
+//
+// The implementation uses rcu_tasks_wait_gp(), which relies on function
+// pointers in the rcu_tasks structure.  The rcu_spawn_tasks_kthread()
+// function sets these function pointers up so that rcu_tasks_wait_gp()
+// invokes these functions in this order:
+//
+// rcu_tasks_pregp_step():
+//     Invokes synchronize_rcu() in order to wait for all in-flight
+//     t->on_rq and t->nvcsw transitions to complete.  This works because
+//     all such transitions are carried out with interrupts disabled.
+// rcu_tasks_pertask(), invoked on every non-idle task:
+//     For every runnable non-idle task other than the current one, use
+//     get_task_struct() to pin down that task, snapshot that task's
+//     number of voluntary context switches, and add that task to the
+//     holdout list.
+// rcu_tasks_postscan():
+//     Invoke synchronize_srcu() to ensure that all tasks that were
+//     in the process of exiting (and which thus might not know to
+//     synchronize with this RCU Tasks grace period) have completed
+//     exiting.
+// check_all_holdout_tasks(), repeatedly until holdout list is empty:
+//     Scans the holdout list, attempting to identify a quiescent state
+//     for each task on the list.  If there is a quiescent state, the
+//     corresponding task is removed from the holdout list.
+// rcu_tasks_postgp():
+//     Invokes synchronize_rcu() in order to ensure that all prior
+//     t->on_rq and t->nvcsw transitions are seen by all CPUs and tasks
+//     to have happened before the end of this RCU Tasks grace period.
+//     Again, this works because all such transitions are carried out
+//     with interrupts disabled.
+//
+// For each exiting task, the exit_tasks_rcu_start() and
+// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
+// read-side critical sections waited for by rcu_tasks_postscan().
+//
+// Pre-grace-period update-side code is ordered before the grace via the
+// ->cbs_lock and the smp_mb__after_spinlock().  Pre-grace-period read-side
+// code is ordered before the grace period via synchronize_rcu() call
+// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// disabling.
 
 /* Pre-grace-period preparation. */
 static void rcu_tasks_pregp_step(void)
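
Illustration (not part of this patch): to ground the walkthrough in the comment above, a hedged sketch of the classic Tasks-RCU use case; struct tramp and its helpers are hypothetical. A trampoline is freed only after every task has passed through a voluntary context switch:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct tramp {
        struct rcu_head rh;
        /* executable trampoline text would live here */
};

static void tramp_free_cb(struct rcu_head *rhp)
{
        kfree(container_of(rhp, struct tramp, rh));
}

static void tramp_retire(struct tramp *tp)
{
        /* Synchronous alternative: synchronize_rcu_tasks(); kfree(tp); */
        call_rcu_tasks(&tp->rh, tramp_free_cb);
}
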
@@ -605,8 +645,13 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
 // passing an empty function to schedule_on_each_cpu().  This approach
 // provides an asynchronous call_rcu_tasks_rude() API and batching
 // of concurrent calls to the synchronous synchronize_rcu_rude() API.
-// This sends IPIs far and wide and induces otherwise unnecessary context
-// switches on all online CPUs, whether idle or not.
+// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
+// and induces otherwise unnecessary context switches on all online CPUs,
+// whether idle or not.
+//
+// Callback handling is provided by the rcu_tasks_kthread() function.
+//
+// Ordering is provided by the scheduler's context-switch code.
 
 // Empty function to allow workqueues to force a context switch.
 static void rcu_tasks_be_rude(struct work_struct *work)
@@ -1356,5 +1401,4 @@ void __init rcu_init_tasks_generic(void)
 
 #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
 static inline void rcu_tasks_bootup_oddness(void) {}
-void show_rcu_tasks_gp_kthreads(void) {}
 #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
index c8a029f..340b3f8 100644 (file)
@@ -221,5 +221,4 @@ void __init rcu_init(void)
 {
        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
        rcu_early_boot_tests();
-       srcu_init();
 }
index ed1b546..28f1093 100644 (file)
@@ -186,6 +186,17 @@ module_param(rcu_unlock_delay, int, 0444);
 static int rcu_min_cached_objs = 5;
 module_param(rcu_min_cached_objs, int, 0444);
 
+// A page shrinker can ask for pages to be freed to make them
+// available for other parts of the system. This usually happens
+// under low memory conditions, and in that case we should also
+// defer page-cache filling for a short time period.
+//
+// The default value is 5 seconds, which is long enough to reduce
+// interference with the shrinker while it asks other systems to
+// drain their caches.
+static int rcu_delay_page_cache_fill_msec = 5000;
+module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
 /* Retrieve RCU kthreads priority for rcutorture */
 int rcu_get_gp_kthreads_prio(void)
 {
@@ -202,7 +213,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
  * the need for long delays to increase some race probabilities with the
  * need for fast grace periods to increase other race probabilities.
  */
-#define PER_RCU_NODE_PERIOD 3  /* Number of grace periods between delays. */
+#define PER_RCU_NODE_PERIOD 3  /* Number of grace periods between delays for debugging. */
 
 /*
  * Compute the mask of online CPUs for the specified rcu_node structure.
@@ -242,6 +253,7 @@ void rcu_softirq_qs(void)
 {
        rcu_qs();
        rcu_preempt_deferred_qs(current);
+       rcu_tasks_qs(current, false);
 }
 
 /*
@@ -833,28 +845,6 @@ void noinstr rcu_irq_exit(void)
        rcu_nmi_exit();
 }
 
-/**
- * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
- *                       towards in kernel preemption
- *
- * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
- * from RCU point of view. Invoked from return from interrupt before kernel
- * preemption.
- */
-void rcu_irq_exit_preempt(void)
-{
-       lockdep_assert_irqs_disabled();
-       rcu_nmi_exit();
-
-       RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
-                        "RCU dynticks_nesting counter underflow/zero!");
-       RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
-                        DYNTICK_IRQ_NONIDLE,
-                        "Bad RCU  dynticks_nmi_nesting counter\n");
-       RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
-                        "RCU in extended quiescent state!");
-}
-
 #ifdef CONFIG_PROVE_RCU
 /**
  * rcu_irq_exit_check_preempt - Validate that scheduling is possible
@@ -959,7 +949,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
  */
 void noinstr rcu_user_exit(void)
 {
-       rcu_eqs_exit(1);
+       rcu_eqs_exit(true);
 }
 
 /**
@@ -1225,7 +1215,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
 #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
 
 /*
- * We are reporting a quiescent state on behalf of some other CPU, so
+ * When trying to report a quiescent state on behalf of some other CPU,
  * it is our responsibility to check for and handle potential overflow
  * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
  * After all, the CPU might be in deep idle state, and thus executing no
@@ -2048,7 +2038,7 @@ static void rcu_gp_fqs_loop(void)
 /*
  * Clean up after the old grace period.
  */
-static void rcu_gp_cleanup(void)
+static noinline void rcu_gp_cleanup(void)
 {
        int cpu;
        bool needgp = false;
@@ -2629,7 +2619,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
  * state, for example, user mode or idle loop.  It also schedules RCU
  * core processing.  If the current grace period has gone on too long,
  * it will ask the scheduler to manufacture a context switch for the sole
- * purpose of providing a providing the needed quiescent state.
+ * purpose of providing the needed quiescent state.
  */
 void rcu_sched_clock_irq(int user)
 {
@@ -2911,7 +2901,6 @@ static int __init rcu_spawn_core_kthreads(void)
                  "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
        return 0;
 }
-early_initcall(rcu_spawn_core_kthreads);
 
 /*
  * Handle any core-RCU processing required by a call_rcu() invocation.
@@ -3082,12 +3071,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
  * period elapses, in other words after all pre-existing RCU read-side
  * critical sections have completed.  However, the callback function
  * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
- * may be nested.  In addition, regions of code across which interrupts,
- * preemption, or softirqs have been disabled also serve as RCU read-side
- * critical sections.  This includes hardware interrupt handlers, softirq
- * handlers, and NMI handlers.
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested.  In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections.  This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
  *
  * Note that all CPUs must agree that the grace period extended beyond
  * all pre-existing RCU read-side critical section.  On systems with more
@@ -3107,6 +3098,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
  * between the call to call_rcu() and the invocation of "func()" -- even
  * if CPU A and CPU B are the same CPU (but again only if the system has
  * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
  */
 void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
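As an illustration of the kernel-doc above, here is a minimal, hedged sketch of the usual call_rcu() pattern; struct foo and the function names are invented for the example and are not part of this patch:

    struct foo {
            struct rcu_head rh;
            int data;
    };

    /* Runs from RCU core context after a full grace period has elapsed. */
    static void foo_reclaim(struct rcu_head *rhp)
    {
            struct foo *fp = container_of(rhp, struct foo, rh);

            kfree(fp);
    }

    /* Caller must already have made fp unreachable to new readers. */
    static void foo_retire(struct foo *fp)
    {
            call_rcu(&fp->rh, foo_reclaim);
    }

Readers traversing the data structure under rcu_read_lock() may still hold fp when call_rcu() returns, which is exactly the window the grace period covers.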
@@ -3171,6 +3165,7 @@ struct kfree_rcu_cpu_work {
  *     Even though it is lockless an access has to be protected by the
  *     per-cpu lock.
  * @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
  * @work_in_progress: Indicates that page_cache_work is running
  * @hrtimer: A hrtimer for scheduling a page_cache_work
  * @nr_bkv_objs: number of allocated objects at @bkvcache.
@@ -3190,7 +3185,8 @@ struct kfree_rcu_cpu {
        bool initialized;
        int count;
 
-       struct work_struct page_cache_work;
+       struct delayed_work page_cache_work;
+       atomic_t backoff_page_cache_fill;
        atomic_t work_in_progress;
        struct hrtimer hrtimer;
 
@@ -3237,7 +3233,7 @@ get_cached_bnode(struct kfree_rcu_cpu *krcp)
        if (!krcp->nr_bkv_objs)
                return NULL;
 
-       krcp->nr_bkv_objs--;
+       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
        return (struct kvfree_rcu_bulk_data *)
                llist_del_first(&krcp->bkvcache);
 }
@@ -3251,14 +3247,33 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp,
                return false;
 
        llist_add((struct llist_node *) bnode, &krcp->bkvcache);
-       krcp->nr_bkv_objs++;
+       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
        return true;
+}
+
+static int
+drain_page_cache(struct kfree_rcu_cpu *krcp)
+{
+       unsigned long flags;
+       struct llist_node *page_list, *pos, *n;
+       int freed = 0;
 
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       page_list = llist_del_all(&krcp->bkvcache);
+       WRITE_ONCE(krcp->nr_bkv_objs, 0);
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+       llist_for_each_safe(pos, n, page_list) {
+               free_page((unsigned long)pos);
+               freed++;
+       }
+
+       return freed;
 }
 
 /*
  * This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bhead_free or ->head_free.
+ * It frees all the objects queued on ->bkvhead_free or ->head_free.
  */
 static void kfree_rcu_work(struct work_struct *work)
 {
@@ -3285,7 +3300,7 @@ static void kfree_rcu_work(struct work_struct *work)
        krwp->head_free = NULL;
        raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
-       // Handle two first channels.
+       // Handle the first two channels.
        for (i = 0; i < FREE_N_CHANNELS; i++) {
                for (; bkvhead[i]; bkvhead[i] = bnext) {
                        bnext = bkvhead[i]->next;
@@ -3323,9 +3338,11 @@ static void kfree_rcu_work(struct work_struct *work)
        }
 
        /*
-        * Emergency case only. It can happen under low memory
-        * condition when an allocation gets failed, so the "bulk"
-        * path can not be temporary maintained.
+        * This is used when the "bulk" path can not be used for the
+        * double-argument of kvfree_rcu().  This happens when the
+        * page-cache is empty, which means that objects are instead
+        * queued on a linked list through their rcu_head structures.
+        * This list is named "Channel 3".
         */
        for (; head; head = next) {
                unsigned long offset = (unsigned long)head->func;
@@ -3345,34 +3362,31 @@ static void kfree_rcu_work(struct work_struct *work)
 }
 
 /*
- * Schedule the kfree batch RCU work to run in workqueue context after a GP.
- *
- * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
- * timeout has been reached.
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
  */
-static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+static void kfree_rcu_monitor(struct work_struct *work)
 {
-       struct kfree_rcu_cpu_work *krwp;
-       bool repeat = false;
+       struct kfree_rcu_cpu *krcp = container_of(work,
+               struct kfree_rcu_cpu, monitor_work.work);
+       unsigned long flags;
        int i, j;
 
-       lockdep_assert_held(&krcp->lock);
+       raw_spin_lock_irqsave(&krcp->lock, flags);
 
+       // Attempt to start a new batch.
        for (i = 0; i < KFREE_N_BATCHES; i++) {
-               krwp = &(krcp->krw_arr[i]);
+               struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
 
-               /*
-                * Try to detach bkvhead or head and attach it over any
-                * available corresponding free channel. It can be that
-                * a previous RCU batch is in progress, it means that
-                * immediately to queue another one is not possible so
-                * return false to tell caller to retry.
-                */
+               // Try to detach bkvhead or head and attach it over any
+               // available corresponding free channel. It can be that
+               // a previous RCU batch is in progress, it means that
+               // immediately to queue another one is not possible so
+               // in that case the monitor work is rearmed.
                if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
                        (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
                                (krcp->head && !krwp->head_free)) {
-                       // Channel 1 corresponds to SLAB ptrs.
-                       // Channel 2 corresponds to vmalloc ptrs.
+                       // Channel 1 corresponds to the SLAB-pointer bulk path.
+                       // Channel 2 corresponds to vmalloc-pointer bulk path.
                        for (j = 0; j < FREE_N_CHANNELS; j++) {
                                if (!krwp->bkvhead_free[j]) {
                                        krwp->bkvhead_free[j] = krcp->bkvhead[j];
@@ -3380,7 +3394,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
                                }
                        }
 
-                       // Channel 3 corresponds to emergency path.
+                       // Channel 3 corresponds to both SLAB and vmalloc
+                       // objects queued on the linked list.
                        if (!krwp->head_free) {
                                krwp->head_free = krcp->head;
                                krcp->head = NULL;
@@ -3388,65 +3403,35 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 
                        WRITE_ONCE(krcp->count, 0);
 
-                       /*
-                        * One work is per one batch, so there are three
-                        * "free channels", the batch can handle. It can
-                        * be that the work is in the pending state when
-                        * channels have been detached following by each
-                        * other.
-                        */
+                       // One work is per one batch, so there are three
+                       // "free channels", the batch can handle. It can
+                       // be that the work is in the pending state when
+                       // channels have been detached following by each
+                       // other.
                        queue_rcu_work(system_wq, &krwp->rcu_work);
                }
-
-               // Repeat if any "free" corresponding channel is still busy.
-               if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
-                       repeat = true;
        }
 
-       return !repeat;
-}
-
-static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
-                                         unsigned long flags)
-{
-       // Attempt to start a new batch.
-       krcp->monitor_todo = false;
-       if (queue_kfree_rcu_work(krcp)) {
-               // Success! Our job is done here.
-               raw_spin_unlock_irqrestore(&krcp->lock, flags);
-               return;
-       }
+       // If there is nothing to detach, it means that our job is
+       // successfully done here. In case of having at least one
+       // of the channels that is still busy we should rearm the
+       // work to repeat an attempt. Because previous batches are
+       // still in progress.
+       if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
+               krcp->monitor_todo = false;
+       else
+               schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
 
-       // Previous RCU batch still in progress, try again later.
-       krcp->monitor_todo = true;
-       schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
        raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
-/*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
- * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
- */
-static void kfree_rcu_monitor(struct work_struct *work)
-{
-       unsigned long flags;
-       struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
-                                                monitor_work.work);
-
-       raw_spin_lock_irqsave(&krcp->lock, flags);
-       if (krcp->monitor_todo)
-               kfree_rcu_drain_unlock(krcp, flags);
-       else
-               raw_spin_unlock_irqrestore(&krcp->lock, flags);
-}
-
 static enum hrtimer_restart
 schedule_page_work_fn(struct hrtimer *t)
 {
        struct kfree_rcu_cpu *krcp =
                container_of(t, struct kfree_rcu_cpu, hrtimer);
 
-       queue_work(system_highpri_wq, &krcp->page_cache_work);
+       queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
        return HRTIMER_NORESTART;
 }
 
@@ -3455,12 +3440,16 @@ static void fill_page_cache_func(struct work_struct *work)
        struct kvfree_rcu_bulk_data *bnode;
        struct kfree_rcu_cpu *krcp =
                container_of(work, struct kfree_rcu_cpu,
-                       page_cache_work);
+                       page_cache_work.work);
        unsigned long flags;
+       int nr_pages;
        bool pushed;
        int i;
 
-       for (i = 0; i < rcu_min_cached_objs; i++) {
+       nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+               1 : rcu_min_cached_objs;
+
+       for (i = 0; i < nr_pages; i++) {
                bnode = (struct kvfree_rcu_bulk_data *)
                        __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 
@@ -3477,6 +3466,7 @@ static void fill_page_cache_func(struct work_struct *work)
        }
 
        atomic_set(&krcp->work_in_progress, 0);
+       atomic_set(&krcp->backoff_page_cache_fill, 0);
 }
 
 static void
@@ -3484,10 +3474,15 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
 {
        if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
                        !atomic_xchg(&krcp->work_in_progress, 1)) {
-               hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
-                       HRTIMER_MODE_REL);
-               krcp->hrtimer.function = schedule_page_work_fn;
-               hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+               if (atomic_read(&krcp->backoff_page_cache_fill)) {
+                       queue_delayed_work(system_wq,
+                               &krcp->page_cache_work,
+                                       msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+               } else {
+                       hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+                       krcp->hrtimer.function = schedule_page_work_fn;
+                       hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+               }
        }
 }
 
@@ -3552,11 +3547,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
 }
 
 /*
- * Queue a request for lazy invocation of appropriate free routine after a
- * grace period. Please note there are three paths are maintained, two are the
- * main ones that use array of pointers interface and third one is emergency
- * one, that is used only when the main path can not be maintained temporary,
- * due to memory pressure.
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period.  Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
  *
  * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
@@ -3645,6 +3640,8 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
                count += READ_ONCE(krcp->count);
+               count += READ_ONCE(krcp->nr_bkv_objs);
+               atomic_set(&krcp->backoff_page_cache_fill, 1);
        }
 
        return count;
@@ -3654,18 +3651,14 @@ static unsigned long
 kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
        int cpu, freed = 0;
-       unsigned long flags;
 
        for_each_possible_cpu(cpu) {
                int count;
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
                count = krcp->count;
-               raw_spin_lock_irqsave(&krcp->lock, flags);
-               if (krcp->monitor_todo)
-                       kfree_rcu_drain_unlock(krcp, flags);
-               else
-                       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+               count += drain_page_cache(krcp);
+               kfree_rcu_monitor(&krcp->monitor_work.work);
 
                sc->nr_to_scan -= count;
                freed += count;
@@ -3693,7 +3686,8 @@ void __init kfree_rcu_scheduler_running(void)
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
                raw_spin_lock_irqsave(&krcp->lock, flags);
-               if (!krcp->head || krcp->monitor_todo) {
+               if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
+                               krcp->monitor_todo) {
                        raw_spin_unlock_irqrestore(&krcp->lock, flags);
                        continue;
                }
@@ -3750,10 +3744,12 @@ static int rcu_blocking_is_gp(void)
  * read-side critical sections have completed.  Note, however, that
  * upon return from synchronize_rcu(), the caller might well be executing
  * concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting.  RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
+ * synchronize_rcu() was waiting.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested.  In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
  * sections.  This includes hardware interrupt handlers, softirq handlers,
  * and NMI handlers.
  *
@@ -3774,6 +3770,9 @@ static int rcu_blocking_is_gp(void)
  * to have executed a full memory barrier during the execution of
  * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
  * again only if the system has more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
  */
 void synchronize_rcu(void)
 {
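To make the guarantee above concrete, a hedged sketch of the classic updater pattern; struct cfg and the surrounding names are invented for the example:

    static DEFINE_MUTEX(cfg_mutex);        /* Serializes updaters. */
    static struct cfg __rcu *active_cfg;   /* struct cfg is assumed to exist. */

    static void publish_new_cfg(struct cfg *newp)
    {
            struct cfg *oldp;

            lockdep_assert_held(&cfg_mutex);
            oldp = rcu_replace_pointer(active_cfg, newp,
                                       lockdep_is_held(&cfg_mutex));
            synchronize_rcu();   /* Wait out all pre-existing readers. */
            kfree(oldp);         /* No reader can still reference oldp. */
    }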
@@ -3844,7 +3843,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
 /**
  * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
  *
- * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
  *
  * If a full RCU grace period has elapsed since the earlier call from
  * which oldstate was obtained, return @true, otherwise return @false.
@@ -3860,6 +3859,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
  * (many hours even on 32-bit systems) should check them occasionally
  * and either refresh them or set a flag indicating that the grace period
  * has completed.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
  */
 bool poll_state_synchronize_rcu(unsigned long oldstate)
 {
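A hedged sketch of how the polling interfaces are typically combined; the caller shown here is invented for illustration:

    static unsigned long cleanup_gp_cookie;

    static void cleanup_defer(void)
    {
            /* Snapshot the grace-period state, starting a GP if needed. */
            cleanup_gp_cookie = start_poll_synchronize_rcu();
    }

    static bool cleanup_may_run(void)
    {
            /* True once a full grace period has elapsed since the snapshot. */
            return poll_state_synchronize_rcu(cleanup_gp_cookie);
    }

A caller that prefers to block instead can hand the same cookie to cond_synchronize_rcu(), documented in the next hunk.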
@@ -3874,7 +3878,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 /**
  * cond_synchronize_rcu - Conditionally wait for an RCU grace period
  *
- * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
  *
  * If a full RCU grace period has elapsed since the earlier call to
  * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
@@ -3884,6 +3888,11 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
  * counter wrap is harmless.  If the counter wraps, we have waited for
  * more than 2 billion grace periods (and way more on a 64-bit system!),
  * so waiting for one additional grace period should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
  */
 void cond_synchronize_rcu(unsigned long oldstate)
 {
@@ -4189,7 +4198,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
        rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
        trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-       rcu_prepare_kthreads(cpu);
+       rcu_spawn_one_boost_kthread(rnp);
        rcu_spawn_cpu_nocb_kthread(cpu);
        WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
 
@@ -4472,6 +4481,7 @@ static int __init rcu_spawn_gp_kthread(void)
        wake_up_process(t);
        rcu_spawn_nocb_kthreads();
        rcu_spawn_boost_kthreads();
+       rcu_spawn_core_kthreads();
        return 0;
 }
 early_initcall(rcu_spawn_gp_kthread);
@@ -4582,11 +4592,25 @@ static void __init rcu_init_one(void)
  * replace the definitions in tree.h because those are needed to size
  * the ->node array in the rcu_state structure.
  */
-static void __init rcu_init_geometry(void)
+void rcu_init_geometry(void)
 {
        ulong d;
        int i;
+       static unsigned long old_nr_cpu_ids;
        int rcu_capacity[RCU_NUM_LVLS];
+       static bool initialized;
+
+       if (initialized) {
+               /*
+                * Warn if setup_nr_cpu_ids() had not yet been invoked,
+                * unless nr_cpu_ids == NR_CPUS, in which case who cares?
+                */
+               WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
+               return;
+       }
+
+       old_nr_cpu_ids = nr_cpu_ids;
+       initialized = true;
 
        /*
         * Initialize any unspecified boot parameters.
@@ -4687,6 +4711,18 @@ static void __init kfree_rcu_batch_init(void)
        int cpu;
        int i;
 
+       /* Clamp it to [0:100] seconds interval. */
+       if (rcu_delay_page_cache_fill_msec < 0 ||
+               rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+               rcu_delay_page_cache_fill_msec =
+                       clamp(rcu_delay_page_cache_fill_msec, 0,
+                               (int) (100 * MSEC_PER_SEC));
+
+               pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+                       rcu_delay_page_cache_fill_msec);
+       }
+
        for_each_possible_cpu(cpu) {
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
@@ -4696,7 +4732,7 @@ static void __init kfree_rcu_batch_init(void)
                }
 
                INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
-               INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+               INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
                krcp->initialized = true;
        }
        if (register_shrinker(&kfree_rcu_shrinker))
@@ -4730,12 +4766,11 @@ void __init rcu_init(void)
                rcutree_online_cpu(cpu);
        }
 
-       /* Create workqueue for expedited GPs and for Tree SRCU. */
+       /* Create workqueue for Tree SRCU and for expedited GPs. */
        rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
        WARN_ON(!rcu_gp_wq);
        rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
        WARN_ON(!rcu_par_gp_wq);
-       srcu_init();
 
        /* Fill in default value for rcutree.qovld boot parameter. */
        /* -After- the rcu_node ->lock fields are initialized! */
index c1ed047..305cf6a 100644 (file)
@@ -115,6 +115,7 @@ struct rcu_node {
                                /*  boosting for this rcu_node structure. */
        unsigned int boost_kthread_status;
                                /* State of boost_kthread_task for tracing. */
+       unsigned long n_boosts; /* Number of boosts for this rcu_node structure. */
 #ifdef CONFIG_RCU_NOCB_CPU
        struct swait_queue_head nocb_gp_wq[2];
                                /* Place for rcu_nocb_kthread() to wait GP. */
@@ -416,8 +417,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
 static bool rcu_is_callbacks_kthread(void);
 static void rcu_cpu_kthread_setup(unsigned int cpu);
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
 static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_prepare_kthreads(int cpu);
 static void rcu_cleanup_after_idle(void);
 static void rcu_prepare_for_idle(void);
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
index b0c3fb4..334eaf4 100644 (file)
@@ -1088,6 +1088,7 @@ static int rcu_boost(struct rcu_node *rnp)
        /* Lock only for side effect: boosts task t's priority. */
        rt_mutex_lock(&rnp->boost_mtx);
        rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
+       rnp->n_boosts++;
 
        return READ_ONCE(rnp->exp_tasks) != NULL ||
               READ_ONCE(rnp->boost_tasks) != NULL;
@@ -1187,22 +1188,16 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
  */
 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 {
-       int rnp_index = rnp - rcu_get_root();
        unsigned long flags;
+       int rnp_index = rnp - rcu_get_root();
        struct sched_param sp;
        struct task_struct *t;
 
-       if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
-               return;
-
-       if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
+       if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
                return;
 
        rcu_state.boost = 1;
 
-       if (rnp->boost_kthread_task != NULL)
-               return;
-
        t = kthread_create(rcu_boost_kthread, (void *)rnp,
                           "rcub/%d", rnp_index);
        if (WARN_ON_ONCE(IS_ERR(t)))
@@ -1254,17 +1249,8 @@ static void __init rcu_spawn_boost_kthreads(void)
        struct rcu_node *rnp;
 
        rcu_for_each_leaf_node(rnp)
-               rcu_spawn_one_boost_kthread(rnp);
-}
-
-static void rcu_prepare_kthreads(int cpu)
-{
-       struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-       struct rcu_node *rnp = rdp->mynode;
-
-       /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
-       if (rcu_scheduler_fully_active)
-               rcu_spawn_one_boost_kthread(rnp);
+               if (rcu_rnp_online_cpus(rnp))
+                       rcu_spawn_one_boost_kthread(rnp);
 }
 
 #else /* #ifdef CONFIG_RCU_BOOST */
@@ -1284,15 +1270,15 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
 {
 }
 
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 {
 }
 
-static void __init rcu_spawn_boost_kthreads(void)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 {
 }
 
-static void rcu_prepare_kthreads(int cpu)
+static void __init rcu_spawn_boost_kthreads(void)
 {
 }
 
@@ -1525,13 +1511,10 @@ static void rcu_cleanup_after_idle(void)
 static int __init rcu_nocb_setup(char *str)
 {
        alloc_bootmem_cpumask_var(&rcu_nocb_mask);
-       if (!strcasecmp(str, "all"))            /* legacy: use "0-N" instead */
+       if (cpulist_parse(str, rcu_nocb_mask)) {
+               pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
                cpumask_setall(rcu_nocb_mask);
-       else
-               if (cpulist_parse(str, rcu_nocb_mask)) {
-                       pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
-                       cpumask_setall(rcu_nocb_mask);
-               }
+       }
        return 1;
 }
 __setup("rcu_nocbs=", rcu_nocb_setup);
@@ -1952,7 +1935,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 }
 
 /*
- * Awaken the no-CBs grace-period kthead if needed, either due to it
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
  * legitimately being asleep or due to overload conditions.
  *
  * If warranted, also wake up the kthread servicing this CPUs queues.
index 59b95cc..f4152aa 100644 (file)
@@ -314,6 +314,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
  * tasks blocked within RCU read-side critical sections.
  */
 static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+       __releases(rnp->lock)
 {
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        return 0;
@@ -716,6 +717,63 @@ static void check_cpu_stall(struct rcu_data *rdp)
 // RCU forward-progress mechanisms, including of callback invocation.
 
 
+/*
+ * Check to see if a failure to end RCU priority inversion was due to
+ * a CPU not passing through a quiescent state.  When this happens, there
+ * is nothing that RCU priority boosting can do to help, so we shouldn't
+ * count this as an RCU priority boosting failure.  A return of true says
+ * RCU priority boosting is to blame, and false says otherwise.  If false
+ * is returned, the first of the CPUs to blame is stored through cpup.
+ * If there was no CPU blocking the current grace period, but also nothing
+ * in need of being boosted, *cpup is set to -1.  This can happen in case
+ * of vCPU preemption while the last CPU is reporting its quiescent state,
+ * for example.
+ *
+ * If cpup is NULL, then a lockless quick check is carried out, suitable
+ * for high-rate usage.  On the other hand, if cpup is non-NULL, each
+ * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
+ */
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+{
+       bool atb = false;
+       int cpu;
+       unsigned long flags;
+       struct rcu_node *rnp;
+
+       rcu_for_each_leaf_node(rnp) {
+               if (!cpup) {
+                       if (READ_ONCE(rnp->qsmask)) {
+                               return false;
+                       } else {
+                               if (READ_ONCE(rnp->gp_tasks))
+                                       atb = true;
+                               continue;
+                       }
+               }
+               *cpup = -1;
+               raw_spin_lock_irqsave_rcu_node(rnp, flags);
+               if (rnp->gp_tasks)
+                       atb = true;
+               if (!rnp->qsmask) {
+                       // No CPUs without quiescent states for this rnp.
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+                       continue;
+               }
+               // Find the first holdout CPU.
+               for_each_leaf_node_possible_cpu(rnp, cpu) {
+                       if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+                               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+                               *cpup = cpu;
+                               return false;
+                       }
+               }
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+       }
+       // Can't blame CPUs, so must blame RCU priority boosting.
+       return atb;
+}
+EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
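A hedged sketch of how a caller, for example a torture test evaluating boost effectiveness, might consume this interface; the wrapper below is invented for illustration:

    static void report_boost_outcome(unsigned long gp_state)
    {
            int cpu;

            if (rcu_check_boost_fail(gp_state, &cpu))
                    pr_err("RCU priority boosting failed to end the inversion\n");
            else if (cpu == -1)
                    pr_info("Nothing blocking the grace period and nothing to boost\n");
            else
                    pr_info("CPU %d has not passed through a quiescent state\n", cpu);
    }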
+
 /*
  * Show the state of the grace-period kthreads.
  */
@@ -726,6 +784,7 @@ void show_rcu_gp_kthreads(void)
        unsigned long j;
        unsigned long ja;
        unsigned long jr;
+       unsigned long js;
        unsigned long jw;
        struct rcu_data *rdp;
        struct rcu_node *rnp;
@@ -734,21 +793,30 @@ void show_rcu_gp_kthreads(void)
        j = jiffies;
        ja = j - data_race(rcu_state.gp_activity);
        jr = j - data_race(rcu_state.gp_req_activity);
+       js = j - data_race(rcu_state.gp_start);
        jw = j - data_race(rcu_state.gp_wake_time);
-       pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+       pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
                rcu_state.name, gp_state_getname(rcu_state.gp_state),
-               rcu_state.gp_state, t ? t->state : 0x1ffffL,
-               ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
+               rcu_state.gp_state, t ? t->state : 0x1ffffL, t ? t->rt_priority : 0xffU,
+               js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
                (long)data_race(rcu_state.gp_seq),
                (long)data_race(rcu_get_root()->gp_seq_needed),
+               data_race(rcu_state.gp_max),
                data_race(rcu_state.gp_flags));
        rcu_for_each_node_breadth_first(rnp) {
-               if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
-                                READ_ONCE(rnp->gp_seq_needed)))
+               if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
+                   !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) &&
+                   !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks))
                        continue;
-               pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
-                       rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq),
-                       (long)data_race(rnp->gp_seq_needed));
+               pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
+                       rnp->grplo, rnp->grphi,
+                       (long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed),
+                       data_race(rnp->qsmask),
+                       ".b"[!!data_race(rnp->boost_kthread_task)],
+                       ".B"[!!data_race(rnp->boost_tasks)],
+                       ".E"[!!data_race(rnp->exp_tasks)],
+                       ".G"[!!data_race(rnp->gp_tasks)],
+                       data_race(rnp->n_boosts));
                if (!rcu_is_leaf_node(rnp))
                        continue;
                for_each_leaf_node_possible_cpu(rnp, cpu) {
index b95ae86..c21b38c 100644 (file)
@@ -277,7 +277,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
 
 noinstr int notrace debug_lockdep_rcu_enabled(void)
 {
-       return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
+       return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) &&
               current->lockdep_recursion == 0;
 }
 EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -524,6 +524,7 @@ static void test_callback(struct rcu_head *r)
 }
 
 DEFINE_STATIC_SRCU(early_srcu);
+static unsigned long early_srcu_cookie;
 
 struct early_boot_kfree_rcu {
        struct rcu_head rh;
@@ -536,8 +537,10 @@ static void early_boot_test_call_rcu(void)
        struct early_boot_kfree_rcu *rhp;
 
        call_rcu(&head, test_callback);
-       if (IS_ENABLED(CONFIG_SRCU))
+       if (IS_ENABLED(CONFIG_SRCU)) {
+               early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
                call_srcu(&early_srcu, &shead, test_callback);
+       }
        rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
        if (!WARN_ON_ONCE(!rhp))
                kfree_rcu(rhp, rh);
@@ -563,6 +566,7 @@ static int rcu_verify_early_boot_tests(void)
                if (IS_ENABLED(CONFIG_SRCU)) {
                        early_boot_test_counter++;
                        srcu_barrier(&early_srcu);
+                       WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
                }
        }
        if (rcu_self_test_counter != early_boot_test_counter) {
index 74ceb02..6e29b2a 100644 (file)
@@ -581,6 +581,14 @@ static const char *bitmap_parse_region(const char *str, struct region *r)
 {
        unsigned int lastbit = r->nbits - 1;
 
+       if (!strncasecmp(str, "all", 3)) {
+               r->start = 0;
+               r->end = lastbit;
+               str += 3;
+
+               goto check_pattern;
+       }
+
        str = bitmap_getnum(str, &r->start, lastbit);
        if (IS_ERR(str))
                return str;
@@ -595,6 +603,7 @@ static const char *bitmap_parse_region(const char *str, struct region *r)
        if (IS_ERR(str))
                return str;
 
+check_pattern:
        if (end_of_region(*str))
                goto no_pattern;
 
index 9cd5755..4ea73f5 100644 (file)
@@ -366,6 +366,13 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = {
        {0, "0-31:1/3,1-31:1/3,2-31:1/3",       &exp1[8 * step], 32, 0},
        {0, "1-10:8/12,8-31:24/29,0-31:0/3",    &exp1[9 * step], 32, 0},
 
+       {0,       "all",                &exp1[8 * step], 32, 0},
+       {0,       "0, 1, all,  ",       &exp1[8 * step], 32, 0},
+       {0,       "all:1/2",            &exp1[4 * step], 32, 0},
+       {0,       "ALL:1/2",            &exp1[4 * step], 32, 0},
+       {-EINVAL, "al", NULL, 8, 0},
+       {-EINVAL, "alll", NULL, 8, 0},
+
        {-EINVAL, "-1", NULL, 8, 0},
        {-EINVAL, "-0", NULL, 8, 0},
        {-EINVAL, "10-1", NULL, 8, 0},
index eefd3f5..54527de 100644 (file)
@@ -922,7 +922,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
                        continue;
                }
                /*
-                * No kthead_use_mm() user needs to read from the userspace so
+                * No kthread_use_mm() user needs to read from the userspace so
                 * we are ok to reap it.
                 */
                if (unlikely(p->flags & PF_KTHREAD))
index 18c1927..7189daa 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -640,6 +640,7 @@ struct kmem_obj_info {
        struct kmem_cache *kp_slab_cache;
        void *kp_ret;
        void *kp_stack[KS_ADDRS_COUNT];
+       void *kp_free_stack[KS_ADDRS_COUNT];
 };
 void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
 #endif
index f8833d3..92e3aa7 100644 (file)
@@ -564,7 +564,7 @@ EXPORT_SYMBOL_GPL(kmem_valid_obj);
  * depends on the type of object and on how much debugging is enabled.
  * For a slab-cache object, the fact that it is a slab object is printed,
  * and, if available, the slab name, return address, and stack trace from
- * the allocation of that object.
+ * the allocation and last free path of that object.
  *
  * This function will splat if passed a pointer to a non-slab object.
  * If you are not sure what type of object you have, you should instead
@@ -609,6 +609,16 @@ void kmem_dump_obj(void *object)
                        break;
                pr_info("    %pS\n", kp.kp_stack[i]);
        }
+
+       if (kp.kp_free_stack[0])
+               pr_cont(" Free path:\n");
+
+       for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
+               if (!kp.kp_free_stack[i])
+                       break;
+               pr_info("    %pS\n", kp.kp_free_stack[i]);
+       }
+
 }
 EXPORT_SYMBOL_GPL(kmem_dump_obj);
 #endif
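A hedged sketch of the intended debugging usage; the wrapper is invented for the example:

    static void dump_suspect_pointer(void *p)
    {
            if (!kmem_valid_obj(p)) {
                    pr_info("%px is not a slab-cache object\n", p);
                    return;
            }
            /* With slub_debug/SLAB_STORE_USER, this now also shows the free path. */
            kmem_dump_obj(p);
    }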
index feda53a..deec894 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4002,6 +4002,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
            !(s->flags & SLAB_STORE_USER))
                return;
 #ifdef CONFIG_SLUB_DEBUG
+       objp = fixup_red_left(s, objp);
        trackp = get_track(s, objp, TRACK_ALLOC);
        kpp->kp_ret = (void *)trackp->addr;
 #ifdef CONFIG_STACKTRACE
@@ -4010,6 +4011,13 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
                if (!kpp->kp_stack[i])
                        break;
        }
+
+       trackp = get_track(s, objp, TRACK_FREE);
+       for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
+               kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
+               if (!kpp->kp_free_stack[i])
+                       break;
+       }
 #endif
 #endif
 }
index a8bf17f..0b6dd9d 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -983,7 +983,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
  * depends on the type of object and on how much debugging is enabled.
  * For example, for a slab-cache object, the slab name is printed, and,
  * if available, the return address and stack trace from the allocation
- * of that object.
+ * and last free path of that object.
  */
 void mem_dump_obj(void *object)
 {
diff --git a/tools/rcu/rcu-cbs.py b/tools/rcu/rcu-cbs.py
new file mode 100644 (file)
index 0000000..f8b461b
--- /dev/null
@@ -0,0 +1,46 @@
+#!/usr/bin/env drgn
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Dump out the number of RCU callbacks outstanding.
+#
+# On older kernels having multiple flavors of RCU, this dumps out the
+# number of callbacks for the most heavily used flavor.
+#
+# Usage: sudo drgn rcu-cbs.py
+#
+# Copyright (C) 2021 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+import sys
+import drgn
+from drgn import NULL, Object
+from drgn.helpers.linux import *
+
+def get_rdp0(prog):
+       try:
+               rdp0 = prog.variable('rcu_preempt_data', 'kernel/rcu/tree.c');
+       except LookupError:
+               rdp0 = NULL;
+
+       if rdp0 == NULL:
+               try:
+                       rdp0 = prog.variable('rcu_sched_data',
+                                            'kernel/rcu/tree.c');
+               except LookupError:
+                       rdp0 = NULL;
+
+       if rdp0 == NULL:
+               rdp0 = prog.variable('rcu_data', 'kernel/rcu/tree.c');
+       return rdp0.address_of_();
+
+rdp0 = get_rdp0(prog);
+
+# Sum up RCU callbacks.
+sum = 0;
+for cpu in for_each_possible_cpu(prog):
+       rdp = per_cpu_ptr(rdp0, cpu);
+       len = rdp.cblist.len.value_();
+       # print("CPU " + str(cpu) + " RCU callbacks: " + str(len));
+       sum += len;
+print("Number of RCU callbacks in flight: " + str(sum));
index 46e47a0..d8c8483 100755 (executable)
@@ -29,7 +29,7 @@ then
        echo "Usage: $scriptname /path/to/old/run [ options ]"
        exit 1
 fi
-if ! cp "$oldrun/batches" $T/batches.oldrun
+if ! cp "$oldrun/scenarios" $T/scenarios.oldrun
 then
        # Later on, can reconstitute this from console.log files.
        echo Prior run batches file does not exist: $oldrun/batches
@@ -143,6 +143,8 @@ then
        usage
 fi
 rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log
+touch "$rundir/log"
+echo $scriptname $args | tee -a "$rundir/log"
 echo $oldrun > "$rundir/re-run"
 if ! test -d "$rundir/../../bin"
 then
@@ -165,22 +167,12 @@ done
 grep '^#' $i | sed -e 's/^# //' > $T/qemu-cmd-settings
 . $T/qemu-cmd-settings
 
-grep -v '^#' $T/batches.oldrun | awk '
-BEGIN {
-       oldbatch = 1;
-}
-
+grep -v '^#' $T/scenarios.oldrun | awk '
 {
-       if (oldbatch != $1) {
-               print "kvm-test-1-run-batch.sh" curbatch;
-               curbatch = "";
-               oldbatch = $1;
-       }
-       curbatch = curbatch " " $2;
-}
-
-END {
-       print "kvm-test-1-run-batch.sh" curbatch
+       curbatch = "";
+       for (i = 2; i <= NF; i++)
+               curbatch = curbatch " " $i;
+       print "kvm-test-1-run-batch.sh" curbatch;
 }' > $T/runbatches.sh
 
 if test -n "$dryrun"
@@ -188,12 +180,5 @@ then
        echo ---- Dryrun complete, directory: $rundir | tee -a "$rundir/log"
 else
        ( cd "$rundir"; sh $T/runbatches.sh )
-       kcsan-collapse.sh "$rundir" | tee -a "$rundir/log"
-       echo | tee -a "$rundir/log"
-       echo ---- Results directory: $rundir | tee -a "$rundir/log"
-       kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1
-       ret=$?
-       cat $T/kvm-recheck.sh.out | tee -a "$rundir/log"
-       echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log"
-       exit $ret
+       kvm-end-run-stats.sh "$rundir" "$starttime"
 fi
index 115e182..5ad973d 100755 (executable)
@@ -40,8 +40,10 @@ if test $retval -gt 1
 then
        exit 2
 fi
-ncpus=`cpus2use.sh`
-make -j$ncpus $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1
+
+# Tell "make" to use double the number of real CPUs on the build system.
+ncpus="`getconf _NPROCESSORS_ONLN`"
+make -j$((2 * ncpus)) $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1
 retval=$?
 if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | egrep -q "Stop|Error|error:|warning:" || egrep -q "Stop|Error|error:" < $resdir/Make.out
 then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh b/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh
new file mode 100755 (executable)
index 0000000..e4a0077
--- /dev/null
@@ -0,0 +1,40 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Check the status of the specified run.
+#
+# Usage: kvm-end-run-stats.sh /path/to/run starttime
+#
+# Copyright (C) 2021 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+# scriptname=$0
+# args="$*"
+rundir="$1"
+if ! test -d "$rundir"
+then
+       echo kvm-end-run-stats.sh: Specified run directory does not exist: $rundir
+       exit 1
+fi
+
+T=${TMPDIR-/tmp}/kvm-end-run-stats.sh.$$
+trap 'rm -rf $T' 0
+mkdir $T
+
+KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
+PATH=${KVM}/bin:$PATH; export PATH
+. functions.sh
+default_starttime="`get_starttime`"
+starttime="${2-$default_starttime}"
+
+echo | tee -a "$rundir/log"
+echo | tee -a "$rundir/log"
+echo " --- `date` Test summary:" | tee -a "$rundir/log"
+echo Results directory: $rundir | tee -a "$rundir/log"
+kcsan-collapse.sh "$rundir" | tee -a "$rundir/log"
+kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1
+ret=$?
+cat $T/kvm-recheck.sh.out | tee -a "$rundir/log"
+echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log"
+exit $ret
index 0670841..daf64b5 100755 (executable)
@@ -43,7 +43,7 @@ then
 else
        echo No build errors.
 fi
-if grep -q -e "--buildonly" < ${rundir}/log
+if grep -q -e "--build-\?only" < ${rundir}/log && ! test -f "${rundir}/remote-log"
 then
        echo Build-only run, no console logs to check.
        exit $editorret
index 1706cd4..fbdf162 100755 (executable)
@@ -31,7 +31,7 @@ then
        echo "$configfile ------- " $stopstate
 else
        title="$configfile ------- $ngps GPs"
-       dur=`sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null`
+       dur=`grep -v '^#' $i/qemu-cmd | sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//'`
        if test -z "$dur"
        then
                :
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
new file mode 100755 (executable)
index 0000000..79e680e
--- /dev/null
@@ -0,0 +1,249 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Run a series of tests on remote systems under KVM.
+#
+# Usage: kvm-remote.sh "systems" [ <kvm.sh args> ]
+#       kvm-remote.sh "systems" /path/to/old/run [ <kvm-again.sh args> ]
+#
+# Copyright (C) 2021 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+scriptname=$0
+args="$*"
+
+if ! test -d tools/testing/selftests/rcutorture/bin
+then
+       echo $scriptname must be run from top-level directory of kernel source tree.
+       exit 1
+fi
+
+KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
+PATH=${KVM}/bin:$PATH; export PATH
+. functions.sh
+
+starttime="`get_starttime`"
+
+systems="$1"
+if test -z "$systems"
+then
+       echo $scriptname: Empty list of systems will go nowhere good, giving up.
+       exit 1
+fi
+shift
+
+# Pathnames:
+# T:     /tmp/kvm-remote.sh.$$
+# resdir: /tmp/kvm-remote.sh.$$/res
+# rundir: /tmp/kvm-remote.sh.$$/res/$ds ("-remote" suffix)
+# oldrun: `pwd`/tools/testing/.../res/$otherds
+#
+# Pathname segments:
+# TD:    kvm-remote.sh.$$
+# ds:    yyyy.mm.dd-hh.mm.ss-remote
+
+TD=kvm-remote.sh.$$
+T=${TMPDIR-/tmp}/$TD
+trap 'rm -rf $T' 0
+mkdir $T
+
+resdir="$T/res"
+ds=`date +%Y.%m.%d-%H.%M.%S`-remote
+rundir=$resdir/$ds
+echo Results directory: $rundir
+echo $scriptname $args
+if echo $1 | grep -q '^--'
+then
+       # Fresh build.  Create a datestamp unless the caller supplied one.
+       datestamp="`echo "$@" | awk -v ds="$ds" '{
+               for (i = 1; i < NF; i++) {
+                       if ($i == "--datestamp") {
+                               ds = "";
+                               break;
+                       }
+               }
+               if (ds != "")
+                       print "--datestamp " ds;
+       }'`"
+       kvm.sh --remote "$@" $datestamp --buildonly > $T/kvm.sh.out 2>&1
+       ret=$?
+       if test "$ret" -ne 0
+       then
+               echo $scriptname: kvm.sh failed exit code $ret
+               cat $T/kvm.sh.out
+               exit 2
+       fi
+       oldrun="`grep -m 1 "^Results directory: " $T/kvm.sh.out | awk '{ print $3 }'`"
+       touch "$oldrun/remote-log"
+       echo $scriptname $args >> "$oldrun/remote-log"
+       echo | tee -a "$oldrun/remote-log"
+       echo " ----" kvm.sh output: "(`date`)" | tee -a "$oldrun/remote-log"
+       cat $T/kvm.sh.out | tee -a "$oldrun/remote-log"
+       # We are going to run this, so remove the buildonly files.
+       rm -f "$oldrun"/*/buildonly
+       kvm-again.sh $oldrun --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1
+       ret=$?
+       if test "$ret" -ne 0
+       then
+               echo $scriptname: kvm-again.sh failed exit code $ret | tee -a "$oldrun/remote-log"
+               cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log"
+               exit 2
+       fi
+else
+       # Re-use old run.
+       oldrun="$1"
+       if ! echo $oldrun | grep -q '^/'
+       then
+               oldrun="`pwd`/$oldrun"
+       fi
+       shift
+       touch "$oldrun/remote-log"
+       echo $scriptname $args >> "$oldrun/remote-log"
+       kvm-again.sh "$oldrun" "$@" --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1
+       ret=$?
+       if test "$ret" -ne 0
+       then
+               echo $scriptname: kvm-again.sh failed exit code $ret | tee -a "$oldrun/remote-log"
+               cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log"
+               exit 2
+       fi
+       cp -a "$rundir" "$KVM/res/"
+       oldrun="$KVM/res/$ds"
+fi
+echo | tee -a "$oldrun/remote-log"
+echo " ----" kvm-again.sh output: "(`date`)" | tee -a "$oldrun/remote-log"
+cat $T/kvm-again.sh.out
+echo | tee -a "$oldrun/remote-log"
+echo Remote run directory: $rundir | tee -a "$oldrun/remote-log"
+echo Local build-side run directory: $oldrun | tee -a "$oldrun/remote-log"
+
+# Create the kvm-remote-N.sh scripts in the bin directory.
+awk < "$rundir"/scenarios -v dest="$T/bin" -v rundir="$rundir" '
+{
+       n = $1;
+       sub(/\./, "", n);
+       fn = dest "/kvm-remote-" n ".sh"
+       scenarios = "";
+       for (i = 2; i <= NF; i++)
+               scenarios = scenarios " " $i;
+       print "kvm-test-1-run-batch.sh" scenarios > fn;
+       print "rm " rundir "/remote.run" >> fn;
+}'
+chmod +x $T/bin/kvm-remote-*.sh
+( cd "`dirname $T`"; tar -chzf $T/binres.tgz "$TD/bin" "$TD/res" )
+
+# Check first to avoid the need for cleanup for system-name typos
+for i in $systems
+do
+       ncpus="`ssh $i getconf _NPROCESSORS_ONLN 2> /dev/null`"
+       echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log"
+       ret=$?
+       if test "$ret" -ne 0
+       then
+               echo System $i unreachable, giving up. | tee -a "$oldrun/remote-log"
+               exit 4 | tee -a "$oldrun/remote-log"
+       fi
+done
+
+# Download and expand the tarball on all systems.
+for i in $systems
+do
+       echo Downloading tarball to $i `date` | tee -a "$oldrun/remote-log"
+       cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -"
+       ret=$?
+       if test "$ret" -ne 0
+       then
+               echo Unable to download $T/binres.tgz to system $i, giving up. | tee -a "$oldrun/remote-log"
+               exit 10 | tee -a "$oldrun/remote-log"
+       fi
+done
+
+# Function to check for presence of a file on the specified system.
+# Complain if the system cannot be reached, and retry after a wait.
+# Currently just waits forever if a machine disappears.
+#
+# Usage: checkremotefile system pathname
+checkremotefile () {
+       local ret
+       local sleeptime=60
+
+       while :
+       do
+               ssh $1 "test -f \"$2\""
+               ret=$?
+               if test "$ret" -ne 255
+               then
+                       return $ret
+               fi
+               echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date`
+               sleep $sleeptime
+       done
+}
+
+# Function to start batches on idle remote $systems
+#
+# Usage: startbatches curbatch nbatches
+#
+# Batches are numbered starting at 1.  Returns the next batch to start.
+# Be careful to redirect all debug output to FD 2 (stderr).
+startbatches () {
+       local curbatch="$1"
+       local nbatches="$2"
+       local ret
+
+       # Each pass through the following loop examines one system.
+       for i in $systems
+       do
+               if test "$curbatch" -gt "$nbatches"
+               then
+                       echo $((nbatches + 1))
+                       return 0
+               fi
+               if checkremotefile "$i" "$resdir/$ds/remote.run" 1>&2
+               then
+                       continue # System still running last test, skip.
+               fi
+               ssh "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2
+               ret=$?
+               if test "$ret" -ne 0
+               then
+                       echo ssh $i failed: exitcode $ret 1>&2
+                       exit 11
+               fi
+               echo " ----" System $i Batch `head -n $curbatch < "$rundir"/scenarios | tail -1` `date` 1>&2
+               curbatch=$((curbatch + 1))
+       done
+       echo $curbatch
+}
+
+# Launch all the scenarios.
+nbatches="`wc -l "$rundir"/scenarios | awk '{ print $1 }'`"
+curbatch=1
+while test "$curbatch" -le "$nbatches"
+do
+       startbatches $curbatch $nbatches > $T/curbatch 2> $T/startbatches.stderr
+       curbatch="`cat $T/curbatch`"
+       if test -s "$T/startbatches.stderr"
+       then
+               cat "$T/startbatches.stderr" | tee -a "$oldrun/remote-log"
+       fi
+       if test "$curbatch" -le "$nbatches"
+       then
+               sleep 30
+       fi
+done
+echo All batches started. `date`
+
+# Wait for all remaining scenarios to complete and collect results.
+for i in $systems
+do
+       while checkremotefile "$i" "$resdir/$ds/remote.run"
+       do
+               sleep 30
+       done
+       ( cd "$oldrun"; ssh $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu_pid */qemu-retval; rm -rf $T > /dev/null 2>&1" | tar -xzf - )
+done
+
+( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log"
+exit "`cat $T/exitcode`"
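As a usage illustration (host names invented), a fresh build-and-run might be invoked as kvm-remote.sh "host1 host2 host3" --cpus 64 --duration 4h, while re-running an existing build passes the old results directory in place of the kvm.sh arguments, matching the two usage forms in the header comment.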
index 6bf00a0..b4ac4ee 100755 (executable)
@@ -20,6 +20,9 @@ mkdir $T
 
 cd `dirname $scriptname`/../../../../../
 
+# This script knows only English.
+LANG=en_US.UTF-8; export LANG
+
 dur=$((30*60))
 dryrun=""
 KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
@@ -41,6 +44,7 @@ TORTURE_KCONFIG_KASAN_ARG=""
 TORTURE_KCONFIG_KCSAN_ARG=""
 TORTURE_KMAKE_ARG=""
 TORTURE_QEMU_MEM=512
+TORTURE_REMOTE=
 TORTURE_SHUTDOWN_GRACE=180
 TORTURE_SUITE=rcu
 TORTURE_MOD=rcutorture
@@ -64,7 +68,7 @@ usage () {
        echo "       --cpus N"
        echo "       --datestamp string"
        echo "       --defconfig string"
-       echo "       --dryrun batches|sched|script"
+       echo "       --dryrun batches|scenarios|sched|script"
        echo "       --duration minutes | <seconds>s | <hours>h | <days>d"
        echo "       --gdb"
        echo "       --help"
@@ -77,6 +81,7 @@ usage () {
        echo "       --no-initrd"
        echo "       --qemu-args qemu-arguments"
        echo "       --qemu-cmd qemu-system-..."
+       echo "       --remote"
        echo "       --results absolute-pathname"
        echo "       --torture lock|rcu|rcuscale|refscale|scf"
        echo "       --trust-make"
@@ -112,10 +117,13 @@ do
                checkarg --cpus "(number)" "$#" "$2" '^[0-9]*$' '^--'
                cpus=$2
                TORTURE_ALLOTED_CPUS="$2"
-               max_cpus="`identify_qemu_vcpus`"
-               if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus"
+               if test -z "$TORTURE_REMOTE"
                then
-                       TORTURE_ALLOTED_CPUS=$max_cpus
+                       max_cpus="`identify_qemu_vcpus`"
+                       if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus"
+                       then
+                               TORTURE_ALLOTED_CPUS=$max_cpus
+                       fi
                fi
                shift
                ;;
@@ -130,7 +138,7 @@ do
                shift
                ;;
        --dryrun)
-               checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|sched\|script' '^--'
+               checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|scenarios\|sched\|script' '^--'
                dryrun=$2
                shift
                ;;
@@ -206,6 +214,9 @@ do
                TORTURE_QEMU_CMD="$2"
                shift
                ;;
+       --remote)
+               TORTURE_REMOTE=1
+               ;;
        --results)
                checkarg --results "(absolute pathname)" "$#" "$2" '^/' '^error'
                resdir=$2
@@ -550,20 +561,7 @@ END {
        if (ncpus != 0)
                dump(first, i, batchnum);
 }' >> $T/script
-
-cat << '___EOF___' >> $T/script
-echo | tee -a $TORTURE_RESDIR/log
-echo | tee -a $TORTURE_RESDIR/log
-echo " --- `date` Test summary:" | tee -a $TORTURE_RESDIR/log
-___EOF___
-cat << ___EOF___ >> $T/script
-echo Results directory: $resdir/$ds | tee -a $resdir/$ds/log
-kcsan-collapse.sh $resdir/$ds | tee -a $resdir/$ds/log
-kvm-recheck.sh $resdir/$ds > $T/kvm-recheck.sh.out 2>&1
-___EOF___
-echo 'ret=$?' >> $T/script
-echo "cat $T/kvm-recheck.sh.out | tee -a $resdir/$ds/log" >> $T/script
-echo 'exit $ret' >> $T/script
+echo kvm-end-run-stats.sh "$resdir/$ds" "$starttime" >> $T/script
 
 # Extract the tests and their batches from the script.
 egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
@@ -577,6 +575,25 @@ egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
                print batchno, $1, $2
        }' > $T/batches
 
+# As above, but one line per batch, listing that batch's scenarios.
+grep -v '^#' $T/batches | awk '
+BEGIN {
+       oldbatch = 1;
+}
+
+{
+       if (oldbatch != $1) {
+               print ++n ". " curbatch;
+               curbatch = "";
+               oldbatch = $1;
+       }
+       curbatch = curbatch " " $2;
+}
+
+END {
+       print ++n ". " curbatch;
+}' > $T/scenarios
+
 if test "$dryrun" = script
 then
        cat $T/script
@@ -597,13 +614,17 @@ elif test "$dryrun" = batches
 then
        cat $T/batches
        exit 0
+elif test "$dryrun" = scenarios
+then
+       cat $T/scenarios
+       exit 0
 else
        # Not a dryrun.  Record the batches and the number of CPUs, then run the script.
        bash $T/script
        ret=$?
        cp $T/batches $resdir/$ds/batches
+       cp $T/scenarios $resdir/$ds/scenarios
        echo '#' cpus=$cpus >> $resdir/$ds/batches
-       echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log
        exit $ret
 fi
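
The new $T/scenarios file condenses $T/batches to one line per batch, each listing that batch's scenarios, and the added --dryrun scenarios mode simply prints that file and exits; note also that --remote suppresses the clamping of --cpus to the local CPU count, since the build host need not match the remote systems. As a rough illustration (the scenario names and batch contents below are invented, not taken from a real run), the new mode might be exercised like this:

    # Hypothetical invocation and output; real content depends on the
    # scenarios selected and on the number of CPUs allotted.
    $ tools/testing/selftests/rcutorture/bin/kvm.sh --cpus 16 --dryrun scenarios
    1.  TREE01 TREE02
    2.  TREE03
    3.  TINY01 TINY02 SRCU-P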
 
index 56e2e1a..53ec7c0 100755 (executable)
@@ -302,7 +302,7 @@ function torture_set {
                        kcsan_kmake_tag="--kmake-args"
                        cur_kcsan_kmake_args="$kcsan_kmake_args"
                fi
-               torture_one $* --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan
+               torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan
        fi
 }
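
The torture.sh hunk above replaces an unquoted $* with "$@" when forwarding arguments to torture_one, so that arguments containing spaces (such as a multi-option --kconfig string) reach the callee intact. A small self-contained demonstration of the difference, with illustrative names only:

    #!/bin/sh
    # Report how many arguments actually reach the callee.
    show_args () {
            echo "$# args:"
            for a in "$@"
            do
                    echo "  <$a>"
            done
    }
    forward_star () { show_args $*; }     # unquoted: re-splits on whitespace
    forward_at ()   { show_args "$@"; }   # quoted: preserves each argument
    forward_star --kconfig "CONFIG_A=y CONFIG_B=y"   # reports 3 args
    forward_at   --kconfig "CONFIG_A=y CONFIG_B=y"   # reports 2 args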
 
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST
new file mode 100644 (file)
index 0000000..22d598f
--- /dev/null
@@ -0,0 +1,17 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=16
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=y
+CONFIG_NO_HZ_IDLE=n
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_TRACE=y
+CONFIG_HOTPLUG_CPU=y
+CONFIG_RCU_FANOUT=2
+CONFIG_RCU_FANOUT_LEAF=2
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot
new file mode 100644 (file)
index 0000000..f57720c
--- /dev/null
@@ -0,0 +1,8 @@
+rcutorture.test_boost=2
+rcutorture.stutter=0
+rcutree.gp_preinit_delay=12
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
+rcutree.kthread_prio=2
+threadirqs
+tree.use_softirq=0
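
As with the existing BUSTED scenario, the name indicates a configuration that is expected to fail, here so that rcutorture's detection of priority-boost failures can itself be exercised. One plausible way to run it, assuming kvm.sh's usual --configs selection flag (not shown in the hunks above) and arbitrary CPU count and duration, would be:

    # Illustrative only; flag values are arbitrary.
    tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcu \
            --configs BUSTED-BOOST --cpus 16 --duration 10
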
index 721cfda..4cc1cc5 100644 (file)
@@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
 CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_HOTPLUG_CPU=n
+CONFIG_HOTPLUG_CPU=y
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
 CONFIG_RCU_NOCB_CPU=n
index 7629f5d..f595206 100644 (file)
@@ -8,7 +8,7 @@ CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
 CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_HOTPLUG_CPU=n
+CONFIG_HOTPLUG_CPU=y
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
 CONFIG_RCU_FANOUT=3
index 1cd25b7..ad505a8 100644 (file)
@@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
 CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_HOTPLUG_CPU=n
+CONFIG_HOTPLUG_CPU=y
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
 CONFIG_RCU_NOCB_CPU=n
index d10bc69..4f08e64 100644 (file)
@@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
 CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_HOTPLUG_CPU=n
+CONFIG_HOTPLUG_CPU=y
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
 CONFIG_RCU_NOCB_CPU=n