sched/wakeup: Prepare for RT sleeping spin/rwlocks

author Thomas Gleixner <tglx@linutronix.de>

Sun, 15 Aug 2021 21:27:44 +0000 (23:27 +0200)

committer Ingo Molnar <mingo@kernel.org>

Tue, 17 Aug 2021 14:49:02 +0000 (16:49 +0200)
author Thomas Gleixner <tglx@linutronix.de>
Sun, 15 Aug 2021 21:27:44 +0000 (23:27 +0200)
committer Ingo Molnar <mingo@kernel.org>
Tue, 17 Aug 2021 14:49:02 +0000 (16:49 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 4c72cf6..02714b9 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -143,9 +143,22 @@ struct task_group;
                 current->task_state_change = _THIS_IP_;                 \
         } while (0)
  
+/*
+ * Back up current->task_state_change in current->saved_state_change, then
+ * overwrite task_state_change with _THIS_IP_.
+ */
+# define debug_rtlock_wait_set_state()                                 \
+       do {                                                             \
+               current->saved_state_change = current->task_state_change;\
+               current->task_state_change = _THIS_IP_;                  \
+       } while (0)
+
+/*
+ * Copy current->saved_state_change back into current->task_state_change.
+ */
+# define debug_rtlock_wait_restore_state()                             \
+       do {                                                             \
+               current->task_state_change = current->saved_state_change;\
+       } while (0)
+
  #else
  # define debug_normal_state_change(cond)       do { } while (0)
  # define debug_special_state_change(cond)      do { } while (0)
+# define debug_rtlock_wait_set_state()         do { } while (0)
+# define debug_rtlock_wait_restore_state()     do { } while (0)
  #endif
  
  /*
@@ -213,6 +226,51 @@ struct task_group;
                 raw_spin_unlock_irqrestore(&current->pi_lock, flags);   \
         } while (0)
  
+/*
+ * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
+ *
+ * RT's spin/rwlock substitutions are state preserving. The state of the
+ * task when blocking on the lock is saved in task_struct::saved_state and
+ * restored after the lock has been acquired.  These operations are
+ * serialized by task_struct::pi_lock against try_to_wake_up(). Any wakeup
+ * not related to the RT lock which arrives while the task is blocked on
+ * the lock is redirected to operate on task_struct::saved_state so that
+ * it is not dropped. On restore, task_struct::saved_state is set to
+ * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
+ *
+ * The lock operation looks like this:
+ *
+ *     current_save_and_set_rtlock_wait_state();
+ *     for (;;) {
+ *             if (try_lock())
+ *                     break;
+ *             raw_spin_unlock_irq(&lock->wait_lock);
+ *             schedule_rtlock();
+ *             raw_spin_lock_irq(&lock->wait_lock);
+ *             set_current_state(TASK_RTLOCK_WAIT);
+ *     }
+ *     current_restore_rtlock_saved_state();
+ */
+/*
+ * Under current->pi_lock: copy current->__state to current->saved_state and
+ * set current->__state to TASK_RTLOCK_WAIT. Asserts that interrupts are
+ * disabled. Deliberately no trailing semicolon after while (0): the caller
+ * supplies it, so the macro expands to exactly one statement and stays
+ * usable in an unbraced if/else.
+ */
+#define current_save_and_set_rtlock_wait_state()                       \
+       do {                                                            \
+               lockdep_assert_irqs_disabled();                         \
+               raw_spin_lock(&current->pi_lock);                       \
+               current->saved_state = current->__state;                \
+               debug_rtlock_wait_set_state();                          \
+               WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);         \
+               raw_spin_unlock(&current->pi_lock);                     \
+       } while (0)
+
+/*
+ * Under current->pi_lock: write current->saved_state back to
+ * current->__state and reset current->saved_state to TASK_RUNNING. Asserts
+ * that interrupts are disabled. Deliberately no trailing semicolon after
+ * while (0): the caller supplies it, so the macro expands to exactly one
+ * statement and stays usable in an unbraced if/else.
+ */
+#define current_restore_rtlock_saved_state()                           \
+       do {                                                            \
+               lockdep_assert_irqs_disabled();                         \
+               raw_spin_lock(&current->pi_lock);                       \
+               debug_rtlock_wait_restore_state();                      \
+               WRITE_ONCE(current->__state, current->saved_state);     \
+               current->saved_state = TASK_RUNNING;                    \
+               raw_spin_unlock(&current->pi_lock);                     \
+       } while (0)
+
  #define get_current_state()    READ_ONCE(current->__state)
  
  /* Task command name length: */
@@ -668,6 +726,11 @@ struct task_struct {
  #endif
         unsigned int                    __state;
  
+#ifdef CONFIG_PREEMPT_RT
+       /* saved state for "spinlock sleepers" */
+       unsigned int                    saved_state;
+#endif
+
         /*
          * This begins the randomizable portion of task_struct. Only
          * scheduling-critical items should be added above here.
@@ -1357,6 +1420,9 @@ struct task_struct {
         struct kmap_ctrl                kmap_ctrl;
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
         unsigned long                   task_state_change;
+# ifdef CONFIG_PREEMPT_RT
+       unsigned long                   saved_state_change;
+# endif
  #endif
         int                             pagefault_disabled;
  #ifdef CONFIG_MMU
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 961991e..e407c6a 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3566,14 +3566,47 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
   *
   * The caller holds p::pi_lock if p != current or has preemption
   * disabled when p == current.
+ *
+ * The rules of PREEMPT_RT saved_state:
+ *
+ *   The related locking code always holds p::pi_lock when updating
+ *   p::saved_state, which means the code is fully serialized in both cases.
+ *
+ *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT with no
+ *   other bits set. This allows all wakeup scenarios to be distinguished.
   */
  static __always_inline
  bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
  {
+       /* A wakeup for TASK_RTLOCK_WAIT must not request any other state bit. */
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+               WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+                            state != TASK_RTLOCK_WAIT);
+       }
+
+       /* Direct match on p::__state: flag success and return true. */
         if (READ_ONCE(p->__state) & state) {
                 *success = 1;
                 return true;
         }
+
+#ifdef CONFIG_PREEMPT_RT
+       /*
+        * Saved state preserves the task state across blocking on
+        * an RT lock.  If the state matches, set p::saved_state to
+        * TASK_RUNNING, but do not wake the task because it waits
+        * for a lock wakeup. Also indicate success because from
+        * the regular waker's point of view this has succeeded.
+        *
+        * After acquiring the lock the task will restore p::__state
+        * from p::saved_state which ensures that the regular
+        * wakeup is not lost. The restore will also set
+        * p::saved_state to TASK_RUNNING so any further tests will
+        * not result in false positives vs. @success.
+        *
+        * Note that this path still returns false below; only
+        * *success tells the two non-matching outcomes apart.
+        */
+       if (p->saved_state & state) {
+               p->saved_state = TASK_RUNNING;
+               *success = 1;
+       }
+#endif
         return false;
  }
author	Thomas Gleixner <tglx@linutronix.de>
	Sun, 15 Aug 2021 21:27:44 +0000 (23:27 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Tue, 17 Aug 2021 14:49:02 +0000 (16:49 +0200)
include/linux/sched.h		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history