kernel/locking/percpu-rwsem.c
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/errno.h>
#include <trace/events/lock.h>

int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
                        const char *name, struct lock_class_key *key)
{
        sem->read_count = alloc_percpu(int);
        if (unlikely(!sem->read_count))
                return -ENOMEM;

        rcu_sync_init(&sem->rss);
        rcuwait_init(&sem->writer);
        init_waitqueue_head(&sem->waiters);
        atomic_set(&sem->block, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        debug_check_no_locks_freed((void *)sem, sizeof(*sem));
        lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
        return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);

void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
        /*
         * XXX: temporary kludge. The error path in alloc_super()
         * assumes that percpu_free_rwsem() is safe after kzalloc().
         */
        if (!sem->read_count)
                return;

        rcu_sync_dtor(&sem->rss);
        free_percpu(sem->read_count);
        sem->read_count = NULL; /* catch use-after-free bugs */
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
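
/*
 * Illustrative usage sketch (not part of the original file): a minimal
 * init/free cycle for a dynamically allocated semaphore, assuming the
 * percpu_init_rwsem() wrapper from <linux/percpu-rwsem.h>; "my_sem" is a
 * hypothetical name.
 *
 *	static struct percpu_rw_semaphore my_sem;
 *
 *	if (percpu_init_rwsem(&my_sem))
 *		return -ENOMEM;
 *	...
 *	percpu_free_rwsem(&my_sem);
 *
 * Statically defined semaphores can instead use
 * DEFINE_STATIC_PERCPU_RWSEM(my_sem), which needs no runtime init or free.
 */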

static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
        this_cpu_inc(*sem->read_count);

        /*
         * Due to having preemption disabled, the decrement happens on
         * the same CPU as the increment, avoiding the
         * increment-on-one-CPU-and-decrement-on-another problem.
         *
         * If the reader misses the writer's assignment of sem->block, then the
         * writer is guaranteed to see the reader's increment.
         *
         * Conversely, any readers that increment their sem->read_count after
         * the writer looks are guaranteed to see the sem->block value, which
         * in turn means that they are guaranteed to immediately decrement
         * their sem->read_count, so that it doesn't matter that the writer
         * missed them.
         */

        smp_mb(); /* A matches D */

        /*
         * If !sem->block the critical section starts here, matched by the
         * release in percpu_up_write().
         */
        if (likely(!atomic_read_acquire(&sem->block)))
                return true;

        this_cpu_dec(*sem->read_count);

        /* Prod writer to re-evaluate readers_active_check() */
        rcuwait_wake_up(&sem->writer);

        return false;
}
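
/*
 * Informal sketch of the ordering argument above (added for illustration,
 * not in the original file). Reader and writer form a store-buffering
 * pattern around barriers A and D:
 *
 *	reader (trylock)		writer (down_write)
 *	----------------		-------------------
 *	this_cpu_inc(read_count)	atomic_xchg(&block, 1)
 *	smp_mb();     // A		// full barrier implied -- D
 *	r = atomic_read(&block)		c = per_cpu_sum(read_count)
 *
 * At least one of "r == 1" (the reader sees block and backs out) or
 * "c != 0" (the writer sees the reader and waits for it) must hold; the
 * A/D pairing forbids the outcome where both sides load the old value.
 */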

static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
{
        if (atomic_read(&sem->block))
                return false;

        return atomic_xchg(&sem->block, 1) == 0;
}

static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
{
        if (reader) {
                bool ret;

                preempt_disable();
                ret = __percpu_down_read_trylock(sem);
                preempt_enable();

                return ret;
        }
        return __percpu_down_write_trylock(sem);
}

/*
 * The return value of wait_queue_entry::func means:
 *
 *  <0 - error, wakeup is terminated and the error is returned
 *   0 - no wakeup, a next waiter is tried
 *  >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
 *
 * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
 * and play games with the return value to allow waking multiple readers.
 *
 * Specifically, we wake readers until we've woken a single writer, or until a
 * trylock fails.
 */
static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
                                      unsigned int mode, int wake_flags,
                                      void *key)
{
        bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
        struct percpu_rw_semaphore *sem = key;
        struct task_struct *p;

        /* concurrent with percpu_down_write(); the lock can get stolen from us */
        if (!__percpu_rwsem_trylock(sem, reader))
                return 1;

        p = get_task_struct(wq_entry->private);
        list_del_init(&wq_entry->entry);
        smp_store_release(&wq_entry->private, NULL);

        wake_up_process(p);
        put_task_struct(p);

        return !reader; /* wake (readers until) 1 writer */
}
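
/*
 * Worked example of the return-value game (illustrative, not in the
 * original file). Suppose the FIFO wait list holds R1, R2, W1, R3 when
 * percpu_up_write() calls __wake_up() with nr_exclusive == 1:
 *
 *	R1: trylock succeeds, reader => return 0, keep walking
 *	R2: trylock succeeds, reader => return 0, keep walking
 *	W1: trylock succeeds, writer => return 1, walk stops
 *
 * R3 stays queued behind the new writer, preserving FIFO order. If any
 * trylock fails (the lock was stolen concurrently), the function returns
 * 1 and the walk stops with that waiter still on the list.
 */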

static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
{
        DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
        bool wait;

        spin_lock_irq(&sem->waiters.lock);
        /*
         * Serialize against the wakeup in percpu_up_write(); if we fail
         * the trylock, the wakeup must see us on the list.
         */
        wait = !__percpu_rwsem_trylock(sem, reader);
        if (wait) {
                wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
                __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
        }
        spin_unlock_irq(&sem->waiters.lock);
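
        /*
         * The waker (percpu_rwsem_wake_function()) removes us from the list
         * and only then clears wq_entry.private, using smp_store_release().
         * The matching smp_load_acquire() below ensures that once we observe
         * NULL and return, the on-stack wq_entry can no longer be touched by
         * a concurrent wakeup.
         */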
        while (wait) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!smp_load_acquire(&wq_entry.private))
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);
}

bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
        if (__percpu_down_read_trylock(sem))
                return true;

        if (try)
                return false;

        trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
        preempt_enable();
        percpu_rwsem_wait(sem, /* .reader = */ true);
        preempt_disable();
        trace_contention_end(sem, 0);

        return true;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);
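
/*
 * For context (a sketch, not part of this file): __percpu_down_read() is
 * the slow path of percpu_down_read(), which lives in
 * <linux/percpu-rwsem.h> and looks roughly like:
 *
 *	static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 *	{
 *		rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 *
 *		preempt_disable();
 *		// Fast path: no writer pending, plain per-CPU increment.
 *		if (likely(rcu_sync_is_idle(&sem->rss)))
 *			this_cpu_inc(*sem->read_count);
 *		else
 *			__percpu_down_read(sem, false);
 *		preempt_enable();
 *	}
 *
 * So __percpu_down_read() is entered with preemption disabled, which is
 * why it re-enables preemption around the sleeping percpu_rwsem_wait()
 * above.
 */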

#define per_cpu_sum(var)                                                \
({                                                                      \
        typeof(var) __sum = 0;                                          \
        int cpu;                                                        \
        compiletime_assert_atomic_type(__sum);                          \
        for_each_possible_cpu(cpu)                                      \
                __sum += per_cpu(var, cpu);                             \
        __sum;                                                          \
})

/*
 * Return true if the modular sum of the sem->read_count per-CPU variable is
 * zero.  If this sum is zero, then it is stable: any newly arriving reader
 * that increments a given counter will immediately decrement that same
 * counter.
 *
 * Assumes sem->block is set.
 */
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
        if (per_cpu_sum(*sem->read_count) != 0)
                return false;

        /*
         * If we observed the decrement, ensure we see the entire critical
         * section.
         */

        smp_mb(); /* C matches B */

        return true;
}
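
/*
 * Worked example of the "modular sum" (illustrative, not in the original
 * file). percpu_down_read() and percpu_up_read() each disable preemption
 * only around their own counter update, so a reader may migrate between
 * them:
 *
 *	CPU0: percpu_down_read()  -> read_count[0] == +1
 *	      (reader migrates to CPU1)
 *	CPU1: percpu_up_read()    -> read_count[1] == -1
 *
 * Individual per-CPU counters may therefore go negative; only the sum
 * across all possible CPUs is meaningful, and it reaches zero exactly
 * when no reader holds the lock.
 */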

void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
        might_sleep();
        rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
        trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);

        /* Notify readers to take the slow path. */
        rcu_sync_enter(&sem->rss);

        /*
         * Try to set sem->block; this provides writer-writer exclusion.
         * Having sem->block set makes new readers block.
         */
        if (!__percpu_down_write_trylock(sem))
                percpu_rwsem_wait(sem, /* .reader = */ false);

        /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */

        /*
         * If the readers don't see our store of sem->block, then we are
         * guaranteed to see their sem->read_count increment, and therefore
         * will wait for them.
         */

        /* Wait for all active readers to complete. */
        rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
        trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);

void percpu_up_write(struct percpu_rw_semaphore *sem)
{
        rwsem_release(&sem->dep_map, _RET_IP_);

        /*
         * Signal that the writer is done; there is no fast path yet.
         *
         * One reason that we cannot just immediately flip to readers_fast is
         * that new readers might fail to see the results of this writer's
         * critical section.
         *
         * Therefore we force it through the slow path, which guarantees an
         * acquire and thereby guarantees the critical section's consistency.
         */
        atomic_set_release(&sem->block, 0);

        /*
         * Prod any pending reader/writer to make progress.
         */
        __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);

        /*
         * Once this completes (at least one RCU-sched grace period hence) the
         * reader fast path will be available again. Safe to use outside the
         * exclusive write lock because it's counting.
         */
        rcu_sync_exit(&sem->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);
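
/*
 * End-to-end usage sketch (illustrative, not in the original file;
 * "my_sem" and the data it protects are hypothetical):
 *
 *	DEFINE_STATIC_PERCPU_RWSEM(my_sem);
 *
 *	// Reader: a cheap per-CPU increment unless a writer is pending.
 *	percpu_down_read(&my_sem);
 *	// ... read-side critical section ...
 *	percpu_up_read(&my_sem);
 *
 *	// Writer: expensive; waits for an RCU grace period and for all
 *	// active readers to drain before gaining exclusive access.
 *	percpu_down_write(&my_sem);
 *	// ... write-side critical section ...
 *	percpu_up_write(&my_sem);
 *
 * The design trades writer latency for near-zero reader overhead, which
 * suits read-mostly users.
 */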