kernel/sched/membarrier.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   4  *
   5  * membarrier system call
   6  */
   7 #include "sched.h"
   8
   /*
    * Command bits for the SYNC_CORE flavour of private expedited membarrier.
    * Expands to 0 when CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE is not set, so
    * those commands drop out of MEMBARRIER_CMD_BITMASK below.
    */
   #ifndef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
   #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK  0
   #else
   #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK                  \
           (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |                   \
            MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
   #endif

   /*
    * Bitmask made from an "or" of all commands within enum membarrier_cmd,
    * except MEMBARRIER_CMD_QUERY.
    */
   #define MEMBARRIER_CMD_BITMASK                                          \
           (MEMBARRIER_CMD_GLOBAL |                                        \
            MEMBARRIER_CMD_GLOBAL_EXPEDITED |                              \
            MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |                     \
            MEMBARRIER_CMD_PRIVATE_EXPEDITED |                             \
            MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |                    \
            MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
  27
  /*
   * IPI callback used by the expedited membarrier commands: issue a full
   * memory barrier on the CPU that runs it. @info is not used.
   */
  static void ipi_mb(void *info)
  {
          /* IPIs should be serializing but paranoid. */
          smp_mb();
  }
  32
  33 static int membarrier_global_expedited(void)
  34 {
  35         int cpu;
  36         bool fallback = false;
  37         cpumask_var_t tmpmask;
  38
  39         if (num_online_cpus() == 1)
  40                 return 0;
  41
  42         /*
  43          * Matches memory barriers around rq->curr modification in
  44          * scheduler.
  45          */
  46         smp_mb();       /* system call entry is not a mb. */
  47
  48         /*
  49          * Expedited membarrier commands guarantee that they won't
  50          * block, hence the GFP_NOWAIT allocation flag and fallback
  51          * implementation.
  52          */
  53         if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
  54                 /* Fallback for OOM. */
  55                 fallback = true;
  56         }
  57
  58         cpus_read_lock();
  59         for_each_online_cpu(cpu) {
  60                 struct task_struct *p;
  61
  62                 /*
  63                  * Skipping the current CPU is OK even through we can be
  64                  * migrated at any point. The current CPU, at the point
  65                  * where we read raw_smp_processor_id(), is ensured to
  66                  * be in program order with respect to the caller
  67                  * thread. Therefore, we can skip this CPU from the
  68                  * iteration.
  69                  */
  70                 if (cpu == raw_smp_processor_id())
  71                         continue;
  72
  73                 rcu_read_lock();
  74                 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
  75                 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
  76                                    MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
  77                         if (!fallback)
  78                                 __cpumask_set_cpu(cpu, tmpmask);
  79                         else
  80                                 smp_call_function_single(cpu, ipi_mb, NULL, 1);
  81                 }
  82                 rcu_read_unlock();
  83         }
  84         if (!fallback) {
  85                 preempt_disable();
  86                 smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
  87                 preempt_enable();
  88                 free_cpumask_var(tmpmask);
  89         }
  90         cpus_read_unlock();
  91
  92         /*
  93          * Memory barrier on the caller thread _after_ we finished
  94          * waiting for the last IPI. Matches memory barriers around
  95          * rq->curr modification in scheduler.
  96          */
  97         smp_mb();       /* exit from system call is not a mb */
  98         return 0;
  99 }
 100
 101 static int membarrier_private_expedited(int flags)
 102 {
 103         int cpu;
 104         bool fallback = false;
 105         cpumask_var_t tmpmask;
 106
 107         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 108                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 109                         return -EINVAL;
 110                 if (!(atomic_read(&current->mm->membarrier_state) &
 111                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 112                         return -EPERM;
 113         } else {
 114                 if (!(atomic_read(&current->mm->membarrier_state) &
 115                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 116                         return -EPERM;
 117         }
 118
 119         if (num_online_cpus() == 1)
 120                 return 0;
 121
 122         /*
 123          * Matches memory barriers around rq->curr modification in
 124          * scheduler.
 125          */
 126         smp_mb();       /* system call entry is not a mb. */
 127
 128         /*
 129          * Expedited membarrier commands guarantee that they won't
 130          * block, hence the GFP_NOWAIT allocation flag and fallback
 131          * implementation.
 132          */
 133         if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
 134                 /* Fallback for OOM. */
 135                 fallback = true;
 136         }
 137
 138         cpus_read_lock();
 139         for_each_online_cpu(cpu) {
 140                 struct task_struct *p;
 141
 142                 /*
 143                  * Skipping the current CPU is OK even through we can be
 144                  * migrated at any point. The current CPU, at the point
 145                  * where we read raw_smp_processor_id(), is ensured to
 146                  * be in program order with respect to the caller
 147                  * thread. Therefore, we can skip this CPU from the
 148                  * iteration.
 149                  */
 150                 if (cpu == raw_smp_processor_id())
 151                         continue;
 152                 rcu_read_lock();
 153                 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
 154                 if (p && p->mm == current->mm) {
 155                         if (!fallback)
 156                                 __cpumask_set_cpu(cpu, tmpmask);
 157                         else
 158                                 smp_call_function_single(cpu, ipi_mb, NULL, 1);
 159                 }
 160                 rcu_read_unlock();
 161         }
 162         if (!fallback) {
 163                 preempt_disable();
 164                 smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
 165                 preempt_enable();
 166                 free_cpumask_var(tmpmask);
 167         }
 168         cpus_read_unlock();
 169
 170         /*
 171          * Memory barrier on the caller thread _after_ we finished
 172          * waiting for the last IPI. Matches memory barriers around
 173          * rq->curr modification in scheduler.
 174          */
 175         smp_mb();       /* exit from system call is not a mb */
 176
 177         return 0;
 178 }
 179
 180 static int membarrier_register_global_expedited(void)
 181 {
 182         struct task_struct *p = current;
 183         struct mm_struct *mm = p->mm;
 184
 185         if (atomic_read(&mm->membarrier_state) &
 186             MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 187                 return 0;
 188         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
 189         if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
 190                 /*
 191                  * For single mm user, single threaded process, we can
 192                  * simply issue a memory barrier after setting
 193                  * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
 194                  * no memory access following registration is reordered
 195                  * before registration.
 196                  */
 197                 smp_mb();
 198         } else {
 199                 /*
 200                  * For multi-mm user threads, we need to ensure all
 201                  * future scheduler executions will observe the new
 202                  * thread flag state for this mm.
 203                  */
 204                 synchronize_rcu();
 205         }
 206         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 207                   &mm->membarrier_state);
 208
 209         return 0;
 210 }
 211
 212 static int membarrier_register_private_expedited(int flags)
 213 {
 214         struct task_struct *p = current;
 215         struct mm_struct *mm = p->mm;
 216         int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
 217
 218         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 219                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 220                         return -EINVAL;
 221                 state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 222         }
 223
 224         /*
 225          * We need to consider threads belonging to different thread
 226          * groups, which use the same mm. (CLONE_VM but not
 227          * CLONE_THREAD).
 228          */
 229         if (atomic_read(&mm->membarrier_state) & state)
 230                 return 0;
 231         atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 232         if (flags & MEMBARRIER_FLAG_SYNC_CORE)
 233                 atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
 234                           &mm->membarrier_state);
 235         if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
 236                 /*
 237                  * Ensure all future scheduler executions will observe the
 238                  * new thread flag state for this process.
 239                  */
 240                 synchronize_rcu();
 241         }
 242         atomic_or(state, &mm->membarrier_state);
 243
 244         return 0;
 245 }
 246
 247 /**
 248  * sys_membarrier - issue memory barriers on a set of threads
 249  * @cmd:   Takes command values defined in enum membarrier_cmd.
 250  * @flags: Currently needs to be 0. For future extensions.
 251  *
 252  * If this system call is not implemented, -ENOSYS is returned. If the
 253  * command specified does not exist, not available on the running
 254  * kernel, or if the command argument is invalid, this system call
 255  * returns -EINVAL. For a given command, with flags argument set to 0,
 256  * this system call is guaranteed to always return the same value until
 257  * reboot.
 258  *
 259  * All memory accesses performed in program order from each targeted thread
 260  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
 261  * the semantic "barrier()" to represent a compiler barrier forcing memory
 262  * accesses to be performed in program order across the barrier, and
 263  * smp_mb() to represent explicit memory barriers forcing full memory
 264  * ordering across the barrier, we have the following ordering table for
 265  * each pair of barrier(), sys_membarrier() and smp_mb():
 266  *
 267  * The pair ordering is detailed as (O: ordered, X: not ordered):
 268  *
 269  *                        barrier()   smp_mb() sys_membarrier()
 270  *        barrier()          X           X            O
 271  *        smp_mb()           X           O            O
 272  *        sys_membarrier()   O           O            O
 273  */
 274 SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 275 {
 276         if (unlikely(flags))
 277                 return -EINVAL;
 278         switch (cmd) {
 279         case MEMBARRIER_CMD_QUERY:
 280         {
 281                 int cmd_mask = MEMBARRIER_CMD_BITMASK;
 282
 283                 if (tick_nohz_full_enabled())
 284                         cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
 285                 return cmd_mask;
 286         }
 287         case MEMBARRIER_CMD_GLOBAL:
 288                 /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
 289                 if (tick_nohz_full_enabled())
 290                         return -EINVAL;
 291                 if (num_online_cpus() > 1)
 292                         synchronize_rcu();
 293                 return 0;
 294         case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
 295                 return membarrier_global_expedited();
 296         case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 297                 return membarrier_register_global_expedited();
 298         case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
 299                 return membarrier_private_expedited(0);
 300         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
 301                 return membarrier_register_private_expedited(0);
 302         case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
 303                 return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 304         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
 305                 return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 306         default:
 307                 return -EINVAL;
 308         }
 309 }