// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}
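
/*
 * IPI handler for the RSEQ flavour: rseq_preempt() flags the interrupted
 * task so that, on its next return to user space, any rseq critical
 * section it was executing is aborted and restarted.
 */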
static void ipi_rseq(void *info)
{
	rseq_preempt(current);
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}
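
/*
 * Called when a task execs and switches to a fresh mm: membarrier
 * registration does not survive exec, so reset both the mm state and this
 * CPU's cached runqueue copy.
 */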
void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}
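
/*
 * MEMBARRIER_CMD_GLOBAL_EXPEDITED: IPI every CPU currently running a
 * user-space thread of a process registered for global expedited
 * membarrier, so each such thread observes a full memory barrier before
 * the system call returns.
 */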
static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;
	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */
	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;
		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;
		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();
	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}
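
/*
 * Private expedited flavours (plain, SYNC_CORE, RSEQ): IPI only the CPUs
 * currently running threads of the caller's mm, or a single CPU when a
 * valid cpu_id was supplied with MEMBARRIER_CMD_FLAG_CPU.
 */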
static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;
	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */
	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		if (cpu_id == raw_smp_processor_id())
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			/*
			 * Skipping the current CPU is OK even though we can be
			 * migrated at any point. The current CPU, at the point
			 * where we read raw_smp_processor_id(), is ensured to
			 * be in program order with respect to the caller
			 * thread. Therefore, we can skip this CPU from the
			 * iteration.
			 */
			if (cpu == raw_smp_processor_id())
				continue;
			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	preempt_disable();
	if (cpu_id >= 0)
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	else
		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
	preempt_enable();

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();
	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}
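
/*
 * Propagate the mm's membarrier state into the membarrier_state field of
 * every runqueue currently running a thread of this mm, so that
 * membarrier_global_expedited() can test the per-runqueue copy instead of
 * dereferencing rq->curr->mm.
 */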
static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);
		/*
		 * For single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;
	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();
	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that all
	 * @mm's membarrier state set bits are also set in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}
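
/*
 * Registration happens in two steps: set the state bits in the mm, sync
 * them into the runqueues of CPUs running this mm (with the required
 * barriers), and only then set the *_READY bits that the expedited
 * commands check.
 */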
static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *          RSEQ CS should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}
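
/*
 * Illustrative user-space usage sketch (not part of the kernel build). It
 * assumes no libc wrapper for membarrier(2) is available, so the raw
 * syscall() interface is used; the local wrapper name "membarrier" below
 * is specific to this example. A process registers once for the private
 * expedited command and can then issue expedited barriers against all of
 * its running threads:
 *
 *	#define _GNU_SOURCE
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	static int membarrier(int cmd, unsigned int flags, int cpu_id)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags, cpu_id);
 *	}
 *
 *	int main(void)
 *	{
 *		// Register once per process (returns 0 on success).
 *		if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0)) {
 *			perror("membarrier register");
 *			return 1;
 *		}
 *		// Each call orders memory accesses with every running thread
 *		// of this process, as described in the table above.
 *		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
 *			perror("membarrier");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */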