// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */
#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"
bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);
/*
 * Mlocked pages are marked with the PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 */
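/*
 * In outline (a summary of the accounting done by the helpers below, not
 * new policy): the PageMlocked flag is kept in step with two counters --
 * the per-zone NR_MLOCK statistic (reported as "Mlocked" in /proc/meminfo)
 * and the UNEVICTABLE_* vm_events:
 *
 *	mlock_page():		NR_MLOCK += nr_pages, UNEVICTABLE_PGMLOCKED
 *	munlock_page():		NR_MLOCK -= nr_pages, UNEVICTABLE_PGMUNLOCKED
 *	clear_page_mlock():	NR_MLOCK -= nr_pages, UNEVICTABLE_PGCLEARED
 *
 * UNEVICTABLE_PGSTRANDED counts pages found stranded on the unevictable
 * list after one of these helpers lost an isolation race.
 */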
/*
 * LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
	int nr_pages;

	if (!TestClearPageMlocked(page))
		return;

	nr_pages = thp_nr_pages(page);
	mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
	count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
	/*
	 * The previous TestClearPageMlocked() corresponds to the smp_mb()
	 * in __pagevec_lru_add_fn().
	 *
	 * See __pagevec_lru_add_fn for more explanation.
	 */
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race: the page has already moved to the
		 * evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}
}
/**
 * mlock_page - mlock a page
 * @page: page to be mlocked, either a normal page or a THP head.
 */
void mlock_page(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);

	if (!TestSetPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}
/**
 * munlock_page - munlock a page
 * @page: page to be munlocked, either a normal page or a THP head.
 */
void munlock_page(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);

	if (TestClearPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
		if (!isolate_lru_page(page)) {
			putback_lru_page(page);
			count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
		} else if (PageUnevictable(page)) {
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
		}
	}
}
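/*
 * Note the asymmetry in munlock_page() above: UNEVICTABLE_PGMUNLOCKED is
 * only counted when the page could be isolated and put back on an
 * evictable list; when isolation fails while the page still appears
 * unevictable, it is counted as UNEVICTABLE_PGSTRANDED instead.
 */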
/*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 * For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 */
static void munlock_vma_pages_range(struct vm_area_struct *vma,
				    unsigned long start, unsigned long end)
{
	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

	/* Reimplementation to follow in later commit */
}
/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = !!(newflags & VM_LOCKED);
	vm_flags_t old_flags = vma->vm_flags;

	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma) || vma_is_secretmem(vma))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, vma_anon_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
	else if (old_flags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */
	if (lock)
		vma->vm_flags = newflags;
	else
		munlock_vma_pages_range(vma, start, end);

out:
	*prev = vma;
	return ret;
}
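/*
 * A sketch of the locked_vm arithmetic above (illustrative numbers only):
 * locking a previously unlocked 16-page range adds 16 to mm->locked_vm;
 * unlocking it again subtracts 16; re-locking an already VM_LOCKED range
 * (say, to add VM_LOCKONFAULT) adds 0, because old_flags already had
 * VM_LOCKED set.
 */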
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	int error;

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

		newflags |= flags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}
/*
 * Go through the vma areas and sum the size of the mlocked
 * vma pages, as the return value.
 * Note that the deferred memory locking case (mlock2() with
 * MLOCK_ONFAULT) is also counted.
 * Return value: count of previously mlocked pages
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	unsigned long count = 0;

	if (mm == NULL)
		mm = current->mm;

	vma = find_vma(mm, start);
	if (vma == NULL)
		return 0;

	for (; vma ; vma = vma->vm_next) {
		if (start >= vma->vm_end)
			continue;
		if (start + len <= vma->vm_start)
			break;
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (start + len < vma->vm_end) {
				count += start + len - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}
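/*
 * A worked example of the counting above (hypothetical addresses, 4KiB
 * pages): with a single VM_LOCKED vma spanning [0x10000, 0x20000) and a
 * request of start = 0x18000, len = 0x10000, the loop subtracts the part
 * of the vma before start (0x8000) and adds the whole vma size (0x10000),
 * so the function returns 0x8000 >> PAGE_SHIFT = 8 already-mlocked pages.
 */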
/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}
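/*
 * That mapping follows POSIX: mlock() is specified to return ENOMEM when
 * part of the range is not mapped (where get_user_pages() says -EFAULT),
 * and EAGAIN when some or all of the memory could not be locked (where
 * get_user_pages() says -ENOMEM).
 */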
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	locked += current->mm->locked_vm;
	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
		/*
		 * It is possible that the regions requested intersect with
		 * previously mlocked areas; that part, already counted in
		 * "mm->locked_vm", should not be counted again towards the
		 * new mlock increment. So check and adjust the locked count
		 * if necessary.
		 */
		locked -= count_mm_mlocked_page_nr(current->mm,
				start, len);
	}

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	mmap_write_unlock(current->mm);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}
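/*
 * Note that population runs after mmap_write_unlock(): __mm_populate()
 * acquires the mmap lock itself, in read mode, so the pages are faulted
 * in without the write lock being held across the whole range.
 */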
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	return do_mlock(start, len, VM_LOCKED);
}
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}
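/*
 * A minimal userspace sketch of the mlock2() call above (illustrative
 * only: the buffer and length are hypothetical, and older libcs lack a
 * wrapper, in which case syscall(__NR_mlock2, ...) may be used instead):
 *
 *	char *buf = malloc(len);
 *	if (mlock2(buf, len, MLOCK_ONFAULT))
 *		perror("mlock2");
 *
 * With MLOCK_ONFAULT, pages are locked as they are first touched rather
 * than being faulted in up front as plain mlock() would do.
 */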
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	start = untagged_addr(start);

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(current->mm);

	return ret;
}
/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
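/*
 * For example, the non-stacking behaviour described above means that after
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE);
 *	mlockall(MCL_CURRENT);
 *
 * existing VMAs remain locked but future mappings are no longer locked
 * automatically: each call first clears mm->def_flags with
 * VM_LOCKED_CLEAR_MASK before applying the newly requested flags.
 */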
static int apply_mlockall_flags(int flags)
{
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
		newflags |= to_add;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
		cond_resched();
	}
out:
	return 0;
}
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}
SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}
/*
 * Objects with a different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user's ucounts instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
	unsigned long lock_limit, locked;
	long memlock;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

	if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		goto out;
	}
	if (!get_ucounts(ucounts)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		allowed = 0;
		goto out;
	}
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}
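/*
 * Return convention above: nonzero means the locking was allowed and a
 * ucounts reference was taken via get_ucounts(); the caller undoes both
 * with user_shm_unlock() below. A zero return means the request was
 * rejected and the charge has already been backed out.
 */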
void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}