// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);

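/*
 * Illustrative example, not part of the original file: with
 * RLIMIT_MEMLOCK set to 0 and without CAP_IPC_LOCK, can_do_mlock()
 * returns false and mlock() fails outright:
 *
 *	setrlimit(RLIMIT_MEMLOCK, &(struct rlimit){ 0, 0 });
 *	if (mlock(buf, len) != 0)
 *		perror("mlock");	(EPERM for an unprivileged task)
 *
 * Any nonzero limit (or the capability) only makes the attempt
 * permissible; the requested amount is still checked against the limit
 * later, in do_mlock().
 */
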
/*
 * Mlocked pages are marked with the PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 */

/*
 * LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
	int nr_pages;

	if (!TestClearPageMlocked(page))
		return;

	nr_pages = thp_nr_pages(page);
	mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
	count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
	/*
	 * The previous TestClearPageMlocked() corresponds to the smp_mb()
	 * in __pagevec_lru_add_fn().
	 *
	 * See __pagevec_lru_add_fn for more explanation.
	 */
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race: the page has already moved to the
		 * evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}
}

/*
 * Mark page as mlocked if not already.
 * If page on LRU, isolate and putback to move to unevictable list.
 */
void mlock_vma_page(struct page *page)
{
	/* Serialize with page migration */
	BUG_ON(!PageLocked(page));

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

	if (!TestSetPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}

/**
 * munlock_vma_page - munlock a vma page
 * @page: page to be unlocked, either a normal page or a THP page head
 */
void munlock_vma_page(struct page *page)
{
	/* Serialize with page migration */
	BUG_ON(!PageLocked(page));

	VM_BUG_ON_PAGE(PageTail(page), page);

	if (TestClearPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
		if (!isolate_lru_page(page)) {
			putback_lru_page(page);
			count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
		} else if (PageUnevictable(page)) {
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
		}
	}
}

/*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 * For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 */
void munlock_vma_pages_range(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

	/* Reimplementation to follow in later commit */
}

/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op; the request succeeds, but the vma's flags are left
 * unchanged and its pages are not counted.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = !!(newflags & VM_LOCKED);
	vm_flags_t old_flags = vma->vm_flags;

	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma) || vma_is_secretmem(vma))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, vma_anon_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
	else if (old_flags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */
	if (lock)
		vma->vm_flags = newflags;
	else
		munlock_vma_pages_range(vma, start, end);

out:
	*prev = vma;
	return ret;
}

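/*
 * Illustrative example, not part of the original source: mlocking the
 * middle of a vma splits it.  Given one vma spanning [A, D) and an
 * mlock request for [B, C) with A < B < C < D, the two split_vma()
 * calls above leave three vmas [A, B), [B, C) and [C, D), and only
 * [B, C) gets VM_LOCKED.  A later munlock of [B, C) clears the flag
 * and lets vma_merge() fold the three back together.
 */
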
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	int error;

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

		newflags |= flags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}

/*
 * Go through the vmas covering [start, start + len) and sum the sizes
 * of the mlocked vma ranges, as the return value.
 * Note that the deferred memory locking case (mlock2() with
 * MLOCK_ONFAULT) is also counted.
 *
 * Return value: previously mlocked page count
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	unsigned long count = 0;

	if (mm == NULL)
		mm = current->mm;

	vma = find_vma(mm, start);
	if (vma == NULL)
		return 0;

	for (; vma ; vma = vma->vm_next) {
		if (start >= vma->vm_end)
			continue;
		if (start + len <= vma->vm_start)
			break;
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (start + len < vma->vm_end) {
				count += start + len - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}

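/*
 * Worked example, not part of the original source: with 4KiB pages, a
 * VM_LOCKED vma spanning [0x10000, 0x20000) and a request with
 * start = 0x14000, len = 0x8000 (so start + len = 0x1c000):
 *
 *	count -= 0x14000 - 0x10000;	(start lies inside the vma)
 *	count += 0x1c000 - 0x10000;	(request ends inside the vma)
 *
 * leaving count = 0x8000, i.e. count >> PAGE_SHIFT = 8 pages -- exactly
 * the overlap between the request and the locked vma.
 */
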
/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}

static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	locked += current->mm->locked_vm;
	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
		/*
		 * It is possible that the requested region intersects with
		 * previously mlocked areas; that part is already accounted
		 * in "mm->locked_vm" and must not be counted again toward
		 * the new mlock increment.  Check and adjust the locked
		 * count if necessary.
		 */
		locked -= count_mm_mlocked_page_nr(current->mm,
				start, len);
	}

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	mmap_write_unlock(current->mm);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}

SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	return do_mlock(start, len, VM_LOCKED);
}

SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}

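/*
 * Illustrative userspace usage, not part of the original source
 * (assumes a libc exposing mlock2() and MLOCK_ONFAULT, e.g. glibc 2.27+):
 *
 *	char *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (mlock2(buf, 1 << 20, MLOCK_ONFAULT))
 *		perror("mlock2");
 *	buf[0] = 1;	(this page is now resident and locked)
 *
 * With MLOCK_ONFAULT, pages are locked as they are faulted in, so the
 * whole mapping need not be populated up front as plain mlock() would
 * require.
 */
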
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	start = untagged_addr(start);

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(current->mm);

	return ret;
}

/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
static int apply_mlockall_flags(int flags)
{
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
		newflags |= to_add;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
		cond_resched();
	}
out:
	return 0;
}

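/*
 * Illustrative example, not part of the original source, of the
 * non-stacking semantics described above:
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE);
 *	mlockall(MCL_CURRENT);
 *
 * The second call rebuilds mm->def_flags from scratch, so VM_LOCKED is
 * cleared from it again and mappings created afterwards are no longer
 * locked by default; only the MCL_CURRENT locking remains in effect.
 */
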
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}

SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user's ucounts instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

int user_shm_lock(size_t size, struct ucounts *ucounts)
{
	unsigned long lock_limit, locked;
	long memlock;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

	if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		goto out;
	}
	if (!get_ucounts(ucounts)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		allowed = 0;
		goto out;
	}
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}

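/*
 * Illustrative caller sketch, not part of the original source: these
 * helpers back shmctl(SHM_LOCK)/shmctl(SHM_UNLOCK) via shmem_lock() in
 * mm/shmem.c, roughly:
 *
 *	if (!user_shm_lock(inode->i_size, ucounts))
 *		return an error;	(over RLIMIT_MEMLOCK, charge rolled back)
 *	...
 *	user_shm_unlock(inode->i_size, ucounts);
 *
 * The charge lives on the locking user's ucounts rather than on any
 * process's mm, so it outlives the task that issued SHM_LOCK.
 */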