mm/munlock: delete page_mlock() and all its works
mm/mlock.c
// SPDX-License-Identifier: GPL-2.0
/*
 *      linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

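/*
 * can_do_mlock - check whether the caller is allowed to lock memory.
 *
 * A nonzero RLIMIT_MEMLOCK or the CAP_IPC_LOCK capability is sufficient.
 */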
bool can_do_mlock(void)
{
        if (rlimit(RLIMIT_MEMLOCK) != 0)
                return true;
        if (capable(CAP_IPC_LOCK))
                return true;
        return false;
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked pages are marked with PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 */

/*
 * LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
        int nr_pages;

        if (!TestClearPageMlocked(page))
                return;

        nr_pages = thp_nr_pages(page);
        mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
        count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
        /*
         * The previous TestClearPageMlocked() corresponds to the smp_mb()
         * in __pagevec_lru_add_fn().
         *
         * See __pagevec_lru_add_fn for more explanation.
         */
        if (!isolate_lru_page(page)) {
                putback_lru_page(page);
        } else {
                /*
                 * We lost the race: the page has already been moved to an
                 * evictable list.
                 */
                if (PageUnevictable(page))
                        count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
        }
}

/*
 * Mark page as mlocked if not already.
 * If page on LRU, isolate and putback to move to unevictable list.
 */
void mlock_vma_page(struct page *page)
{
        /* Serialize with page migration */
        BUG_ON(!PageLocked(page));

        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

        if (!TestSetPageMlocked(page)) {
                int nr_pages = thp_nr_pages(page);

                mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
                count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
                if (!isolate_lru_page(page))
                        putback_lru_page(page);
        }
}

/**
 * munlock_vma_page - munlock a vma page
 * @page: page to be unlocked, either a normal page or THP page head
 */
void munlock_vma_page(struct page *page)
{
        /* Serialize with page migration */
        BUG_ON(!PageLocked(page));

        VM_BUG_ON_PAGE(PageTail(page), page);

        if (TestClearPageMlocked(page)) {
                int nr_pages = thp_nr_pages(page);

                mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
                if (!isolate_lru_page(page)) {
                        putback_lru_page(page);
                        count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
                } else if (PageUnevictable(page)) {
                        count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
                }
        }
}

/*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 * For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 */
void munlock_vma_pages_range(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
{
        vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

        /* Reimplementation to follow in later commit */
}

/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
        unsigned long start, unsigned long end, vm_flags_t newflags)
{
        struct mm_struct *mm = vma->vm_mm;
        pgoff_t pgoff;
        int nr_pages;
        int ret = 0;
        int lock = !!(newflags & VM_LOCKED);
        vm_flags_t old_flags = vma->vm_flags;

        if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
            is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
            vma_is_dax(vma) || vma_is_secretmem(vma))
                /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
                goto out;

        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
                          vma->vm_userfaultfd_ctx, vma_anon_name(vma));
        if (*prev) {
                vma = *prev;
                goto success;
        }

        if (start != vma->vm_start) {
                ret = split_vma(mm, vma, start, 1);
                if (ret)
                        goto out;
        }

        if (end != vma->vm_end) {
                ret = split_vma(mm, vma, end, 0);
                if (ret)
                        goto out;
        }

success:
        /*
         * Keep track of amount of locked VM.
         */
        nr_pages = (end - start) >> PAGE_SHIFT;
        if (!lock)
                nr_pages = -nr_pages;
        else if (old_flags & VM_LOCKED)
                nr_pages = 0;
        mm->locked_vm += nr_pages;

        /*
         * vm_flags is protected by the mmap_lock held in write mode.
         * It's okay if try_to_unmap_one unmaps a page just after we
         * set VM_LOCKED, populate_vma_page_range will bring it back.
         */

        if (lock)
                vma->vm_flags = newflags;
        else
                munlock_vma_pages_range(vma, start, end);

out:
        *prev = vma;
        return ret;
}

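/*
 * apply_vma_lock_flags - apply the requested VM_LOCKED/VM_LOCKONFAULT
 * flags to every vma covering [start, start + len), merging or splitting
 * vmas via mlock_fixup() as needed.  Returns 0 on success, -ENOMEM if the
 * range is not fully covered by vmas, or the error from mlock_fixup().
 */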
static int apply_vma_lock_flags(unsigned long start, size_t len,
                                vm_flags_t flags)
{
        unsigned long nstart, end, tmp;
        struct vm_area_struct *vma, *prev;
        int error;

        VM_BUG_ON(offset_in_page(start));
        VM_BUG_ON(len != PAGE_ALIGN(len));
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;
        vma = find_vma(current->mm, start);
        if (!vma || vma->vm_start > start)
                return -ENOMEM;

        prev = vma->vm_prev;
        if (start > vma->vm_start)
                prev = vma;

        for (nstart = start ; ; ) {
                vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

                newflags |= flags;

                /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
                error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
                if (error)
                        break;
                nstart = tmp;
                if (nstart < prev->vm_end)
                        nstart = prev->vm_end;
                if (nstart >= end)
                        break;

                vma = prev->vm_next;
                if (!vma || vma->vm_start != nstart) {
                        error = -ENOMEM;
                        break;
                }
        }
        return error;
}

/*
 * Go through the vmas covering the range and sum the size of the
 * mlocked pages.  Note that the deferred memory locking case
 * (mlock2() with MLOCK_ONFAULT) is also counted.
 * Return value: count of previously mlocked pages.
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
                unsigned long start, size_t len)
{
        struct vm_area_struct *vma;
        unsigned long count = 0;

        if (mm == NULL)
                mm = current->mm;

        vma = find_vma(mm, start);
        if (vma == NULL)
                return 0;

        for (; vma ; vma = vma->vm_next) {
                if (start >= vma->vm_end)
                        continue;
                if (start + len <= vma->vm_start)
                        break;
                if (vma->vm_flags & VM_LOCKED) {
                        if (start > vma->vm_start)
                                count -= (start - vma->vm_start);
                        if (start + len < vma->vm_end) {
                                count += start + len - vma->vm_start;
                                break;
                        }
                        count += vma->vm_end - vma->vm_start;
                }
        }

        return count >> PAGE_SHIFT;
}

/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
        if (retval == -EFAULT)
                retval = -ENOMEM;
        else if (retval == -ENOMEM)
                retval = -EAGAIN;
        return retval;
}

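/*
 * do_mlock - common implementation of the mlock() and mlock2() syscalls.
 *
 * Page-aligns the range, checks it against RLIMIT_MEMLOCK (ignoring pages
 * that are already locked), applies the lock flags under the mmap write
 * lock, and then calls __mm_populate() on the range.
 */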
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
        unsigned long locked;
        unsigned long lock_limit;
        int error = -ENOMEM;

        start = untagged_addr(start);

        if (!can_do_mlock())
                return -EPERM;

        len = PAGE_ALIGN(len + (offset_in_page(start)));
        start &= PAGE_MASK;

        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = len >> PAGE_SHIFT;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        locked += current->mm->locked_vm;
        if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
                /*
                 * It is possible that the requested region intersects with
                 * previously mlocked areas; that part is already accounted
                 * in "mm->locked_vm" and should not be counted again towards
                 * the new mlock total.  Check and adjust the locked count
                 * if necessary.
                 */
                locked -= count_mm_mlocked_page_nr(current->mm,
                                start, len);
        }

        /* check against resource limits */
        if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
                error = apply_vma_lock_flags(start, len, flags);

        mmap_write_unlock(current->mm);
        if (error)
                return error;

        error = __mm_populate(start, len, 0);
        if (error)
                return __mlock_posix_error_return(error);
        return 0;
}

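/*
 * mlock() and mlock2() both funnel into do_mlock(); mlock2() additionally
 * accepts MLOCK_ONFAULT, which sets VM_LOCKONFAULT so that pages are only
 * locked as they are faulted in rather than being populated up front.
 */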
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
        return do_mlock(start, len, VM_LOCKED);
}

SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
        vm_flags_t vm_flags = VM_LOCKED;

        if (flags & ~MLOCK_ONFAULT)
                return -EINVAL;

        if (flags & MLOCK_ONFAULT)
                vm_flags |= VM_LOCKONFAULT;

        return do_mlock(start, len, vm_flags);
}

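/*
 * munlock() clears VM_LOCKED and VM_LOCKONFAULT over the page-aligned
 * range by applying an empty flag set through apply_vma_lock_flags().
 */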
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
        int ret;

        start = untagged_addr(start);

        len = PAGE_ALIGN(len + (offset_in_page(start)));
        start &= PAGE_MASK;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;
        ret = apply_vma_lock_flags(start, len, 0);
        mmap_write_unlock(current->mm);

        return ret;
}

/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
static int apply_mlockall_flags(int flags)
{
        struct vm_area_struct *vma, *prev = NULL;
        vm_flags_t to_add = 0;

        current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
        if (flags & MCL_FUTURE) {
                current->mm->def_flags |= VM_LOCKED;

                if (flags & MCL_ONFAULT)
                        current->mm->def_flags |= VM_LOCKONFAULT;

                if (!(flags & MCL_CURRENT))
                        goto out;
        }

        if (flags & MCL_CURRENT) {
                to_add |= VM_LOCKED;
                if (flags & MCL_ONFAULT)
                        to_add |= VM_LOCKONFAULT;
        }

        for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
                vm_flags_t newflags;

                newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
                newflags |= to_add;

                /* Ignore errors */
                mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
                cond_resched();
        }
out:
        return 0;
}

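/*
 * mlockall() validates the MCL_* flag combination, checks RLIMIT_MEMLOCK
 * against the whole address space, applies the flags, and, for MCL_CURRENT,
 * populates all existing mappings.
 */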
SYSCALL_DEFINE1(mlockall, int, flags)
{
        unsigned long lock_limit;
        int ret;

        if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
            flags == MCL_ONFAULT)
                return -EINVAL;

        if (!can_do_mlock())
                return -EPERM;

        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        ret = -ENOMEM;
        if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
            capable(CAP_IPC_LOCK))
                ret = apply_mlockall_flags(flags);
        mmap_write_unlock(current->mm);
        if (!ret && (flags & MCL_CURRENT))
                mm_populate(0, TASK_SIZE);

        return ret;
}

SYSCALL_DEFINE0(munlockall)
{
        int ret;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;
        ret = apply_mlockall_flags(0);
        mmap_write_unlock(current->mm);
        return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the ucounts of the locking user
 * instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

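/*
 * user_shm_lock - charge @size bytes of locked shm against @ucounts.
 *
 * Returns 1 and takes a ucounts reference if the charge fits within
 * RLIMIT_MEMLOCK (or the caller has CAP_IPC_LOCK); returns 0 and backs
 * out the charge otherwise.
 */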
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
        unsigned long lock_limit, locked;
        long memlock;
        int allowed = 0;

        locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        lock_limit = rlimit(RLIMIT_MEMLOCK);
        if (lock_limit == RLIM_INFINITY)
                allowed = 1;
        lock_limit >>= PAGE_SHIFT;
        spin_lock(&shmlock_user_lock);
        memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

        if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
                dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
                goto out;
        }
        if (!get_ucounts(ucounts)) {
                dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
                goto out;
        }
        allowed = 1;
out:
        spin_unlock(&shmlock_user_lock);
        return allowed;
}

void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
        spin_lock(&shmlock_user_lock);
        dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
        spin_unlock(&shmlock_user_lock);
        put_ucounts(ucounts);
}