mm/munlock: rmap call mlock_vma_page() munlock_vma_page()
// SPDX-License-Identifier: GPL-2.0
/*
 *      linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

bool can_do_mlock(void)
{
        if (rlimit(RLIMIT_MEMLOCK) != 0)
                return true;
        if (capable(CAP_IPC_LOCK))
                return true;
        return false;
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked pages are marked with PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 */
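
/*
 * For example, the NR_MLOCK zone counter updated in the functions below is
 * what shows up as "Mlocked:" in /proc/meminfo, alongside "Unevictable:"
 * for pages sitting on the unevictable LRU.
 */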

/*
 *  LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
        int nr_pages;

        if (!TestClearPageMlocked(page))
                return;

        nr_pages = thp_nr_pages(page);
        mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
        count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
        /*
         * The previous TestClearPageMlocked() corresponds to the smp_mb()
         * in __pagevec_lru_add_fn().
         *
         * See __pagevec_lru_add_fn for more explanation.
         */
        if (!isolate_lru_page(page)) {
                putback_lru_page(page);
        } else {
                /*
                 * We lost the race. The page already moved to the evictable list.
                 */
                if (PageUnevictable(page))
                        count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
        }
}

/**
 * mlock_page - mlock a page
 * @page: page to be mlocked, either a normal page or a THP head.
 */
void mlock_page(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);

        if (!TestSetPageMlocked(page)) {
                int nr_pages = thp_nr_pages(page);

                mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
                count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
                if (!isolate_lru_page(page))
                        putback_lru_page(page);
        }
}

/**
 * munlock_page - munlock a page
 * @page: page to be munlocked, either a normal page or a THP head.
 */
void munlock_page(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);

        if (TestClearPageMlocked(page)) {
                int nr_pages = thp_nr_pages(page);

                mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
                if (!isolate_lru_page(page)) {
                        putback_lru_page(page);
                        count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
                } else if (PageUnevictable(page)) {
                        count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
                }
        }
}

/*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 *  For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 */
static void munlock_vma_pages_range(struct vm_area_struct *vma,
                                    unsigned long start, unsigned long end)
{
        vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

        /* Reimplementation to follow in later commit */
}

/*
 * mlock_fixup  - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
        unsigned long start, unsigned long end, vm_flags_t newflags)
{
        struct mm_struct *mm = vma->vm_mm;
        pgoff_t pgoff;
        int nr_pages;
        int ret = 0;
        int lock = !!(newflags & VM_LOCKED);
        vm_flags_t old_flags = vma->vm_flags;

        if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
            is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
            vma_is_dax(vma) || vma_is_secretmem(vma))
                /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
                goto out;

        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
                          vma->vm_userfaultfd_ctx, vma_anon_name(vma));
        if (*prev) {
                vma = *prev;
                goto success;
        }

        if (start != vma->vm_start) {
                ret = split_vma(mm, vma, start, 1);
                if (ret)
                        goto out;
        }

        if (end != vma->vm_end) {
                ret = split_vma(mm, vma, end, 0);
                if (ret)
                        goto out;
        }

success:
        /*
         * Keep track of amount of locked VM.
         */
        nr_pages = (end - start) >> PAGE_SHIFT;
        if (!lock)
                nr_pages = -nr_pages;
        else if (old_flags & VM_LOCKED)
                nr_pages = 0;
        mm->locked_vm += nr_pages;

        /*
         * vm_flags is protected by the mmap_lock held in write mode.
         * It's okay if try_to_unmap_one unmaps a page just after we
         * set VM_LOCKED, populate_vma_page_range will bring it back.
         */

        if (lock)
                vma->vm_flags = newflags;
        else
                munlock_vma_pages_range(vma, start, end);

out:
        *prev = vma;
        return ret;
}

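/*
 * apply_vma_lock_flags() - set or clear VM_LOCKED/VM_LOCKONFAULT on every
 * vma overlapping [start, start + len), via mlock_fixup().  Callers hold
 * mmap_lock for write, and start/len are already page aligned.  Returns 0
 * on success, or a negative error (e.g. -ENOMEM when part of the range is
 * unmapped).
 */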
static int apply_vma_lock_flags(unsigned long start, size_t len,
                                vm_flags_t flags)
{
        unsigned long nstart, end, tmp;
        struct vm_area_struct *vma, *prev;
        int error;

        VM_BUG_ON(offset_in_page(start));
        VM_BUG_ON(len != PAGE_ALIGN(len));
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;
        vma = find_vma(current->mm, start);
        if (!vma || vma->vm_start > start)
                return -ENOMEM;

        prev = vma->vm_prev;
        if (start > vma->vm_start)
                prev = vma;

        for (nstart = start ; ; ) {
                vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

                newflags |= flags;

                /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
                error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
                if (error)
                        break;
                nstart = tmp;
                if (nstart < prev->vm_end)
                        nstart = prev->vm_end;
                if (nstart >= end)
                        break;

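                /* A gap before the next vma means part of the range is unmapped. */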
                vma = prev->vm_next;
                if (!vma || vma->vm_start != nstart) {
                        error = -ENOMEM;
                        break;
                }
        }
        return error;
}

/*
 * Go through the vma areas and sum the size of the mlocked vma pages,
 * returning that as the value.  Note that the deferred memory locking
 * case (mlock2() with MLOCK_ONFAULT) is also counted.
 * Return value: count of previously mlocked pages.
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
                unsigned long start, size_t len)
{
        struct vm_area_struct *vma;
        unsigned long count = 0;

        if (mm == NULL)
                mm = current->mm;

        vma = find_vma(mm, start);
        if (vma == NULL)
                return 0;

        for (; vma ; vma = vma->vm_next) {
                if (start >= vma->vm_end)
                        continue;
                if (start + len <=  vma->vm_start)
                        break;
                if (vma->vm_flags & VM_LOCKED) {
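                        /*
                         * Count only the bytes of this VM_LOCKED vma that
                         * overlap [start, start + len): trim any part below
                         * start, and stop once the end of the range falls
                         * inside this vma.
                         */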
                        if (start > vma->vm_start)
                                count -= (start - vma->vm_start);
                        if (start + len < vma->vm_end) {
                                count += start + len - vma->vm_start;
                                break;
                        }
                        count += vma->vm_end - vma->vm_start;
                }
        }

        return count >> PAGE_SHIFT;
}

/*
 * convert get_user_pages() return value to posix mlock() error
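 * (POSIX specifies ENOMEM when part of the range is not mapped, and EAGAIN
 * when some or all of the pages could not be locked, hence the conversions
 * below)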
 */
static int __mlock_posix_error_return(long retval)
{
        if (retval == -EFAULT)
                retval = -ENOMEM;
        else if (retval == -ENOMEM)
                retval = -EAGAIN;
        return retval;
}

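/*
 * do_mlock() - common helper for the mlock() and mlock2() syscalls:
 * page-align the range, check it against RLIMIT_MEMLOCK (unless the caller
 * has CAP_IPC_LOCK), apply the requested VM_LOCKED flags under mmap_lock,
 * and finally populate the range with __mm_populate().
 */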
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
        unsigned long locked;
        unsigned long lock_limit;
        int error = -ENOMEM;

        start = untagged_addr(start);

        if (!can_do_mlock())
                return -EPERM;

        len = PAGE_ALIGN(len + (offset_in_page(start)));
        start &= PAGE_MASK;

        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = len >> PAGE_SHIFT;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        locked += current->mm->locked_vm;
        if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
                /*
                 * It is possible that the regions requested intersect with
                 * previously mlocked areas; that part is already accounted
                 * in "mm->locked_vm" and should not be counted again toward
                 * the new mlock total.  So check and adjust the locked count
                 * if necessary.
                 */
                locked -= count_mm_mlocked_page_nr(current->mm,
                                start, len);
        }

        /* check against resource limits */
        if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
                error = apply_vma_lock_flags(start, len, flags);

        mmap_write_unlock(current->mm);
        if (error)
                return error;

        error = __mm_populate(start, len, 0);
        if (error)
                return __mlock_posix_error_return(error);
        return 0;
}

SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
        return do_mlock(start, len, VM_LOCKED);
}

SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
        vm_flags_t vm_flags = VM_LOCKED;

        if (flags & ~MLOCK_ONFAULT)
                return -EINVAL;

        if (flags & MLOCK_ONFAULT)
                vm_flags |= VM_LOCKONFAULT;

        return do_mlock(start, len, vm_flags);
}
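
/*
 * Illustrative userspace use of mlock2() (not part of this file; the libc
 * wrapper name is assumed):
 *
 *      if (mlock2(addr, length, MLOCK_ONFAULT))
 *              perror("mlock2");
 *
 * With MLOCK_ONFAULT the range is locked as it is faulted in, rather than
 * being populated up front as plain mlock() does.
 */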

SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
        int ret;

        start = untagged_addr(start);

        len = PAGE_ALIGN(len + (offset_in_page(start)));
        start &= PAGE_MASK;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;
        ret = apply_vma_lock_flags(start, len, 0);
        mmap_write_unlock(current->mm);

        return ret;
}

/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
static int apply_mlockall_flags(int flags)
{
        struct vm_area_struct *vma, *prev = NULL;
        vm_flags_t to_add = 0;

        current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
        if (flags & MCL_FUTURE) {
                current->mm->def_flags |= VM_LOCKED;

                if (flags & MCL_ONFAULT)
                        current->mm->def_flags |= VM_LOCKONFAULT;

                if (!(flags & MCL_CURRENT))
                        goto out;
        }

        if (flags & MCL_CURRENT) {
                to_add |= VM_LOCKED;
                if (flags & MCL_ONFAULT)
                        to_add |= VM_LOCKONFAULT;
        }

        for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
                vm_flags_t newflags;

                newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
                newflags |= to_add;

                /* Ignore errors */
                mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
                cond_resched();
        }
out:
        return 0;
}

SYSCALL_DEFINE1(mlockall, int, flags)
{
        unsigned long lock_limit;
        int ret;

        if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
            flags == MCL_ONFAULT)
                return -EINVAL;

        if (!can_do_mlock())
                return -EPERM;

        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        ret = -ENOMEM;
        if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
            capable(CAP_IPC_LOCK))
                ret = apply_mlockall_flags(flags);
        mmap_write_unlock(current->mm);
        if (!ret && (flags & MCL_CURRENT))
                mm_populate(0, TASK_SIZE);

        return ret;
}
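
/*
 * Illustrative userspace sequence (not part of this file) showing the
 * flag-stacking behaviour described above apply_mlockall_flags():
 *
 *      mlockall(MCL_CURRENT | MCL_FUTURE);
 *      mlockall(MCL_CURRENT);
 *
 * The second call clears VM_LOCKED and VM_LOCKONFAULT from mm->def_flags
 * again, so mappings created afterwards are no longer locked automatically.
 */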

SYSCALL_DEFINE0(munlockall)
{
        int ret;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;
        ret = apply_mlockall_flags(0);
        mmap_write_unlock(current->mm);
        return ret;
}

/*
 * Objects with a lifetime different from that of processes (SHM_LOCK and
 * SHM_HUGETLB shm segments) get accounted against the ucounts of the
 * locking user instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

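/*
 * user_shm_lock() - charge @size bytes against the RLIMIT_MEMLOCK ucount of
 * @ucounts.  Returns 1 and holds a ucounts reference on success (dropped by
 * user_shm_unlock()); returns 0 if the charge is not allowed or the
 * reference could not be taken.
 */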
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
        unsigned long lock_limit, locked;
        long memlock;
        int allowed = 0;

        locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        lock_limit = rlimit(RLIMIT_MEMLOCK);
        if (lock_limit == RLIM_INFINITY)
                allowed = 1;
        lock_limit >>= PAGE_SHIFT;
        spin_lock(&shmlock_user_lock);
        memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

        if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
                dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
                goto out;
        }
        if (!get_ucounts(ucounts)) {
                dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
                goto out;
        }
        allowed = 1;
out:
        spin_unlock(&shmlock_user_lock);
        return allowed;
}

void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
        spin_lock(&shmlock_user_lock);
        dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
        spin_unlock(&shmlock_user_lock);
        put_ucounts(ucounts);
}