// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */
#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"
bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);
/*
 * Mlocked pages are marked with the PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 */
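/*
 * In outline (a summary of the accounting done by the helpers below, not
 * new policy): the PageMlocked flag is kept in step with two counters --
 * the per-zone NR_MLOCK statistic (reported as "Mlocked" in /proc/meminfo)
 * and the UNEVICTABLE_* vm_events:
 *
 *	mlock_page():		NR_MLOCK += nr_pages, UNEVICTABLE_PGMLOCKED
 *	munlock_page():		NR_MLOCK -= nr_pages, UNEVICTABLE_PGMUNLOCKED
 *	clear_page_mlock():	NR_MLOCK -= nr_pages, UNEVICTABLE_PGCLEARED
 *
 * UNEVICTABLE_PGSTRANDED counts pages found stranded on the unevictable
 * list after one of these helpers lost an isolation race.
 */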
/*
 * LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
	int nr_pages;

	if (!TestClearPageMlocked(page))
		return;

	nr_pages = thp_nr_pages(page);
	mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
	count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
	/*
	 * The previous TestClearPageMlocked() corresponds to the smp_mb()
	 * in __pagevec_lru_add_fn().
	 *
	 * See __pagevec_lru_add_fn for more explanation.
	 */
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race: the page has already moved to the
		 * evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}
}
/**
 * mlock_page - mlock a page
 * @page: page to be mlocked, either a normal page or a THP head.
 */
void mlock_page(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);

	if (!TestSetPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}
/**
 * munlock_page - munlock a page
 * @page: page to be munlocked, either a normal page or a THP head.
 */
void munlock_page(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);

	if (TestClearPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
		if (!isolate_lru_page(page)) {
			putback_lru_page(page);
			count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
		} else if (PageUnevictable(page)) {
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
		}
	}
}
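/*
 * Note the asymmetry in munlock_page() above: UNEVICTABLE_PGMUNLOCKED is
 * only counted when the page could be isolated and put back on an
 * evictable list; when isolation fails while the page still appears
 * unevictable, it is counted as UNEVICTABLE_PGSTRANDED instead.
 */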
/*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 * For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 */
static void munlock_vma_pages_range(struct vm_area_struct *vma,
				    unsigned long start, unsigned long end)
{
	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

	/* Reimplementation to follow in later commit */
}
/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = !!(newflags & VM_LOCKED);
	vm_flags_t old_flags = vma->vm_flags;

	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma) || vma_is_secretmem(vma))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, vma_anon_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
	else if (old_flags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */
	if (lock)
		vma->vm_flags = newflags;
	else
		munlock_vma_pages_range(vma, start, end);

out:
	*prev = vma;
	return ret;
}
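/*
 * A sketch of the locked_vm arithmetic above (illustrative numbers only):
 * locking a previously unlocked 16-page range adds 16 to mm->locked_vm;
 * unlocking it again subtracts 16; re-locking an already VM_LOCKED range
 * (say, to add VM_LOCKONFAULT) adds 0, because old_flags already had
 * VM_LOCKED set.
 */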
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	int error;

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

		newflags |= flags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}
/*
 * Go through the vma areas and sum the size of the mlocked
 * vma pages, as the return value.
 * Note that the deferred memory locking case (mlock2() with
 * MLOCK_ONFAULT) is also counted.
 * Return value: count of previously mlocked pages
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	unsigned long count = 0;

	if (mm == NULL)
		mm = current->mm;

	vma = find_vma(mm, start);
	if (vma == NULL)
		return 0;

	for (; vma ; vma = vma->vm_next) {
		if (start >= vma->vm_end)
			continue;
		if (start + len <= vma->vm_start)
			break;
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (start + len < vma->vm_end) {
				count += start + len - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}
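/*
 * A worked example of the counting above (hypothetical addresses, 4KiB
 * pages): with a single VM_LOCKED vma spanning [0x10000, 0x20000) and a
 * request of start = 0x18000, len = 0x10000, the loop subtracts the part
 * of the vma before start (0x8000) and adds the whole vma size (0x10000),
 * so the function returns 0x8000 >> PAGE_SHIFT = 8 already-mlocked pages.
 */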
/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}
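/*
 * That mapping follows POSIX: mlock() is specified to return ENOMEM when
 * part of the range is not mapped (where get_user_pages() says -EFAULT),
 * and EAGAIN when some or all of the memory could not be locked (where
 * get_user_pages() says -ENOMEM).
 */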
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	locked += current->mm->locked_vm;
	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
		/*
		 * It is possible that the regions requested intersect with
		 * previously mlocked areas; that part, already counted in
		 * "mm->locked_vm", should not be counted again towards the
		 * new mlock increment. So check and adjust the locked count
		 * if necessary.
		 */
		locked -= count_mm_mlocked_page_nr(current->mm,
				start, len);
	}

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	mmap_write_unlock(current->mm);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}
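/*
 * Note that population runs after mmap_write_unlock(): __mm_populate()
 * acquires the mmap lock itself, in read mode, so the pages are faulted
 * in without the write lock being held across the whole range.
 */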
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	return do_mlock(start, len, VM_LOCKED);
}
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}
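/*
 * A minimal userspace sketch of the mlock2() call above (illustrative
 * only: the buffer and length are hypothetical, and older libcs lack a
 * wrapper, in which case syscall(__NR_mlock2, ...) may be used instead):
 *
 *	char *buf = malloc(len);
 *	if (mlock2(buf, len, MLOCK_ONFAULT))
 *		perror("mlock2");
 *
 * With MLOCK_ONFAULT, pages are locked as they are first touched rather
 * than being faulted in up front as plain mlock() would do.
 */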
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	start = untagged_addr(start);

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(current->mm);

	return ret;
}
/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
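/*
 * For example, the non-stacking behaviour described above means that after
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE);
 *	mlockall(MCL_CURRENT);
 *
 * existing VMAs remain locked but future mappings are no longer locked
 * automatically: each call first clears mm->def_flags with
 * VM_LOCKED_CLEAR_MASK before applying the newly requested flags.
 */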
static int apply_mlockall_flags(int flags)
{
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
		newflags |= to_add;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
		cond_resched();
	}
out:
	return 0;
}
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}
SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}
/*
 * Objects with a different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user's ucounts instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
	unsigned long lock_limit, locked;
	long memlock;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

	if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		goto out;
	}
	if (!get_ucounts(ucounts)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		allowed = 0;
		goto out;
	}
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}
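/*
 * Return convention above: nonzero means the locking was allowed and a
 * ucounts reference was taken via get_ucounts(); the caller undoes both
 * with user_shm_unlock() below. A zero return means the request was
 * rejected and the charge has already been backed out.
 */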
void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}