// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);

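/*
 * Illustrative example, not part of the original file: with
 * RLIMIT_MEMLOCK set to 0 and without CAP_IPC_LOCK, can_do_mlock()
 * returns false and mlock() fails outright:
 *
 *	setrlimit(RLIMIT_MEMLOCK, &(struct rlimit){ 0, 0 });
 *	if (mlock(buf, len) != 0)
 *		perror("mlock");	(EPERM for an unprivileged task)
 *
 * Any nonzero limit (or the capability) only makes the attempt
 * permissible; the requested amount is still checked against the limit
 * later, in do_mlock().
 */
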
/*
 * Mlocked pages are marked with the PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 */

/*
 * LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
	int nr_pages;

	if (!TestClearPageMlocked(page))
		return;

	nr_pages = thp_nr_pages(page);
	mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
	count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
	/*
	 * The previous TestClearPageMlocked() corresponds to the smp_mb()
	 * in __pagevec_lru_add_fn().
	 *
	 * See __pagevec_lru_add_fn for more explanation.
	 */
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race: the page has already moved to the
		 * evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}
}

/*
 * Mark page as mlocked if not already.
 * If page on LRU, isolate and putback to move to unevictable list.
 */
void mlock_vma_page(struct page *page)
{
	/* Serialize with page migration */
	BUG_ON(!PageLocked(page));

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

	if (!TestSetPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}

/**
 * munlock_vma_page - munlock a vma page
 * @page: page to be unlocked, either a normal page or a THP page head
 */
void munlock_vma_page(struct page *page)
{
	/* Serialize with page migration */
	BUG_ON(!PageLocked(page));

	VM_BUG_ON_PAGE(PageTail(page), page);

	if (TestClearPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
		if (!isolate_lru_page(page)) {
			putback_lru_page(page);
			count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
		} else if (PageUnevictable(page)) {
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
		}
	}
}

/*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 * For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 */
void munlock_vma_pages_range(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

	/* Reimplementation to follow in later commit */
}

/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op; the request succeeds, but the vma's flags are left
 * unchanged and its pages are not counted.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = !!(newflags & VM_LOCKED);
	vm_flags_t old_flags = vma->vm_flags;

	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma) || vma_is_secretmem(vma))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, vma_anon_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
	else if (old_flags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */
	if (lock)
		vma->vm_flags = newflags;
	else
		munlock_vma_pages_range(vma, start, end);

out:
	*prev = vma;
	return ret;
}

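/*
 * Illustrative example, not part of the original source: mlocking the
 * middle of a vma splits it.  Given one vma spanning [A, D) and an
 * mlock request for [B, C) with A < B < C < D, the two split_vma()
 * calls above leave three vmas [A, B), [B, C) and [C, D), and only
 * [B, C) gets VM_LOCKED.  A later munlock of [B, C) clears the flag
 * and lets vma_merge() fold the three back together.
 */
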
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	int error;

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

		newflags |= flags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}

/*
 * Go through the vmas covering [start, start + len) and sum the sizes
 * of the mlocked vma ranges, as the return value.
 * Note that the deferred memory locking case (mlock2() with
 * MLOCK_ONFAULT) is also counted.
 *
 * Return value: previously mlocked page count
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	unsigned long count = 0;

	if (mm == NULL)
		mm = current->mm;

	vma = find_vma(mm, start);
	if (vma == NULL)
		return 0;

	for (; vma ; vma = vma->vm_next) {
		if (start >= vma->vm_end)
			continue;
		if (start + len <= vma->vm_start)
			break;
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (start + len < vma->vm_end) {
				count += start + len - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}

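/*
 * Worked example, not part of the original source: with 4KiB pages, a
 * VM_LOCKED vma spanning [0x10000, 0x20000) and a request with
 * start = 0x14000, len = 0x8000 (so start + len = 0x1c000):
 *
 *	count -= 0x14000 - 0x10000;	(start lies inside the vma)
 *	count += 0x1c000 - 0x10000;	(request ends inside the vma)
 *
 * leaving count = 0x8000, i.e. count >> PAGE_SHIFT = 8 pages -- exactly
 * the overlap between the request and the locked vma.
 */
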
/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}

static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	locked += current->mm->locked_vm;
	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
		/*
		 * It is possible that the requested region intersects with
		 * previously mlocked areas; that part is already accounted
		 * in "mm->locked_vm" and must not be counted again toward
		 * the new mlock increment.  Check and adjust the locked
		 * count if necessary.
		 */
		locked -= count_mm_mlocked_page_nr(current->mm,
				start, len);
	}

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	mmap_write_unlock(current->mm);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}

SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	return do_mlock(start, len, VM_LOCKED);
}

SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}

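/*
 * Illustrative userspace usage, not part of the original source
 * (assumes a libc exposing mlock2() and MLOCK_ONFAULT, e.g. glibc 2.27+):
 *
 *	char *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (mlock2(buf, 1 << 20, MLOCK_ONFAULT))
 *		perror("mlock2");
 *	buf[0] = 1;	(this page is now resident and locked)
 *
 * With MLOCK_ONFAULT, pages are locked as they are faulted in, so the
 * whole mapping need not be populated up front as plain mlock() would
 * require.
 */
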
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	start = untagged_addr(start);

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(current->mm);

	return ret;
}

/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
static int apply_mlockall_flags(int flags)
{
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
		newflags |= to_add;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
		cond_resched();
	}
out:
	return 0;
}

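/*
 * Illustrative example, not part of the original source, of the
 * non-stacking semantics described above:
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE);
 *	mlockall(MCL_CURRENT);
 *
 * The second call rebuilds mm->def_flags from scratch, so VM_LOCKED is
 * cleared from it again and mappings created afterwards are no longer
 * locked by default; only the MCL_CURRENT locking remains in effect.
 */
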
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}

SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user's ucounts instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

int user_shm_lock(size_t size, struct ucounts *ucounts)
{
	unsigned long lock_limit, locked;
	long memlock;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

	if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		goto out;
	}
	if (!get_ucounts(ucounts)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		allowed = 0;
		goto out;
	}
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}

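/*
 * Illustrative caller sketch, not part of the original source: these
 * helpers back shmctl(SHM_LOCK)/shmctl(SHM_UNLOCK) via shmem_lock() in
 * mm/shmem.c, roughly:
 *
 *	if (!user_shm_lock(inode->i_size, ucounts))
 *		return an error;	(over RLIMIT_MEMLOCK, charge rolled back)
 *	...
 *	user_shm_unlock(inode->i_size, ucounts);
 *
 * The charge lives on the locking user's ucounts rather than on any
 * process's mm, so it outlives the task that issued SHM_LOCK.
 */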