1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  mm/mprotect.c
4  *
5  *  (C) Copyright 1994 Linus Torvalds
6  *  (C) Copyright 2002 Christoph Hellwig
7  *
8  *  Address space accounting code       <alan@lxorguk.ukuu.org.uk>
9  *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
10  */
11
12 #include <linux/pagewalk.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/mman.h>
16 #include <linux/fs.h>
17 #include <linux/highmem.h>
18 #include <linux/security.h>
19 #include <linux/mempolicy.h>
20 #include <linux/personality.h>
21 #include <linux/syscalls.h>
22 #include <linux/swap.h>
23 #include <linux/swapops.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/migrate.h>
26 #include <linux/perf_event.h>
27 #include <linux/pkeys.h>
28 #include <linux/ksm.h>
29 #include <linux/uaccess.h>
30 #include <linux/mm_inline.h>
31 #include <linux/pgtable.h>
32 #include <linux/sched/sysctl.h>
33 #include <linux/userfaultfd_k.h>
34 #include <linux/memory-tiers.h>
35 #include <asm/cacheflush.h>
36 #include <asm/mmu_context.h>
37 #include <asm/tlbflush.h>
38 #include <asm/tlb.h>
39
40 #include "internal.h"
41
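/*
 * Can we safely upgrade a read-only pte to writable without going through
 * the write-fault handler?  The checks below mirror what the fault handler
 * would do: skip protnone entries and entries that still need softdirty or
 * uffd-wp write tracking, then require an exclusive anonymous page for
 * MAP_PRIVATE or an already-dirty pte for MAP_SHARED.
 */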
42 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
43                              pte_t pte)
44 {
45         struct page *page;
46
47         if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
48                 return false;
49
50         /* Don't touch entries that are not even readable. */
51         if (pte_protnone(pte))
52                 return false;
53
54         /* Do we need write faults for softdirty tracking? */
55         if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
56                 return false;
57
58         /* Do we need write faults for uffd-wp tracking? */
59         if (userfaultfd_pte_wp(vma, pte))
60                 return false;
61
62         if (!(vma->vm_flags & VM_SHARED)) {
63                 /*
64                  * Writable MAP_PRIVATE mapping: We can only special-case on
65                  * exclusive anonymous pages, because we know that our
66                  * write-fault handler similarly would map them writable without
67                  * any additional checks while holding the PT lock.
68                  */
69                 page = vm_normal_page(vma, addr, pte);
70                 return page && PageAnon(page) && PageAnonExclusive(page);
71         }
72
73         /*
74          * Writable MAP_SHARED mapping: "clean" might indicate that the FS still
75          * needs a real write-fault for writenotify
76          * (see vma_wants_writenotify()). If "dirty", the assumption is that the
77          * FS was already notified and we can simply mark the PTE writable
78          * just like the write-fault handler would do.
79          */
80         return pte_dirty(pte);
81 }
82
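/*
 * Update protections on all ptes under @pmd in [addr, end).  Returns the
 * number of ptes changed, or -EAGAIN when the pte table could no longer be
 * mapped (the caller then retries this pmd).
 */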
83 static long change_pte_range(struct mmu_gather *tlb,
84                 struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
85                 unsigned long end, pgprot_t newprot, unsigned long cp_flags)
86 {
87         pte_t *pte, oldpte;
88         spinlock_t *ptl;
89         long pages = 0;
90         int target_node = NUMA_NO_NODE;
91         bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
92         bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
93         bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
94
95         tlb_change_page_size(tlb, PAGE_SIZE);
96         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
97         if (!pte)
98                 return -EAGAIN;
99
100         /* Get target node for single threaded private VMAs */
101         if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
102             atomic_read(&vma->vm_mm->mm_users) == 1)
103                 target_node = numa_node_id();
104
105         flush_tlb_batched_pending(vma->vm_mm);
106         arch_enter_lazy_mmu_mode();
107         do {
108                 oldpte = ptep_get(pte);
109                 if (pte_present(oldpte)) {
110                         pte_t ptent;
111
112                         /*
113                          * Avoid trapping faults against the zero or KSM
114                          * pages. See similar comment in change_huge_pmd.
115                          */
116                         if (prot_numa) {
117                                 struct folio *folio;
118                                 int nid;
119                                 bool toptier;
120
121                                 /* Avoid TLB flush if possible */
122                                 if (pte_protnone(oldpte))
123                                         continue;
124
125                                 folio = vm_normal_folio(vma, addr, oldpte);
126                                 if (!folio || folio_is_zone_device(folio) ||
127                                     folio_test_ksm(folio))
128                                         continue;
129
130                                 /* Also skip shared copy-on-write pages */
131                                 if (is_cow_mapping(vma->vm_flags) &&
132                                     folio_ref_count(folio) != 1)
133                                         continue;
134
135                                 /*
136                                  * While migration can move some dirty pages,
137                                  * it cannot move them all from MIGRATE_ASYNC
138                                  * context.
139                                  */
140                                 if (folio_is_file_lru(folio) &&
141                                     folio_test_dirty(folio))
142                                         continue;
143
144                                 /*
145                                  * Don't mess with PTEs if page is already on the node
146                                  * a single-threaded process is running on.
147                                  */
148                                 nid = folio_nid(folio);
149                                 if (target_node == nid)
150                                         continue;
151                                 toptier = node_is_toptier(nid);
152
153                                 /*
154                                  * Skip scanning top tier node if normal numa
155                                  * balancing is disabled
156                                  */
157                                 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
158                                     toptier)
159                                         continue;
160                                 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
161                                     !toptier)
162                                         folio_xchg_access_time(folio,
163                                                 jiffies_to_msecs(jiffies));
164                         }
165
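                        /*
                         * ptep_modify_prot_start()/_commit() bracket the pte
                         * update so the architecture can keep it atomic with
                         * respect to concurrent hardware access/dirty bit
                         * updates, and batch it where supported.
                         */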
166                         oldpte = ptep_modify_prot_start(vma, addr, pte);
167                         ptent = pte_modify(oldpte, newprot);
168
169                         if (uffd_wp)
170                                 ptent = pte_mkuffd_wp(ptent);
171                         else if (uffd_wp_resolve)
172                                 ptent = pte_clear_uffd_wp(ptent);
173
174                         /*
175                          * In some writable, shared mappings, we might want
176                          * to catch actual write access -- see
177                          * vma_wants_writenotify().
178                          *
179                          * In all writable, private mappings, we have to
180                          * properly handle COW.
181                          *
182                          * In both cases, we can sometimes still change PTEs
183                          * writable and avoid the write-fault handler, for
184                          * example, if a PTE is already dirty and no other
185                          * COW or special handling is required.
186                          */
187                         if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
188                             !pte_write(ptent) &&
189                             can_change_pte_writable(vma, addr, ptent))
190                                 ptent = pte_mkwrite(ptent, vma);
191
192                         ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
193                         if (pte_needs_flush(oldpte, ptent))
194                                 tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
195                         pages++;
196                 } else if (is_swap_pte(oldpte)) {
197                         swp_entry_t entry = pte_to_swp_entry(oldpte);
198                         pte_t newpte;
199
200                         if (is_writable_migration_entry(entry)) {
201                                 struct page *page = pfn_swap_entry_to_page(entry);
202
203                                 /*
204                                  * A protection check is difficult, so
205                                  * just be safe and disable write.
206                                  */
207                                 if (PageAnon(page))
208                                         entry = make_readable_exclusive_migration_entry(
209                                                              swp_offset(entry));
210                                 else
211                                         entry = make_readable_migration_entry(swp_offset(entry));
212                                 newpte = swp_entry_to_pte(entry);
213                                 if (pte_swp_soft_dirty(oldpte))
214                                         newpte = pte_swp_mksoft_dirty(newpte);
215                         } else if (is_writable_device_private_entry(entry)) {
216                                 /*
217                                  * We do not preserve soft-dirtiness. See
218                                  * copy_nonpresent_pte() for explanation.
219                                  */
220                                 entry = make_readable_device_private_entry(
221                                                         swp_offset(entry));
222                                 newpte = swp_entry_to_pte(entry);
223                                 if (pte_swp_uffd_wp(oldpte))
224                                         newpte = pte_swp_mkuffd_wp(newpte);
225                         } else if (is_writable_device_exclusive_entry(entry)) {
226                                 entry = make_readable_device_exclusive_entry(
227                                                         swp_offset(entry));
228                                 newpte = swp_entry_to_pte(entry);
229                                 if (pte_swp_soft_dirty(oldpte))
230                                         newpte = pte_swp_mksoft_dirty(newpte);
231                                 if (pte_swp_uffd_wp(oldpte))
232                                         newpte = pte_swp_mkuffd_wp(newpte);
233                         } else if (is_pte_marker_entry(entry)) {
234                                 /*
235                                  * Ignore error swap entries unconditionally,
236                                  * because any access should sigbus anyway.
237                                  */
238                                 if (is_poisoned_swp_entry(entry))
239                                         continue;
240                                 /*
241                                  * If this is a uffd-wp pte marker and we'd like
242                                  * to unprotect it, drop it; the next page
243                                  * fault will trigger without uffd trapping.
244                                  */
245                                 if (uffd_wp_resolve) {
246                                         pte_clear(vma->vm_mm, addr, pte);
247                                         pages++;
248                                 }
249                                 continue;
250                         } else {
251                                 newpte = oldpte;
252                         }
253
254                         if (uffd_wp)
255                                 newpte = pte_swp_mkuffd_wp(newpte);
256                         else if (uffd_wp_resolve)
257                                 newpte = pte_swp_clear_uffd_wp(newpte);
258
259                         if (!pte_same(oldpte, newpte)) {
260                                 set_pte_at(vma->vm_mm, addr, pte, newpte);
261                                 pages++;
262                         }
263                 } else {
264                         /* It must be a none pte; what else could it be? */
265                         WARN_ON_ONCE(!pte_none(oldpte));
266
267                         /*
268                          * Nobody plays with any none ptes besides
269                          * userfaultfd when applying the protections.
270                          */
271                         if (likely(!uffd_wp))
272                                 continue;
273
274                         if (userfaultfd_wp_use_markers(vma)) {
275                                 /*
276                                  * For file-backed mem, we need to be able to
277                                  * wr-protect a none pte, because even if the
278                                  * pte is none, the page/swap cache could
279                                  * exist.  Do that by installing a marker.
280                                  */
281                                 set_pte_at(vma->vm_mm, addr, pte,
282                                            make_pte_marker(PTE_MARKER_UFFD_WP));
283                                 pages++;
284                         }
285                 }
286         } while (pte++, addr += PAGE_SIZE, addr != end);
287         arch_leave_lazy_mmu_mode();
288         pte_unmap_unlock(pte - 1, ptl);
289
290         return pages;
291 }
292
293 /*
294  * Return true if we want to split THPs into PTE mappings in the change
295  * protection procedure, false otherwise.
296  */
297 static inline bool
298 pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
299 {
300         /*
301          * pte markers only reside at the pte level; if we need pte markers,
302          * we need to split.  We cannot wr-protect a shmem thp because, so far,
303          * file thp is handled differently when split: the pmd is simply erased.
304          */
305         return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
306 }
307
308 /*
309  * Return true if we want to populate pgtables in the change protection
310  * procedure, false otherwise.
311  */
312 static inline bool
313 pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
314 {
315         /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
316         if (!(cp_flags & MM_CP_UFFD_WP))
317                 return false;
318
319         /* Populate if the userfaultfd mode requires pte markers */
320         return userfaultfd_wp_use_markers(vma);
321 }
322
323 /*
324  * Populate the pgtable underneath for whatever reason if requested.
325  * When {pte|pmd|...}_alloc() fails we treat it the same way as pgtable
326  * allocation failures during page faults, by kicking OOM and returning
327  * an error.
328  */
329 #define  change_pmd_prepare(vma, pmd, cp_flags)                         \
330         ({                                                              \
331                 long err = 0;                                           \
332                 if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \
333                         if (pte_alloc(vma->vm_mm, pmd))                 \
334                                 err = -ENOMEM;                          \
335                 }                                                       \
336                 err;                                                    \
337         })
338
339 /*
340  * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
341  * have a separate change_pmd_prepare() because pte_alloc() returns 0 on success,
342  * while {pmd|pud|p4d}_alloc() returns a valid pointer on success.
343  */
344 #define  change_prepare(vma, high, low, addr, cp_flags)                 \
345           ({                                                            \
346                 long err = 0;                                           \
347                 if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \
348                         low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
349                         if (p == NULL)                                  \
350                                 err = -ENOMEM;                          \
351                 }                                                       \
352                 err;                                                    \
353         })
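/*
 * For illustration, change_prepare(vma, p4d, pud, addr, cp_flags) expands the
 * low##_alloc() call above into pud_alloc(vma->vm_mm, p4d, addr), i.e. it
 * allocates the pud level under @p4d (only when pte markers are needed) and
 * evaluates to -ENOMEM on allocation failure.
 */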
354
355 static inline long change_pmd_range(struct mmu_gather *tlb,
356                 struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
357                 unsigned long end, pgprot_t newprot, unsigned long cp_flags)
358 {
359         pmd_t *pmd;
360         unsigned long next;
361         long pages = 0;
362         unsigned long nr_huge_updates = 0;
363         struct mmu_notifier_range range;
364
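        /*
         * range.start == 0 doubles as "notifier not initialized": the MMU
         * notifier range below is only set up once a populated pmd is found,
         * and only torn down at the end if it was actually started.
         */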
365         range.start = 0;
366
367         pmd = pmd_offset(pud, addr);
368         do {
369                 long ret;
370                 pmd_t _pmd;
371 again:
372                 next = pmd_addr_end(addr, end);
373
374                 ret = change_pmd_prepare(vma, pmd, cp_flags);
375                 if (ret) {
376                         pages = ret;
377                         break;
378                 }
379
380                 if (pmd_none(*pmd))
381                         goto next;
382
383                 /* invoke the mmu notifier if the pmd is populated */
384                 if (!range.start) {
385                         mmu_notifier_range_init(&range,
386                                 MMU_NOTIFY_PROTECTION_VMA, 0,
387                                 vma->vm_mm, addr, end);
388                         mmu_notifier_invalidate_range_start(&range);
389                 }
390
391                 _pmd = pmdp_get_lockless(pmd);
392                 if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
393                         if ((next - addr != HPAGE_PMD_SIZE) ||
394                             pgtable_split_needed(vma, cp_flags)) {
395                                 __split_huge_pmd(vma, pmd, addr, false, NULL);
396                                 /*
397                                  * For file-backed vmas, the pmd could have been
398                                  * cleared; make sure the pmd is populated if
399                                  * necessary, then fall through to the pte level.
400                                  */
401                                 ret = change_pmd_prepare(vma, pmd, cp_flags);
402                                 if (ret) {
403                                         pages = ret;
404                                         break;
405                                 }
406                         } else {
407                                 ret = change_huge_pmd(tlb, vma, pmd,
408                                                 addr, newprot, cp_flags);
409                                 if (ret) {
410                                         if (ret == HPAGE_PMD_NR) {
411                                                 pages += HPAGE_PMD_NR;
412                                                 nr_huge_updates++;
413                                         }
414
415                                         /* huge pmd was handled */
416                                         goto next;
417                                 }
418                         }
419                         /* fall through, the trans huge pmd just split */
420                 }
421
422                 ret = change_pte_range(tlb, vma, pmd, addr, next, newprot,
423                                        cp_flags);
424                 if (ret < 0)
425                         goto again;
426                 pages += ret;
427 next:
428                 cond_resched();
429         } while (pmd++, addr = next, addr != end);
430
431         if (range.start)
432                 mmu_notifier_invalidate_range_end(&range);
433
434         if (nr_huge_updates)
435                 count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
436         return pages;
437 }
438
439 static inline long change_pud_range(struct mmu_gather *tlb,
440                 struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
441                 unsigned long end, pgprot_t newprot, unsigned long cp_flags)
442 {
443         pud_t *pud;
444         unsigned long next;
445         long pages = 0, ret;
446
447         pud = pud_offset(p4d, addr);
448         do {
449                 next = pud_addr_end(addr, end);
450                 ret = change_prepare(vma, pud, pmd, addr, cp_flags);
451                 if (ret)
452                         return ret;
453                 if (pud_none_or_clear_bad(pud))
454                         continue;
455                 pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
456                                           cp_flags);
457         } while (pud++, addr = next, addr != end);
458
459         return pages;
460 }
461
462 static inline long change_p4d_range(struct mmu_gather *tlb,
463                 struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
464                 unsigned long end, pgprot_t newprot, unsigned long cp_flags)
465 {
466         p4d_t *p4d;
467         unsigned long next;
468         long pages = 0, ret;
469
470         p4d = p4d_offset(pgd, addr);
471         do {
472                 next = p4d_addr_end(addr, end);
473                 ret = change_prepare(vma, p4d, pud, addr, cp_flags);
474                 if (ret)
475                         return ret;
476                 if (p4d_none_or_clear_bad(p4d))
477                         continue;
478                 pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
479                                           cp_flags);
480         } while (p4d++, addr = next, addr != end);
481
482         return pages;
483 }
484
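/*
 * Walk the page tables of @vma in [addr, end), descending pgd -> p4d -> pud
 * -> pmd -> pte and applying @newprot.  Returns the number of pages whose
 * protection was changed, or a negative error if a pgtable allocation
 * requested for uffd-wp markers failed.
 */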
485 static long change_protection_range(struct mmu_gather *tlb,
486                 struct vm_area_struct *vma, unsigned long addr,
487                 unsigned long end, pgprot_t newprot, unsigned long cp_flags)
488 {
489         struct mm_struct *mm = vma->vm_mm;
490         pgd_t *pgd;
491         unsigned long next;
492         long pages = 0, ret;
493
494         BUG_ON(addr >= end);
495         pgd = pgd_offset(mm, addr);
496         tlb_start_vma(tlb, vma);
497         do {
498                 next = pgd_addr_end(addr, end);
499                 ret = change_prepare(vma, pgd, p4d, addr, cp_flags);
500                 if (ret) {
501                         pages = ret;
502                         break;
503                 }
504                 if (pgd_none_or_clear_bad(pgd))
505                         continue;
506                 pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
507                                           cp_flags);
508         } while (pgd++, addr = next, addr != end);
509
510         tlb_end_vma(tlb, vma);
511
512         return pages;
513 }
514
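/*
 * Apply the vma's current page protection (or PAGE_NONE when installing
 * NUMA-hinting faults) to [start, end), dispatching to
 * hugetlb_change_protection() for hugetlb vmas.  Returns the number of pages
 * updated.
 */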
515 long change_protection(struct mmu_gather *tlb,
516                        struct vm_area_struct *vma, unsigned long start,
517                        unsigned long end, unsigned long cp_flags)
518 {
519         pgprot_t newprot = vma->vm_page_prot;
520         long pages;
521
522         BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
523
524 #ifdef CONFIG_NUMA_BALANCING
525         /*
526          * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking)
527          * are expected to reflect their requirements via VMA flags such that
528          * vma_set_page_prot() will adjust vma->vm_page_prot accordingly.
529          */
530         if (cp_flags & MM_CP_PROT_NUMA)
531                 newprot = PAGE_NONE;
532 #else
533         WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
534 #endif
535
536         if (is_vm_hugetlb_page(vma))
537                 pages = hugetlb_change_protection(vma, start, end, newprot,
538                                                   cp_flags);
539         else
540                 pages = change_protection_range(tlb, vma, start, end, newprot,
541                                                 cp_flags);
542
543         return pages;
544 }
545
546 static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
547                                unsigned long next, struct mm_walk *walk)
548 {
549         return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
550                                   *(pgprot_t *)(walk->private)) ?
551                 0 : -EACCES;
552 }
553
554 static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
555                                    unsigned long addr, unsigned long next,
556                                    struct mm_walk *walk)
557 {
558         return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
559                                   *(pgprot_t *)(walk->private)) ?
560                 0 : -EACCES;
561 }
562
563 static int prot_none_test(unsigned long addr, unsigned long next,
564                           struct mm_walk *walk)
565 {
566         return 0;
567 }
568
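/*
 * Page-walk ops used by mprotect_fixup() below: when a VM_PFNMAP/VM_MIXEDMAP
 * mapping is being turned into an access-less (PROT_NONE) one, every mapped
 * pfn is checked with pfn_modify_allowed() against the new protection.
 */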
569 static const struct mm_walk_ops prot_none_walk_ops = {
570         .pte_entry              = prot_none_pte_entry,
571         .hugetlb_entry          = prot_none_hugetlb_entry,
572         .test_walk              = prot_none_test,
573         .walk_lock              = PGWALK_WRLOCK,
574 };
575
576 int
577 mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
578                struct vm_area_struct *vma, struct vm_area_struct **pprev,
579                unsigned long start, unsigned long end, unsigned long newflags)
580 {
581         struct mm_struct *mm = vma->vm_mm;
582         unsigned long oldflags = vma->vm_flags;
583         long nrpages = (end - start) >> PAGE_SHIFT;
584         unsigned int mm_cp_flags = 0;
585         unsigned long charged = 0;
586         int error;
587
588         if (newflags == oldflags) {
589                 *pprev = vma;
590                 return 0;
591         }
592
593         /*
594          * Do PROT_NONE PFN permission checks here when we can still
595          * bail out without undoing a lot of state. This is a rather
596          * uncommon case, so doesn't need to be very optimized.
597          */
598         if (arch_has_pfn_modify_check() &&
599             (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
600             (newflags & VM_ACCESS_FLAGS) == 0) {
601                 pgprot_t new_pgprot = vm_get_page_prot(newflags);
602
603                 error = walk_page_range(current->mm, start, end,
604                                 &prot_none_walk_ops, &new_pgprot);
605                 if (error)
606                         return error;
607         }
608
609         /*
610          * If we make a private mapping writable we increase our commit;
611          * but (without finer accounting) cannot reduce our commit if we
612          * make it unwritable again, except in the anonymous case where no
613          * anon_vma has been assigned yet.
614          *
615          * hugetlb mappings were accounted for even if read-only, so there is
616          * no need to account for them here.
617          */
618         if (newflags & VM_WRITE) {
619                 /* Check space limits when area turns into data. */
620                 if (!may_expand_vm(mm, newflags, nrpages) &&
621                                 may_expand_vm(mm, oldflags, nrpages))
622                         return -ENOMEM;
623                 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
624                                                 VM_SHARED|VM_NORESERVE))) {
625                         charged = nrpages;
626                         if (security_vm_enough_memory_mm(mm, charged))
627                                 return -ENOMEM;
628                         newflags |= VM_ACCOUNT;
629                 }
630         } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) &&
631                    !vma->anon_vma) {
632                 newflags &= ~VM_ACCOUNT;
633         }
634
635         vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags);
636         if (IS_ERR(vma)) {
637                 error = PTR_ERR(vma);
638                 goto fail;
639         }
640
641         *pprev = vma;
642
643         /*
644          * vm_flags and vm_page_prot are protected by the mmap_lock
645          * held in write mode.
646          */
647         vma_start_write(vma);
648         vm_flags_reset(vma, newflags);
649         if (vma_wants_manual_pte_write_upgrade(vma))
650                 mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
651         vma_set_page_prot(vma);
652
653         change_protection(tlb, vma, start, end, mm_cp_flags);
654
655         if ((oldflags & VM_ACCOUNT) && !(newflags & VM_ACCOUNT))
656                 vm_unacct_memory(nrpages);
657
658         /*
659          * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
660          * fault on access.
661          */
662         if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
663                         (newflags & VM_WRITE)) {
664                 populate_vma_page_range(vma, start, end, NULL);
665         }
666
667         vm_stat_account(mm, oldflags, -nrpages);
668         vm_stat_account(mm, newflags, nrpages);
669         perf_event_mmap(vma);
670         return 0;
671
672 fail:
673         vm_unacct_memory(charged);
674         return error;
675 }
676
677 /*
678  * pkey==-1 when doing a legacy mprotect()
679  */
680 static int do_mprotect_pkey(unsigned long start, size_t len,
681                 unsigned long prot, int pkey)
682 {
683         unsigned long nstart, end, tmp, reqprot;
684         struct vm_area_struct *vma, *prev;
685         int error;
686         const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
687         const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
688                                 (prot & PROT_READ);
689         struct mmu_gather tlb;
690         struct vma_iterator vmi;
691
692         start = untagged_addr(start);
693
694         prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
695         if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
696                 return -EINVAL;
697
698         if (start & ~PAGE_MASK)
699                 return -EINVAL;
700         if (!len)
701                 return 0;
702         len = PAGE_ALIGN(len);
703         end = start + len;
704         if (end <= start)
705                 return -ENOMEM;
706         if (!arch_validate_prot(prot, start))
707                 return -EINVAL;
708
709         reqprot = prot;
710
711         if (mmap_write_lock_killable(current->mm))
712                 return -EINTR;
713
714         /*
715          * If userspace did not allocate the pkey, do not let
716          * them use it here.
717          */
718         error = -EINVAL;
719         if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
720                 goto out;
721
722         vma_iter_init(&vmi, current->mm, start);
723         vma = vma_find(&vmi, end);
724         error = -ENOMEM;
725         if (!vma)
726                 goto out;
727
728         if (unlikely(grows & PROT_GROWSDOWN)) {
729                 if (vma->vm_start >= end)
730                         goto out;
731                 start = vma->vm_start;
732                 error = -EINVAL;
733                 if (!(vma->vm_flags & VM_GROWSDOWN))
734                         goto out;
735         } else {
736                 if (vma->vm_start > start)
737                         goto out;
738                 if (unlikely(grows & PROT_GROWSUP)) {
739                         end = vma->vm_end;
740                         error = -EINVAL;
741                         if (!(vma->vm_flags & VM_GROWSUP))
742                                 goto out;
743                 }
744         }
745
746         prev = vma_prev(&vmi);
747         if (start > vma->vm_start)
748                 prev = vma;
749
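        /*
         * TLB flushes are batched across every vma modified below; the final
         * flush is issued by tlb_finish_mmu() after the loop.
         */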
750         tlb_gather_mmu(&tlb, current->mm);
751         nstart = start;
752         tmp = vma->vm_start;
753         for_each_vma_range(vmi, vma, end) {
754                 unsigned long mask_off_old_flags;
755                 unsigned long newflags;
756                 int new_vma_pkey;
757
758                 if (vma->vm_start != tmp) {
759                         error = -ENOMEM;
760                         break;
761                 }
762
763                 /* Does the application expect PROT_READ to imply PROT_EXEC? */
764                 if (rier && (vma->vm_flags & VM_MAYEXEC))
765                         prot |= PROT_EXEC;
766
767                 /*
768                  * Each mprotect() call explicitly passes r/w/x permissions.
769                  * If a permission is not passed to mprotect(), it must be
770                  * cleared from the VMA.
771                  */
772                 mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;
773
774                 new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
775                 newflags = calc_vm_prot_bits(prot, new_vma_pkey);
776                 newflags |= (vma->vm_flags & ~mask_off_old_flags);
777
778                 /* newflags >> 4 shifts VM_MAY* in place of VM_* */
779                 if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
780                         error = -EACCES;
781                         break;
782                 }
783
784                 if (map_deny_write_exec(vma, newflags)) {
785                         error = -EACCES;
786                         break;
787                 }
788
789                 /* Allow architectures to sanity-check the new flags */
790                 if (!arch_validate_flags(newflags)) {
791                         error = -EINVAL;
792                         break;
793                 }
794
795                 error = security_file_mprotect(vma, reqprot, prot);
796                 if (error)
797                         break;
798
799                 tmp = vma->vm_end;
800                 if (tmp > end)
801                         tmp = end;
802
803                 if (vma->vm_ops && vma->vm_ops->mprotect) {
804                         error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
805                         if (error)
806                                 break;
807                 }
808
809                 error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags);
810                 if (error)
811                         break;
812
813                 tmp = vma_iter_end(&vmi);
814                 nstart = tmp;
815                 prot = reqprot;
816         }
817         tlb_finish_mmu(&tlb);
818
819         if (!error && tmp < end)
820                 error = -ENOMEM;
821
822 out:
823         mmap_write_unlock(current->mm);
824         return error;
825 }
826
827 SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
828                 unsigned long, prot)
829 {
830         return do_mprotect_pkey(start, len, prot, -1);
831 }
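/*
 * Illustrative userspace usage of the syscall above (example only, not part
 * of this file):
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mprotect(p, 4096, PROT_READ);	-> subsequent writes to p fault
 */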
832
833 #ifdef CONFIG_ARCH_HAS_PKEYS
834
835 SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
836                 unsigned long, prot, int, pkey)
837 {
838         return do_mprotect_pkey(start, len, prot, pkey);
839 }
840
841 SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
842 {
843         int pkey;
844         int ret;
845
846         /* No flags supported yet. */
847         if (flags)
848                 return -EINVAL;
849         /* check for unsupported init values */
850         if (init_val & ~PKEY_ACCESS_MASK)
851                 return -EINVAL;
852
853         mmap_write_lock(current->mm);
854         pkey = mm_pkey_alloc(current->mm);
855
856         ret = -ENOSPC;
857         if (pkey == -1)
858                 goto out;
859
860         ret = arch_set_user_pkey_access(current, pkey, init_val);
861         if (ret) {
862                 mm_pkey_free(current->mm, pkey);
863                 goto out;
864         }
865         ret = pkey;
866 out:
867         mmap_write_unlock(current->mm);
868         return ret;
869 }
870
871 SYSCALL_DEFINE1(pkey_free, int, pkey)
872 {
873         int ret;
874
875         mmap_write_lock(current->mm);
876         ret = mm_pkey_free(current->mm, pkey);
877         mmap_write_unlock(current->mm);
878
879         /*
880          * We could provide warnings or errors if any VMA still
881          * has the pkey set here.
882          */
883         return ret;
884 }
885
886 #endif /* CONFIG_ARCH_HAS_PKEYS */