userfaultfd: wp: add UFFDIO_COPY_MODE_WP
author: Andrea Arcangeli <aarcange@redhat.com>
Tue, 7 Apr 2020 03:05:41 +0000 (20:05 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Apr 2020 17:43:39 +0000 (10:43 -0700)
This allows UFFDIO_COPY to map pages write-protected.

[peterx@redhat.com: switch to VM_WARN_ON_ONCE in mfill_atomic_pte; add brackets
 around "dst_vma->vm_flags & VM_WRITE"; fix wording in comments and
 commit messages]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Jerome Glisse <jglisse@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Bobby Powers <bobbypowers@gmail.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Denis Plotnikov <dplotnikov@virtuozzo.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Martin Cracauer <cracauer@cons.org>
Cc: Marty McFadden <mcfadden8@llnl.gov>
Cc: Maya Gokhale <gokhale2@llnl.gov>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@fb.com>
Link: http://lkml.kernel.org/r/20200220163112.11409-6-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/userfaultfd.c
include/linux/userfaultfd_k.h
include/uapi/linux/userfaultfd.h
mm/userfaultfd.c

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 703c1c3..c49bef5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1724,11 +1724,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
        ret = -EINVAL;
        if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
                goto out;
-       if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+       if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
                goto out;
        if (mmget_not_zero(ctx->mm)) {
                ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-                                  uffdio_copy.len, &ctx->mmap_changing);
+                                  uffdio_copy.len, &ctx->mmap_changing,
+                                  uffdio_copy.mode);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
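diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 7b91b76..dcd3317 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h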
@@ -36,7 +36,7 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                            unsigned long src_start, unsigned long len,
-                           bool *mmap_changing);
+                           bool *mmap_changing, __u64 mode);
 extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
                              unsigned long dst_start,
                              unsigned long len,
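diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 48f1a7c..340f23b 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h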
@@ -203,13 +203,14 @@ struct uffdio_copy {
        __u64 dst;
        __u64 src;
        __u64 len;
+#define UFFDIO_COPY_MODE_DONTWAKE              ((__u64)1<<0)
        /*
-        * There will be a wrprotection flag later that allows to map
-        * pages wrprotected on the fly. And such a flag will be
-        * available if the wrprotection ioctl are implemented for the
-        * range according to the uffdio_register.ioctls.
+        * UFFDIO_COPY_MODE_WP will map the page write protected on
+        * the fly.  UFFDIO_COPY_MODE_WP is available only if the
+        * write protected ioctl is implemented for the range
+        * according to the uffdio_register.ioctls.
         */
-#define UFFDIO_COPY_MODE_DONTWAKE              ((__u64)1<<0)
+#define UFFDIO_COPY_MODE_WP                    ((__u64)1<<1)
        __u64 mode;
 
        /*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index bd96855..05dbbca 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -53,7 +53,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
                            struct vm_area_struct *dst_vma,
                            unsigned long dst_addr,
                            unsigned long src_addr,
-                           struct page **pagep)
+                           struct page **pagep,
+                           bool wp_copy)
 {
        struct mem_cgroup *memcg;
        pte_t _dst_pte, *dst_pte;
@@ -99,9 +100,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
        if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
                goto out_release;
 
-       _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
-       if (dst_vma->vm_flags & VM_WRITE)
-               _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+       _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
+       if ((dst_vma->vm_flags & VM_WRITE) && !wp_copy)
+               _dst_pte = pte_mkwrite(_dst_pte);
 
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
        if (dst_vma->vm_file) {
@@ -415,7 +416,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
                                                unsigned long dst_addr,
                                                unsigned long src_addr,
                                                struct page **page,
-                                               bool zeropage)
+                                               bool zeropage,
+                                               bool wp_copy)
 {
        ssize_t err;
 
@@ -432,11 +434,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
        if (!(dst_vma->vm_flags & VM_SHARED)) {
                if (!zeropage)
                        err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
-                                              dst_addr, src_addr, page);
+                                              dst_addr, src_addr, page,
+                                              wp_copy);
                else
                        err = mfill_zeropage_pte(dst_mm, dst_pmd,
                                                 dst_vma, dst_addr);
        } else {
+               VM_WARN_ON_ONCE(wp_copy);
                if (!zeropage)
                        err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
                                                     dst_vma, dst_addr,
@@ -454,7 +458,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                              unsigned long src_start,
                                              unsigned long len,
                                              bool zeropage,
-                                             bool *mmap_changing)
+                                             bool *mmap_changing,
+                                             __u64 mode)
 {
        struct vm_area_struct *dst_vma;
        ssize_t err;
@@ -462,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
        unsigned long src_addr, dst_addr;
        long copied;
        struct page *page;
+       bool wp_copy;
 
        /*
         * Sanitize the command parameters:
@@ -507,6 +513,14 @@ retry:
            dst_vma->vm_flags & VM_SHARED))
                goto out_unlock;
 
+       /*
+        * validate 'mode' now that we know the dst_vma: don't allow
+        * a wrprotect copy if the userfaultfd didn't register as WP.
+        */
+       wp_copy = mode & UFFDIO_COPY_MODE_WP;
+       if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+               goto out_unlock;
+
        /*
         * If this is a HUGETLB vma, pass off to appropriate routine
         */
@@ -562,7 +576,7 @@ retry:
                BUG_ON(pmd_trans_huge(*dst_pmd));
 
                err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-                                      src_addr, &page, zeropage);
+                                      src_addr, &page, zeropage, wp_copy);
                cond_resched();
 
                if (unlikely(err == -ENOENT)) {
@@ -609,14 +623,14 @@ out:
 
 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                     unsigned long src_start, unsigned long len,
-                    bool *mmap_changing)
+                    bool *mmap_changing, __u64 mode)
 {
        return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
-                             mmap_changing);
+                             mmap_changing, mode);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
                       unsigned long len, bool *mmap_changing)
 {
-       return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
+       return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
 }