Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdim...
diff --git a/fs/dax.c b/fs/dax.c
index f3a44a7..9598159 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 static void *dax_insert_mapping_entry(struct address_space *mapping,
                                      struct vm_fault *vmf,
                                      void *entry, sector_t sector,
-                                     unsigned long flags)
+                                     unsigned long flags, bool dirty)
 {
        struct radix_tree_root *page_tree = &mapping->page_tree;
        void *new_entry;
        pgoff_t index = vmf->pgoff;
 
-       if (vmf->flags & FAULT_FLAG_WRITE)
+       if (dirty)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
        if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -565,11 +565,11 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
                ret = __radix_tree_lookup(page_tree, index, &node, &slot);
                WARN_ON_ONCE(ret != entry);
                __radix_tree_replace(page_tree, node, slot,
-                                    new_entry, NULL, NULL);
+                                    new_entry, NULL);
                entry = new_entry;
        }
 
-       if (vmf->flags & FAULT_FLAG_WRITE)
+       if (dirty)
                radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 
        spin_unlock_irq(&mapping->tree_lock);
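
The new `dirty' argument decouples dirtying of the radix tree entry from FAULT_FLAG_WRITE, so that synchronous (MAP_SYNC) write faults can defer dirtying until after fsync. A minimal sketch of the call-site pattern, taken from the fault paths later in this diff:

    /* Dirty the entry only for write faults that do not require a
     * pre-insertion fsync, i.e. non-MAP_SYNC faults. */
    entry = dax_insert_mapping_entry(mapping, vmf, entry,
                                     dax_iomap_sector(&iomap, pos),
                                     0, write && !sync);
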
@@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
                if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
                        continue;
 
+               /*
+                * No need to call mmu_notifier_invalidate_range() as we are
+                * downgrading page table protection, not changing it to point
+                * to a new page.
+                *
+                * See Documentation/vm/mmu_notifier.txt
+                */
                if (pmdp) {
 #ifdef CONFIG_FS_DAX_PMD
                        pmd_t pmd;
@@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
                        pmd = pmd_wrprotect(pmd);
                        pmd = pmd_mkclean(pmd);
                        set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-                       mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 unlock_pmd:
                        spin_unlock(ptl);
 #endif
@@ -643,7 +649,6 @@ unlock_pmd:
                        pte = pte_wrprotect(pte);
                        pte = pte_mkclean(pte);
                        set_pte_at(vma->vm_mm, address, ptep, pte);
-                       mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 unlock_pte:
                        pte_unmap_unlock(ptep, ptl);
                }
@@ -789,7 +794,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
        tag_pages_for_writeback(mapping, start_index, end_index);
 
-       pagevec_init(&pvec, 0);
+       pagevec_init(&pvec);
        while (!done) {
                pvec.nr = find_get_entries_tag(mapping, start_index,
                                PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
@@ -820,38 +825,42 @@ out:
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct address_space *mapping,
-               struct block_device *bdev, struct dax_device *dax_dev,
-               sector_t sector, size_t size, void *entry,
-               struct vm_area_struct *vma, struct vm_fault *vmf)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
-       unsigned long vaddr = vmf->address;
-       void *ret, *kaddr;
+       return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
+}
+
+static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+                        pfn_t *pfnp)
+{
+       const sector_t sector = dax_iomap_sector(iomap, pos);
        pgoff_t pgoff;
+       void *kaddr;
        int id, rc;
-       pfn_t pfn;
+       long length;
 
-       rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+       rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
        if (rc)
                return rc;
-
        id = dax_read_lock();
-       rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-       if (rc < 0) {
-               dax_read_unlock(id);
-               return rc;
+       length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
+                                  &kaddr, pfnp);
+       if (length < 0) {
+               rc = length;
+               goto out;
        }
+       rc = -EINVAL;
+       if (PFN_PHYS(length) < size)
+               goto out;
+       if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+               goto out;
+       /* For larger pages we need devmap */
+       if (length > 1 && !pfn_t_devmap(*pfnp))
+               goto out;
+       rc = 0;
+out:
        dax_read_unlock(id);
-
-       ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
-       if (IS_ERR(ret))
-               return PTR_ERR(ret);
-
-       trace_dax_insert_mapping(mapping->host, vmf, ret);
-       if (vmf->flags & FAULT_FLAG_WRITE)
-               return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
-       else
-               return vm_insert_mixed(vma, vaddr, pfn);
+       return rc;
 }
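
dax_iomap_pfn() consolidates the previously open-coded bdev_dax_pgoff() + dax_direct_access() sequence and validates that the returned pfn can actually back a mapping of the requested size. A hedged sketch of a call, mirroring the PTE fault path below:

    pfn_t pfn;
    int error;

    /* PAGE_SIZE for PTE faults; the PMD path passes PMD_SIZE. A pfn
     * with any of the low (PHYS_PFN(size) - 1) bits set is misaligned
     * for that mapping size and comes back as -EINVAL. */
    error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
    if (error < 0)
            goto error_finish_iomap;
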
 
 /*
@@ -877,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
        }
 
        entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-                       RADIX_DAX_ZERO_PAGE);
+                       RADIX_DAX_ZERO_PAGE, false);
        if (IS_ERR(entry2)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
@@ -936,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
-{
-       return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
-}
-
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                struct iomap *iomap)
@@ -1080,19 +1084,33 @@ static int dax_fault_return(int error)
        return VM_FAULT_SIGBUS;
 }
 
-static int dax_iomap_pte_fault(struct vm_fault *vmf,
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(unsigned long flags,
+               struct vm_area_struct *vma, struct iomap *iomap)
+{
+       return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
+               && (iomap->flags & IOMAP_F_DIRTY);
+}
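
IOMAP_F_DIRTY is set by the filesystem when the inode still has unflushed metadata, and VM_SYNC is set on the VMA by a MAP_SYNC mmap(). A minimal userspace sketch of how such a fault comes about (hypothetical path, error handling omitted):

    /* Requires <fcntl.h> and <sys/mman.h>. MAP_SYNC must be paired
     * with MAP_SHARED_VALIDATE so that a kernel that does not
     * understand the flag fails the mmap() instead of silently
     * ignoring it. */
    int fd = open("/mnt/pmem/file", O_RDWR);
    void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                      MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
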
+
+static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               const struct iomap_ops *ops)
 {
-       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+       struct vm_area_struct *vma = vmf->vma;
+       struct address_space *mapping = vma->vm_file->f_mapping;
        struct inode *inode = mapping->host;
        unsigned long vaddr = vmf->address;
        loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-       sector_t sector;
        struct iomap iomap = { 0 };
        unsigned flags = IOMAP_FAULT;
        int error, major = 0;
+       bool write = vmf->flags & FAULT_FLAG_WRITE;
+       bool sync;
        int vmf_ret = 0;
        void *entry;
+       pfn_t pfn;
 
        trace_dax_pte_fault(inode, vmf, vmf_ret);
        /*
@@ -1105,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                goto out;
        }
 
-       if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+       if (write && !vmf->cow_page)
                flags |= IOMAP_WRITE;
 
        entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1140,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                goto error_finish_iomap;
        }
 
-       sector = dax_iomap_sector(&iomap, pos);
-
        if (vmf->cow_page) {
+               sector_t sector = dax_iomap_sector(&iomap, pos);
+
                switch (iomap.type) {
                case IOMAP_HOLE:
                case IOMAP_UNWRITTEN:
@@ -1168,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                goto finish_iomap;
        }
 
+       sync = dax_fault_is_synchronous(flags, vma, &iomap);
+
        switch (iomap.type) {
        case IOMAP_MAPPED:
                if (iomap.flags & IOMAP_F_NEW) {
                        count_vm_event(PGMAJFAULT);
-                       count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+                       count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
                        major = VM_FAULT_MAJOR;
                }
-               error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-                               sector, PAGE_SIZE, entry, vmf->vma, vmf);
+               error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+               if (error < 0)
+                       goto error_finish_iomap;
+
+               entry = dax_insert_mapping_entry(mapping, vmf, entry,
+                                                dax_iomap_sector(&iomap, pos),
+                                                0, write && !sync);
+               if (IS_ERR(entry)) {
+                       error = PTR_ERR(entry);
+                       goto error_finish_iomap;
+               }
+
+               /*
+                * If we are doing a synchronous page fault and the inode needs
+                * fsync, we can insert the PTE into the page tables only after
+                * that happens. Skip the insertion for now and return the pfn
+                * so that the caller can insert it after the fsync is done.
+                */
+               if (sync) {
+                       if (WARN_ON_ONCE(!pfnp)) {
+                               error = -EIO;
+                               goto error_finish_iomap;
+                       }
+                       *pfnp = pfn;
+                       vmf_ret = VM_FAULT_NEEDDSYNC | major;
+                       goto finish_iomap;
+               }
+               trace_dax_insert_mapping(inode, vmf, entry);
+               if (write)
+                       error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+               else
+                       error = vm_insert_mixed(vma, vaddr, pfn);
+
                /* -EBUSY is fine, somebody else faulted on the same PTE */
                if (error == -EBUSY)
                        error = 0;
                break;
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
-               if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+               if (!write) {
                        vmf_ret = dax_load_hole(mapping, entry, vmf);
                        goto finish_iomap;
                }
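
A filesystem sees VM_FAULT_NEEDDSYNC come back from dax_iomap_fault() and must finish the fault itself once metadata is persistent. A hedged sketch of the caller side, loosely modeled on the ext4 wiring from the same merge (fs_iomap_ops is a placeholder):

    static int fs_dax_huge_fault(struct vm_fault *vmf,
                                 enum page_entry_size pe_size)
    {
            pfn_t pfn;
            int result;

            result = dax_iomap_fault(vmf, pe_size, &pfn, &fs_iomap_ops);
            if (result & VM_FAULT_NEEDDSYNC)
                    /* Flush metadata, then insert the saved pfn. */
                    result = dax_finish_sync_fault(vmf, pe_size, pfn);
            return result;
    }
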
@@ -1218,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-               loff_t pos, void *entry)
-{
-       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-       const sector_t sector = dax_iomap_sector(iomap, pos);
-       struct dax_device *dax_dev = iomap->dax_dev;
-       struct block_device *bdev = iomap->bdev;
-       struct inode *inode = mapping->host;
-       const size_t size = PMD_SIZE;
-       void *ret = NULL, *kaddr;
-       long length = 0;
-       pgoff_t pgoff;
-       pfn_t pfn = {};
-       int id;
-
-       if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
-               goto fallback;
-
-       id = dax_read_lock();
-       length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-       if (length < 0)
-               goto unlock_fallback;
-       length = PFN_PHYS(length);
-
-       if (length < size)
-               goto unlock_fallback;
-       if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
-               goto unlock_fallback;
-       if (!pfn_t_devmap(pfn))
-               goto unlock_fallback;
-       dax_read_unlock(id);
-
-       ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
-                       RADIX_DAX_PMD);
-       if (IS_ERR(ret))
-               goto fallback;
-
-       trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
-       return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-                       pfn, vmf->flags & FAULT_FLAG_WRITE);
-
-unlock_fallback:
-       dax_read_unlock(id);
-fallback:
-       trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
-       return VM_FAULT_FALLBACK;
-}
+/*
+ * The 'colour' (i.e. low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR  ((PMD_SIZE >> PAGE_SHIFT) - 1)
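
A worked example of the colour mask, assuming the common x86-64 configuration (4 KiB pages, 2 MiB PMDs):

    /* PG_PMD_COLOUR = (PMD_SIZE >> PAGE_SHIFT) - 1
     *               = (2 MiB / 4 KiB) - 1 = 511
     * A pfn can back a PMD mapping only if (pfn & 511) == 0, i.e. the
     * backing storage is 2 MiB aligned. */
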
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
                void *entry)
@@ -1283,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
                goto fallback;
 
        ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-                       RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
+                       RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
        if (IS_ERR(ret))
                goto fallback;
 
@@ -1305,13 +1314,14 @@ fallback:
        return VM_FAULT_FALLBACK;
 }
 
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               const struct iomap_ops *ops)
 {
        struct vm_area_struct *vma = vmf->vma;
        struct address_space *mapping = vma->vm_file->f_mapping;
        unsigned long pmd_addr = vmf->address & PMD_MASK;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
+       bool sync;
        unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
        struct inode *inode = mapping->host;
        int result = VM_FAULT_FALLBACK;
@@ -1320,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
        void *entry;
        loff_t pos;
        int error;
+       pfn_t pfn;
 
        /*
         * Check whether offset isn't beyond end of file now. Caller is
@@ -1327,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
         * this is a reliable test.
         */
        pgoff = linear_page_index(vma, pmd_addr);
-       max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+       max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
        trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
@@ -1351,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
        if ((pmd_addr + PMD_SIZE) > vma->vm_end)
                goto fallback;
 
-       if (pgoff > max_pgoff) {
+       if (pgoff >= max_pgoff) {
                result = VM_FAULT_SIGBUS;
                goto out;
        }
 
        /* If the PMD would extend beyond the file size */
-       if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+       if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
                goto fallback;
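
The reworked bound fixes PMD faults on zero-length files: with the old code, i_size == 0 made (i_size - 1) >> PAGE_SHIFT wrap around in the unsigned pgoff_t, so max_pgoff became ULONG_MAX and the SIGBUS check could never fire. A worked example of the new code, assuming 4 KiB pages:

    /* i_size == 0:    max_pgoff = DIV_ROUND_UP(0, 4096)    = 0,
     *                 any pgoff >= 0 -> SIGBUS (correct).
     * i_size == 4096: max_pgoff = DIV_ROUND_UP(4096, 4096) = 1,
     *                 pgoff 0 is valid, pgoff 1 -> SIGBUS. */
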
 
        /*
@@ -1395,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
        if (iomap.offset + iomap.length < pos + PMD_SIZE)
                goto finish_iomap;
 
+       sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
+
        switch (iomap.type) {
        case IOMAP_MAPPED:
-               result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+               error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+               if (error < 0)
+                       goto finish_iomap;
+
+               entry = dax_insert_mapping_entry(mapping, vmf, entry,
+                                               dax_iomap_sector(&iomap, pos),
+                                               RADIX_DAX_PMD, write && !sync);
+               if (IS_ERR(entry))
+                       goto finish_iomap;
+
+               /*
+                * If we are doing a synchronous page fault and the inode needs
+                * fsync, we can insert the PMD into the page tables only after
+                * that happens. Skip the insertion for now and return the pfn
+                * so that the caller can insert it after the fsync is done.
+                */
+               if (sync) {
+                       if (WARN_ON_ONCE(!pfnp))
+                               goto finish_iomap;
+                       *pfnp = pfn;
+                       result = VM_FAULT_NEEDDSYNC;
+                       goto finish_iomap;
+               }
+
+               trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+                                           write);
                break;
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
@@ -1437,7 +1476,7 @@ out:
        return result;
 }
 #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               const struct iomap_ops *ops)
 {
        return VM_FAULT_FALLBACK;
@@ -1447,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
+ * @pe_size: Size of the page to fault in
+ * @pfnp: PFN to insert for synchronous faults if fsync is required
+ * @ops: Iomap ops passed from the file system
  *
  * When a page fault occurs, filesystems may call this helper in
  * their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1455,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
  * successfully.
  */
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-                   const struct iomap_ops *ops)
+                   pfn_t *pfnp, const struct iomap_ops *ops)
 {
        switch (pe_size) {
        case PE_SIZE_PTE:
-               return dax_iomap_pte_fault(vmf, ops);
+               return dax_iomap_pte_fault(vmf, pfnp, ops);
        case PE_SIZE_PMD:
-               return dax_iomap_pmd_fault(vmf, ops);
+               return dax_iomap_pmd_fault(vmf, pfnp, ops);
        default:
                return VM_FAULT_FALLBACK;
        }
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+/**
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function inserts a writeable PTE or PMD entry into the page tables
+ * for an mmapped DAX file.  It also takes care of marking the corresponding
+ * radix tree entry as dirty.
+ */
+static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+                                 enum page_entry_size pe_size,
+                                 pfn_t pfn)
+{
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+       void *entry, **slot;
+       pgoff_t index = vmf->pgoff;
+       int vmf_ret, error;
+
+       spin_lock_irq(&mapping->tree_lock);
+       entry = get_unlocked_mapping_entry(mapping, index, &slot);
+       /* Did we race with someone splitting the entry, or similar? */
+       if (!entry ||
+           (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
+           (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
+               put_unlocked_mapping_entry(mapping, index, entry);
+               spin_unlock_irq(&mapping->tree_lock);
+               trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+                                                     VM_FAULT_NOPAGE);
+               return VM_FAULT_NOPAGE;
+       }
+       radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+       entry = lock_slot(mapping, slot);
+       spin_unlock_irq(&mapping->tree_lock);
+       switch (pe_size) {
+       case PE_SIZE_PTE:
+               error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+               vmf_ret = dax_fault_return(error);
+               break;
+#ifdef CONFIG_FS_DAX_PMD
+       case PE_SIZE_PMD:
+               vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+                       pfn, true);
+               break;
+#endif
+       default:
+               vmf_ret = VM_FAULT_FALLBACK;
+       }
+       put_locked_mapping_entry(mapping, index);
+       trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
+       return vmf_ret;
+}
+
+/**
+ * dax_finish_sync_fault - finish synchronous page fault
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function ensures that the file range touched by the page fault is
+ * stored persistently on the media and then inserts the appropriate page
+ * table entry.
+ */
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+                         pfn_t pfn)
+{
+       int err;
+       loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+       size_t len = 0;
+
+       if (pe_size == PE_SIZE_PTE)
+               len = PAGE_SIZE;
+       else if (pe_size == PE_SIZE_PMD)
+               len = PMD_SIZE;
+       else
+               WARN_ON_ONCE(1);
+       err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
+       if (err)
+               return VM_FAULT_SIGBUS;
+       return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+}
+EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
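
For reference, the range passed to vfs_fsync_range() covers exactly the page being inserted, with the datasync flag set. A worked example, assuming 4 KiB pages and 2 MiB PMDs:

    /* PE_SIZE_PTE, pgoff 3:   start = 3 << PAGE_SHIFT = 12288,
     *                         fsync range [12288, 16383].
     * PE_SIZE_PMD, pgoff 512: start = 512 << PAGE_SHIFT = 2 MiB,
     *                         fsync range [2 MiB, 4 MiB - 1]. */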