Merge tag 'hole_punch_for_v5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 17:24:50 +0000 (10:24 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 17:24:50 +0000 (10:24 -0700)
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst

index 2183fd8..2a75dd5 100644 (file)
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -271,19 +271,19 @@ prototypes::
  locking rules:
         All except set_page_dirty and freepage may block
  
-====================== ======================== =========
-ops                    PageLocked(page)         i_rwsem
-====================== ======================== =========
+====================== ======================== =========      ===============
+ops                    PageLocked(page)         i_rwsem        invalidate_lock
+====================== ======================== =========      ===============
  writepage:             yes, unlocks (see below)
-readpage:              yes, unlocks
+readpage:              yes, unlocks                            shared
  writepages:
  set_page_dirty         no
-readahead:             yes, unlocks
-readpages:             no
+readahead:             yes, unlocks                            shared
+readpages:             no                                      shared
  write_begin:           locks the page           exclusive
  write_end:             yes, unlocks             exclusive
  bmap:
-invalidatepage:                yes
+invalidatepage:                yes                                     exclusive
  releasepage:           yes
  freepage:              yes
  direct_IO:
@@ -295,7 +295,7 @@ is_partially_uptodate:      yes
  error_remove_page:     yes
  swap_activate:         no
  swap_deactivate:       no
-====================== ======================== =========
+====================== ======================== =========      ===============
  
  ->write_begin(), ->write_end() and ->readpage() may be called from
  the request handler (/dev/loop).
@@ -378,7 +378,10 @@ keep it that way and don't breed new callers.
  ->invalidatepage() is called when the filesystem must attempt to drop
  some or all of the buffers from the page when it is being truncated. It
  returns zero on success. If ->invalidatepage is zero, the kernel uses
-block_invalidatepage() instead.
+block_invalidatepage() instead. The filesystem must exclusively acquire
+invalidate_lock before invalidating page cache in the truncate / hole punch
+path (and thus calling into ->invalidatepage) to block races between page
+cache invalidation and page cache filling functions (fault, read, ...).
  
  ->releasepage() is called when the kernel is about to try to drop the
  buffers from the page in preparation for freeing it.  It returns zero to
@@ -506,6 +509,7 @@ prototypes::
         ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
         ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
         ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+       int (*iopoll) (struct kiocb *kiocb, bool spin);
         int (*iterate) (struct file *, struct dir_context *);
         int (*iterate_shared) (struct file *, struct dir_context *);
         __poll_t (*poll) (struct file *, struct poll_table_struct *);
@@ -518,12 +522,6 @@ prototypes::
         int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
         int (*fasync) (int, struct file *, int);
         int (*lock) (struct file *, int, struct file_lock *);
-       ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
-                       loff_t *);
-       ssize_t (*writev) (struct file *, const struct iovec *, unsigned long,
-                       loff_t *);
-       ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
-                       void __user *);
         ssize_t (*sendpage) (struct file *, struct page *, int, size_t,
                         loff_t *, int);
         unsigned long (*get_unmapped_area)(struct file *, unsigned long,
@@ -536,6 +534,14 @@ prototypes::
                         size_t, unsigned int);
         int (*setlease)(struct file *, long, struct file_lock **, void **);
         long (*fallocate)(struct file *, int, loff_t, loff_t);
+       void (*show_fdinfo)(struct seq_file *m, struct file *f);
+       unsigned (*mmap_capabilities)(struct file *);
+       ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
+                       loff_t, size_t, unsigned int);
+       loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+                       struct file *file_out, loff_t pos_out,
+                       loff_t len, unsigned int remap_flags);
+       int (*fadvise)(struct file *, loff_t, loff_t, int);
  
  locking rules:
         All may block.
@@ -570,6 +576,25 @@ in sys_read() and friends.
  the lease within the individual filesystem to record the result of the
  operation
  
+->fallocate implementation must be really careful to maintain page cache
+consistency when punching holes or performing other operations that invalidate
+page cache contents. Usually the filesystem needs to call
+truncate_inode_pages_range() to invalidate the relevant range of the page cache.
+However, the filesystem usually also needs to update its internal (and on-disk)
+view of file offset -> disk block mapping. Until this update is finished, the
+filesystem needs to block page faults and reads from reloading now-stale page
+cache contents from the disk. Since VFS acquires mapping->invalidate_lock in
+shared mode when loading pages from disk (filemap_fault(), filemap_read(),
+readahead paths), the fallocate implementation must take the invalidate_lock in
+exclusive mode to prevent reloading.
+
+->copy_file_range and ->remap_file_range implementations need to serialize
+against modifications of file data while the operation is running. To block
+changes made through write(2) and similar operations, inode->i_rwsem can be
+used. To block changes to file contents via a memory mapping during the
+operation, the filesystem must take mapping->invalidate_lock to coordinate
+with ->page_mkwrite.
+
  dquot_operations
  ================
  
@@ -627,11 +652,11 @@ pfn_mkwrite:      yes
  access:                yes
  =============  =========       ===========================
  
-->fault() is called when a previously not present pte is about
-to be faulted in. The filesystem must find and return the page associated
-with the passed in "pgoff" in the vm_fault structure. If it is possible that
-the page may be truncated and/or invalidated, then the filesystem must lock
-the page, then ensure it is not already truncated (the page lock will block
+->fault() is called when a previously not present pte is about to be faulted
+in. The filesystem must find and return the page associated with the passed in
+"pgoff" in the vm_fault structure. If it is possible that the page may be
+truncated and/or invalidated, then the filesystem must lock invalidate_lock,
+then ensure the page is not already truncated (invalidate_lock will block
  subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
  locked. The VM will unlock the page.
  
@@ -644,12 +669,14 @@ page table entry. Pointer to entry associated with the page is passed in
  "pte" field in vm_fault structure. Pointers to entries for other offsets
  should be calculated relative to "pte".
  
-->page_mkwrite() is called when a previously read-only pte is
-about to become writeable. The filesystem again must ensure that there are
-no truncate/invalidate races, and then return with the page locked. If
-the page has been truncated, the filesystem should not look up a new page
-like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
-will cause the VM to retry the fault.
+->page_mkwrite() is called when a previously read-only pte is about to become
+writeable. The filesystem again must ensure that there are no
+truncate/invalidate races or races with operations such as ->remap_file_range
+or ->copy_file_range, and then return with the page locked. Usually
+mapping->invalidate_lock is suitable for proper serialization. If the page has
+been truncated, the filesystem should not look up a new page like the ->fault()
+handler, but simply return with VM_FAULT_NOPAGE, which will cause the VM to
+retry the fault.
  
  ->pfn_mkwrite() is the same as page_mkwrite but when the pte is
  VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c

index a1e2813..7e7a897 100644 (file)
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1395,9 +1395,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
                 ret = VM_FAULT_SIGBUS;
         } else {
                 struct address_space *mapping = inode->i_mapping;
-               struct page *page = find_or_create_page(mapping, 0,
-                                               mapping_gfp_constraint(mapping,
-                                               ~__GFP_FS));
+               struct page *page;
+
+               filemap_invalidate_lock_shared(mapping);
+               page = find_or_create_page(mapping, 0,
+                               mapping_gfp_constraint(mapping, ~__GFP_FS));
                 if (!page) {
                         ret = VM_FAULT_OOM;
                         goto out_inline;
@@ -1418,6 +1420,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
                 vmf->page = page;
                 ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
  out_inline:
+               filemap_invalidate_unlock_shared(mapping);
                 dout("filemap_fault %p %llu read inline data ret %x\n",
                      inode, off, ret);
         }
diff --git a/fs/ceph/file.c b/fs/ceph/file.c

index d1755ac..e1d605a 100644 (file)
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2088,6 +2088,7 @@ static long ceph_fallocate(struct file *file, int mode,
         if (ret < 0)
                 goto unlock;
  
+       filemap_invalidate_lock(inode->i_mapping);
         ceph_zero_pagecache_range(inode, offset, length);
         ret = ceph_zero_objects(inode, offset, length);
  
@@ -2100,6 +2101,7 @@ static long ceph_fallocate(struct file *file, int mode,
                 if (dirty)
                         __mark_inode_dirty(inode, dirty);
         }
+       filemap_invalidate_unlock(inode->i_mapping);
  
         ceph_put_cap_refs(ci, got);
  unlock:
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c

index 2dfd0d8..ddc0e8f 100644 (file)
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -3590,6 +3590,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
                 return rc;
         }
  
+       filemap_invalidate_lock(inode->i_mapping);
         /*
          * We implement the punch hole through ioctl, so we need remove the page
          * caches first, otherwise the data may be inconsistent with the server.
@@ -3607,6 +3608,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
                         sizeof(struct file_zero_data_information),
                         CIFSMaxBufSize, NULL, NULL);
         free_xid(xid);
+       filemap_invalidate_unlock(inode->i_mapping);
         return rc;
  }
  
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h

index e512630..3be9dd6 100644 (file)
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -667,9 +667,6 @@ struct ext2_inode_info {
         struct rw_semaphore xattr_sem;
  #endif
         rwlock_t i_meta_lock;
-#ifdef CONFIG_FS_DAX
-       struct rw_semaphore dax_sem;
-#endif
  
         /*
          * truncate_mutex is for serialising ext2_truncate() against
@@ -685,14 +682,6 @@ struct ext2_inode_info {
  #endif
  };
  
-#ifdef CONFIG_FS_DAX
-#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
-#define dax_sem_up_write(ext2_inode)   up_write(&(ext2_inode)->dax_sem)
-#else
-#define dax_sem_down_write(ext2_inode)
-#define dax_sem_up_write(ext2_inode)
-#endif
-
  /*
   * Inode dynamic state flags
   */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c

index f98466a..eb97aa3 100644 (file)
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -81,7 +81,7 @@ out_unlock:
   *
   * mmap_lock (MM)
   *   sb_start_pagefault (vfs, freeze)
- *     ext2_inode_info->dax_sem
+ *     address_space->invalidate_lock
   *       address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
   *         ext2_inode_info->truncate_mutex
   *
@@ -91,7 +91,6 @@ out_unlock:
  static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
  {
         struct inode *inode = file_inode(vmf->vma->vm_file);
-       struct ext2_inode_info *ei = EXT2_I(inode);
         vm_fault_t ret;
         bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
                 (vmf->vma->vm_flags & VM_SHARED);
@@ -100,11 +99,11 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
                 sb_start_pagefault(inode->i_sb);
                 file_update_time(vmf->vma->vm_file);
         }
-       down_read(&ei->dax_sem);
+       filemap_invalidate_lock_shared(inode->i_mapping);
  
         ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops);
  
-       up_read(&ei->dax_sem);
+       filemap_invalidate_unlock_shared(inode->i_mapping);
         if (write)
                 sb_end_pagefault(inode->i_sb);
         return ret;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c

index 04f0def..333fa62 100644 (file)
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1178,7 +1178,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
                 ext2_free_data(inode, p, q);
  }
  
-/* dax_sem must be held when calling this function */
+/* mapping->invalidate_lock must be held when calling this function */
  static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
  {
         __le32 *i_data = EXT2_I(inode)->i_data;
@@ -1195,7 +1195,7 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
         iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
  
  #ifdef CONFIG_FS_DAX
-       WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+       WARN_ON(!rwsem_is_locked(&inode->i_mapping->invalidate_lock));
  #endif
  
         n = ext2_block_to_path(inode, iblock, offsets, NULL);
@@ -1277,9 +1277,9 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
         if (ext2_inode_is_fast_symlink(inode))
                 return;
  
-       dax_sem_down_write(EXT2_I(inode));
+       filemap_invalidate_lock(inode->i_mapping);
         __ext2_truncate_blocks(inode, offset);
-       dax_sem_up_write(EXT2_I(inode));
+       filemap_invalidate_unlock(inode->i_mapping);
  }
  
  static int ext2_setsize(struct inode *inode, loff_t newsize)
@@ -1309,10 +1309,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
         if (error)
                 return error;
  
-       dax_sem_down_write(EXT2_I(inode));
+       filemap_invalidate_lock(inode->i_mapping);
         truncate_setsize(inode, newsize);
         __ext2_truncate_blocks(inode, newsize);
-       dax_sem_up_write(EXT2_I(inode));
+       filemap_invalidate_unlock(inode->i_mapping);
  
         inode->i_mtime = inode->i_ctime = current_time(inode);
         if (inode_needs_sync(inode)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c

index 21e09fb..987bcf3 100644 (file)
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -206,9 +206,6 @@ static void init_once(void *foo)
         init_rwsem(&ei->xattr_sem);
  #endif
         mutex_init(&ei->truncate_mutex);
-#ifdef CONFIG_FS_DAX
-       init_rwsem(&ei->dax_sem);
-#endif
         inode_init_once(&ei->vfs_inode);
  }
  
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index 3c51e24..7ebaf66 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1086,15 +1086,6 @@ struct ext4_inode_info {
          * by other means, so we have i_data_sem.
          */
         struct rw_semaphore i_data_sem;
-       /*
-        * i_mmap_sem is for serializing page faults with truncate / punch hole
-        * operations. We have to make sure that new page cannot be faulted in
-        * a section of the inode that is being punched. We cannot easily use
-        * i_data_sem for this since we need protection for the whole punch
-        * operation and i_data_sem ranks below transaction start so we have
-        * to occasionally drop it.
-        */
-       struct rw_semaphore i_mmap_sem;
         struct inode vfs_inode;
         struct jbd2_inode *jinode;
  
@@ -2972,7 +2963,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                              loff_t lstart, loff_t lend);
  extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
  extern qsize_t *ext4_get_reserved_space(struct inode *inode);
  extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
  extern void ext4_da_release_space(struct inode *inode, int to_free);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c

index 92ad64b..c33e0a2 100644 (file)
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4474,6 +4474,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
                             loff_t len, int mode)
  {
         struct inode *inode = file_inode(file);
+       struct address_space *mapping = file->f_mapping;
         handle_t *handle = NULL;
         unsigned int max_blocks;
         loff_t new_size = 0;
@@ -4560,17 +4561,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
                  * Prevent page faults from reinstantiating pages we have
                  * released from page cache.
                  */
-               down_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(mapping);
  
                 ret = ext4_break_layouts(inode);
                 if (ret) {
-                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(mapping);
                         goto out_mutex;
                 }
  
                 ret = ext4_update_disksize_before_punch(inode, offset, len);
                 if (ret) {
-                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(mapping);
                         goto out_mutex;
                 }
                 /* Now release the pages and zero block aligned part of pages */
@@ -4579,7 +4580,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
  
                 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                              flags);
-               up_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(mapping);
                 if (ret)
                         goto out_mutex;
         }
@@ -5221,6 +5222,7 @@ out:
  static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
  {
         struct super_block *sb = inode->i_sb;
+       struct address_space *mapping = inode->i_mapping;
         ext4_lblk_t punch_start, punch_stop;
         handle_t *handle;
         unsigned int credits;
@@ -5274,7 +5276,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
          * Prevent page faults from reinstantiating pages we have released from
          * page cache.
          */
-       down_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(mapping);
  
         ret = ext4_break_layouts(inode);
         if (ret)
@@ -5289,15 +5291,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
          * Write tail of the last page before removed range since it will get
          * removed from the page cache below.
          */
-       ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+       ret = filemap_write_and_wait_range(mapping, ioffset, offset);
         if (ret)
                 goto out_mmap;
         /*
          * Write data that will be shifted to preserve them when discarding
          * page cache below. We are also protected from pages becoming dirty
-        * by i_mmap_sem.
+        * by i_rwsem and invalidate_lock.
          */
-       ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+       ret = filemap_write_and_wait_range(mapping, offset + len,
                                            LLONG_MAX);
         if (ret)
                 goto out_mmap;
@@ -5350,7 +5352,7 @@ out_stop:
         ext4_journal_stop(handle);
         ext4_fc_stop_ineligible(sb);
  out_mmap:
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
  out_mutex:
         inode_unlock(inode);
         return ret;
@@ -5367,6 +5369,7 @@ out_mutex:
  static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
  {
         struct super_block *sb = inode->i_sb;
+       struct address_space *mapping = inode->i_mapping;
         handle_t *handle;
         struct ext4_ext_path *path;
         struct ext4_extent *extent;
@@ -5425,7 +5428,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
          * Prevent page faults from reinstantiating pages we have released from
          * page cache.
          */
-       down_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(mapping);
  
         ret = ext4_break_layouts(inode);
         if (ret)
@@ -5526,7 +5529,7 @@ out_stop:
         ext4_journal_stop(handle);
         ext4_fc_stop_ineligible(sb);
  out_mmap:
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
  out_mutex:
         inode_unlock(inode);
         return ret;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c

index 816dedc..d3b4ed9 100644 (file)
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
          */
         bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
                 (vmf->vma->vm_flags & VM_SHARED);
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
         pfn_t pfn;
  
         if (write) {
                 sb_start_pagefault(sb);
                 file_update_time(vmf->vma->vm_file);
-               down_read(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock_shared(mapping);
  retry:
                 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                                EXT4_DATA_TRANS_BLOCKS(sb));
                 if (IS_ERR(handle)) {
-                       up_read(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock_shared(mapping);
                         sb_end_pagefault(sb);
                         return VM_FAULT_SIGBUS;
                 }
         } else {
-               down_read(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock_shared(mapping);
         }
         result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
         if (write) {
@@ -731,10 +732,10 @@ retry:
                 /* Handling synchronous page fault? */
                 if (result & VM_FAULT_NEEDDSYNC)
                         result = dax_finish_sync_fault(vmf, pe_size, pfn);
-               up_read(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock_shared(mapping);
                 sb_end_pagefault(sb);
         } else {
-               up_read(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock_shared(mapping);
         }
  
         return result;
@@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
  #endif
  
  static const struct vm_operations_struct ext4_file_vm_ops = {
-       .fault          = ext4_filemap_fault,
+       .fault          = filemap_fault,
         .map_pages      = filemap_map_pages,
         .page_mkwrite   = ext4_page_mkwrite,
  };
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index d8de607..325c038 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3950,20 +3950,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
         return ret;
  }
  
-static void ext4_wait_dax_page(struct ext4_inode_info *ei)
+static void ext4_wait_dax_page(struct inode *inode)
  {
-       up_write(&ei->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
         schedule();
-       down_write(&ei->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
  }
  
  int ext4_break_layouts(struct inode *inode)
  {
-       struct ext4_inode_info *ei = EXT4_I(inode);
         struct page *page;
         int error;
  
-       if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
+       if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
                 return -EINVAL;
  
         do {
@@ -3974,7 +3973,7 @@ int ext4_break_layouts(struct inode *inode)
                 error = ___wait_var_event(&page->_refcount,
                                 atomic_read(&page->_refcount) == 1,
                                 TASK_INTERRUPTIBLE, 0, 0,
-                               ext4_wait_dax_page(ei));
+                               ext4_wait_dax_page(inode));
         } while (error == 0);
  
         return error;
@@ -4005,9 +4004,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
  
         ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
         if (ext4_has_inline_data(inode)) {
-               down_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(mapping);
                 ret = ext4_convert_inline_data(inode);
-               up_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(mapping);
                 if (ret)
                         return ret;
         }
@@ -4058,7 +4057,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
          * Prevent page faults from reinstantiating pages we have released from
          * page cache.
          */
-       down_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(mapping);
  
         ret = ext4_break_layouts(inode);
         if (ret)
@@ -4131,7 +4130,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
  out_stop:
         ext4_journal_stop(handle);
  out_dio:
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
  out_mutex:
         inode_unlock(inode);
         return ret;
@@ -5426,11 +5425,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
                         inode_dio_wait(inode);
                 }
  
-               down_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(inode->i_mapping);
  
                 rc = ext4_break_layouts(inode);
                 if (rc) {
-                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(inode->i_mapping);
                         goto err_out;
                 }
  
@@ -5506,7 +5505,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
                                 error = rc;
                 }
  out_mmap_sem:
-               up_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(inode->i_mapping);
         }
  
         if (!error) {
@@ -5983,10 +5982,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
          * data (and journalled aops don't know how to handle these cases).
          */
         if (val) {
-               down_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(inode->i_mapping);
                 err = filemap_write_and_wait(inode->i_mapping);
                 if (err < 0) {
-                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(inode->i_mapping);
                         return err;
                 }
         }
@@ -6019,7 +6018,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         percpu_up_write(&sbi->s_writepages_rwsem);
  
         if (val)
-               up_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(inode->i_mapping);
  
         /* Finally we can mark the inode as dirty. */
  
@@ -6063,7 +6062,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
         sb_start_pagefault(inode->i_sb);
         file_update_time(vma->vm_file);
  
-       down_read(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock_shared(mapping);
  
         err = ext4_convert_inline_data(inode);
         if (err)
@@ -6176,7 +6175,7 @@ retry_alloc:
  out_ret:
         ret = block_page_mkwrite_return(err);
  out:
-       up_read(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock_shared(mapping);
         sb_end_pagefault(inode->i_sb);
         return ret;
  out_error:
@@ -6184,15 +6183,3 @@ out_error:
         ext4_journal_stop(handle);
         goto out;
  }
-
-vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
-{
-       struct inode *inode = file_inode(vmf->vma->vm_file);
-       vm_fault_t ret;
-
-       down_read(&EXT4_I(inode)->i_mmap_sem);
-       ret = filemap_fault(vmf);
-       up_read(&EXT4_I(inode)->i_mmap_sem);
-
-       return ret;
-}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c

index 6eed617..4fb5fe0 100644 (file)
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
                 goto journal_err_out;
         }
  
-       down_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
         err = filemap_write_and_wait(inode->i_mapping);
         if (err)
                 goto err_out;
@@ -256,7 +256,7 @@ err_out1:
         ext4_double_up_write_data_sem(inode, inode_bl);
  
  err_out:
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
  journal_err_out:
         unlock_two_nondirectories(inode, inode_bl);
         iput(inode_bl);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c

index dfa09a2..d6df62f 100644 (file)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -90,12 +90,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
  /*
   * Lock ordering
   *
- * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
- * i_mmap_rwsem (inode->i_mmap_rwsem)!
- *
   * page fault path:
- * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
- *   page lock -> i_data_sem (rw)
+ * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
+ *   -> page lock -> i_data_sem (rw)
   *
   * buffered write path:
   * sb_start_write -> i_mutex -> mmap_lock
@@ -103,8 +100,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
   *   i_data_sem (rw)
   *
   * truncate:
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
+ *   page lock
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
   *   i_data_sem (rw)
   *
   * direct IO:
@@ -1360,7 +1358,6 @@ static void init_once(void *foo)
         INIT_LIST_HEAD(&ei->i_orphan);
         init_rwsem(&ei->xattr_sem);
         init_rwsem(&ei->i_data_sem);
-       init_rwsem(&ei->i_mmap_sem);
         inode_init_once(&ei->vfs_inode);
         ext4_fc_init_inode(&ei->vfs_inode);
  }
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h

index bcbe366..ce84aa2 100644 (file)
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -11,14 +11,16 @@
   */
  static inline void ext4_truncate_failed_write(struct inode *inode)
  {
+       struct address_space *mapping = inode->i_mapping;
+
         /*
          * We don't need to call ext4_break_layouts() because the blocks we
          * are truncating were never visible to userspace.
          */
-       down_write(&EXT4_I(inode)->i_mmap_sem);
-       truncate_inode_pages(inode->i_mapping, inode->i_size);
+       filemap_invalidate_lock(mapping);
+       truncate_inode_pages(mapping, inode->i_size);
         ext4_truncate(inode);
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
  }
  
  /*
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c

index d2cf48c..eb222b3 100644 (file)
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3187,12 +3187,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
         /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
         if (to > i_size && !f2fs_verity_in_progress(inode)) {
                 down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-               down_write(&F2FS_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(mapping);
  
                 truncate_pagecache(inode, i_size);
                 f2fs_truncate_blocks(inode, i_size, true);
  
-               up_write(&F2FS_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(mapping);
                 up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
         }
  }
@@ -3852,7 +3852,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
         int ret = 0;
  
         down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-       down_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
  
         set_inode_flag(inode, FI_ALIGNED_WRITE);
  
@@ -3894,7 +3894,7 @@ done:
         clear_inode_flag(inode, FI_DO_DEFRAG);
         clear_inode_flag(inode, FI_ALIGNED_WRITE);
  
-       up_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
  
         return ret;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h

index ee8eb33..906b2c4 100644 (file)
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -754,7 +754,6 @@ struct f2fs_inode_info {
  
         /* avoid racing between foreground op and gc */
         struct rw_semaphore i_gc_rwsem[2];
-       struct rw_semaphore i_mmap_sem;
         struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */
  
         int i_extra_isize;              /* size of extra space located in i_addr */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c

index 6afd456..1ff3337 100644 (file)
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -38,10 +38,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
         struct inode *inode = file_inode(vmf->vma->vm_file);
         vm_fault_t ret;
  
-       down_read(&F2FS_I(inode)->i_mmap_sem);
         ret = filemap_fault(vmf);
-       up_read(&F2FS_I(inode)->i_mmap_sem);
-
         if (!ret)
                 f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO,
                                                         F2FS_BLKSIZE);
@@ -101,7 +98,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
         f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
  
         file_update_time(vmf->vma->vm_file);
-       down_read(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock_shared(inode->i_mapping);
         lock_page(page);
         if (unlikely(page->mapping != inode->i_mapping ||
                         page_offset(page) > i_size_read(inode) ||
@@ -159,7 +156,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
  
         trace_f2fs_vm_page_mkwrite(page, DATA);
  out_sem:
-       up_read(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock_shared(inode->i_mapping);
  
         sb_end_pagefault(inode->i_sb);
  err:
@@ -940,7 +937,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
                 }
  
                 down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-               down_write(&F2FS_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(inode->i_mapping);
  
                 truncate_setsize(inode, attr->ia_size);
  
@@ -950,7 +947,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
                  * do not trim all blocks after i_size if target size is
                  * larger than i_size.
                  */
-               up_write(&F2FS_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(inode->i_mapping);
                 up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                 if (err)
                         return err;
@@ -1095,7 +1092,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
                         blk_end = (loff_t)pg_end << PAGE_SHIFT;
  
                         down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-                       down_write(&F2FS_I(inode)->i_mmap_sem);
+                       filemap_invalidate_lock(mapping);
  
                         truncate_inode_pages_range(mapping, blk_start,
                                         blk_end - 1);
@@ -1104,7 +1101,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
                         ret = f2fs_truncate_hole(inode, pg_start, pg_end);
                         f2fs_unlock_op(sbi);
  
-                       up_write(&F2FS_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(mapping);
                         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                 }
         }
@@ -1339,7 +1336,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
  
         /* avoid gc operation during block exchange */
         down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-       down_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
  
         f2fs_lock_op(sbi);
         f2fs_drop_extent_tree(inode);
@@ -1347,7 +1344,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
         ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true);
         f2fs_unlock_op(sbi);
  
-       up_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
         return ret;
  }
@@ -1378,13 +1375,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
                 return ret;
  
         /* write out all moved pages, if possible */
-       down_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
         filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
         truncate_pagecache(inode, offset);
  
         new_size = i_size_read(inode) - len;
         ret = f2fs_truncate_blocks(inode, new_size, true);
-       up_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
         if (!ret)
                 f2fs_i_size_write(inode, new_size);
         return ret;
@@ -1484,7 +1481,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
                         pgoff_t end;
  
                         down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-                       down_write(&F2FS_I(inode)->i_mmap_sem);
+                       filemap_invalidate_lock(mapping);
  
                         truncate_pagecache_range(inode,
                                 (loff_t)index << PAGE_SHIFT,
@@ -1496,7 +1493,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
                         ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE);
                         if (ret) {
                                 f2fs_unlock_op(sbi);
-                               up_write(&F2FS_I(inode)->i_mmap_sem);
+                               filemap_invalidate_unlock(mapping);
                                 up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                                 goto out;
                         }
@@ -1508,7 +1505,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
                         f2fs_put_dnode(&dn);
  
                         f2fs_unlock_op(sbi);
-                       up_write(&F2FS_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(mapping);
                         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
  
                         f2fs_balance_fs(sbi, dn.node_changed);
@@ -1543,6 +1540,7 @@ out:
  static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
  {
         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+       struct address_space *mapping = inode->i_mapping;
         pgoff_t nr, pg_start, pg_end, delta, idx;
         loff_t new_size;
         int ret = 0;
@@ -1565,14 +1563,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
  
         f2fs_balance_fs(sbi, true);
  
-       down_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(mapping);
         ret = f2fs_truncate_blocks(inode, i_size_read(inode), true);
-       up_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
         if (ret)
                 return ret;
  
         /* write out all dirty pages from offset */
-       ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+       ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
         if (ret)
                 return ret;
  
@@ -1583,7 +1581,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
  
         /* avoid gc operation during block exchange */
         down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-       down_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(mapping);
         truncate_pagecache(inode, offset);
  
         while (!ret && idx > pg_start) {
@@ -1599,14 +1597,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
                                         idx + delta, nr, false);
                 f2fs_unlock_op(sbi);
         }
-       up_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
  
         /* write out all moved pages, if possible */
-       down_write(&F2FS_I(inode)->i_mmap_sem);
-       filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+       filemap_invalidate_lock(mapping);
+       filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
         truncate_pagecache(inode, offset);
-       up_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
  
         if (!ret)
                 f2fs_i_size_write(inode, new_size);
@@ -3440,7 +3438,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
                 goto out;
  
         down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-       down_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
  
         last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
  
@@ -3476,7 +3474,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
         }
  
         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-       up_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
  out:
         inode_unlock(inode);
  
@@ -3593,7 +3591,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
         }
  
         down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-       down_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
  
         last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
  
@@ -3629,7 +3627,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
         }
  
         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-       up_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
  
         if (ret >= 0) {
                 clear_inode_flag(inode, FI_COMPRESS_RELEASED);
@@ -3748,7 +3746,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
                 goto err;
  
         down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-       down_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(mapping);
  
         ret = filemap_write_and_wait_range(mapping, range.start,
                         to_end ? LLONG_MAX : end_addr - 1);
@@ -3835,7 +3833,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
                 ret = f2fs_secure_erase(prev_bdev, inode, prev_index,
                                 prev_block, len, range.flags);
  out:
-       up_write(&F2FS_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
  err:
         inode_unlock(inode);
@@ -4313,9 +4311,9 @@ write:
                 /* if we couldn't write data, we should deallocate blocks. */
                 if (preallocated && i_size_read(inode) < target_size) {
                         down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-                       down_write(&F2FS_I(inode)->i_mmap_sem);
+                       filemap_invalidate_lock(inode->i_mapping);
                         f2fs_truncate(inode);
-                       up_write(&F2FS_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(inode->i_mapping);
                         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                 }
  
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c

index 8fecd30..ce2ab1b 100644 (file)
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1289,7 +1289,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
         mutex_init(&fi->inmem_lock);
         init_rwsem(&fi->i_gc_rwsem[READ]);
         init_rwsem(&fi->i_gc_rwsem[WRITE]);
-       init_rwsem(&fi->i_mmap_sem);
         init_rwsem(&fi->i_xattr_sem);
  
         /* Will be used by directory only */
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c

index 9d58371..281d79f 100644 (file)
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
         /*
          * Can't do inline reclaim in fault path. We call
          * dax_layout_busy_page() before we free a range. And
-        * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
-        * In fault path we enter with fi->i_mmap_sem held and can't drop
-        * it. Also in fault path we hold fi->i_mmap_sem shared and not
-        * exclusive, so that creates further issues with fuse_wait_dax_page().
-        * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
-        * range to become free and retry.
+        * fuse_wait_dax_page() drops mapping->invalidate_lock and reacquires
+        * it. In fault path we enter with mapping->invalidate_lock held and
+        * can't drop it. Also in fault path we hold mapping->invalidate_lock
+        * shared and not exclusive, so that creates further issues with
+        * fuse_wait_dax_page().  Hence return -EAGAIN and fuse_dax_fault()
+        * will wait for a memory range to become free and retry.
          */
         if (flags & IOMAP_FAULT) {
                 alloc_dmap = alloc_dax_mapping(fcd);
@@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
         down_write(&fi->dax->sem);
         node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
  
-       /* We are holding either inode lock or i_mmap_sem, and that should
+       /* We are holding either inode lock or invalidate_lock, and that should
          * ensure that dmap can't be truncated. We are holding a reference
          * on dmap and that should make sure it can't be reclaimed. So dmap
          * should still be there in tree despite the fact we dropped and
@@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = {
  
  static void fuse_wait_dax_page(struct inode *inode)
  {
-       struct fuse_inode *fi = get_fuse_inode(inode);
-
-       up_write(&fi->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
         schedule();
-       down_write(&fi->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
  }
  
-/* Should be called with fi->i_mmap_sem lock held exclusively */
+/* Should be called with mapping->invalidate_lock held exclusively */
  static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
                                     loff_t start, loff_t end)
  {
@@ -813,18 +811,18 @@ retry:
          * we do not want any read/write/mmap to make progress and try
          * to populate page cache or access memory we are trying to free.
          */
-       down_read(&get_fuse_inode(inode)->i_mmap_sem);
+       filemap_invalidate_lock_shared(inode->i_mapping);
         ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
         if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
                 error = 0;
                 retry = true;
-               up_read(&get_fuse_inode(inode)->i_mmap_sem);
+               filemap_invalidate_unlock_shared(inode->i_mapping);
                 goto retry;
         }
  
         if (ret & VM_FAULT_NEEDDSYNC)
                 ret = dax_finish_sync_fault(vmf, pe_size, pfn);
-       up_read(&get_fuse_inode(inode)->i_mmap_sem);
+       filemap_invalidate_unlock_shared(inode->i_mapping);
  
         if (write)
                 sb_end_pagefault(sb);
@@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
         int ret;
         struct interval_tree_node *node;
  
-       down_write(&fi->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
  
         /* Lookup a dmap and corresponding file offset to reclaim. */
         down_read(&fi->dax->sem);
@@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
  out_write_dmap_sem:
         up_write(&fi->dax->sem);
  out_mmap_sem:
-       up_write(&fi->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
         return dmap;
  }
  
@@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
                  * had a reference or some other temporary failure,
                  * Try again. We want to give up inline reclaim only
                  * if there is no range assigned to this node. Otherwise
-                * if a deadlock is possible if we sleep with fi->i_mmap_sem
-                * held and worker to free memory can't make progress due
-                * to unavailability of fi->i_mmap_sem lock. So sleep
-                * only if fi->dax->nr=0
+                * a deadlock is possible if we sleep with
+                * mapping->invalidate_lock held and worker to free memory
+                * can't make progress due to unavailability of
+                * mapping->invalidate_lock.  So sleep only if fi->dax->nr=0
                  */
                 if (retry)
                         continue;
@@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
                  * There are no mappings which can be reclaimed. Wait for one.
                  * We are not holding fi->dax->sem. So it is possible
                  * that range gets added now. But as we are not holding
-                * fi->i_mmap_sem, worker should still be able to free up
-                * a range and wake us up.
+                * mapping->invalidate_lock, worker should still be able to
+                * free up a range and wake us up.
                  */
                 if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
                         if (wait_event_killable_exclusive(fcd->range_waitq,
@@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
  /*
   * Free a range of memory.
   * Locking:
- * 1. Take fi->i_mmap_sem to block dax faults.
+ * 1. Take mapping->invalidate_lock to block dax faults.
   * 2. Take fi->dax->sem to protect interval tree and also to make sure
   *    read/write can not reuse a dmap which we might be freeing.
   */
@@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
         loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
         loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
  
-       down_write(&fi->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
         ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
         if (ret) {
                 pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
@@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
         ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
         up_write(&fi->dax->sem);
  out_mmap_sem:
-       up_write(&fi->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
         return ret;
  }
  
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c

index eade6f9..d9b977c 100644 (file)
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
         struct fuse_mount *fm = get_fuse_mount(inode);
         struct fuse_conn *fc = fm->fc;
         struct fuse_inode *fi = get_fuse_inode(inode);
+       struct address_space *mapping = inode->i_mapping;
         FUSE_ARGS(args);
         struct fuse_setattr_in inarg;
         struct fuse_attr_out outarg;
@@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
         }
  
         if (FUSE_IS_DAX(inode) && is_truncate) {
-               down_write(&fi->i_mmap_sem);
+               filemap_invalidate_lock(mapping);
                 fault_blocked = true;
                 err = fuse_dax_break_layouts(inode, 0, 0);
                 if (err) {
-                       up_write(&fi->i_mmap_sem);
+                       filemap_invalidate_unlock(mapping);
                         return err;
                 }
         }
@@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
         if ((is_truncate || !is_wb) &&
             S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
                 truncate_pagecache(inode, outarg.attr.size);
-               invalidate_inode_pages2(inode->i_mapping);
+               invalidate_inode_pages2(mapping);
         }
  
         clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
  out:
         if (fault_blocked)
-               up_write(&fi->i_mmap_sem);
+               filemap_invalidate_unlock(mapping);
  
         return 0;
  
@@ -1711,7 +1712,7 @@ error:
         clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
  
         if (fault_blocked)
-               up_write(&fi->i_mmap_sem);
+               filemap_invalidate_unlock(mapping);
         return err;
  }
  
diff --git a/fs/fuse/file.c b/fs/fuse/file.c

index 97f860c..621a662 100644 (file)
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
         }
  
         if (dax_truncate) {
-               down_write(&get_fuse_inode(inode)->i_mmap_sem);
+               filemap_invalidate_lock(inode->i_mapping);
                 err = fuse_dax_break_layouts(inode, 0, 0);
                 if (err)
                         goto out;
@@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
  
  out:
         if (dax_truncate)
-               up_write(&get_fuse_inode(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(inode->i_mapping);
  
         if (is_wb_truncate | dax_truncate) {
                 fuse_release_nowrite(inode);
@@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
         if (lock_inode) {
                 inode_lock(inode);
                 if (block_faults) {
-                       down_write(&fi->i_mmap_sem);
+                       filemap_invalidate_lock(inode->i_mapping);
                         err = fuse_dax_break_layouts(inode, 0, 0);
                         if (err)
                                 goto out;
@@ -2976,7 +2976,7 @@ out:
                 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
  
         if (block_faults)
-               up_write(&fi->i_mmap_sem);
+               filemap_invalidate_unlock(inode->i_mapping);
  
         if (lock_inode)
                 inode_unlock(inode);
@@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
          * modifications.  Yet this does give less guarantees than if the
          * copying was performed with write(2).
          *
-        * To fix this a i_mmap_sem style lock could be used to prevent new
+        * To fix this a mapping->invalidate_lock could be used to prevent new
          * faults while the copy is ongoing.
          */
         err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h

index 07829ce..6fb639b 100644 (file)
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -149,13 +149,6 @@ struct fuse_inode {
         /** Lock to protect write related fields */
         spinlock_t lock;
  
-       /**
-        * Can't take inode lock in fault path (leads to circular dependency).
-        * Introduce another semaphore which can be taken in fault path and
-        * then other filesystem paths can take this to block faults.
-        */
-       struct rw_semaphore i_mmap_sem;
-
  #ifdef CONFIG_FUSE_DAX
         /*
          * Dax specific inode data
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c

index b9beb39..e07e429 100644 (file)
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
         fi->orig_ino = 0;
         fi->state = 0;
         mutex_init(&fi->mutex);
-       init_rwsem(&fi->i_mmap_sem);
         spin_lock_init(&fi->lock);
         fi->forget = fuse_alloc_forget();
         if (!fi->forget)
diff --git a/fs/inode.c b/fs/inode.c

index c93500d..84c528c 100644 (file)
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -190,6 +190,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
         mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
         mapping->private_data = NULL;
         mapping->writeback_index = 0;
+       __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock",
+                    &sb->s_type->invalidate_lock_key);
         inode->i_private = NULL;
         inode->i_mapping = mapping;
         INIT_HLIST_HEAD(&inode->i_dentry);      /* buggered by rcu freeing */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c

index 213a97a..1cd3f94 100644 (file)
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1626,7 +1626,6 @@ xfs_swap_extents(
         struct xfs_bstat        *sbp = &sxp->sx_stat;
         int                     src_log_flags, target_log_flags;
         int                     error = 0;
-       int                     lock_flags;
         uint64_t                f;
         int                     resblks = 0;
         unsigned int            flags = 0;
@@ -1638,8 +1637,8 @@ xfs_swap_extents(
          * do the rest of the checks.
          */
         lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
-       lock_flags = XFS_MMAPLOCK_EXCL;
-       xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL);
+       filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
+                                   VFS_I(tip)->i_mapping);
  
         /* Verify that both files have the same format */
         if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
@@ -1711,7 +1710,6 @@ xfs_swap_extents(
          * or cancel will unlock the inodes from this point onwards.
          */
         xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
-       lock_flags |= XFS_ILOCK_EXCL;
         xfs_trans_ijoin(tp, ip, 0);
         xfs_trans_ijoin(tp, tip, 0);
  
@@ -1830,13 +1828,16 @@ xfs_swap_extents(
         trace_xfs_swap_extent_after(ip, 0);
         trace_xfs_swap_extent_after(tip, 1);
  
+out_unlock_ilock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       xfs_iunlock(tip, XFS_ILOCK_EXCL);
  out_unlock:
-       xfs_iunlock(ip, lock_flags);
-       xfs_iunlock(tip, lock_flags);
+       filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
+                                     VFS_I(tip)->i_mapping);
         unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
         return error;
  
  out_trans_cancel:
         xfs_trans_cancel(tp);
-       goto out_unlock;
+       goto out_unlock_ilock;
  }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index cc3cfb1..3dfbdcd 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1302,7 +1302,7 @@ xfs_file_llseek(
   *
   * mmap_lock (MM)
   *   sb_start_pagefault(vfs, freeze)
- *     i_mmaplock (XFS - truncate serialisation)
+ *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
   *       page_lock (MM)
   *         i_lock (XFS - extent map serialisation)
   */
@@ -1323,24 +1323,27 @@ __xfs_filemap_fault(
                 file_update_time(vmf->vma->vm_file);
         }
  
-       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
         if (IS_DAX(inode)) {
                 pfn_t pfn;
  
+               xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
                 ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
                                 (write_fault && !vmf->cow_page) ?
                                  &xfs_direct_write_iomap_ops :
                                  &xfs_read_iomap_ops);
                 if (ret & VM_FAULT_NEEDDSYNC)
                         ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+               xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
         } else {
-               if (write_fault)
+               if (write_fault) {
+                       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
                         ret = iomap_page_mkwrite(vmf,
                                         &xfs_buffered_write_iomap_ops);
-               else
+                       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+               } else {
                         ret = filemap_fault(vmf);
+               }
         }
-       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
         if (write_fault)
                 sb_end_pagefault(inode->i_sb);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 990b72a..f00145e 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -132,7 +132,7 @@ xfs_ilock_attr_map_shared(
  
  /*
   * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
- * multi-reader locks: i_mmap_lock and the i_lock.  This routine allows
+ * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
   * various combinations of the locks to be obtained.
   *
   * The 3 locks should always be ordered so that the IO lock is obtained first,
@@ -140,23 +140,23 @@ xfs_ilock_attr_map_shared(
   *
   * Basic locking order:
   *
- * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
   *
   * mmap_lock locking order:
   *
   * i_rwsem -> page lock -> mmap_lock
- * mmap_lock -> i_mmap_lock -> page_lock
+ * mmap_lock -> invalidate_lock -> page_lock
   *
   * The difference in mmap_lock locking order mean that we cannot hold the
- * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
- * in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_lock.
+ * invalidate_lock over syscall-based read(2)/write(2) IO. These IO paths
+ * can fault in pages during copy in/out (for buffered IO) or require the
+ * mmap_lock in get_user_pages() to map the user pages into the kernel address
+ * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
+ * fault because page faults already hold the mmap_lock.
   *
   * Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
- * taken in places where we need to invalidate the page cache in a race
+ * take both the i_rwsem and the invalidate_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a race
   * free manner (e.g. truncate, hole punch and other extent manipulation
   * functions).
   */
@@ -188,10 +188,13 @@ xfs_ilock(
                                  XFS_IOLOCK_DEP(lock_flags));
         }
  
-       if (lock_flags & XFS_MMAPLOCK_EXCL)
-               mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
-       else if (lock_flags & XFS_MMAPLOCK_SHARED)
-               mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+       if (lock_flags & XFS_MMAPLOCK_EXCL) {
+               down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+                                 XFS_MMAPLOCK_DEP(lock_flags));
+       } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+               down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+                                XFS_MMAPLOCK_DEP(lock_flags));
+       }
  
         if (lock_flags & XFS_ILOCK_EXCL)
                 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
@@ -240,10 +243,10 @@ xfs_ilock_nowait(
         }
  
         if (lock_flags & XFS_MMAPLOCK_EXCL) {
-               if (!mrtryupdate(&ip->i_mmaplock))
+               if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
                         goto out_undo_iolock;
         } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
-               if (!mrtryaccess(&ip->i_mmaplock))
+               if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
                         goto out_undo_iolock;
         }
  
@@ -258,9 +261,9 @@ xfs_ilock_nowait(
  
  out_undo_mmaplock:
         if (lock_flags & XFS_MMAPLOCK_EXCL)
-               mrunlock_excl(&ip->i_mmaplock);
+               up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
         else if (lock_flags & XFS_MMAPLOCK_SHARED)
-               mrunlock_shared(&ip->i_mmaplock);
+               up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
  out_undo_iolock:
         if (lock_flags & XFS_IOLOCK_EXCL)
                 up_write(&VFS_I(ip)->i_rwsem);
@@ -307,9 +310,9 @@ xfs_iunlock(
                 up_read(&VFS_I(ip)->i_rwsem);
  
         if (lock_flags & XFS_MMAPLOCK_EXCL)
-               mrunlock_excl(&ip->i_mmaplock);
+               up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
         else if (lock_flags & XFS_MMAPLOCK_SHARED)
-               mrunlock_shared(&ip->i_mmaplock);
+               up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
  
         if (lock_flags & XFS_ILOCK_EXCL)
                 mrunlock_excl(&ip->i_lock);
@@ -335,7 +338,7 @@ xfs_ilock_demote(
         if (lock_flags & XFS_ILOCK_EXCL)
                 mrdemote(&ip->i_lock);
         if (lock_flags & XFS_MMAPLOCK_EXCL)
-               mrdemote(&ip->i_mmaplock);
+               downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
         if (lock_flags & XFS_IOLOCK_EXCL)
                 downgrade_write(&VFS_I(ip)->i_rwsem);
  
@@ -343,9 +346,29 @@ xfs_ilock_demote(
  }
  
  #if defined(DEBUG) || defined(XFS_WARN)
-int
+static inline bool
+__xfs_rwsem_islocked(
+       struct rw_semaphore     *rwsem,
+       bool                    shared)
+{
+       if (!debug_locks)
+               return rwsem_is_locked(rwsem);
+
+       if (!shared)
+               return lockdep_is_held_type(rwsem, 0);
+
+       /*
+        * We are checking that the lock is held at least in shared
+        * mode but don't care that it might be held exclusively
+        * (i.e. shared | excl). Hence we check if the lock is held
+        * in any mode rather than an explicit shared mode.
+        */
+       return lockdep_is_held_type(rwsem, -1);
+}
+
+bool
  xfs_isilocked(
-       xfs_inode_t             *ip,
+       struct xfs_inode        *ip,
         uint                    lock_flags)
  {
         if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
@@ -355,20 +378,17 @@ xfs_isilocked(
         }
  
         if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
-               if (!(lock_flags & XFS_MMAPLOCK_SHARED))
-                       return !!ip->i_mmaplock.mr_writer;
-               return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+               return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+                               (lock_flags & XFS_MMAPLOCK_SHARED));
         }
  
-       if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
-               if (!(lock_flags & XFS_IOLOCK_SHARED))
-                       return !debug_locks ||
-                               lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
-               return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
+       if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
+               return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
+                               (lock_flags & XFS_IOLOCK_SHARED));
         }
  
         ASSERT(0);
-       return 0;
+       return false;
  }
  #endif
  
@@ -532,12 +552,10 @@ again:
  }
  
  /*
- * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
- * the mmaplock or the ilock, but not more than one type at a time. If we lock
- * more than one at a time, lockdep will report false positives saying we have
- * violated locking orders.  The iolock must be double-locked separately since
- * we use i_rwsem for that.  We now support taking one lock EXCL and the other
- * SHARED.
+ * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
+ * mmaplock must be double-locked separately since we use i_rwsem and
+ * invalidate_lock for that. We now support taking one lock EXCL and the
+ * other SHARED.
   */
  void
  xfs_lock_two_inodes(
@@ -555,15 +573,8 @@ xfs_lock_two_inodes(
         ASSERT(hweight32(ip1_mode) == 1);
         ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
         ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
-       ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
-              !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-       ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
-              !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-       ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
-              !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-       ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
-              !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-
+       ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+       ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
         ASSERT(ip0->i_ino != ip1->i_ino);
  
         if (ip0->i_ino > ip1->i_ino) {
@@ -3741,11 +3752,8 @@ xfs_ilock2_io_mmap(
         ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
         if (ret)
                 return ret;
-       if (ip1 == ip2)
-               xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
-       else
-               xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
-                                   ip2, XFS_MMAPLOCK_EXCL);
+       filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
+                                   VFS_I(ip2)->i_mapping);
         return 0;
  }
  
@@ -3755,12 +3763,9 @@ xfs_iunlock2_io_mmap(
         struct xfs_inode        *ip1,
         struct xfs_inode        *ip2)
  {
-       bool                    same_inode = (ip1 == ip2);
-
-       xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
-       if (!same_inode)
-               xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+       filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
+                                     VFS_I(ip2)->i_mapping);
         inode_unlock(VFS_I(ip2));
-       if (!same_inode)
+       if (ip1 != ip2)
                 inode_unlock(VFS_I(ip1));
  }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h

index 4b6703d..e0ae905 100644 (file)
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -40,7 +40,6 @@ typedef struct xfs_inode {
         /* Transaction and locking information. */
         struct xfs_inode_log_item *i_itemp;     /* logging information */
         mrlock_t                i_lock;         /* inode lock */
-       mrlock_t                i_mmaplock;     /* inode mmap IO lock */
         atomic_t                i_pincount;     /* inode pin count */
  
         /*
@@ -410,7 +409,7 @@ void                xfs_ilock(xfs_inode_t *, uint);
  int            xfs_ilock_nowait(xfs_inode_t *, uint);
  void           xfs_iunlock(xfs_inode_t *, uint);
  void           xfs_ilock_demote(xfs_inode_t *, uint);
-int            xfs_isilocked(xfs_inode_t *, uint);
+bool           xfs_isilocked(struct xfs_inode *, uint);
  uint           xfs_ilock_data_map_shared(struct xfs_inode *);
  uint           xfs_ilock_attr_map_shared(struct xfs_inode *);
  
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c

index 2c9e26a..102cbd6 100644 (file)
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -709,8 +709,6 @@ xfs_fs_inode_init_once(
         atomic_set(&ip->i_pincount, 0);
         spin_lock_init(&ip->i_flags_lock);
  
-       mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-                    "xfsino", ip->i_ino);
         mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
                      "xfsino", ip->i_ino);
  }
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c

index 70055d4..ddc346a 100644 (file)
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -462,7 +462,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
         inode_dio_wait(inode);
  
         /* Serialize against page faults */
-       down_write(&zi->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
  
         /* Serialize against zonefs_iomap_begin() */
         mutex_lock(&zi->i_truncate_mutex);
@@ -500,7 +500,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
  
  unlock:
         mutex_unlock(&zi->i_truncate_mutex);
-       up_write(&zi->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
  
         return ret;
  }
@@ -575,18 +575,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
         return ret;
  }
  
-static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf)
-{
-       struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file));
-       vm_fault_t ret;
-
-       down_read(&zi->i_mmap_sem);
-       ret = filemap_fault(vmf);
-       up_read(&zi->i_mmap_sem);
-
-       return ret;
-}
-
  static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
  {
         struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -607,16 +595,16 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
         file_update_time(vmf->vma->vm_file);
  
         /* Serialize against truncates */
-       down_read(&zi->i_mmap_sem);
+       filemap_invalidate_lock_shared(inode->i_mapping);
         ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
-       up_read(&zi->i_mmap_sem);
+       filemap_invalidate_unlock_shared(inode->i_mapping);
  
         sb_end_pagefault(inode->i_sb);
         return ret;
  }
  
  static const struct vm_operations_struct zonefs_file_vm_ops = {
-       .fault          = zonefs_filemap_fault,
+       .fault          = filemap_fault,
         .map_pages      = filemap_map_pages,
         .page_mkwrite   = zonefs_filemap_page_mkwrite,
  };
@@ -1155,7 +1143,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
  
         inode_init_once(&zi->i_vnode);
         mutex_init(&zi->i_truncate_mutex);
-       init_rwsem(&zi->i_mmap_sem);
         zi->i_wr_refcnt = 0;
  
         return &zi->i_vnode;
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h

index 5114190..7b14790 100644 (file)
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -70,12 +70,11 @@ struct zonefs_inode_info {
          * and changes to the inode private data, and in particular changes to
          * a sequential file size on completion of direct IO writes.
          * Serialization of mmap read IOs with truncate and syscall IO
-        * operations is done with i_mmap_sem in addition to i_truncate_mutex.
-        * Only zonefs_seq_file_truncate() takes both lock (i_mmap_sem first,
-        * i_truncate_mutex second).
+        * operations is done with invalidate_lock in addition to
+        * i_truncate_mutex.  Only zonefs_file_truncate() takes both locks
+        * (invalidate_lock first, i_truncate_mutex second).
          */
         struct mutex            i_truncate_mutex;
-       struct rw_semaphore     i_mmap_sem;
  
         /* guarded by i_truncate_mutex */
         unsigned int            i_wr_refcnt;
diff --git a/include/linux/fs.h b/include/linux/fs.h

index bea8ec5..eb668b5 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -436,6 +436,10 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
   * struct address_space - Contents of a cacheable, mappable object.
   * @host: Owner, either the inode or the block_device.
   * @i_pages: Cached pages.
+ * @invalidate_lock: Guards coherency between page cache contents and
+ *   file offset->disk block mappings in the filesystem during invalidates.
+ *   It is also used to block modification of page cache contents through
+ *   memory mappings.
   * @gfp_mask: Memory allocation flags to use for allocating pages.
   * @i_mmap_writable: Number of VM_SHARED mappings.
   * @nr_thps: Number of THPs in the pagecache (non-shmem only).
@@ -453,6 +457,7 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
  struct address_space {
         struct inode            *host;
         struct xarray           i_pages;
+       struct rw_semaphore     invalidate_lock;
         gfp_t                   gfp_mask;
         atomic_t                i_mmap_writable;
  #ifdef CONFIG_READ_ONLY_THP_FOR_FS
@@ -814,9 +819,42 @@ static inline void inode_lock_shared_nested(struct inode *inode, unsigned subcla
         down_read_nested(&inode->i_rwsem, subclass);
  }
  
+static inline void filemap_invalidate_lock(struct address_space *mapping)
+{
+       down_write(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_unlock(struct address_space *mapping)
+{
+       up_write(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
+{
+       down_read(&mapping->invalidate_lock);
+}
+
+static inline int filemap_invalidate_trylock_shared(
+                                       struct address_space *mapping)
+{
+       return down_read_trylock(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_unlock_shared(
+                                       struct address_space *mapping)
+{
+       up_read(&mapping->invalidate_lock);
+}
+
  void lock_two_nondirectories(struct inode *, struct inode*);
  void unlock_two_nondirectories(struct inode *, struct inode*);
  
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+                                struct address_space *mapping2);
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+                                  struct address_space *mapping2);
+
+
  /*
   * NOTE: in a 32bit arch with a preemptable kernel and
   * an UP compile the i_size_read/write must be atomic
@@ -2490,6 +2528,7 @@ struct file_system_type {
  
         struct lock_class_key i_lock_key;
         struct lock_class_key i_mutex_key;
+       struct lock_class_key invalidate_lock_key;
         struct lock_class_key i_mutex_dir_key;
  };
  
diff --git a/mm/filemap.c b/mm/filemap.c

index d1458ec..0fad083 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -76,8 +76,9 @@
   *      ->swap_lock            (exclusive_swap_page, others)
   *        ->i_pages lock
   *
- *  ->i_mutex
- *    ->i_mmap_rwsem           (truncate->unmap_mapping_range)
+ *  ->i_rwsem
+ *    ->invalidate_lock                (acquired by fs in truncate path)
+ *      ->i_mmap_rwsem         (truncate->unmap_mapping_range)
   *
   *  ->mmap_lock
   *    ->i_mmap_rwsem
@@ -85,9 +86,10 @@
   *        ->i_pages lock       (arch-dependent flush_dcache_mmap_lock)
   *
   *  ->mmap_lock
- *    ->lock_page              (access_process_vm)
+ *    ->invalidate_lock                (filemap_fault)
+ *      ->lock_page            (filemap_fault, access_process_vm)
   *
- *  ->i_mutex                  (generic_perform_write)
+ *  ->i_rwsem                  (generic_perform_write)
   *    ->mmap_lock              (fault_in_pages_readable->do_page_fault)
   *
   *  bdi->wb.list_lock
@@ -1007,6 +1009,44 @@ struct page *__page_cache_alloc(gfp_t gfp)
  EXPORT_SYMBOL(__page_cache_alloc);
  #endif
  
+/*
+ * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
+ *
+ * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
+ *
+ * @mapping1: the first mapping to lock
+ * @mapping2: the second mapping to lock
+ */
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+                                struct address_space *mapping2)
+{
+       if (mapping1 > mapping2)
+               swap(mapping1, mapping2);
+       if (mapping1)
+               down_write(&mapping1->invalidate_lock);
+       if (mapping2 && mapping1 != mapping2)
+               down_write_nested(&mapping2->invalidate_lock, 1);
+}
+EXPORT_SYMBOL(filemap_invalidate_lock_two);
+
+/*
+ * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
+ *
+ * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
+ *
+ * @mapping1: the first mapping to unlock
+ * @mapping2: the second mapping to unlock
+ */
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+                                  struct address_space *mapping2)
+{
+       if (mapping1)
+               up_write(&mapping1->invalidate_lock);
+       if (mapping2 && mapping1 != mapping2)
+               up_write(&mapping2->invalidate_lock);
+}
+EXPORT_SYMBOL(filemap_invalidate_unlock_two);
+
  /*
   * In order to wait for pages to become available there must be
   * waitqueues associated with pages. By using a hash table of
@@ -2368,20 +2408,30 @@ static int filemap_update_page(struct kiocb *iocb,
  {
         int error;
  
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+               if (!filemap_invalidate_trylock_shared(mapping))
+                       return -EAGAIN;
+       } else {
+               filemap_invalidate_lock_shared(mapping);
+       }
+
         if (!trylock_page(page)) {
+               error = -EAGAIN;
                 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
-                       return -EAGAIN;
+                       goto unlock_mapping;
                 if (!(iocb->ki_flags & IOCB_WAITQ)) {
+                       filemap_invalidate_unlock_shared(mapping);
                         put_and_wait_on_page_locked(page, TASK_KILLABLE);
                         return AOP_TRUNCATED_PAGE;
                 }
                 error = __lock_page_async(page, iocb->ki_waitq);
                 if (error)
-                       return error;
+                       goto unlock_mapping;
         }
  
+       error = AOP_TRUNCATED_PAGE;
         if (!page->mapping)
-               goto truncated;
+               goto unlock;
  
         error = 0;
         if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
@@ -2392,15 +2442,13 @@ static int filemap_update_page(struct kiocb *iocb,
                 goto unlock;
  
         error = filemap_read_page(iocb->ki_filp, mapping, page);
-       if (error == AOP_TRUNCATED_PAGE)
-               put_page(page);
-       return error;
-truncated:
-       unlock_page(page);
-       put_page(page);
-       return AOP_TRUNCATED_PAGE;
+       goto unlock_mapping;
  unlock:
         unlock_page(page);
+unlock_mapping:
+       filemap_invalidate_unlock_shared(mapping);
+       if (error == AOP_TRUNCATED_PAGE)
+               put_page(page);
         return error;
  }
  
@@ -2415,6 +2463,19 @@ static int filemap_create_page(struct file *file,
         if (!page)
                 return -ENOMEM;
  
+       /*
+        * Protect against truncate / hole punch. Grabbing invalidate_lock here
+        * assures we cannot instantiate and bring uptodate new pagecache pages
+        * after evicting page cache during truncate and before actually
+        * freeing blocks.  Note that we could release invalidate_lock after
+        * inserting the page into page cache as the locked page would then be
+        * enough to synchronize with hole punching. But there are code paths
+        * such as filemap_update_page() filling in partially uptodate pages or
+        * ->readpages() that need to hold invalidate_lock while mapping blocks
+        * for IO so let's hold the lock here as well to keep locking rules
+        * simple.
+        */
+       filemap_invalidate_lock_shared(mapping);
         error = add_to_page_cache_lru(page, mapping, index,
                         mapping_gfp_constraint(mapping, GFP_KERNEL));
         if (error == -EEXIST)
@@ -2426,9 +2487,11 @@ static int filemap_create_page(struct file *file,
         if (error)
                 goto error;
  
+       filemap_invalidate_unlock_shared(mapping);
         pagevec_add(pvec, page);
         return 0;
  error:
+       filemap_invalidate_unlock_shared(mapping);
         put_page(page);
         return error;
  }
@@ -2967,6 +3030,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
         pgoff_t max_off;
         struct page *page;
         vm_fault_t ret = 0;
+       bool mapping_locked = false;
  
         max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
         if (unlikely(offset >= max_off))
@@ -2976,25 +3040,39 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
          * Do we have something in the page cache already?
          */
         page = find_get_page(mapping, offset);
-       if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+       if (likely(page)) {
                 /*
-                * We found the page, so try async readahead before
-                * waiting for the lock.
+                * We found the page, so try async readahead before waiting for
+                * the lock.
                  */
-               fpin = do_async_mmap_readahead(vmf, page);
-       } else if (!page) {
+               if (!(vmf->flags & FAULT_FLAG_TRIED))
+                       fpin = do_async_mmap_readahead(vmf, page);
+               if (unlikely(!PageUptodate(page))) {
+                       filemap_invalidate_lock_shared(mapping);
+                       mapping_locked = true;
+               }
+       } else {
                 /* No page in the page cache at all */
                 count_vm_event(PGMAJFAULT);
                 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                 ret = VM_FAULT_MAJOR;
                 fpin = do_sync_mmap_readahead(vmf);
  retry_find:
+               /*
+                * See comment in filemap_create_page() why we need
+                * invalidate_lock
+                */
+               if (!mapping_locked) {
+                       filemap_invalidate_lock_shared(mapping);
+                       mapping_locked = true;
+               }
                 page = pagecache_get_page(mapping, offset,
                                           FGP_CREAT|FGP_FOR_MMAP,
                                           vmf->gfp_mask);
                 if (!page) {
                         if (fpin)
                                 goto out_retry;
+                       filemap_invalidate_unlock_shared(mapping);
                         return VM_FAULT_OOM;
                 }
         }
@@ -3014,8 +3092,20 @@ retry_find:
          * We have a locked page in the page cache, now we need to check
          * that it's up-to-date. If not, it is going to be due to an error.
          */
-       if (unlikely(!PageUptodate(page)))
+       if (unlikely(!PageUptodate(page))) {
+               /*
+                * The page was in cache and uptodate and now it is not.
+                * Strange but possible since we didn't hold the page lock all
+                * the time. Let's drop everything, get the invalidate lock and
+                * try again.
+                */
+               if (!mapping_locked) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto retry_find;
+               }
                 goto page_not_uptodate;
+       }
  
         /*
          * We've made it this far and we had to drop our mmap_lock, now is the
@@ -3026,6 +3116,8 @@ retry_find:
                 unlock_page(page);
                 goto out_retry;
         }
+       if (mapping_locked)
+               filemap_invalidate_unlock_shared(mapping);
  
         /*
          * Found the page and have a reference on it.
@@ -3056,6 +3148,7 @@ page_not_uptodate:
  
         if (!error || error == AOP_TRUNCATED_PAGE)
                 goto retry_find;
+       filemap_invalidate_unlock_shared(mapping);
  
         return VM_FAULT_SIGBUS;
  
@@ -3067,6 +3160,8 @@ out_retry:
          */
         if (page)
                 put_page(page);
+       if (mapping_locked)
+               filemap_invalidate_unlock_shared(mapping);
         if (fpin)
                 fput(fpin);
         return ret | VM_FAULT_RETRY;
@@ -3437,6 +3532,8 @@ out:
   *
   * If the page does not get brought uptodate, return -EIO.
   *
+ * The function expects mapping->invalidate_lock to be already held.
+ *
   * Return: up to date page on success, ERR_PTR() on failure.
   */
  struct page *read_cache_page(struct address_space *mapping,
@@ -3460,6 +3557,8 @@ EXPORT_SYMBOL(read_cache_page);
   *
   * If the page does not get brought uptodate, return -EIO.
   *
+ * The function expects mapping->invalidate_lock to be already held.
+ *
   * Return: up to date page on success, ERR_PTR() on failure.
   */
  struct page *read_cache_page_gfp(struct address_space *mapping,
@@ -3704,12 +3803,12 @@ EXPORT_SYMBOL(generic_perform_write);
   * modification times and calls proper subroutines depending on whether we
   * do direct IO or a standard buffered write.
   *
- * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * It expects i_rwsem to be grabbed unless we work on a block device or similar
   * object which does not need locking at all.
   *
   * This function does *not* take care of syncing data in case of O_SYNC write.
   * A caller has to handle it. This is mainly due to the fact that we want to
- * avoid syncing under i_mutex.
+ * avoid syncing under i_rwsem.
   *
   * Return:
   * * number of bytes written, even for truncated writes
@@ -3797,7 +3896,7 @@ EXPORT_SYMBOL(__generic_file_write_iter);
   *
   * This is a wrapper around __generic_file_write_iter() to be used by most
   * filesystems. It takes care of syncing the file in case of O_SYNC file
- * and acquires i_mutex as needed.
+ * and acquires i_rwsem as needed.
   * Return:
   * * negative error code if no data has been written at all of
   *   vfs_fsync_range() failed for a synchronous write
diff --git a/mm/madvise.c b/mm/madvise.c

index 5c065bc..56324a3 100644 (file)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -912,7 +912,7 @@ static long madvise_remove(struct vm_area_struct *vma,
                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
  
         /*
-        * Filesystem's fallocate may need to take i_mutex.  We need to
+        * Filesystem's fallocate may need to take i_rwsem.  We need to
          * explicitly grab a reference because the vma (and hence the
          * vma's reference to the file) can go away as soon as we drop
          * mmap_lock.
diff --git a/mm/memory-failure.c b/mm/memory-failure.c

index 470400c..e1f87cf 100644 (file)
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -866,7 +866,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
         /*
          * Truncation is a bit tricky. Enable it per file system for now.
          *
-        * Open: to take i_mutex or not for this? Right now we don't.
+        * Open: to take i_rwsem or not for this? Right now we don't.
          */
         ret = truncate_error_page(p, pfn, mapping);
  out:
diff --git a/mm/readahead.c b/mm/readahead.c

index d589f14..41b75d7 100644 (file)
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -192,6 +192,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
          */
         unsigned int nofs = memalloc_nofs_save();
  
+       filemap_invalidate_lock_shared(mapping);
         /*
          * Preallocate as many pages as we will need.
          */
@@ -236,6 +237,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
          * will then handle the error.
          */
         read_pages(ractl, &page_pool, false);
+       filemap_invalidate_unlock_shared(mapping);
         memalloc_nofs_restore(nofs);
  }
  EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
diff --git a/mm/rmap.c b/mm/rmap.c

index b9eb5c1..2d29a57 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,28 +20,29 @@
  /*
   * Lock ordering in mm:
   *
- * inode->i_mutex      (while writing or truncating, not reading or faulting)
+ * inode->i_rwsem      (while writing or truncating, not reading or faulting)
   *   mm->mmap_lock
- *     page->flags PG_locked (lock_page)   * (see huegtlbfs below)
- *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
- *         mapping->i_mmap_rwsem
- *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- *           anon_vma->rwsem
- *             mm->page_table_lock or pte_lock
- *               swap_lock (in swap_duplicate, swap_info_get)
- *                 mmlist_lock (in mmput, drain_mmlist and others)
- *                 mapping->private_lock (in __set_page_dirty_buffers)
- *                   lock_page_memcg move_lock (in __set_page_dirty_buffers)
- *                     i_pages lock (widely used)
- *                       lruvec->lru_lock (in lock_page_lruvec_irq)
- *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- *                   sb_lock (within inode_lock in fs/fs-writeback.c)
- *                   i_pages lock (widely used, in set_page_dirty,
- *                             in arch-dependent flush_dcache_mmap_lock,
- *                             within bdi.wb->list_lock in __sync_single_inode)
+ *     mapping->invalidate_lock (in filemap_fault)
+ *       page->flags PG_locked (lock_page)   * (see hugetlbfs below)
+ *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ *           mapping->i_mmap_rwsem
+ *             hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ *             anon_vma->rwsem
+ *               mm->page_table_lock or pte_lock
+ *                 swap_lock (in swap_duplicate, swap_info_get)
+ *                   mmlist_lock (in mmput, drain_mmlist and others)
+ *                   mapping->private_lock (in __set_page_dirty_buffers)
+ *                     lock_page_memcg move_lock (in __set_page_dirty_buffers)
+ *                       i_pages lock (widely used)
+ *                         lruvec->lru_lock (in lock_page_lruvec_irq)
+ *                   inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ *                   bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ *                     sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                     i_pages lock (widely used, in set_page_dirty,
+ *                               in arch-dependent flush_dcache_mmap_lock,
+ *                               within bdi.wb->list_lock in __sync_single_inode)
   *
- * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
   *   ->tasklist_lock
   *     pte map lock
   *
diff --git a/mm/shmem.c b/mm/shmem.c

index dacda74..3107ace 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -96,7 +96,7 @@ static struct vfsmount *shm_mnt;
  
  /*
   * shmem_fallocate communicates with shmem_fault or shmem_writepage via
- * inode->i_private (with i_mutex making sure that it has only one user at
+ * inode->i_private (with i_rwsem making sure that it has only one user at
   * a time): we would prefer not to enlarge the shmem inode just for that.
   */
  struct shmem_falloc {
@@ -774,7 +774,7 @@ static int shmem_free_swap(struct address_space *mapping,
   * Determine (in bytes) how many of the shmem object's pages mapped by the
   * given offsets are swapped out.
   *
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
   * as long as the inode doesn't go away and racy results are not a problem.
   */
  unsigned long shmem_partial_swap_usage(struct address_space *mapping,
@@ -806,7 +806,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
   * Determine (in bytes) how many of the shmem object's pages mapped by the
   * given vma is swapped out.
   *
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
   * as long as the inode doesn't go away and racy results are not a problem.
   */
  unsigned long shmem_swap_usage(struct vm_area_struct *vma)
@@ -1069,7 +1069,7 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
                 loff_t oldsize = inode->i_size;
                 loff_t newsize = attr->ia_size;
  
-               /* protected by i_mutex */
+               /* protected by i_rwsem */
                 if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
                     (newsize > oldsize && (info->seals & F_SEAL_GROW)))
                         return -EPERM;
@@ -2059,7 +2059,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
         /*
          * Trinity finds that probing a hole which tmpfs is punching can
          * prevent the hole-punch from ever completing: which in turn
-        * locks writers out with its hold on i_mutex.  So refrain from
+        * locks writers out with its hold on i_rwsem.  So refrain from
          * faulting pages into the hole while it's being punched.  Although
          * shmem_undo_range() does remove the additions, it may be unable to
          * keep up, as each new page needs its own unmap_mapping_range() call,
@@ -2070,7 +2070,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
          * we just need to make racing faults a rare case.
          *
          * The implementation below would be much simpler if we just used a
-        * standard mutex or completion: but we cannot take i_mutex in fault,
+        * standard mutex or completion: but we cannot take i_rwsem in fault,
          * and bloating every shmem inode for this unlikely case would be sad.
          */
         if (unlikely(inode->i_private)) {
@@ -2470,7 +2470,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
         struct shmem_inode_info *info = SHMEM_I(inode);
         pgoff_t index = pos >> PAGE_SHIFT;
  
-       /* i_mutex is held by caller */
+       /* i_rwsem is held by caller */
         if (unlikely(info->seals & (F_SEAL_GROW |
                                    F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
                 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
@@ -2570,7 +2570,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  
                 /*
                  * We must evaluate after, since reads (unlike writes)
-                * are called without i_mutex protection against truncate
+                * are called without i_rwsem protection against truncate
                  */
                 nr = PAGE_SIZE;
                 i_size = i_size_read(inode);
@@ -2640,7 +2640,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
                 return -ENXIO;
  
         inode_lock(inode);
-       /* We're holding i_mutex so we can access i_size directly */
+       /* We're holding i_rwsem so we can access i_size directly */
         offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
         if (offset >= 0)
                 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
@@ -2669,7 +2669,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
                 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
  
-               /* protected by i_mutex */
+               /* protected by i_rwsem */
                 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
                         error = -EPERM;
                         goto out;
diff --git a/mm/truncate.c b/mm/truncate.c

index 234ddd8..44ad5e5 100644 (file)
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -412,7 +412,8 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
   * @mapping: mapping to truncate
   * @lstart: offset from which to truncate
   *
- * Called under (and serialised by) inode->i_mutex.
+ * Called under (and serialised by) inode->i_rwsem and
+ * mapping->invalidate_lock.
   *
   * Note: When this function returns, there can be a page in the process of
   * deletion (inside __delete_from_page_cache()) in the specified range.  Thus
@@ -429,7 +430,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
   * truncate_inode_pages_final - truncate *all* pages before inode dies
   * @mapping: mapping to truncate
   *
- * Called under (and serialized by) inode->i_mutex.
+ * Called under (and serialized by) inode->i_rwsem.
   *
   * Filesystems have to use this in the .evict_inode path to inform the
   * VM that this is the final truncate and the inode is going away.
@@ -748,7 +749,7 @@ EXPORT_SYMBOL(truncate_pagecache);
   * setattr function when ATTR_SIZE is passed in.
   *
   * Must be called with a lock serializing truncates and writes (generally
- * i_mutex but e.g. xfs uses a different lock) and before all filesystem
+ * i_rwsem but e.g. xfs uses a different lock) and before all filesystem
   * specific block truncation has been performed.
   */
  void truncate_setsize(struct inode *inode, loff_t newsize)
@@ -777,7 +778,7 @@ EXPORT_SYMBOL(truncate_setsize);
   *
   * The function must be called after i_size is updated so that page fault
   * coming after we unlock the page will already see the new i_size.
- * The function must be called while we still hold i_mutex - this not only
+ * The function must be called while we still hold i_rwsem - this not only
   * makes sure i_size is stable but also that userspace cannot observe new
   * i_size value before we are prepared to store mmap writes at new inode size.
   */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 30 Aug 2021 17:24:50 +0000 (10:24 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 30 Aug 2021 17:24:50 +0000 (10:24 -0700)
Documentation/filesystems/locking.rst		patch | blob | history
fs/ceph/addr.c		patch | blob | history
fs/ceph/file.c		patch | blob | history
fs/cifs/smb2ops.c		patch | blob | history
fs/ext2/ext2.h		patch | blob | history
fs/ext2/file.c		patch | blob | history
fs/ext2/inode.c		patch | blob | history
fs/ext2/super.c		patch | blob | history
fs/ext4/ext4.h		patch | blob | history
fs/ext4/extents.c		patch | blob | history
fs/ext4/file.c		patch | blob | history
fs/ext4/inode.c		patch | blob | history
fs/ext4/ioctl.c		patch | blob | history
fs/ext4/super.c		patch | blob | history
fs/ext4/truncate.h		patch | blob | history
fs/f2fs/data.c		patch | blob | history
fs/f2fs/f2fs.h		patch | blob | history
fs/f2fs/file.c		patch | blob | history
fs/f2fs/super.c		patch | blob | history
fs/fuse/dax.c		patch | blob | history
fs/fuse/dir.c		patch | blob | history
fs/fuse/file.c		patch | blob | history
fs/fuse/fuse_i.h		patch | blob | history
fs/fuse/inode.c		patch | blob | history
fs/inode.c		patch | blob | history
fs/xfs/xfs_bmap_util.c		patch | blob | history
fs/xfs/xfs_file.c		patch | blob | history
fs/xfs/xfs_inode.c		patch | blob | history
fs/xfs/xfs_inode.h		patch | blob | history
fs/xfs/xfs_super.c		patch | blob | history
fs/zonefs/super.c		patch | blob | history
fs/zonefs/zonefs.h		patch | blob | history
include/linux/fs.h		patch | blob | history
mm/filemap.c		patch | blob | history
mm/madvise.c		patch | blob | history
mm/memory-failure.c		patch | blob | history
mm/readahead.c		patch | blob | history
mm/rmap.c		patch | blob | history
mm/shmem.c		patch | blob | history
mm/truncate.c		patch | blob | history