Merge tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

[linux-2.6-microblaze.git] / mm / filemap.c
diff --git a/mm/filemap.c b/mm/filemap.c

index 034d370..920e8dc 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -76,8 +76,9 @@
   *      ->swap_lock            (exclusive_swap_page, others)
   *        ->i_pages lock
   *
- *  ->i_mutex
- *    ->i_mmap_rwsem           (truncate->unmap_mapping_range)
+ *  ->i_rwsem
+ *    ->invalidate_lock                (acquired by fs in truncate path)
+ *      ->i_mmap_rwsem         (truncate->unmap_mapping_range)
   *
   *  ->mmap_lock
   *    ->i_mmap_rwsem
@@ -85,9 +86,10 @@
   *        ->i_pages lock       (arch-dependent flush_dcache_mmap_lock)
   *
   *  ->mmap_lock
- *    ->lock_page              (access_process_vm)
+ *    ->invalidate_lock                (filemap_fault)
+ *      ->lock_page            (filemap_fault, access_process_vm)
   *
- *  ->i_mutex                  (generic_perform_write)
+ *  ->i_rwsem                  (generic_perform_write)
   *    ->mmap_lock              (fault_in_pages_readable->do_page_fault)
   *
   *  bdi->wb.list_lock
@@ -1025,6 +1027,44 @@ struct page *__page_cache_alloc(gfp_t gfp)
  EXPORT_SYMBOL(__page_cache_alloc);
  #endif
  
+/*
+ * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
+ * @mapping1: the first mapping to lock, may be NULL
+ * @mapping2: the second mapping to lock, may be NULL
+ *
+ * Lock exclusively invalidate_lock of any passed mapping that is not NULL,
+ * lower address first. A mapping passed twice is locked only once.
+ */
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+                                struct address_space *mapping2)
+{
+       if (mapping1 > mapping2)
+               swap(mapping1, mapping2);
+       if (mapping1)
+               down_write(&mapping1->invalidate_lock);
+       if (mapping2 && mapping1 != mapping2)
+               down_write_nested(&mapping2->invalidate_lock, 1);
+}
+EXPORT_SYMBOL(filemap_invalidate_lock_two);
+
+/*
+ * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
+ * @mapping1: the first mapping to unlock, may be NULL
+ * @mapping2: the second mapping to unlock, may be NULL
+ *
+ * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
+ * A mapping passed twice is unlocked only once.
+ */
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+                                  struct address_space *mapping2)
+{
+       if (mapping1)
+               up_write(&mapping1->invalidate_lock);
+       if (mapping2 && mapping1 != mapping2)
+               up_write(&mapping2->invalidate_lock);
+}
+EXPORT_SYMBOL(filemap_invalidate_unlock_two);
+
  /*
   * In order to wait for pages to become available there must be
   * waitqueues associated with pages. By using a hash table of
@@ -2386,20 +2426,30 @@ static int filemap_update_page(struct kiocb *iocb,
  {
         int error;
  
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+               if (!filemap_invalidate_trylock_shared(mapping))
+                       return -EAGAIN;
+       } else {
+               filemap_invalidate_lock_shared(mapping);
+       }
+
         if (!trylock_page(page)) {
+               error = -EAGAIN;
                 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
-                       return -EAGAIN;
+                       goto unlock_mapping;
                 if (!(iocb->ki_flags & IOCB_WAITQ)) {
+                       filemap_invalidate_unlock_shared(mapping);
                         put_and_wait_on_page_locked(page, TASK_KILLABLE);
                         return AOP_TRUNCATED_PAGE;
                 }
                 error = __lock_page_async(page, iocb->ki_waitq);
                 if (error)
-                       return error;
+                       goto unlock_mapping;
         }
  
+       error = AOP_TRUNCATED_PAGE;
         if (!page->mapping)
-               goto truncated;
+               goto unlock;
  
         error = 0;
         if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
@@ -2410,15 +2460,13 @@ static int filemap_update_page(struct kiocb *iocb,
                 goto unlock;
  
         error = filemap_read_page(iocb->ki_filp, mapping, page);
-       if (error == AOP_TRUNCATED_PAGE)
-               put_page(page);
-       return error;
-truncated:
-       unlock_page(page);
-       put_page(page);
-       return AOP_TRUNCATED_PAGE;
+       goto unlock_mapping;
  unlock:
         unlock_page(page);
+unlock_mapping:
+       filemap_invalidate_unlock_shared(mapping);
+       if (error == AOP_TRUNCATED_PAGE)
+               put_page(page);
         return error;
  }
  
@@ -2433,6 +2481,19 @@ static int filemap_create_page(struct file *file,
         if (!page)
                 return -ENOMEM;
  
+       /*
+        * Protect against truncate / hole punch. Grabbing invalidate_lock here
+        * ensures we cannot instantiate and bring uptodate new pagecache pages
+        * after evicting page cache during truncate and before actually
+        * freeing blocks.  Note that we could release invalidate_lock after
+        * inserting the page into page cache as the locked page would then be
+        * enough to synchronize with hole punching. But there are code paths
+        * such as filemap_update_page() filling in partially uptodate pages or
+        * ->readpages() that need to hold invalidate_lock while mapping blocks
+        * for IO so let's hold the lock here as well to keep locking rules
+        * simple.
+        */
+       filemap_invalidate_lock_shared(mapping);
         error = add_to_page_cache_lru(page, mapping, index,
                         mapping_gfp_constraint(mapping, GFP_KERNEL));
         if (error == -EEXIST)
@@ -2444,9 +2505,11 @@ static int filemap_create_page(struct file *file,
         if (error)
                 goto error;
  
+       filemap_invalidate_unlock_shared(mapping);
         pagevec_add(pvec, page);
         return 0;
  error:
+       filemap_invalidate_unlock_shared(mapping);
         put_page(page);
         return error;
  }
@@ -2985,6 +3048,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
         pgoff_t max_off;
         struct page *page;
         vm_fault_t ret = 0;
+       bool mapping_locked = false;
  
         max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
         if (unlikely(offset >= max_off))
@@ -2994,25 +3058,39 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
          * Do we have something in the page cache already?
          */
         page = find_get_page(mapping, offset);
-       if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+       if (likely(page)) {
                 /*
-                * We found the page, so try async readahead before
-                * waiting for the lock.
+                * We found the page, so try async readahead before waiting for
+                * the lock.
                  */
-               fpin = do_async_mmap_readahead(vmf, page);
-       } else if (!page) {
+               if (!(vmf->flags & FAULT_FLAG_TRIED))
+                       fpin = do_async_mmap_readahead(vmf, page);
+               if (unlikely(!PageUptodate(page))) {
+                       filemap_invalidate_lock_shared(mapping);
+                       mapping_locked = true;
+               }
+       } else {
                 /* No page in the page cache at all */
                 count_vm_event(PGMAJFAULT);
                 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                 ret = VM_FAULT_MAJOR;
                 fpin = do_sync_mmap_readahead(vmf);
  retry_find:
+               /*
+                * See comment in filemap_create_page() why we need
+                * invalidate_lock
+                */
+               if (!mapping_locked) {
+                       filemap_invalidate_lock_shared(mapping);
+                       mapping_locked = true;
+               }
                 page = pagecache_get_page(mapping, offset,
                                           FGP_CREAT|FGP_FOR_MMAP,
                                           vmf->gfp_mask);
                 if (!page) {
                         if (fpin)
                                 goto out_retry;
+                       filemap_invalidate_unlock_shared(mapping);
                         return VM_FAULT_OOM;
                 }
         }
@@ -3032,8 +3110,20 @@ retry_find:
          * We have a locked page in the page cache, now we need to check
          * that it's up-to-date. If not, it is going to be due to an error.
          */
-       if (unlikely(!PageUptodate(page)))
+       if (unlikely(!PageUptodate(page))) {
+               /*
+                * The page was in cache and uptodate and now it is not.
+                * Strange but possible since we didn't hold the page lock all
+                * the time. Let's drop everything, get the invalidate lock and
+                * try again.
+                */
+               if (!mapping_locked) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto retry_find;
+               }
                 goto page_not_uptodate;
+       }
  
         /*
          * We've made it this far and we had to drop our mmap_lock, now is the
@@ -3044,6 +3134,8 @@ retry_find:
                 unlock_page(page);
                 goto out_retry;
         }
+       if (mapping_locked)
+               filemap_invalidate_unlock_shared(mapping);
  
         /*
          * Found the page and have a reference on it.
@@ -3074,6 +3166,7 @@ page_not_uptodate:
  
         if (!error || error == AOP_TRUNCATED_PAGE)
                 goto retry_find;
+       filemap_invalidate_unlock_shared(mapping);
  
         return VM_FAULT_SIGBUS;
  
@@ -3085,6 +3178,8 @@ out_retry:
          */
         if (page)
                 put_page(page);
+       if (mapping_locked)
+               filemap_invalidate_unlock_shared(mapping);
         if (fpin)
                 fput(fpin);
         return ret | VM_FAULT_RETRY;
@@ -3455,6 +3550,8 @@ out:
   *
   * If the page does not get brought uptodate, return -EIO.
   *
+ * The function expects mapping->invalidate_lock to be already held.
+ *
   * Return: up to date page on success, ERR_PTR() on failure.
   */
  struct page *read_cache_page(struct address_space *mapping,
@@ -3478,6 +3575,8 @@ EXPORT_SYMBOL(read_cache_page);
   *
   * If the page does not get brought uptodate, return -EIO.
   *
+ * The function expects mapping->invalidate_lock to be already held.
+ *
   * Return: up to date page on success, ERR_PTR() on failure.
   */
  struct page *read_cache_page_gfp(struct address_space *mapping,
@@ -3722,12 +3821,12 @@ EXPORT_SYMBOL(generic_perform_write);
   * modification times and calls proper subroutines depending on whether we
   * do direct IO or a standard buffered write.
   *
- * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * It expects i_rwsem to be grabbed unless we work on a block device or similar
   * object which does not need locking at all.
   *
   * This function does *not* take care of syncing data in case of O_SYNC write.
   * A caller has to handle it. This is mainly due to the fact that we want to
- * avoid syncing under i_mutex.
+ * avoid syncing under i_rwsem.
   *
   * Return:
   * * number of bytes written, even for truncated writes
@@ -3815,7 +3914,7 @@ EXPORT_SYMBOL(__generic_file_write_iter);
   *
   * This is a wrapper around __generic_file_write_iter() to be used by most
   * filesystems. It takes care of syncing the file in case of O_SYNC file
- * and acquires i_mutex as needed.
+ * and acquires i_rwsem as needed.
   * Return:
   * * negative error code if no data has been written at all of
   *   vfs_fsync_range() failed for a synchronous write