Merge branch 'work.read_write' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Sep 2017 02:29:55 +0000 (19:29 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Sep 2017 02:29:55 +0000 (19:29 -0700)
diff --combined fs/aio.c

index b5d69f2,d93daa0..5a24872
--- 1/fs/aio.c
--- 2/fs/aio.c
+++ b/fs/aio.c
@@@ -373,14 -373,6 +373,14 @@@ static int aio_migratepage(struct addre
         pgoff_t idx;
         int rc;
   
+ +      /*
+ +       * We cannot support the _NO_COPY case here, because copy needs to
+ +       * happen under the ctx->completion_lock. That does not work with the
+ +       * migration workflow of MIGRATE_SYNC_NO_COPY.
+ +       */
+ +      if (mode == MIGRATE_SYNC_NO_COPY)
+ +              return -EINVAL;
+ +
         rc = 0;
   
         /* mapping->private_lock here protects against the kioctx teardown.  */
@@@ -449,9 -441,10 +449,9 @@@ static const struct address_space_opera
   #endif
   };
   
- -static int aio_setup_ring(struct kioctx *ctx)
+ +static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
   {
         struct aio_ring *ring;
- -      unsigned nr_events = ctx->max_reqs;
         struct mm_struct *mm = current->mm;
         unsigned long size, unused;
         int nr_pages;
@@@ -713,12 -706,6 +713,12 @@@ static struct kioctx *ioctx_alloc(unsig
         struct kioctx *ctx;
         int err = -ENOMEM;
   
+ +      /*
+ +       * Store the original nr_events -- what userspace passed to io_setup(),
+ +       * for counting against the global limit -- before it changes.
+ +       */
+ +      unsigned int max_reqs = nr_events;
+ +
         /*
          * We keep track of the number of available ringbuffer slots, to prevent
          * overflow (reqs_available), and we also use percpu counters for this.
@@@ -737,14 -724,14 +737,14 @@@
                 return ERR_PTR(-EINVAL);
         }
   
- -      if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
+ +      if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
                 return ERR_PTR(-EAGAIN);
   
         ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
         if (!ctx)
                 return ERR_PTR(-ENOMEM);
   
- -      ctx->max_reqs = nr_events;
+ +      ctx->max_reqs = max_reqs;
   
         spin_lock_init(&ctx->ctx_lock);
         spin_lock_init(&ctx->completion_lock);
@@@ -766,7 -753,7 +766,7 @@@
         if (!ctx->cpu)
                 goto err;
   
- -      err = aio_setup_ring(ctx);
+ +      err = aio_setup_ring(ctx, nr_events);
         if (err < 0)
                 goto err;
   
@@@ -777,8 -764,8 +777,8 @@@
   
         /* limit the number of system wide aios */
         spin_lock(&aio_nr_lock);
- -      if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
- -          aio_nr + nr_events < aio_nr) {
+ +      if (aio_nr + ctx->max_reqs > aio_max_nr ||
+ +          aio_nr + ctx->max_reqs < aio_nr) {
                 spin_unlock(&aio_nr_lock);
                 err = -EAGAIN;
                 goto err_ctx;
@@@ -1606,12 -1593,6 +1606,6 @@@ static int io_submit_one(struct kioctx 
                 goto out_put_req;
         }
   
-       if ((req->common.ki_flags & IOCB_NOWAIT) &&
-                       !(req->common.ki_flags & IOCB_DIRECT)) {
-               ret = -EOPNOTSUPP;
-               goto out_put_req;
-       }
- 
         ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
         if (unlikely(ret)) {
                 pr_debug("EFAULT: aio_key\n");
diff --combined fs/block_dev.c

index bb715b2,ea21d18..93d088f
--- 1/fs/block_dev.c
--- 2/fs/block_dev.c
+++ b/fs/block_dev.c
@@@ -223,7 -223,7 +223,7 @@@ __blkdev_direct_IO_simple(struct kiocb 
         }
   
         bio_init(&bio, vecs, nr_pages);
- -      bio.bi_bdev = bdev;
+ +      bio_set_dev(&bio, bdev);
         bio.bi_iter.bi_sector = pos >> 9;
         bio.bi_write_hint = iocb->ki_hint;
         bio.bi_private = current;
@@@ -362,7 -362,7 +362,7 @@@ __blkdev_direct_IO(struct kiocb *iocb, 
   
         blk_start_plug(&plug);
         for (;;) {
- -              bio->bi_bdev = bdev;
+ +              bio_set_dev(bio, bdev);
                 bio->bi_iter.bi_sector = pos >> 9;
                 bio->bi_write_hint = iocb->ki_hint;
                 bio->bi_private = dio;
@@@ -1451,7 -1451,6 +1451,7 @@@ static int __blkdev_get(struct block_de
                 bdev->bd_disk = disk;
                 bdev->bd_queue = disk->queue;
                 bdev->bd_contains = bdev;
+ +              bdev->bd_partno = partno;
   
                 if (!partno) {
                         ret = -ENXIO;
@@@ -1740,6 -1739,8 +1740,8 @@@ static int blkdev_open(struct inode * i
          */
         filp->f_flags |= O_LARGEFILE;
   
+       filp->f_mode |= FMODE_NOWAIT;
+ 
         if (filp->f_flags & O_NDELAY)
                 filp->f_mode |= FMODE_NDELAY;
         if (filp->f_flags & O_EXCL)
@@@ -1892,6 -1893,9 +1894,9 @@@ ssize_t blkdev_write_iter(struct kiocb 
         if (iocb->ki_pos >= size)
                 return -ENOSPC;
   
+       if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
+               return -EOPNOTSUPP;
+ 
         iov_iter_truncate(from, size - iocb->ki_pos);
   
         blk_start_plug(&plug);
diff --combined fs/btrfs/file.c

index 74fd775,e62dd55..aafcc78
--- 1/fs/btrfs/file.c
--- 2/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@@ -1536,7 -1536,7 +1536,7 @@@ static noinline int check_can_nocow(str
         u64 num_bytes;
         int ret;
   
- -      ret = btrfs_start_write_no_snapshoting(root);
+ +      ret = btrfs_start_write_no_snapshotting(root);
         if (!ret)
                 return -ENOSPC;
   
@@@ -1561,7 -1561,7 +1561,7 @@@
                         NULL, NULL, NULL);
         if (ret <= 0) {
                 ret = 0;
- -              btrfs_end_write_no_snapshoting(root);
+ +              btrfs_end_write_no_snapshotting(root);
         } else {
                 *write_bytes = min_t(size_t, *write_bytes ,
                                      num_bytes - pos + lockstart);
@@@ -1664,7 -1664,7 +1664,7 @@@ static noinline ssize_t __btrfs_buffere
                                                 data_reserved, pos,
                                                 write_bytes);
                         else
- -                              btrfs_end_write_no_snapshoting(root);
+ +                              btrfs_end_write_no_snapshotting(root);
                         break;
                 }
   
@@@ -1767,7 -1767,7 +1767,7 @@@ again
   
                 release_bytes = 0;
                 if (only_release_metadata)
- -                      btrfs_end_write_no_snapshoting(root);
+ +                      btrfs_end_write_no_snapshotting(root);
   
                 if (only_release_metadata && copied > 0) {
                         lockstart = round_down(pos,
@@@ -1797,7 -1797,7 +1797,7 @@@
   
         if (release_bytes) {
                 if (only_release_metadata) {
- -                      btrfs_end_write_no_snapshoting(root);
+ +                      btrfs_end_write_no_snapshotting(root);
                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
                                         release_bytes);
                 } else {
@@@ -1886,6 -1886,10 +1886,10 @@@ static ssize_t btrfs_file_write_iter(st
         loff_t oldsize;
         int clean_page = 0;
   
+       if (!(iocb->ki_flags & IOCB_DIRECT) &&
+           (iocb->ki_flags & IOCB_NOWAIT))
+               return -EOPNOTSUPP;
+ 
         if (!inode_trylock(inode)) {
                 if (iocb->ki_flags & IOCB_NOWAIT)
                         return -EAGAIN;
@@@ -1990,15 -1994,8 +1994,15 @@@ out
   
   int btrfs_release_file(struct inode *inode, struct file *filp)
   {
- -      if (filp->private_data)
+ +      struct btrfs_file_private *private = filp->private_data;
+ +
+ +      if (private && private->trans)
                 btrfs_ioctl_trans_end(filp);
+ +      if (private && private->filldir_buf)
+ +              kfree(private->filldir_buf);
+ +      kfree(private);
+ +      filp->private_data = NULL;
+ +
         /*
          * ordered_data_close is set by settattr when we are about to truncate
          * a file from a non-zero size to a zero size.  This tries to
@@@ -3112,7 -3109,7 +3116,7 @@@ out
   
   static int btrfs_file_open(struct inode *inode, struct file *filp)
   {
-       filp->f_mode |= FMODE_AIO_NOWAIT;
+       filp->f_mode |= FMODE_NOWAIT;
         return generic_file_open(inode, filp);
   }
   
diff --combined fs/ext4/file.c

index da9c694,f835213..b1da660
--- 1/fs/ext4/file.c
--- 2/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@@ -223,6 -223,8 +223,8 @@@ ext4_file_write_iter(struct kiocb *iocb
         if (IS_DAX(inode))
                 return ext4_dax_write_iter(iocb, from);
   #endif
+       if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
+               return -EOPNOTSUPP;
   
         if (!inode_trylock(inode)) {
                 if (iocb->ki_flags & IOCB_NOWAIT)
@@@ -279,20 -281,7 +281,20 @@@ static int ext4_dax_huge_fault(struct v
         handle_t *handle = NULL;
         struct inode *inode = file_inode(vmf->vma->vm_file);
         struct super_block *sb = inode->i_sb;
- -      bool write = vmf->flags & FAULT_FLAG_WRITE;
+ +
+ +      /*
+ +       * We have to distinguish real writes from writes which will result in a
+ +       * COW page; COW writes should *not* poke the journal (the file will not
+ +       * be changed). Doing so would cause unintended failures when mounted
+ +       * read-only.
+ +       *
+ +       * We check for VM_SHARED rather than vmf->cow_page since the latter is
+ +       * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
+ +       * other sizes, dax_iomap_fault will handle splitting / fallback so that
+ +       * we eventually come back with a COW page.
+ +       */
+ +      bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
+ +              (vmf->vma->vm_flags & VM_SHARED);
   
         if (write) {
                 sb_start_pagefault(sb);
@@@ -324,11 -313,41 +326,11 @@@ static int ext4_dax_fault(struct vm_fau
         return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
   }
   
- -/*
- - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
- - * handler we check for races agaist truncate. Note that since we cycle through
- - * i_mmap_sem, we are sure that also any hole punching that began before we
- - * were called is finished by now and so if it included part of the file we
- - * are working on, our pte will get unmapped and the check for pte_same() in
- - * wp_pfn_shared() fails. Thus fault gets retried and things work out as
- - * desired.
- - */
- -static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
- -{
- -      struct inode *inode = file_inode(vmf->vma->vm_file);
- -      struct super_block *sb = inode->i_sb;
- -      loff_t size;
- -      int ret;
- -
- -      sb_start_pagefault(sb);
- -      file_update_time(vmf->vma->vm_file);
- -      down_read(&EXT4_I(inode)->i_mmap_sem);
- -      size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- -      if (vmf->pgoff >= size)
- -              ret = VM_FAULT_SIGBUS;
- -      else
- -              ret = dax_pfn_mkwrite(vmf);
- -      up_read(&EXT4_I(inode)->i_mmap_sem);
- -      sb_end_pagefault(sb);
- -
- -      return ret;
- -}
- -
   static const struct vm_operations_struct ext4_dax_vm_ops = {
         .fault          = ext4_dax_fault,
         .huge_fault     = ext4_dax_huge_fault,
         .page_mkwrite   = ext4_dax_fault,
- -      .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
+ +      .pfn_mkwrite    = ext4_dax_fault,
   };
   #else
   #define ext4_dax_vm_ops       ext4_file_vm_ops
@@@ -371,7 -390,7 +373,7 @@@ static int ext4_file_open(struct inode 
                 return -EIO;
   
         if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
- -                   !(sb->s_flags & MS_RDONLY))) {
+ +                   !sb_rdonly(sb))) {
                 sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
                 /*
                  * Sample where the filesystem has been mounted and
@@@ -431,9 -450,7 +433,7 @@@
                         return ret;
         }
   
-       /* Set the flags to support nowait AIO */
-       filp->f_mode |= FMODE_AIO_NOWAIT;
- 
+       filp->f_mode |= FMODE_NOWAIT;
         return dquot_file_open(inode, filp);
   }
   
@@@ -477,11 -494,12 +477,11 @@@ static int ext4_find_unwritten_pgoff(st
   
         pagevec_init(&pvec, 0);
         do {
- -              int i, num;
+ +              int i;
                 unsigned long nr_pages;
   
- -              num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
- -              nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
- -                                        (pgoff_t)num);
+ +              nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
+ +                                      &index, end);
                 if (nr_pages == 0)
                         break;
   
@@@ -500,6 -518,9 +500,6 @@@
                                 goto out;
                         }
   
- -                      if (page->index > end)
- -                              goto out;
- -
                         lock_page(page);
   
                         if (unlikely(page->mapping != inode->i_mapping)) {
@@@ -542,10 -563,14 +542,10 @@@ next
                         unlock_page(page);
                 }
   
- -              /* The no. of pages is less than our desired, we are done. */
- -              if (nr_pages < num)
- -                      break;
- -
- -              index = pvec.pages[i - 1]->index + 1;
                 pagevec_release(&pvec);
         } while (index <= end);
   
+ +      /* There are no pages upto endoff - that would be a hole in there. */
         if (whence == SEEK_HOLE && lastoff < endoff) {
                 found = 1;
                 *offset = lastoff;
@@@ -570,7 -595,7 +570,7 @@@ static loff_t ext4_seek_data(struct fil
         inode_lock(inode);
   
         isize = i_size_read(inode);
- -      if (offset >= isize) {
+ +      if (offset < 0 || offset >= isize) {
                 inode_unlock(inode);
                 return -ENXIO;
         }
@@@ -633,7 -658,7 +633,7 @@@ static loff_t ext4_seek_hole(struct fil
         inode_lock(inode);
   
         isize = i_size_read(inode);
- -      if (offset >= isize) {
+ +      if (offset < 0 || offset >= isize) {
                 inode_unlock(inode);
                 return -ENXIO;
         }
diff --combined fs/xfs/xfs_file.c

index ec3e44f,1a09104..ebdd0bd
--- 1/fs/xfs/xfs_file.c
--- 2/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@@ -259,7 -259,11 +259,11 @@@ xfs_file_buffered_aio_read
   
         trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
   
-       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
+               if (iocb->ki_flags & IOCB_NOWAIT)
+                       return -EAGAIN;
+               xfs_ilock(ip, XFS_IOLOCK_SHARED);
+       }
         ret = generic_file_read_iter(iocb, to);
         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
   
@@@ -636,6 -640,9 +640,9 @@@ xfs_file_buffered_aio_write
         int                     enospc = 0;
         int                     iolock;
   
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               return -EOPNOTSUPP;
+ 
   write_retry:
         iolock = XFS_IOLOCK_EXCL;
         xfs_ilock(ip, iolock);
@@@ -912,7 -919,7 +919,7 @@@ xfs_file_open
                 return -EFBIG;
         if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
                 return -EIO;
-       file->f_mode |= FMODE_AIO_NOWAIT;
+       file->f_mode |= FMODE_NOWAIT;
         return 0;
   }
   
@@@ -1011,67 -1018,96 +1018,67 @@@ xfs_file_llseek
    *       page_lock (MM)
    *         i_lock (XFS - extent map serialisation)
    */
- -
- -/*
- - * mmap()d file has taken write protection fault and is being made writable. We
- - * can set the page state up correctly for a writable page, which means we can
- - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- - * mapping.
- - */
- -STATIC int
- -xfs_filemap_page_mkwrite(
- -      struct vm_fault         *vmf)
+ +static int
+ +__xfs_filemap_fault(
+ +      struct vm_fault         *vmf,
+ +      enum page_entry_size    pe_size,
+ +      bool                    write_fault)
   {
         struct inode            *inode = file_inode(vmf->vma->vm_file);
+ +      struct xfs_inode        *ip = XFS_I(inode);
         int                     ret;
   
- -      trace_xfs_filemap_page_mkwrite(XFS_I(inode));
+ +      trace_xfs_filemap_fault(ip, pe_size, write_fault);
   
- -      sb_start_pagefault(inode->i_sb);
- -      file_update_time(vmf->vma->vm_file);
- -      xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ +      if (write_fault) {
+ +              sb_start_pagefault(inode->i_sb);
+ +              file_update_time(vmf->vma->vm_file);
+ +      }
   
+ +      xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
         if (IS_DAX(inode)) {
- -              ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
+ +              ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
         } else {
- -              ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
- -              ret = block_page_mkwrite_return(ret);
+ +              if (write_fault)
+ +                      ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
+ +              else
+ +                      ret = filemap_fault(vmf);
         }
- -
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- -      sb_end_pagefault(inode->i_sb);
   
+ +      if (write_fault)
+ +              sb_end_pagefault(inode->i_sb);
         return ret;
   }
   
- -STATIC int
+ +static int
   xfs_filemap_fault(
         struct vm_fault         *vmf)
   {
- -      struct inode            *inode = file_inode(vmf->vma->vm_file);
- -      int                     ret;
- -
- -      trace_xfs_filemap_fault(XFS_I(inode));
- -
         /* DAX can shortcut the normal fault path on write faults! */
- -      if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
- -              return xfs_filemap_page_mkwrite(vmf);
- -
- -      xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- -      if (IS_DAX(inode))
- -              ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
- -      else
- -              ret = filemap_fault(vmf);
- -      xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- -
- -      return ret;
+ +      return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
+ +                      IS_DAX(file_inode(vmf->vma->vm_file)) &&
+ +                      (vmf->flags & FAULT_FLAG_WRITE));
   }
   
- -/*
- - * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
- - * both read and write faults. Hence we need to handle both cases. There is no
- - * ->huge_mkwrite callout for huge pages, so we have a single function here to
- - * handle both cases here. @flags carries the information on the type of fault
- - * occuring.
- - */
- -STATIC int
+ +static int
   xfs_filemap_huge_fault(
         struct vm_fault         *vmf,
         enum page_entry_size    pe_size)
   {
- -      struct inode            *inode = file_inode(vmf->vma->vm_file);
- -      struct xfs_inode        *ip = XFS_I(inode);
- -      int                     ret;
- -
- -      if (!IS_DAX(inode))
+ +      if (!IS_DAX(file_inode(vmf->vma->vm_file)))
                 return VM_FAULT_FALLBACK;
   
- -      trace_xfs_filemap_huge_fault(ip);
- -
- -      if (vmf->flags & FAULT_FLAG_WRITE) {
- -              sb_start_pagefault(inode->i_sb);
- -              file_update_time(vmf->vma->vm_file);
- -      }
- -
- -      xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- -      ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
- -      xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- -
- -      if (vmf->flags & FAULT_FLAG_WRITE)
- -              sb_end_pagefault(inode->i_sb);
+ +      /* DAX can shortcut the normal fault path on write faults! */
+ +      return __xfs_filemap_fault(vmf, pe_size,
+ +                      (vmf->flags & FAULT_FLAG_WRITE));
+ +}
   
- -      return ret;
+ +static int
+ +xfs_filemap_page_mkwrite(
+ +      struct vm_fault         *vmf)
+ +{
+ +      return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
   }
   
   /*
@@@ -1101,7 -1137,7 +1108,7 @@@ xfs_filemap_pfn_mkwrite
         if (vmf->pgoff >= size)
                 ret = VM_FAULT_SIGBUS;
         else if (IS_DAX(inode))
- -              ret = dax_pfn_mkwrite(vmf);
+ +              ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
         xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
         sb_end_pagefault(inode->i_sb);
         return ret;
diff --combined include/linux/fs.h

index b4ae080,94582c3..bc475df
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -72,8 -72,6 +72,8 @@@ extern int leases_enable, lease_break_t
   extern int sysctl_protected_symlinks;
   extern int sysctl_protected_hardlinks;
   
+ +typedef __kernel_rwf_t rwf_t;
+ +
   struct buffer_head;
   typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create);
@@@ -148,8 -146,8 +148,8 @@@ typedef int (dio_iodone_t)(struct kioc
   /* File was opened by fanotify and shouldn't generate fanotify events */
   #define FMODE_NONOTIFY                ((__force fmode_t)0x4000000)
   
- /* File is capable of returning -EAGAIN if AIO will block */
- #define FMODE_AIO_NOWAIT      ((__force fmode_t)0x8000000)
+ /* File is capable of returning -EAGAIN if I/O will block */
+ #define FMODE_NOWAIT  ((__force fmode_t)0x8000000)
   
   /*
    * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
@@@ -392,7 -390,7 +392,7 @@@ struct address_space 
         struct radix_tree_root  page_tree;      /* radix tree of all pages */
         spinlock_t              tree_lock;      /* and lock protecting it */
         atomic_t                i_mmap_writable;/* count VM_SHARED mappings */
- -      struct rb_root          i_mmap;         /* tree of private and shared mappings */
+ +      struct rb_root_cached   i_mmap;         /* tree of private and shared mappings */
         struct rw_semaphore     i_mmap_rwsem;   /* protect tree, count, list */
         /* Protected by tree_lock together with the radix tree */
         unsigned long           nrpages;        /* number of total pages */
@@@ -429,7 -427,6 +429,7 @@@ struct block_device 
   #endif
         struct block_device *   bd_contains;
         unsigned                bd_block_size;
+ +      u8                      bd_partno;
         struct hd_struct *      bd_part;
         /* number of times partitions within this device have been opened. */
         unsigned                bd_part_count;
@@@ -487,7 -484,7 +487,7 @@@ static inline void i_mmap_unlock_read(s
    */
   static inline int mapping_mapped(struct address_space *mapping)
   {
- -      return  !RB_EMPTY_ROOT(&mapping->i_mmap);
+ +      return  !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
   }
   
   /*
@@@ -1003,6 -1000,7 +1003,6 @@@ struct file_lock 
         unsigned char fl_type;
         unsigned int fl_pid;
         int fl_link_cpu;                /* what cpu's list is this on? */
- -      struct pid *fl_nspid;
         wait_queue_head_t fl_wait;
         struct file *fl_file;
         loff_t fl_start;
@@@ -1235,7 -1233,7 +1235,7 @@@ static inline struct inode *file_inode(
   
   static inline struct dentry *file_dentry(const struct file *file)
   {
- -      return d_real(file->f_path.dentry, file_inode(file), 0);
+ +      return d_real(file->f_path.dentry, file_inode(file), 0, 0);
   }
   
   static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
@@@ -1270,32 -1268,7 +1270,32 @@@ extern void f_delown(struct file *filp)
   extern pid_t f_getown(struct file *filp);
   extern int send_sigurg(struct fown_struct *fown);
   
- -struct mm_struct;
+ +/*
+ + * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
+ + * represented in both.
+ + */
+ +#define SB_RDONLY      1      /* Mount read-only */
+ +#define SB_NOSUID      2      /* Ignore suid and sgid bits */
+ +#define SB_NODEV       4      /* Disallow access to device special files */
+ +#define SB_NOEXEC      8      /* Disallow program execution */
+ +#define SB_SYNCHRONOUS        16      /* Writes are synced at once */
+ +#define SB_MANDLOCK   64      /* Allow mandatory locks on an FS */
+ +#define SB_DIRSYNC    128     /* Directory modifications are synchronous */
+ +#define SB_NOATIME    1024    /* Do not update access times. */
+ +#define SB_NODIRATIME 2048    /* Do not update directory access times */
+ +#define SB_SILENT     32768
+ +#define SB_POSIXACL   (1<<16) /* VFS does not apply the umask */
+ +#define SB_KERNMOUNT  (1<<22) /* this is a kern_mount call */
+ +#define SB_I_VERSION  (1<<23) /* Update inode I_version field */
+ +#define SB_LAZYTIME   (1<<25) /* Update the on-disk [acm]times lazily */
+ +
+ +/* These sb flags are internal to the kernel */
+ +#define SB_SUBMOUNT     (1<<26)
+ +#define SB_NOREMOTELOCK       (1<<27)
+ +#define SB_NOSEC      (1<<28)
+ +#define SB_BORN               (1<<29)
+ +#define SB_ACTIVE     (1<<30)
+ +#define SB_NOUSER     (1<<31)
   
   /*
    *    Umount options
@@@ -1781,10 -1754,13 +1781,10 @@@ ssize_t rw_copy_check_uvector(int type
                               struct iovec **ret_pointer);
   
   extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *);
- -extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *);
   extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
   extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
   extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
- -              unsigned long, loff_t *, int);
- -extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
- -              unsigned long, loff_t *, int);
+ +              unsigned long, loff_t *, rwf_t);
   extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                    loff_t, size_t, unsigned int);
   extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
@@@ -1860,7 -1836,7 +1860,7 @@@ struct super_operations 
    * possible to override it selectively if you really wanted to with some
    * ioctl() that is not currently implemented.
    *
- - * Exception: MS_RDONLY is always applied to the entire file system.
+ + * Exception: SB_RDONLY is always applied to the entire file system.
    *
    * Unfortunately, it is possible to change a filesystems flags with it mounted
    * with files in use.  This means that all of the inodes will not have their
@@@ -1869,20 -1845,19 +1869,20 @@@
    */
   #define __IS_FLG(inode, flg)  ((inode)->i_sb->s_flags & (flg))
   
- -#define IS_RDONLY(inode)      ((inode)->i_sb->s_flags & MS_RDONLY)
- -#define IS_SYNC(inode)                (__IS_FLG(inode, MS_SYNCHRONOUS) || \
+ +static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & MS_RDONLY; }
+ +#define IS_RDONLY(inode)      sb_rdonly((inode)->i_sb)
+ +#define IS_SYNC(inode)                (__IS_FLG(inode, SB_SYNCHRONOUS) || \
                                         ((inode)->i_flags & S_SYNC))
- -#define IS_DIRSYNC(inode)     (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
+ +#define IS_DIRSYNC(inode)     (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \
                                         ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
- -#define IS_MANDLOCK(inode)    __IS_FLG(inode, MS_MANDLOCK)
- -#define IS_NOATIME(inode)     __IS_FLG(inode, MS_RDONLY|MS_NOATIME)
- -#define IS_I_VERSION(inode)   __IS_FLG(inode, MS_I_VERSION)
+ +#define IS_MANDLOCK(inode)    __IS_FLG(inode, SB_MANDLOCK)
+ +#define IS_NOATIME(inode)     __IS_FLG(inode, SB_RDONLY|SB_NOATIME)
+ +#define IS_I_VERSION(inode)   __IS_FLG(inode, SB_I_VERSION)
   
   #define IS_NOQUOTA(inode)     ((inode)->i_flags & S_NOQUOTA)
   #define IS_APPEND(inode)      ((inode)->i_flags & S_APPEND)
   #define IS_IMMUTABLE(inode)   ((inode)->i_flags & S_IMMUTABLE)
- -#define IS_POSIXACL(inode)    __IS_FLG(inode, MS_POSIXACL)
+ +#define IS_POSIXACL(inode)    __IS_FLG(inode, SB_POSIXACL)
   
   #define IS_DEADDIR(inode)     ((inode)->i_flags & S_DEAD)
   #define IS_NOCMTIME(inode)    ((inode)->i_flags & S_NOCMTIME)
@@@ -2203,7 -2178,7 +2203,7 @@@ static inline int __mandatory_lock(stru
   }
   
   /*
- - * ... and these candidates should be on MS_MANDLOCK mounted fs,
+ + * ... and these candidates should be on SB_MANDLOCK mounted fs,
    * otherwise these will be advisory locks
    */
   
@@@ -2496,13 -2471,9 +2496,13 @@@ static inline void bd_unlink_disk_holde
   #endif
   
   /* fs/char_dev.c */
- -#define CHRDEV_MAJOR_HASH_SIZE        255
+ +#define CHRDEV_MAJOR_MAX 512
   /* Marks the bottom of the first segment of free char majors */
   #define CHRDEV_MAJOR_DYN_END 234
+ +/* Marks the top and bottom of the second segment of free char majors */
+ +#define CHRDEV_MAJOR_DYN_EXT_START 511
+ +#define CHRDEV_MAJOR_DYN_EXT_END 384
+ +
   extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
   extern int register_chrdev_region(dev_t, unsigned, const char *);
   extern int __register_chrdev(unsigned int major, unsigned int baseminor,
@@@ -2529,14 -2500,14 +2529,14 @@@ static inline void unregister_chrdev(un
   #define BDEVT_SIZE    10      /* Largest string for MAJ:MIN for blkdev */
   
   #ifdef CONFIG_BLOCK
- -#define BLKDEV_MAJOR_HASH_SIZE        255
+ +#define BLKDEV_MAJOR_MAX      512
   extern const char *__bdevname(dev_t, char *buffer);
   extern const char *bdevname(struct block_device *bdev, char *buffer);
   extern struct block_device *lookup_bdev(const char *);
   extern void blkdev_show(struct seq_file *,off_t);
   
   #else
- -#define BLKDEV_MAJOR_HASH_SIZE        0
+ +#define BLKDEV_MAJOR_MAX      0
   #endif
   
   extern void init_special_inode(struct inode *, umode_t, dev_t);
@@@ -2568,19 -2539,12 +2568,19 @@@ extern int invalidate_inode_pages2_rang
   extern int write_inode_now(struct inode *, int);
   extern int filemap_fdatawrite(struct address_space *);
   extern int filemap_flush(struct address_space *);
- -extern int filemap_fdatawait(struct address_space *);
   extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
   extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
                                    loff_t lend);
+ +
+ +static inline int filemap_fdatawait(struct address_space *mapping)
+ +{
+ +      return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
+ +}
+ +
   extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
                                   loff_t lend);
+ +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+ +                                              loff_t lend);
   extern int filemap_write_and_wait(struct address_space *mapping);
   extern int filemap_write_and_wait_range(struct address_space *mapping,
                                         loff_t lstart, loff_t lend);
@@@ -2589,19 -2553,12 +2589,19 @@@ extern int __filemap_fdatawrite_range(s
   extern int filemap_fdatawrite_range(struct address_space *mapping,
                                 loff_t start, loff_t end);
   extern int filemap_check_errors(struct address_space *mapping);
- -
   extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+ +
+ +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+ +                                              loff_t lend);
   extern int __must_check file_check_and_advance_wb_err(struct file *file);
   extern int __must_check file_write_and_wait_range(struct file *file,
                                                 loff_t start, loff_t end);
   
+ +static inline int file_write_and_wait(struct file *file)
+ +{
+ +      return file_write_and_wait_range(file, 0, LLONG_MAX);
+ +}
+ +
   /**
    * filemap_set_wb_err - set a writeback error on an address_space
    * @mapping: mapping in which to set writeback error
@@@ -2615,6 -2572,8 +2615,6 @@@
    * When a writeback error occurs, most filesystems will want to call
    * filemap_set_wb_err to record the error in the mapping so that it will be
    * automatically reported whenever fsync is called on the file.
- - *
- - * FIXME: mention FS_* flag here?
    */
   static inline void filemap_set_wb_err(struct address_space *mapping, int err)
   {
@@@ -2813,15 -2772,15 +2813,15 @@@ static inline const char *kernel_read_f
         return kernel_read_file_str[id];
   }
   
- -extern int kernel_read(struct file *, loff_t, char *, unsigned long);
   extern int kernel_read_file(struct file *, void **, loff_t *, loff_t,
                             enum kernel_read_file_id);
   extern int kernel_read_file_from_path(char *, void **, loff_t *, loff_t,
                                       enum kernel_read_file_id);
   extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t,
                                     enum kernel_read_file_id);
- -extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t);
- -extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
+ +extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
+ +extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
+ +extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
   extern struct file * open_exec(const char *);
    
   /* fs/dcache.c -- generic fs support functions */
@@@ -2872,7 -2831,6 +2872,7 @@@ static inline void lockdep_annotate_ino
   #endif
   extern void unlock_new_inode(struct inode *);
   extern unsigned int get_next_ino(void);
+ +extern void evict_inodes(struct super_block *sb);
   
   extern void __iget(struct inode * inode);
   extern void iget_failed(struct inode *);
@@@ -2916,9 -2874,9 +2916,9 @@@ extern ssize_t generic_file_direct_writ
   extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
   
   ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
- -              int flags);
+ +              rwf_t flags);
   ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
- -              int flags);
+ +              rwf_t flags);
   
   /* fs/block_dev.c */
   extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
@@@ -3040,10 -2998,6 +3040,10 @@@ void __inode_add_bytes(struct inode *in
   void inode_add_bytes(struct inode *inode, loff_t bytes);
   void __inode_sub_bytes(struct inode *inode, loff_t bytes);
   void inode_sub_bytes(struct inode *inode, loff_t bytes);
+ +static inline loff_t __inode_get_bytes(struct inode *inode)
+ +{
+ +      return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
+ +}
   loff_t inode_get_bytes(struct inode *inode);
   void inode_set_bytes(struct inode *inode, loff_t bytes);
   const char *simple_get_link(struct dentry *, struct inode *,
@@@ -3068,7 -3022,8 +3068,7 @@@ static inline int vfs_lstat(const char 
   static inline int vfs_fstatat(int dfd, const char __user *filename,
                               struct kstat *stat, int flags)
   {
- -      return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
- -                       stat, STATX_BASIC_STATS);
+ +      return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS);
   }
   static inline int vfs_fstat(int fd, struct kstat *stat)
   {
@@@ -3188,13 -3143,13 +3188,13 @@@ static inline int iocb_flags(struct fil
         return res;
   }
   
- -static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
+ +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
   {
         if (unlikely(flags & ~RWF_SUPPORTED))
                 return -EOPNOTSUPP;
   
         if (flags & RWF_NOWAIT) {
-               if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT))
+               if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
                         return -EOPNOTSUPP;
                 ki->ki_flags |= IOCB_NOWAIT;
         }
@@@ -3319,7 -3274,7 +3319,7 @@@ static inline int check_sticky(struct i
   
   static inline void inode_has_no_xattr(struct inode *inode)
   {
- -      if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC))
+ +      if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC))
                 inode->i_flags |= S_NOSEC;
   }
   
diff --combined mm/filemap.c

index 8c88e18,92d4e0a..870971e
--- 1/mm/filemap.c
--- 2/mm/filemap.c
+++ b/mm/filemap.c
@@@ -130,8 -130,17 +130,8 @@@ static int page_cache_tree_insert(struc
                         return -EEXIST;
   
                 mapping->nrexceptional--;
- -              if (!dax_mapping(mapping)) {
- -                      if (shadowp)
- -                              *shadowp = p;
- -              } else {
- -                      /* DAX can replace empty locked entry with a hole */
- -                      WARN_ON_ONCE(p !=
- -                              dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
- -                      /* Wakeup waiters for exceptional entry lock */
- -                      dax_wake_mapping_entry_waiter(mapping, page->index, p,
- -                                                    true);
- -              }
+ +              if (shadowp)
+ +                      *shadowp = p;
         }
         __radix_tree_replace(&mapping->page_tree, node, slot, page,
                              workingset_update_node, mapping);
@@@ -393,7 -402,8 +393,7 @@@ bool filemap_range_has_page(struct addr
   {
         pgoff_t index = start_byte >> PAGE_SHIFT;
         pgoff_t end = end_byte >> PAGE_SHIFT;
- -      struct pagevec pvec;
- -      bool ret;
+ +      struct page *page;
   
         if (end_byte < start_byte)
                 return false;
@@@ -401,10 -411,12 +401,10 @@@
         if (mapping->nrpages == 0)
                 return false;
   
- -      pagevec_init(&pvec, 0);
- -      if (!pagevec_lookup(&pvec, mapping, index, 1))
+ +      if (!find_get_pages_range(mapping, &index, end, 1, &page))
                 return false;
- -      ret = (pvec.pages[0]->index <= end);
- -      pagevec_release(&pvec);
- -      return ret;
+ +      put_page(page);
+ +      return true;
   }
   EXPORT_SYMBOL(filemap_range_has_page);
   
@@@ -463,29 -475,6 +463,29 @@@ int filemap_fdatawait_range(struct addr
   }
   EXPORT_SYMBOL(filemap_fdatawait_range);
   
+ +/**
+ + * file_fdatawait_range - wait for writeback to complete
+ + * @file:             file pointing to address space structure to wait for
+ + * @start_byte:               offset in bytes where the range starts
+ + * @end_byte:         offset in bytes where the range ends (inclusive)
+ + *
+ + * Walk the list of under-writeback pages of the address space that file
+ + * refers to, in the given range and wait for all of them.  Check error
+ + * status of the address space vs. the file->f_wb_err cursor and return it.
+ + *
+ + * Since the error status of the file is advanced by this function,
+ + * callers are responsible for checking the return value and handling and/or
+ + * reporting the error.
+ + */
+ +int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
+ +{
+ +      struct address_space *mapping = file->f_mapping;
+ +
+ +      __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ +      return file_check_and_advance_wb_err(file);
+ +}
+ +EXPORT_SYMBOL(file_fdatawait_range);
+ +
   /**
    * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
    * @mapping: address space structure to wait for
@@@ -500,22 -489,45 +500,22 @@@
    */
   int filemap_fdatawait_keep_errors(struct address_space *mapping)
   {
- -      loff_t i_size = i_size_read(mapping->host);
- -
- -      if (i_size == 0)
- -              return 0;
- -
- -      __filemap_fdatawait_range(mapping, 0, i_size - 1);
+ +      __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
         return filemap_check_and_keep_errors(mapping);
   }
   EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
   
- -/**
- - * filemap_fdatawait - wait for all under-writeback pages to complete
- - * @mapping: address space structure to wait for
- - *
- - * Walk the list of under-writeback pages of the given address space
- - * and wait for all of them.  Check error status of the address space
- - * and return it.
- - *
- - * Since the error status of the address space is cleared by this function,
- - * callers are responsible for checking the return value and handling and/or
- - * reporting the error.
- - */
- -int filemap_fdatawait(struct address_space *mapping)
+ +static bool mapping_needs_writeback(struct address_space *mapping)
   {
- -      loff_t i_size = i_size_read(mapping->host);
- -
- -      if (i_size == 0)
- -              return 0;
- -
- -      return filemap_fdatawait_range(mapping, 0, i_size - 1);
+ +      return (!dax_mapping(mapping) && mapping->nrpages) ||
+ +          (dax_mapping(mapping) && mapping->nrexceptional);
   }
- -EXPORT_SYMBOL(filemap_fdatawait);
   
   int filemap_write_and_wait(struct address_space *mapping)
   {
         int err = 0;
   
- -      if ((!dax_mapping(mapping) && mapping->nrpages) ||
- -          (dax_mapping(mapping) && mapping->nrexceptional)) {
+ +      if (mapping_needs_writeback(mapping)) {
                 err = filemap_fdatawrite(mapping);
                 /*
                  * Even if the above returned error, the pages may be
@@@ -554,7 -566,8 +554,7 @@@ int filemap_write_and_wait_range(struc
   {
         int err = 0;
   
- -      if ((!dax_mapping(mapping) && mapping->nrpages) ||
- -          (dax_mapping(mapping) && mapping->nrexceptional)) {
+ +      if (mapping_needs_writeback(mapping)) {
                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                  WB_SYNC_ALL);
                 /* See comment of filemap_write_and_wait() */
@@@ -576,7 -589,7 +576,7 @@@ EXPORT_SYMBOL(filemap_write_and_wait_ra
   
   void __filemap_set_wb_err(struct address_space *mapping, int err)
   {
- -      errseq_t eseq = __errseq_set(&mapping->wb_err, err);
+ +      errseq_t eseq = errseq_set(&mapping->wb_err, err);
   
         trace_filemap_set_wb_err(mapping, eseq);
   }
@@@ -643,7 -656,8 +643,7 @@@ int file_write_and_wait_range(struct fi
         int err = 0, err2;
         struct address_space *mapping = file->f_mapping;
   
- -      if ((!dax_mapping(mapping) && mapping->nrpages) ||
- -          (dax_mapping(mapping) && mapping->nrexceptional)) {
+ +      if (mapping_needs_writeback(mapping)) {
                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                  WB_SYNC_ALL);
                 /* See comment of filemap_write_and_wait() */
@@@ -909,33 -923,13 +909,33 @@@ static void wake_up_page_bit(struct pag
         wait_queue_head_t *q = page_waitqueue(page);
         struct wait_page_key key;
         unsigned long flags;
+ +      wait_queue_entry_t bookmark;
   
         key.page = page;
         key.bit_nr = bit_nr;
         key.page_match = 0;
   
+ +      bookmark.flags = 0;
+ +      bookmark.private = NULL;
+ +      bookmark.func = NULL;
+ +      INIT_LIST_HEAD(&bookmark.entry);
+ +
         spin_lock_irqsave(&q->lock, flags);
- -      __wake_up_locked_key(q, TASK_NORMAL, &key);
+ +      __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ +
+ +      while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ +              /*
+ +               * Take a breather from holding the lock,
+ +               * allow pages that finish wake up asynchronously
+ +               * to acquire the lock and remove themselves
+ +               * from wait queue
+ +               */
+ +              spin_unlock_irqrestore(&q->lock, flags);
+ +              cpu_relax();
+ +              spin_lock_irqsave(&q->lock, flags);
+ +              __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ +      }
+ +
         /*
          * It is possible for other pages to have collided on the waitqueue
          * hash, so in that case check for a page match. That prevents a long-
@@@ -1047,7 -1041,7 +1047,7 @@@ void add_page_wait_queue(struct page *p
         unsigned long flags;
   
         spin_lock_irqsave(&q->lock, flags);
- -      __add_wait_queue(q, waiter);
+ +      __add_wait_queue_entry_tail(q, waiter);
         SetPageWaiters(page);
         spin_unlock_irqrestore(&q->lock, flags);
   }
@@@ -1572,29 -1566,23 +1572,29 @@@ export
   }
   
   /**
- - * find_get_pages - gang pagecache lookup
+ + * find_get_pages_range - gang pagecache lookup
    * @mapping:  The address_space to search
    * @start:    The starting page index
+ + * @end:      The final page index (inclusive)
    * @nr_pages: The maximum number of pages
    * @pages:    Where the resulting pages are placed
    *
- - * find_get_pages() will search for and return a group of up to
- - * @nr_pages pages in the mapping.  The pages are placed at @pages.
- - * find_get_pages() takes a reference against the returned pages.
+ + * find_get_pages_range() will search for and return a group of up to @nr_pages
+ + * pages in the mapping starting at index @start and up to index @end
+ + * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
+ + * a reference against the returned pages.
    *
    * The search returns a group of mapping-contiguous pages with ascending
    * indexes.  There may be holes in the indices due to not-present pages.
+ + * We also update @start to index the next page for the traversal.
    *
- - * find_get_pages() returns the number of pages which were found.
+ + * find_get_pages_range() returns the number of pages which were found. If this
+ + * number is smaller than @nr_pages, the end of specified range has been
+ + * reached.
    */
- -unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
- -                          unsigned int nr_pages, struct page **pages)
+ +unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
+ +                            pgoff_t end, unsigned int nr_pages,
+ +                            struct page **pages)
   {
         struct radix_tree_iter iter;
         void **slot;
@@@ -1604,11 -1592,8 +1604,11 @@@
                 return 0;
   
         rcu_read_lock();
- -      radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ +      radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
                 struct page *head, *page;
+ +
+ +              if (iter.index > end)
+ +                      break;
   repeat:
                 page = radix_tree_deref_slot(slot);
                 if (unlikely(!page))
@@@ -1644,25 -1629,11 +1644,25 @@@
                 }
   
                 pages[ret] = page;
- -              if (++ret == nr_pages)
- -                      break;
+ +              if (++ret == nr_pages) {
+ +                      *start = pages[ret - 1]->index + 1;
+ +                      goto out;
+ +              }
         }
   
+ +      /*
+ +       * We come here when there is no page beyond @end. We take care to not
+ +       * overflow the index @start as it confuses some of the callers. This
+ +       * breaks the iteration when there is page at index -1 but that is
+ +       * already broken anyway.
+ +       */
+ +      if (end == (pgoff_t)-1)
+ +              *start = (pgoff_t)-1;
+ +      else
+ +              *start = end + 1;
+ +out:
         rcu_read_unlock();
+ +
         return ret;
   }
   
@@@ -1917,9 -1888,8 +1917,8 @@@ static void shrink_readahead_size_eio(s
   }
   
   /**
-  * do_generic_file_read - generic file read routine
-  * @filp:     the file to read
-  * @ppos:     current file position
+  * generic_file_buffered_read - generic file read routine
+  * @iocb:     the iocb to read
    * @iter:     data destination
    * @written:  already copied
    *
@@@ -1929,12 -1899,14 +1928,14 @@@
    * This is really ugly. But the goto's actually try to clarify some
    * of the logic when it comes to error handling etc.
    */
- static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
+ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
                 struct iov_iter *iter, ssize_t written)
   {
+       struct file *filp = iocb->ki_filp;
         struct address_space *mapping = filp->f_mapping;
         struct inode *inode = mapping->host;
         struct file_ra_state *ra = &filp->f_ra;
+       loff_t *ppos = &iocb->ki_pos;
         pgoff_t index;
         pgoff_t last_index;
         pgoff_t prev_index;
@@@ -1967,6 -1939,8 +1968,8 @@@ find_page
   
                 page = find_get_page(mapping, index);
                 if (!page) {
+                       if (iocb->ki_flags & IOCB_NOWAIT)
+                               goto would_block;
                         page_cache_sync_readahead(mapping,
                                         ra, filp,
                                         index, last_index - index);
@@@ -1980,6 -1954,11 +1983,11 @@@
                                         index, last_index - index);
                 }
                 if (!PageUptodate(page)) {
+                       if (iocb->ki_flags & IOCB_NOWAIT) {
+                               put_page(page);
+                               goto would_block;
+                       }
+ 
                         /*
                          * See comment in do_read_cache_page on why
                          * wait_on_page_locked is used to avoid unnecessarily
@@@ -2161,6 -2140,8 +2169,8 @@@ no_cached_page
                 goto readpage;
         }
   
+ would_block:
+       error = -EAGAIN;
   out:
         ra->prev_pos = prev_index;
         ra->prev_pos <<= PAGE_SHIFT;
@@@ -2182,14 -2163,14 +2192,14 @@@
   ssize_t
   generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
   {
-       struct file *file = iocb->ki_filp;
-       ssize_t retval = 0;
         size_t count = iov_iter_count(iter);
+       ssize_t retval = 0;
   
         if (!count)
                 goto out; /* skip atime */
   
         if (iocb->ki_flags & IOCB_DIRECT) {
+               struct file *file = iocb->ki_filp;
                 struct address_space *mapping = file->f_mapping;
                 struct inode *inode = mapping->host;
                 loff_t size;
@@@ -2230,7 -2211,7 +2240,7 @@@
                         goto out;
         }
   
-       retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);
+       retval = generic_file_buffered_read(iocb, iter, retval);
   out:
         return retval;
   }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 15 Sep 2017 02:29:55 +0000 (19:29 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 15 Sep 2017 02:29:55 +0000 (19:29 -0700)
		1	2
fs/aio.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/block_dev.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_file.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/filemap.c	patch \|	diff1 \|	diff2 \|	blob \| history