diff --git a/mm/filemap.c b/mm/filemap.c
index 99c49ee..c178022 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
  *    ->swap_lock              (try_to_unmap_one)
  *    ->private_lock           (try_to_unmap_one)
  *    ->i_pages lock           (try_to_unmap_one)
- *    ->pgdat->lru_lock                (follow_page->mark_page_accessed)
- *    ->pgdat->lru_lock                (check_pte_range->isolate_lru_page)
+ *    ->lruvec->lru_lock       (follow_page->mark_page_accessed)
+ *    ->lruvec->lru_lock       (check_pte_range->isolate_lru_page)
  *    ->private_lock           (page_remove_rmap->set_page_dirty)
  *    ->i_pages lock           (page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
@@ -204,9 +204,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
        if (PageSwapBacked(page)) {
                __mod_lruvec_page_state(page, NR_SHMEM, -nr);
                if (PageTransHuge(page))
-                       __dec_node_page_state(page, NR_SHMEM_THPS);
+                       __dec_lruvec_page_state(page, NR_SHMEM_THPS);
        } else if (PageTransHuge(page)) {
-               __dec_node_page_state(page, NR_FILE_THPS);
+               __dec_lruvec_page_state(page, NR_FILE_THPS);
                filemap_nr_thps_dec(mapping);
        }
 
@@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping,
                freepage(page);
 
        if (PageTransHuge(page) && !PageHuge(page)) {
-               page_ref_sub(page, HPAGE_PMD_NR);
+               page_ref_sub(page, thp_nr_pages(page));
                VM_BUG_ON_PAGE(page_count(page) <= 0, page);
        } else {
                put_page(page);
@@ -414,7 +414,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                .range_end = end,
        };
 
-       if (!mapping_cap_writeback_dirty(mapping) ||
+       if (!mapping_can_writeback(mapping) ||
            !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;
 
@@ -827,15 +827,14 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
-static int __add_to_page_cache_locked(struct page *page,
-                                     struct address_space *mapping,
-                                     pgoff_t offset, gfp_t gfp_mask,
-                                     void **shadowp)
+noinline int __add_to_page_cache_locked(struct page *page,
+                                       struct address_space *mapping,
+                                       pgoff_t offset, gfp_t gfp,
+                                       void **shadowp)
 {
        XA_STATE(xas, &mapping->i_pages, offset);
        int huge = PageHuge(page);
        int error;
-       void *old;
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -846,25 +845,46 @@ static int __add_to_page_cache_locked(struct page *page,
        page->index = offset;
 
        if (!huge) {
-               error = mem_cgroup_charge(page, current->mm, gfp_mask);
+               error = mem_cgroup_charge(page, current->mm, gfp);
                if (error)
                        goto error;
        }
 
+       gfp &= GFP_RECLAIM_MASK;
+
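+       /*
+        * A shadow entry of higher order than the page being added may
+        * occupy this index.  Pre-allocate the nodes needed to split it
+        * outside the i_pages lock, then perform the split under the
+        * lock before storing the new page.
+        */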
        do {
+               unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+               void *entry, *old = NULL;
+
+               if (order > thp_order(page))
+                       xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+                                       order, gfp);
                xas_lock_irq(&xas);
-               old = xas_load(&xas);
-               if (old && !xa_is_value(old))
-                       xas_set_err(&xas, -EEXIST);
+               xas_for_each_conflict(&xas, entry) {
+                       old = entry;
+                       if (!xa_is_value(entry)) {
+                               xas_set_err(&xas, -EEXIST);
+                               goto unlock;
+                       }
+               }
+
+               if (old) {
+                       if (shadowp)
+                               *shadowp = old;
+                       /* entry may have been split before we acquired lock */
+                       order = xa_get_order(xas.xa, xas.xa_index);
+                       if (order > thp_order(page)) {
+                               xas_split(&xas, old, order);
+                               xas_reset(&xas);
+                       }
+               }
+
                xas_store(&xas, page);
                if (xas_error(&xas))
                        goto unlock;
 
-               if (xa_is_value(old)) {
+               if (old)
                        mapping->nrexceptional--;
-                       if (shadowp)
-                               *shadowp = old;
-               }
                mapping->nrpages++;
 
                /* hugetlb pages do not participate in page cache accounting */
@@ -872,7 +892,7 @@ static int __add_to_page_cache_locked(struct page *page,
                        __inc_lruvec_page_state(page, NR_FILE_PAGES);
 unlock:
                xas_unlock_irq(&xas);
-       } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+       } while (xas_nomem(&xas, gfp));
 
        if (xas_error(&xas)) {
                error = xas_error(&xas);
@@ -1425,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
  * unlock_page - unlock a locked page
  * @page: the page
  *
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
  * Also wakes sleepers in wait_on_page_writeback() because the wakeup
  * mechanism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@ -1464,11 +1484,19 @@ void end_page_writeback(struct page *page)
                rotate_reclaimable_page(page);
        }
 
+       /*
+        * Writeback does not hold a page reference of its own, relying
+        * on truncation to wait for the clearing of PG_writeback.
+        * But here we must make sure that the page is not freed and
+        * reused before the wake_up_page().
+        */
+       get_page(page);
        if (!test_clear_page_writeback(page))
                BUG();
 
        smp_mb__after_atomic();
        wake_up_page(page, PG_writeback);
+       put_page(page);
 }
 EXPORT_SYMBOL(end_page_writeback);
 
@@ -1555,19 +1583,20 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                else
                        wait_on_page_locked(page);
                return 0;
-       } else {
-               if (flags & FAULT_FLAG_KILLABLE) {
-                       int ret;
+       }
+       if (flags & FAULT_FLAG_KILLABLE) {
+               int ret;
 
-                       ret = __lock_page_killable(page);
-                       if (ret) {
-                               mmap_read_unlock(mm);
-                               return 0;
-                       }
-               } else
-                       __lock_page(page);
-               return 1;
+               ret = __lock_page_killable(page);
+               if (ret) {
+                       mmap_read_unlock(mm);
+                       return 0;
+               }
+       } else {
+               __lock_page(page);
        }
+       return 1;
 }
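+/*
+ * Illustrative only, not part of this change: the return contract is
+ * unchanged.  1 means the page is now locked; 0 means it is not, and a
+ * fault-path caller such as do_swap_page() typically reacts with:
+ *
+ *	locked = lock_page_or_retry(page, mm, flags);
+ *	if (!locked) {
+ *		ret |= VM_FAULT_RETRY;
+ *		goto out;
+ *	}
+ */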
 
 /**
@@ -1645,19 +1674,19 @@ EXPORT_SYMBOL(page_cache_prev_miss);
 /**
  * find_get_entry - find and get a page cache entry
  * @mapping: the address_space to search
- * @offset: the page cache index
+ * @index: The page cache index.
  *
  * Looks up the page cache slot at @mapping & @offset.  If there is a
- * page cache page, it is returned with an increased refcount.
+ * page cache page, the head page is returned with an increased refcount.
  *
  * If the slot holds a shadow entry of a previously evicted page, or a
  * swap entry from shmem/tmpfs, it is returned.
  *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
  */
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
 {
-       XA_STATE(xas, &mapping->i_pages, offset);
+       XA_STATE(xas, &mapping->i_pages, index);
        struct page *page;
 
        rcu_read_lock();
@@ -1685,7 +1714,6 @@ repeat:
                put_page(page);
                goto repeat;
        }
-       page = find_subpage(page, offset);
 out:
        rcu_read_unlock();
 
@@ -1693,40 +1721,37 @@ out:
 }
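+/*
+ * Illustrative only, not part of this change: find_get_entry() now
+ * returns the head page of a compound page, so a caller that needs the
+ * exact subpage for @index has to resolve it itself, e.g.
+ *
+ *	page = find_get_entry(mapping, index);
+ *	if (page && !xa_is_value(page))
+ *		page = find_subpage(page, index);
+ */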
 
 /**
- * find_lock_entry - locate, pin and lock a page cache entry
- * @mapping: the address_space to search
- * @offset: the page cache index
+ * find_lock_entry - Locate and lock a page cache entry.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
  *
- * Looks up the page cache slot at @mapping & @offset.  If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * Looks up the page at @mapping & @index.  If there is a page in the
+ * cache, the head page is returned locked and with an increased refcount.
  *
  * If the slot holds a shadow entry of a previously evicted page, or a
  * swap entry from shmem/tmpfs, it is returned.
  *
- * find_lock_entry() may sleep.
- *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Context: May sleep.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
  */
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
 {
        struct page *page;
 
 repeat:
-       page = find_get_entry(mapping, offset);
+       page = find_get_entry(mapping, index);
        if (page && !xa_is_value(page)) {
                lock_page(page);
                /* Has the page been truncated? */
-               if (unlikely(page_mapping(page) != mapping)) {
+               if (unlikely(page->mapping != mapping)) {
                        unlock_page(page);
                        put_page(page);
                        goto repeat;
                }
-               VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+               VM_BUG_ON_PAGE(!thp_contains(page, index), page);
        }
        return page;
 }
-EXPORT_SYMBOL(find_lock_entry);
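+/*
+ * Illustrative only, not part of this change: with the FGP_HEAD flag
+ * documented below, a caller that wants the locked head page (roughly
+ * the find_lock_entry() behaviour minus the shadow-entry handling) can
+ * use pagecache_get_page() directly, e.g.
+ *
+ *	page = pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
+ */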
 
 /**
  * pagecache_get_page - Find and get a reference to a page.
@@ -1741,6 +1766,8 @@ EXPORT_SYMBOL(find_lock_entry);
  *
  * * %FGP_ACCESSED - The page will be marked accessed.
  * * %FGP_LOCK - The page is returned locked.
+ * * %FGP_HEAD - If the page is present and a THP, return the head page
+ *   rather than the exact page specified by the index.
  * * %FGP_CREAT - If no page is present then a new page is allocated using
  *   @gfp_mask and added to the page cache and the VM's LRU list.
  *   The page is returned locked and with an increased refcount.
@@ -1781,12 +1808,12 @@ repeat:
                }
 
                /* Has the page been truncated? */
-               if (unlikely(compound_head(page)->mapping != mapping)) {
+               if (unlikely(page->mapping != mapping)) {
                        unlock_page(page);
                        put_page(page);
                        goto repeat;
                }
-               VM_BUG_ON_PAGE(page->index != index, page);
+               VM_BUG_ON_PAGE(!thp_contains(page, index), page);
        }
 
        if (fgp_flags & FGP_ACCESSED)
@@ -1796,11 +1823,13 @@ repeat:
                if (page_is_idle(page))
                        clear_page_idle(page);
        }
+       if (!(fgp_flags & FGP_HEAD))
+               page = find_subpage(page, index);
 
 no_page:
        if (!page && (fgp_flags & FGP_CREAT)) {
                int err;
-               if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+               if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
                        gfp_mask |= __GFP_WRITE;
                if (fgp_flags & FGP_NOFS)
                        gfp_mask &= ~__GFP_FS;
@@ -2138,6 +2167,259 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
        ra->ra_pages /= 4;
 }
 
+static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
+{
+       if (iocb->ki_flags & IOCB_WAITQ)
+               return lock_page_async(page, iocb->ki_waitq);
+       else if (iocb->ki_flags & IOCB_NOWAIT)
+               return trylock_page(page) ? 0 : -EAGAIN;
+       else
+               return lock_page_killable(page);
+}
+
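+/*
+ * Kick off ->readpage() and, if the read does not complete
+ * synchronously, wait for it as the iocb's flags allow.  Returns the
+ * page on success, an ERR_PTR() on failure, or NULL when the page was
+ * invalidated (or ->readpage() returned AOP_TRUNCATED_PAGE) and the
+ * caller should retry the lookup.
+ */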
+static struct page *
+generic_file_buffered_read_readpage(struct kiocb *iocb,
+                                   struct file *filp,
+                                   struct address_space *mapping,
+                                   struct page *page)
+{
+       struct file_ra_state *ra = &filp->f_ra;
+       int error;
+
+       if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+               unlock_page(page);
+               put_page(page);
+               return ERR_PTR(-EAGAIN);
+       }
+
+       /*
+        * A previous I/O error may have been due to temporary
+        * failures, e.g. multipath errors.
+        * PG_error will be set again if readpage fails.
+        */
+       ClearPageError(page);
+       /* Start the actual read. The read will unlock the page. */
+       error = mapping->a_ops->readpage(filp, page);
+
+       if (unlikely(error)) {
+               put_page(page);
+               return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
+       }
+
+       if (!PageUptodate(page)) {
+               error = lock_page_for_iocb(iocb, page);
+               if (unlikely(error)) {
+                       put_page(page);
+                       return ERR_PTR(error);
+               }
+               if (!PageUptodate(page)) {
+                       if (page->mapping == NULL) {
+                               /*
+                                * invalidate_mapping_pages got it
+                                */
+                               unlock_page(page);
+                               put_page(page);
+                               return NULL;
+                       }
+                       unlock_page(page);
+                       shrink_readahead_size_eio(ra);
+                       put_page(page);
+                       return ERR_PTR(-EIO);
+               }
+               unlock_page(page);
+       }
+
+       return page;
+}
+
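+/*
+ * Handle a cached page that is not (yet) Uptodate: wait for the lock
+ * holder as the iocb allows, accept a partially uptodate page when the
+ * filesystem supports it, and otherwise lock the page and fall back to
+ * ->readpage().  Returns the page, an ERR_PTR() on failure, or NULL if
+ * the page was truncated and the caller should retry the lookup.
+ */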
+static struct page *
+generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
+                                          struct file *filp,
+                                          struct iov_iter *iter,
+                                          struct page *page,
+                                          loff_t pos, loff_t count)
+{
+       struct address_space *mapping = filp->f_mapping;
+       struct inode *inode = mapping->host;
+       int error;
+
+       /*
+        * See comment in do_read_cache_page on why
+        * wait_on_page_locked is used to avoid unnecessary
+        * serialisations and why it's safe.
+        */
+       if (iocb->ki_flags & IOCB_WAITQ) {
+               error = wait_on_page_locked_async(page,
+                                               iocb->ki_waitq);
+       } else {
+               error = wait_on_page_locked_killable(page);
+       }
+       if (unlikely(error)) {
+               put_page(page);
+               return ERR_PTR(error);
+       }
+       if (PageUptodate(page))
+               return page;
+
+       if (inode->i_blkbits == PAGE_SHIFT ||
+                       !mapping->a_ops->is_partially_uptodate)
+               goto page_not_up_to_date;
+       /* pipes can't handle partially uptodate pages */
+       if (unlikely(iov_iter_is_pipe(iter)))
+               goto page_not_up_to_date;
+       if (!trylock_page(page))
+               goto page_not_up_to_date;
+       /* Did it get truncated before we got the lock? */
+       if (!page->mapping)
+               goto page_not_up_to_date_locked;
+       if (!mapping->a_ops->is_partially_uptodate(page,
+                               pos & ~PAGE_MASK, count))
+               goto page_not_up_to_date_locked;
+       unlock_page(page);
+       return page;
+
+page_not_up_to_date:
+       /* Get exclusive access to the page ... */
+       error = lock_page_for_iocb(iocb, page);
+       if (unlikely(error)) {
+               put_page(page);
+               return ERR_PTR(error);
+       }
+
+page_not_up_to_date_locked:
+       /* Did it get truncated before we got the lock? */
+       if (!page->mapping) {
+               unlock_page(page);
+               put_page(page);
+               return NULL;
+       }
+
+       /* Did somebody else fill it already? */
+       if (PageUptodate(page)) {
+               unlock_page(page);
+               return page;
+       }
+
+       return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+}
+
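+/*
+ * Allocate a page, insert it into the page cache and LRU, and start the
+ * read.  Returns the page, an ERR_PTR() on failure, or NULL if another
+ * task beat us to the insertion (-EEXIST) and the caller should retry.
+ */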
+static struct page *
+generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
+                                         struct iov_iter *iter)
+{
+       struct file *filp = iocb->ki_filp;
+       struct address_space *mapping = filp->f_mapping;
+       pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+       struct page *page;
+       int error;
+
+       if (iocb->ki_flags & IOCB_NOIO)
+               return ERR_PTR(-EAGAIN);
+
+       /*
+        * Ok, it wasn't cached, so we need to create a new
+        * page..
+        */
+       page = page_cache_alloc(mapping);
+       if (!page)
+               return ERR_PTR(-ENOMEM);
+
+       error = add_to_page_cache_lru(page, mapping, index,
+                                     mapping_gfp_constraint(mapping, GFP_KERNEL));
+       if (error) {
+               put_page(page);
+               return error != -EEXIST ? ERR_PTR(error) : NULL;
+       }
+
+       return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+}
+
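+/*
+ * Fill @pages with up to @nr contiguous pages covering the iocb's
+ * range, doing readahead, page-cache insertion and reads as needed so
+ * the pages can be copied from.  Returns the number of pages placed in
+ * @pages (never 0) or a negative errno.
+ */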
+static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
+                                               struct iov_iter *iter,
+                                               struct page **pages,
+                                               unsigned int nr)
+{
+       struct file *filp = iocb->ki_filp;
+       struct address_space *mapping = filp->f_mapping;
+       struct file_ra_state *ra = &filp->f_ra;
+       pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+       pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
+       int i, j, nr_got, err = 0;
+
+       nr = min_t(unsigned long, last_index - index, nr);
+find_page:
+       if (fatal_signal_pending(current))
+               return -EINTR;
+
+       nr_got = find_get_pages_contig(mapping, index, nr, pages);
+       if (nr_got)
+               goto got_pages;
+
+       if (iocb->ki_flags & IOCB_NOIO)
+               return -EAGAIN;
+
+       page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
+
+       nr_got = find_get_pages_contig(mapping, index, nr, pages);
+       if (nr_got)
+               goto got_pages;
+
+       pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
+       err = PTR_ERR_OR_ZERO(pages[0]);
+       if (!IS_ERR_OR_NULL(pages[0]))
+               nr_got = 1;
+got_pages:
+       for (i = 0; i < nr_got; i++) {
+               struct page *page = pages[i];
+               pgoff_t pg_index = index + i;
+               loff_t pg_pos = max(iocb->ki_pos,
+                                   (loff_t) pg_index << PAGE_SHIFT);
+               loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
+
+               if (PageReadahead(page)) {
+                       if (iocb->ki_flags & IOCB_NOIO) {
+                               for (j = i; j < nr_got; j++)
+                                       put_page(pages[j]);
+                               nr_got = i;
+                               err = -EAGAIN;
+                               break;
+                       }
+                       page_cache_async_readahead(mapping, ra, filp, page,
+                                       pg_index, last_index - pg_index);
+               }
+
+               if (!PageUptodate(page)) {
+                       if ((iocb->ki_flags & IOCB_NOWAIT) ||
+                           ((iocb->ki_flags & IOCB_WAITQ) && i)) {
+                               for (j = i; j < nr_got; j++)
+                                       put_page(pages[j]);
+                               nr_got = i;
+                               err = -EAGAIN;
+                               break;
+                       }
+
+                       page = generic_file_buffered_read_pagenotuptodate(iocb,
+                                       filp, iter, page, pg_pos, pg_count);
+                       if (IS_ERR_OR_NULL(page)) {
+                               for (j = i + 1; j < nr_got; j++)
+                                       put_page(pages[j]);
+                               nr_got = i;
+                               err = PTR_ERR_OR_ZERO(page);
+                               break;
+                       }
+               }
+       }
+
+       if (likely(nr_got))
+               return nr_got;
+       if (err)
+               return err;
+       /*
+        * No pages and no error means we raced and should retry:
+        */
+       goto find_page;
+}
+
 /**
  * generic_file_buffered_read - generic file read routine
  * @iocb:      the iocb to read
@@ -2158,276 +2440,117 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
                struct iov_iter *iter, ssize_t written)
 {
        struct file *filp = iocb->ki_filp;
+       struct file_ra_state *ra = &filp->f_ra;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
-       struct file_ra_state *ra = &filp->f_ra;
-       loff_t *ppos = &iocb->ki_pos;
-       pgoff_t index;
-       pgoff_t last_index;
-       pgoff_t prev_index;
-       unsigned long offset;      /* offset into pagecache page */
-       unsigned int prev_offset;
-       int error = 0;
-
-       if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+       struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
+       unsigned int nr_pages = min_t(unsigned int, 512,
+                       ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+                       (iocb->ki_pos >> PAGE_SHIFT));
+       int i, pg_nr, error = 0;
+       bool writably_mapped;
+       loff_t isize, end_offset;
+
+       if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
                return 0;
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
 
-       index = *ppos >> PAGE_SHIFT;
-       prev_index = ra->prev_pos >> PAGE_SHIFT;
-       prev_offset = ra->prev_pos & (PAGE_SIZE-1);
-       last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
-       offset = *ppos & ~PAGE_MASK;
+       if (nr_pages > ARRAY_SIZE(pages_onstack))
+               pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
 
-       for (;;) {
-               struct page *page;
-               pgoff_t end_index;
-               loff_t isize;
-               unsigned long nr, ret;
+       if (!pages) {
+               pages = pages_onstack;
+               nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
+       }
 
+       do {
                cond_resched();
-find_page:
-               if (fatal_signal_pending(current)) {
-                       error = -EINTR;
-                       goto out;
-               }
 
-               page = find_get_page(mapping, index);
-               if (!page) {
-                       if (iocb->ki_flags & IOCB_NOIO)
-                               goto would_block;
-                       page_cache_sync_readahead(mapping,
-                                       ra, filp,
-                                       index, last_index - index);
-                       page = find_get_page(mapping, index);
-                       if (unlikely(page == NULL))
-                               goto no_cached_page;
-               }
-               if (PageReadahead(page)) {
-                       if (iocb->ki_flags & IOCB_NOIO) {
-                               put_page(page);
-                               goto out;
-                       }
-                       page_cache_async_readahead(mapping,
-                                       ra, filp, page,
-                                       index, last_index - index);
-               }
-               if (!PageUptodate(page)) {
-                       /*
-                        * See comment in do_read_cache_page on why
-                        * wait_on_page_locked is used to avoid unnecessarily
-                        * serialisations and why it's safe.
-                        */
-                       if (iocb->ki_flags & IOCB_WAITQ) {
-                               if (written) {
-                                       put_page(page);
-                                       goto out;
-                               }
-                               error = wait_on_page_locked_async(page,
-                                                               iocb->ki_waitq);
-                       } else {
-                               if (iocb->ki_flags & IOCB_NOWAIT) {
-                                       put_page(page);
-                                       goto would_block;
-                               }
-                               error = wait_on_page_locked_killable(page);
-                       }
-                       if (unlikely(error))
-                               goto readpage_error;
-                       if (PageUptodate(page))
-                               goto page_ok;
-
-                       if (inode->i_blkbits == PAGE_SHIFT ||
-                                       !mapping->a_ops->is_partially_uptodate)
-                               goto page_not_up_to_date;
-                       /* pipes can't handle partially uptodate pages */
-                       if (unlikely(iov_iter_is_pipe(iter)))
-                               goto page_not_up_to_date;
-                       if (!trylock_page(page))
-                               goto page_not_up_to_date;
-                       /* Did it get truncated before we got the lock? */
-                       if (!page->mapping)
-                               goto page_not_up_to_date_locked;
-                       if (!mapping->a_ops->is_partially_uptodate(page,
-                                                       offset, iter->count))
-                               goto page_not_up_to_date_locked;
-                       unlock_page(page);
+               /*
+                * If we've already successfully copied some data, then we
+                * can no longer safely return -EIOCBQUEUED. Hence mark
+                * an async read NOWAIT at that point.
+                */
+               if ((iocb->ki_flags & IOCB_WAITQ) && written)
+                       iocb->ki_flags |= IOCB_NOWAIT;
+
+               i = 0;
+               pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
+                                                            pages, nr_pages);
+               if (pg_nr < 0) {
+                       error = pg_nr;
+                       break;
                }
-page_ok:
+
                /*
-                * i_size must be checked after we know the page is Uptodate.
+                * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct value for "nr", which means the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */
-
                isize = i_size_read(inode);
-               end_index = (isize - 1) >> PAGE_SHIFT;
-               if (unlikely(!isize || index > end_index)) {
-                       put_page(page);
-                       goto out;
-               }
+               if (unlikely(iocb->ki_pos >= isize))
+                       goto put_pages;
 
-               /* nr is the maximum number of bytes to copy from this page */
-               nr = PAGE_SIZE;
-               if (index == end_index) {
-                       nr = ((isize - 1) & ~PAGE_MASK) + 1;
-                       if (nr <= offset) {
-                               put_page(page);
-                               goto out;
-                       }
-               }
-               nr = nr - offset;
+               end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
 
-               /* If users can be writing to this page using arbitrary
-                * virtual addresses, take care about potential aliasing
-                * before reading the page on the kernel side.
-                */
-               if (mapping_writably_mapped(mapping))
-                       flush_dcache_page(page);
+               while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
+                      (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
+                       put_page(pages[--pg_nr]);
 
                /*
-                * When a sequential read accesses a page several times,
-                * only mark it as accessed the first time.
+                * Once we start copying data, we don't want to be touching any
+                * cachelines that might be contended:
                 */
-               if (prev_index != index || offset != prev_offset)
-                       mark_page_accessed(page);
-               prev_index = index;
+               writably_mapped = mapping_writably_mapped(mapping);
 
                /*
-                * Ok, we have the page, and it's up-to-date, so
-                * now we can copy it to user space...
+                * When a sequential read accesses a page several times, only
+                * mark it as accessed the first time.
                 */
+               if (iocb->ki_pos >> PAGE_SHIFT !=
+                   ra->prev_pos >> PAGE_SHIFT)
+                       mark_page_accessed(pages[0]);
+               for (i = 1; i < pg_nr; i++)
+                       mark_page_accessed(pages[i]);
+
+               for (i = 0; i < pg_nr; i++) {
+                       unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
+                       unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
+                                                  PAGE_SIZE - offset);
+                       unsigned int copied;
 
-               ret = copy_page_to_iter(page, offset, nr, iter);
-               offset += ret;
-               index += offset >> PAGE_SHIFT;
-               offset &= ~PAGE_MASK;
-               prev_offset = offset;
-
-               put_page(page);
-               written += ret;
-               if (!iov_iter_count(iter))
-                       goto out;
-               if (ret < nr) {
-                       error = -EFAULT;
-                       goto out;
-               }
-               continue;
-
-page_not_up_to_date:
-               /* Get exclusive access to the page ... */
-               if (iocb->ki_flags & IOCB_WAITQ)
-                       error = lock_page_async(page, iocb->ki_waitq);
-               else
-                       error = lock_page_killable(page);
-               if (unlikely(error))
-                       goto readpage_error;
-
-page_not_up_to_date_locked:
-               /* Did it get truncated before we got the lock? */
-               if (!page->mapping) {
-                       unlock_page(page);
-                       put_page(page);
-                       continue;
-               }
+                       /*
+                        * If users can be writing to this page using arbitrary
+                        * virtual addresses, take care about potential aliasing
+                        * before reading the page on the kernel side.
+                        */
+                       if (writably_mapped)
+                               flush_dcache_page(pages[i]);
 
-               /* Did somebody else fill it already? */
-               if (PageUptodate(page)) {
-                       unlock_page(page);
-                       goto page_ok;
-               }
+                       copied = copy_page_to_iter(pages[i], offset, bytes, iter);
 
-readpage:
-               if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto would_block;
-               }
-               /*
-                * A previous I/O error may have been due to temporary
-                * failures, eg. multipath errors.
-                * PG_error will be set again if readpage fails.
-                */
-               ClearPageError(page);
-               /* Start the actual read. The read will unlock the page. */
-               error = mapping->a_ops->readpage(filp, page);
+                       written += copied;
+                       iocb->ki_pos += copied;
+                       ra->prev_pos = iocb->ki_pos;
 
-               if (unlikely(error)) {
-                       if (error == AOP_TRUNCATED_PAGE) {
-                               put_page(page);
-                               error = 0;
-                               goto find_page;
-                       }
-                       goto readpage_error;
-               }
-
-               if (!PageUptodate(page)) {
-                       if (iocb->ki_flags & IOCB_WAITQ)
-                               error = lock_page_async(page, iocb->ki_waitq);
-                       else
-                               error = lock_page_killable(page);
-
-                       if (unlikely(error))
-                               goto readpage_error;
-                       if (!PageUptodate(page)) {
-                               if (page->mapping == NULL) {
-                                       /*
-                                        * invalidate_mapping_pages got it
-                                        */
-                                       unlock_page(page);
-                                       put_page(page);
-                                       goto find_page;
-                               }
-                               unlock_page(page);
-                               shrink_readahead_size_eio(ra);
-                               error = -EIO;
-                               goto readpage_error;
+                       if (copied < bytes) {
+                               error = -EFAULT;
+                               break;
                        }
-                       unlock_page(page);
                }
+put_pages:
+               for (i = 0; i < pg_nr; i++)
+                       put_page(pages[i]);
+       } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
 
-               goto page_ok;
-
-readpage_error:
-               /* UHHUH! A synchronous read error occurred. Report it */
-               put_page(page);
-               goto out;
-
-no_cached_page:
-               /*
-                * Ok, it wasn't cached, so we need to create a new
-                * page..
-                */
-               page = page_cache_alloc(mapping);
-               if (!page) {
-                       error = -ENOMEM;
-                       goto out;
-               }
-               error = add_to_page_cache_lru(page, mapping, index,
-                               mapping_gfp_constraint(mapping, GFP_KERNEL));
-               if (error) {
-                       put_page(page);
-                       if (error == -EEXIST) {
-                               error = 0;
-                               goto find_page;
-                       }
-                       goto out;
-               }
-               goto readpage;
-       }
+       file_accessed(filp);
 
-would_block:
-       error = -EAGAIN;
-out:
-       ra->prev_pos = prev_index;
-       ra->prev_pos <<= PAGE_SHIFT;
-       ra->prev_pos |= prev_offset;
+       if (pages != pages_onstack)
+               kfree(pages);
 
-       *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
-       file_accessed(filp);
        return written ? written : error;
 }
 EXPORT_SYMBOL_GPL(generic_file_buffered_read);
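+/*
+ * Illustrative only, not part of this change: filesystems normally reach
+ * the buffered read loop above via generic_file_read_iter(), e.g. by
+ * pointing their file_operations at it (myfs_file_ops is hypothetical):
+ *
+ *	const struct file_operations myfs_file_ops = {
+ *		.read_iter	= generic_file_read_iter,
+ *	};
+ */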
@@ -2568,8 +2691,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
+       DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
        struct file *fpin = NULL;
-       pgoff_t offset = vmf->pgoff;
        unsigned int mmap_miss;
 
        /* If we don't want any read-ahead, don't bother */
@@ -2580,8 +2703,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 
        if (vmf->vma->vm_flags & VM_SEQ_READ) {
                fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-               page_cache_sync_readahead(mapping, ra, file, offset,
-                                         ra->ra_pages);
+               page_cache_sync_ra(&ractl, ra, ra->ra_pages);
                return fpin;
        }
 
@@ -2601,10 +2723,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
         * mmap read-around
         */
        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-       ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+       ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
        ra->size = ra->ra_pages;
        ra->async_size = ra->ra_pages / 4;
-       ra_submit(ra, mapping, file);
+       ractl._index = ra->start;
+       do_page_cache_ra(&ractl, ra->size, ra->async_size);
        return fpin;
 }
 
@@ -2793,42 +2916,42 @@ void filemap_map_pages(struct vm_fault *vmf,
        pgoff_t last_pgoff = start_pgoff;
        unsigned long max_idx;
        XA_STATE(xas, &mapping->i_pages, start_pgoff);
-       struct page *page;
+       struct page *head, *page;
        unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
 
        rcu_read_lock();
-       xas_for_each(&xas, page, end_pgoff) {
-               if (xas_retry(&xas, page))
+       xas_for_each(&xas, head, end_pgoff) {
+               if (xas_retry(&xas, head))
                        continue;
-               if (xa_is_value(page))
+               if (xa_is_value(head))
                        goto next;
 
                /*
                 * Check for a locked page first, as a speculative
                 * reference may adversely influence page migration.
                 */
-               if (PageLocked(page))
+               if (PageLocked(head))
                        goto next;
-               if (!page_cache_get_speculative(page))
+               if (!page_cache_get_speculative(head))
                        goto next;
 
                /* Has the page moved or been split? */
-               if (unlikely(page != xas_reload(&xas)))
+               if (unlikely(head != xas_reload(&xas)))
                        goto skip;
-               page = find_subpage(page, xas.xa_index);
+               page = find_subpage(head, xas.xa_index);
 
-               if (!PageUptodate(page) ||
+               if (!PageUptodate(head) ||
                                PageReadahead(page) ||
                                PageHWPoison(page))
                        goto skip;
-               if (!trylock_page(page))
+               if (!trylock_page(head))
                        goto skip;
 
-               if (page->mapping != mapping || !PageUptodate(page))
+               if (head->mapping != mapping || !PageUptodate(head))
                        goto unlock;
 
                max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
-               if (page->index >= max_idx)
+               if (xas.xa_index >= max_idx)
                        goto unlock;
 
                if (mmap_miss > 0)
@@ -2840,12 +2963,12 @@ void filemap_map_pages(struct vm_fault *vmf,
                last_pgoff = xas.xa_index;
                if (alloc_set_pte(vmf, page))
                        goto unlock;
-               unlock_page(page);
+               unlock_page(head);
                goto next;
 unlock:
-               unlock_page(page);
+               unlock_page(head);
 skip:
-               put_page(page);
+               put_page(head);
 next:
                /* Huge page is mapped? No need to proceed. */
                if (pmd_trans_huge(*vmf->pmd))
@@ -2984,7 +3107,7 @@ filler:
                goto out;
 
        /*
-        * Page is not up to date and may be locked due one of the following
+        * Page is not up to date and may be locked due to one of the following
         * case a: Page is being filled and the page lock is held
         * case b: Read/write error clearing the page uptodate status
         * case c: Truncation in progress (page locked)
@@ -3093,228 +3216,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
 }
 EXPORT_SYMBOL(read_cache_page_gfp);
 
-/*
- * Don't operate on ranges the page cache doesn't support, and don't exceed the
- * LFS limits.  If pos is under the limit it becomes a short access.  If it
- * exceeds the limit we return -EFBIG.
- */
-static int generic_write_check_limits(struct file *file, loff_t pos,
-                                     loff_t *count)
-{
-       struct inode *inode = file->f_mapping->host;
-       loff_t max_size = inode->i_sb->s_maxbytes;
-       loff_t limit = rlimit(RLIMIT_FSIZE);
-
-       if (limit != RLIM_INFINITY) {
-               if (pos >= limit) {
-                       send_sig(SIGXFSZ, current, 0);
-                       return -EFBIG;
-               }
-               *count = min(*count, limit - pos);
-       }
-
-       if (!(file->f_flags & O_LARGEFILE))
-               max_size = MAX_NON_LFS;
-
-       if (unlikely(pos >= max_size))
-               return -EFBIG;
-
-       *count = min(*count, max_size - pos);
-
-       return 0;
-}
-
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       loff_t count;
-       int ret;
-
-       if (IS_SWAPFILE(inode))
-               return -ETXTBSY;
-
-       if (!iov_iter_count(from))
-               return 0;
-
-       /* FIXME: this is for backwards compatibility with 2.4 */
-       if (iocb->ki_flags & IOCB_APPEND)
-               iocb->ki_pos = i_size_read(inode);
-
-       if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
-               return -EINVAL;
-
-       count = iov_iter_count(from);
-       ret = generic_write_check_limits(file, iocb->ki_pos, &count);
-       if (ret)
-               return ret;
-
-       iov_iter_truncate(from, count);
-       return iov_iter_count(from);
-}
-EXPORT_SYMBOL(generic_write_checks);
-
-/*
- * Performs necessary checks before doing a clone.
- *
- * Can adjust amount of bytes to clone via @req_count argument.
- * Returns appropriate error code that caller should return or
- * zero in case the clone should be allowed.
- */
-int generic_remap_checks(struct file *file_in, loff_t pos_in,
-                        struct file *file_out, loff_t pos_out,
-                        loff_t *req_count, unsigned int remap_flags)
-{
-       struct inode *inode_in = file_in->f_mapping->host;
-       struct inode *inode_out = file_out->f_mapping->host;
-       uint64_t count = *req_count;
-       uint64_t bcount;
-       loff_t size_in, size_out;
-       loff_t bs = inode_out->i_sb->s_blocksize;
-       int ret;
-
-       /* The start of both ranges must be aligned to an fs block. */
-       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
-               return -EINVAL;
-
-       /* Ensure offsets don't wrap. */
-       if (pos_in + count < pos_in || pos_out + count < pos_out)
-               return -EINVAL;
-
-       size_in = i_size_read(inode_in);
-       size_out = i_size_read(inode_out);
-
-       /* Dedupe requires both ranges to be within EOF. */
-       if ((remap_flags & REMAP_FILE_DEDUP) &&
-           (pos_in >= size_in || pos_in + count > size_in ||
-            pos_out >= size_out || pos_out + count > size_out))
-               return -EINVAL;
-
-       /* Ensure the infile range is within the infile. */
-       if (pos_in >= size_in)
-               return -EINVAL;
-       count = min(count, size_in - (uint64_t)pos_in);
-
-       ret = generic_write_check_limits(file_out, pos_out, &count);
-       if (ret)
-               return ret;
-
-       /*
-        * If the user wanted us to link to the infile's EOF, round up to the
-        * next block boundary for this check.
-        *
-        * Otherwise, make sure the count is also block-aligned, having
-        * already confirmed the starting offsets' block alignment.
-        */
-       if (pos_in + count == size_in) {
-               bcount = ALIGN(size_in, bs) - pos_in;
-       } else {
-               if (!IS_ALIGNED(count, bs))
-                       count = ALIGN_DOWN(count, bs);
-               bcount = count;
-       }
-
-       /* Don't allow overlapped cloning within the same file. */
-       if (inode_in == inode_out &&
-           pos_out + bcount > pos_in &&
-           pos_out < pos_in + bcount)
-               return -EINVAL;
-
-       /*
-        * We shortened the request but the caller can't deal with that, so
-        * bounce the request back to userspace.
-        */
-       if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
-               return -EINVAL;
-
-       *req_count = count;
-       return 0;
-}
-
-
-/*
- * Performs common checks before doing a file copy/clone
- * from @file_in to @file_out.
- */
-int generic_file_rw_checks(struct file *file_in, struct file *file_out)
-{
-       struct inode *inode_in = file_inode(file_in);
-       struct inode *inode_out = file_inode(file_out);
-
-       /* Don't copy dirs, pipes, sockets... */
-       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-               return -EISDIR;
-       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-               return -EINVAL;
-
-       if (!(file_in->f_mode & FMODE_READ) ||
-           !(file_out->f_mode & FMODE_WRITE) ||
-           (file_out->f_flags & O_APPEND))
-               return -EBADF;
-
-       return 0;
-}
-
-/*
- * Performs necessary checks before doing a file copy
- *
- * Can adjust amount of bytes to copy via @req_count argument.
- * Returns appropriate error code that caller should return or
- * zero in case the copy should be allowed.
- */
-int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
-                            struct file *file_out, loff_t pos_out,
-                            size_t *req_count, unsigned int flags)
-{
-       struct inode *inode_in = file_inode(file_in);
-       struct inode *inode_out = file_inode(file_out);
-       uint64_t count = *req_count;
-       loff_t size_in;
-       int ret;
-
-       ret = generic_file_rw_checks(file_in, file_out);
-       if (ret)
-               return ret;
-
-       /* Don't touch certain kinds of inodes */
-       if (IS_IMMUTABLE(inode_out))
-               return -EPERM;
-
-       if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
-               return -ETXTBSY;
-
-       /* Ensure offsets don't wrap. */
-       if (pos_in + count < pos_in || pos_out + count < pos_out)
-               return -EOVERFLOW;
-
-       /* Shorten the copy to EOF */
-       size_in = i_size_read(inode_in);
-       if (pos_in >= size_in)
-               count = 0;
-       else
-               count = min(count, size_in - (uint64_t)pos_in);
-
-       ret = generic_write_check_limits(file_out, pos_out, &count);
-       if (ret)
-               return ret;
-
-       /* Don't allow overlapped copying within the same file. */
-       if (inode_in == inode_out &&
-           pos_out + count > pos_in &&
-           pos_out < pos_in + count)
-               return -EINVAL;
-
-       *req_count = count;
-       return 0;
-}
-
 int pagecache_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)