diff --git a/mm/filemap.c b/mm/filemap.c
index 6ff2a3f..4370048 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -206,9 +206,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
        if (PageSwapBacked(page)) {
                __mod_lruvec_page_state(page, NR_SHMEM, -nr);
                if (PageTransHuge(page))
-                       __dec_lruvec_page_state(page, NR_SHMEM_THPS);
+                       __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr);
        } else if (PageTransHuge(page)) {
-               __dec_lruvec_page_state(page, NR_FILE_THPS);
+               __mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
                filemap_nr_thps_dec(mapping);
        }
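
These THP counters are now accounted in units of base pages rather than whole THPs, which is why the unaccount path above subtracts nr instead of decrementing by one. A minimal sketch of what the matching charge side would look like under the same units (the helper name is hypothetical):

	/* Hypothetical charge-side counterpart, counting in base pages. */
	static void account_page_cache_page(struct page *page, int nr)
	{
		if (PageSwapBacked(page)) {
			__mod_lruvec_page_state(page, NR_SHMEM, nr);
			if (PageTransHuge(page))
				__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
		} else if (PageTransHuge(page)) {
			__mod_lruvec_page_state(page, NR_FILE_THPS, nr);
		}
	}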
 
@@ -777,7 +777,6 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  * replace_page_cache_page - replace a pagecache page with a new one
  * @old:       page to be replaced
  * @new:       page to replace with
- * @gfp_mask:  allocation mode
  *
  * This function replaces a page in the pagecache with a new one.  On
  * success it acquires the pagecache reference for the new page and
@@ -786,10 +785,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  * caller must do that.
  *
  * The remove + add is atomic.  This function cannot fail.
- *
- * Return: %0
  */
-int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+void replace_page_cache_page(struct page *old, struct page *new)
 {
        struct address_space *mapping = old->mapping;
        void (*freepage)(struct page *) = mapping->a_ops->freepage;
@@ -824,8 +821,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
        if (freepage)
                freepage(old);
        put_page(old);
-
-       return 0;
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
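
With the void return and the dropped gfp argument, callers of replace_page_cache_page() lose their error handling; a sketch of the simplified calling convention (the wrapper name is illustrative):

	static void fs_replace_cached_page(struct page *old, struct page *new)
	{
		/*
		 * Previously: error = replace_page_cache_page(old, new, gfp);
		 * and the always-zero return value still had to be checked.
		 * Now the replacement is unconditional and cannot fail.
		 */
		replace_page_cache_page(old, new);
	}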
 
@@ -1348,61 +1343,26 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
 }
 EXPORT_SYMBOL(wait_on_page_bit_killable);
 
-static int __wait_on_page_locked_async(struct page *page,
-                                      struct wait_page_queue *wait, bool set)
-{
-       struct wait_queue_head *q = page_waitqueue(page);
-       int ret = 0;
-
-       wait->page = page;
-       wait->bit_nr = PG_locked;
-
-       spin_lock_irq(&q->lock);
-       __add_wait_queue_entry_tail(q, &wait->wait);
-       SetPageWaiters(page);
-       if (set)
-               ret = !trylock_page(page);
-       else
-               ret = PageLocked(page);
-       /*
-        * If we were successful now, we know we're still on the
-        * waitqueue as we're still under the lock. This means it's
-        * safe to remove and return success, we know the callback
-        * isn't going to trigger.
-        */
-       if (!ret)
-               __remove_wait_queue(q, &wait->wait);
-       else
-               ret = -EIOCBQUEUED;
-       spin_unlock_irq(&q->lock);
-       return ret;
-}
-
-static int wait_on_page_locked_async(struct page *page,
-                                    struct wait_page_queue *wait)
-{
-       if (!PageLocked(page))
-               return 0;
-       return __wait_on_page_locked_async(compound_head(page), wait, false);
-}
-
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
  * @page: The page to wait for.
+ * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
  *
  * The caller should hold a reference on @page.  They expect the page to
  * become unlocked relatively soon, but do not wish to hold up migration
  * (for example) by holding the reference while waiting for the page to
  * come unlocked.  After this function returns, the caller should not
  * dereference @page.
+ *
+ * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal.
  */
-void put_and_wait_on_page_locked(struct page *page)
+int put_and_wait_on_page_locked(struct page *page, int state)
 {
        wait_queue_head_t *q;
 
        page = compound_head(page);
        q = page_waitqueue(page);
-       wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+       return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
 }
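
Because the sleep state is now caller-supplied and the result is returned, a caller that must remain killable can propagate the -EINTR; a minimal sketch, assuming @page carries the reference this call drops:

	int err = put_and_wait_on_page_locked(page, TASK_KILLABLE);

	if (err)	/* -EINTR: a fatal signal arrived while waiting */
		return err;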
 
 /**
@@ -1558,7 +1518,28 @@ EXPORT_SYMBOL_GPL(__lock_page_killable);
 
 int __lock_page_async(struct page *page, struct wait_page_queue *wait)
 {
-       return __wait_on_page_locked_async(page, wait, true);
+       struct wait_queue_head *q = page_waitqueue(page);
+       int ret = 0;
+
+       wait->page = page;
+       wait->bit_nr = PG_locked;
+
+       spin_lock_irq(&q->lock);
+       __add_wait_queue_entry_tail(q, &wait->wait);
+       SetPageWaiters(page);
+       ret = !trylock_page(page);
+       /*
+        * If we were successful now, we know we're still on the
+        * waitqueue as we're still under the lock. This means it's
+        * safe to remove and return success, we know the callback
+        * isn't going to trigger.
+        */
+       if (!ret)
+               __remove_wait_queue(q, &wait->wait);
+       else
+               ret = -EIOCBQUEUED;
+       spin_unlock_irq(&q->lock);
+       return ret;
 }
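
For context, callers normally go through the lock_page_async() wrapper in include/linux/pagemap.h, which only falls back to this slow path when the trylock fails; it reads roughly as follows:

	static inline int lock_page_async(struct page *page,
					  struct wait_page_queue *wait)
	{
		if (!trylock_page(page))
			return __lock_page_async(page, wait);
		return 0;
	}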
 
 /*
@@ -1677,8 +1658,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 }
 EXPORT_SYMBOL(page_cache_prev_miss);
 
-/**
- * find_get_entry - find and get a page cache entry
+/*
+ * mapping_get_entry - Get a page cache entry.
  * @mapping: the address_space to search
  * @index: The page cache index.
  *
@@ -1690,7 +1671,8 @@ EXPORT_SYMBOL(page_cache_prev_miss);
  *
  * Return: The head page or shadow entry, %NULL if nothing is found.
  */
-struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
+static struct page *mapping_get_entry(struct address_space *mapping,
+               pgoff_t index)
 {
        XA_STATE(xas, &mapping->i_pages, index);
        struct page *page;
@@ -1726,39 +1708,6 @@ out:
        return page;
 }
 
-/**
- * find_lock_entry - Locate and lock a page cache entry.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * Looks up the page at @mapping & @index.  If there is a page in the
- * cache, the head page is returned locked and with an increased refcount.
- *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
- *
- * Context: May sleep.
- * Return: The head page or shadow entry, %NULL if nothing is found.
- */
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
-{
-       struct page *page;
-
-repeat:
-       page = find_get_entry(mapping, index);
-       if (page && !xa_is_value(page)) {
-               lock_page(page);
-               /* Has the page been truncated? */
-               if (unlikely(page->mapping != mapping)) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto repeat;
-               }
-               VM_BUG_ON_PAGE(!thp_contains(page, index), page);
-       }
-       return page;
-}
-
 /**
  * pagecache_get_page - Find and get a reference to a page.
  * @mapping: The address_space to search.
@@ -1774,6 +1723,8 @@ repeat:
  * * %FGP_LOCK - The page is returned locked.
  * * %FGP_HEAD - If the page is present and a THP, return the head page
  *   rather than the exact page specified by the index.
+ * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
+ *   instead of allocating a new page to replace it.
  * * %FGP_CREAT - If no page is present then a new page is allocated using
  *   @gfp_mask and added to the page cache and the VM's LRU list.
  *   The page is returned locked and with an increased refcount.
@@ -1797,9 +1748,12 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
        struct page *page;
 
 repeat:
-       page = find_get_entry(mapping, index);
-       if (xa_is_value(page))
+       page = mapping_get_entry(mapping, index);
+       if (xa_is_value(page)) {
+               if (fgp_flags & FGP_ENTRY)
+                       return page;
                page = NULL;
+       }
        if (!page)
                goto no_page;
 
@@ -1871,18 +1825,53 @@ no_page:
 }
 EXPORT_SYMBOL(pagecache_get_page);
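
The new FGP_ENTRY flag is aimed at callers like shmem that want to see a swap or shadow entry rather than have it replaced by a freshly allocated page; such a lookup might be sketched as (the swap handler is hypothetical):

	struct page *page;

	page = pagecache_get_page(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
	if (xa_is_value(page))
		return shmem_swapin_entry(mapping, index, page); /* hypothetical */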
 
+static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max,
+               xa_mark_t mark)
+{
+       struct page *page;
+
+retry:
+       if (mark == XA_PRESENT)
+               page = xas_find(xas, max);
+       else
+               page = xas_find_marked(xas, max, mark);
+
+       if (xas_retry(xas, page))
+               goto retry;
+       /*
+        * A shadow entry of a recently evicted page, a swap
+        * entry from shmem/tmpfs or a DAX entry.  Return it
+        * without attempting to raise page count.
+        */
+       if (!page || xa_is_value(page))
+               return page;
+
+       if (!page_cache_get_speculative(page))
+               goto reset;
+
+       /* Has the page moved or been split? */
+       if (unlikely(page != xas_reload(xas))) {
+               put_page(page);
+               goto reset;
+       }
+
+       return page;
+reset:
+       xas_reset(xas);
+       goto retry;
+}
+
 /**
  * find_get_entries - gang pagecache lookup
  * @mapping:   The address_space to search
  * @start:     The starting page cache index
- * @nr_entries:        The maximum number of entries
- * @entries:   Where the resulting entries are placed
+ * @end:       The final page index (inclusive).
+ * @pvec:      Where the resulting entries are placed.
 * @indices:   The cache indices corresponding to the entries in @pvec
  *
- * find_get_entries() will search for and return a group of up to
- * @nr_entries entries in the mapping.  The entries are placed at
- * @entries.  find_get_entries() takes a reference against any actual
- * pages it returns.
+ * find_get_entries() will search for and return a batch of entries in
+ * the mapping.  The entries are placed in @pvec.  find_get_entries()
+ * takes a reference on any actual pages it returns.
  *
  * The search returns a group of mapping-contiguous page cache entries
  * with ascending indexes.  There may be holes in the indices due to
@@ -1898,59 +1887,96 @@ EXPORT_SYMBOL(pagecache_get_page);
  *
  * Return: the number of pages and shadow entries which were found.
  */
-unsigned find_get_entries(struct address_space *mapping,
-                         pgoff_t start, unsigned int nr_entries,
-                         struct page **entries, pgoff_t *indices)
+unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+               pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
 {
        XA_STATE(xas, &mapping->i_pages, start);
        struct page *page;
        unsigned int ret = 0;
-
-       if (!nr_entries)
-               return 0;
+       unsigned nr_entries = PAGEVEC_SIZE;
 
        rcu_read_lock();
-       xas_for_each(&xas, page, ULONG_MAX) {
-               if (xas_retry(&xas, page))
-                       continue;
-               /*
-                * A shadow entry of a recently evicted page, a swap
-                * entry from shmem/tmpfs or a DAX entry.  Return it
-                * without attempting to raise page count.
-                */
-               if (xa_is_value(page))
-                       goto export;
-
-               if (!page_cache_get_speculative(page))
-                       goto retry;
-
-               /* Has the page moved or been split? */
-               if (unlikely(page != xas_reload(&xas)))
-                       goto put_page;
-
+       while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
                /*
                 * Terminate early on finding a THP, to allow the caller to
                 * handle it all at once; but continue if this is hugetlbfs.
                 */
-               if (PageTransHuge(page) && !PageHuge(page)) {
+               if (!xa_is_value(page) && PageTransHuge(page) &&
+                               !PageHuge(page)) {
                        page = find_subpage(page, xas.xa_index);
                        nr_entries = ret + 1;
                }
-export:
+
                indices[ret] = xas.xa_index;
-               entries[ret] = page;
+               pvec->pages[ret] = page;
                if (++ret == nr_entries)
                        break;
-               continue;
-put_page:
-               put_page(page);
-retry:
-               xas_reset(&xas);
        }
        rcu_read_unlock();
+
+       pvec->nr = ret;
        return ret;
 }
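
Since find_get_entries() now fills a pagevec capped at PAGEVEC_SIZE instead of a caller-sized array, a truncate-style consumer would loop like this (a sketch; process_entry() is hypothetical, and value entries must be stripped before pagevec_release()):

	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index = start;
	int i;

	pagevec_init(&pvec);
	while (find_get_entries(mapping, index, end, &pvec, indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++)
			process_entry(pvec.pages[i], indices[i]);
		index = indices[pagevec_count(&pvec) - 1] + 1;
		pagevec_remove_exceptionals(&pvec);	/* drop shadow/swap entries */
		pagevec_release(&pvec);
	}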
 
+/**
+ * find_lock_entries - Find a batch of pagecache entries.
+ * @mapping:   The address_space to search.
+ * @start:     The starting page cache index.
+ * @end:       The final page index (inclusive).
+ * @pvec:      Where the resulting entries are placed.
+ * @indices:   The cache indices of the entries in @pvec.
+ *
+ * find_lock_entries() will return a batch of entries from @mapping.
+ * Swap, shadow and DAX entries are included.  Pages are returned
+ * locked and with an incremented refcount.  Pages which are locked by
+ * somebody else or under writeback are skipped.  Only the head page of
+ * a THP is returned.  Pages which are partially outside the range are
+ * not returned.
+ *
+ * The entries have ascending indexes.  The indices may not be consecutive
+ * due to not-present entries, THP pages, pages which could not be locked
+ * or pages under writeback.
+ *
+ * Return: The number of entries which were found.
+ */
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
+               pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
+{
+       XA_STATE(xas, &mapping->i_pages, start);
+       struct page *page;
+
+       rcu_read_lock();
+       while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
+               if (!xa_is_value(page)) {
+                       if (page->index < start)
+                               goto put;
+                       VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
+                       if (page->index + thp_nr_pages(page) - 1 > end)
+                               goto put;
+                       if (!trylock_page(page))
+                               goto put;
+                       if (page->mapping != mapping || PageWriteback(page))
+                               goto unlock;
+                       VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index),
+                                       page);
+               }
+               indices[pvec->nr] = xas.xa_index;
+               if (!pagevec_add(pvec, page))
+                       break;
+               goto next;
+unlock:
+               unlock_page(page);
+put:
+               put_page(page);
+next:
+               if (!xa_is_value(page) && PageTransHuge(page))
+                       xas_set(&xas, page->index + thp_nr_pages(page));
+       }
+       rcu_read_unlock();
+
+       return pagevec_count(pvec);
+}
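
Pages arrive from find_lock_entries() already locked and referenced, so the consumer's loop reduces to process, unlock, release; a sketch along the lines of a truncate caller (cleanup_page() is a hypothetical stand-in):

	while (find_lock_entries(mapping, index, end, &pvec, indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			if (xa_is_value(page))
				continue;	/* shadow/swap/DAX entry */
			cleanup_page(mapping, page);	/* hypothetical */
			unlock_page(page);
		}
		index = indices[pagevec_count(&pvec) - 1] + 1;
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
	}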
+
 /**
  * find_get_pages_range - gang pagecache lookup
  * @mapping:   The address_space to search
@@ -1984,30 +2010,16 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
                return 0;
 
        rcu_read_lock();
-       xas_for_each(&xas, page, end) {
-               if (xas_retry(&xas, page))
-                       continue;
+       while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
                /* Skip over shadow, swap and DAX entries */
                if (xa_is_value(page))
                        continue;
 
-               if (!page_cache_get_speculative(page))
-                       goto retry;
-
-               /* Has the page moved or been split? */
-               if (unlikely(page != xas_reload(&xas)))
-                       goto put_page;
-
                pages[ret] = find_subpage(page, xas.xa_index);
                if (++ret == nr_pages) {
                        *start = xas.xa_index + 1;
                        goto out;
                }
-               continue;
-put_page:
-               put_page(page);
-retry:
-               xas_reset(&xas);
        }
 
        /*
@@ -2081,7 +2093,7 @@ retry:
 EXPORT_SYMBOL(find_get_pages_contig);
 
 /**
- * find_get_pages_range_tag - find and return pages in given range matching @tag
+ * find_get_pages_range_tag - Find and return head pages matching @tag.
  * @mapping:   the address_space to search
  * @index:     the starting page index
  * @end:       The final page index (inclusive)
@@ -2089,8 +2101,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
  * @nr_pages:  the maximum number of pages
  * @pages:     where the resulting pages are placed
  *
- * Like find_get_pages, except we only return pages which are tagged with
- * @tag.   We update @index to index the next page for the traversal.
+ * Like find_get_pages(), except we only return head pages which are tagged
+ * with @tag.  @index is updated to the index immediately after the last
+ * page we return, ready for the next iteration.
  *
  * Return: the number of pages which were found.
  */
@@ -2106,9 +2119,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
                return 0;
 
        rcu_read_lock();
-       xas_for_each_marked(&xas, page, end, tag) {
-               if (xas_retry(&xas, page))
-                       continue;
+       while ((page = find_get_entry(&xas, end, tag))) {
                /*
                 * Shadow entries should never be tagged, but this iteration
                 * is lockless so there is a window for page reclaim to evict
@@ -2117,23 +2128,11 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
                if (xa_is_value(page))
                        continue;
 
-               if (!page_cache_get_speculative(page))
-                       goto retry;
-
-               /* Has the page moved or been split? */
-               if (unlikely(page != xas_reload(&xas)))
-                       goto put_page;
-
-               pages[ret] = find_subpage(page, xas.xa_index);
+               pages[ret] = page;
                if (++ret == nr_pages) {
-                       *index = xas.xa_index + 1;
+                       *index = page->index + thp_nr_pages(page);
                        goto out;
                }
-               continue;
-put_page:
-               put_page(page);
-retry:
-               xas_reset(&xas);
        }
 
        /*
@@ -2173,287 +2172,267 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
        ra->ra_pages /= 4;
 }
 
-static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
+/*
+ * filemap_get_read_batch - Get a batch of pages for read
+ *
+ * Get a batch of pages which represent a contiguous range of bytes
+ * in the file.  No tail pages will be returned.  If @index is in the
+ * middle of a THP, the entire THP will be returned.  The last page in
+ * the batch may have Readahead set or be not Uptodate so that the
+ * caller can take the appropriate action.
+ */
+static void filemap_get_read_batch(struct address_space *mapping,
+               pgoff_t index, pgoff_t max, struct pagevec *pvec)
 {
-       if (iocb->ki_flags & IOCB_WAITQ)
-               return lock_page_async(page, iocb->ki_waitq);
-       else if (iocb->ki_flags & IOCB_NOWAIT)
-               return trylock_page(page) ? 0 : -EAGAIN;
-       else
-               return lock_page_killable(page);
+       XA_STATE(xas, &mapping->i_pages, index);
+       struct page *head;
+
+       rcu_read_lock();
+       for (head = xas_load(&xas); head; head = xas_next(&xas)) {
+               if (xas_retry(&xas, head))
+                       continue;
+               if (xas.xa_index > max || xa_is_value(head))
+                       break;
+               if (!page_cache_get_speculative(head))
+                       goto retry;
+
+               /* Has the page moved or been split? */
+               if (unlikely(head != xas_reload(&xas)))
+                       goto put_page;
+
+               if (!pagevec_add(pvec, head))
+                       break;
+               if (!PageUptodate(head))
+                       break;
+               if (PageReadahead(head))
+                       break;
+               xas.xa_index = head->index + thp_nr_pages(head) - 1;
+               xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;
+               continue;
+put_page:
+               put_page(head);
+retry:
+               xas_reset(&xas);
+       }
+       rcu_read_unlock();
 }
 
-static struct page *
-generic_file_buffered_read_readpage(struct kiocb *iocb,
-                                   struct file *filp,
-                                   struct address_space *mapping,
-                                   struct page *page)
+static int filemap_read_page(struct file *file, struct address_space *mapping,
+               struct page *page)
 {
-       struct file_ra_state *ra = &filp->f_ra;
        int error;
 
-       if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
-               unlock_page(page);
-               put_page(page);
-               return ERR_PTR(-EAGAIN);
-       }
-
        /*
-        * A previous I/O error may have been due to temporary
-        * failures, eg. multipath errors.
-        * PG_error will be set again if readpage fails.
+        * A previous I/O error may have been due to temporary failures,
+        * eg. multipath errors.  PG_error will be set again if readpage
+        * fails.
         */
        ClearPageError(page);
        /* Start the actual read. The read will unlock the page. */
-       error = mapping->a_ops->readpage(filp, page);
+       error = mapping->a_ops->readpage(file, page);
+       if (error)
+               return error;
 
-       if (unlikely(error)) {
-               put_page(page);
-               return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
-       }
+       error = wait_on_page_locked_killable(page);
+       if (error)
+               return error;
+       if (PageUptodate(page))
+               return 0;
+       if (!page->mapping)     /* page truncated */
+               return AOP_TRUNCATED_PAGE;
+       shrink_readahead_size_eio(&file->f_ra);
+       return -EIO;
+}
 
-       if (!PageUptodate(page)) {
-               error = lock_page_for_iocb(iocb, page);
-               if (unlikely(error)) {
-                       put_page(page);
-                       return ERR_PTR(error);
-               }
-               if (!PageUptodate(page)) {
-                       if (page->mapping == NULL) {
-                               /*
-                                * invalidate_mapping_pages got it
-                                */
-                               unlock_page(page);
-                               put_page(page);
-                               return NULL;
-                       }
-                       unlock_page(page);
-                       shrink_readahead_size_eio(ra);
-                       put_page(page);
-                       return ERR_PTR(-EIO);
-               }
-               unlock_page(page);
+static bool filemap_range_uptodate(struct address_space *mapping,
+               loff_t pos, struct iov_iter *iter, struct page *page)
+{
+       int count;
+
+       if (PageUptodate(page))
+               return true;
+       /* pipes can't handle partially uptodate pages */
+       if (iov_iter_is_pipe(iter))
+               return false;
+       if (!mapping->a_ops->is_partially_uptodate)
+               return false;
+       if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page)))
+               return false;
+
+       count = iter->count;
+       if (page_offset(page) > pos) {
+               count -= page_offset(page) - pos;
+               pos = 0;
+       } else {
+               pos -= page_offset(page);
        }
 
-       return page;
+       return mapping->a_ops->is_partially_uptodate(page, pos, count);
 }
 
-static struct page *
-generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
-                                          struct file *filp,
-                                          struct iov_iter *iter,
-                                          struct page *page,
-                                          loff_t pos, loff_t count)
+static int filemap_update_page(struct kiocb *iocb,
+               struct address_space *mapping, struct iov_iter *iter,
+               struct page *page)
 {
-       struct address_space *mapping = filp->f_mapping;
-       struct inode *inode = mapping->host;
        int error;
 
-       /*
-        * See comment in do_read_cache_page on why
-        * wait_on_page_locked is used to avoid unnecessarily
-        * serialisations and why it's safe.
-        */
-       if (iocb->ki_flags & IOCB_WAITQ) {
-               error = wait_on_page_locked_async(page,
-                                               iocb->ki_waitq);
-       } else {
-               error = wait_on_page_locked_killable(page);
-       }
-       if (unlikely(error)) {
-               put_page(page);
-               return ERR_PTR(error);
+       if (!trylock_page(page)) {
+               if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
+                       return -EAGAIN;
+               if (!(iocb->ki_flags & IOCB_WAITQ)) {
+                       put_and_wait_on_page_locked(page, TASK_KILLABLE);
+                       return AOP_TRUNCATED_PAGE;
+               }
+               error = __lock_page_async(page, iocb->ki_waitq);
+               if (error)
+                       return error;
        }
-       if (PageUptodate(page))
-               return page;
 
-       if (inode->i_blkbits == PAGE_SHIFT ||
-                       !mapping->a_ops->is_partially_uptodate)
-               goto page_not_up_to_date;
-       /* pipes can't handle partially uptodate pages */
-       if (unlikely(iov_iter_is_pipe(iter)))
-               goto page_not_up_to_date;
-       if (!trylock_page(page))
-               goto page_not_up_to_date;
-       /* Did it get truncated before we got the lock? */
        if (!page->mapping)
-               goto page_not_up_to_date_locked;
-       if (!mapping->a_ops->is_partially_uptodate(page,
-                               pos & ~PAGE_MASK, count))
-               goto page_not_up_to_date_locked;
-       unlock_page(page);
-       return page;
-
-page_not_up_to_date:
-       /* Get exclusive access to the page ... */
-       error = lock_page_for_iocb(iocb, page);
-       if (unlikely(error)) {
-               put_page(page);
-               return ERR_PTR(error);
-       }
+               goto truncated;
 
-page_not_up_to_date_locked:
-       /* Did it get truncated before we got the lock? */
-       if (!page->mapping) {
-               unlock_page(page);
-               put_page(page);
-               return NULL;
-       }
+       error = 0;
+       if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
+               goto unlock;
 
-       /* Did somebody else fill it already? */
-       if (PageUptodate(page)) {
-               unlock_page(page);
-               return page;
-       }
+       error = -EAGAIN;
+       if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
+               goto unlock;
 
-       return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+       error = filemap_read_page(iocb->ki_filp, mapping, page);
+       if (error == AOP_TRUNCATED_PAGE)
+               put_page(page);
+       return error;
+truncated:
+       unlock_page(page);
+       put_page(page);
+       return AOP_TRUNCATED_PAGE;
+unlock:
+       unlock_page(page);
+       return error;
 }
 
-static struct page *
-generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
-                                         struct iov_iter *iter)
+static int filemap_create_page(struct file *file,
+               struct address_space *mapping, pgoff_t index,
+               struct pagevec *pvec)
 {
-       struct file *filp = iocb->ki_filp;
-       struct address_space *mapping = filp->f_mapping;
-       pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
        struct page *page;
        int error;
 
-       if (iocb->ki_flags & IOCB_NOIO)
-               return ERR_PTR(-EAGAIN);
-
-       /*
-        * Ok, it wasn't cached, so we need to create a new
-        * page..
-        */
        page = page_cache_alloc(mapping);
        if (!page)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
 
        error = add_to_page_cache_lru(page, mapping, index,
-                                     mapping_gfp_constraint(mapping, GFP_KERNEL));
-       if (error) {
-               put_page(page);
-               return error != -EEXIST ? ERR_PTR(error) : NULL;
-       }
+                       mapping_gfp_constraint(mapping, GFP_KERNEL));
+       if (error == -EEXIST)
+               error = AOP_TRUNCATED_PAGE;
+       if (error)
+               goto error;
+
+       error = filemap_read_page(file, mapping, page);
+       if (error)
+               goto error;
 
-       return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+       pagevec_add(pvec, page);
+       return 0;
+error:
+       put_page(page);
+       return error;
+}
+
+static int filemap_readahead(struct kiocb *iocb, struct file *file,
+               struct address_space *mapping, struct page *page,
+               pgoff_t last_index)
+{
+       if (iocb->ki_flags & IOCB_NOIO)
+               return -EAGAIN;
+       page_cache_async_readahead(mapping, &file->f_ra, file, page,
+                       page->index, last_index - page->index);
+       return 0;
 }
 
-static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
-                                               struct iov_iter *iter,
-                                               struct page **pages,
-                                               unsigned int nr)
+static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
+               struct pagevec *pvec)
 {
        struct file *filp = iocb->ki_filp;
        struct address_space *mapping = filp->f_mapping;
        struct file_ra_state *ra = &filp->f_ra;
        pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
-       pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
-       int i, j, nr_got, err = 0;
+       pgoff_t last_index;
+       struct page *page;
+       int err = 0;
 
-       nr = min_t(unsigned long, last_index - index, nr);
-find_page:
+       last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
+retry:
        if (fatal_signal_pending(current))
                return -EINTR;
 
-       nr_got = find_get_pages_contig(mapping, index, nr, pages);
-       if (nr_got)
-               goto got_pages;
-
-       if (iocb->ki_flags & IOCB_NOIO)
-               return -EAGAIN;
-
-       page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
-
-       nr_got = find_get_pages_contig(mapping, index, nr, pages);
-       if (nr_got)
-               goto got_pages;
-
-       pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
-       err = PTR_ERR_OR_ZERO(pages[0]);
-       if (!IS_ERR_OR_NULL(pages[0]))
-               nr_got = 1;
-got_pages:
-       for (i = 0; i < nr_got; i++) {
-               struct page *page = pages[i];
-               pgoff_t pg_index = index + i;
-               loff_t pg_pos = max(iocb->ki_pos,
-                                   (loff_t) pg_index << PAGE_SHIFT);
-               loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
-
-               if (PageReadahead(page)) {
-                       if (iocb->ki_flags & IOCB_NOIO) {
-                               for (j = i; j < nr_got; j++)
-                                       put_page(pages[j]);
-                               nr_got = i;
-                               err = -EAGAIN;
-                               break;
-                       }
-                       page_cache_async_readahead(mapping, ra, filp, page,
-                                       pg_index, last_index - pg_index);
-               }
-
-               if (!PageUptodate(page)) {
-                       if ((iocb->ki_flags & IOCB_NOWAIT) ||
-                           ((iocb->ki_flags & IOCB_WAITQ) && i)) {
-                               for (j = i; j < nr_got; j++)
-                                       put_page(pages[j]);
-                               nr_got = i;
-                               err = -EAGAIN;
-                               break;
-                       }
+       filemap_get_read_batch(mapping, index, last_index, pvec);
+       if (!pagevec_count(pvec)) {
+               if (iocb->ki_flags & IOCB_NOIO)
+                       return -EAGAIN;
+               page_cache_sync_readahead(mapping, ra, filp, index,
+                               last_index - index);
+               filemap_get_read_batch(mapping, index, last_index, pvec);
+       }
+       if (!pagevec_count(pvec)) {
+               if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
+                       return -EAGAIN;
+               err = filemap_create_page(filp, mapping,
+                               iocb->ki_pos >> PAGE_SHIFT, pvec);
+               if (err == AOP_TRUNCATED_PAGE)
+                       goto retry;
+               return err;
+       }
 
-                       page = generic_file_buffered_read_pagenotuptodate(iocb,
-                                       filp, iter, page, pg_pos, pg_count);
-                       if (IS_ERR_OR_NULL(page)) {
-                               for (j = i + 1; j < nr_got; j++)
-                                       put_page(pages[j]);
-                               nr_got = i;
-                               err = PTR_ERR_OR_ZERO(page);
-                               break;
-                       }
-               }
+       page = pvec->pages[pagevec_count(pvec) - 1];
+       if (PageReadahead(page)) {
+               err = filemap_readahead(iocb, filp, mapping, page, last_index);
+               if (err)
+                       goto err;
+       }
+       if (!PageUptodate(page)) {
+               if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
+                       iocb->ki_flags |= IOCB_NOWAIT;
+               err = filemap_update_page(iocb, mapping, iter, page);
+               if (err)
+                       goto err;
        }
 
-       if (likely(nr_got))
-               return nr_got;
-       if (err)
-               return err;
-       /*
-        * No pages and no error means we raced and should retry:
-        */
-       goto find_page;
+       return 0;
+err:
+       if (err < 0)
+               put_page(page);
+       if (likely(--pvec->nr))
+               return 0;
+       if (err == AOP_TRUNCATED_PAGE)
+               goto retry;
+       return err;
 }
 
 /**
- * generic_file_buffered_read - generic file read routine
- * @iocb:      the iocb to read
- * @iter:      data destination
- * @written:   already copied
- *
- * This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level stuff.
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
  *
- * This is really ugly. But the goto's actually try to clarify some
- * of the logic when it comes to error handling etc.
+ * Copies data from the page cache.  If the data is not currently present,
+ * uses the readahead and readpage address_space operations to fetch it.
  *
- * Return:
- * * total number of bytes copied, including those the were already @written
- * * negative error code if nothing was copied
+ * Return: Total number of bytes copied, including those already read by
+ * the caller.  If an error happens before any bytes are copied, returns
+ * a negative error number.
  */
-ssize_t generic_file_buffered_read(struct kiocb *iocb,
-               struct iov_iter *iter, ssize_t written)
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+               ssize_t already_read)
 {
        struct file *filp = iocb->ki_filp;
        struct file_ra_state *ra = &filp->f_ra;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
-       struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
-       unsigned int nr_pages = min_t(unsigned int, 512,
-                       ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
-                       (iocb->ki_pos >> PAGE_SHIFT));
-       int i, pg_nr, error = 0;
+       struct pagevec pvec;
+       int i, error = 0;
        bool writably_mapped;
        loff_t isize, end_offset;
 
@@ -2463,14 +2442,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
                return 0;
 
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
-
-       if (nr_pages > ARRAY_SIZE(pages_onstack))
-               pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
-
-       if (!pages) {
-               pages = pages_onstack;
-               nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
-       }
+       pagevec_init(&pvec);
 
        do {
                cond_resched();
@@ -2480,16 +2452,12 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
                 * can no longer safely return -EIOCBQUEUED. Hence mark
                 * an async read NOWAIT at that point.
                 */
-               if ((iocb->ki_flags & IOCB_WAITQ) && written)
+               if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
                        iocb->ki_flags |= IOCB_NOWAIT;
 
-               i = 0;
-               pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
-                                                            pages, nr_pages);
-               if (pg_nr < 0) {
-                       error = pg_nr;
+               error = filemap_get_pages(iocb, iter, &pvec);
+               if (error < 0)
                        break;
-               }
 
                /*
                 * i_size must be checked after we know the pages are Uptodate.
@@ -2502,13 +2470,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
                isize = i_size_read(inode);
                if (unlikely(iocb->ki_pos >= isize))
                        goto put_pages;
-
                end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
 
-               while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
-                      (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
-                       put_page(pages[--pg_nr]);
-
                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
@@ -2521,27 +2484,35 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
                 */
                if (iocb->ki_pos >> PAGE_SHIFT !=
                    ra->prev_pos >> PAGE_SHIFT)
-                       mark_page_accessed(pages[0]);
-               for (i = 1; i < pg_nr; i++)
-                       mark_page_accessed(pages[i]);
+                       mark_page_accessed(pvec.pages[0]);
 
-               for (i = 0; i < pg_nr; i++) {
-                       unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
-                       unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
-                                                  PAGE_SIZE - offset);
-                       unsigned int copied;
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *page = pvec.pages[i];
+                       size_t page_size = thp_size(page);
+                       size_t offset = iocb->ki_pos & (page_size - 1);
+                       size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
+                                            page_size - offset);
+                       size_t copied;
 
+                       if (end_offset < page_offset(page))
+                               break;
+                       if (i > 0)
+                               mark_page_accessed(page);
                        /*
                         * If users can be writing to this page using arbitrary
                         * virtual addresses, take care about potential aliasing
                         * before reading the page on the kernel side.
                         */
-                       if (writably_mapped)
-                               flush_dcache_page(pages[i]);
+                       if (writably_mapped) {
+                               int j;
+
+                               for (j = 0; j < thp_nr_pages(page); j++)
+                                       flush_dcache_page(page + j);
+                       }
 
-                       copied = copy_page_to_iter(pages[i], offset, bytes, iter);
+                       copied = copy_page_to_iter(page, offset, bytes, iter);
 
-                       written += copied;
+                       already_read += copied;
                        iocb->ki_pos += copied;
                        ra->prev_pos = iocb->ki_pos;
 
@@ -2551,18 +2522,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
                        }
                }
 put_pages:
-               for (i = 0; i < pg_nr; i++)
-                       put_page(pages[i]);
+               for (i = 0; i < pagevec_count(&pvec); i++)
+                       put_page(pvec.pages[i]);
+               pagevec_reinit(&pvec);
        } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
 
        file_accessed(filp);
 
-       if (pages != pages_onstack)
-               kfree(pages);
-
-       return written ? written : error;
+       return already_read ? already_read : error;
 }
-EXPORT_SYMBOL_GPL(generic_file_buffered_read);
+EXPORT_SYMBOL_GPL(filemap_read);
 
 /**
  * generic_file_read_iter - generic filesystem read routine
@@ -2592,7 +2561,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
        ssize_t retval = 0;
 
        if (!count)
-               goto out; /* skip atime */
+               return 0; /* skip atime */
 
        if (iocb->ki_flags & IOCB_DIRECT) {
                struct file *file = iocb->ki_filp;
@@ -2610,7 +2579,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                                                iocb->ki_pos,
                                                iocb->ki_pos + count - 1);
                        if (retval < 0)
-                               goto out;
+                               return retval;
                }
 
                file_accessed(file);
@@ -2620,7 +2589,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                        iocb->ki_pos += retval;
                        count -= retval;
                }
-               iov_iter_revert(iter, count - iov_iter_count(iter));
+               if (retval != -EIOCBQUEUED)
+                       iov_iter_revert(iter, count - iov_iter_count(iter));
 
                /*
                 * Btrfs can have a short DIO read if we encounter
@@ -2633,15 +2603,116 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                 */
                if (retval < 0 || !count || iocb->ki_pos >= size ||
                    IS_DAX(inode))
-                       goto out;
+                       return retval;
        }
 
-       retval = generic_file_buffered_read(iocb, iter, retval);
-out:
-       return retval;
+       return filemap_read(iocb, iter, retval);
 }
 EXPORT_SYMBOL(generic_file_read_iter);
 
+static inline loff_t page_seek_hole_data(struct xa_state *xas,
+               struct address_space *mapping, struct page *page,
+               loff_t start, loff_t end, bool seek_data)
+{
+       const struct address_space_operations *ops = mapping->a_ops;
+       size_t offset, bsz = i_blocksize(mapping->host);
+
+       if (xa_is_value(page) || PageUptodate(page))
+               return seek_data ? start : end;
+       if (!ops->is_partially_uptodate)
+               return seek_data ? end : start;
+
+       xas_pause(xas);
+       rcu_read_unlock();
+       lock_page(page);
+       if (unlikely(page->mapping != mapping))
+               goto unlock;
+
+       offset = offset_in_thp(page, start) & ~(bsz - 1);
+
+       do {
+               if (ops->is_partially_uptodate(page, offset, bsz) == seek_data)
+                       break;
+               start = (start + bsz) & ~(bsz - 1);
+               offset += bsz;
+       } while (offset < thp_size(page));
+unlock:
+       unlock_page(page);
+       rcu_read_lock();
+       return start;
+}
+
+static inline
+unsigned int seek_page_size(struct xa_state *xas, struct page *page)
+{
+       if (xa_is_value(page))
+               return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
+       return thp_size(page);
+}
+
+/**
+ * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ * @mapping: Address space to search.
+ * @start: First byte to consider.
+ * @end: Limit of search (exclusive).
+ * @whence: Either SEEK_HOLE or SEEK_DATA.
+ *
+ * If the page cache knows which blocks contain holes and which blocks
+ * contain data, your filesystem can use this function to implement
+ * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
+ * entirely memory-based such as tmpfs, and filesystems which support
+ * unwritten extents.
+ *
+ * Return: The requested offset on success, or -ENXIO if @whence specifies
+ * SEEK_DATA and there is no data after @start.  There is an implicit hole
+ * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
+ * and @end contain data.
+ */
+loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
+               loff_t end, int whence)
+{
+       XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
+       pgoff_t max = (end - 1) / PAGE_SIZE;
+       bool seek_data = (whence == SEEK_DATA);
+       struct page *page;
+
+       if (end <= start)
+               return -ENXIO;
+
+       rcu_read_lock();
+       while ((page = find_get_entry(&xas, max, XA_PRESENT))) {
+               loff_t pos = xas.xa_index * PAGE_SIZE;
+
+               if (start < pos) {
+                       if (!seek_data)
+                               goto unlock;
+                       start = pos;
+               }
+
+               pos += seek_page_size(&xas, page);
+               start = page_seek_hole_data(&xas, mapping, page, start, pos,
+                               seek_data);
+               if (start < pos)
+                       goto unlock;
+               if (!xa_is_value(page))
+                       put_page(page);
+       }
+       rcu_read_unlock();
+
+       if (seek_data)
+               return -ENXIO;
+       goto out;
+
+unlock:
+       rcu_read_unlock();
+       if (!xa_is_value(page))
+               put_page(page);
+out:
+       if (start > end)
+               return end;
+       return start;
+}
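
A filesystem that keeps its data entirely in the page cache could route SEEK_HOLE/SEEK_DATA through this helper; a sketch of such an llseek, assuming i_size bounds the search:

	static loff_t example_llseek(struct file *file, loff_t offset, int whence)
	{
		struct inode *inode = file_inode(file);

		if (whence != SEEK_DATA && whence != SEEK_HOLE)
			return generic_file_llseek(file, offset, whence);

		inode_lock(inode);
		offset = mapping_seek_hole_data(inode->i_mapping, offset,
						i_size_read(inode), whence);
		inode_unlock(inode);

		if (offset >= 0)
			offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
		return offset;
	}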
+
 #ifdef CONFIG_MMU
 #define MMAP_LOTSAMISS  (100)
 /*
@@ -3431,7 +3502,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
                }
                iocb->ki_pos = pos;
        }
-       iov_iter_revert(from, write_len - iov_iter_count(from));
+       if (written != -EIOCBQUEUED)
+               iov_iter_revert(from, write_len - iov_iter_count(from));
 out:
        return written;
 }