Btrfs: do aio_write instead of write
authorJosef Bacik <josef@redhat.com>
Sun, 23 May 2010 15:07:21 +0000 (11:07 -0400)
committerChris Mason <chris.mason@oracle.com>
Tue, 25 May 2010 14:34:57 +0000 (10:34 -0400)
In order for AIO to work, we need to implement aio_write.  This patch converts
our btrfs_file_write to btrfs_aio_write.  I've tested this with xfstests and
nothing broke, and the AIO stuff magically started working.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/extent_io.c
fs/btrfs/file.c

index 1a57c17..a53aca3 100644 (file)
@@ -2017,6 +2017,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
        sector_t sector;
        struct extent_map *em;
        struct block_device *bdev;
+       struct btrfs_ordered_extent *ordered;
        int ret;
        int nr = 0;
        size_t page_offset = 0;
@@ -2028,7 +2029,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
        set_page_extent_mapped(page);
 
        end = page_end;
-       lock_extent(tree, start, end, GFP_NOFS);
+       while (1) {
+               lock_extent(tree, start, end, GFP_NOFS);
+               ordered = btrfs_lookup_ordered_extent(inode, start);
+               if (!ordered)
+                       break;
+               unlock_extent(tree, start, end, GFP_NOFS);
+               btrfs_start_ordered_extent(inode, ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+       }
 
        if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
                char *userpage;
index a28810a..233aea2 100644 (file)
 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
                                         int write_bytes,
                                         struct page **prepared_pages,
-                                        const char __user *buf)
+                                        struct iov_iter *i)
 {
-       long page_fault = 0;
-       int i;
+       size_t copied;
+       int pg = 0;
        int offset = pos & (PAGE_CACHE_SIZE - 1);
 
-       for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+       while (write_bytes > 0) {
                size_t count = min_t(size_t,
                                     PAGE_CACHE_SIZE - offset, write_bytes);
-               struct page *page = prepared_pages[i];
-               fault_in_pages_readable(buf, count);
+               struct page *page = prepared_pages[pg];
+again:
+               if (unlikely(iov_iter_fault_in_readable(i, count)))
+                       return -EFAULT;
 
                /* Copy data from userspace to the current page */
-               kmap(page);
-               page_fault = __copy_from_user(page_address(page) + offset,
-                                             buf, count);
+               copied = iov_iter_copy_from_user(page, i, offset, count);
+
                /* Flush processor's dcache for this page */
                flush_dcache_page(page);
-               kunmap(page);
-               buf += count;
-               write_bytes -= count;
+               iov_iter_advance(i, copied);
+               write_bytes -= copied;
 
-               if (page_fault)
-                       break;
+               if (unlikely(copied == 0)) {
+                       count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+                                     iov_iter_single_seg_count(i));
+                       goto again;
+               }
+
+               if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+                       offset += copied;
+               } else {
+                       pg++;
+                       offset = 0;
+               }
        }
-       return page_fault ? -EFAULT : 0;
+       return 0;
 }
 
 /*
@@ -822,60 +832,24 @@ again:
        return 0;
 }
 
-/* Copied from read-write.c */
-static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
-{
-       set_current_state(TASK_UNINTERRUPTIBLE);
-       if (!kiocbIsKicked(iocb))
-               schedule();
-       else
-               kiocbClearKicked(iocb);
-       __set_current_state(TASK_RUNNING);
-}
-
-/*
- * Just a copy of what do_sync_write does.
- */
-static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf,
-                                   size_t count, loff_t pos, loff_t *ppos)
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+                                   const struct iovec *iov,
+                                   unsigned long nr_segs, loff_t pos)
 {
-       struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
-       unsigned long nr_segs = 1;
-       struct kiocb kiocb;
-       ssize_t ret;
-
-       init_sync_kiocb(&kiocb, file);
-       kiocb.ki_pos = pos;
-       kiocb.ki_left = count;
-       kiocb.ki_nbytes = count;
-
-       while (1) {
-               ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos,
-                                               ppos, count, count);
-               if (ret != -EIOCBRETRY)
-                       break;
-               wait_on_retry_sync_kiocb(&kiocb);
-       }
-
-       if (ret == -EIOCBQUEUED)
-               ret = wait_on_sync_kiocb(&kiocb);
-       *ppos = kiocb.ki_pos;
-       return ret;
-}
-
-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       loff_t pos;
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = fdentry(file)->d_inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct page *pinned[2];
+       struct page **pages = NULL;
+       struct iov_iter i;
+       loff_t *ppos = &iocb->ki_pos;
        loff_t start_pos;
        ssize_t num_written = 0;
        ssize_t err = 0;
+       size_t count;
+       size_t ocount;
        int ret = 0;
-       struct inode *inode = fdentry(file)->d_inode;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct page **pages = NULL;
        int nrptrs;
-       struct page *pinned[2];
        unsigned long first_index;
        unsigned long last_index;
        int will_write;
@@ -887,13 +861,17 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
        pinned[0] = NULL;
        pinned[1] = NULL;
 
-       pos = *ppos;
        start_pos = pos;
 
        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 
        mutex_lock(&inode->i_mutex);
 
+       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+       if (err)
+               goto out;
+       count = ocount;
+
        current->backing_dev_info = inode->i_mapping->backing_dev_info;
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
@@ -910,14 +888,48 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
        BTRFS_I(inode)->sequence++;
 
        if (unlikely(file->f_flags & O_DIRECT)) {
-               num_written = __btrfs_direct_write(file, buf, count, pos,
-                                                  ppos);
-               pos += num_written;
-               count -= num_written;
+               ret = btrfs_delalloc_reserve_space(inode, count);
+               if (ret)
+                       goto out;
 
-               /* We've written everything we wanted to, exit */
-               if (num_written < 0 || !count)
+               num_written = generic_file_direct_write(iocb, iov, &nr_segs,
+                                                       pos, ppos, count,
+                                                       ocount);
+
+               /*
+                * the generic O_DIRECT will update in-memory i_size after the
+                * DIOs are done.  But our endio handlers that update the on
+                * disk i_size never update past the in memory i_size.  So we
+                * need one more update here to catch any additions to the
+                * file
+                */
+               if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+                       btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+                       mark_inode_dirty(inode);
+               }
+
+               if (num_written < 0) {
+                       if (num_written != -EIOCBQUEUED) {
+                               /*
+                                * aio land will take care of releasing the
+                                * delalloc
+                                */
+                               btrfs_delalloc_release_space(inode, count);
+                       }
+                       ret = num_written;
+                       num_written = 0;
                        goto out;
+               } else if (num_written == count) {
+                       /* pick up pos changes done by the generic code */
+                       pos = *ppos;
+                       goto out;
+               }
+
+               /*
+                * the buffered IO will reserve bytes for the rest of the
+                * range, don't double count them here
+                */
+               btrfs_delalloc_release_space(inode, count - num_written);
 
                /*
                 * We are going to do buffered for the rest of the range, so we
@@ -925,18 +937,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                 * done.
                 */
                buffered = 1;
-               buf += num_written;
+               pos += num_written;
        }
 
-       nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
-                    PAGE_CACHE_SIZE / (sizeof(struct page *)));
+       iov_iter_init(&i, iov, nr_segs, count, num_written);
+       nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+                    PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+                    (sizeof(struct page *)));
        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
        /* generic_write_checks can change our pos */
        start_pos = pos;
 
        first_index = pos >> PAGE_CACHE_SHIFT;
-       last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+       last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
 
        /*
         * there are lots of better ways to do this, but this code
@@ -953,7 +967,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                        unlock_page(pinned[0]);
                }
        }
-       if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+       if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
                pinned[1] = grab_cache_page(inode->i_mapping, last_index);
                if (!PageUptodate(pinned[1])) {
                        ret = btrfs_readpage(NULL, pinned[1]);
@@ -964,10 +978,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                }
        }
 
-       while (count > 0) {
+       while (iov_iter_count(&i) > 0) {
                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-               size_t write_bytes = min(count, nrptrs *
-                                       (size_t)PAGE_CACHE_SIZE -
+               size_t write_bytes = min(iov_iter_count(&i),
+                                        nrptrs * (size_t)PAGE_CACHE_SIZE -
                                         offset);
                size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
                                        PAGE_CACHE_SHIFT;
@@ -988,7 +1002,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                }
 
                ret = btrfs_copy_from_user(pos, num_pages,
-                                          write_bytes, pages, buf);
+                                          write_bytes, pages, &i);
                if (ret == 0) {
                        dirty_and_release_pages(NULL, root, file, pages,
                                                num_pages, pos, write_bytes);
@@ -1012,8 +1026,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                        btrfs_throttle(root);
                }
 
-               buf += write_bytes;
-               count -= write_bytes;
                pos += write_bytes;
                num_written += write_bytes;
 
@@ -1206,7 +1218,7 @@ const struct file_operations btrfs_file_operations = {
        .read           = do_sync_read,
        .aio_read       = generic_file_aio_read,
        .splice_read    = generic_file_splice_read,
-       .write          = btrfs_file_write,
+       .aio_write      = btrfs_file_aio_write,
        .mmap           = btrfs_file_mmap,
        .open           = generic_file_open,
        .release        = btrfs_release_file,