Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 17 Mar 2016 23:31:18 +0000 (16:31 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 17 Mar 2016 23:31:18 +0000 (16:31 -0700)
Pull ext4 updates from Ted Ts'o:
 "Performance improvements in SEEK_DATA and xattr scalability
  improvements, plus a lot of clean ups and bug fixes"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (38 commits)
  ext4: clean up error handling in the MMP support
  jbd2: do not fail journal because of frozen_buffer allocation failure
  ext4: use __GFP_NOFAIL in ext4_free_blocks()
  ext4: fix compile error while opening the macro DOUBLE_CHECK
  ext4: print ext4 mount option data_err=abort correctly
  ext4: fix NULL pointer dereference in ext4_mark_inode_dirty()
  ext4: drop unneeded BUFFER_TRACE in ext4_delete_inline_entry()
  ext4: fix misspellings in comments.
  jbd2: fix FS corruption possibility in jbd2_journal_destroy() on umount path
  ext4: more efficient SEEK_DATA implementation
  ext4: cleanup handling of bh->b_state in DAX mmap
  ext4: return hole from ext4_map_blocks()
  ext4: factor out determining of hole size
  ext4: fix setting of referenced bit in ext4_es_lookup_extent()
  ext4: remove i_ioend_count
  ext4: simplify io_end handling for AIO DIO
  ext4: move trans handling and completion deferal out of _ext4_get_block
  ext4: rename and split get blocks functions
  ext4: use i_mutex to serialize unaligned AIO DIO
  ext4: pack ioend structure better
  ...

1  2 
fs/ext4/file.c
fs/ext4/inode.c

diff --combined fs/ext4/file.c
@@@ -93,31 -93,29 +93,29 @@@ ext4_file_write_iter(struct kiocb *iocb
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(iocb->ki_filp);
-       struct mutex *aio_mutex = NULL;
        struct blk_plug plug;
        int o_direct = iocb->ki_flags & IOCB_DIRECT;
+       int unaligned_aio = 0;
        int overwrite = 0;
        ssize_t ret;
  
+       inode_lock(inode);
+       ret = generic_write_checks(iocb, from);
+       if (ret <= 0)
+               goto out;
        /*
-        * Unaligned direct AIO must be serialized; see comment above
-        * In the case of O_APPEND, assume that we must always serialize
+        * Unaligned direct AIO must be serialized among each other as zeroing
+        * of partial blocks of two competing unaligned AIOs can result in data
+        * corruption.
         */
-       if (o_direct &&
-           ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+       if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
            !is_sync_kiocb(iocb) &&
-           (iocb->ki_flags & IOCB_APPEND ||
-            ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
-               aio_mutex = ext4_aio_mutex(inode);
-               mutex_lock(aio_mutex);
+           ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
+               unaligned_aio = 1;
                ext4_unwritten_wait(inode);
        }
  
-       inode_lock(inode);
-       ret = generic_write_checks(iocb, from);
-       if (ret <= 0)
-               goto out;
        /*
         * If we have encountered a bitmap-format file, the size limit
         * is smaller than s_maxbytes, which is for extent-mapped files.
                blk_start_plug(&plug);
  
                /* check whether we do a DIO overwrite or not */
-               if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+               if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
                    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
                        struct ext4_map_blocks map;
                        unsigned int blkbits = inode->i_blkbits;
        if (o_direct)
                blk_finish_plug(&plug);
  
-       if (aio_mutex)
-               mutex_unlock(aio_mutex);
        return ret;
  
  out:
        inode_unlock(inode);
-       if (aio_mutex)
-               mutex_unlock(aio_mutex);
        return ret;
  }
  
@@@ -262,8 -256,23 +256,8 @@@ static int ext4_dax_pmd_fault(struct vm
        return result;
  }
  
 -static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 -{
 -      int err;
 -      struct inode *inode = file_inode(vma->vm_file);
 -
 -      sb_start_pagefault(inode->i_sb);
 -      file_update_time(vma->vm_file);
 -      down_read(&EXT4_I(inode)->i_mmap_sem);
 -      err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
 -      up_read(&EXT4_I(inode)->i_mmap_sem);
 -      sb_end_pagefault(inode->i_sb);
 -
 -      return err;
 -}
 -
  /*
 - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
 + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
   * handler we check for races against truncate. Note that since we cycle through
   * i_mmap_sem, we are sure that also any hole punching that began before we
   * were called is finished by now and so if it included part of the file we
@@@ -296,7 -305,7 +290,7 @@@ static int ext4_dax_pfn_mkwrite(struct 
  static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
        .pmd_fault      = ext4_dax_pmd_fault,
 -      .page_mkwrite   = ext4_dax_mkwrite,
 +      .page_mkwrite   = ext4_dax_fault,
        .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
  };
  #else
@@@ -417,7 -426,7 +411,7 @@@ static int ext4_file_open(struct inode 
   */
  static int ext4_find_unwritten_pgoff(struct inode *inode,
                                     int whence,
-                                    struct ext4_map_blocks *map,
+                                    ext4_lblk_t end_blk,
                                     loff_t *offset)
  {
        struct pagevec pvec;
        blkbits = inode->i_sb->s_blocksize_bits;
        startoff = *offset;
        lastoff = startoff;
-       endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+       endoff = (loff_t)end_blk << blkbits;
  
        index = startoff >> PAGE_CACHE_SHIFT;
        end = endoff >> PAGE_CACHE_SHIFT;
@@@ -550,12 -559,11 +544,11 @@@ out
  static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
  {
        struct inode *inode = file->f_mapping->host;
-       struct ext4_map_blocks map;
        struct extent_status es;
        ext4_lblk_t start, last, end;
        loff_t dataoff, isize;
        int blkbits;
-       int ret = 0;
+       int ret;
  
        inode_lock(inode);
  
        dataoff = offset;
  
        do {
-               map.m_lblk = last;
-               map.m_len = end - last + 1;
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
-                       if (last != start)
-                               dataoff = (loff_t)last << blkbits;
-                       break;
+               ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+               if (ret <= 0) {
+                       /* No extent found -> no data */
+                       if (ret == 0)
+                               ret = -ENXIO;
+                       inode_unlock(inode);
+                       return ret;
                }
  
-               /*
-                * If there is a delay extent at this offset,
-                * it will be as a data.
-                */
-               ext4_es_find_delayed_extent_range(inode, last, last, &es);
-               if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
-                       if (last != start)
-                               dataoff = (loff_t)last << blkbits;
+               last = es.es_lblk;
+               if (last != start)
+                       dataoff = (loff_t)last << blkbits;
+               if (!ext4_es_is_unwritten(&es))
                        break;
-               }
  
                /*
                 * If there is an unwritten extent at this offset, it is
                 * treated as data or as a hole depending on whether the
                 * page cache holds data for it.
                 */
-               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-                       int unwritten;
-                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
-                                                             &map, &dataoff);
-                       if (unwritten)
-                               break;
-               }
-               last++;
+               if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+                                             es.es_lblk + es.es_len, &dataoff))
+                       break;
+               last += es.es_len;
                dataoff = (loff_t)last << blkbits;
+               cond_resched();
        } while (last <= end);
  
        inode_unlock(inode);
  static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
  {
        struct inode *inode = file->f_mapping->host;
-       struct ext4_map_blocks map;
        struct extent_status es;
        ext4_lblk_t start, last, end;
        loff_t holeoff, isize;
        int blkbits;
-       int ret = 0;
+       int ret;
  
        inode_lock(inode);
  
        holeoff = offset;
  
        do {
-               map.m_lblk = last;
-               map.m_len = end - last + 1;
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
-                       last += ret;
-                       holeoff = (loff_t)last << blkbits;
-                       continue;
+               ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+               if (ret < 0) {
+                       inode_unlock(inode);
+                       return ret;
                }
-               /*
-                * If there is a delay extent at this offset,
-                * we will skip this extent.
-                */
-               ext4_es_find_delayed_extent_range(inode, last, last, &es);
-               if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
-                       last = es.es_lblk + es.es_len;
-                       holeoff = (loff_t)last << blkbits;
-                       continue;
+               /* Found a hole? */
+               if (ret == 0 || es.es_lblk > last) {
+                       if (last != start)
+                               holeoff = (loff_t)last << blkbits;
+                       break;
                }
                /*
                 * If there is an unwritten extent at this offset, it is
                 * treated as data or as a hole depending on whether the
                 * page cache holds data for it.
                 */
-               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-                       int unwritten;
-                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
-                                                             &map, &holeoff);
-                       if (!unwritten) {
-                               last += ret;
-                               holeoff = (loff_t)last << blkbits;
-                               continue;
-                       }
-               }
+               if (ext4_es_is_unwritten(&es) &&
+                   ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+                                             last + es.es_len, &holeoff))
+                       break;
  
-               /* find a hole */
-               break;
+               last += es.es_len;
+               holeoff = (loff_t)last << blkbits;
+               cond_resched();
        } while (last <= end);
  
        inode_unlock(inode);
diff --combined fs/ext4/inode.c
@@@ -216,7 -216,6 +216,6 @@@ void ext4_evict_inode(struct inode *ino
                }
                truncate_inode_pages_final(&inode->i_data);
  
-               WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
                goto no_delete;
        }
  
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages_final(&inode->i_data);
  
-       WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
        /*
         * Protect us against freezing - iput() caller didn't have to have any
         * protection against it
@@@ -458,13 -455,13 +455,13 @@@ static void ext4_map_blocks_es_recheck(
   * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
   * based files
   *
-  * On success, it returns the number of blocks being mapped or allocated.
-  * if create==0 and the blocks are pre-allocated and unwritten block,
-  * the result buffer head is unmapped. If the create ==1, it will make sure
-  * the buffer head is mapped.
+  * On success, it returns the number of blocks being mapped or allocated.  If
+  * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
+  * is marked as unwritten. If create == 1, it will mark @map as mapped.
   *
   * It returns 0 if plain look up failed (blocks have not been allocated), in
-  * that case, buffer head is unmapped
+  * that case, @map is returned as unmapped but we still do fill map->m_len to
+  * indicate the length of a hole starting at map->m_lblk.
   *
   * It returns the error in case of allocation failure.
   */
@@@ -507,6 -504,11 +504,11 @@@ int ext4_map_blocks(handle_t *handle, s
                                retval = map->m_len;
                        map->m_len = retval;
                } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+                       map->m_pblk = 0;
+                       retval = es.es_len - (map->m_lblk - es.es_lblk);
+                       if (retval > map->m_len)
+                               retval = map->m_len;
+                       map->m_len = retval;
                        retval = 0;
                } else {
                        BUG_ON(1);
@@@ -714,16 -716,11 +716,11 @@@ static void ext4_update_bh_state(struc
                 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
  }
  
- /* Maximum number of blocks we map for direct IO at once. */
- #define DIO_MAX_BLOCKS 4096
  static int _ext4_get_block(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int flags)
  {
-       handle_t *handle = ext4_journal_current_handle();
        struct ext4_map_blocks map;
-       int ret = 0, started = 0;
-       int dio_credits;
+       int ret = 0;
  
        if (ext4_has_inline_data(inode))
                return -ERANGE;
        map.m_lblk = iblock;
        map.m_len = bh->b_size >> inode->i_blkbits;
  
-       if (flags && !handle) {
-               /* Direct IO write... */
-               if (map.m_len > DIO_MAX_BLOCKS)
-                       map.m_len = DIO_MAX_BLOCKS;
-               dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-                                           dio_credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       return ret;
-               }
-               started = 1;
-       }
-       ret = ext4_map_blocks(handle, inode, &map, flags);
+       ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
+                             flags);
        if (ret > 0) {
-               ext4_io_end_t *io_end = ext4_inode_aio(inode);
                map_bh(bh, inode->i_sb, map.m_pblk);
                ext4_update_bh_state(bh, map.m_flags);
-               if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
-                       set_buffer_defer_completion(bh);
                bh->b_size = inode->i_sb->s_blocksize * map.m_len;
                ret = 0;
        }
-       if (started)
-               ext4_journal_stop(handle);
        return ret;
  }
  
@@@ -768,6 -746,155 +746,155 @@@ int ext4_get_block(struct inode *inode
                               create ? EXT4_GET_BLOCKS_CREATE : 0);
  }
  
+ /*
+  * Get block function used when preparing for buffered write if we require
+  * creating an unwritten extent if blocks haven't been allocated.  The extent
+  * will be converted to written after the IO is complete.
+  */
+ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
+                            struct buffer_head *bh_result, int create)
+ {
+       ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
+                  inode->i_ino, create);
+       return _ext4_get_block(inode, iblock, bh_result,
+                              EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ }
+ /* Maximum number of blocks we map for direct IO at once. */
+ #define DIO_MAX_BLOCKS 4096
+ static handle_t *start_dio_trans(struct inode *inode,
+                                struct buffer_head *bh_result)
+ {
+       int dio_credits;
+       /* Trim mapping request to maximum we can map at once for DIO */
+       if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
+               bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
+       dio_credits = ext4_chunk_trans_blocks(inode,
+                                     bh_result->b_size >> inode->i_blkbits);
+       return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+ }
+ /* Get block function for DIO reads and writes to inodes without extents */
+ int ext4_dio_get_block(struct inode *inode, sector_t iblock,
+                      struct buffer_head *bh, int create)
+ {
+       handle_t *handle;
+       int ret;
+       /* We don't expect handle for direct IO */
+       WARN_ON_ONCE(ext4_journal_current_handle());
+       if (create) {
+               handle = start_dio_trans(inode, bh);
+               if (IS_ERR(handle))
+                       return PTR_ERR(handle);
+       }
+       ret = _ext4_get_block(inode, iblock, bh,
+                             create ? EXT4_GET_BLOCKS_CREATE : 0);
+       if (create)
+               ext4_journal_stop(handle);
+       return ret;
+ }
+ /*
+  * Get block function for AIO DIO writes when we create unwritten extent if
+  * blocks are not allocated yet. The extent will be converted to written
+  * after IO is complete.
+  */
+ static int ext4_dio_get_block_unwritten_async(struct inode *inode,
+               sector_t iblock, struct buffer_head *bh_result, int create)
+ {
+       handle_t *handle;
+       int ret;
+       /* We don't expect handle for direct IO */
+       WARN_ON_ONCE(ext4_journal_current_handle());
+       handle = start_dio_trans(inode, bh_result);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+       ret = _ext4_get_block(inode, iblock, bh_result,
+                             EXT4_GET_BLOCKS_IO_CREATE_EXT);
+       ext4_journal_stop(handle);
+       /*
+        * When doing DIO using unwritten extents, we need io_end to convert
+        * unwritten extents to written on IO completion. We allocate io_end
+        * once we spot unwritten extent and store it in b_private. Generic
+        * DIO code keeps b_private set and furthermore passes the value to
+        * our completion callback in 'private' argument.
+        */
+       if (!ret && buffer_unwritten(bh_result)) {
+               if (!bh_result->b_private) {
+                       ext4_io_end_t *io_end;
+                       io_end = ext4_init_io_end(inode, GFP_KERNEL);
+                       if (!io_end)
+                               return -ENOMEM;
+                       bh_result->b_private = io_end;
+                       ext4_set_io_unwritten_flag(inode, io_end);
+               }
+               set_buffer_defer_completion(bh_result);
+       }
+       return ret;
+ }
+ /*
+  * Get block function for non-AIO DIO writes when we create unwritten extent if
+  * blocks are not allocated yet. The extent will be converted to written
+  * after IO is complete from ext4_ext_direct_IO() function.
+  */
+ static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
+               sector_t iblock, struct buffer_head *bh_result, int create)
+ {
+       handle_t *handle;
+       int ret;
+       /* We don't expect handle for direct IO */
+       WARN_ON_ONCE(ext4_journal_current_handle());
+       handle = start_dio_trans(inode, bh_result);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+       ret = _ext4_get_block(inode, iblock, bh_result,
+                             EXT4_GET_BLOCKS_IO_CREATE_EXT);
+       ext4_journal_stop(handle);
+       /*
+        * Mark inode as having pending DIO writes to unwritten extents.
+        * ext4_ext_direct_IO() checks this flag and converts extents to
+        * written.
+        */
+       if (!ret && buffer_unwritten(bh_result))
+               ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+       return ret;
+ }
+ static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
+                  struct buffer_head *bh_result, int create)
+ {
+       int ret;
+       ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
+                  inode->i_ino, create);
+       /* We don't expect handle for direct IO */
+       WARN_ON_ONCE(ext4_journal_current_handle());
+       ret = _ext4_get_block(inode, iblock, bh_result, 0);
+       /*
+        * Blocks should have been preallocated! ext4_file_write_iter() checks
+        * that.
+        */
+       WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
+       return ret;
+ }
  /*
   * `handle' can be NULL if create is zero
   */
@@@ -1079,13 -1206,14 +1206,14 @@@ retry_journal
  #ifdef CONFIG_EXT4_FS_ENCRYPTION
        if (ext4_should_dioread_nolock(inode))
                ret = ext4_block_write_begin(page, pos, len,
-                                            ext4_get_block_write);
+                                            ext4_get_block_unwritten);
        else
                ret = ext4_block_write_begin(page, pos, len,
                                             ext4_get_block);
  #else
        if (ext4_should_dioread_nolock(inode))
-               ret = __block_write_begin(page, pos, len, ext4_get_block_write);
+               ret = __block_write_begin(page, pos, len,
+                                         ext4_get_block_unwritten);
        else
                ret = __block_write_begin(page, pos, len, ext4_get_block);
  #endif
@@@ -2478,10 -2606,6 +2606,10 @@@ static int ext4_writepages(struct addre
  
        trace_ext4_writepages(inode, wbc);
  
 +      if (dax_mapping(mapping))
 +              return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
 +                                                 wbc);
 +
        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
@@@ -3088,37 -3212,6 +3216,6 @@@ static int ext4_releasepage(struct pag
                return try_to_free_buffers(page);
  }
  
- /*
-  * ext4_get_block used when preparing for a DIO write or buffer write.
-  * We allocate an uinitialized extent if blocks haven't been allocated.
-  * The extent will be converted to initialized after the IO is complete.
-  */
- int ext4_get_block_write(struct inode *inode, sector_t iblock,
-                  struct buffer_head *bh_result, int create)
- {
-       ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       return _ext4_get_block(inode, iblock, bh_result,
-                              EXT4_GET_BLOCKS_IO_CREATE_EXT);
- }
- static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
-                  struct buffer_head *bh_result, int create)
- {
-       int ret;
-       ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       ret = _ext4_get_block(inode, iblock, bh_result, 0);
-       /*
-        * Blocks should have been preallocated! ext4_file_write_iter() checks
-        * that.
-        */
-       WARN_ON_ONCE(!buffer_mapped(bh_result));
-       return ret;
- }
  #ifdef CONFIG_FS_DAX
  int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
                            struct buffer_head *bh_result, int create)
        WARN_ON_ONCE(ret == 0 && create);
        if (ret > 0) {
                map_bh(bh_result, inode->i_sb, map.m_pblk);
-               bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
-                                       map.m_flags;
                /*
                 * At least for now we have to clear BH_New so that DAX code
                 * doesn't attempt to zero blocks again in a racy way.
                 */
-               bh_result->b_state &= ~(1 << BH_New);
+               map.m_flags &= ~EXT4_MAP_NEW;
+               ext4_update_bh_state(bh_result, map.m_flags);
                bh_result->b_size = map.m_len << inode->i_blkbits;
                ret = 0;
        }
  static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                            ssize_t size, void *private)
  {
-         ext4_io_end_t *io_end = iocb->private;
+         ext4_io_end_t *io_end = private;
  
        /* if not async direct IO just return */
        if (!io_end)
  
        ext_debug("ext4_end_io_dio(): io_end 0x%p "
                  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-                 iocb->private, io_end->inode->i_ino, iocb, offset,
-                 size);
+                 io_end, io_end->inode->i_ino, iocb, offset, size);
  
-       iocb->private = NULL;
        io_end->offset = offset;
        io_end->size = size;
        ext4_put_io_end(io_end);
@@@ -3243,7 -3333,6 +3337,6 @@@ static ssize_t ext4_ext_direct_IO(struc
        get_block_t *get_block_func = NULL;
        int dio_flags = 0;
        loff_t final_size = offset + count;
-       ext4_io_end_t *io_end = NULL;
  
        /* Use the old path for reads and writes beyond i_size. */
        if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
        /*
         * We could direct write to holes and fallocate.
         *
-        * Allocated blocks to fill the hole are marked as
-        * unwritten to prevent parallel buffered read to expose
-        * the stale data before DIO complete the data IO.
+        * Allocated blocks to fill the hole are marked as unwritten to prevent
+        * parallel buffered read to expose the stale data before DIO complete
+        * the data IO.
         *
-        * As to previously fallocated extents, ext4 get_block will
-        * just simply mark the buffer mapped but still keep the
-        * extents unwritten.
+        * As to previously fallocated extents, ext4 get_block will just simply
+        * mark the buffer mapped but still keep the extents unwritten.
         *
-        * For non AIO case, we will convert those unwritten extents
-        * to written after return back from blockdev_direct_IO.
+        * For non AIO case, we will convert those unwritten extents to written
+        * after return back from blockdev_direct_IO. That way we save us from
+        * allocating io_end structure and also the overhead of offloading
+        * the extent conversion to a workqueue.
         *
         * For async DIO, the conversion needs to be deferred when the
         * IO is completed. The ext4 end_io callback function will be
         * case, we allocate an io_end structure to hook to the iocb.
         */
        iocb->private = NULL;
-       if (overwrite) {
-               get_block_func = ext4_get_block_overwrite;
+       if (overwrite)
+               get_block_func = ext4_dio_get_block_overwrite;
+       else if (is_sync_kiocb(iocb)) {
+               get_block_func = ext4_dio_get_block_unwritten_sync;
+               dio_flags = DIO_LOCKING;
        } else {
-               ext4_inode_aio_set(inode, NULL);
-               if (!is_sync_kiocb(iocb)) {
-                       io_end = ext4_init_io_end(inode, GFP_NOFS);
-                       if (!io_end) {
-                               ret = -ENOMEM;
-                               goto retake_lock;
-                       }
-                       /*
-                        * Grab reference for DIO. Will be dropped in
-                        * ext4_end_io_dio()
-                        */
-                       iocb->private = ext4_get_io_end(io_end);
-                       /*
-                        * we save the io structure for current async direct
-                        * IO, so that later ext4_map_blocks() could flag the
-                        * io structure whether there is a unwritten extents
-                        * needs to be converted when IO is completed.
-                        */
-                       ext4_inode_aio_set(inode, io_end);
-               }
-               get_block_func = ext4_get_block_write;
+               get_block_func = ext4_dio_get_block_unwritten_async;
                dio_flags = DIO_LOCKING;
        }
  #ifdef CONFIG_EXT4_FS_ENCRYPTION
                                           get_block_func,
                                           ext4_end_io_dio, NULL, dio_flags);
  
-       /*
-        * Put our reference to io_end. This can free the io_end structure e.g.
-        * in sync IO case or in case of error. It can even perform extent
-        * conversion if all bios we submitted finished before we got here.
-        * Note that in that case iocb->private can be already set to NULL
-        * here.
-        */
-       if (io_end) {
-               ext4_inode_aio_set(inode, NULL);
-               ext4_put_io_end(io_end);
-               /*
-                * When no IO was submitted ext4_end_io_dio() was not
-                * called so we have to put iocb's reference.
-                */
-               if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
-                       WARN_ON(iocb->private != io_end);
-                       WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
-                       ext4_put_io_end(io_end);
-                       iocb->private = NULL;
-               }
-       }
        if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
                                                EXT4_STATE_DIO_UNWRITTEN)) {
                int err;
                ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
        }
  
- retake_lock:
        if (iov_iter_rw(iter) == WRITE)
                inode_dio_end(inode);
        /* take i_mutex locking again if we do an overwrite dio */
@@@ -4159,7 -4210,7 +4214,7 @@@ void ext4_set_inode_flags(struct inode 
                new_fl |= S_NOATIME;
        if (flags & EXT4_DIRSYNC_FL)
                new_fl |= S_DIRSYNC;
 -      if (test_opt(inode->i_sb, DAX))
 +      if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
                new_fl |= S_DAX;
        inode_set_flags(inode, new_fl,
                        S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
@@@ -5261,6 -5312,8 +5316,8 @@@ int ext4_mark_inode_dirty(handle_t *han
        might_sleep();
        trace_ext4_mark_inode_dirty(inode, _RET_IP_);
        err = ext4_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               return err;
        if (ext4_handle_valid(handle) &&
            EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
            !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
                        }
                }
        }
-       if (!err)
-               err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-       return err;
+       return ext4_mark_iloc_dirty(handle, inode, &iloc);
  }
  
  /*
@@@ -5502,7 -5553,7 +5557,7 @@@ int ext4_page_mkwrite(struct vm_area_st
        unlock_page(page);
        /* OK, we need to fill the hole... */
        if (ext4_should_dioread_nolock(inode))
-               get_block = ext4_get_block_write;
+               get_block = ext4_get_block_unwritten;
        else
                get_block = ext4_get_block;
  retry_alloc:
@@@ -5545,3 -5596,70 +5600,70 @@@ int ext4_filemap_fault(struct vm_area_s
  
        return err;
  }
+ /*
+  * Find the first extent at or after @lblk in an inode that is not a hole.
+  * Search for @map_len blocks at most. The extent is returned in @result.
+  *
+  * The function returns 1 if we found an extent. The function returns 0 in
+  * case there is no extent at or after @lblk and in that case also sets
+  * @result->es_len to 0. In case of error, the error code is returned.
+  */
+ int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+                        unsigned int map_len, struct extent_status *result)
+ {
+       struct ext4_map_blocks map;
+       struct extent_status es = {};
+       int ret;
+       map.m_lblk = lblk;
+       map.m_len = map_len;
+       /*
+        * For non-extent based files this loop may iterate several times since
+        * we do not determine full hole size.
+        */
+       while (map.m_len > 0) {
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+               if (ret < 0)
+                       return ret;
+               /* Is there an extent covering m_lblk? Just return it. */
+               if (ret > 0) {
+                       int status;
+                       ext4_es_store_pblock(result, map.m_pblk);
+                       result->es_lblk = map.m_lblk;
+                       result->es_len = map.m_len;
+                       if (map.m_flags & EXT4_MAP_UNWRITTEN)
+                               status = EXTENT_STATUS_UNWRITTEN;
+                       else
+                               status = EXTENT_STATUS_WRITTEN;
+                       ext4_es_store_status(result, status);
+                       return 1;
+               }
+               ext4_es_find_delayed_extent_range(inode, map.m_lblk,
+                                                 map.m_lblk + map.m_len - 1,
+                                                 &es);
+               /* Is delalloc data before next block in extent tree? */
+               if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
+                       ext4_lblk_t offset = 0;
+                       if (es.es_lblk < lblk)
+                               offset = lblk - es.es_lblk;
+                       result->es_lblk = es.es_lblk + offset;
+                       ext4_es_store_pblock(result,
+                                            ext4_es_pblock(&es) + offset);
+                       result->es_len = es.es_len - offset;
+                       ext4_es_store_status(result, ext4_es_status(&es));
+                       return 1;
+               }
+               /* There's a hole at m_lblk, advance past it */
+               map.m_lblk += map.m_len;
+               map_len -= map.m_len;
+               map.m_len = map_len;
+               cond_resched();
+       }
+       result->es_len = 0;
+       return 0;
+ }