Merge tag 'xfs-5.12-merge-5' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Author:     Linus Torvalds <torvalds@linux-foundation.org>
AuthorDate: Sun, 21 Feb 2021 18:34:36 +0000 (10:34 -0800)
Commit:     Linus Torvalds <torvalds@linux-foundation.org>
CommitDate: Sun, 21 Feb 2021 18:34:36 +0000 (10:34 -0800)
Pull xfs updates from Darrick Wong:
 "There's a lot going on this time, which seems about right for this
  drama-filled year.

  Community developers added some code to speed up freezing when
  read-only workloads are still running, refactored the logging code,
  added checks to prevent file extent counter overflow, reduced iolock
  cycling to speed up fsync and gc scans, and started the slow march
  towards supporting filesystem shrinking.

  There's a huge refactoring of the internal speculative preallocation
  garbage collection code which fixes a bunch of bugs, makes the gc
  scheduling per-AG and hence multithreaded, and standardizes the retry
  logic when we try to reserve space or quota, can't, and want to
  trigger a gc scan. We also enable multithreaded quotacheck to reduce
  mount times further. This is also preparation for background file gc,
  which may or may not land for 5.13.

  We also fixed some deadlocks in the rename code, fixed a quota
  accounting leak when FSSETXATTR fails, restored the behavior that
  write faults to an mmap'd region actually cause a SIGBUS, fixed a bug
  where sgid directory inheritance wasn't quite working properly, and
  fixed a bug where symlinks weren't working properly in ecryptfs. We
  also now advertise the inode btree counters feature that was
  introduced two cycles ago.

  Summary:

   - Fix an ABBA deadlock when renaming files on overlayfs.

   - Make sure that we can't overflow the inode extent counters when
     adding to or removing extents from a file.

   - Make directory sgid inheritance work the same way as all the other
     filesystems.

   - Don't drain the buffer cache on freeze and ro remount, which should
     reduce the amount of time a freeze takes when read-only workloads
     keep running during the freeze.

   - Fix a bug where symlink size isn't reported to the vfs in ecryptfs.

   - Disentangle log cleaning from log covering. This refactoring sets
     us up for future changes to the log, though for now it simply means
     that we can use covering for freezes, and cleaning becomes
     something we only do at unmount.

   - Speed up file fsyncs by reducing iolock cycling.

   - Fix delalloc blocks leaking when changing the project id fails
     because of input validation errors in FSSETXATTR.

   - Fix oversized quota reservation when converting unwritten extents
     during a DAX write.

   - Create a transaction allocation helper function to standardize the
     idiom of allocating a transaction, reserving blocks, locking
     inodes, and reserving quota. Convert all the open-coded logic for
     file creation, file ownership changes, and file modifications to
     use it (see the sketch after this summary).

   - Actually shut down the fs if the incore quota reservations get
     corrupted.

   - Fix background block garbage collection scans to not block and to
     actually clean out CoW staging extents properly.

   - Run block gc scans when we run low on project quota.

   - Use the standardized transaction allocation helpers to make it so
     that ENOSPC and EDQUOT errors during reservation back out, invoke
     the block gc scanner, and try again (see the retry sketch after
     this summary). This is preparation for introducing background
     inode garbage collection in the next cycle.

   - Combine speculative post-EOF block garbage collection with
     speculative copy on write block garbage collection.

   - Enable multithreaded quotacheck.

   - Allow sysadmins to tweak the CPU affinities and maximum concurrency
     levels of quotacheck and background blockgc worker pools.

   - Expose the inode btree counter feature in the fs geometry ioctl.

   - Cleanups of the growfs code in preparation for starting work on
     filesystem shrinking.

   - Fix all the bloody gcc warnings that the maintainer knows about. :P

   - Fix a RST syntax error.

   - Don't trigger bmbt corruption assertions after the fs shuts down.

   - Restore behavior of forcing SIGBUS on a shut down filesystem when
     someone triggers a mmap write fault (or really, any buffered
     write)"

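   Similarly, the retry sketch referenced above is a condensed paraphrase
   of the xfs_file_buffered_write() hunk further down, showing how EDQUOT
   and ENOSPC now kick a synchronous blockgc scan and retry the write once:

   write_retry:
        iolock = XFS_IOLOCK_EXCL;
        xfs_ilock(ip, iolock);
        /* ... write checks elided ... */
        ret = iomap_file_buffered_write(iocb, from,
                        &xfs_buffered_write_iomap_ops);

        if (ret == -EDQUOT && !cleared_space) {
                /* out of quota: reclaim speculative preallocations
                 * charged to this inode's quotas, then retry */
                xfs_iunlock(ip, iolock);
                xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC);
                cleared_space = true;
                goto write_retry;
        } else if (ret == -ENOSPC && !cleared_space) {
                /* out of space: flush dirty data and reclaim post-EOF
                 * and CoW staging blocks filesystem-wide, then retry */
                struct xfs_eofblocks eofb = {0};

                cleared_space = true;
                xfs_flush_inodes(ip->i_mount);
                xfs_iunlock(ip, iolock);
                eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
                xfs_blockgc_free_space(ip->i_mount, &eofb);
                goto write_retry;
        }
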
* tag 'xfs-5.12-merge-5' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (85 commits)
  xfs: consider shutdown in bmapbt cursor delete assert
  xfs: fix boolreturn.cocci warnings
  xfs: restore shutdown check in mapped write fault path
  xfs: fix rst syntax error in admin guide
  xfs: fix incorrect root dquot corruption error when switching group/project quota types
  xfs: get rid of xfs_growfs_{data,log}_t
  xfs: rename `new' to `delta' in xfs_growfs_data_private()
  libxfs: expose inobtcount in xfs geometry
  xfs: don't bounce the iolock between free_{eof,cow}blocks
  xfs: expose the blockgc workqueue knobs publicly
  xfs: parallelize block preallocation garbage collection
  xfs: rename block gc start and stop functions
  xfs: only walk the incore inode tree once per blockgc scan
  xfs: consolidate the eofblocks and cowblocks workers
  xfs: consolidate incore inode radix tree posteof/cowblocks tags
  xfs: remove trivial eof/cowblocks functions
  xfs: hide xfs_icache_free_cowblocks
  xfs: hide xfs_icache_free_eofblocks
  xfs: relocate the eofb/cowb workqueue functions
  xfs: set WQ_SYSFS on all workqueues in debug mode
  ...

fs/xfs/xfs_file.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_trace.h

diff --combined fs/xfs/xfs_file.c
@@@ -118,6 -118,54 +118,54 @@@ xfs_dir_fsync
        return xfs_log_force_inode(ip);
  }
  
+ static xfs_lsn_t
+ xfs_fsync_lsn(
+       struct xfs_inode        *ip,
+       bool                    datasync)
+ {
+       if (!xfs_ipincount(ip))
+               return 0;
+       if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+               return 0;
+       return ip->i_itemp->ili_last_lsn;
+ }
+ /*
+  * All metadata updates are logged, which means that we just have to flush the
+  * log up to the latest LSN that touched the inode.
+  *
+  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
+  * the log force before we clear the ili_fsync_fields field. This ensures that
+  * we don't get a racing sync operation that does not wait for the metadata to
+  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
+  * then all that will happen is the log force will do nothing as the lsn will
+  * already be on disk.  We can't race with setting ili_fsync_fields because that
+  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
+  * shared until after the ili_fsync_fields is cleared.
+  */
+ static int
+ xfs_fsync_flush_log(
+       struct xfs_inode        *ip,
+       bool                    datasync,
+       int                     *log_flushed)
+ {
+       int                     error = 0;
+       xfs_lsn_t               lsn;
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       lsn = xfs_fsync_lsn(ip, datasync);
+       if (lsn) {
+               error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC,
+                                         log_flushed);
+               spin_lock(&ip->i_itemp->ili_lock);
+               ip->i_itemp->ili_fsync_fields = 0;
+               spin_unlock(&ip->i_itemp->ili_lock);
+       }
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       return error;
+ }
  STATIC int
  xfs_file_fsync(
        struct file             *file,
        loff_t                  end,
        int                     datasync)
  {
-       struct inode            *inode = file->f_mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_inode_log_item *iip = ip->i_itemp;
+       struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
        struct xfs_mount        *mp = ip->i_mount;
        int                     error = 0;
        int                     log_flushed = 0;
-       xfs_lsn_t               lsn = 0;
  
        trace_xfs_file_fsync(ip);
  
                xfs_blkdev_issue_flush(mp->m_ddev_targp);
  
        /*
-        * All metadata updates are logged, which means that we just have to
-        * flush the log up to the latest LSN that touched the inode. If we have
-        * concurrent fsync/fdatasync() calls, we need them to all block on the
-        * log force before we clear the ili_fsync_fields field. This ensures
-        * that we don't get a racing sync operation that does not wait for the
-        * metadata to hit the journal before returning. If we race with
-        * clearing the ili_fsync_fields, then all that will happen is the log
-        * force will do nothing as the lsn will already be on disk. We can't
-        * race with setting ili_fsync_fields because that is done under
-        * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
-        * until after the ili_fsync_fields is cleared.
+        * Any inode that has dirty modifications in the log is pinned.  The
+        * racy check here for a pinned inode will not catch modifications
+        * that happen concurrently to the fsync call, but fsync semantics
+        * only require syncing previously completed I/O.
         */
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       if (xfs_ipincount(ip)) {
-               if (!datasync ||
-                   (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
-                       lsn = iip->ili_last_lsn;
-       }
-       if (lsn) {
-               error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
-               spin_lock(&iip->ili_lock);
-               iip->ili_fsync_fields = 0;
-               spin_unlock(&iip->ili_lock);
-       }
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       if (xfs_ipincount(ip))
+               error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
  
        /*
         * If we only have a single device, and the log force above was
        return error;
  }
  
 +static int
 +xfs_ilock_iocb(
 +      struct kiocb            *iocb,
 +      unsigned int            lock_mode)
 +{
 +      struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
 +
 +      if (iocb->ki_flags & IOCB_NOWAIT) {
 +              if (!xfs_ilock_nowait(ip, lock_mode))
 +                      return -EAGAIN;
 +      } else {
 +              xfs_ilock(ip, lock_mode);
 +      }
 +
 +      return 0;
 +}
 +
  STATIC ssize_t
 -xfs_file_dio_aio_read(
 +xfs_file_dio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
  {
        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
 -      size_t                  count = iov_iter_count(to);
        ssize_t                 ret;
  
 -      trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
 +      trace_xfs_file_direct_read(iocb, to);
  
 -      if (!count)
 +      if (!iov_iter_count(to))
                return 0; /* skip atime */
  
        file_accessed(iocb->ki_filp);
  
 -      if (iocb->ki_flags & IOCB_NOWAIT) {
 -              if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
 -                      return -EAGAIN;
 -      } else {
 -              xfs_ilock(ip, XFS_IOLOCK_SHARED);
 -      }
 -      ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
 -                      is_sync_kiocb(iocb));
 +      ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
 +      if (ret)
 +              return ret;
 +      ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
        return ret;
@@@ -244,16 -258,21 +270,16 @@@ xfs_file_dax_read
        struct iov_iter         *to)
  {
        struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
 -      size_t                  count = iov_iter_count(to);
        ssize_t                 ret = 0;
  
 -      trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
 +      trace_xfs_file_dax_read(iocb, to);
  
 -      if (!count)
 +      if (!iov_iter_count(to))
                return 0; /* skip atime */
  
 -      if (iocb->ki_flags & IOCB_NOWAIT) {
 -              if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
 -                      return -EAGAIN;
 -      } else {
 -              xfs_ilock(ip, XFS_IOLOCK_SHARED);
 -      }
 -
 +      ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
 +      if (ret)
 +              return ret;
        ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
  }
  
  STATIC ssize_t
 -xfs_file_buffered_aio_read(
 +xfs_file_buffered_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
  {
        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
        ssize_t                 ret;
  
 -      trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
 +      trace_xfs_file_buffered_read(iocb, to);
  
 -      if (iocb->ki_flags & IOCB_NOWAIT) {
 -              if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
 -                      return -EAGAIN;
 -      } else {
 -              xfs_ilock(ip, XFS_IOLOCK_SHARED);
 -      }
 +      ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
 +      if (ret)
 +              return ret;
        ret = generic_file_read_iter(iocb, to);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
@@@ -297,9 -319,9 +323,9 @@@ xfs_file_read_iter
        if (IS_DAX(inode))
                ret = xfs_file_dax_read(iocb, to);
        else if (iocb->ki_flags & IOCB_DIRECT)
 -              ret = xfs_file_dio_aio_read(iocb, to);
 +              ret = xfs_file_dio_read(iocb, to);
        else
 -              ret = xfs_file_buffered_aio_read(iocb, to);
 +              ret = xfs_file_buffered_read(iocb, to);
  
        if (ret > 0)
                XFS_STATS_ADD(mp, xs_read_bytes, ret);
   * if called for a direct write beyond i_size.
   */
  STATIC ssize_t
 -xfs_file_aio_write_checks(
 +xfs_file_write_checks(
        struct kiocb            *iocb,
        struct iov_iter         *from,
        int                     *iolock)
@@@ -332,14 -354,7 +358,14 @@@ restart
        if (error <= 0)
                return error;
  
 -      error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
 +      if (iocb->ki_flags & IOCB_NOWAIT) {
 +              error = break_layout(inode, false);
 +              if (error == -EWOULDBLOCK)
 +                      error = -EAGAIN;
 +      } else {
 +              error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
 +      }
 +
        if (error)
                return error;
  
        if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
                xfs_iunlock(ip, *iolock);
                *iolock = XFS_IOLOCK_EXCL;
 -              xfs_ilock(ip, *iolock);
 +              error = xfs_ilock_iocb(iocb, *iolock);
 +              if (error) {
 +                      *iolock = 0;
 +                      return error;
 +              }
                goto restart;
        }
        /*
        isize = i_size_read(inode);
        if (iocb->ki_pos > isize) {
                spin_unlock(&ip->i_flags_lock);
 +
 +              if (iocb->ki_flags & IOCB_NOWAIT)
 +                      return -EAGAIN;
 +
                if (!drained_dio) {
                        if (*iolock == XFS_IOLOCK_SHARED) {
                                xfs_iunlock(ip, *iolock);
        } else
                spin_unlock(&ip->i_flags_lock);
  
-       /*
-        * Updating the timestamps will grab the ilock again from
-        * xfs_fs_dirty_inode, so we have to call it after dropping the
-        * lock above.  Eventually we should look into a way to avoid
-        * the pointless lock roundtrip.
-        */
        return file_modified(file);
  }
  
@@@ -499,149 -500,122 +519,149 @@@ static const struct iomap_dio_ops xfs_d
  };
  
  /*
 - * xfs_file_dio_aio_write - handle direct IO writes
 - *
 - * Lock the inode appropriately to prepare for and issue a direct IO write.
 - * By separating it from the buffered write path we remove all the tricky to
 - * follow locking changes and looping.
 - *
 - * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 - * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 - * pages are flushed out.
 - *
 - * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 - * allowing them to be done in parallel with reads and other direct IO writes.
 - * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 - * needs to do sub-block zeroing and that requires serialisation against other
 - * direct IOs to the same block. In this case we need to serialise the
 - * submission of the unaligned IOs so that we don't get racing block zeroing in
 - * the dio layer.  To avoid the problem with aio, we also need to wait for
 - * outstanding IOs to complete so that unwritten extent conversion is completed
 - * before we try to map the overlapping block. This is currently implemented by
 - * hitting it with a big hammer (i.e. inode_dio_wait()).
 - *
 - * Returns with locks held indicated by @iolock and errors indicated by
 - * negative return values.
 + * Handle block aligned direct I/O writes
   */
 -STATIC ssize_t
 -xfs_file_dio_aio_write(
 +static noinline ssize_t
 +xfs_file_dio_write_aligned(
 +      struct xfs_inode        *ip,
        struct kiocb            *iocb,
        struct iov_iter         *from)
  {
 -      struct file             *file = iocb->ki_filp;
 -      struct address_space    *mapping = file->f_mapping;
 -      struct inode            *inode = mapping->host;
 -      struct xfs_inode        *ip = XFS_I(inode);
 -      struct xfs_mount        *mp = ip->i_mount;
 -      ssize_t                 ret = 0;
 -      int                     unaligned_io = 0;
 -      int                     iolock;
 -      size_t                  count = iov_iter_count(from);
 -      struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
 +      int                     iolock = XFS_IOLOCK_SHARED;
 +      ssize_t                 ret;
  
 -      /* DIO must be aligned to device logical sector size */
 -      if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 -              return -EINVAL;
 +      ret = xfs_ilock_iocb(iocb, iolock);
 +      if (ret)
 +              return ret;
 +      ret = xfs_file_write_checks(iocb, from, &iolock);
 +      if (ret)
 +              goto out_unlock;
  
        /*
 -       * Don't take the exclusive iolock here unless the I/O is unaligned to
 -       * the file system block size.  We don't need to consider the EOF
 -       * extension case here because xfs_file_aio_write_checks() will relock
 -       * the inode as necessary for EOF zeroing cases and fill out the new
 -       * inode size as appropriate.
 +       * We don't need to hold the IOLOCK exclusively across the IO, so demote
 +       * the iolock back to shared if we had to take the exclusive lock in
 +       * xfs_file_write_checks() for other reasons.
         */
 -      if ((iocb->ki_pos & mp->m_blockmask) ||
 -          ((iocb->ki_pos + count) & mp->m_blockmask)) {
 -              unaligned_io = 1;
 -
 -              /*
 -               * We can't properly handle unaligned direct I/O to reflink
 -               * files yet, as we can't unshare a partial block.
 -               */
 -              if (xfs_is_cow_inode(ip)) {
 -                      trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
 -                      return -ENOTBLK;
 -              }
 -              iolock = XFS_IOLOCK_EXCL;
 -      } else {
 +      if (iolock == XFS_IOLOCK_EXCL) {
 +              xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
                iolock = XFS_IOLOCK_SHARED;
        }
 +      trace_xfs_file_direct_write(iocb, from);
 +      ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
 +                         &xfs_dio_write_ops, 0);
 +out_unlock:
 +      if (iolock)
 +              xfs_iunlock(ip, iolock);
 +      return ret;
 +}
  
 -      if (iocb->ki_flags & IOCB_NOWAIT) {
 -              /* unaligned dio always waits, bail */
 -              if (unaligned_io)
 -                      return -EAGAIN;
 -              if (!xfs_ilock_nowait(ip, iolock))
 +/*
 + * Handle block unaligned direct I/O writes
 + *
 + * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 + * them to be done in parallel with reads and other direct I/O writes.  However,
 + * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 + * to do sub-block zeroing and that requires serialisation against other direct
 + * I/O to the same block.  In this case we need to serialise the submission of
 + * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 + * In the case where sub-block zeroing is not required, we can do concurrent
 + * sub-block dios to the same block successfully.
 + *
 + * Optimistically submit the I/O using the shared lock first, but use the
 + * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 + * if block allocation or partial block zeroing would be required.  In that case
 + * we try again with the exclusive lock.
 + */
 +static noinline ssize_t
 +xfs_file_dio_write_unaligned(
 +      struct xfs_inode        *ip,
 +      struct kiocb            *iocb,
 +      struct iov_iter         *from)
 +{
 +      size_t                  isize = i_size_read(VFS_I(ip));
 +      size_t                  count = iov_iter_count(from);
 +      int                     iolock = XFS_IOLOCK_SHARED;
 +      unsigned int            flags = IOMAP_DIO_OVERWRITE_ONLY;
 +      ssize_t                 ret;
 +
 +      /*
 +       * Extending writes need exclusivity because of the sub-block zeroing
 +       * that the DIO code always does for partial tail blocks beyond EOF, so
 +       * don't even bother trying the fast path in this case.
 +       */
 +      if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
 +retry_exclusive:
 +              if (iocb->ki_flags & IOCB_NOWAIT)
                        return -EAGAIN;
 -      } else {
 -              xfs_ilock(ip, iolock);
 +              iolock = XFS_IOLOCK_EXCL;
 +              flags = IOMAP_DIO_FORCE_WAIT;
        }
  
 -      ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 +      ret = xfs_ilock_iocb(iocb, iolock);
        if (ret)
 -              goto out;
 -      count = iov_iter_count(from);
 +              return ret;
  
        /*
 -       * If we are doing unaligned IO, we can't allow any other overlapping IO
 -       * in-flight at the same time or we risk data corruption. Wait for all
 -       * other IO to drain before we submit. If the IO is aligned, demote the
 -       * iolock if we had to take the exclusive lock in
 -       * xfs_file_aio_write_checks() for other reasons.
 +       * We can't properly handle unaligned direct I/O to reflink files yet,
 +       * as we can't unshare a partial block.
         */
 -      if (unaligned_io) {
 -              inode_dio_wait(inode);
 -      } else if (iolock == XFS_IOLOCK_EXCL) {
 -              xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 -              iolock = XFS_IOLOCK_SHARED;
 +      if (xfs_is_cow_inode(ip)) {
 +              trace_xfs_reflink_bounce_dio_write(iocb, from);
 +              ret = -ENOTBLK;
 +              goto out_unlock;
        }
  
 -      trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 +      ret = xfs_file_write_checks(iocb, from, &iolock);
 +      if (ret)
 +              goto out_unlock;
 +
        /*
 -       * If unaligned, this is the only IO in-flight. Wait on it before we
 -       * release the iolock to prevent subsequent overlapping IO.
 +       * If we are doing exclusive unaligned I/O, this must be the only I/O
 +       * in-flight.  Otherwise we risk data corruption due to unwritten extent
 +       * conversions from the AIO end_io handler.  Wait for all other I/O to
 +       * drain first.
         */
 +      if (flags & IOMAP_DIO_FORCE_WAIT)
 +              inode_dio_wait(VFS_I(ip));
 +
 +      trace_xfs_file_direct_write(iocb, from);
        ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
 -                         &xfs_dio_write_ops,
 -                         is_sync_kiocb(iocb) || unaligned_io);
 -out:
 -      xfs_iunlock(ip, iolock);
 +                         &xfs_dio_write_ops, flags);
  
        /*
 -       * No fallback to buffered IO after short writes for XFS, direct I/O
 -       * will either complete fully or return an error.
 +       * Retry unaligned I/O with exclusive blocking semantics if the DIO
 +       * layer rejected it for mapping or locking reasons. If we are doing
 +       * nonblocking user I/O, propagate the error.
         */
 -      ASSERT(ret < 0 || ret == count);
 +      if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
 +              ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
 +              xfs_iunlock(ip, iolock);
 +              goto retry_exclusive;
 +      }
 +
 +out_unlock:
 +      if (iolock)
 +              xfs_iunlock(ip, iolock);
        return ret;
  }
  
 +static ssize_t
 +xfs_file_dio_write(
 +      struct kiocb            *iocb,
 +      struct iov_iter         *from)
 +{
 +      struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
 +      struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
 +      size_t                  count = iov_iter_count(from);
 +
 +      /* direct I/O must be aligned to device logical sector size */
 +      if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 +              return -EINVAL;
 +      if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
 +              return xfs_file_dio_write_unaligned(ip, iocb, from);
 +      return xfs_file_dio_write_aligned(ip, iocb, from);
 +}
 +
  static noinline ssize_t
  xfs_file_dax_write(
        struct kiocb            *iocb,
        struct xfs_inode        *ip = XFS_I(inode);
        int                     iolock = XFS_IOLOCK_EXCL;
        ssize_t                 ret, error = 0;
 -      size_t                  count;
        loff_t                  pos;
  
 -      if (iocb->ki_flags & IOCB_NOWAIT) {
 -              if (!xfs_ilock_nowait(ip, iolock))
 -                      return -EAGAIN;
 -      } else {
 -              xfs_ilock(ip, iolock);
 -      }
 -
 -      ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 +      ret = xfs_ilock_iocb(iocb, iolock);
 +      if (ret)
 +              return ret;
 +      ret = xfs_file_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
  
        pos = iocb->ki_pos;
 -      count = iov_iter_count(from);
  
 -      trace_xfs_file_dax_write(ip, count, pos);
 +      trace_xfs_file_dax_write(iocb, from);
        ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
        if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
                i_size_write(inode, iocb->ki_pos);
                error = xfs_setfilesize(ip, pos, ret);
        }
  out:
 -      xfs_iunlock(ip, iolock);
 +      if (iolock)
 +              xfs_iunlock(ip, iolock);
        if (error)
                return error;
  
  }
  
  STATIC ssize_t
 -xfs_file_buffered_aio_write(
 +xfs_file_buffered_write(
        struct kiocb            *iocb,
        struct iov_iter         *from)
  {
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
-       int                     enospc = 0;
+       bool                    cleared_space = false;
        int                     iolock;
  
        if (iocb->ki_flags & IOCB_NOWAIT)
@@@ -703,14 -682,14 +723,14 @@@ write_retry
        iolock = XFS_IOLOCK_EXCL;
        xfs_ilock(ip, iolock);
  
 -      ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 +      ret = xfs_file_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
  
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);
  
 -      trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
 +      trace_xfs_file_buffered_write(iocb, from);
        ret = iomap_file_buffered_write(iocb, from,
                        &xfs_buffered_write_iomap_ops);
        if (likely(ret >= 0))
         * metadata space. This reduces the chances that the eofblocks scan
         * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
         * also behaves as a filter to prevent too many eofblocks scans from
-        * running at the same time.
+        * running at the same time.  Use a synchronous scan to increase the
+        * effectiveness of the scan.
         */
-       if (ret == -EDQUOT && !enospc) {
+       if (ret == -EDQUOT && !cleared_space) {
                xfs_iunlock(ip, iolock);
-               enospc = xfs_inode_free_quota_eofblocks(ip);
-               if (enospc)
-                       goto write_retry;
-               enospc = xfs_inode_free_quota_cowblocks(ip);
-               if (enospc)
-                       goto write_retry;
-               iolock = 0;
-       } else if (ret == -ENOSPC && !enospc) {
+               xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC);
+               cleared_space = true;
+               goto write_retry;
+       } else if (ret == -ENOSPC && !cleared_space) {
                struct xfs_eofblocks eofb = {0};
  
-               enospc = 1;
+               cleared_space = true;
                xfs_flush_inodes(ip->i_mount);
  
                xfs_iunlock(ip, iolock);
                eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
-               xfs_icache_free_eofblocks(ip->i_mount, &eofb);
-               xfs_icache_free_cowblocks(ip->i_mount, &eofb);
+               xfs_blockgc_free_space(ip->i_mount, &eofb);
                goto write_retry;
        }
  
@@@ -790,12 -765,12 +806,12 @@@ xfs_file_write_iter
                 * CoW.  In all other directio scenarios we do not
                 * allow an operation to fall back to buffered mode.
                 */
 -              ret = xfs_file_dio_aio_write(iocb, from);
 +              ret = xfs_file_dio_write(iocb, from);
                if (ret != -ENOTBLK)
                        return ret;
        }
  
 -      return xfs_file_buffered_aio_write(iocb, from);
 +      return xfs_file_buffered_write(iocb, from);
  }
  
  static void
diff --combined fs/xfs/xfs_iomap.c
@@@ -194,25 -194,21 +194,21 @@@ xfs_iomap_write_direct
        struct xfs_trans        *tp;
        xfs_filblks_t           resaligned;
        int                     nimaps;
-       int                     quota_flag;
-       uint                    qblocks, resblks;
-       unsigned int            resrtextents = 0;
+       unsigned int            dblocks, rblocks;
+       bool                    force = false;
        int                     error;
        int                     bmapi_flags = XFS_BMAPI_PREALLOC;
-       uint                    tflags = 0;
  
        ASSERT(count_fsb > 0);
  
        resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
                                           xfs_get_extsz_hint(ip));
        if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
-               resrtextents = qblocks = resaligned;
-               resrtextents /= mp->m_sb.sb_rextsize;
-               resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-               quota_flag = XFS_QMOPT_RES_RTBLKS;
+               dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+               rblocks = resaligned;
        } else {
-               resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
-               quota_flag = XFS_QMOPT_RES_REGBLKS;
+               dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+               rblocks = 0;
        }
  
        error = xfs_qm_dqattach(ip);
        if (IS_DAX(VFS_I(ip))) {
                bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
                if (imap->br_state == XFS_EXT_UNWRITTEN) {
-                       tflags |= XFS_TRANS_RESERVE;
-                       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+                       force = true;
+                       dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
                }
        }
-       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
-                       tflags, &tp);
+       error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+                       rblocks, force, &tp);
        if (error)
                return error;
  
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+       error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+                       XFS_IEXT_ADD_NOSPLIT_CNT);
        if (error)
                goto out_trans_cancel;
  
-       xfs_trans_ijoin(tp, ip, 0);
        /*
         * From this point onwards we overwrite the imap pointer that the
         * caller gave to us.
        error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
                                imap, &nimaps);
        if (error)
-               goto out_res_cancel;
+               goto out_trans_cancel;
  
        /*
         * Complete the transaction
@@@ -284,8 -278,6 +278,6 @@@ out_unlock
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
  
- out_res_cancel:
-       xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
  out_trans_cancel:
        xfs_trans_cancel(tp);
        goto out_unlock;
@@@ -548,16 -540,13 +540,13 @@@ xfs_iomap_write_unwritten
                 * here as we might be asked to write out the same inode that we
                 * complete here and might deadlock on the iolock.
                 */
-               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
-                               XFS_TRANS_RESERVE, &tp);
+               error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
+                               0, true, &tp);
                if (error)
                        return error;
  
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_ijoin(tp, ip, 0);
-               error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
-                               XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES);
+               error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+                               XFS_IEXT_WRITE_UNWRITTEN_CNT);
                if (error)
                        goto error_on_bmapi_transaction;
  
@@@ -784,28 -773,15 +773,28 @@@ xfs_direct_write_iomap_begin
                goto allocate_blocks;
  
        /*
 -       * NOWAIT IO needs to span the entire requested IO with a single map so
 -       * that we avoid partial IO failures due to the rest of the IO range not
 -       * covered by this map triggering an EAGAIN condition when it is
 -       * subsequently mapped and aborting the IO.
 +       * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with
 +       * a single map so that we avoid partial IO failures due to the rest of
 +       * the I/O range not covered by this map triggering an EAGAIN condition
 +       * when it is subsequently mapped and aborting the I/O.
         */
 -      if ((flags & IOMAP_NOWAIT) &&
 -          !imap_spans_range(&imap, offset_fsb, end_fsb)) {
 +      if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
                error = -EAGAIN;
 -              goto out_unlock;
 +              if (!imap_spans_range(&imap, offset_fsb, end_fsb))
 +                      goto out_unlock;
 +      }
 +
 +      /*
 +       * For overwrite only I/O, we cannot convert unwritten extents without
 +       * requiring sub-block zeroing.  This can only be done under an
 +       * exclusive IOLOCK, hence return -EAGAIN if this is not a written
 +       * extent to tell the caller to try again.
 +       */
 +      if (flags & IOMAP_OVERWRITE_ONLY) {
 +              error = -EAGAIN;
 +              if (imap.br_state != XFS_EXT_NORM &&
 +                  ((offset | length) & mp->m_blockmask))
 +                      goto out_unlock;
        }
  
        xfs_iunlock(ip, lockmode);
  
  allocate_blocks:
        error = -EAGAIN;
 -      if (flags & IOMAP_NOWAIT)
 +      if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
                goto out_unlock;
  
        /*
@@@ -855,7 -831,8 +844,8 @@@ out_found_cow
        return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
  
  out_unlock:
-       xfs_iunlock(ip, lockmode);
+       if (lockmode)
+               xfs_iunlock(ip, lockmode);
        return error;
  }
  
@@@ -883,6 -860,9 +873,9 @@@ xfs_buffered_write_iomap_begin
        int                     allocfork = XFS_DATA_FORK;
        int                     error = 0;
  
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
        /* we can't use delayed allocations when using extent size hints */
        if (xfs_get_extsz_hint(ip))
                return xfs_direct_write_iomap_begin(inode, offset, count,
diff --combined fs/xfs/xfs_trace.h
@@@ -37,6 -37,7 +37,7 @@@ struct xfs_trans_res
  struct xfs_inobt_rec_incore;
  union xfs_btree_ptr;
  struct xfs_dqtrx;
+ struct xfs_eofblocks;
  
  #define XFS_ATTR_FILTER_FLAGS \
        { XFS_ATTR_ROOT,        "ROOT" }, \
@@@ -154,10 -155,8 +155,8 @@@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_ta
  DEFINE_PERAG_REF_EVENT(xfs_perag_put);
  DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
  DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
- DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
- DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
- DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks);
- DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks);
+ DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc);
+ DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc);
  
  DECLARE_EVENT_CLASS(xfs_ag_class,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
@@@ -358,7 -357,7 +357,7 @@@ DEFINE_BUF_EVENT(xfs_buf_get_uncached)
  DEFINE_BUF_EVENT(xfs_buf_item_relse);
  DEFINE_BUF_EVENT(xfs_buf_iodone_async);
  DEFINE_BUF_EVENT(xfs_buf_error_relse);
- DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
+ DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
  DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
  
  /* not really buffer traces, but the buf provides useful information */
@@@ -1287,8 -1286,8 +1286,8 @@@ TRACE_EVENT(xfs_log_assign_tail_lsn
  )
  
  DECLARE_EVENT_CLASS(xfs_file_class,
 -      TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
 -      TP_ARGS(ip, count, offset),
 +      TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
 +      TP_ARGS(iocb, iter),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
                __field(size_t, count)
        ),
        TP_fast_assign(
 -              __entry->dev = VFS_I(ip)->i_sb->s_dev;
 -              __entry->ino = ip->i_ino;
 -              __entry->size = ip->i_d.di_size;
 -              __entry->offset = offset;
 -              __entry->count = count;
 +              __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
 +              __entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
 +              __entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size;
 +              __entry->offset = iocb->ki_pos;
 +              __entry->count = iov_iter_count(iter);
        ),
        TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
  
  #define DEFINE_RW_EVENT(name)         \
  DEFINE_EVENT(xfs_file_class, name,    \
 -      TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),    \
 -      TP_ARGS(ip, count, offset))
 +      TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),            \
 +      TP_ARGS(iocb, iter))
  DEFINE_RW_EVENT(xfs_file_buffered_read);
  DEFINE_RW_EVENT(xfs_file_direct_read);
  DEFINE_RW_EVENT(xfs_file_dax_read);
  DEFINE_RW_EVENT(xfs_file_buffered_write);
  DEFINE_RW_EVENT(xfs_file_direct_write);
  DEFINE_RW_EVENT(xfs_file_dax_write);
 +DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
 +
  
  DECLARE_EVENT_CLASS(xfs_imap_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
@@@ -3296,6 -3293,8 +3295,6 @@@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow
  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
  
 -DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
 -
  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
@@@ -3888,6 -3887,47 +3887,47 @@@ DEFINE_EVENT(xfs_timestamp_range_class
  DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range);
  DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
  
+ DECLARE_EVENT_CLASS(xfs_eofblocks_class,
+       TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb,
+                unsigned long caller_ip),
+       TP_ARGS(mp, eofb, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(__u32, flags)
+               __field(uint32_t, uid)
+               __field(uint32_t, gid)
+               __field(prid_t, prid)
+               __field(__u64, min_file_size)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->flags = eofb ? eofb->eof_flags : 0;
+               __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns,
+                                               eofb->eof_uid) : 0;
+               __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns,
+                                               eofb->eof_gid) : 0;
+               __entry->prid = eofb ? eofb->eof_prid : 0;
+               __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->flags,
+                 __entry->uid,
+                 __entry->gid,
+                 __entry->prid,
+                 __entry->min_file_size,
+                 (char *)__entry->caller_ip)
+ );
+ #define DEFINE_EOFBLOCKS_EVENT(name)  \
+ DEFINE_EVENT(xfs_eofblocks_class, name,       \
+       TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \
+                unsigned long caller_ip), \
+       TP_ARGS(mp, eofb, caller_ip))
+ DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks);
+ DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space);
  #endif /* _TRACE_XFS_H */
  
  #undef TRACE_INCLUDE_PATH