Merge tag 'xfs-5.9-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

[linux-2.6-microblaze.git] / fs / xfs / xfs_reflink.c
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c

index 107bf2a..aac83f9 100644 (file)
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -179,7 +179,7 @@ xfs_reflink_trim_around_shared(
         int                     error = 0;
  
         /* Holes, unwritten, and delalloc extents cannot be shared */
-       if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+       if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
                 *shared = false;
                 return 0;
         }
@@ -655,7 +655,7 @@ xfs_reflink_end_cow_extent(
          * preallocations can leak into the range we are called upon, and we
          * need to skip them.
          */
-       if (!xfs_bmap_is_real_extent(&got)) {
+       if (!xfs_bmap_is_written_extent(&got)) {
                 *end_fsb = del.br_startoff;
                 goto out_cancel;
         }
@@ -984,40 +984,28 @@ xfs_reflink_ag_has_free_space(
  }
  
  /*
- * Unmap a range of blocks from a file, then map other blocks into the hole.
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
- * The extent irec is mapped into dest at irec->br_startoff.
+ * Remap the given extent into the file.  The dmap blockcount will be set to
+ * the number of blocks that were actually remapped.
   */
  STATIC int
  xfs_reflink_remap_extent(
         struct xfs_inode        *ip,
-       struct xfs_bmbt_irec    *irec,
-       xfs_fileoff_t           destoff,
+       struct xfs_bmbt_irec    *dmap,
         xfs_off_t               new_isize)
  {
+       struct xfs_bmbt_irec    smap;
         struct xfs_mount        *mp = ip->i_mount;
-       bool                    real_extent = xfs_bmap_is_real_extent(irec);
         struct xfs_trans        *tp;
-       unsigned int            resblks;
-       struct xfs_bmbt_irec    uirec;
-       xfs_filblks_t           rlen;
-       xfs_filblks_t           unmap_len;
         xfs_off_t               newlen;
+       int64_t                 qres, qdelta;
+       unsigned int            resblks;
+       bool                    smap_real;
+       bool                    dmap_written = xfs_bmap_is_written_extent(dmap);
+       int                     nimaps;
         int                     error;
  
-       unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
-       trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
-
-       /* No reflinking if we're low on space */
-       if (real_extent) {
-               error = xfs_reflink_ag_has_free_space(mp,
-                               XFS_FSB_TO_AGNO(mp, irec->br_startblock));
-               if (error)
-                       goto out;
-       }
-
         /* Start a rolling transaction to switch the mappings */
-       resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+       resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
         if (error)
                 goto out;
@@ -1025,87 +1013,147 @@ xfs_reflink_remap_extent(
         xfs_ilock(ip, XFS_ILOCK_EXCL);
         xfs_trans_ijoin(tp, ip, 0);
  
-       /* If we're not just clearing space, then do we have enough quota? */
-       if (real_extent) {
-               error = xfs_trans_reserve_quota_nblks(tp, ip,
-                               irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
+       /*
+        * Read what's currently mapped in the destination file into smap.
+        * If smap isn't a hole, we will have to remove it before we can add
+        * dmap to the destination file.
+        */
+       nimaps = 1;
+       error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+                       &smap, &nimaps, 0);
+       if (error)
+               goto out_cancel;
+       ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+       smap_real = xfs_bmap_is_real_extent(&smap);
+
+       /*
+        * We can only remap as many blocks as the smaller of the two extent
+        * maps, because we can only remap one extent at a time.
+        */
+       dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+       ASSERT(dmap->br_blockcount == smap.br_blockcount);
+
+       trace_xfs_reflink_remap_extent_dest(ip, &smap);
+
+       /*
+        * Two extents mapped to the same physical block must not have
+        * different states; that's filesystem corruption.  Move on to the next
+        * extent if they're both holes or both the same physical extent.
+        */
+       if (dmap->br_startblock == smap.br_startblock) {
+               if (dmap->br_state != smap.br_state)
+                       error = -EFSCORRUPTED;
+               goto out_cancel;
+       }
+
+       /* If both extents are unwritten, leave them alone. */
+       if (dmap->br_state == XFS_EXT_UNWRITTEN &&
+           smap.br_state == XFS_EXT_UNWRITTEN)
+               goto out_cancel;
+
+       /* No reflinking if the AG of the dest mapping is low on space. */
+       if (dmap_written) {
+               error = xfs_reflink_ag_has_free_space(mp,
+                               XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
                 if (error)
                         goto out_cancel;
         }
  
-       trace_xfs_reflink_remap(ip, irec->br_startoff,
-                               irec->br_blockcount, irec->br_startblock);
-
-       /* Unmap the old blocks in the data fork. */
-       rlen = unmap_len;
-       while (rlen) {
-               ASSERT(tp->t_firstblock == NULLFSBLOCK);
-               error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
+       /*
+        * Compute quota reservation if we think the quota block counter for
+        * this file could increase.
+        *
+        * Adding a written extent to the extent map can cause a bmbt split,
+        * and removing a mapped extent from the extent map can cause a bmbt
+        * split.  The two operations cannot both cause a split since they
+        * operate on the same index in the bmap btree, so we only need a
+        * reservation for one bmbt split if either thing is happening.
+        *
+        * If we are mapping a written extent into the file, we need to have
+        * enough quota block count reservation to handle the blocks in that
+        * extent.  We log only the delta to the quota block counts, so if the
+        * extent we're unmapping also has blocks allocated to it, we don't
+        * need a quota reservation for the extent itself.
+        *
+        * Note that if we're replacing a delalloc reservation with a written
+        * extent, we have to take the full quota reservation because removing
+        * the delalloc reservation gives the block count back to the quota
+        * count.  This is suboptimal, but the VFS flushed the dest range
+        * before we started.  That should have removed all the delalloc
+        * reservations, but we code defensively.
+        */
+       qres = qdelta = 0;
+       if (smap_real || dmap_written)
+               qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+       if (!smap_real && dmap_written)
+               qres += dmap->br_blockcount;
+       if (qres > 0) {
+               error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
+                               XFS_QMOPT_RES_REGBLKS);
                 if (error)
                         goto out_cancel;
+       }
  
+       if (smap_real) {
                 /*
-                * Trim the extent to whatever got unmapped.
-                * Remember, bunmapi works backwards.
+                * If the extent we're unmapping is backed by storage (written
+                * or not), unmap the extent and drop its refcount.
                  */
-               uirec.br_startblock = irec->br_startblock + rlen;
-               uirec.br_startoff = irec->br_startoff + rlen;
-               uirec.br_blockcount = unmap_len - rlen;
-               uirec.br_state = irec->br_state;
-               unmap_len = rlen;
-
-               /* If this isn't a real mapping, we're done. */
-               if (!real_extent || uirec.br_blockcount == 0)
-                       goto next_extent;
+               xfs_bmap_unmap_extent(tp, ip, &smap);
+               xfs_refcount_decrease_extent(tp, &smap);
+               qdelta -= smap.br_blockcount;
+       } else if (smap.br_startblock == DELAYSTARTBLOCK) {
+               xfs_filblks_t   len = smap.br_blockcount;
  
-               trace_xfs_reflink_remap(ip, uirec.br_startoff,
-                               uirec.br_blockcount, uirec.br_startblock);
-
-               /* Update the refcount tree */
-               xfs_refcount_increase_extent(tp, &uirec);
-
-               /* Map the new blocks into the data fork. */
-               xfs_bmap_map_extent(tp, ip, &uirec);
+               /*
+                * If the extent we're unmapping is a delalloc reservation,
+                * we can use the regular bunmapi function to release the
+                * incore state.  Dropping the delalloc reservation takes care
+                * of the quota reservation for us.
+                */
+               error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+               if (error)
+                       goto out_cancel;
+               ASSERT(len == 0);
+       }
  
-               /* Update quota accounting. */
-               xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
-                               uirec.br_blockcount);
+       /*
+        * If the extent we're sharing is backed by written storage, increase
+        * its refcount and map it into the file.
+        */
+       if (dmap_written) {
+               xfs_refcount_increase_extent(tp, dmap);
+               xfs_bmap_map_extent(tp, ip, dmap);
+               qdelta += dmap->br_blockcount;
+       }
  
-               /* Update dest isize if needed. */
-               newlen = XFS_FSB_TO_B(mp,
-                               uirec.br_startoff + uirec.br_blockcount);
-               newlen = min_t(xfs_off_t, newlen, new_isize);
-               if (newlen > i_size_read(VFS_I(ip))) {
-                       trace_xfs_reflink_update_inode_size(ip, newlen);
-                       i_size_write(VFS_I(ip), newlen);
-                       ip->i_d.di_size = newlen;
-                       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               }
+       xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
  
-next_extent:
-               /* Process all the deferred stuff. */
-               error = xfs_defer_finish(&tp);
-               if (error)
-                       goto out_cancel;
+       /* Update dest isize if needed. */
+       newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+       newlen = min_t(xfs_off_t, newlen, new_isize);
+       if (newlen > i_size_read(VFS_I(ip))) {
+               trace_xfs_reflink_update_inode_size(ip, newlen);
+               i_size_write(VFS_I(ip), newlen);
+               ip->i_d.di_size = newlen;
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
         }
  
+       /* Commit everything and unlock. */
         error = xfs_trans_commit(tp);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       if (error)
-               goto out;
-       return 0;
+       goto out_unlock;
  
  out_cancel:
         xfs_trans_cancel(tp);
+out_unlock:
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
  out:
-       trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+       if (error)
+               trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
         return error;
  }
  
-/*
- * Iteratively remap one file's extents (and holes) to another's.
- */
+/* Remap a range of one file to the other. */
  int
  xfs_reflink_remap_blocks(
         struct xfs_inode        *src,
@@ -1116,25 +1164,22 @@ xfs_reflink_remap_blocks(
         loff_t                  *remapped)
  {
         struct xfs_bmbt_irec    imap;
-       xfs_fileoff_t           srcoff;
-       xfs_fileoff_t           destoff;
+       struct xfs_mount        *mp = src->i_mount;
+       xfs_fileoff_t           srcoff = XFS_B_TO_FSBT(mp, pos_in);
+       xfs_fileoff_t           destoff = XFS_B_TO_FSBT(mp, pos_out);
         xfs_filblks_t           len;
-       xfs_filblks_t           range_len;
         xfs_filblks_t           remapped_len = 0;
         xfs_off_t               new_isize = pos_out + remap_len;
         int                     nimaps;
         int                     error = 0;
  
-       destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
-       srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
-       len = XFS_B_TO_FSB(src->i_mount, remap_len);
+       len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+                       XFS_MAX_FILEOFF);
  
-       /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
-       while (len) {
-               uint            lock_mode;
+       trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
  
-               trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
-                               dest, destoff);
+       while (len > 0) {
+               unsigned int    lock_mode;
  
                 /* Read extent from the source file */
                 nimaps = 1;
@@ -1143,18 +1188,25 @@ xfs_reflink_remap_blocks(
                 xfs_iunlock(src, lock_mode);
                 if (error)
                         break;
-               ASSERT(nimaps == 1);
-
-               trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
-                               &imap);
+               /*
+                * The caller supposedly flushed all dirty pages in the source
+                * file range, which means that writeback should have allocated
+                * or deleted all delalloc reservations in that range.  If we
+                * find one, that's a good sign that something is seriously
+                * wrong here.
+                */
+               ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+               if (imap.br_startblock == DELAYSTARTBLOCK) {
+                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+                       error = -EFSCORRUPTED;
+                       break;
+               }
  
-               /* Translate imap into the destination file. */
-               range_len = imap.br_startoff + imap.br_blockcount - srcoff;
-               imap.br_startoff += destoff - srcoff;
+               trace_xfs_reflink_remap_extent_src(src, &imap);
  
-               /* Clear dest from destoff to the end of imap and map it in. */
-               error = xfs_reflink_remap_extent(dest, &imap, destoff,
-                               new_isize);
+               /* Remap into the destination file at the given offset. */
+               imap.br_startoff = destoff;
+               error = xfs_reflink_remap_extent(dest, &imap, new_isize);
                 if (error)
                         break;
  
@@ -1164,10 +1216,10 @@ xfs_reflink_remap_blocks(
                 }
  
                 /* Advance drange/srange */
-               srcoff += range_len;
-               destoff += range_len;
-               len -= range_len;
-               remapped_len += range_len;
+               srcoff += imap.br_blockcount;
+               destoff += imap.br_blockcount;
+               len -= imap.br_blockcount;
+               remapped_len += imap.br_blockcount;
         }
  
         if (error)
@@ -1177,81 +1229,6 @@ xfs_reflink_remap_blocks(
         return error;
  }
  
-/*
- * Grab the exclusive iolock for a data copy from src to dest, making sure to
- * abide vfs locking order (lowest pointer value goes first) and breaking the
- * layout leases before proceeding.  The loop is needed because we cannot call
- * the blocking break_layout() with the iolocks held, and therefore have to
- * back out both locks.
- */
-static int
-xfs_iolock_two_inodes_and_break_layout(
-       struct inode            *src,
-       struct inode            *dest)
-{
-       int                     error;
-
-       if (src > dest)
-               swap(src, dest);
-
-retry:
-       /* Wait to break both inodes' layouts before we start locking. */
-       error = break_layout(src, true);
-       if (error)
-               return error;
-       if (src != dest) {
-               error = break_layout(dest, true);
-               if (error)
-                       return error;
-       }
-
-       /* Lock one inode and make sure nobody got in and leased it. */
-       inode_lock(src);
-       error = break_layout(src, false);
-       if (error) {
-               inode_unlock(src);
-               if (error == -EWOULDBLOCK)
-                       goto retry;
-               return error;
-       }
-
-       if (src == dest)
-               return 0;
-
-       /* Lock the other inode and make sure nobody got in and leased it. */
-       inode_lock_nested(dest, I_MUTEX_NONDIR2);
-       error = break_layout(dest, false);
-       if (error) {
-               inode_unlock(src);
-               inode_unlock(dest);
-               if (error == -EWOULDBLOCK)
-                       goto retry;
-               return error;
-       }
-
-       return 0;
-}
-
-/* Unlock both inodes after they've been prepped for a range clone. */
-void
-xfs_reflink_remap_unlock(
-       struct file             *file_in,
-       struct file             *file_out)
-{
-       struct inode            *inode_in = file_inode(file_in);
-       struct xfs_inode        *src = XFS_I(inode_in);
-       struct inode            *inode_out = file_inode(file_out);
-       struct xfs_inode        *dest = XFS_I(inode_out);
-       bool                    same_inode = (inode_in == inode_out);
-
-       xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
-       if (!same_inode)
-               xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
-       inode_unlock(inode_out);
-       if (!same_inode)
-               inode_unlock(inode_in);
-}
-
  /*
   * If we're reflinking to a point past the destination file's EOF, we must
   * zero any speculative post-EOF preallocations that sit between the old EOF
@@ -1314,18 +1291,12 @@ xfs_reflink_remap_prep(
         struct xfs_inode        *src = XFS_I(inode_in);
         struct inode            *inode_out = file_inode(file_out);
         struct xfs_inode        *dest = XFS_I(inode_out);
-       bool                    same_inode = (inode_in == inode_out);
-       ssize_t                 ret;
+       int                     ret;
  
         /* Lock both files against IO */
-       ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
+       ret = xfs_ilock2_io_mmap(src, dest);
         if (ret)
                 return ret;
-       if (same_inode)
-               xfs_ilock(src, XFS_MMAPLOCK_EXCL);
-       else
-               xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
-                               XFS_MMAPLOCK_EXCL);
  
         /* Check file eligibility and prepare for block sharing. */
         ret = -EINVAL;
@@ -1339,7 +1310,7 @@ xfs_reflink_remap_prep(
  
         ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
                         len, remap_flags);
-       if (ret < 0 || *len == 0)
+       if (ret || *len == 0)
                 goto out_unlock;
  
         /* Attach dquots to dest inode before changing block map */
@@ -1374,9 +1345,9 @@ xfs_reflink_remap_prep(
         if (ret)
                 goto out_unlock;
  
-       return 1;
+       return 0;
  out_unlock:
-       xfs_reflink_remap_unlock(file_in, file_out);
+       xfs_iunlock2_io_mmap(src, dest);
         return ret;
  }