Merge tag 'xfs-5.19-for-linus' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 26 May 2022 02:34:40 +0000 (19:34 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 26 May 2022 02:34:40 +0000 (19:34 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 26 May 2022 02:34:40 +0000 (19:34 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 26 May 2022 02:34:40 +0000 (19:34 -0700)
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile

index 04611a1..b056cfc 100644 (file)
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -102,6 +102,7 @@ xfs-y                               += xfs_log.o \
                                    xfs_buf_item_recover.o \
                                    xfs_dquot_item_recover.o \
                                    xfs_extfree_item.o \
+                                  xfs_attr_item.o \
                                    xfs_icreate_item.o \
                                    xfs_inode_item.o \
                                    xfs_inode_item_recover.o \
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c

index b52ed33..d3f2886 100644 (file)
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2511,7 +2511,7 @@ __xfs_free_extent_later(
  
         ASSERT(bno != NULLFSBLOCK);
         ASSERT(len > 0);
-       ASSERT(len <= MAXEXTLEN);
+       ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
         ASSERT(!isnullstartblock(bno));
         agno = XFS_FSB_TO_AGNO(mp, bno);
         agbno = XFS_FSB_TO_AGBNO(mp, bno);
@@ -2777,7 +2777,7 @@ xfs_alloc_get_freelist(
         xfs_agblock_t           bno;
         __be32                  *agfl_bno;
         int                     error;
-       int                     logflags;
+       uint32_t                logflags;
         struct xfs_mount        *mp = tp->t_mountp;
         struct xfs_perag        *pag;
  
@@ -2830,9 +2830,9 @@ xfs_alloc_get_freelist(
   */
  void
  xfs_alloc_log_agf(
-       xfs_trans_t     *tp,    /* transaction pointer */
-       struct xfs_buf  *bp,    /* buffer for a.g. freelist header */
-       int             fields) /* mask of fields to be logged (XFS_AGF_...) */
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp,
+       uint32_t                fields)
  {
         int     first;          /* first byte offset */
         int     last;           /* last byte offset */
@@ -2902,7 +2902,7 @@ xfs_alloc_put_freelist(
         struct xfs_perag        *pag;
         __be32                  *blockp;
         int                     error;
-       int                     logflags;
+       uint32_t                logflags;
         __be32                  *agfl_bno;
         int                     startoff;
  
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h

index d4c057b..84ca09b 100644 (file)
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -121,7 +121,7 @@ void
  xfs_alloc_log_agf(
         struct xfs_trans *tp,   /* transaction pointer */
         struct xfs_buf  *bp,    /* buffer for a.g. freelist header */
-       int             fields);/* mask of fields to be logged (XFS_AGF_...) */
+       uint32_t        fields);/* mask of fields to be logged (XFS_AGF_...) */
  
  /*
   * Interface for inode allocation to force the pag data to be initialized.
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c

index 23523b8..14ae082 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -24,6 +24,11 @@
  #include "xfs_quota.h"
  #include "xfs_trans_space.h"
  #include "xfs_trace.h"
+#include "xfs_attr_item.h"
+#include "xfs_log.h"
+
+struct kmem_cache              *xfs_attri_cache;
+struct kmem_cache              *xfs_attrd_cache;
  
  /*
   * xfs_attr.c
@@ -53,26 +58,22 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp);
   */
  STATIC int xfs_attr_node_get(xfs_da_args_t *args);
  STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args);
-STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_clear_incomplete(
-                               struct xfs_delattr_context *dac);
+static int xfs_attr_node_try_addname(struct xfs_attr_item *attr);
+STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr);
+STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr);
  STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
                                  struct xfs_da_state **state);
-STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac,
-                            struct xfs_buf **leaf_bp);
-STATIC int xfs_attr_node_removename(struct xfs_da_args *args,
-                                   struct xfs_da_state *state);
  
  int
  xfs_inode_hasattr(
         struct xfs_inode        *ip)
  {
-       if (!XFS_IFORK_Q(ip) ||
-           (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
-            ip->i_afp->if_nextents == 0))
+       if (!XFS_IFORK_Q(ip))
+               return 0;
+       if (!ip->i_afp)
+               return 0;
+       if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+           ip->i_afp->if_nextents == 0)
                 return 0;
         return 1;
  }
@@ -97,6 +98,123 @@ xfs_attr_is_leaf(
         return imap.br_startoff == 0 && imap.br_blockcount == 1;
  }
  
+/*
+ * XXX (dchinner): name path state saving and refilling is an optimisation to
+ * avoid needing to look up name entries after rolling transactions removing
+ * remote xattr blocks between the name entry lookup and name entry removal.
+ * This optimisation got sidelined when combining the set and remove state
+ * machines, but the code has been left in place because it is worthwhile to
+ * restore the optimisation once the combined state machine paths have settled.
+ *
+ * This comment is a public service announcement to remind Future Dave that he
+ * still needs to restore this code to working order.
+ */
+#if 0
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+static int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+       xfs_da_state_path_t *path;
+       xfs_da_state_blk_t *blk;
+       int level;
+
+       trace_xfs_attr_fillstate(state->args);
+
+       /*
+        * Roll down the "path" in the state structure, storing the on-disk
+        * block number for those buffers in the "path".
+        */
+       path = &state->path;
+       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+               if (blk->bp) {
+                       blk->disk_blkno = xfs_buf_daddr(blk->bp);
+                       blk->bp = NULL;
+               } else {
+                       blk->disk_blkno = 0;
+               }
+       }
+
+       /*
+        * Roll down the "altpath" in the state structure, storing the on-disk
+        * block number for those buffers in the "altpath".
+        */
+       path = &state->altpath;
+       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+               if (blk->bp) {
+                       blk->disk_blkno = xfs_buf_daddr(blk->bp);
+                       blk->bp = NULL;
+               } else {
+                       blk->disk_blkno = 0;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+static int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+       xfs_da_state_path_t *path;
+       xfs_da_state_blk_t *blk;
+       int level, error;
+
+       trace_xfs_attr_refillstate(state->args);
+
+       /*
+        * Roll down the "path" in the state structure, re-reading the buffers
+        * from the on-disk block numbers stored in the "path".
+        */
+       path = &state->path;
+       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+               if (blk->disk_blkno) {
+                       error = xfs_da3_node_read_mapped(state->args->trans,
+                                       state->args->dp, blk->disk_blkno,
+                                       &blk->bp, XFS_ATTR_FORK);
+                       if (error)
+                               return error;
+               } else {
+                       blk->bp = NULL;
+               }
+       }
+
+       /*
+        * Roll down the "altpath" in the state structure, re-reading the
+        * buffers from the on-disk block numbers stored in the "altpath".
+        */
+       path = &state->altpath;
+       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+               if (blk->disk_blkno) {
+                       error = xfs_da3_node_read_mapped(state->args->trans,
+                                       state->args->dp, blk->disk_blkno,
+                                       &blk->bp, XFS_ATTR_FORK);
+                       if (error)
+                               return error;
+               } else {
+                       blk->bp = NULL;
+               }
+       }
+
+       return 0;
+}
+#else
+static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; }
+#endif
+
  /*========================================================================
   * Overall external interface routines.
   *========================================================================*/
@@ -166,7 +284,7 @@ xfs_attr_get(
  /*
   * Calculate how many blocks we need for the new attribute,
   */
-STATIC int
+int
  xfs_attr_calc_size(
         struct xfs_da_args      *args,
         int                     *local)
@@ -199,6 +317,33 @@ xfs_attr_calc_size(
         return nblks;
  }
  
+/* Initialize transaction reservation for attr operations */
+void
+xfs_init_attr_trans(
+       struct xfs_da_args      *args,
+       struct xfs_trans_res    *tres,
+       unsigned int            *total)
+{
+       struct xfs_mount        *mp = args->dp->i_mount;
+
+       if (args->value) {
+               tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+                                M_RES(mp)->tr_attrsetrt.tr_logres *
+                                args->total;
+               tres->tr_logcount = XFS_ATTRSET_LOG_COUNT;
+               tres->tr_logflags = XFS_TRANS_PERM_LOG_RES;
+               *total = args->total;
+       } else {
+               *tres = M_RES(mp)->tr_attrrm;
+               *total = XFS_ATTRRM_SPACE_RES(mp);
+       }
+}
+
+/*
+ * Add an attr to a shortform fork. If there is no space,
+ * xfs_attr_shortform_addname() will convert to leaf format and return
+ * -ENOSPC.
+ */
  STATIC int
  xfs_attr_try_sf_addname(
         struct xfs_inode        *dp,
@@ -230,411 +375,470 @@ xfs_attr_try_sf_addname(
         return error;
  }
  
-/*
- * Check to see if the attr should be upgraded from non-existent or shortform to
- * single-leaf-block attribute list.
- */
-static inline bool
-xfs_attr_is_shortform(
-       struct xfs_inode    *ip)
+static int
+xfs_attr_sf_addname(
+       struct xfs_attr_item            *attr)
  {
-       return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
-              (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
-               ip->i_afp->if_nextents == 0);
+       struct xfs_da_args              *args = attr->xattri_da_args;
+       struct xfs_inode                *dp = args->dp;
+       int                             error = 0;
+
+       error = xfs_attr_try_sf_addname(dp, args);
+       if (error != -ENOSPC) {
+               ASSERT(!error || error == -EEXIST);
+               attr->xattri_dela_state = XFS_DAS_DONE;
+               goto out;
+       }
+
+       /*
+        * It won't fit in the shortform, transform to a leaf block.  GROT:
+        * another possible req'mt for a double-split btree op.
+        */
+       error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp);
+       if (error)
+               return error;
+
+       /*
+        * Prevent the leaf buffer from being unlocked so that a concurrent AIL
+        * push cannot grab the half-baked leaf buffer and run into problems
+        * with the write verifier.
+        */
+       xfs_trans_bhold(args->trans, attr->xattri_leaf_bp);
+       attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
+out:
+       trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
+       return error;
  }
  
  /*
- * Checks to see if a delayed attribute transaction should be rolled.  If so,
- * transaction is finished or rolled as needed.
+ * Handle the state change on completion of a multi-state attr operation.
+ *
+ * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first
+ * modification in an attr replace operation and we still have to do the second
+ * state, indicated by @replace_state.
+ *
+ * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on
+ * completion of the second half of the attr replace operation we correctly
+ * signal that it is done.
   */
-STATIC int
-xfs_attr_trans_roll(
-       struct xfs_delattr_context      *dac)
+static enum xfs_delattr_state
+xfs_attr_complete_op(
+       struct xfs_attr_item    *attr,
+       enum xfs_delattr_state  replace_state)
  {
-       struct xfs_da_args              *args = dac->da_args;
-       int                             error;
+       struct xfs_da_args      *args = attr->xattri_da_args;
+       bool                    do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+       args->op_flags &= ~XFS_DA_OP_REPLACE;
+       if (do_replace) {
+               args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+               return replace_state;
+       }
+       return XFS_DAS_DONE;
+}
+
+static int
+xfs_attr_leaf_addname(
+       struct xfs_attr_item    *attr)
+{
+       struct xfs_da_args      *args = attr->xattri_da_args;
+       int                     error;
+
+       ASSERT(xfs_attr_is_leaf(args->dp));
+
+       /*
+        * Use the leaf buffer we may already hold locked as a result of
+        * a sf-to-leaf conversion. The held buffer is no longer valid
+        * after this call, regardless of the result.
+        */
+       error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp);
+       attr->xattri_leaf_bp = NULL;
+
+       if (error == -ENOSPC) {
+               error = xfs_attr3_leaf_to_node(args);
+               if (error)
+                       return error;
  
-       if (dac->flags & XFS_DAC_DEFER_FINISH) {
                 /*
-                * The caller wants us to finish all the deferred ops so that we
-                * avoid pinning the log tail with a large number of deferred
-                * ops.
+                * We're not in leaf format anymore, so roll the transaction and
+                * retry the add to the newly allocated node block.
                  */
-               dac->flags &= ~XFS_DAC_DEFER_FINISH;
-               error = xfs_defer_finish(&args->trans);
-       } else
-               error = xfs_trans_roll_inode(&args->trans, args->dp);
+               attr->xattri_dela_state = XFS_DAS_NODE_ADD;
+               goto out;
+       }
+       if (error)
+               return error;
  
+       /*
+        * We need to commit and roll if we need to allocate remote xattr blocks
+        * or perform more xattr manipulations. Otherwise there is nothing more
+        * to do and we can return success.
+        */
+       if (args->rmtblkno)
+               attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
+       else
+               attr->xattri_dela_state = xfs_attr_complete_op(attr,
+                                                       XFS_DAS_LEAF_REPLACE);
+out:
+       trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp);
         return error;
  }
  
  /*
- * Set the attribute specified in @args.
+ * Add an entry to a node format attr tree.
+ *
+ * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell
+ * the difference between leaf + remote attr blocks and a node format tree,
+ * so we may still end up having to convert from leaf to node format here.
   */
-int
-xfs_attr_set_args(
-       struct xfs_da_args              *args)
+static int
+xfs_attr_node_addname(
+       struct xfs_attr_item    *attr)
  {
-       struct xfs_buf                  *leaf_bp = NULL;
-       int                             error = 0;
-       struct xfs_delattr_context      dac = {
-               .da_args        = args,
-       };
+       struct xfs_da_args      *args = attr->xattri_da_args;
+       int                     error;
  
-       do {
-               error = xfs_attr_set_iter(&dac, &leaf_bp);
-               if (error != -EAGAIN)
-                       break;
+       ASSERT(!attr->xattri_leaf_bp);
  
-               error = xfs_attr_trans_roll(&dac);
-               if (error) {
-                       if (leaf_bp)
-                               xfs_trans_brelse(args->trans, leaf_bp);
+       error = xfs_attr_node_addname_find_attr(attr);
+       if (error)
+               return error;
+
+       error = xfs_attr_node_try_addname(attr);
+       if (error == -ENOSPC) {
+               error = xfs_attr3_leaf_to_node(args);
+               if (error)
                         return error;
-               }
-       } while (true);
+               /*
+                * No state change, we really are in node form now
+                * but we need the transaction rolled to continue.
+                */
+               goto out;
+       }
+       if (error)
+               return error;
  
+       if (args->rmtblkno)
+               attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT;
+       else
+               attr->xattri_dela_state = xfs_attr_complete_op(attr,
+                                                       XFS_DAS_NODE_REPLACE);
+out:
+       trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp);
         return error;
  }
  
-STATIC int
-xfs_attr_sf_addname(
-       struct xfs_delattr_context      *dac,
-       struct xfs_buf                  **leaf_bp)
+static int
+xfs_attr_rmtval_alloc(
+       struct xfs_attr_item            *attr)
  {
-       struct xfs_da_args              *args = dac->da_args;
-       struct xfs_inode                *dp = args->dp;
+       struct xfs_da_args              *args = attr->xattri_da_args;
         int                             error = 0;
  
         /*
-        * Try to add the attr to the attribute list in the inode.
+        * If there was an out-of-line value, allocate the blocks we
+        * identified for its storage and copy the value.  This is done
+        * after we create the attribute so that we don't overflow the
+        * maximum size of a transaction and/or hit a deadlock.
          */
-       error = xfs_attr_try_sf_addname(dp, args);
+       if (attr->xattri_blkcnt > 0) {
+               error = xfs_attr_rmtval_set_blk(attr);
+               if (error)
+                       return error;
+               /* Roll the transaction only if there is more to allocate. */
+               if (attr->xattri_blkcnt > 0)
+                       goto out;
+       }
  
-       /* Should only be 0, -EEXIST or -ENOSPC */
-       if (error != -ENOSPC)
+       error = xfs_attr_rmtval_set_value(args);
+       if (error)
                 return error;
  
+       attr->xattri_dela_state = xfs_attr_complete_op(attr,
+                                               ++attr->xattri_dela_state);
         /*
-        * It won't fit in the shortform, transform to a leaf block.  GROT:
-        * another possible req'mt for a double-split btree op.
+        * If we are not doing a rename, we've finished the operation but still
+        * have to clear the incomplete flag protecting the new attr from
+        * exposing partially initialised state if we crash during creation.
          */
-       error = xfs_attr_shortform_to_leaf(args, leaf_bp);
-       if (error)
-               return error;
+       if (attr->xattri_dela_state == XFS_DAS_DONE)
+               error = xfs_attr3_leaf_clearflag(args);
+out:
+       trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp);
+       return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+static int
+xfs_attr_leaf_mark_incomplete(
+       struct xfs_da_args      *args,
+       struct xfs_da_state     *state)
+{
+       int                     error;
  
         /*
-        * Prevent the leaf buffer from being unlocked so that a concurrent AIL
-        * push cannot grab the half-baked leaf buffer and run into problems
-        * with the write verifier.
+        * Fill in disk block numbers in the state structure
+        * so that we can get the buffers back after we commit
+        * several transactions in the following calls.
          */
-       xfs_trans_bhold(args->trans, *leaf_bp);
+       error = xfs_attr_fillstate(state);
+       if (error)
+               return error;
  
         /*
-        * We're still in XFS_DAS_UNINIT state here.  We've converted
-        * the attr fork to leaf format and will restart with the leaf
-        * add.
+        * Mark the attribute as INCOMPLETE
          */
-       trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp);
-       dac->flags |= XFS_DAC_DEFER_FINISH;
-       return -EAGAIN;
+       return xfs_attr3_leaf_setflag(args);
  }
  
  /*
- * Set the attribute specified in @args.
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled.  Calling functions will need
- * to handle this, and recall the function until a successful error code is
- * returned.
+ * Initial setup for xfs_attr_node_removename.  Make sure the attr is there and
+ * the blocks are valid.  Attr keys with remote blocks will be marked
+ * incomplete.
   */
-int
-xfs_attr_set_iter(
-       struct xfs_delattr_context      *dac,
-       struct xfs_buf                  **leaf_bp)
+static
+int xfs_attr_node_removename_setup(
+       struct xfs_attr_item            *attr)
  {
-       struct xfs_da_args              *args = dac->da_args;
-       struct xfs_inode                *dp = args->dp;
-       struct xfs_buf                  *bp = NULL;
-       int                             forkoff, error = 0;
+       struct xfs_da_args              *args = attr->xattri_da_args;
+       struct xfs_da_state             **state = &attr->xattri_da_state;
+       int                             error;
  
-       /* State machine switch */
-       switch (dac->dela_state) {
-       case XFS_DAS_UNINIT:
-               /*
-                * If the fork is shortform, attempt to add the attr. If there
-                * is no space, this converts to leaf format and returns
-                * -EAGAIN with the leaf buffer held across the roll. The caller
-                * will deal with a transaction roll error, but otherwise
-                * release the hold once we return with a clean transaction.
-                */
-               if (xfs_attr_is_shortform(dp))
-                       return xfs_attr_sf_addname(dac, leaf_bp);
-               if (*leaf_bp != NULL) {
-                       xfs_trans_bhold_release(args->trans, *leaf_bp);
-                       *leaf_bp = NULL;
-               }
+       error = xfs_attr_node_hasname(args, state);
+       if (error != -EEXIST)
+               goto out;
+       error = 0;
  
-               if (xfs_attr_is_leaf(dp)) {
-                       error = xfs_attr_leaf_try_add(args, *leaf_bp);
-                       if (error == -ENOSPC) {
-                               error = xfs_attr3_leaf_to_node(args);
-                               if (error)
-                                       return error;
-
-                               /*
-                                * Finish any deferred work items and roll the
-                                * transaction once more.  The goal here is to
-                                * call node_addname with the inode and
-                                * transaction in the same state (inode locked
-                                * and joined, transaction clean) no matter how
-                                * we got to this step.
-                                *
-                                * At this point, we are still in
-                                * XFS_DAS_UNINIT, but when we come back, we'll
-                                * be a node, so we'll fall down into the node
-                                * handling code below
-                                */
-                               dac->flags |= XFS_DAC_DEFER_FINISH;
-                               trace_xfs_attr_set_iter_return(
-                                       dac->dela_state, args->dp);
-                               return -EAGAIN;
-                       } else if (error) {
-                               return error;
-                       }
+       ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
+       ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
+               XFS_ATTR_LEAF_MAGIC);
  
-                       dac->dela_state = XFS_DAS_FOUND_LBLK;
-               } else {
-                       error = xfs_attr_node_addname_find_attr(dac);
-                       if (error)
-                               return error;
-
-                       error = xfs_attr_node_addname(dac);
-                       if (error)
-                               return error;
+       error = xfs_attr_leaf_mark_incomplete(args, *state);
+       if (error)
+               goto out;
+       if (args->rmtblkno > 0)
+               error = xfs_attr_rmtval_invalidate(args);
+out:
+       if (error)
+               xfs_da_state_free(*state);
  
-                       dac->dela_state = XFS_DAS_FOUND_NBLK;
-               }
-               trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
-               return -EAGAIN;
-       case XFS_DAS_FOUND_LBLK:
-               /*
-                * If there was an out-of-line value, allocate the blocks we
-                * identified for its storage and copy the value.  This is done
-                * after we create the attribute so that we don't overflow the
-                * maximum size of a transaction and/or hit a deadlock.
-                */
+       return error;
+}
  
-               /* Open coded xfs_attr_rmtval_set without trans handling */
-               if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) {
-                       dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT;
-                       if (args->rmtblkno > 0) {
-                               error = xfs_attr_rmtval_find_space(dac);
-                               if (error)
-                                       return error;
-                       }
-               }
+/*
+ * Remove the original attr we have just replaced. This is dependent on the
+ * original lookup and insert placing the old attr in args->blkno/args->index
+ * and the new attr in args->blkno2/args->index2.
+ */
+static int
+xfs_attr_leaf_remove_attr(
+       struct xfs_attr_item            *attr)
+{
+       struct xfs_da_args              *args = attr->xattri_da_args;
+       struct xfs_inode                *dp = args->dp;
+       struct xfs_buf                  *bp = NULL;
+       int                             forkoff;
+       int                             error;
  
-               /*
-                * Repeat allocating remote blocks for the attr value until
-                * blkcnt drops to zero.
-                */
-               if (dac->blkcnt > 0) {
-                       error = xfs_attr_rmtval_set_blk(dac);
-                       if (error)
-                               return error;
-                       trace_xfs_attr_set_iter_return(dac->dela_state,
-                                                      args->dp);
-                       return -EAGAIN;
-               }
+       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+                                  &bp);
+       if (error)
+               return error;
  
-               error = xfs_attr_rmtval_set_value(args);
-               if (error)
-                       return error;
+       xfs_attr3_leaf_remove(bp, args);
  
-               /*
-                * If this is not a rename, clear the incomplete flag and we're
-                * done.
-                */
-               if (!(args->op_flags & XFS_DA_OP_RENAME)) {
-                       if (args->rmtblkno > 0)
-                               error = xfs_attr3_leaf_clearflag(args);
-                       return error;
-               }
+       forkoff = xfs_attr_shortform_allfit(bp, dp);
+       if (forkoff)
+               error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+               /* bp is gone due to xfs_da_shrink_inode */
  
-               /*
-                * If this is an atomic rename operation, we must "flip" the
-                * incomplete flags on the "new" and "old" attribute/value pairs
-                * so that one disappears and one appears atomically.  Then we
-                * must remove the "old" attribute/value pair.
-                *
-                * In a separate transaction, set the incomplete flag on the
-                * "old" attr and clear the incomplete flag on the "new" attr.
-                */
-               error = xfs_attr3_leaf_flipflags(args);
-               if (error)
-                       return error;
-               /*
-                * Commit the flag value change and start the next trans in
-                * series.
-                */
-               dac->dela_state = XFS_DAS_FLIP_LFLAG;
-               trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
-               return -EAGAIN;
-       case XFS_DAS_FLIP_LFLAG:
-               /*
-                * Dismantle the "old" attribute/value pair by removing a
-                * "remote" value (if it exists).
-                */
-               xfs_attr_restore_rmt_blk(args);
-               error = xfs_attr_rmtval_invalidate(args);
-               if (error)
-                       return error;
+       return error;
+}
  
-               fallthrough;
-       case XFS_DAS_RM_LBLK:
-               /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
-               dac->dela_state = XFS_DAS_RM_LBLK;
-               if (args->rmtblkno) {
-                       error = xfs_attr_rmtval_remove(dac);
-                       if (error == -EAGAIN)
-                               trace_xfs_attr_set_iter_return(
-                                       dac->dela_state, args->dp);
-                       if (error)
-                               return error;
+/*
+ * Shrink an attribute from leaf to shortform. Used by the node format remove
+ * path when the node format collapses to a single block and so we have to check
+ * if it can be collapsed further.
+ */
+static int
+xfs_attr_leaf_shrink(
+       struct xfs_da_args      *args)
+{
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_buf          *bp;
+       int                     forkoff;
+       int                     error;
  
-                       dac->dela_state = XFS_DAS_RD_LEAF;
-                       trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
-                       return -EAGAIN;
-               }
+       if (!xfs_attr_is_leaf(dp))
+               return 0;
  
-               fallthrough;
-       case XFS_DAS_RD_LEAF:
-               /*
-                * This is the last step for leaf format. Read the block with
-                * the old attr, remove the old attr, check for shortform
-                * conversion and return.
-                */
-               error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-                                          &bp);
-               if (error)
-                       return error;
+       error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+       if (error)
+               return error;
  
-               xfs_attr3_leaf_remove(bp, args);
+       forkoff = xfs_attr_shortform_allfit(bp, dp);
+       if (forkoff) {
+               error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+               /* bp is gone due to xfs_da_shrink_inode */
+       } else {
+               xfs_trans_brelse(args->trans, bp);
+       }
  
-               forkoff = xfs_attr_shortform_allfit(bp, dp);
-               if (forkoff)
-                       error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-                       /* bp is gone due to xfs_da_shrink_inode */
+       return error;
+}
  
-               return error;
+/*
+ * Run the attribute operation specified in @attr.
+ *
+ * This routine is meant to function as a delayed operation and will set the
+ * state to XFS_DAS_DONE when the operation is complete.  Calling functions will
+ * need to handle this, and call the function again until either an error or
+ * XFS_DAS_DONE is detected.
+ */
+int
+xfs_attr_set_iter(
+       struct xfs_attr_item            *attr)
+{
+       struct xfs_da_args              *args = attr->xattri_da_args;
+       int                             error = 0;
  
-       case XFS_DAS_FOUND_NBLK:
-               /*
-                * Find space for remote blocks and fall into the allocation
-                * state.
-                */
-               if (args->rmtblkno > 0) {
-                       error = xfs_attr_rmtval_find_space(dac);
-                       if (error)
-                               return error;
+       /* State machine switch */
+next_state:
+       switch (attr->xattri_dela_state) {
+       case XFS_DAS_UNINIT:
+               ASSERT(0);
+               return -EFSCORRUPTED;
+       case XFS_DAS_SF_ADD:
+               return xfs_attr_sf_addname(attr);
+       case XFS_DAS_LEAF_ADD:
+               return xfs_attr_leaf_addname(attr);
+       case XFS_DAS_NODE_ADD:
+               return xfs_attr_node_addname(attr);
+
+       case XFS_DAS_SF_REMOVE:
+               error = xfs_attr_sf_removename(args);
+               attr->xattri_dela_state = xfs_attr_complete_op(attr,
+                                               xfs_attr_init_add_state(args));
+               break;
+       case XFS_DAS_LEAF_REMOVE:
+               error = xfs_attr_leaf_removename(args);
+               attr->xattri_dela_state = xfs_attr_complete_op(attr,
+                                               xfs_attr_init_add_state(args));
+               break;
+       case XFS_DAS_NODE_REMOVE:
+               error = xfs_attr_node_removename_setup(attr);
+               if (error == -ENOATTR &&
+                   (args->op_flags & XFS_DA_OP_RECOVERY)) {
+                       attr->xattri_dela_state = xfs_attr_complete_op(attr,
+                                               xfs_attr_init_add_state(args));
+                       error = 0;
+                       break;
                 }
+               if (error)
+                       return error;
+               attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT;
+               if (args->rmtblkno == 0)
+                       attr->xattri_dela_state++;
+               break;
  
+       case XFS_DAS_LEAF_SET_RMT:
+       case XFS_DAS_NODE_SET_RMT:
+               error = xfs_attr_rmtval_find_space(attr);
+               if (error)
+                       return error;
+               attr->xattri_dela_state++;
                 fallthrough;
-       case XFS_DAS_ALLOC_NODE:
-               /*
-                * If there was an out-of-line value, allocate the blocks we
-                * identified for its storage and copy the value.  This is done
-                * after we create the attribute so that we don't overflow the
-                * maximum size of a transaction and/or hit a deadlock.
-                */
-               dac->dela_state = XFS_DAS_ALLOC_NODE;
-               if (args->rmtblkno > 0) {
-                       if (dac->blkcnt > 0) {
-                               error = xfs_attr_rmtval_set_blk(dac);
-                               if (error)
-                                       return error;
-                               trace_xfs_attr_set_iter_return(
-                                       dac->dela_state, args->dp);
-                               return -EAGAIN;
-                       }
-
-                       error = xfs_attr_rmtval_set_value(args);
-                       if (error)
-                               return error;
-               }
  
-               /*
-                * If this was not a rename, clear the incomplete flag and we're
-                * done.
-                */
-               if (!(args->op_flags & XFS_DA_OP_RENAME)) {
-                       if (args->rmtblkno > 0)
-                               error = xfs_attr3_leaf_clearflag(args);
-                       goto out;
-               }
+       case XFS_DAS_LEAF_ALLOC_RMT:
+       case XFS_DAS_NODE_ALLOC_RMT:
+               error = xfs_attr_rmtval_alloc(attr);
+               if (error)
+                       return error;
+               if (attr->xattri_dela_state == XFS_DAS_DONE)
+                       break;
+               goto next_state;
  
+       case XFS_DAS_LEAF_REPLACE:
+       case XFS_DAS_NODE_REPLACE:
                 /*
-                * If this is an atomic rename operation, we must "flip" the
-                * incomplete flags on the "new" and "old" attribute/value pairs
-                * so that one disappears and one appears atomically.  Then we
-                * must remove the "old" attribute/value pair.
-                *
-                * In a separate transaction, set the incomplete flag on the
-                * "old" attr and clear the incomplete flag on the "new" attr.
+                * We must "flip" the incomplete flags on the "new" and "old"
+                * attribute/value pairs so that one disappears and one appears
+                * atomically.
                  */
                 error = xfs_attr3_leaf_flipflags(args);
                 if (error)
-                       goto out;
+                       return error;
                 /*
-                * Commit the flag value change and start the next trans in
-                * series
+                * We must commit the flag value change now to make it atomic
+                * and then we can start the next trans in series at REMOVE_OLD.
                  */
-               dac->dela_state = XFS_DAS_FLIP_NFLAG;
-               trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
-               return -EAGAIN;
+               attr->xattri_dela_state++;
+               break;
  
-       case XFS_DAS_FLIP_NFLAG:
+       case XFS_DAS_LEAF_REMOVE_OLD:
+       case XFS_DAS_NODE_REMOVE_OLD:
                 /*
-                * Dismantle the "old" attribute/value pair by removing a
-                * "remote" value (if it exists).
+                * If we have a remote attr, start the process of removing it
+                * by invalidating any cached buffers.
+                *
+                * If we don't have a remote attr, we skip the remote block
+                * removal state altogether with a second state increment.
                  */
                 xfs_attr_restore_rmt_blk(args);
-
-               error = xfs_attr_rmtval_invalidate(args);
-               if (error)
-                       return error;
-
-               fallthrough;
-       case XFS_DAS_RM_NBLK:
-               /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
-               dac->dela_state = XFS_DAS_RM_NBLK;
                 if (args->rmtblkno) {
-                       error = xfs_attr_rmtval_remove(dac);
-                       if (error == -EAGAIN)
-                               trace_xfs_attr_set_iter_return(
-                                       dac->dela_state, args->dp);
-
+                       error = xfs_attr_rmtval_invalidate(args);
                         if (error)
                                 return error;
+               } else {
+                       attr->xattri_dela_state++;
+               }
+
+               attr->xattri_dela_state++;
+               goto next_state;
  
-                       dac->dela_state = XFS_DAS_CLR_FLAG;
-                       trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
-                       return -EAGAIN;
+       case XFS_DAS_LEAF_REMOVE_RMT:
+       case XFS_DAS_NODE_REMOVE_RMT:
+               error = xfs_attr_rmtval_remove(attr);
+               if (error == -EAGAIN) {
+                       error = 0;
+                       break;
                 }
+               if (error)
+                       return error;
  
-               fallthrough;
-       case XFS_DAS_CLR_FLAG:
                 /*
-                * The last state for node format. Look up the old attr and
-                * remove it.
+                * We've finished removing the remote attr blocks, so commit the
+                * transaction and move on to removing the attr name from the
+                * leaf/node block. Removing the attr might require a full
+                * transaction reservation for btree block freeing, so we
+                * can't do that in the same transaction where we removed the
+                * remote attr blocks.
                  */
-               error = xfs_attr_node_addname_clear_incomplete(dac);
+               attr->xattri_dela_state++;
+               break;
+
+       case XFS_DAS_LEAF_REMOVE_ATTR:
+               error = xfs_attr_leaf_remove_attr(attr);
+               attr->xattri_dela_state = xfs_attr_complete_op(attr,
+                                               xfs_attr_init_add_state(args));
+               break;
+
+       case XFS_DAS_NODE_REMOVE_ATTR:
+               error = xfs_attr_node_remove_attr(attr);
+               if (!error)
+                       error = xfs_attr_leaf_shrink(args);
+               attr->xattri_dela_state = xfs_attr_complete_op(attr,
+                                               xfs_attr_init_add_state(args));
                 break;
         default:
                 ASSERT(0);
                 break;
         }
-out:
+
+       trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp);
         return error;
  }
  
@@ -668,30 +872,79 @@ xfs_attr_lookup(
         return xfs_attr_node_hasname(args, NULL);
  }
  
-/*
- * Remove the attribute specified in @args.
- */
-int
-xfs_attr_remove_args(
+static int
+xfs_attr_item_init(
+       struct xfs_da_args      *args,
+       unsigned int            op_flags,       /* op flag (set or remove) */
+       struct xfs_attr_item    **attr)         /* new xfs_attr_item */
+{
+
+       struct xfs_attr_item    *new;
+
+       new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS);
+       new->xattri_op_flags = op_flags;
+       new->xattri_da_args = args;
+
+       *attr = new;
+       return 0;
+}
+
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_add(
         struct xfs_da_args      *args)
  {
-       int                             error;
-       struct xfs_delattr_context      dac = {
-               .da_args        = args,
-       };
+       struct xfs_attr_item    *new;
+       int                     error = 0;
  
-       do {
-               error = xfs_attr_remove_iter(&dac);
-               if (error != -EAGAIN)
-                       break;
+       error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new);
+       if (error)
+               return error;
  
-               error = xfs_attr_trans_roll(&dac);
-               if (error)
-                       return error;
+       new->xattri_dela_state = xfs_attr_init_add_state(args);
+       xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+       trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
+
+       return 0;
+}
  
-       } while (true);
+/* Replaces an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_replace(
+       struct xfs_da_args      *args)
+{
+       struct xfs_attr_item    *new;
+       int                     error = 0;
  
-       return error;
+       error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REPLACE, &new);
+       if (error)
+               return error;
+
+       new->xattri_dela_state = xfs_attr_init_replace_state(args);
+       xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+       trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
+
+       return 0;
+}
+
+/* Removes an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_remove(
+       struct xfs_da_args      *args)
+{
+
+       struct xfs_attr_item    *new;
+       int                     error;
+
+       error  = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new);
+       if (error)
+               return error;
+
+       new->xattri_dela_state = xfs_attr_init_remove_state(args);
+       xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+       trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
+
+       return 0;
  }
  
  /*
@@ -709,6 +962,7 @@ xfs_attr_set(
         int                     error, local;
         int                     rmt_blks = 0;
         unsigned int            total;
+       int                     delayed = xfs_has_larp(mp);
  
         if (xfs_is_shutdown(dp->i_mount))
                 return -EIO;
@@ -730,8 +984,6 @@ xfs_attr_set(
  
         if (args->value) {
                 XFS_STATS_INC(mp, xs_attr_set);
-
-               args->op_flags |= XFS_DA_OP_ADDNAME;
                 args->total = xfs_attr_calc_size(args, &local);
  
                 /*
@@ -748,61 +1000,68 @@ xfs_attr_set(
                                 return error;
                 }
  
-               tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
-                                M_RES(mp)->tr_attrsetrt.tr_logres *
-                                       args->total;
-               tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-               tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-               total = args->total;
-
                 if (!local)
                         rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
         } else {
                 XFS_STATS_INC(mp, xs_attr_remove);
-
-               tres = M_RES(mp)->tr_attrrm;
-               total = XFS_ATTRRM_SPACE_RES(mp);
                 rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
         }
  
+       if (delayed) {
+               error = xfs_attr_use_log_assist(mp);
+               if (error)
+                       return error;
+       }
+
         /*
          * Root fork attributes can use reserved data blocks for this
          * operation if necessary
          */
+       xfs_init_attr_trans(args, &tres, &total);
         error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
         if (error)
-               return error;
+               goto drop_incompat;
  
         if (args->value || xfs_inode_hasattr(dp)) {
                 error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
                                 XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+               if (error == -EFBIG)
+                       error = xfs_iext_count_upgrade(args->trans, dp,
+                                       XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
                 if (error)
                         goto out_trans_cancel;
         }
  
         error = xfs_attr_lookup(args);
-       if (args->value) {
-               if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
-                       goto out_trans_cancel;
-               if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
-                       goto out_trans_cancel;
-               if (error != -ENOATTR && error != -EEXIST)
+       switch (error) {
+       case -EEXIST:
+               /* if no value, we are performing a remove operation */
+               if (!args->value) {
+                       error = xfs_attr_defer_remove(args);
+                       break;
+               }
+               /* Pure create fails if the attr already exists */
+               if (args->attr_flags & XATTR_CREATE)
                         goto out_trans_cancel;
  
-               error = xfs_attr_set_args(args);
-               if (error)
-                       goto out_trans_cancel;
-               /* shortform attribute has already been committed */
-               if (!args->trans)
-                       goto out_unlock;
-       } else {
-               if (error != -EEXIST)
+               error = xfs_attr_defer_replace(args);
+               break;
+       case -ENOATTR:
+               /* Can't remove what isn't there. */
+               if (!args->value)
                         goto out_trans_cancel;
  
-               error = xfs_attr_remove_args(args);
-               if (error)
+               /* Pure replace fails if no existing attr to replace. */
+               if (args->attr_flags & XATTR_REPLACE)
                         goto out_trans_cancel;
+
+               error = xfs_attr_defer_add(args);
+               break;
+       default:
+               goto out_trans_cancel;
         }
+       if (error)
+               goto out_trans_cancel;
  
         /*
          * If this is a synchronous mount, make sure that the
@@ -821,6 +1080,9 @@ xfs_attr_set(
         error = xfs_trans_commit(args->trans);
  out_unlock:
         xfs_iunlock(dp, XFS_ILOCK_EXCL);
+drop_incompat:
+       if (delayed)
+               xlog_drop_incompat_feat(mp->m_log);
         return error;
  
  out_trans_cancel:
@@ -829,6 +1091,40 @@ out_trans_cancel:
         goto out_unlock;
  }
  
+int __init
+xfs_attri_init_cache(void)
+{
+       xfs_attri_cache = kmem_cache_create("xfs_attri",
+                                           sizeof(struct xfs_attri_log_item),
+                                           0, 0, NULL);
+
+       return xfs_attri_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attri_destroy_cache(void)
+{
+       kmem_cache_destroy(xfs_attri_cache);
+       xfs_attri_cache = NULL;
+}
+
+int __init
+xfs_attrd_init_cache(void)
+{
+       xfs_attrd_cache = kmem_cache_create("xfs_attrd",
+                                           sizeof(struct xfs_attrd_log_item),
+                                           0, 0, NULL);
+
+       return xfs_attrd_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attrd_destroy_cache(void)
+{
+       kmem_cache_destroy(xfs_attrd_cache);
+       xfs_attrd_cache = NULL;
+}
+
  /*========================================================================
   * External routines when attribute list is inside the inode
   *========================================================================*/
@@ -845,28 +1141,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
   * Add a name to the shortform attribute list structure
   * This is the external routine.
   */
-STATIC int
-xfs_attr_shortform_addname(xfs_da_args_t *args)
+static int
+xfs_attr_shortform_addname(
+       struct xfs_da_args      *args)
  {
-       int newsize, forkoff, retval;
+       int                     newsize, forkoff;
+       int                     error;
  
         trace_xfs_attr_sf_addname(args);
  
-       retval = xfs_attr_shortform_lookup(args);
-       if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
-               return retval;
-       if (retval == -EEXIST) {
-               if (args->attr_flags & XATTR_CREATE)
-                       return retval;
-               retval = xfs_attr_sf_removename(args);
-               if (retval)
-                       return retval;
+       error = xfs_attr_shortform_lookup(args);
+       switch (error) {
+       case -ENOATTR:
+               if (args->op_flags & XFS_DA_OP_REPLACE)
+                       return error;
+               break;
+       case -EEXIST:
+               if (!(args->op_flags & XFS_DA_OP_REPLACE))
+                       return error;
+
+               error = xfs_attr_sf_removename(args);
+               if (error)
+                       return error;
+
                 /*
-                * Since we have removed the old attr, clear ATTR_REPLACE so
-                * that the leaf format add routine won't trip over the attr
-                * not being around.
+                * Since we have removed the old attr, clear XFS_DA_OP_REPLACE
+                * so that if the new attr doesn't fit in shortform format, the
+                * leaf format add routine won't trip over the attr not being
+                * around.
                  */
-               args->attr_flags &= ~XATTR_REPLACE;
+               args->op_flags &= ~XFS_DA_OP_REPLACE;
+               break;
+       case 0:
+               break;
+       default:
+               return error;
         }
  
         if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
@@ -889,8 +1198,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
   * External routines when attribute list is one block
   *========================================================================*/
  
-/* Store info about a remote block */
-STATIC void
+/* Save the current remote block info and clear the current pointers. */
+static void
  xfs_attr_save_rmt_blk(
         struct xfs_da_args      *args)
  {
@@ -899,10 +1208,13 @@ xfs_attr_save_rmt_blk(
         args->rmtblkno2 = args->rmtblkno;
         args->rmtblkcnt2 = args->rmtblkcnt;
         args->rmtvaluelen2 = args->rmtvaluelen;
+       args->rmtblkno = 0;
+       args->rmtblkcnt = 0;
+       args->rmtvaluelen = 0;
  }
  
  /* Set stored info about a remote block */
-STATIC void
+static void
  xfs_attr_restore_rmt_blk(
         struct xfs_da_args      *args)
  {
@@ -928,45 +1240,54 @@ xfs_attr_leaf_try_add(
         struct xfs_da_args      *args,
         struct xfs_buf          *bp)
  {
-       int                     retval;
+       int                     error;
  
         /*
-        * Look up the given attribute in the leaf block.  Figure out if
-        * the given flags produce an error or call for an atomic rename.
+        * If the caller provided a buffer to us, it is locked and held in
+        * the transaction because it just did a shortform to leaf conversion.
+        * Hence we don't need to read it again. Otherwise read in the leaf
+        * buffer.
          */
-       retval = xfs_attr_leaf_hasname(args, &bp);
-       if (retval != -ENOATTR && retval != -EEXIST)
-               return retval;
-       if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
-               goto out_brelse;
-       if (retval == -EEXIST) {
-               if (args->attr_flags & XATTR_CREATE)
-                       goto out_brelse;
+       if (bp) {
+               xfs_trans_bhold_release(args->trans, bp);
+       } else {
+               error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+               if (error)
+                       return error;
+       }
  
-               trace_xfs_attr_leaf_replace(args);
-
-               /* save the attribute state for later removal*/
-               args->op_flags |= XFS_DA_OP_RENAME;     /* an atomic rename */
-               xfs_attr_save_rmt_blk(args);
+       /*
+        * Look up the xattr name to set the insertion point for the new xattr.
+        */
+       error = xfs_attr3_leaf_lookup_int(bp, args);
+       switch (error) {
+       case -ENOATTR:
+               if (args->op_flags & XFS_DA_OP_REPLACE)
+                       goto out_brelse;
+               break;
+       case -EEXIST:
+               if (!(args->op_flags & XFS_DA_OP_REPLACE))
+                       goto out_brelse;
  
+               trace_xfs_attr_leaf_replace(args);
                 /*
-                * clear the remote attr state now that it is saved so that the
-                * values reflect the state of the attribute we are about to
+                * Save the existing remote attr state so that the current
+                * values reflect the state of the new attribute we are about to
                  * add, not the attribute we just found and will remove later.
                  */
-               args->rmtblkno = 0;
-               args->rmtblkcnt = 0;
-               args->rmtvaluelen = 0;
+               xfs_attr_save_rmt_blk(args);
+               break;
+       case 0:
+               break;
+       default:
+               goto out_brelse;
         }
  
-       /*
-        * Add the attribute to the leaf block
-        */
         return xfs_attr3_leaf_add(bp, args);
  
  out_brelse:
         xfs_trans_brelse(args->trans, bp);
-       return retval;
+       return error;
  }
  
  /*
@@ -1012,9 +1333,10 @@ xfs_attr_leaf_removename(
         dp = args->dp;
  
         error = xfs_attr_leaf_hasname(args, &bp);
-
         if (error == -ENOATTR) {
                 xfs_trans_brelse(args->trans, bp);
+               if (args->op_flags & XFS_DA_OP_RECOVERY)
+                       return 0;
                 return error;
         } else if (error != -EEXIST)
                 return error;
@@ -1098,46 +1420,45 @@ xfs_attr_node_hasname(
  
  STATIC int
  xfs_attr_node_addname_find_attr(
-       struct xfs_delattr_context      *dac)
+        struct xfs_attr_item   *attr)
  {
-       struct xfs_da_args              *args = dac->da_args;
-       int                             retval;
+       struct xfs_da_args      *args = attr->xattri_da_args;
+       int                     error;
  
         /*
          * Search to see if name already exists, and get back a pointer
          * to where it should go.
          */
-       retval = xfs_attr_node_hasname(args, &dac->da_state);
-       if (retval != -ENOATTR && retval != -EEXIST)
-               goto error;
-
-       if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
-               goto error;
-       if (retval == -EEXIST) {
-               if (args->attr_flags & XATTR_CREATE)
+       error = xfs_attr_node_hasname(args, &attr->xattri_da_state);
+       switch (error) {
+       case -ENOATTR:
+               if (args->op_flags & XFS_DA_OP_REPLACE)
+                       goto error;
+               break;
+       case -EEXIST:
+               if (!(args->op_flags & XFS_DA_OP_REPLACE))
                         goto error;
  
-               trace_xfs_attr_node_replace(args);
-
-               /* save the attribute state for later removal*/
-               args->op_flags |= XFS_DA_OP_RENAME;     /* atomic rename op */
-               xfs_attr_save_rmt_blk(args);
  
+               trace_xfs_attr_node_replace(args);
                 /*
-                * clear the remote attr state now that it is saved so that the
-                * values reflect the state of the attribute we are about to
+                * Save the existing remote attr state so that the current
+                * values reflect the state of the new attribute we are about to
                  * add, not the attribute we just found and will remove later.
                  */
-               args->rmtblkno = 0;
-               args->rmtblkcnt = 0;
-               args->rmtvaluelen = 0;
+               xfs_attr_save_rmt_blk(args);
+               break;
+       case 0:
+               break;
+       default:
+               goto error;
         }
  
         return 0;
  error:
-       if (dac->da_state)
-               xfs_da_state_free(dac->da_state);
-       return retval;
+       if (attr->xattri_da_state)
+               xfs_da_state_free(attr->xattri_da_state);
+       return error;
  }
  
  /*
@@ -1146,21 +1467,13 @@ error:
   * This will involve walking down the Btree, and may involve splitting
   * leaf nodes and even splitting intermediate nodes up to and including
   * the root node (a special case of an intermediate node).
- *
- * "Remote" attribute values confuse the issue and atomic rename operations
- * add a whole extra layer of confusion on top of that.
- *
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled.  Calling functions will need
- * to handle this, and recall the function until a successful error code is
- *returned.
   */
-STATIC int
-xfs_attr_node_addname(
-       struct xfs_delattr_context      *dac)
+static int
+xfs_attr_node_try_addname(
+       struct xfs_attr_item            *attr)
  {
-       struct xfs_da_args              *args = dac->da_args;
-       struct xfs_da_state             *state = dac->da_state;
+       struct xfs_da_args              *args = attr->xattri_da_args;
+       struct xfs_da_state             *state = attr->xattri_da_state;
         struct xfs_da_state_blk         *blk;
         int                             error;
  
@@ -1175,25 +1488,9 @@ xfs_attr_node_addname(
                         /*
                          * Its really a single leaf node, but it had
                          * out-of-line values so it looked like it *might*
-                        * have been a b-tree.
+                        * have been a b-tree. Let the caller deal with this.
                          */
-                       xfs_da_state_free(state);
-                       state = NULL;
-                       error = xfs_attr3_leaf_to_node(args);
-                       if (error)
-                               goto out;
-
-                       /*
-                        * Now that we have converted the leaf to a node, we can
-                        * roll the transaction, and try xfs_attr3_leaf_add
-                        * again on re-entry.  No need to set dela_state to do
-                        * this. dela_state is still unset by this function at
-                        * this point.
-                        */
-                       dac->flags |= XFS_DAC_DEFER_FINISH;
-                       trace_xfs_attr_node_addname_return(
-                                       dac->dela_state, args->dp);
-                       return -EAGAIN;
+                       goto out;
                 }
  
                 /*
@@ -1205,7 +1502,6 @@ xfs_attr_node_addname(
                 error = xfs_da3_split(state);
                 if (error)
                         goto out;
-               dac->flags |= XFS_DAC_DEFER_FINISH;
         } else {
                 /*
                  * Addition succeeded, update Btree hashvals.
@@ -1214,24 +1510,42 @@ xfs_attr_node_addname(
         }
  
  out:
-       if (state)
-               xfs_da_state_free(state);
+       xfs_da_state_free(state);
         return error;
  }
  
+static int
+xfs_attr_node_removename(
+       struct xfs_da_args      *args,
+       struct xfs_da_state     *state)
+{
+       struct xfs_da_state_blk *blk;
+       int                     retval;
  
-STATIC int
-xfs_attr_node_addname_clear_incomplete(
-       struct xfs_delattr_context      *dac)
+       /*
+        * Remove the name and update the hashvals in the tree.
+        */
+       blk = &state->path.blk[state->path.active-1];
+       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+       retval = xfs_attr3_leaf_remove(blk->bp, args);
+       xfs_da3_fixhashpath(state, &state->path);
+
+       return retval;
+}
+
+static int
+xfs_attr_node_remove_attr(
+       struct xfs_attr_item            *attr)
  {
-       struct xfs_da_args              *args = dac->da_args;
+       struct xfs_da_args              *args = attr->xattri_da_args;
         struct xfs_da_state             *state = NULL;
         int                             retval = 0;
         int                             error = 0;
  
         /*
-        * Re-find the "old" attribute entry after any split ops. The INCOMPLETE
-        * flag means that we will find the "old" attr, not the "new" one.
+        * The attr we are removing has already been marked incomplete, so
+        * we need to set the filter appropriately to re-find the "old"
+        * attribute entry after any split ops.
          */
         args->attr_filter |= XFS_ATTR_INCOMPLETE;
         state = xfs_da_state_alloc(args);
@@ -1260,362 +1574,6 @@ out:
         return retval;
  }
  
-/*
- * Shrink an attribute from leaf to shortform
- */
-STATIC int
-xfs_attr_node_shrink(
-       struct xfs_da_args      *args,
-       struct xfs_da_state     *state)
-{
-       struct xfs_inode        *dp = args->dp;
-       int                     error, forkoff;
-       struct xfs_buf          *bp;
-
-       /*
-        * Have to get rid of the copy of this dabuf in the state.
-        */
-       ASSERT(state->path.active == 1);
-       ASSERT(state->path.blk[0].bp);
-       state->path.blk[0].bp = NULL;
-
-       error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
-       if (error)
-               return error;
-
-       forkoff = xfs_attr_shortform_allfit(bp, dp);
-       if (forkoff) {
-               error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-               /* bp is gone due to xfs_da_shrink_inode */
-       } else
-               xfs_trans_brelse(args->trans, bp);
-
-       return error;
-}
-
-/*
- * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
- * for later deletion of the entry.
- */
-STATIC int
-xfs_attr_leaf_mark_incomplete(
-       struct xfs_da_args      *args,
-       struct xfs_da_state     *state)
-{
-       int                     error;
-
-       /*
-        * Fill in disk block numbers in the state structure
-        * so that we can get the buffers back after we commit
-        * several transactions in the following calls.
-        */
-       error = xfs_attr_fillstate(state);
-       if (error)
-               return error;
-
-       /*
-        * Mark the attribute as INCOMPLETE
-        */
-       return xfs_attr3_leaf_setflag(args);
-}
-
-/*
- * Initial setup for xfs_attr_node_removename.  Make sure the attr is there and
- * the blocks are valid.  Attr keys with remote blocks will be marked
- * incomplete.
- */
-STATIC
-int xfs_attr_node_removename_setup(
-       struct xfs_delattr_context      *dac)
-{
-       struct xfs_da_args              *args = dac->da_args;
-       struct xfs_da_state             **state = &dac->da_state;
-       int                             error;
-
-       error = xfs_attr_node_hasname(args, state);
-       if (error != -EEXIST)
-               goto out;
-       error = 0;
-
-       ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
-       ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
-               XFS_ATTR_LEAF_MAGIC);
-
-       if (args->rmtblkno > 0) {
-               error = xfs_attr_leaf_mark_incomplete(args, *state);
-               if (error)
-                       goto out;
-
-               error = xfs_attr_rmtval_invalidate(args);
-       }
-out:
-       if (error)
-               xfs_da_state_free(*state);
-
-       return error;
-}
-
-STATIC int
-xfs_attr_node_removename(
-       struct xfs_da_args      *args,
-       struct xfs_da_state     *state)
-{
-       struct xfs_da_state_blk *blk;
-       int                     retval;
-
-       /*
-        * Remove the name and update the hashvals in the tree.
-        */
-       blk = &state->path.blk[state->path.active-1];
-       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-       retval = xfs_attr3_leaf_remove(blk->bp, args);
-       xfs_da3_fixhashpath(state, &state->path);
-
-       return retval;
-}
-
-/*
- * Remove the attribute specified in @args.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- *
- * This routine is meant to function as either an in-line or delayed operation,
- * and may return -EAGAIN when the transaction needs to be rolled.  Calling
- * functions will need to handle this, and call the function until a
- * successful error code is returned.
- */
-int
-xfs_attr_remove_iter(
-       struct xfs_delattr_context      *dac)
-{
-       struct xfs_da_args              *args = dac->da_args;
-       struct xfs_da_state             *state = dac->da_state;
-       int                             retval, error = 0;
-       struct xfs_inode                *dp = args->dp;
-
-       trace_xfs_attr_node_removename(args);
-
-       switch (dac->dela_state) {
-       case XFS_DAS_UNINIT:
-               if (!xfs_inode_hasattr(dp))
-                       return -ENOATTR;
-
-               /*
-                * Shortform or leaf formats don't require transaction rolls and
-                * thus state transitions. Call the right helper and return.
-                */
-               if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
-                       return xfs_attr_sf_removename(args);
-
-               if (xfs_attr_is_leaf(dp))
-                       return xfs_attr_leaf_removename(args);
-
-               /*
-                * Node format may require transaction rolls. Set up the
-                * state context and fall into the state machine.
-                */
-               if (!dac->da_state) {
-                       error = xfs_attr_node_removename_setup(dac);
-                       if (error)
-                               return error;
-                       state = dac->da_state;
-               }
-
-               fallthrough;
-       case XFS_DAS_RMTBLK:
-               dac->dela_state = XFS_DAS_RMTBLK;
-
-               /*
-                * If there is an out-of-line value, de-allocate the blocks.
-                * This is done before we remove the attribute so that we don't
-                * overflow the maximum size of a transaction and/or hit a
-                * deadlock.
-                */
-               if (args->rmtblkno > 0) {
-                       /*
-                        * May return -EAGAIN. Roll and repeat until all remote
-                        * blocks are removed.
-                        */
-                       error = xfs_attr_rmtval_remove(dac);
-                       if (error == -EAGAIN) {
-                               trace_xfs_attr_remove_iter_return(
-                                               dac->dela_state, args->dp);
-                               return error;
-                       } else if (error) {
-                               goto out;
-                       }
-
-                       /*
-                        * Refill the state structure with buffers (the prior
-                        * calls released our buffers) and close out this
-                        * transaction before proceeding.
-                        */
-                       ASSERT(args->rmtblkno == 0);
-                       error = xfs_attr_refillstate(state);
-                       if (error)
-                               goto out;
-                       dac->dela_state = XFS_DAS_RM_NAME;
-                       dac->flags |= XFS_DAC_DEFER_FINISH;
-                       trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp);
-                       return -EAGAIN;
-               }
-
-               fallthrough;
-       case XFS_DAS_RM_NAME:
-               /*
-                * If we came here fresh from a transaction roll, reattach all
-                * the buffers to the current transaction.
-                */
-               if (dac->dela_state == XFS_DAS_RM_NAME) {
-                       error = xfs_attr_refillstate(state);
-                       if (error)
-                               goto out;
-               }
-
-               retval = xfs_attr_node_removename(args, state);
-
-               /*
-                * Check to see if the tree needs to be collapsed. If so, roll
-                * the transacton and fall into the shrink state.
-                */
-               if (retval && (state->path.active > 1)) {
-                       error = xfs_da3_join(state);
-                       if (error)
-                               goto out;
-
-                       dac->flags |= XFS_DAC_DEFER_FINISH;
-                       dac->dela_state = XFS_DAS_RM_SHRINK;
-                       trace_xfs_attr_remove_iter_return(
-                                       dac->dela_state, args->dp);
-                       return -EAGAIN;
-               }
-
-               fallthrough;
-       case XFS_DAS_RM_SHRINK:
-               /*
-                * If the result is small enough, push it all into the inode.
-                * This is our final state so it's safe to return a dirty
-                * transaction.
-                */
-               if (xfs_attr_is_leaf(dp))
-                       error = xfs_attr_node_shrink(args, state);
-               ASSERT(error != -EAGAIN);
-               break;
-       default:
-               ASSERT(0);
-               error = -EINVAL;
-               goto out;
-       }
-out:
-       if (state)
-               xfs_da_state_free(state);
-       return error;
-}
-
-/*
- * Fill in the disk block numbers in the state structure for the buffers
- * that are attached to the state structure.
- * This is done so that we can quickly reattach ourselves to those buffers
- * after some set of transaction commits have released these buffers.
- */
-STATIC int
-xfs_attr_fillstate(xfs_da_state_t *state)
-{
-       xfs_da_state_path_t *path;
-       xfs_da_state_blk_t *blk;
-       int level;
-
-       trace_xfs_attr_fillstate(state->args);
-
-       /*
-        * Roll down the "path" in the state structure, storing the on-disk
-        * block number for those buffers in the "path".
-        */
-       path = &state->path;
-       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-               if (blk->bp) {
-                       blk->disk_blkno = xfs_buf_daddr(blk->bp);
-                       blk->bp = NULL;
-               } else {
-                       blk->disk_blkno = 0;
-               }
-       }
-
-       /*
-        * Roll down the "altpath" in the state structure, storing the on-disk
-        * block number for those buffers in the "altpath".
-        */
-       path = &state->altpath;
-       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-               if (blk->bp) {
-                       blk->disk_blkno = xfs_buf_daddr(blk->bp);
-                       blk->bp = NULL;
-               } else {
-                       blk->disk_blkno = 0;
-               }
-       }
-
-       return 0;
-}
-
-/*
- * Reattach the buffers to the state structure based on the disk block
- * numbers stored in the state structure.
- * This is done after some set of transaction commits have released those
- * buffers from our grip.
- */
-STATIC int
-xfs_attr_refillstate(xfs_da_state_t *state)
-{
-       xfs_da_state_path_t *path;
-       xfs_da_state_blk_t *blk;
-       int level, error;
-
-       trace_xfs_attr_refillstate(state->args);
-
-       /*
-        * Roll down the "path" in the state structure, storing the on-disk
-        * block number for those buffers in the "path".
-        */
-       path = &state->path;
-       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-               if (blk->disk_blkno) {
-                       error = xfs_da3_node_read_mapped(state->args->trans,
-                                       state->args->dp, blk->disk_blkno,
-                                       &blk->bp, XFS_ATTR_FORK);
-                       if (error)
-                               return error;
-               } else {
-                       blk->bp = NULL;
-               }
-       }
-
-       /*
-        * Roll down the "altpath" in the state structure, storing the on-disk
-        * block number for those buffers in the "altpath".
-        */
-       path = &state->altpath;
-       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-               if (blk->disk_blkno) {
-                       error = xfs_da3_node_read_mapped(state->args->trans,
-                                       state->args->dp, blk->disk_blkno,
-                                       &blk->bp, XFS_ATTR_FORK);
-                       if (error)
-                               return error;
-               } else {
-                       blk->bp = NULL;
-               }
-       }
-
-       return 0;
-}
-
  /*
   * Retrieve the attribute data from a node attribute list.
   *
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h

index 5e71f71..1af7abe 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -28,6 +28,15 @@ struct xfs_attr_list_context;
   */
  #define        ATTR_MAX_VALUELEN       (64*1024)       /* max length of a value */
  
+static inline bool xfs_has_larp(struct xfs_mount *mp)
+{
+#ifdef DEBUG
+       return xfs_globals.larp;
+#else
+       return false;
+#endif
+}
+
  /*
   * Kernel-internal version of the attrlist cursor.
   */
@@ -425,7 +434,7 @@ struct xfs_attr_list_context {
   */
  
  /*
- * Enum values for xfs_delattr_context.da_state
+ * Enum values for xfs_attr_item.xattri_da_state
   *
   * These values are used by delayed attribute operations to keep track  of where
   * they were before they returned -EAGAIN.  A return code of -EAGAIN signals the
@@ -434,46 +443,105 @@ struct xfs_attr_list_context {
   * to where it was and resume executing where it left off.
   */
  enum xfs_delattr_state {
-       XFS_DAS_UNINIT          = 0,  /* No state has been set yet */
-       XFS_DAS_RMTBLK,               /* Removing remote blks */
-       XFS_DAS_RM_NAME,              /* Remove attr name */
-       XFS_DAS_RM_SHRINK,            /* We are shrinking the tree */
-       XFS_DAS_FOUND_LBLK,           /* We found leaf blk for attr */
-       XFS_DAS_FOUND_NBLK,           /* We found node blk for attr */
-       XFS_DAS_FLIP_LFLAG,           /* Flipped leaf INCOMPLETE attr flag */
-       XFS_DAS_RM_LBLK,              /* A rename is removing leaf blocks */
-       XFS_DAS_RD_LEAF,              /* Read in the new leaf */
-       XFS_DAS_ALLOC_NODE,           /* We are allocating node blocks */
-       XFS_DAS_FLIP_NFLAG,           /* Flipped node INCOMPLETE attr flag */
-       XFS_DAS_RM_NBLK,              /* A rename is removing node blocks */
-       XFS_DAS_CLR_FLAG,             /* Clear incomplete flag */
+       XFS_DAS_UNINIT          = 0,    /* No state has been set yet */
+
+       /*
+        * Initial sequence states. The replace setup code relies on the
+        * ADD and REMOVE states for a specific format to be sequential so
+        * that we can transform the initial operation to be performed
+        * according to the xfs_has_larp() state easily.
+        */
+       XFS_DAS_SF_ADD,                 /* Initial sf add state */
+       XFS_DAS_SF_REMOVE,              /* Initial sf replace/remove state */
+
+       XFS_DAS_LEAF_ADD,               /* Initial leaf add state */
+       XFS_DAS_LEAF_REMOVE,            /* Initial leaf replace/remove state */
+
+       XFS_DAS_NODE_ADD,               /* Initial node add state */
+       XFS_DAS_NODE_REMOVE,            /* Initial node replace/remove state */
+
+       /* Leaf state set/replace/remove sequence */
+       XFS_DAS_LEAF_SET_RMT,           /* set a remote xattr from a leaf */
+       XFS_DAS_LEAF_ALLOC_RMT,         /* We are allocating remote blocks */
+       XFS_DAS_LEAF_REPLACE,           /* Perform replace ops on a leaf */
+       XFS_DAS_LEAF_REMOVE_OLD,        /* Start removing old attr from leaf */
+       XFS_DAS_LEAF_REMOVE_RMT,        /* A rename is removing remote blocks */
+       XFS_DAS_LEAF_REMOVE_ATTR,       /* Remove the old attr from a leaf */
+
+       /* Node state sequence, must match leaf state above */
+       XFS_DAS_NODE_SET_RMT,           /* set a remote xattr from a node */
+       XFS_DAS_NODE_ALLOC_RMT,         /* We are allocating remote blocks */
+       XFS_DAS_NODE_REPLACE,           /* Perform replace ops on a node */
+       XFS_DAS_NODE_REMOVE_OLD,        /* Start removing old attr from node */
+       XFS_DAS_NODE_REMOVE_RMT,        /* A rename is removing remote blocks */
+       XFS_DAS_NODE_REMOVE_ATTR,       /* Remove the old attr from a node */
+
+       XFS_DAS_DONE,                   /* finished operation */
  };
  
+#define XFS_DAS_STRINGS        \
+       { XFS_DAS_UNINIT,               "XFS_DAS_UNINIT" }, \
+       { XFS_DAS_SF_ADD,               "XFS_DAS_SF_ADD" }, \
+       { XFS_DAS_SF_REMOVE,            "XFS_DAS_SF_REMOVE" }, \
+       { XFS_DAS_LEAF_ADD,             "XFS_DAS_LEAF_ADD" }, \
+       { XFS_DAS_LEAF_REMOVE,          "XFS_DAS_LEAF_REMOVE" }, \
+       { XFS_DAS_NODE_ADD,             "XFS_DAS_NODE_ADD" }, \
+       { XFS_DAS_NODE_REMOVE,          "XFS_DAS_NODE_REMOVE" }, \
+       { XFS_DAS_LEAF_SET_RMT,         "XFS_DAS_LEAF_SET_RMT" }, \
+       { XFS_DAS_LEAF_ALLOC_RMT,       "XFS_DAS_LEAF_ALLOC_RMT" }, \
+       { XFS_DAS_LEAF_REPLACE,         "XFS_DAS_LEAF_REPLACE" }, \
+       { XFS_DAS_LEAF_REMOVE_OLD,      "XFS_DAS_LEAF_REMOVE_OLD" }, \
+       { XFS_DAS_LEAF_REMOVE_RMT,      "XFS_DAS_LEAF_REMOVE_RMT" }, \
+       { XFS_DAS_LEAF_REMOVE_ATTR,     "XFS_DAS_LEAF_REMOVE_ATTR" }, \
+       { XFS_DAS_NODE_SET_RMT,         "XFS_DAS_NODE_SET_RMT" }, \
+       { XFS_DAS_NODE_ALLOC_RMT,       "XFS_DAS_NODE_ALLOC_RMT" },  \
+       { XFS_DAS_NODE_REPLACE,         "XFS_DAS_NODE_REPLACE" },  \
+       { XFS_DAS_NODE_REMOVE_OLD,      "XFS_DAS_NODE_REMOVE_OLD" }, \
+       { XFS_DAS_NODE_REMOVE_RMT,      "XFS_DAS_NODE_REMOVE_RMT" }, \
+       { XFS_DAS_NODE_REMOVE_ATTR,     "XFS_DAS_NODE_REMOVE_ATTR" }, \
+       { XFS_DAS_DONE,                 "XFS_DAS_DONE" }
+
  /*
- * Defines for xfs_delattr_context.flags
+ * Defines for xfs_attr_item.xattri_flags
   */
-#define XFS_DAC_DEFER_FINISH           0x01 /* finish the transaction */
-#define XFS_DAC_LEAF_ADDNAME_INIT      0x02 /* xfs_attr_leaf_addname init*/
+#define XFS_DAC_LEAF_ADDNAME_INIT      0x01 /* xfs_attr_leaf_addname init*/
  
  /*
   * Context used for keeping track of delayed attribute operations
   */
-struct xfs_delattr_context {
-       struct xfs_da_args      *da_args;
+struct xfs_attr_item {
+       struct xfs_da_args              *xattri_da_args;
+
+       /*
+        * Used by xfs_attr_set to hold a leaf buffer across a transaction roll
+        */
+       struct xfs_buf                  *xattri_leaf_bp;
  
         /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
-       struct xfs_bmbt_irec    map;
-       xfs_dablk_t             lblkno;
-       int                     blkcnt;
+       struct xfs_bmbt_irec            xattri_map;
+       xfs_dablk_t                     xattri_lblkno;
+       int                             xattri_blkcnt;
  
         /* Used in xfs_attr_node_removename to roll through removing blocks */
-       struct xfs_da_state     *da_state;
+       struct xfs_da_state             *xattri_da_state;
  
         /* Used to keep track of current state of delayed operation */
-       unsigned int            flags;
-       enum xfs_delattr_state  dela_state;
+       unsigned int                    xattri_flags;
+       enum xfs_delattr_state          xattri_dela_state;
+
+       /*
+        * Attr operation being performed - XFS_ATTR_OP_FLAGS_*
+        */
+       unsigned int                    xattri_op_flags;
+
+       /*
+        * used to log this item to an intent containing a list of attrs to
+        * commit later
+        */
+       struct list_head                xattri_list;
  };
  
+
  /*========================================================================
   * Function prototypes for the kernel.
   *========================================================================*/
@@ -489,11 +557,81 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip);
  int xfs_attr_get_ilocked(struct xfs_da_args *args);
  int xfs_attr_get(struct xfs_da_args *args);
  int xfs_attr_set(struct xfs_da_args *args);
-int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_attr_remove_args(struct xfs_da_args *args);
-int xfs_attr_remove_iter(struct xfs_delattr_context *dac);
+int xfs_attr_set_iter(struct xfs_attr_item *attr);
+int xfs_attr_remove_iter(struct xfs_attr_item *attr);
  bool xfs_attr_namecheck(const void *name, size_t length);
-void xfs_delattr_context_init(struct xfs_delattr_context *dac,
-                             struct xfs_da_args *args);
+int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
+void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
+                        unsigned int *total);
+
+extern struct kmem_cache       *xfs_attri_cache;
+extern struct kmem_cache       *xfs_attrd_cache;
+
+int __init xfs_attri_init_cache(void);
+void xfs_attri_destroy_cache(void);
+int __init xfs_attrd_init_cache(void);
+void xfs_attrd_destroy_cache(void);
+
+/*
+ * Return true if the attr fork is in local (shortform) format, or is in
+ * extents format but has no extents.
+ */
+static inline bool
+xfs_attr_is_shortform(
+       struct xfs_inode    *ip)
+{
+       return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
+              (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+               ip->i_afp->if_nextents == 0);
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_add_state(struct xfs_da_args *args)
+{
+       /*
+        * When called from the completion of an attr remove to determine the
+        * next state, the attribute fork may be null. This can only occur
+        * on a pure remove, but we grab the next state before we check if a
+        * replace operation is being performed. If we are called from any other
+        * context, i_afp is guaranteed to exist. Hence if the attr fork is
+        * null, we were called from a pure remove operation and so we are done.
+        */
+       if (!args->dp->i_afp)
+               return XFS_DAS_DONE;
+
+       args->op_flags |= XFS_DA_OP_ADDNAME;
+       if (xfs_attr_is_shortform(args->dp))
+               return XFS_DAS_SF_ADD;
+       if (xfs_attr_is_leaf(args->dp))
+               return XFS_DAS_LEAF_ADD;
+       return XFS_DAS_NODE_ADD;
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_remove_state(struct xfs_da_args *args)
+{
+       args->op_flags |= XFS_DA_OP_REMOVE;
+       if (xfs_attr_is_shortform(args->dp))
+               return XFS_DAS_SF_REMOVE;
+       if (xfs_attr_is_leaf(args->dp))
+               return XFS_DAS_LEAF_REMOVE;
+       return XFS_DAS_NODE_REMOVE;
+}
+
+/*
+ * If we are logging the attributes, then we have to start with removal of the
+ * old attribute so that there is always consistent state that we can recover
+ * from if the system goes down part way through. We always log the new attr
+ * value, so even when we remove the attr first we still have the information in
+ * the log to finish the replace operation atomically.
+ */
+static inline enum xfs_delattr_state
+xfs_attr_init_replace_state(struct xfs_da_args *args)
+{
+       args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
+       if (xfs_has_larp(args->dp->i_mount))
+               return xfs_attr_init_remove_state(args);
+       return xfs_attr_init_add_state(args);
+}
  
  #endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c

index 014daa8..15a9904 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -28,6 +28,7 @@
  #include "xfs_dir2.h"
  #include "xfs_log.h"
  #include "xfs_ag.h"
+#include "xfs_errortag.h"
  
  
  /*
@@ -309,6 +310,15 @@ xfs_attr3_leaf_verify(
         if (fa)
                 return fa;
  
+       /*
+        * Empty leaf blocks should never occur;  they imply the existence of a
+        * software bug that needs fixing. xfs_repair also flags them as a
+        * corruption that needs fixing, so we should never let these go to
+        * disk.
+        */
+       if (ichdr.count == 0)
+               return __this_address;
+
         /*
          * firstused is the block offset of the first name info structure.
          * Make sure it doesn't go off the block or crash into the header.
@@ -445,6 +455,14 @@ xfs_attr3_leaf_read(
   * Namespace helper routines
   *========================================================================*/
  
+/*
+ * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE
+ * flag on disk - if there's an incomplete attr then recovery needs to tear it
+ * down. If there's no incomplete attr, then recovery needs to tear that attr
+ * down to replace it with the attr that has been logged. In this case, the
+ * INCOMPLETE flag will not be set in args->attr_filter, but rather
+ * XFS_DA_OP_RECOVERY will be set in args->op_flags.
+ */
  static bool
  xfs_attr_match(
         struct xfs_da_args      *args,
@@ -452,14 +470,18 @@ xfs_attr_match(
         unsigned char           *name,
         int                     flags)
  {
+
         if (args->namelen != namelen)
                 return false;
         if (memcmp(args->name, name, namelen) != 0)
                 return false;
-       /*
-        * If we are looking for incomplete entries, show only those, else only
-        * show complete entries.
-        */
+
+       /* Recovery ignores the INCOMPLETE flag. */
+       if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
+           args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
+               return true;
+
+       /* All remaining matches need to be filtered by INCOMPLETE state. */
         if (args->attr_filter !=
             (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
                 return false;
@@ -798,6 +820,14 @@ xfs_attr_sf_removename(
         sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
  
         error = xfs_attr_sf_findname(args, &sfe, &base);
+
+       /*
+        * If we are recovering an operation, finding nothing to
+        * remove is not an error - it just means there was nothing
+        * to clean up.
+        */
+       if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
+               return 0;
         if (error != -EEXIST)
                 return error;
         size = xfs_attr_sf_entsize(sfe);
@@ -818,7 +848,7 @@ xfs_attr_sf_removename(
         totsize -= size;
         if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
             (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
-           !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+           !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
                 xfs_attr_fork_remove(dp, args->trans);
         } else {
                 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
@@ -1127,9 +1157,17 @@ xfs_attr3_leaf_to_shortform(
                 goto out;
  
         if (forkoff == -1) {
-               ASSERT(xfs_has_attr2(dp->i_mount));
-               ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
-               xfs_attr_fork_remove(dp, args->trans);
+               /*
+                * Don't remove the attr fork if this operation is the first
+                * part of an attr replace operation. We're going to add a new
+                * attr immediately, so we need to keep the attr fork around in
+                * this case.
+                */
+               if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
+                       ASSERT(xfs_has_attr2(dp->i_mount));
+                       ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
+                       xfs_attr_fork_remove(dp, args->trans);
+               }
                 goto out;
         }
  
@@ -1189,6 +1227,11 @@ xfs_attr3_leaf_to_node(
  
         trace_xfs_attr_leaf_to_node(args);
  
+       if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+               error = -EIO;
+               goto out;
+       }
+
         error = xfs_da_grow_inode(args, &blkno);
         if (error)
                 goto out;
@@ -1486,8 +1529,9 @@ xfs_attr3_leaf_add_work(
         entry->flags = args->attr_filter;
         if (tmp)
                 entry->flags |= XFS_ATTR_LOCAL;
-       if (args->op_flags & XFS_DA_OP_RENAME) {
-               entry->flags |= XFS_ATTR_INCOMPLETE;
+       if (args->op_flags & XFS_DA_OP_REPLACE) {
+               if (!xfs_has_larp(mp))
+                       entry->flags |= XFS_ATTR_INCOMPLETE;
                 if ((args->blkno2 == args->blkno) &&
                     (args->index2 <= args->index)) {
                         args->index2++;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c

index 83b95be..4250159 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -568,14 +568,14 @@ xfs_attr_rmtval_stale(
   */
  int
  xfs_attr_rmtval_find_space(
-       struct xfs_delattr_context      *dac)
+       struct xfs_attr_item            *attr)
  {
-       struct xfs_da_args              *args = dac->da_args;
-       struct xfs_bmbt_irec            *map = &dac->map;
+       struct xfs_da_args              *args = attr->xattri_da_args;
+       struct xfs_bmbt_irec            *map = &attr->xattri_map;
         int                             error;
  
-       dac->lblkno = 0;
-       dac->blkcnt = 0;
+       attr->xattri_lblkno = 0;
+       attr->xattri_blkcnt = 0;
         args->rmtblkcnt = 0;
         args->rmtblkno = 0;
         memset(map, 0, sizeof(struct xfs_bmbt_irec));
@@ -584,8 +584,8 @@ xfs_attr_rmtval_find_space(
         if (error)
                 return error;
  
-       dac->blkcnt = args->rmtblkcnt;
-       dac->lblkno = args->rmtblkno;
+       attr->xattri_blkcnt = args->rmtblkcnt;
+       attr->xattri_lblkno = args->rmtblkno;
  
         return 0;
  }
@@ -598,17 +598,18 @@ xfs_attr_rmtval_find_space(
   */
  int
  xfs_attr_rmtval_set_blk(
-       struct xfs_delattr_context      *dac)
+       struct xfs_attr_item            *attr)
  {
-       struct xfs_da_args              *args = dac->da_args;
+       struct xfs_da_args              *args = attr->xattri_da_args;
         struct xfs_inode                *dp = args->dp;
-       struct xfs_bmbt_irec            *map = &dac->map;
+       struct xfs_bmbt_irec            *map = &attr->xattri_map;
         int nmap;
         int error;
  
         nmap = 1;
-       error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno,
-                       dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total,
+       error = xfs_bmapi_write(args->trans, dp,
+                       (xfs_fileoff_t)attr->xattri_lblkno,
+                       attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
                         map, &nmap);
         if (error)
                 return error;
@@ -618,8 +619,8 @@ xfs_attr_rmtval_set_blk(
                (map->br_startblock != HOLESTARTBLOCK));
  
         /* roll attribute extent map forwards */
-       dac->lblkno += map->br_blockcount;
-       dac->blkcnt -= map->br_blockcount;
+       attr->xattri_lblkno += map->br_blockcount;
+       attr->xattri_blkcnt -= map->br_blockcount;
  
         return 0;
  }
@@ -673,9 +674,9 @@ xfs_attr_rmtval_invalidate(
   */
  int
  xfs_attr_rmtval_remove(
-       struct xfs_delattr_context      *dac)
+       struct xfs_attr_item            *attr)
  {
-       struct xfs_da_args              *args = dac->da_args;
+       struct xfs_da_args              *args = attr->xattri_da_args;
         int                             error, done;
  
         /*
@@ -695,8 +696,8 @@ xfs_attr_rmtval_remove(
          * the parent
          */
         if (!done) {
-               dac->flags |= XFS_DAC_DEFER_FINISH;
-               trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp);
+               trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state,
+                                                   args->dp);
                 return -EAGAIN;
         }
  
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h

index d72eff3..62b398e 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args);
  int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
                 xfs_buf_flags_t incore_flags);
  int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
-int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_remove(struct xfs_attr_item *attr);
  int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
  int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
-int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac);
-int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_set_blk(struct xfs_attr_item *attr);
+int xfs_attr_rmtval_find_space(struct xfs_attr_item *attr);
  #endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c

index 74198dd..6833110 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels(
         xfs_mount_t     *mp,            /* file system mount structure */
         int             whichfork)      /* data or attr fork */
  {
+       uint64_t        maxblocks;      /* max blocks at this level */
+       xfs_extnum_t    maxleafents;    /* max leaf entries possible */
         int             level;          /* btree level */
-       uint            maxblocks;      /* max blocks at this level */
-       uint            maxleafents;    /* max leaf entries possible */
         int             maxrootrecs;    /* max records in root block */
         int             minleafrecs;    /* min records in leaf block */
         int             minnoderecs;    /* min records in node block */
         int             sz;             /* root block size */
  
         /*
-        * The maximum number of extents in a file, hence the maximum number of
-        * leaf entries, is controlled by the size of the on-disk extent count,
-        * either a signed 32-bit number for the data fork, or a signed 16-bit
-        * number for the attr fork.
+        * The maximum number of extents in a fork, hence the maximum number of
+        * leaf entries, is controlled by the size of the on-disk extent count.
          *
          * Note that we can no longer assume that if we are in ATTR1 that the
          * fork offset of all the inodes will be
@@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels(
          * ATTR2 we have to assume the worst case scenario of a minimum size
          * available.
          */
-       if (whichfork == XFS_DATA_FORK) {
-               maxleafents = MAXEXTNUM;
+       maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+                               whichfork);
+       if (whichfork == XFS_DATA_FORK)
                 sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
-       } else {
-               maxleafents = MAXAEXTNUM;
+       else
                 sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
-       }
+
         maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
         minleafrecs = mp->m_bmap_dmnr[0];
         minnoderecs = mp->m_bmap_dmnr[1];
-       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+       maxblocks = howmany_64(maxleafents, minleafrecs);
         for (level = 1; maxblocks > 1; level++) {
                 if (maxblocks <= maxrootrecs)
                         maxblocks = 1;
                 else
-                       maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+                       maxblocks = howmany_64(maxblocks, minnoderecs);
         }
         mp->m_bm_maxlevels[whichfork] = level;
         ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
@@ -468,7 +466,7 @@ error0:
         if (bp_release)
                 xfs_trans_brelse(NULL, bp);
  error_norelse:
-       xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+       xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
                 __func__, i);
         xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -485,7 +483,7 @@ STATIC void
  xfs_bmap_validate_ret(
         xfs_fileoff_t           bno,
         xfs_filblks_t           len,
-       int                     flags,
+       uint32_t                flags,
         xfs_bmbt_irec_t         *mval,
         int                     nmap,
         int                     ret_nmap)
@@ -1399,7 +1397,7 @@ xfs_bmap_add_extent_delay_real(
         xfs_bmbt_irec_t         r[3];   /* neighbor extent entries */
                                         /* left is 0, right is 1, prev is 2 */
         int                     rval=0; /* return value (logging flags) */
-       int                     state = xfs_bmap_fork_to_state(whichfork);
+       uint32_t                state = xfs_bmap_fork_to_state(whichfork);
         xfs_filblks_t           da_new; /* new count del alloc blocks used */
         xfs_filblks_t           da_old; /* old count del alloc blocks used */
         xfs_filblks_t           temp=0; /* value for da_new calculations */
@@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real(
             LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
             LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
             LEFT.br_state == new->br_state &&
-           LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+           LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
                 state |= BMAP_LEFT_CONTIG;
  
         /*
@@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real(
             new_endoff == RIGHT.br_startoff &&
             new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
             new->br_state == RIGHT.br_state &&
-           new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+           new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
             ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
                        BMAP_RIGHT_FILLING)) !=
                       (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
                        BMAP_RIGHT_FILLING) ||
              LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-                       <= MAXEXTLEN))
+                       <= XFS_MAX_BMBT_EXTLEN))
                 state |= BMAP_RIGHT_CONTIG;
  
         error = 0;
@@ -1950,7 +1948,7 @@ xfs_bmap_add_extent_unwritten_real(
         xfs_bmbt_irec_t         r[3];   /* neighbor extent entries */
                                         /* left is 0, right is 1, prev is 2 */
         int                     rval=0; /* return value (logging flags) */
-       int                     state = xfs_bmap_fork_to_state(whichfork);
+       uint32_t                state = xfs_bmap_fork_to_state(whichfork);
         struct xfs_mount        *mp = ip->i_mount;
         struct xfs_bmbt_irec    old;
  
@@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real(
             LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
             LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
             LEFT.br_state == new->br_state &&
-           LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+           LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
                 state |= BMAP_LEFT_CONTIG;
  
         /*
@@ -2018,13 +2016,13 @@ xfs_bmap_add_extent_unwritten_real(
             new_endoff == RIGHT.br_startoff &&
             new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
             new->br_state == RIGHT.br_state &&
-           new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+           new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
             ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
                        BMAP_RIGHT_FILLING)) !=
                       (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
                        BMAP_RIGHT_FILLING) ||
              LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-                       <= MAXEXTLEN))
+                       <= XFS_MAX_BMBT_EXTLEN))
                 state |= BMAP_RIGHT_CONTIG;
  
         /*
@@ -2479,7 +2477,7 @@ xfs_bmap_add_extent_hole_delay(
         xfs_filblks_t           newlen=0;       /* new indirect size */
         xfs_filblks_t           oldlen=0;       /* old indirect size */
         xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
-       int                     state = xfs_bmap_fork_to_state(whichfork);
+       uint32_t                state = xfs_bmap_fork_to_state(whichfork);
         xfs_filblks_t           temp;    /* temp for indirect calculations */
  
         ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay(
          */
         if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
             left.br_startoff + left.br_blockcount == new->br_startoff &&
-           left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+           left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
                 state |= BMAP_LEFT_CONTIG;
  
         if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
             new->br_startoff + new->br_blockcount == right.br_startoff &&
-           new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+           new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
             (!(state & BMAP_LEFT_CONTIG) ||
              (left.br_blockcount + new->br_blockcount +
-             right.br_blockcount <= MAXEXTLEN)))
+             right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
                 state |= BMAP_RIGHT_CONTIG;
  
         /*
@@ -2616,7 +2614,7 @@ xfs_bmap_add_extent_hole_real(
         struct xfs_btree_cur    **curp,
         struct xfs_bmbt_irec    *new,
         int                     *logflagsp,
-       int                     flags)
+       uint32_t                flags)
  {
         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
         struct xfs_mount        *mp = ip->i_mount;
@@ -2626,7 +2624,7 @@ xfs_bmap_add_extent_hole_real(
         xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
         xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
         int                     rval=0; /* return value (logging flags) */
-       int                     state = xfs_bmap_fork_to_state(whichfork);
+       uint32_t                state = xfs_bmap_fork_to_state(whichfork);
         struct xfs_bmbt_irec    old;
  
         ASSERT(!isnullstartblock(new->br_startblock));
@@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real(
             left.br_startoff + left.br_blockcount == new->br_startoff &&
             left.br_startblock + left.br_blockcount == new->br_startblock &&
             left.br_state == new->br_state &&
-           left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+           left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
                 state |= BMAP_LEFT_CONTIG;
  
         if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
             new->br_startoff + new->br_blockcount == right.br_startoff &&
             new->br_startblock + new->br_blockcount == right.br_startblock &&
             new->br_state == right.br_state &&
-           new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+           new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
             (!(state & BMAP_LEFT_CONTIG) ||
              left.br_blockcount + new->br_blockcount +
-            right.br_blockcount <= MAXEXTLEN))
+            right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
                 state |= BMAP_RIGHT_CONTIG;
  
         error = 0;
@@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align(
  
         /*
          * For large extent hint sizes, the aligned extent might be larger than
-        * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
-        * the length back under MAXEXTLEN. The outer allocation loops handle
-        * short allocation just fine, so it is safe to do this. We only want to
-        * do it when we are forced to, though, because it means more allocation
-        * operations are required.
+        * XFS_MAX_BMBT_EXTLEN. In that case, reduce the size by an extsz so
+        * that it pulls the length back under XFS_MAX_BMBT_EXTLEN. The outer
+        * allocation loops handle short allocation just fine, so it is safe to
+        * do this. We only want to do it when we are forced to, though, because
+        * it means more allocation operations are required.
          */
-       while (align_alen > MAXEXTLEN)
+       while (align_alen > XFS_MAX_BMBT_EXTLEN)
                 align_alen -= extsz;
-       ASSERT(align_alen <= MAXEXTLEN);
+       ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);
  
         /*
          * If the previous block overlaps with this proposed allocation
@@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align(
                         return -EINVAL;
         } else {
                 ASSERT(orig_off >= align_off);
-               /* see MAXEXTLEN handling above */
+               /* see XFS_MAX_BMBT_EXTLEN handling above */
                 ASSERT(orig_end <= align_off + align_alen ||
-                      align_alen + extsz > MAXEXTLEN);
+                      align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
         }
  
  #ifdef DEBUG
@@ -3766,7 +3764,7 @@ xfs_bmapi_trim_map(
         xfs_fileoff_t           obno,
         xfs_fileoff_t           end,
         int                     n,
-       int                     flags)
+       uint32_t                flags)
  {
         if ((flags & XFS_BMAPI_ENTIRE) ||
             got->br_startoff + got->br_blockcount <= obno) {
@@ -3811,7 +3809,7 @@ xfs_bmapi_update_map(
         xfs_fileoff_t           obno,
         xfs_fileoff_t           end,
         int                     *n,
-       int                     flags)
+       uint32_t                flags)
  {
         xfs_bmbt_irec_t *mval = *map;
  
@@ -3864,7 +3862,7 @@ xfs_bmapi_read(
         xfs_filblks_t           len,
         struct xfs_bmbt_irec    *mval,
         int                     *nmap,
-       int                     flags)
+       uint32_t                flags)
  {
         struct xfs_mount        *mp = ip->i_mount;
         int                     whichfork = xfs_bmapi_whichfork(flags);
@@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc(
          * Cap the alloc length. Keep track of prealloc so we know whether to
          * tag the inode before we return.
          */
-       alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
+       alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
         if (!eof)
                 alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
         if (prealloc && alen >= len)
@@ -4104,7 +4102,7 @@ xfs_bmapi_allocate(
                 if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
                         bma->prev.br_startoff = NULLFILEOFF;
         } else {
-               bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+               bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
                 if (!bma->eof)
                         bma->length = XFS_FILBLKS_MIN(bma->length,
                                         bma->got.br_startoff - bma->offset);
@@ -4184,7 +4182,7 @@ xfs_bmapi_convert_unwritten(
         struct xfs_bmalloca     *bma,
         struct xfs_bmbt_irec    *mval,
         xfs_filblks_t           len,
-       int                     flags)
+       uint32_t                flags)
  {
         int                     whichfork = xfs_bmapi_whichfork(flags);
         struct xfs_ifork        *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
@@ -4312,7 +4310,7 @@ xfs_bmapi_write(
         struct xfs_inode        *ip,            /* incore inode */
         xfs_fileoff_t           bno,            /* starting file offs. mapped */
         xfs_filblks_t           len,            /* length to map in file */
-       int                     flags,          /* XFS_BMAPI_... */
+       uint32_t                flags,          /* XFS_BMAPI_... */
         xfs_extlen_t            total,          /* total blocks needed */
         struct xfs_bmbt_irec    *mval,          /* output: map values */
         int                     *nmap)          /* i/o: mval size/count */
@@ -4424,8 +4422,8 @@ xfs_bmapi_write(
                          * xfs_extlen_t and therefore 32 bits. Hence we have to
                          * check for 32-bit overflows and handle them here.
                          */
-                       if (len > (xfs_filblks_t)MAXEXTLEN)
-                               bma.length = MAXEXTLEN;
+                       if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
+                               bma.length = XFS_MAX_BMBT_EXTLEN;
                         else
                                 bma.length = len;
  
@@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc(
                 return error;
  
         xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
  
         error = xfs_iext_count_may_overflow(ip, whichfork,
                         XFS_IEXT_ADD_NOSPLIT_CNT);
+       if (error == -EFBIG)
+               error = xfs_iext_count_upgrade(tp, ip,
+                               XFS_IEXT_ADD_NOSPLIT_CNT);
         if (error)
                 goto out_trans_cancel;
  
-       xfs_trans_ijoin(tp, ip, 0);
-
         if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
             bma.got.br_startoff > offset_fsb) {
                 /*
@@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc(
         bma.ip = ip;
         bma.wasdel = true;
         bma.offset = bma.got.br_startoff;
-       bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+       bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
+                       XFS_MAX_BMBT_EXTLEN);
         bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
  
         /*
@@ -4629,7 +4630,7 @@ xfs_bmapi_remap(
         xfs_fileoff_t           bno,
         xfs_filblks_t           len,
         xfs_fsblock_t           startblock,
-       int                     flags)
+       uint32_t                flags)
  {
         struct xfs_mount        *mp = ip->i_mount;
         struct xfs_ifork        *ifp;
@@ -4641,7 +4642,7 @@ xfs_bmapi_remap(
  
         ifp = XFS_IFORK_PTR(ip, whichfork);
         ASSERT(len > 0);
-       ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+       ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
         ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
                            XFS_BMAPI_NORMAP)));
@@ -4801,7 +4802,7 @@ xfs_bmap_del_extent_delay(
         int64_t                 da_old, da_new, da_diff = 0;
         xfs_fileoff_t           del_endoff, got_endoff;
         xfs_filblks_t           got_indlen, new_indlen, stolen;
-       int                     state = xfs_bmap_fork_to_state(whichfork);
+       uint32_t                state = xfs_bmap_fork_to_state(whichfork);
         int                     error = 0;
         bool                    isrt;
  
@@ -4926,7 +4927,7 @@ xfs_bmap_del_extent_cow(
         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
         struct xfs_bmbt_irec    new;
         xfs_fileoff_t           del_endoff, got_endoff;
-       int                     state = BMAP_COWFORK;
+       uint32_t                state = BMAP_COWFORK;
  
         XFS_STATS_INC(mp, xs_del_exlist);
  
@@ -4999,7 +5000,7 @@ xfs_bmap_del_extent_real(
         xfs_bmbt_irec_t         *del,   /* data to remove from extents */
         int                     *logflagsp, /* inode logging flags */
         int                     whichfork, /* data or attr fork */
-       int                     bflags) /* bmapi flags */
+       uint32_t                bflags) /* bmapi flags */
  {
         xfs_fsblock_t           del_endblock=0; /* first block past del */
         xfs_fileoff_t           del_endoff;     /* first offset past del */
@@ -5015,7 +5016,7 @@ xfs_bmap_del_extent_real(
         xfs_bmbt_irec_t         new;    /* new record to be inserted */
         /* REFERENCED */
         uint                    qfield; /* quota field to update */
-       int                     state = xfs_bmap_fork_to_state(whichfork);
+       uint32_t                state = xfs_bmap_fork_to_state(whichfork);
         struct xfs_bmbt_irec    old;
  
         mp = ip->i_mount;
@@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real(
                  * Deleting the middle of the extent.
                  */
  
-               /*
-                * For directories, -ENOSPC is returned since a directory entry
-                * remove operation must not fail due to low extent count
-                * availability. -ENOSPC will be handled by higher layers of XFS
-                * by letting the corresponding empty Data/Free blocks to linger
-                * until a future remove operation. Dabtree blocks would be
-                * swapped with the last block in the leaf space and then the
-                * new last block will be unmapped.
-                *
-                * The above logic also applies to the source directory entry of
-                * a rename operation.
-                */
-               error = xfs_iext_count_may_overflow(ip, whichfork, 1);
-               if (error) {
-                       ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
-                               whichfork == XFS_DATA_FORK);
-                       error = -ENOSPC;
-                       goto done;
-               }
-
                 old = got;
  
                 got.br_blockcount = del->br_startoff - got.br_startoff;
@@ -5281,7 +5262,7 @@ __xfs_bunmapi(
         struct xfs_inode        *ip,            /* incore inode */
         xfs_fileoff_t           start,          /* first file offset deleted */
         xfs_filblks_t           *rlen,          /* i/o: amount remaining */
-       int                     flags,          /* misc flags */
+       uint32_t                flags,          /* misc flags */
         xfs_extnum_t            nexts)          /* number of extents max */
  {
         struct xfs_btree_cur    *cur;           /* bmap btree cursor */
@@ -5299,7 +5280,6 @@ __xfs_bunmapi(
         int                     whichfork;      /* data or attribute fork */
         xfs_fsblock_t           sum;
         xfs_filblks_t           len = *rlen;    /* length to unmap in file */
-       xfs_fileoff_t           max_len;
         xfs_fileoff_t           end;
         struct xfs_iext_cursor  icur;
         bool                    done = false;
@@ -5318,16 +5298,6 @@ __xfs_bunmapi(
         ASSERT(len > 0);
         ASSERT(nexts >= 0);
  
-       /*
-        * Guesstimate how many blocks we can unmap without running the risk of
-        * blowing out the transaction with a mix of EFIs and reflink
-        * adjustments.
-        */
-       if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
-               max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
-       else
-               max_len = len;
-
         error = xfs_iread_extents(tp, ip, whichfork);
         if (error)
                 return error;
@@ -5366,7 +5336,7 @@ __xfs_bunmapi(
  
         extno = 0;
         while (end != (xfs_fileoff_t)-1 && end >= start &&
-              (nexts == 0 || extno < nexts) && max_len > 0) {
+              (nexts == 0 || extno < nexts)) {
                 /*
                  * Is the found extent after a hole in which end lives?
                  * Just back up to the previous extent, if so.
@@ -5400,14 +5370,6 @@ __xfs_bunmapi(
                 if (del.br_startoff + del.br_blockcount > end + 1)
                         del.br_blockcount = end + 1 - del.br_startoff;
  
-               /* How much can we safely unmap? */
-               if (max_len < del.br_blockcount) {
-                       del.br_startoff += del.br_blockcount - max_len;
-                       if (!wasdel)
-                               del.br_startblock += del.br_blockcount - max_len;
-                       del.br_blockcount = max_len;
-               }
-
                 if (!isrt)
                         goto delete;
  
@@ -5543,7 +5505,6 @@ delete:
                 if (error)
                         goto error0;
  
-               max_len -= del.br_blockcount;
                 end = del.br_startoff - 1;
  nodelete:
                 /*
@@ -5609,7 +5570,7 @@ xfs_bunmapi(
         struct xfs_inode        *ip,
         xfs_fileoff_t           bno,
         xfs_filblks_t           len,
-       int                     flags,
+       uint32_t                flags,
         xfs_extnum_t            nexts,
         int                     *done)
  {
@@ -5641,7 +5602,7 @@ xfs_bmse_can_merge(
         if ((left->br_startoff + left->br_blockcount != startoff) ||
             (left->br_startblock + left->br_blockcount != got->br_startblock) ||
             (left->br_state != got->br_state) ||
-           (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+           (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
                 return false;
  
         return true;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h

index 03d9aaf..16db95b 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -39,7 +39,7 @@ struct xfs_bmalloca {
         bool                    aeof;   /* allocated space at eof */
         bool                    conv;   /* overwriting unwritten extents */
         int                     datatype;/* data type being allocated */
-       int                     flags;
+       uint32_t                flags;
  };
  
  #define        XFS_BMAP_MAX_NMAP       4
@@ -47,17 +47,17 @@ struct xfs_bmalloca {
  /*
   * Flags for xfs_bmapi_*
   */
-#define XFS_BMAPI_ENTIRE       0x001   /* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA     0x002   /* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK     0x004   /* use attribute fork not data */
-#define XFS_BMAPI_PREALLOC     0x008   /* preallocation op: unwritten space */
-#define XFS_BMAPI_CONTIG       0x020   /* must allocate only one extent */
+#define XFS_BMAPI_ENTIRE       (1u << 0) /* return entire extent untrimmed */
+#define XFS_BMAPI_METADATA     (1u << 1) /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK     (1u << 2) /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC     (1u << 3) /* preallocating unwritten space */
+#define XFS_BMAPI_CONTIG       (1u << 4) /* must allocate only one extent */
  /*
   * unwritten extent conversion - this needs write cache flushing and no additional
   * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
   * from written to unwritten, otherwise convert from unwritten to written.
   */
-#define XFS_BMAPI_CONVERT      0x040
+#define XFS_BMAPI_CONVERT      (1u << 5)
  
  /*
   * allocate zeroed extents - this requires all newly allocated user data extents
@@ -65,7 +65,7 @@ struct xfs_bmalloca {
   * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
   * during the allocation range to zeroed written extents.
   */
-#define XFS_BMAPI_ZERO         0x080
+#define XFS_BMAPI_ZERO         (1u << 6)
  
  /*
   * Map the inode offset to the block given in ap->firstblock.  Primarily
@@ -75,16 +75,16 @@ struct xfs_bmalloca {
   * For bunmapi, this flag unmaps the range without adjusting quota, reducing
   * refcount, or freeing the blocks.
   */
-#define XFS_BMAPI_REMAP                0x100
+#define XFS_BMAPI_REMAP                (1u << 7)
  
  /* Map something in the CoW fork. */
-#define XFS_BMAPI_COWFORK      0x200
+#define XFS_BMAPI_COWFORK      (1u << 8)
  
  /* Skip online discard of freed extents */
-#define XFS_BMAPI_NODISCARD    0x1000
+#define XFS_BMAPI_NODISCARD    (1u << 9)
  
  /* Do not update the rmap btree.  Used for reconstructing bmbt from rmapbt. */
-#define XFS_BMAPI_NORMAP       0x2000
+#define XFS_BMAPI_NORMAP       (1u << 10)
  
  #define XFS_BMAPI_FLAGS \
         { XFS_BMAPI_ENTIRE,     "ENTIRE" }, \
@@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w)
                (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
  }
  
-static inline int xfs_bmapi_whichfork(int bmapi_flags)
+static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
  {
         if (bmapi_flags & XFS_BMAPI_COWFORK)
                 return XFS_COW_FORK;
@@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
  /*
   * Flags for xfs_bmap_add_extent*.
   */
-#define BMAP_LEFT_CONTIG       (1 << 0)
-#define BMAP_RIGHT_CONTIG      (1 << 1)
-#define BMAP_LEFT_FILLING      (1 << 2)
-#define BMAP_RIGHT_FILLING     (1 << 3)
-#define BMAP_LEFT_DELAY                (1 << 4)
-#define BMAP_RIGHT_DELAY       (1 << 5)
-#define BMAP_LEFT_VALID                (1 << 6)
-#define BMAP_RIGHT_VALID       (1 << 7)
-#define BMAP_ATTRFORK          (1 << 8)
-#define BMAP_COWFORK           (1 << 9)
+#define BMAP_LEFT_CONTIG       (1u << 0)
+#define BMAP_RIGHT_CONTIG      (1u << 1)
+#define BMAP_LEFT_FILLING      (1u << 2)
+#define BMAP_RIGHT_FILLING     (1u << 3)
+#define BMAP_LEFT_DELAY                (1u << 4)
+#define BMAP_RIGHT_DELAY       (1u << 5)
+#define BMAP_LEFT_VALID                (1u << 6)
+#define BMAP_RIGHT_VALID       (1u << 7)
+#define BMAP_ATTRFORK          (1u << 8)
+#define BMAP_COWFORK           (1u << 9)
  
  #define XFS_BMAP_EXT_FLAGS \
         { BMAP_LEFT_CONTIG,     "LC" }, \
@@ -183,15 +183,15 @@ int       xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
                 int whichfork);
  int    xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
                 xfs_filblks_t len, struct xfs_bmbt_irec *mval,
-               int *nmap, int flags);
+               int *nmap, uint32_t flags);
  int    xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
-               xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+               xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
                 xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
  int    __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
-               xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+               xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
                 xfs_extnum_t nexts);
  int    xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
-               xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+               xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
                 xfs_extnum_t nexts, int *done);
  int    xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
                 struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
@@ -243,7 +243,7 @@ void        xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
  void   xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
                 struct xfs_bmbt_irec *imap);
  
-static inline int xfs_bmap_fork_to_state(int whichfork)
+static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
  {
         switch (whichfork) {
         case XFS_ATTR_FORK:
@@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
  
  int    xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
                 xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
-               int flags);
+               uint32_t flags);
  
  extern struct kmem_cache       *xfs_bmap_intent_cache;
  
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c

index 453309f..2b77d45 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -597,7 +597,11 @@ xfs_bmbt_maxrecs(
         return xfs_bmbt_block_maxrecs(blocklen, leaf);
  }
  
-/* Compute the max possible height for block mapping btrees. */
+/*
+ * Calculate the maximum possible height of the btree that the on-disk format
+ * supports. This is used for sizing structures large enough to support every
+ * possible configuration of a filesystem that might get mounted.
+ */
  unsigned int
  xfs_bmbt_maxlevels_ondisk(void)
  {
@@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void)
         minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
  
         /* One extra level for the inode root. */
-       return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
+       return xfs_btree_compute_maxlevels(minrecs,
+                       XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
  }
  
  /*
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c

index c1500b2..2aa300f 100644 (file)
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -51,6 +51,52 @@ xfs_btree_magic(
         return magic;
  }
  
+static xfs_failaddr_t
+xfs_btree_check_lblock_siblings(
+       struct xfs_mount        *mp,
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       xfs_fsblock_t           fsb,
+       xfs_fsblock_t           sibling)
+{
+       if (sibling == NULLFSBLOCK)
+               return NULL;
+       if (sibling == fsb)
+               return __this_address;
+       if (level >= 0) {
+               if (!xfs_btree_check_lptr(cur, sibling, level + 1))
+                       return __this_address;
+       } else {
+               if (!xfs_verify_fsbno(mp, sibling))
+                       return __this_address;
+       }
+
+       return NULL;
+}
+
+static xfs_failaddr_t
+xfs_btree_check_sblock_siblings(
+       struct xfs_mount        *mp,
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           agbno,
+       xfs_agblock_t           sibling)
+{
+       if (sibling == NULLAGBLOCK)
+               return NULL;
+       if (sibling == agbno)
+               return __this_address;
+       if (level >= 0) {
+               if (!xfs_btree_check_sptr(cur, sibling, level + 1))
+                       return __this_address;
+       } else {
+               if (!xfs_verify_agbno(mp, agno, sibling))
+                       return __this_address;
+       }
+       return NULL;
+}
+
  /*
   * Check a long btree block header.  Return the address of the failing check,
   * or NULL if everything is ok.
@@ -65,6 +111,8 @@ __xfs_btree_check_lblock(
         struct xfs_mount        *mp = cur->bc_mp;
         xfs_btnum_t             btnum = cur->bc_btnum;
         int                     crc = xfs_has_crc(mp);
+       xfs_failaddr_t          fa;
+       xfs_fsblock_t           fsb = NULLFSBLOCK;
  
         if (crc) {
                 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -83,16 +131,16 @@ __xfs_btree_check_lblock(
         if (be16_to_cpu(block->bb_numrecs) >
             cur->bc_ops->get_maxrecs(cur, level))
                 return __this_address;
-       if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
-           !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
-                       level + 1))
-               return __this_address;
-       if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
-           !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
-                       level + 1))
-               return __this_address;
  
-       return NULL;
+       if (bp)
+               fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+
+       fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+                       be64_to_cpu(block->bb_u.l.bb_leftsib));
+       if (!fa)
+               fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+                               be64_to_cpu(block->bb_u.l.bb_rightsib));
+       return fa;
  }
  
  /* Check a long btree block header. */
@@ -130,6 +178,9 @@ __xfs_btree_check_sblock(
         struct xfs_mount        *mp = cur->bc_mp;
         xfs_btnum_t             btnum = cur->bc_btnum;
         int                     crc = xfs_has_crc(mp);
+       xfs_failaddr_t          fa;
+       xfs_agblock_t           agbno = NULLAGBLOCK;
+       xfs_agnumber_t          agno = NULLAGNUMBER;
  
         if (crc) {
                 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -146,16 +197,18 @@ __xfs_btree_check_sblock(
         if (be16_to_cpu(block->bb_numrecs) >
             cur->bc_ops->get_maxrecs(cur, level))
                 return __this_address;
-       if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
-           !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
-                       level + 1))
-               return __this_address;
-       if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
-           !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
-                       level + 1))
-               return __this_address;
  
-       return NULL;
+       if (bp) {
+               agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+               agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
+       }
+
+       fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno,
+                       be32_to_cpu(block->bb_u.s.bb_leftsib));
+       if (!fa)
+               fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno,
+                                agbno, be32_to_cpu(block->bb_u.s.bb_rightsib));
+       return fa;
  }
  
  /* Check a short btree block header. */
@@ -751,20 +804,20 @@ xfs_btree_lastrec(
   */
  void
  xfs_btree_offsets(
-       int64_t         fields,         /* bitmask of fields */
+       uint32_t        fields,         /* bitmask of fields */
         const short     *offsets,       /* table of field offsets */
         int             nbits,          /* number of bits to inspect */
         int             *first,         /* output: first byte offset */
         int             *last)          /* output: last byte offset */
  {
         int             i;              /* current bit number */
-       int64_t         imask;          /* mask for current bit number */
+       uint32_t        imask;          /* mask for current bit number */
  
         ASSERT(fields != 0);
         /*
          * Find the lowest bit, so the first byte offset.
          */
-       for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+       for (i = 0, imask = 1u; ; i++, imask <<= 1) {
                 if (imask & fields) {
                         *first = offsets[i];
                         break;
@@ -773,7 +826,7 @@ xfs_btree_offsets(
         /*
          * Find the highest bit, so the last byte offset.
          */
-       for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+       for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
                 if (imask & fields) {
                         *last = offsets[i + 1] - 1;
                         break;
@@ -1456,7 +1509,7 @@ void
  xfs_btree_log_block(
         struct xfs_btree_cur    *cur,   /* btree cursor */
         struct xfs_buf          *bp,    /* buffer containing btree block */
-       int                     fields) /* mask of fields: XFS_BB_... */
+       uint32_t                fields) /* mask of fields: XFS_BB_... */
  {
         int                     first;  /* first byte offset logged */
         int                     last;   /* last byte offset logged */
@@ -4271,6 +4324,21 @@ xfs_btree_visit_block(
         if (xfs_btree_ptr_is_null(cur, &rptr))
                 return -ENOENT;
  
+       /*
+        * We only visit blocks once in this walk, so we have to avoid the
+        * internal xfs_btree_lookup_get_block() optimisation where it will
+        * return the same block without checking if the right sibling points
+        * back to us and creates a cyclic reference in the btree.
+        */
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
+                                                       xfs_buf_daddr(bp)))
+                       return -EFSCORRUPTED;
+       } else {
+               if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
+                                                       xfs_buf_daddr(bp)))
+                       return -EFSCORRUPTED;
+       }
         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
  }
  
@@ -4445,20 +4513,21 @@ xfs_btree_lblock_verify(
  {
         struct xfs_mount        *mp = bp->b_mount;
         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       xfs_fsblock_t           fsb;
+       xfs_failaddr_t          fa;
  
         /* numrecs verification */
         if (be16_to_cpu(block->bb_numrecs) > max_recs)
                 return __this_address;
  
         /* sibling pointer verification */
-       if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
-           !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
-               return __this_address;
-       if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
-           !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
-               return __this_address;
-
-       return NULL;
+       fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+       fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+                       be64_to_cpu(block->bb_u.l.bb_leftsib));
+       if (!fa)
+               fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+                               be64_to_cpu(block->bb_u.l.bb_rightsib));
+       return fa;
  }
  
  /**
@@ -4499,7 +4568,9 @@ xfs_btree_sblock_verify(
  {
         struct xfs_mount        *mp = bp->b_mount;
         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-       xfs_agblock_t           agno;
+       xfs_agnumber_t          agno;
+       xfs_agblock_t           agbno;
+       xfs_failaddr_t          fa;
  
         /* numrecs verification */
         if (be16_to_cpu(block->bb_numrecs) > max_recs)
@@ -4507,14 +4578,13 @@ xfs_btree_sblock_verify(
  
         /* sibling pointer verification */
         agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
-       if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
-           !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
-               return __this_address;
-       if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
-           !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
-               return __this_address;
-
-       return NULL;
+       agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+       fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+                       be32_to_cpu(block->bb_u.s.bb_leftsib));
+       if (!fa)
+               fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+                               be32_to_cpu(block->bb_u.s.bb_rightsib));
+       return fa;
  }
  
  /*
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h

index 22d9f41..eef2785 100644 (file)
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
  /*
   * For logging record fields.
   */
-#define        XFS_BB_MAGIC            (1 << 0)
-#define        XFS_BB_LEVEL            (1 << 1)
-#define        XFS_BB_NUMRECS          (1 << 2)
-#define        XFS_BB_LEFTSIB          (1 << 3)
-#define        XFS_BB_RIGHTSIB         (1 << 4)
-#define        XFS_BB_BLKNO            (1 << 5)
-#define        XFS_BB_LSN              (1 << 6)
-#define        XFS_BB_UUID             (1 << 7)
-#define        XFS_BB_OWNER            (1 << 8)
+#define        XFS_BB_MAGIC            (1u << 0)
+#define        XFS_BB_LEVEL            (1u << 1)
+#define        XFS_BB_NUMRECS          (1u << 2)
+#define        XFS_BB_LEFTSIB          (1u << 3)
+#define        XFS_BB_RIGHTSIB         (1u << 4)
+#define        XFS_BB_BLKNO            (1u << 5)
+#define        XFS_BB_LSN              (1u << 6)
+#define        XFS_BB_UUID             (1u << 7)
+#define        XFS_BB_OWNER            (1u << 8)
  #define        XFS_BB_NUM_BITS         5
-#define        XFS_BB_ALL_BITS         ((1 << XFS_BB_NUM_BITS) - 1)
+#define        XFS_BB_ALL_BITS         ((1u << XFS_BB_NUM_BITS) - 1)
  #define        XFS_BB_NUM_BITS_CRC     9
-#define        XFS_BB_ALL_BITS_CRC     ((1 << XFS_BB_NUM_BITS_CRC) - 1)
+#define        XFS_BB_ALL_BITS_CRC     ((1u << XFS_BB_NUM_BITS_CRC) - 1)
  
  /*
   * Generic stats interface
@@ -345,7 +345,7 @@ xfs_btree_dup_cursor(
   */
  void
  xfs_btree_offsets(
-       int64_t                 fields, /* bitmask of fields */
+       uint32_t                fields, /* bitmask of fields */
         const short             *offsets,/* table of field offsets */
         int                     nbits,  /* number of bits to inspect */
         int                     *first, /* output: first byte offset */
@@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
  /*
   * Internal btree helpers also used by xfs_bmap.c.
   */
-void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t);
  void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
  
  /*
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c

index 9dc1ecb..aa74f3f 100644 (file)
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -22,6 +22,7 @@
  #include "xfs_trace.h"
  #include "xfs_buf_item.h"
  #include "xfs_log.h"
+#include "xfs_errortag.h"
  
  /*
   * xfs_da_btree.c
@@ -482,6 +483,9 @@ xfs_da3_split(
  
         trace_xfs_da_split(state->args);
  
+       if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+               return -EIO;
+
         /*
          * Walk back up the tree splitting/inserting/adjusting as necessary.
          * If we need to insert and there isn't room, split the node, then
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h

index 0faf7d9..ed2303e 100644 (file)
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -30,6 +30,7 @@ struct xfs_da_geometry {
         unsigned int    free_hdr_size;  /* dir2 free header size */
         unsigned int    free_max_bests; /* # of bests entries in dir2 free */
         xfs_dablk_t     freeblk;        /* blockno of free data v2 */
+       xfs_extnum_t    max_extents;    /* Max. extents in corresponding fork */
  
         xfs_dir2_data_aoff_t data_first_offset;
         size_t          data_entry_offset;
@@ -76,27 +77,31 @@ typedef struct xfs_da_args {
         xfs_dablk_t     rmtblkno2;      /* remote attr value starting blkno */
         int             rmtblkcnt2;     /* remote attr value block count */
         int             rmtvaluelen2;   /* remote attr value length in bytes */
-       int             op_flags;       /* operation flags */
+       uint32_t        op_flags;       /* operation flags */
         enum xfs_dacmp  cmpresult;      /* name compare result for lookups */
  } xfs_da_args_t;
  
  /*
   * Operation flags:
   */
-#define XFS_DA_OP_JUSTCHECK    0x0001  /* check for ok with no space */
-#define XFS_DA_OP_RENAME       0x0002  /* this is an atomic rename op */
-#define XFS_DA_OP_ADDNAME      0x0004  /* this is an add operation */
-#define XFS_DA_OP_OKNOENT      0x0008  /* lookup/add op, ENOENT ok, else die */
-#define XFS_DA_OP_CILOOKUP     0x0010  /* lookup to return CI name if found */
-#define XFS_DA_OP_NOTIME       0x0020  /* don't update inode timestamps */
+#define XFS_DA_OP_JUSTCHECK    (1u << 0) /* check for ok with no space */
+#define XFS_DA_OP_REPLACE      (1u << 1) /* this is an atomic replace op */
+#define XFS_DA_OP_ADDNAME      (1u << 2) /* this is an add operation */
+#define XFS_DA_OP_OKNOENT      (1u << 3) /* lookup op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP     (1u << 4) /* lookup returns CI name if found */
+#define XFS_DA_OP_NOTIME       (1u << 5) /* don't update inode timestamps */
+#define XFS_DA_OP_REMOVE       (1u << 6) /* this is a remove operation */
+#define XFS_DA_OP_RECOVERY     (1u << 7) /* Log recovery operation */
  
  #define XFS_DA_OP_FLAGS \
         { XFS_DA_OP_JUSTCHECK,  "JUSTCHECK" }, \
-       { XFS_DA_OP_RENAME,     "RENAME" }, \
+       { XFS_DA_OP_REPLACE,    "REPLACE" }, \
         { XFS_DA_OP_ADDNAME,    "ADDNAME" }, \
         { XFS_DA_OP_OKNOENT,    "OKNOENT" }, \
         { XFS_DA_OP_CILOOKUP,   "CILOOKUP" }, \
-       { XFS_DA_OP_NOTIME,     "NOTIME" }
+       { XFS_DA_OP_NOTIME,     "NOTIME" }, \
+       { XFS_DA_OP_REMOVE,     "REMOVE" }, \
+       { XFS_DA_OP_RECOVERY,   "RECOVERY" }
  
  /*
   * Storage for holding state during Btree searches and split/join ops.
@@ -197,7 +202,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
   * Utility routines.
   */
  
-#define XFS_DABUF_MAP_HOLE_OK  (1 << 0)
+#define XFS_DABUF_MAP_HOLE_OK  (1u << 0)
  
  int    xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
  int    xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h

index 5a49caa..25e2841 100644 (file)
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
   * Directory address space divided into sections,
   * spaces separated by 32GB.
   */
+#define        XFS_DIR2_MAX_SPACES     3
  #define        XFS_DIR2_SPACE_SIZE     (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
  #define        XFS_DIR2_DATA_SPACE     0
  #define        XFS_DIR2_DATA_OFFSET    (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
@@ -688,10 +689,10 @@ struct xfs_attr3_leafblock {
  #define        XFS_ATTR_ROOT_BIT       1       /* limit access to trusted attrs */
  #define        XFS_ATTR_SECURE_BIT     2       /* limit access to secure attrs */
  #define        XFS_ATTR_INCOMPLETE_BIT 7       /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL         (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT          (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE                (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE    (1 << XFS_ATTR_INCOMPLETE_BIT)
+#define XFS_ATTR_LOCAL         (1u << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT          (1u << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE                (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE    (1u << XFS_ATTR_INCOMPLETE_BIT)
  #define XFS_ATTR_NSP_ONDISK_MASK       (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
  
  /*
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c

index 0805ade..ceb222b 100644 (file)
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -22,6 +22,10 @@
  #include "xfs_refcount.h"
  #include "xfs_bmap.h"
  #include "xfs_alloc.h"
+#include "xfs_buf.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
  
  static struct kmem_cache       *xfs_defer_pending_cache;
  
@@ -184,9 +188,10 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
         [XFS_DEFER_OPS_TYPE_RMAP]       = &xfs_rmap_update_defer_type,
         [XFS_DEFER_OPS_TYPE_FREE]       = &xfs_extent_free_defer_type,
         [XFS_DEFER_OPS_TYPE_AGFL_FREE]  = &xfs_agfl_free_defer_type,
+       [XFS_DEFER_OPS_TYPE_ATTR]       = &xfs_attr_defer_type,
  };
  
-static void
+static bool
  xfs_defer_create_intent(
         struct xfs_trans                *tp,
         struct xfs_defer_pending        *dfp,
@@ -197,6 +202,7 @@ xfs_defer_create_intent(
         if (!dfp->dfp_intent)
                 dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
                                                      dfp->dfp_count, sort);
+       return dfp->dfp_intent != NULL;
  }
  
  /*
@@ -204,16 +210,18 @@ xfs_defer_create_intent(
   * associated extents, then add the entire intake list to the end of
   * the pending list.
   */
-STATIC void
+static bool
  xfs_defer_create_intents(
         struct xfs_trans                *tp)
  {
         struct xfs_defer_pending        *dfp;
+       bool                            ret = false;
  
         list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
                 trace_xfs_defer_create_intent(tp->t_mountp, dfp);
-               xfs_defer_create_intent(tp, dfp, true);
+               ret |= xfs_defer_create_intent(tp, dfp, true);
         }
+       return ret;
  }
  
  /* Abort all the intents that were committed. */
@@ -487,7 +495,7 @@ int
  xfs_defer_finish_noroll(
         struct xfs_trans                **tp)
  {
-       struct xfs_defer_pending        *dfp;
+       struct xfs_defer_pending        *dfp = NULL;
         int                             error = 0;
         LIST_HEAD(dop_pending);
  
@@ -506,17 +514,20 @@ xfs_defer_finish_noroll(
                  * of time that any one intent item can stick around in memory,
                  * pinning the log tail.
                  */
-               xfs_defer_create_intents(*tp);
+               bool has_intents = xfs_defer_create_intents(*tp);
+
                 list_splice_init(&(*tp)->t_dfops, &dop_pending);
  
-               error = xfs_defer_trans_roll(tp);
-               if (error)
-                       goto out_shutdown;
+               if (has_intents || dfp) {
+                       error = xfs_defer_trans_roll(tp);
+                       if (error)
+                               goto out_shutdown;
  
-               /* Possibly relog intent items to keep the log moving. */
-               error = xfs_defer_relog(tp, &dop_pending);
-               if (error)
-                       goto out_shutdown;
+                       /* Relog intent items to keep the log moving. */
+                       error = xfs_defer_relog(tp, &dop_pending);
+                       if (error)
+                               goto out_shutdown;
+               }
  
                 dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
                                        dfp_list);
@@ -774,17 +785,25 @@ xfs_defer_ops_continue(
         struct xfs_trans                *tp,
         struct xfs_defer_resources      *dres)
  {
+       unsigned int                    i;
+
         ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
         ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
  
-       /* Lock and join the captured inode to the new transaction. */
+       /* Lock the captured resources to the new transaction. */
         if (dfc->dfc_held.dr_inos == 2)
                 xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
                                     dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
         else if (dfc->dfc_held.dr_inos == 1)
                 xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+
+       for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+               xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
+
+       /* Join the captured resources to the new transaction. */
         xfs_defer_restore_resources(tp, &dfc->dfc_held);
         memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
+       dres->dr_bufs = 0;
  
         /* Move captured dfops chain and state to the transaction. */
         list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
@@ -854,7 +873,12 @@ xfs_defer_init_item_caches(void)
         error = xfs_extfree_intent_init_cache();
         if (error)
                 goto err;
-
+       error = xfs_attri_init_cache();
+       if (error)
+               goto err;
+       error = xfs_attrd_init_cache();
+       if (error)
+               goto err;
         return 0;
  err:
         xfs_defer_destroy_item_caches();
@@ -865,6 +889,8 @@ err:
  void
  xfs_defer_destroy_item_caches(void)
  {
+       xfs_attri_destroy_cache();
+       xfs_attrd_destroy_cache();
         xfs_extfree_intent_destroy_cache();
         xfs_bmap_intent_destroy_cache();
         xfs_refcount_intent_destroy_cache();
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h

index 7bb8a31..114a3a4 100644 (file)
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -19,6 +19,7 @@ enum xfs_defer_ops_type {
         XFS_DEFER_OPS_TYPE_RMAP,
         XFS_DEFER_OPS_TYPE_FREE,
         XFS_DEFER_OPS_TYPE_AGFL_FREE,
+       XFS_DEFER_OPS_TYPE_ATTR,
         XFS_DEFER_OPS_TYPE_MAX,
  };
  
@@ -63,6 +64,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
  extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
  extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
  extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_attr_defer_type;
+
  
  /*
   * Deferred operation item relogging limits.
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c

index 5f1e479..3cd51fa 100644 (file)
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -150,6 +150,8 @@ xfs_da_mount(
         dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
         dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
                                 (uint)sizeof(xfs_da_node_entry_t);
+       dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
+                                       mp->m_sb.sb_blocklog;
         dageo->magicpct = (dageo->blksize * 37) / 100;
  
         /* set up attribute geometry - single fsb only */
@@ -161,6 +163,12 @@ xfs_da_mount(
         dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
         dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
                                 (uint)sizeof(xfs_da_node_entry_t);
+
+       if (xfs_has_large_extent_counts(mp))
+               dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+       else
+               dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
         dageo->magicpct = (dageo->blksize * 37) / 100;
         return 0;
  }
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h

index a23a52e..5362908 100644 (file)
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -59,7 +59,10 @@
  #define XFS_ERRTAG_REDUCE_MAX_IEXTENTS                 36
  #define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT            37
  #define XFS_ERRTAG_AG_RESV_FAIL                                38
-#define XFS_ERRTAG_MAX                                 39
+#define XFS_ERRTAG_LARP                                        39
+#define XFS_ERRTAG_DA_LEAF_SPLIT                       40
+#define XFS_ERRTAG_ATTR_LEAF_TO_NODE                   41
+#define XFS_ERRTAG_MAX                                 42
  
  /*
   * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -103,5 +106,8 @@
  #define XFS_RANDOM_REDUCE_MAX_IEXTENTS                 1
  #define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT            1
  #define XFS_RANDOM_AG_RESV_FAIL                                1
+#define XFS_RANDOM_LARP                                        1
+#define XFS_RANDOM_DA_LEAF_SPLIT                       1
+#define XFS_RANDOM_ATTR_LEAF_TO_NODE                   1
  
  #endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h

index d665c04..afdfc81 100644 (file)
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature(
  #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2)        /* metadata UUID */
  #define XFS_SB_FEAT_INCOMPAT_BIGTIME   (1 << 3)        /* large timestamps */
  #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4)      /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64   (1 << 5)        /* large extent counters */
  #define XFS_SB_FEAT_INCOMPAT_ALL \
                 (XFS_SB_FEAT_INCOMPAT_FTYPE|    \
                  XFS_SB_FEAT_INCOMPAT_SPINODES| \
                  XFS_SB_FEAT_INCOMPAT_META_UUID| \
                  XFS_SB_FEAT_INCOMPAT_BIGTIME| \
-                XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
+                XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
+                XFS_SB_FEAT_INCOMPAT_NREXT64)
  
  #define XFS_SB_FEAT_INCOMPAT_UNKNOWN   ~XFS_SB_FEAT_INCOMPAT_ALL
  static inline bool
@@ -388,7 +390,9 @@ xfs_sb_has_incompat_feature(
         return (sbp->sb_features_incompat & feature) != 0;
  }
  
-#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS   (1 << 0)     /* Delayed Attributes */
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
+       (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
  #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN       ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
  static inline bool
  xfs_sb_has_incompat_log_feature(
@@ -413,6 +417,11 @@ xfs_sb_add_incompat_log_features(
         sbp->sb_features_log_incompat |= features;
  }
  
+static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
+{
+       return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
+                XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+}
  
  static inline bool
  xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
@@ -525,26 +534,26 @@ typedef struct xfs_agf {
  
  #define XFS_AGF_CRC_OFF                offsetof(struct xfs_agf, agf_crc)
  
-#define        XFS_AGF_MAGICNUM        0x00000001
-#define        XFS_AGF_VERSIONNUM      0x00000002
-#define        XFS_AGF_SEQNO           0x00000004
-#define        XFS_AGF_LENGTH          0x00000008
-#define        XFS_AGF_ROOTS           0x00000010
-#define        XFS_AGF_LEVELS          0x00000020
-#define        XFS_AGF_FLFIRST         0x00000040
-#define        XFS_AGF_FLLAST          0x00000080
-#define        XFS_AGF_FLCOUNT         0x00000100
-#define        XFS_AGF_FREEBLKS        0x00000200
-#define        XFS_AGF_LONGEST         0x00000400
-#define        XFS_AGF_BTREEBLKS       0x00000800
-#define        XFS_AGF_UUID            0x00001000
-#define        XFS_AGF_RMAP_BLOCKS     0x00002000
-#define        XFS_AGF_REFCOUNT_BLOCKS 0x00004000
-#define        XFS_AGF_REFCOUNT_ROOT   0x00008000
-#define        XFS_AGF_REFCOUNT_LEVEL  0x00010000
-#define        XFS_AGF_SPARE64         0x00020000
+#define        XFS_AGF_MAGICNUM        (1u << 0)
+#define        XFS_AGF_VERSIONNUM      (1u << 1)
+#define        XFS_AGF_SEQNO           (1u << 2)
+#define        XFS_AGF_LENGTH          (1u << 3)
+#define        XFS_AGF_ROOTS           (1u << 4)
+#define        XFS_AGF_LEVELS          (1u << 5)
+#define        XFS_AGF_FLFIRST         (1u << 6)
+#define        XFS_AGF_FLLAST          (1u << 7)
+#define        XFS_AGF_FLCOUNT         (1u << 8)
+#define        XFS_AGF_FREEBLKS        (1u << 9)
+#define        XFS_AGF_LONGEST         (1u << 10)
+#define        XFS_AGF_BTREEBLKS       (1u << 11)
+#define        XFS_AGF_UUID            (1u << 12)
+#define        XFS_AGF_RMAP_BLOCKS     (1u << 13)
+#define        XFS_AGF_REFCOUNT_BLOCKS (1u << 14)
+#define        XFS_AGF_REFCOUNT_ROOT   (1u << 15)
+#define        XFS_AGF_REFCOUNT_LEVEL  (1u << 16)
+#define        XFS_AGF_SPARE64         (1u << 17)
  #define        XFS_AGF_NUM_BITS        18
-#define        XFS_AGF_ALL_BITS        ((1 << XFS_AGF_NUM_BITS) - 1)
+#define        XFS_AGF_ALL_BITS        ((1u << XFS_AGF_NUM_BITS) - 1)
  
  #define XFS_AGF_FLAGS \
         { XFS_AGF_MAGICNUM,     "MAGICNUM" }, \
@@ -619,22 +628,22 @@ typedef struct xfs_agi {
  
  #define XFS_AGI_CRC_OFF                offsetof(struct xfs_agi, agi_crc)
  
-#define        XFS_AGI_MAGICNUM        (1 << 0)
-#define        XFS_AGI_VERSIONNUM      (1 << 1)
-#define        XFS_AGI_SEQNO           (1 << 2)
-#define        XFS_AGI_LENGTH          (1 << 3)
-#define        XFS_AGI_COUNT           (1 << 4)
-#define        XFS_AGI_ROOT            (1 << 5)
-#define        XFS_AGI_LEVEL           (1 << 6)
-#define        XFS_AGI_FREECOUNT       (1 << 7)
-#define        XFS_AGI_NEWINO          (1 << 8)
-#define        XFS_AGI_DIRINO          (1 << 9)
-#define        XFS_AGI_UNLINKED        (1 << 10)
+#define        XFS_AGI_MAGICNUM        (1u << 0)
+#define        XFS_AGI_VERSIONNUM      (1u << 1)
+#define        XFS_AGI_SEQNO           (1u << 2)
+#define        XFS_AGI_LENGTH          (1u << 3)
+#define        XFS_AGI_COUNT           (1u << 4)
+#define        XFS_AGI_ROOT            (1u << 5)
+#define        XFS_AGI_LEVEL           (1u << 6)
+#define        XFS_AGI_FREECOUNT       (1u << 7)
+#define        XFS_AGI_NEWINO          (1u << 8)
+#define        XFS_AGI_DIRINO          (1u << 9)
+#define        XFS_AGI_UNLINKED        (1u << 10)
  #define        XFS_AGI_NUM_BITS_R1     11      /* end of the 1st agi logging region */
-#define        XFS_AGI_ALL_BITS_R1     ((1 << XFS_AGI_NUM_BITS_R1) - 1)
-#define        XFS_AGI_FREE_ROOT       (1 << 11)
-#define        XFS_AGI_FREE_LEVEL      (1 << 12)
-#define        XFS_AGI_IBLOCKS         (1 << 13) /* both inobt/finobt block counters */
+#define        XFS_AGI_ALL_BITS_R1     ((1u << XFS_AGI_NUM_BITS_R1) - 1)
+#define        XFS_AGI_FREE_ROOT       (1u << 11)
+#define        XFS_AGI_FREE_LEVEL      (1u << 12)
+#define        XFS_AGI_IBLOCKS         (1u << 13) /* both inobt/finobt block counters */
  #define        XFS_AGI_NUM_BITS_R2     14
  
  /* disk block (xfs_daddr_t) in the AG */
@@ -791,16 +800,41 @@ struct xfs_dinode {
         __be32          di_nlink;       /* number of links to file */
         __be16          di_projid_lo;   /* lower part of owner's project id */
         __be16          di_projid_hi;   /* higher part owner's project id */
-       __u8            di_pad[6];      /* unused, zeroed space */
-       __be16          di_flushiter;   /* incremented on flush */
+       union {
+               /* Number of data fork extents if NREXT64 is set */
+               __be64  di_big_nextents;
+
+               /* Padding for V3 inodes without NREXT64 set. */
+               __be64  di_v3_pad;
+
+               /* Padding and inode flush counter for V2 inodes. */
+               struct {
+                       __u8    di_v2_pad[6];
+                       __be16  di_flushiter;
+               };
+       };
         xfs_timestamp_t di_atime;       /* time last accessed */
         xfs_timestamp_t di_mtime;       /* time last modified */
         xfs_timestamp_t di_ctime;       /* time created/inode modified */
         __be64          di_size;        /* number of bytes in file */
         __be64          di_nblocks;     /* # of direct & btree blocks used */
         __be32          di_extsize;     /* basic/minimum extent size for file */
-       __be32          di_nextents;    /* number of extents in data fork */
-       __be16          di_anextents;   /* number of extents in attribute fork*/
+       union {
+               /*
+                * For V2 inodes and V3 inodes without NREXT64 set, this
+                * is the number of data and attr fork extents.
+                */
+               struct {
+                       __be32  di_nextents;
+                       __be16  di_anextents;
+               } __packed;
+
+               /* Number of attr fork extents if NREXT64 is set. */
+               struct {
+                       __be32  di_big_anextents;
+                       __be16  di_nrext64_pad;
+               } __packed;
+       } __packed;
         __u8            di_forkoff;     /* attr fork offs, <<3 for 64b align */
         __s8            di_aformat;     /* format of attr fork's data */
         __be32          di_dmevmask;    /* DMIG event mask */
@@ -869,6 +903,56 @@ enum xfs_dinode_fmt {
         { XFS_DINODE_FMT_BTREE,         "btree" }, \
         { XFS_DINODE_FMT_UUID,          "uuid" }
  
+/*
+ * Max values for extnum and aextnum.
+ *
+ * The original on-disk extent counts were held in signed fields, resulting in
+ * maximum extent counts of 2^31 and 2^15 for the data and attr forks
+ * respectively. Similarly the maximum extent length is limited to 2^21 blocks
+ * by the 21-bit wide blockcount field of a BMBT extent record.
+ *
+ * The newly introduced data fork extent counter can hold a 64-bit value,
+ * however the maximum number of extents in a file is also limited to 2^54
+ * extents by the 54-bit wide startoff field of a BMBT extent record.
+ *
+ * It is further limited by the maximum supported file size of 2^63
+ * *bytes*. This leads to a maximum extent count for maximally sized filesystem
+ * blocks (64kB) of:
+ *
+ * 2^63 bytes / 2^16 bytes per block = 2^47 blocks
+ *
+ * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence
+ * 2^48 was chosen as the maximum data fork extent count.
+ *
+ * The maximum file size that can be represented by the data fork extent counter
+ * in the worst case occurs when all extents are 1 block in length and each
+ * block is 1KB in size.
+ *
+ * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and
+ * with 1KB sized blocks, a file can reach up to,
+ * 1KB * (2^31) = 2TB
+ *
+ * This is much larger than the theoretical maximum size of a directory
+ * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB.
+ *
+ * Hence, a directory inode can never overflow its data fork extent counter.
+ */
+#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1))
+#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1))
+
+/*
+ * When we upgrade an inode to the large extent counts, the maximum value by
+ * which the extent count can increase is bound by the change in size of the
+ * on-disk field. No upgrade operation should ever be adding more than a few
+ * tens of extents, so if we get a really large value it is a sign of a code bug
+ * or corruption.
+ */
+#define XFS_MAX_EXTCNT_UPGRADE_NR      \
+       min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL,    \
+           XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL)
+
  /*
   * Inode minimum and maximum sizes.
   */
@@ -918,10 +1002,6 @@ enum xfs_dinode_fmt {
         ((w) == XFS_DATA_FORK ? \
                 (dip)->di_format : \
                 (dip)->di_aformat)
-#define XFS_DFORK_NEXTENTS(dip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               be32_to_cpu((dip)->di_nextents) : \
-               be16_to_cpu((dip)->di_anextents))
  
  /*
   * For block and character special files the 32bit dev_t is stored at the
@@ -988,15 +1068,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
  #define XFS_DIFLAG2_REFLINK_BIT        1       /* file's blocks may be shared */
  #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
  #define XFS_DIFLAG2_BIGTIME_BIT        3       /* big timestamps */
+#define XFS_DIFLAG2_NREXT64_BIT 4      /* large extent counters */
  
  #define XFS_DIFLAG2_DAX                (1 << XFS_DIFLAG2_DAX_BIT)
  #define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
  #define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
  #define XFS_DIFLAG2_BIGTIME    (1 << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64    (1 << XFS_DIFLAG2_NREXT64_BIT)
  
  #define XFS_DIFLAG2_ANY \
         (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
-        XFS_DIFLAG2_BIGTIME)
+        XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
  
  static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
  {
@@ -1004,6 +1086,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
                (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
  }
  
+static inline bool xfs_dinode_has_large_extent_counts(
+       const struct xfs_dinode *dip)
+{
+       return dip->di_version >= 3 &&
+              (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
+}
+
  /*
   * Inode number format:
   * low inopblog bits - offset in block
@@ -1085,10 +1174,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
  #define XFS_DQUOT_MAGIC                0x4451          /* 'DQ' */
  #define XFS_DQUOT_VERSION      (uint8_t)0x01   /* latest version number */
  
-#define XFS_DQTYPE_USER                0x01            /* user dquot record */
-#define XFS_DQTYPE_PROJ                0x02            /* project dquot record */
-#define XFS_DQTYPE_GROUP       0x04            /* group dquot record */
-#define XFS_DQTYPE_BIGTIME     0x80            /* large expiry timestamps */
+#define XFS_DQTYPE_USER                (1u << 0)       /* user dquot record */
+#define XFS_DQTYPE_PROJ                (1u << 1)       /* project dquot record */
+#define XFS_DQTYPE_GROUP       (1u << 2)       /* group dquot record */
+#define XFS_DQTYPE_BIGTIME     (1u << 7)       /* large expiry timestamps */
  
  /* bitmask to determine if this is a user/group/project dquot */
  #define XFS_DQTYPE_REC_MASK    (XFS_DQTYPE_USER | \
@@ -1596,6 +1685,8 @@ typedef struct xfs_bmdr_block {
  #define BMBT_STARTOFF_MASK     ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
  #define BMBT_BLOCKCOUNT_MASK   ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
  
+#define XFS_MAX_BMBT_EXTLEN    ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK))
+
  /*
   * bmbt records have a file offset (block) field that is 54 bits wide, so this
   * is the largest xfs_fileoff_t that we ever expect to see.
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h

index 505533c..1cfd5bc 100644 (file)
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
  #define XFS_FSOP_GEOM_FLAGS_REFLINK    (1 << 20) /* files can share blocks */
  #define XFS_FSOP_GEOM_FLAGS_BIGTIME    (1 << 21) /* 64-bit nsec timestamps */
  #define XFS_FSOP_GEOM_FLAGS_INOBTCNT   (1 << 22) /* inobt btree counter */
+#define XFS_FSOP_GEOM_FLAGS_NREXT64    (1 << 23) /* large extent counters */
  
  /*
   * Minimum and maximum sizes need for growth checks.
@@ -377,7 +378,7 @@ struct xfs_bulkstat {
         uint32_t        bs_extsize_blks; /* extent size hint, blocks    */
  
         uint32_t        bs_nlink;       /* number of links              */
-       uint32_t        bs_extents;     /* number of extents            */
+       uint32_t        bs_extents;     /* 32-bit data fork extent counter */
         uint32_t        bs_aextents;    /* attribute number of extents  */
         uint16_t        bs_version;     /* structure version            */
         uint16_t        bs_forkoff;     /* inode fork offset in bytes   */
@@ -386,8 +387,9 @@ struct xfs_bulkstat {
         uint16_t        bs_checked;     /* checked inode metadata       */
         uint16_t        bs_mode;        /* type and mode                */
         uint16_t        bs_pad2;        /* zeroed                       */
+       uint64_t        bs_extents64;   /* 64-bit data fork extent counter */
  
-       uint64_t        bs_pad[7];      /* zeroed                       */
+       uint64_t        bs_pad[6];      /* zeroed                       */
  };
  
  #define XFS_BULKSTAT_VERSION_V1        (1)
@@ -459,17 +461,28 @@ struct xfs_bulk_ireq {
   * Only return results from the specified @agno.  If @ino is zero, start
   * with the first inode of @agno.
   */
-#define XFS_BULK_IREQ_AGNO     (1 << 0)
+#define XFS_BULK_IREQ_AGNO     (1U << 0)
  
  /*
   * Return bulkstat information for a single inode, where @ino value is a
   * special value, not a literal inode number.  See the XFS_BULK_IREQ_SPECIAL_*
   * values below.  Not compatible with XFS_BULK_IREQ_AGNO.
   */
-#define XFS_BULK_IREQ_SPECIAL  (1 << 1)
+#define XFS_BULK_IREQ_SPECIAL  (1U << 1)
  
-#define XFS_BULK_IREQ_FLAGS_ALL        (XFS_BULK_IREQ_AGNO | \
-                                XFS_BULK_IREQ_SPECIAL)
+/*
+ * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign
+ * 0 to xfs_bulkstat->bs_extents when the flag is set.  Otherwise, use
+ * xfs_bulkstat->bs_extents for returning data fork extent count and set
+ * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and
+ * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than
+ * XFS_MAX_EXTCNT_DATA_FORK_SMALL.
+ */
+#define XFS_BULK_IREQ_NREXT64  (1U << 2)
+
+#define XFS_BULK_IREQ_FLAGS_ALL        (XFS_BULK_IREQ_AGNO |    \
+                                XFS_BULK_IREQ_SPECIAL | \
+                                XFS_BULK_IREQ_NREXT64)
  
  /* Operate on the root directory inode. */
  #define XFS_BULK_IREQ_SPECIAL_ROOT     (1)
@@ -699,34 +712,34 @@ struct xfs_scrub_metadata {
  #define XFS_SCRUB_TYPE_NR      25
  
  /* i: Repair this metadata. */
-#define XFS_SCRUB_IFLAG_REPAIR         (1 << 0)
+#define XFS_SCRUB_IFLAG_REPAIR         (1u << 0)
  
  /* o: Metadata object needs repair. */
-#define XFS_SCRUB_OFLAG_CORRUPT                (1 << 1)
+#define XFS_SCRUB_OFLAG_CORRUPT                (1u << 1)
  
  /*
   * o: Metadata object could be optimized.  It's not corrupt, but
   *    we could improve on it somehow.
   */
-#define XFS_SCRUB_OFLAG_PREEN          (1 << 2)
+#define XFS_SCRUB_OFLAG_PREEN          (1u << 2)
  
  /* o: Cross-referencing failed. */
-#define XFS_SCRUB_OFLAG_XFAIL          (1 << 3)
+#define XFS_SCRUB_OFLAG_XFAIL          (1u << 3)
  
  /* o: Metadata object disagrees with cross-referenced metadata. */
-#define XFS_SCRUB_OFLAG_XCORRUPT       (1 << 4)
+#define XFS_SCRUB_OFLAG_XCORRUPT       (1u << 4)
  
  /* o: Scan was not complete. */
-#define XFS_SCRUB_OFLAG_INCOMPLETE     (1 << 5)
+#define XFS_SCRUB_OFLAG_INCOMPLETE     (1u << 5)
  
  /* o: Metadata object looked funny but isn't corrupt. */
-#define XFS_SCRUB_OFLAG_WARNING                (1 << 6)
+#define XFS_SCRUB_OFLAG_WARNING                (1u << 6)
  
  /*
   * o: IFLAG_REPAIR was set but metadata object did not need fixing or
   *    optimization and has therefore not been altered.
   */
-#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
  
  #define XFS_SCRUB_FLAGS_IN     (XFS_SCRUB_IFLAG_REPAIR)
  #define XFS_SCRUB_FLAGS_OUT    (XFS_SCRUB_OFLAG_CORRUPT | \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c

index b418fe0..bf2f4bc 100644 (file)
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2414,9 +2414,9 @@ out_drop:
   */
  void
  xfs_ialloc_log_agi(
-       xfs_trans_t     *tp,            /* transaction pointer */
-       struct xfs_buf  *bp,            /* allocation group header buffer */
-       int             fields)         /* bitmask of fields to log */
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp,
+       uint32_t                fields)
  {
         int                     first;          /* first byte number */
         int                     last;           /* last byte number */
@@ -2772,6 +2772,8 @@ xfs_ialloc_setup_geometry(
         igeo->new_diflags2 = 0;
         if (xfs_has_bigtime(mp))
                 igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+       if (xfs_has_large_extent_counts(mp))
+               igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
  
         /* Compute inode btree geometry. */
         igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h

index 8b5c2b7..a7705b6 100644 (file)
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -60,7 +60,7 @@ void
  xfs_ialloc_log_agi(
         struct xfs_trans *tp,           /* transaction pointer */
         struct xfs_buf  *bp,            /* allocation group header buffer */
-       int             fields);        /* bitmask of fields to log */
+       uint32_t        fields);        /* bitmask of fields to log */
  
  /*
   * Read in the allocation group header (inode allocation section)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c

index cae9708..3b1b63f 100644 (file)
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -279,6 +279,25 @@ xfs_inode_to_disk_ts(
         return ts;
  }
  
+static inline void
+xfs_inode_to_disk_iext_counters(
+       struct xfs_inode        *ip,
+       struct xfs_dinode       *to)
+{
+       if (xfs_inode_has_large_extent_counts(ip)) {
+               to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
+               to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp));
+               /*
+                * We might be upgrading the inode to use larger extent counters
+                * than was previously used. Hence zero the unused field.
+                */
+               to->di_nrext64_pad = cpu_to_be16(0);
+       } else {
+               to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
+               to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
+       }
+}
+
  void
  xfs_inode_to_disk(
         struct xfs_inode        *ip,
@@ -296,7 +315,6 @@ xfs_inode_to_disk(
         to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
         to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
  
-       memset(to->di_pad, 0, sizeof(to->di_pad));
         to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
         to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
         to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
@@ -307,8 +325,6 @@ xfs_inode_to_disk(
         to->di_size = cpu_to_be64(ip->i_disk_size);
         to->di_nblocks = cpu_to_be64(ip->i_nblocks);
         to->di_extsize = cpu_to_be32(ip->i_extsize);
-       to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
-       to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
         to->di_forkoff = ip->i_forkoff;
         to->di_aformat = xfs_ifork_format(ip->i_afp);
         to->di_flags = cpu_to_be16(ip->i_diflags);
@@ -323,11 +339,14 @@ xfs_inode_to_disk(
                 to->di_lsn = cpu_to_be64(lsn);
                 memset(to->di_pad2, 0, sizeof(to->di_pad2));
                 uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
-               to->di_flushiter = 0;
+               to->di_v3_pad = 0;
         } else {
                 to->di_version = 2;
                 to->di_flushiter = cpu_to_be16(ip->i_flushiter);
+               memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
         }
+
+       xfs_inode_to_disk_iext_counters(ip, to);
  }
  
  static xfs_failaddr_t
@@ -336,20 +355,40 @@ xfs_dinode_verify_fork(
         struct xfs_mount        *mp,
         int                     whichfork)
  {
-       uint32_t                di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork);
+       xfs_extnum_t            di_nextents;
+       xfs_extnum_t            max_extents;
+       mode_t                  mode = be16_to_cpu(dip->di_mode);
+       uint32_t                fork_size = XFS_DFORK_SIZE(dip, mp, whichfork);
+       uint32_t                fork_format = XFS_DFORK_FORMAT(dip, whichfork);
+
+       di_nextents = xfs_dfork_nextents(dip, whichfork);
+
+       /*
+        * For fork types that can contain local data, check that the fork
+        * format matches the size of local data contained within the fork.
+        *
+        * For all types, check that when the size says it should be in extent
+        * or btree format, the inode isn't claiming it is in local format.
+        */
+       if (whichfork == XFS_DATA_FORK) {
+               if (S_ISDIR(mode) || S_ISLNK(mode)) {
+                       if (be64_to_cpu(dip->di_size) <= fork_size &&
+                           fork_format != XFS_DINODE_FMT_LOCAL)
+                               return __this_address;
+               }
  
-       switch (XFS_DFORK_FORMAT(dip, whichfork)) {
+               if (be64_to_cpu(dip->di_size) > fork_size &&
+                   fork_format == XFS_DINODE_FMT_LOCAL)
+                       return __this_address;
+       }
+
+       switch (fork_format) {
         case XFS_DINODE_FMT_LOCAL:
                 /*
-                * no local regular files yet
+                * No local regular files yet.
                  */
-               if (whichfork == XFS_DATA_FORK) {
-                       if (S_ISREG(be16_to_cpu(dip->di_mode)))
-                               return __this_address;
-                       if (be64_to_cpu(dip->di_size) >
-                                       XFS_DFORK_SIZE(dip, mp, whichfork))
-                               return __this_address;
-               }
+               if (S_ISREG(mode) && whichfork == XFS_DATA_FORK)
+                       return __this_address;
                 if (di_nextents)
                         return __this_address;
                 break;
@@ -358,12 +397,11 @@ xfs_dinode_verify_fork(
                         return __this_address;
                 break;
         case XFS_DINODE_FMT_BTREE:
-               if (whichfork == XFS_ATTR_FORK) {
-                       if (di_nextents > MAXAEXTNUM)
-                               return __this_address;
-               } else if (di_nextents > MAXEXTNUM) {
+               max_extents = xfs_iext_max_nextents(
+                                       xfs_dinode_has_large_extent_counts(dip),
+                                       whichfork);
+               if (di_nextents > max_extents)
                         return __this_address;
-               }
                 break;
         default:
                 return __this_address;
@@ -396,6 +434,24 @@ xfs_dinode_verify_forkoff(
         return NULL;
  }
  
+static xfs_failaddr_t
+xfs_dinode_verify_nrext64(
+       struct xfs_mount        *mp,
+       struct xfs_dinode       *dip)
+{
+       if (xfs_dinode_has_large_extent_counts(dip)) {
+               if (!xfs_has_large_extent_counts(mp))
+                       return __this_address;
+               if (dip->di_nrext64_pad != 0)
+                       return __this_address;
+       } else if (dip->di_version >= 3) {
+               if (dip->di_v3_pad != 0)
+                       return __this_address;
+       }
+
+       return NULL;
+}
+
  xfs_failaddr_t
  xfs_dinode_verify(
         struct xfs_mount        *mp,
@@ -407,6 +463,9 @@ xfs_dinode_verify(
         uint16_t                flags;
         uint64_t                flags2;
         uint64_t                di_size;
+       xfs_extnum_t            nextents;
+       xfs_extnum_t            naextents;
+       xfs_filblks_t           nblocks;
  
         if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
                 return __this_address;
@@ -437,10 +496,19 @@ xfs_dinode_verify(
         if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
                 return __this_address;
  
+       fa = xfs_dinode_verify_nrext64(mp, dip);
+       if (fa)
+               return fa;
+
+       nextents = xfs_dfork_data_extents(dip);
+       naextents = xfs_dfork_attr_extents(dip);
+       nblocks = be64_to_cpu(dip->di_nblocks);
+
         /* Fork checks carried over from xfs_iformat_fork */
-       if (mode &&
-           be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) >
-                       be64_to_cpu(dip->di_nblocks))
+       if (mode && nextents + naextents > nblocks)
+               return __this_address;
+
+       if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
                 return __this_address;
  
         if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize)
@@ -497,7 +565,7 @@ xfs_dinode_verify(
                 default:
                         return __this_address;
                 }
-               if (dip->di_anextents)
+               if (naextents)
                         return __this_address;
         }
  
@@ -639,7 +707,7 @@ xfs_inode_validate_extsize(
         if (extsize_bytes % blocksize_bytes)
                 return __this_address;
  
-       if (extsize > MAXEXTLEN)
+       if (extsize > XFS_MAX_BMBT_EXTLEN)
                 return __this_address;
  
         if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
@@ -696,7 +764,7 @@ xfs_inode_validate_cowextsize(
         if (cowextsize_bytes % mp->m_sb.sb_blocksize)
                 return __this_address;
  
-       if (cowextsize > MAXEXTLEN)
+       if (cowextsize > XFS_MAX_BMBT_EXTLEN)
                 return __this_address;
  
         if (cowextsize > mp->m_sb.sb_agblocks / 2)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c

index 9149f4f..1a4cdf5 100644 (file)
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -36,7 +36,7 @@ xfs_init_local_fork(
         int64_t                 size)
  {
         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
-       int                     mem_size = size, real_size = 0;
+       int                     mem_size = size;
         bool                    zero_terminate;
  
         /*
@@ -50,8 +50,7 @@ xfs_init_local_fork(
                 mem_size++;
  
         if (size) {
-               real_size = roundup(mem_size, 4);
-               ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
+               ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
                 memcpy(ifp->if_u1.if_data, data, size);
                 if (zero_terminate)
                         ifp->if_u1.if_data[size] = '\0';
@@ -105,7 +104,7 @@ xfs_iformat_extents(
         struct xfs_mount        *mp = ip->i_mount;
         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
         int                     state = xfs_bmap_fork_to_state(whichfork);
-       int                     nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+       xfs_extnum_t            nex = xfs_dfork_nextents(dip, whichfork);
         int                     size = nex * sizeof(xfs_bmbt_rec_t);
         struct xfs_iext_cursor  icur;
         struct xfs_bmbt_rec     *dp;
@@ -117,8 +116,8 @@ xfs_iformat_extents(
          * we just bail out rather than crash in kmem_alloc() or memcpy() below.
          */
         if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
-               xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
-                       (unsigned long long) ip->i_ino, nex);
+               xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
+                       ip->i_ino, nex);
                 xfs_inode_verifier_error(ip, -EFSCORRUPTED,
                                 "xfs_iformat_extents(1)", dip, sizeof(*dip),
                                 __this_address);
@@ -230,7 +229,7 @@ xfs_iformat_data_fork(
          * depend on it.
          */
         ip->i_df.if_format = dip->di_format;
-       ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents);
+       ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
  
         switch (inode->i_mode & S_IFMT) {
         case S_IFIFO:
@@ -295,14 +294,14 @@ xfs_iformat_attr_fork(
         struct xfs_inode        *ip,
         struct xfs_dinode       *dip)
  {
+       xfs_extnum_t            naextents = xfs_dfork_attr_extents(dip);
         int                     error = 0;
  
         /*
          * Initialize the extent count early, as the per-format routines may
          * depend on it.
          */
-       ip->i_afp = xfs_ifork_alloc(dip->di_aformat,
-                               be16_to_cpu(dip->di_anextents));
+       ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents);
  
         switch (ip->i_afp->if_format) {
         case XFS_DINODE_FMT_LOCAL:
@@ -497,12 +496,7 @@ xfs_idata_realloc(
                 return;
         }
  
-       /*
-        * For inline data, the underlying buffer must be a multiple of 4 bytes
-        * in size so that it can be logged and stay on word boundaries.
-        * We enforce that here.
-        */
-       ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4),
+       ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
                                       GFP_NOFS | __GFP_NOFAIL);
         ifp->if_bytes = new_size;
  }
@@ -744,7 +738,8 @@ xfs_iext_count_may_overflow(
         if (whichfork == XFS_COW_FORK)
                 return 0;
  
-       max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM;
+       max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
+                               whichfork);
  
         if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
                 max_exts = 10;
@@ -755,3 +750,27 @@ xfs_iext_count_may_overflow(
  
         return 0;
  }
+
+/*
+ * Upgrade this inode's extent counter fields to be able to handle a potential
+ * increase in the extent count by nr_to_add.  Normally this is the same
+ * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
+ */
+int
+xfs_iext_count_upgrade(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       uint                    nr_to_add)
+{
+       ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
+       if (!xfs_has_large_extent_counts(ip->i_mount) ||
+           xfs_inode_has_large_extent_counts(ip) ||
+           XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+               return -EFBIG;
+
+       ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h

index 3d64a3a..4f68c1f 100644 (file)
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -21,9 +21,9 @@ struct xfs_ifork {
                 void            *if_root;       /* extent tree root */
                 char            *if_data;       /* inline file data */
         } if_u1;
+       xfs_extnum_t            if_nextents;    /* # of extents in this fork */
         short                   if_broot_bytes; /* bytes allocated for root */
         int8_t                  if_format;      /* format of this fork */
-       xfs_extnum_t            if_nextents;    /* # of extents in this fork */
  };
  
  /*
@@ -39,19 +39,6 @@ struct xfs_ifork {
   */
  #define XFS_IEXT_PUNCH_HOLE_CNT                (1)
  
-/*
- * Directory entry addition can cause the following,
- * 1. Data block can be added/removed.
- *    A new extent can cause extent count to increase by 1.
- * 2. Free disk block can be added/removed.
- *    Same behaviour as described above for Data block.
- * 3. Dabtree blocks.
- *    XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new
- *    extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH.
- */
-#define XFS_IEXT_DIR_MANIP_CNT(mp) \
-       ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount)
-
  /*
   * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
   * be added. One extra extent for dabtree in case a local attr is
@@ -133,6 +120,65 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
         return ifp->if_format;
  }
  
+static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts,
+                               int whichfork)
+{
+       switch (whichfork) {    /* upper bound on the extent count, per fork */
+       case XFS_DATA_FORK:
+       case XFS_COW_FORK:      /* CoW fork uses the data fork limits */
+               if (has_large_extent_counts)
+                       return XFS_MAX_EXTCNT_DATA_FORK_LARGE;
+               return XFS_MAX_EXTCNT_DATA_FORK_SMALL;
+
+       case XFS_ATTR_FORK:     /* attr fork has its own pair of limits */
+               if (has_large_extent_counts)
+                       return XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+               return XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+       default:                /* unrecognised fork: flag it, report 0 */
+               ASSERT(0);
+               return 0;
+       }
+}
+
+static inline xfs_extnum_t      /* data fork extent count, CPU byte order */
+xfs_dfork_data_extents(
+       struct xfs_dinode       *dip)
+{
+       if (xfs_dinode_has_large_extent_counts(dip))
+               return be64_to_cpu(dip->di_big_nextents);
+       /* No large counters: the count is the 32-bit di_nextents field. */
+       return be32_to_cpu(dip->di_nextents);
+}
+
+static inline xfs_extnum_t      /* attr fork extent count, CPU byte order */
+xfs_dfork_attr_extents(
+       struct xfs_dinode       *dip)
+{
+       if (xfs_dinode_has_large_extent_counts(dip))
+               return be32_to_cpu(dip->di_big_anextents);
+       /* No large counters: the count is the 16-bit di_anextents field. */
+       return be16_to_cpu(dip->di_anextents);
+}
+
+static inline xfs_extnum_t      /* extent count of the chosen fork of @dip */
+xfs_dfork_nextents(
+       struct xfs_dinode       *dip,
+       int                     whichfork)
+{
+       switch (whichfork) {
+       case XFS_DATA_FORK:
+               return xfs_dfork_data_extents(dip);
+       case XFS_ATTR_FORK:
+               return xfs_dfork_attr_extents(dip);
+       default:                /* no other fork value is handled here */
+               ASSERT(0);
+               break;
+       }
+       /* Only reached for an unexpected fork value. */
+       return 0;
+}
+
  struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format,
                                 xfs_extnum_t nextents);
  struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
@@ -229,6 +275,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip);
  int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
  int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
                 int nr_to_add);
+int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
+               uint nr_to_add);
  
  /* returns true if the fork has extents but they are not read in yet. */
  static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h

index b322db5..f7edd1e 100644 (file)
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr)
  
  /* Log Clients */
  #define XFS_TRANSACTION                0x69
-#define XFS_VOLUME             0x2
  #define XFS_LOG                        0xaa
  
  #define XLOG_UNMOUNT_TYPE      0x556e  /* Un for Unmount */
@@ -114,7 +113,12 @@ struct xfs_unmount_log_format {
  #define XLOG_REG_TYPE_CUD_FORMAT       24
  #define XLOG_REG_TYPE_BUI_FORMAT       25
  #define XLOG_REG_TYPE_BUD_FORMAT       26
-#define XLOG_REG_TYPE_MAX              26
+#define XLOG_REG_TYPE_ATTRI_FORMAT     27
+#define XLOG_REG_TYPE_ATTRD_FORMAT     28
+#define XLOG_REG_TYPE_ATTR_NAME        29
+#define XLOG_REG_TYPE_ATTR_VALUE       30
+#define XLOG_REG_TYPE_MAX              30
+
  
  /*
   * Flags to log operation header
@@ -237,6 +241,8 @@ typedef struct xfs_trans_header {
  #define        XFS_LI_CUD              0x1243
  #define        XFS_LI_BUI              0x1244  /* bmbt update intent */
  #define        XFS_LI_BUD              0x1245
+#define        XFS_LI_ATTRI            0x1246  /* attr set/remove intent */
+#define        XFS_LI_ATTRD            0x1247  /* attr set/remove done */
  
  #define XFS_LI_TYPE_DESC \
         { XFS_LI_EFI,           "XFS_LI_EFI" }, \
@@ -252,7 +258,9 @@ typedef struct xfs_trans_header {
         { XFS_LI_CUI,           "XFS_LI_CUI" }, \
         { XFS_LI_CUD,           "XFS_LI_CUD" }, \
         { XFS_LI_BUI,           "XFS_LI_BUI" }, \
-       { XFS_LI_BUD,           "XFS_LI_BUD" }
+       { XFS_LI_BUD,           "XFS_LI_BUD" }, \
+       { XFS_LI_ATTRI,         "XFS_LI_ATTRI" }, \
+       { XFS_LI_ATTRD,         "XFS_LI_ATTRD" }
  
  /*
   * Inode Log Item Format definitions.
@@ -388,16 +396,41 @@ struct xfs_log_dinode {
         uint32_t        di_nlink;       /* number of links to file */
         uint16_t        di_projid_lo;   /* lower part of owner's project id */
         uint16_t        di_projid_hi;   /* higher part of owner's project id */
-       uint8_t         di_pad[6];      /* unused, zeroed space */
-       uint16_t        di_flushiter;   /* incremented on flush */
+       union {
+               /* Number of data fork extents if NREXT64 is set */
+               uint64_t        di_big_nextents;
+
+               /* Padding for V3 inodes without NREXT64 set. */
+               uint64_t        di_v3_pad;
+
+               /* Padding and inode flush counter for V2 inodes. */
+               struct {
+                       uint8_t di_v2_pad[6];   /* V2 inode zeroed space */
+                       uint16_t di_flushiter;  /* V2 inode incremented on flush */
+               };
+       };
         xfs_log_timestamp_t di_atime;   /* time last accessed */
         xfs_log_timestamp_t di_mtime;   /* time last modified */
         xfs_log_timestamp_t di_ctime;   /* time created/inode modified */
         xfs_fsize_t     di_size;        /* number of bytes in file */
         xfs_rfsblock_t  di_nblocks;     /* # of direct & btree blocks used */
         xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
-       xfs_extnum_t    di_nextents;    /* number of extents in data fork */
-       xfs_aextnum_t   di_anextents;   /* number of extents in attribute fork*/
+       union {
+               /*
+                * For V2 inodes and V3 inodes without NREXT64 set, this
+                * is the number of data and attr fork extents.
+                */
+               struct {
+                       uint32_t  di_nextents;
+                       uint16_t  di_anextents;
+               } __packed;
+
+               /* Number of attr fork extents if NREXT64 is set. */
+               struct {
+                       uint32_t  di_big_anextents;
+                       uint16_t  di_nrext64_pad;
+               } __packed;
+       } __packed;
         uint8_t         di_forkoff;     /* attr fork offs, <<3 for 64b align */
         int8_t          di_aformat;     /* format of attr fork's data */
         uint32_t        di_dmevmask;    /* DMIG event mask */
@@ -869,4 +902,36 @@ struct xfs_icreate_log {
         __be32          icl_gen;        /* inode generation number to use */
  };
  
+/*
+ * Flags for deferred attribute operations.
+ * Upper bits are flags; the low byte (TYPE_MASK) is the operation type code.
+ */
+#define XFS_ATTR_OP_FLAGS_SET          1       /* Set the attribute */
+#define XFS_ATTR_OP_FLAGS_REMOVE       2       /* Remove the attribute */
+#define XFS_ATTR_OP_FLAGS_REPLACE      3       /* Replace the attribute */
+#define XFS_ATTR_OP_FLAGS_TYPE_MASK    0xFF    /* Low byte: op type code */
+
+/*
+ * This is the structure used to lay out an attr intent (attri) log item
+ * in the log.
+ */
+struct xfs_attri_log_format {
+       uint16_t        alfi_type;      /* attri log item type */
+       uint16_t        alfi_size;      /* size of this item */
+       uint32_t        __pad;          /* pad alfi_id to 64-bit alignment */
+       uint64_t        alfi_id;        /* attri identifier */
+       uint64_t        alfi_ino;       /* the inode for this attr operation */
+       uint32_t        alfi_op_flags;  /* marks the op as set/remove/replace */
+       uint32_t        alfi_name_len;  /* attr name length */
+       uint32_t        alfi_value_len; /* attr value length */
+       uint32_t        alfi_attr_flags;/* attr flags */
+};
+
+struct xfs_attrd_log_format {   /* log layout of an attr done (attrd) item */
+       uint16_t        alfd_type;      /* attrd log item type */
+       uint16_t        alfd_size;      /* size of this item */
+       uint32_t        __pad;          /* pad alfd_alf_id to 64-bit alignment */
+       uint64_t        alfd_alf_id;    /* id of corresponding attri */
+};
+
  #endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h

index ff69a00..32e2162 100644 (file)
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops;
  extern const struct xlog_recover_item_ops xlog_rud_item_ops;
  extern const struct xlog_recover_item_ops xlog_cui_item_ops;
  extern const struct xlog_recover_item_ops xlog_cud_item_ops;
+extern const struct xlog_recover_item_ops xlog_attri_item_ops;
+extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
  
  /*
   * Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c

index 67798ff..9975b93 100644 (file)
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -14,6 +14,7 @@
  #include "xfs_trans_space.h"
  #include "xfs_da_btree.h"
  #include "xfs_bmap_btree.h"
+#include "xfs_trace.h"
  
  /*
   * Calculate the maximum length in bytes that would be required for a local
@@ -36,6 +37,65 @@ xfs_log_calc_max_attrsetm_res(
                 M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
  }
  
+/*
+ * Compute an alternate set of log reservation sizes for use exclusively with
+ * minimum log size calculations.  The results are stored in @resv.
+ */
+static void
+xfs_log_calc_trans_resv_for_minlogblocks(
+       struct xfs_mount        *mp,
+       struct xfs_trans_resv   *resv)
+{
+       unsigned int            rmap_maxlevels = mp->m_rmap_maxlevels;
+
+       /*
+        * In the early days of rmap+reflink, we always set the rmap maxlevels
+        * to 9 even if the AG was small enough that it would never grow to
+        * that height.  Transaction reservation sizes influence the minimum
+        * log size calculation, which influences the size of the log that mkfs
+        * creates.  Use the old value here to ensure that newly formatted
+        * small filesystems will mount on older kernels.
+        */
+       if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+               mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
+       xfs_trans_resv_calc(mp, resv);
+
+       if (xfs_has_reflink(mp)) {
+               /*
+                * In the early days of reflink, typical log operation counts
+                * were greatly overestimated.
+                */
+               resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+               resv->tr_itruncate.tr_logcount =
+                               XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+               resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+       } else if (xfs_has_rmapbt(mp)) {
+               /*
+                * In the early days of non-reflink rmap, the impact of rmapbt
+                * updates on log counts was not taken into account at all.
+                */
+               resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+               resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+               resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+       }
+
+       /*
+        * In the early days of reflink, we did not use deferred refcount
+        * update log items, so log reservations must be recomputed using the
+        * old calculations.
+        */
+       resv->tr_write.tr_logres =
+                       xfs_calc_write_reservation_minlogsize(mp);
+       resv->tr_itruncate.tr_logres =
+                       xfs_calc_itruncate_reservation_minlogsize(mp);
+       resv->tr_qm_dqalloc.tr_logres =
+                       xfs_calc_qm_dqalloc_reservation_minlogsize(mp);
+
+       /* Restore the rmap maxlevels saved on entry.  This must stay last. */
+       mp->m_rmap_maxlevels = rmap_maxlevels;
+}
+
  /*
   * Iterate over the log space reservation table to figure out and return
   * the maximum one in terms of the pre-calculated values which were done
@@ -46,19 +106,25 @@ xfs_log_get_max_trans_res(
         struct xfs_mount        *mp,
         struct xfs_trans_res    *max_resp)
  {
+       struct xfs_trans_resv   resv = {};
         struct xfs_trans_res    *resp;
         struct xfs_trans_res    *end_resp;
+       unsigned int            i;
         int                     log_space = 0;
         int                     attr_space;
  
         attr_space = xfs_log_calc_max_attrsetm_res(mp);
  
-       resp = (struct xfs_trans_res *)M_RES(mp);
-       end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
-       for (; resp < end_resp; resp++) {
+       xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv);
+
+       resp = (struct xfs_trans_res *)&resv;
+       end_resp = (struct xfs_trans_res *)(&resv + 1);
+       for (i = 0; resp < end_resp; i++, resp++) {
                 int             tmp = resp->tr_logcount > 1 ?
                                       resp->tr_logres * resp->tr_logcount :
                                       resp->tr_logres;
+
+               trace_xfs_trans_resv_calc_minlogsize(mp, i, resp);
                 if (log_space < tmp) {
                         log_space = tmp;
                         *max_resp = *resp;              /* struct copy */
@@ -66,9 +132,10 @@ xfs_log_get_max_trans_res(
         }
  
         if (attr_space > log_space) {
-               *max_resp = M_RES(mp)->tr_attrsetm;     /* struct copy */
+               *max_resp = resv.tr_attrsetm;   /* struct copy */
                 max_resp->tr_logres = attr_space;
         }
+       trace_xfs_log_get_max_trans_res(mp, max_resp);
  }
  
  /*
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h

index a02c506..cb035da 100644 (file)
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -16,7 +16,6 @@
   * and quota-limits. This is a waste in the common case, but hey ...
   */
  typedef uint64_t       xfs_qcnt_t;
-typedef uint16_t       xfs_qwarncnt_t;
  
  typedef uint8_t                xfs_dqtype_t;
  
@@ -29,8 +28,8 @@ typedef uint8_t               xfs_dqtype_t;
  /*
   * flags for q_flags field in the dquot.
   */
-#define XFS_DQFLAG_DIRTY       (1 << 0)        /* dquot is dirty */
-#define XFS_DQFLAG_FREEING     (1 << 1)        /* dquot is being torn down */
+#define XFS_DQFLAG_DIRTY       (1u << 0)       /* dquot is dirty */
+#define XFS_DQFLAG_FREEING     (1u << 1)       /* dquot is being torn down */
  
  #define XFS_DQFLAG_STRINGS \
         { XFS_DQFLAG_DIRTY,     "DIRTY" }, \
@@ -73,29 +72,45 @@ typedef uint8_t             xfs_dqtype_t;
   * to a single function. None of these XFS_QMOPT_* flags are meant to have
   * persistent values (ie. their values can and will change between versions)
   */
-#define XFS_QMOPT_UQUOTA       0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA       0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES    0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION    0x0000040 /* change superblock version num */
-#define XFS_QMOPT_GQUOTA       0x0002000 /* group dquot requested */
+#define XFS_QMOPT_UQUOTA       (1u << 0) /* user dquot requested */
+#define XFS_QMOPT_GQUOTA       (1u << 1) /* group dquot requested */
+#define XFS_QMOPT_PQUOTA       (1u << 2) /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES    (1u << 3) /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION    (1u << 4) /* change superblock version num */
  
  /*
   * flags to xfs_trans_mod_dquot to indicate which field needs to be
   * modified.
   */
-#define XFS_QMOPT_RES_REGBLKS  0x0010000
-#define XFS_QMOPT_RES_RTBLKS   0x0020000
-#define XFS_QMOPT_BCOUNT       0x0040000
-#define XFS_QMOPT_ICOUNT       0x0080000
-#define XFS_QMOPT_RTBCOUNT     0x0100000
-#define XFS_QMOPT_DELBCOUNT    0x0200000
-#define XFS_QMOPT_DELRTBCOUNT  0x0400000
-#define XFS_QMOPT_RES_INOS     0x0800000
+#define XFS_QMOPT_RES_REGBLKS  (1u << 7)
+#define XFS_QMOPT_RES_RTBLKS   (1u << 8)
+#define XFS_QMOPT_BCOUNT       (1u << 9)
+#define XFS_QMOPT_ICOUNT       (1u << 10)
+#define XFS_QMOPT_RTBCOUNT     (1u << 11)
+#define XFS_QMOPT_DELBCOUNT    (1u << 12)
+#define XFS_QMOPT_DELRTBCOUNT  (1u << 13)
+#define XFS_QMOPT_RES_INOS     (1u << 14)
  
  /*
   * flags for dqalloc.
   */
-#define XFS_QMOPT_INHERIT      0x1000000
+#define XFS_QMOPT_INHERIT      (1u << 31)
+
+#define XFS_QMOPT_FLAGS /* { flag, name } pairs for XFS_QMOPT_* */ \
+       { XFS_QMOPT_UQUOTA,             "UQUOTA" }, \
+       { XFS_QMOPT_PQUOTA,             "PQUOTA" }, \
+       { XFS_QMOPT_FORCE_RES,          "FORCE_RES" }, \
+       { XFS_QMOPT_SBVERSION,          "SBVERSION" }, \
+       { XFS_QMOPT_GQUOTA,             "GQUOTA" }, \
+       { XFS_QMOPT_INHERIT,            "INHERIT" }, \
+       { XFS_QMOPT_RES_REGBLKS,        "RES_REGBLKS" }, \
+       { XFS_QMOPT_RES_RTBLKS,         "RES_RTBLKS" }, \
+       { XFS_QMOPT_BCOUNT,             "BCOUNT" }, \
+       { XFS_QMOPT_ICOUNT,             "ICOUNT" }, \
+       { XFS_QMOPT_RTBCOUNT,           "RTBCOUNT" }, \
+       { XFS_QMOPT_DELBCOUNT,          "DELBCOUNT" }, \
+       { XFS_QMOPT_DELRTBCOUNT,        "DELRTBCOUNT" }, \
+       { XFS_QMOPT_RES_INOS,           "RES_INOS" }
  
  /*
   * flags to xfs_trans_mod_dquot.
@@ -114,6 +129,7 @@ typedef uint8_t             xfs_dqtype_t;
                 (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
  #define XFS_QMOPT_RESBLK_MASK  (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
  
+
  extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
                 struct xfs_disk_dquot *ddq, xfs_dqid_t id);
  extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c

index 327ba25..97e9e60 100644 (file)
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -886,8 +886,13 @@ xfs_refcount_still_have_space(
  {
         unsigned long                   overhead;
  
-       overhead = cur->bc_ag.refc.shape_changes *
-                       xfs_allocfree_log_count(cur->bc_mp, 1);
+       /*
+        * Worst case estimate: full splits of the free space and rmap btrees
+        * to handle each of the shape changes to the refcount btree.
+        */
+       overhead = xfs_allocfree_block_count(cur->bc_mp,
+                               cur->bc_ag.refc.shape_changes);
+       overhead += cur->bc_mp->m_refc_maxlevels;
         overhead *= cur->bc_mp->m_sb.sb_blocksize;
  
         /*
@@ -960,6 +965,7 @@ xfs_refcount_adjust_extents(
                          * Either cover the hole (increment) or
                          * delete the range (decrement).
                          */
+                       cur->bc_ag.refc.nr_ops++;
                         if (tmp.rc_refcount) {
                                 error = xfs_refcount_insert(cur, &tmp,
                                                 &found_tmp);
@@ -970,7 +976,6 @@ xfs_refcount_adjust_extents(
                                         error = -EFSCORRUPTED;
                                         goto out_error;
                                 }
-                               cur->bc_ag.refc.nr_ops++;
                         } else {
                                 fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
                                                 cur->bc_ag.pag->pag_agno,
@@ -1001,11 +1006,11 @@ xfs_refcount_adjust_extents(
                 ext.rc_refcount += adj;
                 trace_xfs_refcount_modify_extent(cur->bc_mp,
                                 cur->bc_ag.pag->pag_agno, &ext);
+               cur->bc_ag.refc.nr_ops++;
                 if (ext.rc_refcount > 1) {
                         error = xfs_refcount_update(cur, &ext);
                         if (error)
                                 goto out_error;
-                       cur->bc_ag.refc.nr_ops++;
                 } else if (ext.rc_refcount == 1) {
                         error = xfs_refcount_delete(cur, &found_rec);
                         if (error)
@@ -1014,7 +1019,6 @@ xfs_refcount_adjust_extents(
                                 error = -EFSCORRUPTED;
                                 goto out_error;
                         }
-                       cur->bc_ag.refc.nr_ops++;
                         goto advloop;
                 } else {
                         fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h

index 9eb01ed..e8b322d 100644 (file)
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -67,14 +67,17 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
   * log (plus any key updates) so we'll conservatively assume 32 bytes
   * per record.  We must also leave space for btree splits on both ends
   * of the range and space for the CUD and a new CUI.
+ *
+ * Each EFI that we attach to the transaction is assumed to consume ~32 bytes.
+ * This is a low estimate for an EFI tracking a single extent (16 bytes for the
+ * EFI header, 16 for the extent, and 12 for the xlog op header), but the
+ * estimate is acceptable if there's more than one extent being freed.
+ * In the worst case of freeing every other block during a refcount decrease
+ * operation, we amortize the space used for one EFI log item across 16
+ * extents.
   */
  #define XFS_REFCOUNT_ITEM_OVERHEAD     32
  
-static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
-{
-       return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
-}
-
  extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
                 xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
  union xfs_btree_rec;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c

index cd32217..2845019 100644 (file)
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -34,18 +34,32 @@ int
  xfs_rmap_lookup_le(
         struct xfs_btree_cur    *cur,
         xfs_agblock_t           bno,
-       xfs_extlen_t            len,
         uint64_t                owner,
         uint64_t                offset,
         unsigned int            flags,
+       struct xfs_rmap_irec    *irec,
         int                     *stat)
  {
+       int                     get_stat = 0;
+       int                     error;
+
         cur->bc_rec.r.rm_startblock = bno;
-       cur->bc_rec.r.rm_blockcount = len;
+       cur->bc_rec.r.rm_blockcount = 0;
         cur->bc_rec.r.rm_owner = owner;
         cur->bc_rec.r.rm_offset = offset;
         cur->bc_rec.r.rm_flags = flags;
-       return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+
+       error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+       if (error || !(*stat) || !irec)
+               return error;
+
+       error = xfs_rmap_get_rec(cur, irec, &get_stat);
+       if (error)
+               return error;
+       if (!get_stat)
+               return -EFSCORRUPTED;
+
+       return 0;
  }
  
  /*
@@ -251,7 +265,6 @@ out_bad_rec:
  struct xfs_find_left_neighbor_info {
         struct xfs_rmap_irec    high;
         struct xfs_rmap_irec    *irec;
-       int                     *stat;
  };
  
  /* For each rmap given, figure out if it matches the key we want. */
@@ -276,7 +289,6 @@ xfs_rmap_find_left_neighbor_helper(
                 return 0;
  
         *info->irec = *rec;
-       *info->stat = 1;
         return -ECANCELED;
  }
  
@@ -285,7 +297,7 @@ xfs_rmap_find_left_neighbor_helper(
   * return a match with the same owner and adjacent physical and logical
   * block ranges.
   */
-int
+STATIC int
  xfs_rmap_find_left_neighbor(
         struct xfs_btree_cur    *cur,
         xfs_agblock_t           bno,
@@ -296,6 +308,7 @@ xfs_rmap_find_left_neighbor(
         int                     *stat)
  {
         struct xfs_find_left_neighbor_info      info;
+       int                     found = 0;
         int                     error;
  
         *stat = 0;
@@ -313,21 +326,44 @@ xfs_rmap_find_left_neighbor(
         info.high.rm_flags = flags;
         info.high.rm_blockcount = 0;
         info.irec = irec;
-       info.stat = stat;
  
         trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
                         cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
  
-       error = xfs_rmap_query_range(cur, &info.high, &info.high,
-                       xfs_rmap_find_left_neighbor_helper, &info);
-       if (error == -ECANCELED)
-               error = 0;
-       if (*stat)
-               trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
-                               cur->bc_ag.pag->pag_agno, irec->rm_startblock,
-                               irec->rm_blockcount, irec->rm_owner,
-                               irec->rm_offset, irec->rm_flags);
-       return error;
+       /*
+        * Historically, we always used the range query to walk every reverse
+        * mapping that could possibly overlap the key that the caller asked
+        * for, and filter out the ones that don't.  That is very slow when
+        * there are a lot of records.
+        *
+        * However, there are two scenarios where the classic btree search can
+        * produce correct results -- if the index contains a record that is an
+        * exact match for the lookup key; and if there are no other records
+        * between the record we want and the key we supplied.
+        *
+        * As an optimization, try a non-overlapped lookup first.  This makes
+        * extent conversion and remap operations run a bit faster if the
+        * physical extents aren't being shared.  If we don't find what we
+        * want, we fall back to the overlapped query.
+        */
+       error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+                       &found);
+       if (error)
+               return error;
+       if (found)
+               error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info);
+       if (!error)
+               error = xfs_rmap_query_range(cur, &info.high, &info.high,
+                               xfs_rmap_find_left_neighbor_helper, &info);
+       if (error != -ECANCELED)
+               return error;
+
+       *stat = 1;
+       trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+                       cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+                       irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+                       irec->rm_flags);
+       return 0;
  }
  
  /* For each rmap given, figure out if it matches the key we want. */
@@ -353,7 +389,6 @@ xfs_rmap_lookup_le_range_helper(
                 return 0;
  
         *info->irec = *rec;
-       *info->stat = 1;
         return -ECANCELED;
  }
  
@@ -374,6 +409,7 @@ xfs_rmap_lookup_le_range(
         int                     *stat)
  {
         struct xfs_find_left_neighbor_info      info;
+       int                     found = 0;
         int                     error;
  
         info.high.rm_startblock = bno;
@@ -386,20 +422,44 @@ xfs_rmap_lookup_le_range(
         info.high.rm_blockcount = 0;
         *stat = 0;
         info.irec = irec;
-       info.stat = stat;
  
-       trace_xfs_rmap_lookup_le_range(cur->bc_mp,
-                       cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
-       error = xfs_rmap_query_range(cur, &info.high, &info.high,
-                       xfs_rmap_lookup_le_range_helper, &info);
-       if (error == -ECANCELED)
-               error = 0;
-       if (*stat)
-               trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
-                               cur->bc_ag.pag->pag_agno, irec->rm_startblock,
-                               irec->rm_blockcount, irec->rm_owner,
-                               irec->rm_offset, irec->rm_flags);
-       return error;
+       trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+                       bno, 0, owner, offset, flags);
+
+       /*
+        * Historically, we always used the range query to walk every reverse
+        * mapping that could possibly overlap the key that the caller asked
+        * for, and filter out the ones that don't.  That is very slow when
+        * there are a lot of records.
+        *
+        * However, there are two scenarios where the classic btree search can
+        * produce correct results -- if the index contains a record that is an
+        * exact match for the lookup key; and if there are no other records
+        * between the record we want and the key we supplied.
+        *
+        * As an optimization, try a non-overlapped lookup first.  This makes
+        * scrub run much faster on most filesystems because bmbt records are
+        * usually an exact match for rmap records.  If we don't find what we
+        * want, we fall back to the overlapped query.
+        */
+       error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+                       &found);
+       if (error)
+               return error;
+       if (found)
+               error = xfs_rmap_lookup_le_range_helper(cur, irec, &info);
+       if (!error)
+               error = xfs_rmap_query_range(cur, &info.high, &info.high,
+                               xfs_rmap_lookup_le_range_helper, &info);
+       if (error != -ECANCELED)
+               return error;
+
+       *stat = 1;
+       trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+                       cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+                       irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+                       irec->rm_flags);
+       return 0;
  }
  
  /*
@@ -510,7 +570,7 @@ xfs_rmap_unmap(
          * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
          * will not ever be removed from the tree.
          */
-       error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
+       error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i);
         if (error)
                 goto out_error;
         if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -518,13 +578,6 @@ xfs_rmap_unmap(
                 goto out_error;
         }
  
-       error = xfs_rmap_get_rec(cur, &ltrec, &i);
-       if (error)
-               goto out_error;
-       if (XFS_IS_CORRUPT(mp, i != 1)) {
-               error = -EFSCORRUPTED;
-               goto out_error;
-       }
         trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
                         cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
                         ltrec.rm_blockcount, ltrec.rm_owner,
@@ -786,18 +839,11 @@ xfs_rmap_map(
          * record for our insertion point. This will also give us the record for
          * start block contiguity tests.
          */
-       error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+       error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec,
                         &have_lt);
         if (error)
                 goto out_error;
         if (have_lt) {
-               error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
-               if (error)
-                       goto out_error;
-               if (XFS_IS_CORRUPT(mp, have_lt != 1)) {
-                       error = -EFSCORRUPTED;
-                       goto out_error;
-               }
                 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
                                 cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
                                 ltrec.rm_blockcount, ltrec.rm_owner,
@@ -1022,7 +1068,7 @@ xfs_rmap_convert(
          * record for our insertion point. This will also give us the record for
          * start block contiguity tests.
          */
-       error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+       error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i);
         if (error)
                 goto done;
         if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -1030,13 +1076,6 @@ xfs_rmap_convert(
                 goto done;
         }
  
-       error = xfs_rmap_get_rec(cur, &PREV, &i);
-       if (error)
-               goto done;
-       if (XFS_IS_CORRUPT(mp, i != 1)) {
-               error = -EFSCORRUPTED;
-               goto done;
-       }
         trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
                         cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
                         PREV.rm_blockcount, PREV.rm_owner,
@@ -1140,7 +1179,7 @@ xfs_rmap_convert(
                         _RET_IP_);
  
         /* reset the cursor back to PREV */
-       error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+       error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
         if (error)
                 goto done;
         if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -2677,7 +2716,7 @@ xfs_rmap_record_exists(
         ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
                (flags & XFS_RMAP_BMBT_BLOCK));
  
-       error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+       error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
                         &has_record);
         if (error)
                 return error;
@@ -2686,14 +2725,6 @@ xfs_rmap_record_exists(
                 return 0;
         }
  
-       error = xfs_rmap_get_rec(cur, &irec, &has_record);
-       if (error)
-               return error;
-       if (!has_record) {
-               *has_rmap = false;
-               return 0;
-       }
-
         *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
                      irec.rm_startblock + irec.rm_blockcount >= bno + len);
         return 0;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h

index b718ebe..54741a5 100644 (file)
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
                   const struct xfs_owner_info *oinfo);
  
  int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-               xfs_extlen_t len, uint64_t owner, uint64_t offset,
-               unsigned int flags, int *stat);
+               uint64_t owner, uint64_t offset, unsigned int flags,
+               struct xfs_rmap_irec *irec, int *stat);
  int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
                 xfs_extlen_t len, uint64_t owner, uint64_t offset,
                 unsigned int flags, int *stat);
@@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
                 xfs_fsblock_t startblock, xfs_filblks_t blockcount,
                 xfs_exntst_t state, struct xfs_btree_cur **pcur);
  
-int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-               uint64_t owner, uint64_t offset, unsigned int flags,
-               struct xfs_rmap_irec *irec, int *stat);
  int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
                 uint64_t owner, uint64_t offset, unsigned int flags,
                 struct xfs_rmap_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c

index 5740ba6..fa180ab 100644 (file)
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1008,6 +1008,7 @@ xfs_rtfree_extent(
  /* Find all the free records within a given range. */
  int
  xfs_rtalloc_query_range(
+       struct xfs_mount                *mp,
         struct xfs_trans                *tp,
         const struct xfs_rtalloc_rec    *low_rec,
         const struct xfs_rtalloc_rec    *high_rec,
@@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range(
         void                            *priv)
  {
         struct xfs_rtalloc_rec          rec;
-       struct xfs_mount                *mp = tp->t_mountp;
         xfs_rtblock_t                   rtstart;
         xfs_rtblock_t                   rtend;
         xfs_rtblock_t                   high_key;
@@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range(
                         rec.ar_startext = rtstart;
                         rec.ar_extcount = rtend - rtstart + 1;
  
-                       error = fn(tp, &rec, priv);
+                       error = fn(mp, tp, &rec, priv);
                         if (error)
                                 break;
                 }
@@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range(
  /* Find all the free records. */
  int
  xfs_rtalloc_query_all(
+       struct xfs_mount                *mp,
         struct xfs_trans                *tp,
         xfs_rtalloc_query_range_fn      fn,
         void                            *priv)
@@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all(
         struct xfs_rtalloc_rec          keys[2];
  
         keys[0].ar_startext = 0;
-       keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1;
+       keys[1].ar_startext = mp->m_sb.sb_rextents - 1;
         keys[0].ar_extcount = keys[1].ar_extcount = 0;
  
-       return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
+       return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv);
  }
  
  /* Is the given extent all free? */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c

index f4e84aa..a20cade 100644 (file)
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -30,6 +30,47 @@
   * Physical superblock buffer manipulations. Shared with libxfs in userspace.
   */
  
+/*
+ * Check that all the V4 feature bits that the V5 filesystem format requires are
+ * correctly set.
+ */
+static bool
+xfs_sb_validate_v5_features(
+       struct xfs_sb   *sbp)
+{
+       /* We must not have any unknown V4 feature bits set */
+       if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS)
+               return false;
+
+       /*
+        * The CRC bit is considered an invalid V4 flag, so we have to add it
+        * manually to the OKBITS mask.
+        */
+       if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS |
+                                 XFS_SB_VERSION2_CRCBIT))
+               return false;
+
+       /* Now check all the required V4 feature flags are set. */
+
+#define V5_VERS_FLAGS  (XFS_SB_VERSION_NLINKBIT        | \
+                       XFS_SB_VERSION_ALIGNBIT         | \
+                       XFS_SB_VERSION_LOGV2BIT         | \
+                       XFS_SB_VERSION_EXTFLGBIT        | \
+                       XFS_SB_VERSION_DIRV2BIT         | \
+                       XFS_SB_VERSION_MOREBITSBIT)
+
+#define V5_FEAT_FLAGS  (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+                       XFS_SB_VERSION2_ATTR2BIT        | \
+                       XFS_SB_VERSION2_PROJID32BIT     | \
+                       XFS_SB_VERSION2_CRCBIT)
+
+       if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS)
+               return false;
+       if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS)
+               return false;
+       return true;
+}
+
  /*
   * We support all XFS versions newer than a v4 superblock with V2 directories.
   */
@@ -37,9 +78,19 @@ bool
  xfs_sb_good_version(
         struct xfs_sb   *sbp)
  {
-       /* all v5 filesystems are supported */
+       /*
+        * All v5 filesystems are supported, but we must check that all the
+        * required v4 feature flags are enabled correctly as the code checks
+        * those flags and not for v5 support.
+        */
         if (xfs_sb_is_v5(sbp))
-               return true;
+               return xfs_sb_validate_v5_features(sbp);
+
+       /* We must not have any unknown v4 feature bits set */
+       if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+           ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+            (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+               return false;
  
         /* versions prior to v4 are not supported */
         if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4)
@@ -51,12 +102,6 @@ xfs_sb_good_version(
         if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
                 return false;
  
-       /* And must not have any unknown v4 feature bits set */
-       if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
-           ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
-            (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
-               return false;
-
         /* It's a supported v4 filesystem */
         return true;
  }
@@ -70,6 +115,8 @@ xfs_sb_version_to_features(
         /* optional V4 features */
         if (sbp->sb_rblocks > 0)
                 features |= XFS_FEAT_REALTIME;
+       if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)
+               features |= XFS_FEAT_NLINK;
         if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)
                 features |= XFS_FEAT_ATTR;
         if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT)
@@ -124,6 +171,9 @@ xfs_sb_version_to_features(
                 features |= XFS_FEAT_BIGTIME;
         if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
                 features |= XFS_FEAT_NEEDSREPAIR;
+       if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
+               features |= XFS_FEAT_NREXT64;
+
         return features;
  }
  
@@ -262,12 +312,15 @@ xfs_validate_sb_common(
         bool                    has_dalign;
  
         if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
-               xfs_warn(mp, "bad magic number");
+               xfs_warn(mp,
+"Superblock has bad magic number 0x%x. Not an XFS filesystem?",
+                       be32_to_cpu(dsb->sb_magicnum));
                 return -EWRONGFS;
         }
  
         if (!xfs_sb_good_version(sbp)) {
-               xfs_warn(mp, "bad version");
+               xfs_warn(mp,
+"Superblock has unknown features enabled or corrupted feature masks.");
                 return -EWRONGFS;
         }
  
@@ -911,6 +964,11 @@ xfs_log_sb(
          * reservations that have been taken out percpu counters. If we have an
          * unclean shutdown, this will be corrected by log recovery rebuilding
          * the counters from the AGF block counts.
+        *
+        * Do not update sb_frextents here because it is not part of the lazy
+        * sb counters, despite having a percpu counter. It is always kept
+        * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
+        * and hence we don't need to update it here.
          */
         if (xfs_has_lazysbcount(mp)) {
                 mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
@@ -1135,6 +1193,8 @@ xfs_fs_geometry(
         } else {
                 geo->logsectsize = BBSIZE;
         }
+       if (xfs_has_large_extent_counts(mp))
+               geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
         geo->rtsectsize = sbp->sb_blocksize;
         geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
  
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h

index 25c4cab..c438138 100644 (file)
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -54,13 +54,23 @@ void        xfs_log_get_max_trans_res(struct xfs_mount *mp,
  /*
   * Values for t_flags.
   */
-#define        XFS_TRANS_DIRTY         0x01    /* something needs to be logged */
-#define        XFS_TRANS_SB_DIRTY      0x02    /* superblock is modified */
-#define        XFS_TRANS_PERM_LOG_RES  0x04    /* xact took a permanent log res */
-#define        XFS_TRANS_SYNC          0x08    /* make commit synchronous */
-#define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
-#define XFS_TRANS_NO_WRITECOUNT 0x40   /* do not elevate SB writecount */
-#define XFS_TRANS_RES_FDBLKS   0x80    /* reserve newly freed blocks */
+/* Transaction needs to be logged */
+#define XFS_TRANS_DIRTY                        (1u << 0)
+/* Superblock is dirty and needs to be logged */
+#define XFS_TRANS_SB_DIRTY             (1u << 1)
+/* Transaction took a permanent log reservation */
+#define XFS_TRANS_PERM_LOG_RES         (1u << 2)
+/* Synchronous transaction commit needed */
+#define XFS_TRANS_SYNC                 (1u << 3)
+/* Transaction can use reserve block pool */
+#define XFS_TRANS_RESERVE              (1u << 4)
+/* Transaction should avoid VFS level superblock write accounting */
+#define XFS_TRANS_NO_WRITECOUNT                (1u << 5)
+/* Transaction has freed blocks returned to its reservation */
+#define XFS_TRANS_RES_FDBLKS           (1u << 6)
+/* Transaction contains an intent done log item */
+#define XFS_TRANS_HAS_INTENT_DONE      (1u << 7)
+
  /*
   * LOWMODE is used by the allocator to activate the lowspace algorithm - when
   * free space is running low the extent allocator may choose to allocate an
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c

index 6f83d9b..e9913c2 100644 (file)
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -56,15 +56,14 @@ xfs_calc_buf_res(
   * Per-extent log reservation for the btree changes involved in freeing or
   * allocating an extent.  In classic XFS there were two trees that will be
   * modified (bnobt + cntbt).  With rmap enabled, there are three trees
- * (rmapbt).  With reflink, there are four trees (refcountbt).  The number of
- * blocks reserved is based on the formula:
+ * (rmapbt).  The number of blocks reserved is based on the formula:
   *
   * num trees * ((2 blocks/level * max depth) - 1)
   *
   * Keep in mind that max depth is calculated separately for each type of tree.
   */
  uint
-xfs_allocfree_log_count(
+xfs_allocfree_block_count(
         struct xfs_mount *mp,
         uint            num_ops)
  {
@@ -73,12 +72,23 @@ xfs_allocfree_log_count(
         blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
         if (xfs_has_rmapbt(mp))
                 blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
-       if (xfs_has_reflink(mp))
-               blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
  
         return blocks;
  }
  
+/*
+ * Per-extent log reservation for refcount btree changes.  These are never done
+ * in the same transaction as an allocation or a free, so we compute them
+ * separately.
+ */
+static unsigned int
+xfs_refcountbt_block_count(
+       struct xfs_mount        *mp,
+       unsigned int            num_ops)
+{
+       return num_ops * (2 * mp->m_refc_maxlevels - 1);
+}
+
  /*
   * Logging inodes is really tricksy. They are logged in memory format,
   * which means that what we write into the log doesn't directly translate into
@@ -136,7 +146,7 @@ xfs_calc_inobt_res(
  {
         return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
                         XFS_FSB_TO_B(mp, 1)) +
-                               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+                               xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
                         XFS_FSB_TO_B(mp, 1));
  }
  
@@ -183,7 +193,7 @@ xfs_calc_inode_chunk_res(
  {
         uint                    res, size = 0;
  
-       res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+       res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
                                XFS_FSB_TO_B(mp, 1));
         if (alloc) {
                 /* icreate tx uses ordered buffers */
@@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res(
  /*
   * Per-extent log reservation for the btree changes involved in freeing or
   * allocating a realtime extent.  We have to be able to log as many rtbitmap
- * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
- * as well as the realtime summary block.
+ * blocks as needed to mark inuse XFS_MAX_BMBT_EXTLEN blocks' worth of realtime
+ * extents, as well as the realtime summary block.
   */
  static unsigned int
-xfs_rtalloc_log_count(
+xfs_rtalloc_block_count(
         struct xfs_mount        *mp,
         unsigned int            num_ops)
  {
         unsigned int            blksz = XFS_FSB_TO_B(mp, 1);
         unsigned int            rtbmp_bytes;
  
-       rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+       rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
         return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
  }
  
@@ -233,6 +243,28 @@ xfs_rtalloc_log_count(
   * register overflow from temporaries in the calculations.
   */
  
+/*
+ * Compute the log reservation required to handle the refcount update
+ * transaction.  Refcount updates are always done via deferred log items.
+ *
+ * This is calculated as:
+ * Data device refcount updates (t1):
+ *    the agfs of the ags containing the blocks: nr_ops * sector size
+ *    the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+static unsigned int
+xfs_calc_refcountbt_reservation(
+       struct xfs_mount        *mp,
+       unsigned int            nr_ops)
+{
+       unsigned int            blksz = XFS_FSB_TO_B(mp, 1);
+
+       if (!xfs_has_reflink(mp))
+               return 0;
+
+       return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+              xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+}
  
  /*
   * In a write transaction we can allocate a maximum of 2
@@ -247,7 +279,7 @@ xfs_rtalloc_log_count(
   *    the inode's bmap btree: max depth * block size
   *    the agfs of the ags from which the extents are allocated: 2 * sector
   *    the superblock free block counter: sector size
- *    the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ *    the realtime bitmap: ((XFS_MAX_BMBT_EXTLEN / rtextsize) / NBBY) bytes
   *    the realtime summary: 1 block
   *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
   * And the bmap_finish transaction can free bmap blocks in a join (t3):
@@ -255,34 +287,65 @@ xfs_rtalloc_log_count(
   *    the agfls of the ags containing the blocks: 2 * sector size
   *    the super block free block counter: sector size
   *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
   */
  STATIC uint
  xfs_calc_write_reservation(
-       struct xfs_mount        *mp)
+       struct xfs_mount        *mp,
+       bool                    for_minlogsize)
  {
-       unsigned int            t1, t2, t3;
+       unsigned int            t1, t2, t3, t4;
         unsigned int            blksz = XFS_FSB_TO_B(mp, 1);
  
         t1 = xfs_calc_inode_res(mp, 1) +
              xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
              xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-            xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+            xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
  
         if (xfs_has_realtime(mp)) {
                 t2 = xfs_calc_inode_res(mp, 1) +
                      xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
                                      blksz) +
                      xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) +
-                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz);
+                    xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) +
+                    xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz);
         } else {
                 t2 = 0;
         }
  
         t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-            xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+            xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
  
-       return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+       /*
+        * In the early days of reflink, we included enough reservation to log
+        * two refcountbt splits for each transaction.  The codebase runs
+        * refcountbt updates in separate transactions now, so to compute the
+        * minimum log size, add the refcountbtree splits back to t1 and t3 and
+        * do not account them separately as t4.  Reflink did not support
+        * realtime when the reservations were established, so no adjustment to
+        * t2 is needed.
+        */
+       if (for_minlogsize) {
+               unsigned int    adj = 0;
+
+               if (xfs_has_reflink(mp))
+                       adj = xfs_calc_buf_res(
+                                       xfs_refcountbt_block_count(mp, 2),
+                                       blksz);
+               t1 += adj;
+               t3 += adj;
+               return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+       }
+
+       t4 = xfs_calc_refcountbt_reservation(mp, 1);
+       return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_write_reservation_minlogsize(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_write_reservation(mp, true);
  }
  
  /*
@@ -299,33 +362,62 @@ xfs_calc_write_reservation(
   *    the agf for each of the ags: 2 * sector size
   *    the agfl for each of the ags: 2 * sector size
   *    the super block to reflect the freed blocks: sector size
- *    the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ *    the realtime bitmap:
+ *             2 exts * ((XFS_MAX_BMBT_EXTLEN / rtextsize) / NBBY) bytes
   *    the realtime summary: 2 exts * 1 block
   *    worst case split in allocation btrees per extent assuming 2 extents:
   *             2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
   */
  STATIC uint
  xfs_calc_itruncate_reservation(
-       struct xfs_mount        *mp)
+       struct xfs_mount        *mp,
+       bool                    for_minlogsize)
  {
-       unsigned int            t1, t2, t3;
+       unsigned int            t1, t2, t3, t4;
         unsigned int            blksz = XFS_FSB_TO_B(mp, 1);
  
         t1 = xfs_calc_inode_res(mp, 1) +
              xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
  
         t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-            xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz);
+            xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
  
         if (xfs_has_realtime(mp)) {
                 t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) +
-                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+                    xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
+                    xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
         } else {
                 t3 = 0;
         }
  
-       return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+       /*
+        * In the early days of reflink, we included enough reservation to log
+        * four refcountbt splits in the same transaction as bnobt/cntbt
+        * updates.  The codebase runs refcountbt updates in separate
+        * transactions now, so to compute the minimum log size, add the
+        * refcount btree splits back here and do not compute them separately
+        * as t4.  Reflink did not support realtime when the reservations were
+        * established, so do not adjust t3.
+        */
+       if (for_minlogsize) {
+               if (xfs_has_reflink(mp))
+                       t2 += xfs_calc_buf_res(
+                                       xfs_refcountbt_block_count(mp, 4),
+                                       blksz);
+
+               return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+       }
+
+       t4 = xfs_calc_refcountbt_reservation(mp, 2);
+       return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_itruncate_reservation_minlogsize(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_itruncate_reservation(mp, true);
  }
  
  /*
@@ -349,7 +441,7 @@ xfs_calc_rename_reservation(
                      xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
                                       XFS_FSB_TO_B(mp, 1))),
                     (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3),
+                    xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
                                       XFS_FSB_TO_B(mp, 1))));
  }
  
@@ -389,7 +481,7 @@ xfs_calc_link_reservation(
                      xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
                                       XFS_FSB_TO_B(mp, 1))),
                     (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+                    xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
                                       XFS_FSB_TO_B(mp, 1))));
  }
  
@@ -427,7 +519,7 @@ xfs_calc_remove_reservation(
                      xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
                                       XFS_FSB_TO_B(mp, 1))),
                     (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+                    xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
                                       XFS_FSB_TO_B(mp, 1))));
  }
  
@@ -572,7 +664,7 @@ xfs_calc_growdata_reservation(
         struct xfs_mount        *mp)
  {
         return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+               xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
                                  XFS_FSB_TO_B(mp, 1));
  }
  
@@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation(
                 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
                                  XFS_FSB_TO_B(mp, 1)) +
                 xfs_calc_inode_res(mp, 1) +
-               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+               xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
                                  XFS_FSB_TO_B(mp, 1));
  }
  
@@ -670,7 +762,7 @@ xfs_calc_addafork_reservation(
                 xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
                 xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
                                  XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+               xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
                                  XFS_FSB_TO_B(mp, 1));
  }
  
@@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation(
                     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
                                      XFS_FSB_TO_B(mp, 1))),
                    (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-                   xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
+                   xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4),
                                      XFS_FSB_TO_B(mp, 1))));
  }
  
@@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation(
                                         XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
                      xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
                     (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+                    xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
                                       XFS_FSB_TO_B(mp, 1))));
  }
  
@@ -791,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void)
   */
  STATIC uint
  xfs_calc_qm_dqalloc_reservation(
-       struct xfs_mount        *mp)
+       struct xfs_mount        *mp,
+       bool                    for_minlogsize)
  {
-       return xfs_calc_write_reservation(mp) +
+       return xfs_calc_write_reservation(mp, for_minlogsize) +
                 xfs_calc_buf_res(1,
                         XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
  }
  
+unsigned int
+xfs_calc_qm_dqalloc_reservation_minlogsize(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_qm_dqalloc_reservation(mp, true);
+}
+
  /*
   * Syncing the incore super block changes to disk.
   *     the super block to reflect the changes: sector size
@@ -814,36 +914,18 @@ xfs_trans_resv_calc(
         struct xfs_mount        *mp,
         struct xfs_trans_resv   *resp)
  {
-       unsigned int            rmap_maxlevels = mp->m_rmap_maxlevels;
-
-       /*
-        * In the early days of rmap+reflink, we always set the rmap maxlevels
-        * to 9 even if the AG was small enough that it would never grow to
-        * that height.  Transaction reservation sizes influence the minimum
-        * log size calculation, which influences the size of the log that mkfs
-        * creates.  Use the old value here to ensure that newly formatted
-        * small filesystems will mount on older kernels.
-        */
-       if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
-               mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+       int                     logcount_adj = 0;
  
         /*
          * The following transactions are logged in physical format and
          * require a permanent reservation on space.
          */
-       resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
-       if (xfs_has_reflink(mp))
-               resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
-       else
-               resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+       resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+       resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
         resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
  
-       resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
-       if (xfs_has_reflink(mp))
-               resp->tr_itruncate.tr_logcount =
-                               XFS_ITRUNCATE_LOG_COUNT_REFLINK;
-       else
-               resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+       resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+       resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
         resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
  
         resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -899,11 +981,9 @@ xfs_trans_resv_calc(
         resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
         resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
  
-       resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
-       if (xfs_has_reflink(mp))
-               resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
-       else
-               resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+       resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp,
+                       false);
+       resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
         resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
  
         /*
@@ -930,6 +1010,19 @@ xfs_trans_resv_calc(
         resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
         resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
  
-       /* Put everything back the way it was.  This goes at the end. */
-       mp->m_rmap_maxlevels = rmap_maxlevels;
+       /*
+        * Add one logcount for BUI items that appear with rmap or reflink,
+        * one logcount for refcount intent items, and one logcount for rmap
+        * intent items.
+        */
+       if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp))
+               logcount_adj++;
+       if (xfs_has_reflink(mp))
+               logcount_adj++;
+       if (xfs_has_rmapbt(mp))
+               logcount_adj++;
+
+       resp->tr_itruncate.tr_logcount += logcount_adj;
+       resp->tr_write.tr_logcount += logcount_adj;
+       resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
  }
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h

index fc4e9b3..0554b9d 100644 (file)
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,7 +73,6 @@ struct xfs_trans_resv {
  #define        XFS_DEFAULT_LOG_COUNT           1
  #define        XFS_DEFAULT_PERM_LOG_COUNT      2
  #define        XFS_ITRUNCATE_LOG_COUNT         2
-#define        XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
  #define XFS_INACTIVE_LOG_COUNT         2
  #define        XFS_CREATE_LOG_COUNT            2
  #define        XFS_CREATE_TMPFILE_LOG_COUNT    2
@@ -83,13 +82,24 @@ struct xfs_trans_resv {
  #define        XFS_LINK_LOG_COUNT              2
  #define        XFS_RENAME_LOG_COUNT            2
  #define        XFS_WRITE_LOG_COUNT             2
-#define        XFS_WRITE_LOG_COUNT_REFLINK     8
  #define        XFS_ADDAFORK_LOG_COUNT          2
  #define        XFS_ATTRINVAL_LOG_COUNT         1
  #define        XFS_ATTRSET_LOG_COUNT           3
  #define        XFS_ATTRRM_LOG_COUNT            3
  
+/*
+ * Original log operation counts were overestimated in the early days of
+ * reflink.  These are retained here purely for minimum log size calculations
+ * and must not be used for runtime reservations.
+ */
+#define        XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
+#define        XFS_WRITE_LOG_COUNT_REFLINK     8
+
  void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
-uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
+uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+
+unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
  
  #endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h

index b6da06b..373f64a 100644 (file)
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -12,8 +12,8 @@ typedef uint32_t      xfs_agblock_t;  /* blockno in alloc. group */
  typedef uint32_t       xfs_agino_t;    /* inode # within allocation grp */
  typedef uint32_t       xfs_extlen_t;   /* extent length in blocks */
  typedef uint32_t       xfs_agnumber_t; /* allocation group number */
-typedef int32_t                xfs_extnum_t;   /* # of extents in a file */
-typedef int16_t                xfs_aextnum_t;  /* # extents in an attribute fork */
+typedef uint64_t       xfs_extnum_t;   /* # of extents in a file */
+typedef uint32_t       xfs_aextnum_t;  /* # extents in an attribute fork */
  typedef int64_t                xfs_fsize_t;    /* bytes in a file */
  typedef uint64_t       xfs_ufsize_t;   /* unsigned bytes in a file */
  
@@ -56,13 +56,6 @@ typedef void *               xfs_failaddr_t;
  #define        NULLFSINO       ((xfs_ino_t)-1)
  #define        NULLAGINO       ((xfs_agino_t)-1)
  
-/*
- * Max values for extlen, extnum, aextnum.
- */
-#define        MAXEXTLEN       ((xfs_extlen_t)0x001fffff)      /* 21 bits */
-#define        MAXEXTNUM       ((xfs_extnum_t)0x7fffffff)      /* signed int */
-#define        MAXAEXTNUM      ((xfs_aextnum_t)0x7fff)         /* signed short */
-
  /*
   * Minimum and maximum blocksize and sectorsize.
   * The blocksize upper limit is pretty much arbitrary.
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c

index a4cbbc3..285995b 100644 (file)
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -133,29 +133,13 @@ xchk_bmap_get_rmap(
         if (info->is_shared) {
                 error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
                                 owner, offset, rflags, rmap, &has_rmap);
-               if (!xchk_should_check_xref(info->sc, &error,
-                               &info->sc->sa.rmap_cur))
-                       return false;
-               goto out;
+       } else {
+               error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
+                               owner, offset, rflags, rmap, &has_rmap);
         }
-
-       /*
-        * Otherwise, use the (faster) regular lookup.
-        */
-       error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner,
-                       offset, rflags, &has_rmap);
-       if (!xchk_should_check_xref(info->sc, &error,
-                       &info->sc->sa.rmap_cur))
+       if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
                 return false;
-       if (!has_rmap)
-               goto out;
  
-       error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap);
-       if (!xchk_should_check_xref(info->sc, &error,
-                       &info->sc->sa.rmap_cur))
-               return false;
-
-out:
         if (!has_rmap)
                 xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
                         irec->br_startoff);
@@ -350,7 +334,7 @@ xchk_bmap_iextent(
                                 irec->br_startoff);
  
         /* Make sure the extent points to a valid place. */
-       if (irec->br_blockcount > MAXEXTLEN)
+       if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
                 xchk_fblock_set_corrupt(info->sc, info->whichfork,
                                 irec->br_startoff);
         if (info->is_rt &&
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c

index bf1f360..97b54ac 100644 (file)
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -23,6 +23,8 @@
  #include "xfs_rmap_btree.h"
  #include "xfs_log.h"
  #include "xfs_trans_priv.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
  #include "xfs_attr.h"
  #include "xfs_reflink.h"
  #include "xfs_ag.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c

index eac15af..51820b4 100644 (file)
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -232,7 +232,8 @@ xchk_dinode(
         size_t                  fork_recs;
         unsigned long long      isize;
         uint64_t                flags2;
-       uint32_t                nextents;
+       xfs_extnum_t            nextents;
+       xfs_extnum_t            naextents;
         prid_t                  prid;
         uint16_t                flags;
         uint16_t                mode;
@@ -390,8 +391,10 @@ xchk_dinode(
  
         xchk_inode_extsize(sc, dip, ino, mode, flags);
  
+       nextents = xfs_dfork_data_extents(dip);
+       naextents = xfs_dfork_attr_extents(dip);
+
         /* di_nextents */
-       nextents = be32_to_cpu(dip->di_nextents);
         fork_recs =  XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
         switch (dip->di_format) {
         case XFS_DINODE_FMT_EXTENTS:
@@ -411,7 +414,7 @@ xchk_dinode(
         /* di_forkoff */
         if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
                 xchk_ino_set_corrupt(sc, ino);
-       if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+       if (naextents != 0 && dip->di_forkoff == 0)
                 xchk_ino_set_corrupt(sc, ino);
         if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
                 xchk_ino_set_corrupt(sc, ino);
@@ -423,19 +426,18 @@ xchk_dinode(
                 xchk_ino_set_corrupt(sc, ino);
  
         /* di_anextents */
-       nextents = be16_to_cpu(dip->di_anextents);
         fork_recs =  XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
         switch (dip->di_aformat) {
         case XFS_DINODE_FMT_EXTENTS:
-               if (nextents > fork_recs)
+               if (naextents > fork_recs)
                         xchk_ino_set_corrupt(sc, ino);
                 break;
         case XFS_DINODE_FMT_BTREE:
-               if (nextents <= fork_recs)
+               if (naextents <= fork_recs)
                         xchk_ino_set_corrupt(sc, ino);
                 break;
         default:
-               if (nextents != 0)
+               if (naextents != 0)
                         xchk_ino_set_corrupt(sc, ino);
         }
  
@@ -513,14 +515,14 @@ xchk_inode_xref_bmap(
                         &nextents, &count);
         if (!xchk_should_check_xref(sc, &error, NULL))
                 return;
-       if (nextents < be32_to_cpu(dip->di_nextents))
+       if (nextents < xfs_dfork_data_extents(dip))
                 xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
  
         error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
                         &nextents, &acount);
         if (!xchk_should_check_xref(sc, &error, NULL))
                 return;
-       if (nextents != be16_to_cpu(dip->di_anextents))
+       if (nextents != xfs_dfork_attr_extents(dip))
                 xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
  
         /* Check nblocks against the inode. */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c

index 8fa0120..0a3bde6 100644 (file)
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -40,6 +40,7 @@ xchk_setup_rt(
  /* Scrub a free extent record from the realtime bitmap. */
  STATIC int
  xchk_rtbitmap_rec(
+       struct xfs_mount        *mp,
         struct xfs_trans        *tp,
         const struct xfs_rtalloc_rec *rec,
         void                    *priv)
@@ -48,10 +49,10 @@ xchk_rtbitmap_rec(
         xfs_rtblock_t           startblock;
         xfs_rtblock_t           blockcount;
  
-       startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize;
-       blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize;
+       startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+       blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
  
-       if (!xfs_verify_rtext(sc->mp, startblock, blockcount))
+       if (!xfs_verify_rtext(mp, startblock, blockcount))
                 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
         return 0;
  }
@@ -114,7 +115,7 @@ xchk_rtbitmap(
         if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
                 return error;
  
-       error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc);
+       error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
         if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
                 goto out;
  
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c

index 5c52ee8..3df9c17 100644 (file)
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -10,12 +10,12 @@
  #include "xfs_trans_resv.h"
  #include "xfs_mount.h"
  #include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
  #include "xfs_attr.h"
  #include "xfs_trace.h"
  #include "xfs_error.h"
  #include "xfs_acl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
  #include "xfs_trans.h"
  
  #include <linux/posix_acl_xattr.h>
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h

index bb6abdc..263404d 100644 (file)
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -16,11 +16,13 @@ extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
  extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
  void xfs_forget_acl(struct inode *inode, const char *name);
  #else
-static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu)
+#define xfs_get_acl NULL
+#define xfs_set_acl NULL
+static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl,
+                               int type)
  {
-       return NULL;
+       return 0;
  }
-# define xfs_set_acl                                   NULL
  static inline void xfs_forget_acl(struct inode *inode, const char *name)
  {
  }
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c

new file mode 100644 (file)

index 0000000..e8ac88d
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.c
@@ -0,0 +1,824 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022 Oracle.  All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_item.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_trans_space.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+static const struct xfs_item_ops xfs_attri_item_ops;
+static const struct xfs_item_ops xfs_attrd_item_ops;
+static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
+                                       struct xfs_attri_log_item *attrip);
+
+static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
+{
+       struct xfs_attri_log_item       *attrip;
+
+       /* lip is the attri_item member embedded in an xfs_attri_log_item. */
+       attrip = container_of(lip, struct xfs_attri_log_item, attri_item);
+       return attrip;
+}
+
+/* Free an ATTRI log item along with the shadow log vector hanging off it. */
+STATIC void
+xfs_attri_item_free(
+       struct xfs_attri_log_item       *attrip)
+{
+       struct xfs_log_item             *lip = &attrip->attri_item;
+
+       kmem_free(lip->li_lv_shadow);
+       kvfree(attrip);
+}
+
+/*
+ * Drop one reference to the ATTRI.  Only the caller that drops the last
+ * reference removes the item from the AIL and frees it.  The reference count
+ * is needed because the ATTRI may not yet have been placed in the AIL when
+ * xfs_attri_release() is called from ATTRD processing, due to the ordering of
+ * committed vs unpin operations in bulk insert operations.
+ */
+STATIC void
+xfs_attri_release(
+       struct xfs_attri_log_item       *attrip)
+{
+       ASSERT(atomic_read(&attrip->attri_refcount) > 0);
+
+       if (atomic_dec_and_test(&attrip->attri_refcount)) {
+               /* Last reference is gone: tear the item down. */
+               xfs_trans_ail_delete(&attrip->attri_item, 0);
+               xfs_attri_item_free(attrip);
+       }
+}
+
+/*
+ * Report how many iovecs and bytes are needed to log this ATTRI.  The format
+ * structure and the name are always counted; the value is only counted when
+ * its length is non-zero.
+ */
+STATIC void
+xfs_attri_item_size(
+       struct xfs_log_item             *lip,
+       int                             *nvecs,
+       int                             *nbytes)
+{
+       struct xfs_attri_log_item       *attrip = ATTRI_ITEM(lip);
+
+       /* Format structure plus the attribute name. */
+       *nvecs += 2;
+       *nbytes += sizeof(struct xfs_attri_log_format) +
+                       xlog_calc_iovec_len(attrip->attri_name_len);
+
+       /* Optional attribute value. */
+       if (attrip->attri_value_len) {
+               *nvecs += 1;
+               *nbytes += xlog_calc_iovec_len(attrip->attri_value_len);
+       }
+}
+
+/*
+ * Fill in the log iovecs for the given attri log item: one iovec for the
+ * attri format structure, one for the name, and a third for the value when
+ * one is present.
+ */
+STATIC void
+xfs_attri_item_format(
+       struct xfs_log_item             *lip,
+       struct xfs_log_vec              *lv)
+{
+       struct xfs_attri_log_item       *attrip = ATTRI_ITEM(lip);
+       struct xfs_attri_log_format     *fmt = &attrip->attri_format;
+       struct xfs_log_iovec            *vecp = NULL;
+       bool                            has_value = attrip->attri_value_len > 0;
+
+       ASSERT(attrip->attri_name_len > 0);
+
+       /*
+        * The region count must be final before the format structure is
+        * copied into the iovec.  Otherwise the wrong size is recorded in the
+        * log and log recovery later trips over assertion checks for bad
+        * region sizes.
+        */
+       fmt->alfi_type = XFS_LI_ATTRI;
+       fmt->alfi_size = has_value ? 3 : 2;
+
+       xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, fmt,
+                       sizeof(struct xfs_attri_log_format));
+       xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME,
+                       attrip->attri_name, attrip->attri_name_len);
+       if (has_value)
+               xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE,
+                               attrip->attri_value,
+                               attrip->attri_value_len);
+}
+
+/*
+ * Unpin is the last place the log touches an ATTRI: the item is either
+ * inserted into the AIL or aborted because of a log I/O error.  Either way
+ * the ATTRI transaction was committed to get this far, so whoever committed
+ * the ATTRI is expected to either construct and commit the ATTRD or drop the
+ * ATTRD's reference on error.  All that is left to do here is drop the log's
+ * own ATTRI reference.  The remove argument is not consulted.
+ */
+STATIC void
+xfs_attri_item_unpin(
+       struct xfs_log_item     *lip,
+       int                     remove)
+{
+       struct xfs_attri_log_item       *attrip = ATTRI_ITEM(lip);
+
+       xfs_attri_release(attrip);
+}
+
+
+/* Release callback for an ATTRI log item: drop one reference to it. */
+STATIC void
+xfs_attri_item_release(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_attri_log_item       *attrip = ATTRI_ITEM(lip);
+
+       xfs_attri_release(attrip);
+}
+
+/*
+ * Allocate and initialize an attri item.  When a name and/or value length is
+ * given, the item is allocated with a trailing buffer of name_len + value_len
+ * bytes and attri_name / attri_value point into it (name first, value right
+ * after it).  A zero length leaves the matching pointer NULL.  The item
+ * starts out with a reference count of two.
+ */
+STATIC struct xfs_attri_log_item *
+xfs_attri_init(
+       struct xfs_mount                *mp,
+       uint32_t                        name_len,
+       uint32_t                        value_len)
+
+{
+       struct xfs_attri_log_item       *attrip;
+       uint32_t                        buffer_size = name_len + value_len;
+       char                            *tail;
+
+       if (!buffer_size) {
+               attrip = kmem_cache_alloc(xfs_attri_cache,
+                                       GFP_NOFS | __GFP_NOFAIL);
+       } else {
+               /*
+                * This could be over 64kB in length, so we have to use
+                * kvmalloc() for this. But kvmalloc() utterly sucks, so we
+                * use own version.
+                */
+               attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) +
+                                       buffer_size);
+       }
+       memset(attrip, 0, sizeof(struct xfs_attri_log_item));
+
+       /* First byte after the item itself, i.e. start of the name buffer. */
+       tail = (char *)attrip + sizeof(struct xfs_attri_log_item);
+
+       attrip->attri_name_len = name_len;
+       attrip->attri_name = name_len ? tail : NULL;
+
+       attrip->attri_value_len = value_len;
+       attrip->attri_value = value_len ? tail + name_len : NULL;
+
+       xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI,
+                         &xfs_attri_item_ops);
+       attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip;
+       atomic_set(&attrip->attri_refcount, 2);
+
+       return attrip;
+}
+
+/*
+ * Copy an attr format structure out of the given log iovec into the
+ * destination attr format structure.  The iovec must be exactly the size of
+ * struct xfs_attri_log_format; anything else is reported and rejected with
+ * -EFSCORRUPTED.
+ */
+STATIC int
+xfs_attri_copy_format(
+       struct xfs_log_iovec            *buf,
+       struct xfs_attri_log_format     *dst_attr_fmt)
+{
+       const size_t                    len = sizeof(struct xfs_attri_log_format);
+
+       if (buf->i_len == len) {
+               memcpy(dst_attr_fmt, buf->i_addr, len);
+               return 0;
+       }
+
+       XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+       return -EFSCORRUPTED;
+}
+
+static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
+{
+       struct xfs_attrd_log_item       *attrdp;
+
+       /* lip is the attrd_item member embedded in an xfs_attrd_log_item. */
+       attrdp = container_of(lip, struct xfs_attrd_log_item, attrd_item);
+       return attrdp;
+}
+
+/* Free an ATTRD log item along with the shadow log vector hanging off it. */
+STATIC void
+xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
+{
+       struct xfs_log_item     *lip = &attrdp->attrd_item;
+
+       kmem_free(lip->li_lv_shadow);
+       kmem_free(attrdp);
+}
+
+/* An ATTRD is logged as a single iovec holding just its format structure. */
+STATIC void
+xfs_attrd_item_size(
+       struct xfs_log_item             *lip,
+       int                             *nvecs,
+       int                             *nbytes)
+{
+       *nbytes += sizeof(struct xfs_attrd_log_format);
+       *nvecs += 1;
+}
+
+/*
+ * Fill in the log iovecs for the given attrd log item.  Only one iovec is
+ * used, and it carries the attrd format structure embedded in the attrd item.
+ */
+STATIC void
+xfs_attrd_item_format(
+       struct xfs_log_item     *lip,
+       struct xfs_log_vec      *lv)
+{
+       struct xfs_attrd_log_item       *attrdp = ATTRD_ITEM(lip);
+       struct xfs_attrd_log_format     *fmt = &attrdp->attrd_format;
+       struct xfs_log_iovec            *vecp = NULL;
+
+       fmt->alfd_type = XFS_LI_ATTRD;
+       fmt->alfd_size = 1;
+
+       xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT, fmt,
+                       sizeof(struct xfs_attrd_log_format));
+}
+
+/*
+ * The ATTRD is either committed or aborted if the transaction is canceled.
+ * Releasing it drops the reference it holds on its ATTRI and then frees the
+ * ATTRD itself.
+ */
+STATIC void
+xfs_attrd_item_release(
+       struct xfs_log_item             *lip)
+{
+       struct xfs_attrd_log_item       *attrdp = ATTRD_ITEM(lip);
+       struct xfs_attri_log_item       *attrip = attrdp->attrd_attrip;
+
+       xfs_attri_release(attrip);
+       xfs_attrd_item_free(attrdp);
+}
+
+/* Return the log item of the ATTRI that this ATTRD completes. */
+static struct xfs_log_item *
+xfs_attrd_item_intent(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_attri_log_item       *attrip = ATTRD_ITEM(lip)->attrd_attrip;
+
+       return &attrip->attri_item;
+}
+
+/*
+ * Perform one step of an attribute update intent and mark the attrd item
+ * dirty.  An attr operation may be a set or a remove.  The transaction is
+ * marked dirty regardless of whether the operation succeeds or fails, to
+ * support the ATTRI/ATTRD lifecycle rules.
+ *
+ * Returns 0 once the operation has reached XFS_DAS_DONE, -EAGAIN when more
+ * steps remain, -EIO when the XFS_ERRTAG_LARP error injection fires, or the
+ * error returned by xfs_attr_set_iter().
+ */
+STATIC int
+xfs_xattri_finish_update(
+       struct xfs_attr_item            *attr,
+       struct xfs_attrd_log_item       *attrdp)
+{
+       struct xfs_da_args              *args = attr->xattri_da_args;
+       int                             error = -EIO;
+
+       if (!XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+               error = xfs_attr_set_iter(attr);
+               if (error == 0 && attr->xattri_dela_state != XFS_DAS_DONE)
+                       error = -EAGAIN;
+       }
+
+       /*
+        * Mark the transaction dirty, even on error. This ensures the
+        * transaction is aborted, which:
+        *
+        * 1.) releases the ATTRI and frees the ATTRD
+        * 2.) shuts down the filesystem
+        */
+       args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+
+       /*
+        * attr intent/done items are null when logged attributes are disabled
+        */
+       if (attrdp != NULL)
+               set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+       return error;
+}
+
+/*
+ * Log an attr to the intent item: mark the transaction and the ATTRI dirty,
+ * then fill in the ATTRI log format structure and copy the attribute name and
+ * (if there is one) value from the xfs_attr_item's da_args into the ATTRI's
+ * own buffers.
+ */
+STATIC void
+xfs_attr_log_item(
+       struct xfs_trans                *tp,
+       struct xfs_attri_log_item       *attrip,
+       struct xfs_attr_item            *attr)
+{
+       struct xfs_da_args              *args = attr->xattri_da_args;
+       struct xfs_attri_log_format     *attrp;
+
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
+
+       /*
+        * At this point the xfs_attr_item has been constructed, and we've
+        * created the log intent. Fill in the attri log item and log format
+        * structure with fields from this xfs_attr_item
+        */
+       attrp = &attrip->attri_format;
+       attrp->alfi_ino = args->dp->i_ino;
+       attrp->alfi_op_flags = attr->xattri_op_flags;
+       attrp->alfi_value_len = args->valuelen;
+       attrp->alfi_name_len = args->namelen;
+       attrp->alfi_attr_flags = args->attr_filter;
+
+       memcpy(attrip->attri_name, args->name, args->namelen);
+
+       /*
+        * xfs_attri_init() leaves attri_value NULL when there is no value
+        * (e.g. a remove), so only copy when there is something to copy,
+        * matching xfs_attri_item_format() and xfs_attri_item_relog().
+        */
+       if (args->valuelen > 0)
+               memcpy(attrip->attri_value, args->value, args->valuelen);
+
+       attrip->attri_name_len = args->namelen;
+       attrip->attri_value_len = args->valuelen;
+}
+
+/*
+ * Get an ATTRI for the (single) attr item on the list, add it to the
+ * transaction and log the attr into it.  Returns NULL when the superblock
+ * does not have the logged xattrs feature, in which case no intent is logged.
+ */
+static struct xfs_log_item *
+xfs_attr_create_intent(
+       struct xfs_trans                *tp,
+       struct list_head                *items,
+       unsigned int                    count,
+       bool                            sort)
+{
+       struct xfs_mount                *mp = tp->t_mountp;
+       struct xfs_attri_log_item       *attrip = NULL;
+       struct xfs_attr_item            *attr;
+
+       ASSERT(count == 1);
+
+       if (!xfs_sb_version_haslogxattrs(&mp->m_sb))
+               return NULL;
+
+       /*
+        * Each attr item only performs one attribute operation at a time, so
+        * this is a list of one
+        */
+       list_for_each_entry(attr, items, xattri_list) {
+               attrip = xfs_attri_init(mp, attr->xattri_da_args->namelen,
+                                       attr->xattri_da_args->valuelen);
+               if (attrip == NULL)
+                       return NULL;
+
+               xfs_trans_add_item(tp, &attrip->attri_item);
+               xfs_attr_log_item(tp, attrip, attr);
+       }
+
+       /*
+        * Never hand back a pointer derived from an unset attrip if the list
+        * unexpectedly turned out to be empty.
+        */
+       if (attrip == NULL)
+               return NULL;
+
+       return &attrip->attri_item;
+}
+
+/*
+ * Process an attr: run one step of the attr operation in the given
+ * transaction.  The attr item is freed unless the step returned -EAGAIN,
+ * i.e. unless there is more work to do on it.
+ */
+STATIC int
+xfs_attr_finish_item(
+       struct xfs_trans                *tp,
+       struct xfs_log_item             *done,
+       struct list_head                *item,
+       struct xfs_btree_cur            **state)
+{
+       struct xfs_attr_item            *attr =
+               container_of(item, struct xfs_attr_item, xattri_list);
+       struct xfs_attrd_log_item       *attrdp = done ? ATTRD_ITEM(done) : NULL;
+       int                             error;
+
+       /*
+        * Always reset trans after EAGAIN cycle
+        * since the transaction is new
+        */
+       attr->xattri_da_args->trans = tp;
+
+       error = xfs_xattri_finish_update(attr, attrdp);
+       if (error == -EAGAIN)
+               return error;
+
+       kmem_free(attr);
+       return error;
+}
+
+/* Abort a pending ATTR intent by dropping one reference to its ATTRI. */
+STATIC void
+xfs_attr_abort_intent(
+       struct xfs_log_item             *intent)
+{
+       struct xfs_attri_log_item       *attrip = ATTRI_ITEM(intent);
+
+       xfs_attri_release(attrip);
+}
+
+/* Cancel an attr: free the xfs_attr_item that embeds this list node. */
+STATIC void
+xfs_attr_cancel_item(
+       struct list_head                *item)
+{
+       kmem_free(container_of(item, struct xfs_attr_item, xattri_list));
+}
+
+/*
+ * Commit callback for an ATTRI.  Returns the lsn the item was logged at.
+ */
+STATIC xfs_lsn_t
+xfs_attri_item_committed(
+       struct xfs_log_item             *lip,
+       xfs_lsn_t                       lsn)
+{
+       /*
+        * The name and value buffers live in the ATTRI's own trailing
+        * allocation (see xfs_attri_init()) and are freed together with the
+        * item, so there is nothing to detach here.  The pointers must stay
+        * valid after commit because xfs_attri_item_relog() copies the name
+        * and value out of the old ATTRI when it builds the replacement.
+        *
+        * The ATTRI is logged only once and cannot be moved in the log, so
+        * simply return the lsn at which it's been logged.
+        */
+       return lsn;
+}
+
+/* Does this ATTRI carry the given intent id? */
+STATIC bool
+xfs_attri_item_match(
+       struct xfs_log_item     *lip,
+       uint64_t                intent_id)
+{
+       struct xfs_attri_log_item       *attrip = ATTRI_ITEM(lip);
+
+       return attrip->attri_format.alfi_id == intent_id;
+}
+
+/*
+ * Is this recovered ATTRI format ok?  The padding must be zero, the operation
+ * must be a set, replace or remove, the value and name lengths must be within
+ * the xattr limits (and the name non-empty), and the inode number must
+ * verify against this mount.
+ */
+static inline bool
+xfs_attri_validate(
+       struct xfs_mount                *mp,
+       struct xfs_attri_log_format     *attrp)
+{
+       unsigned int                    op = attrp->alfi_op_flags &
+                                            XFS_ATTR_OP_FLAGS_TYPE_MASK;
+
+       if (attrp->__pad != 0)
+               return false;
+
+       /* alfi_op_flags should be a set, a replace or a remove */
+       if (op != XFS_ATTR_OP_FLAGS_SET &&
+           op != XFS_ATTR_OP_FLAGS_REPLACE &&
+           op != XFS_ATTR_OP_FLAGS_REMOVE)
+               return false;
+
+       if (attrp->alfi_value_len > XATTR_SIZE_MAX)
+               return false;
+
+       if (attrp->alfi_name_len == 0 ||
+           attrp->alfi_name_len > XATTR_NAME_MAX)
+               return false;
+
+       return xfs_verify_ino(mp, attrp->alfi_ino);
+}
+
+/*
+ * Process an attr intent item that was recovered from the log.  Validate the
+ * logged format, rebuild an xfs_attr_item and its da_args from it, and run the
+ * first step of the set, replace or remove that the intent describes.  If more
+ * steps remain (-EAGAIN from xfs_xattri_finish_update()), the attr item is
+ * queued as deferred work on the transaction before it is captured and
+ * committed.
+ *
+ * Returns 0 on success, -EFSCORRUPTED if the logged intent fails validation,
+ * or another negative errno from the helpers called here.
+ */
+STATIC int
+xfs_attri_item_recover(
+       struct xfs_log_item             *lip,
+       struct list_head                *capture_list)
+{
+       struct xfs_attri_log_item       *attrip = ATTRI_ITEM(lip);
+       struct xfs_attr_item            *attr;
+       struct xfs_mount                *mp = lip->li_log->l_mp;
+       struct xfs_inode                *ip;
+       struct xfs_da_args              *args;
+       struct xfs_trans                *tp;
+       struct xfs_trans_res            tres;
+       struct xfs_attri_log_format     *attrp;
+       int                             error, ret = 0;
+       int                             total;
+       int                             local;
+       struct xfs_attrd_log_item       *done_item = NULL;
+
+       /*
+        * First check the validity of the attr described by the ATTRI.  If any
+        * are bad, then assume that all are bad and just toss the ATTRI.
+        */
+       attrp = &attrip->attri_format;
+       if (!xfs_attri_validate(mp, attrp) ||
+           !xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len))
+               return -EFSCORRUPTED;
+
+       error = xlog_recover_iget(mp,  attrp->alfi_ino, &ip);
+       if (error)
+               return error;
+
+       /* The da_args live in the same allocation, right after the attr item. */
+       attr = kmem_zalloc(sizeof(struct xfs_attr_item) +
+                          sizeof(struct xfs_da_args), KM_NOFS);
+       args = (struct xfs_da_args *)(attr + 1);
+
+       attr->xattri_da_args = args;
+       attr->xattri_op_flags = attrp->alfi_op_flags;
+
+       args->dp = ip;
+       args->geo = mp->m_attr_geo;
+       args->whichfork = XFS_ATTR_FORK;
+       args->name = attrip->attri_name;
+       args->namelen = attrp->alfi_name_len;
+       args->hashval = xfs_da_hashname(args->name, args->namelen);
+       args->attr_filter = attrp->alfi_attr_flags;
+       args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT;
+
+       switch (attrp->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK) {
+       case XFS_ATTR_OP_FLAGS_SET:
+       case XFS_ATTR_OP_FLAGS_REPLACE:
+               args->value = attrip->attri_value;
+               args->valuelen = attrp->alfi_value_len;
+               args->total = xfs_attr_calc_size(args, &local);
+               if (xfs_inode_hasattr(args->dp))
+                       attr->xattri_dela_state = xfs_attr_init_replace_state(args);
+               else
+                       attr->xattri_dela_state = xfs_attr_init_add_state(args);
+               break;
+       case XFS_ATTR_OP_FLAGS_REMOVE:
+               /* No attr fork contents means there is nothing to remove. */
+               if (!xfs_inode_hasattr(args->dp))
+                       goto out;
+               attr->xattri_dela_state = xfs_attr_init_remove_state(args);
+               break;
+       default:
+               ASSERT(0);
+               error = -EFSCORRUPTED;
+               goto out;
+       }
+
+       xfs_init_attr_trans(args, &tres, &total);
+       error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+       if (error)
+               goto out;
+
+       args->trans = tp;
+       done_item = xfs_trans_get_attrd(tp, attrip);
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
+
+       ret = xfs_xattri_finish_update(attr, done_item);
+       if (ret == -EAGAIN) {
+               /* There's more work to do, so add it to this transaction */
+               xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
+       } else
+               error = ret;
+
+       if (error) {
+               xfs_trans_cancel(tp);
+               goto out_unlock;
+       }
+
+       error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+out_unlock:
+       if (attr->xattri_leaf_bp)
+               xfs_buf_relse(attr->xattri_leaf_bp);
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+       /*
+        * Every path that reaches this label holds the inode reference taken
+        * by xlog_recover_iget() above, including the early exits that never
+        * locked the inode, so it is dropped here rather than only on the
+        * out_unlock path.
+        */
+       xfs_irele(ip);
+
+       /* On -EAGAIN the attr item was queued as deferred work above. */
+       if (ret != -EAGAIN)
+               kmem_free(attr);
+       return error;
+}
+
+/*
+ * Re-log an intent item to push the log tail forward.  An ATTRD is logged
+ * against the old ATTRI, and a new ATTRI carrying a copy of the old one's
+ * format fields, name and value is added to the transaction and returned.
+ */
+static struct xfs_log_item *
+xfs_attri_item_relog(
+       struct xfs_log_item             *intent,
+       struct xfs_trans                *tp)
+{
+       struct xfs_attri_log_item       *old_attrip = ATTRI_ITEM(intent);
+       struct xfs_attri_log_format     *old_attrp = &old_attrip->attri_format;
+       struct xfs_attri_log_item       *new_attrip;
+       struct xfs_attri_log_format     *new_attrp;
+       struct xfs_attrd_log_item       *attrdp;
+
+       /* Log the done item that retires the old intent. */
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       attrdp = xfs_trans_get_attrd(tp, old_attrip);
+       set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+       /* Build the replacement intent with buffers of the same sizes. */
+       new_attrip = xfs_attri_init(tp->t_mountp, old_attrp->alfi_name_len,
+                                   old_attrp->alfi_value_len);
+       new_attrp = &new_attrip->attri_format;
+
+       new_attrp->alfi_ino = old_attrp->alfi_ino;
+       new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
+       new_attrp->alfi_value_len = old_attrp->alfi_value_len;
+       new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+       new_attrp->alfi_attr_flags = old_attrp->alfi_attr_flags;
+
+       memcpy(new_attrip->attri_name, old_attrip->attri_name,
+              new_attrip->attri_name_len);
+
+       /* The value buffer only exists when the value length is non-zero. */
+       if (new_attrip->attri_value_len > 0) {
+               memcpy(new_attrip->attri_value, old_attrip->attri_value,
+                      new_attrip->attri_value_len);
+       }
+
+       xfs_trans_add_item(tp, &new_attrip->attri_item);
+       set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
+
+       return &new_attrip->attri_item;
+}
+
+/*
+ * Recover an ATTRI intent found in the log (pass 2).
+ *
+ * Region 0 of @item carries the xfs_attri_log_format, region 1 the attr name
+ * and, when the value length is non-zero, region 2 the attr value.  The format
+ * is validated, an in-core intent is built with private copies of the name and
+ * value, and the intent is inserted into the AIL at @lsn.
+ *
+ * Returns 0 on success, -ENOMEM if the intent cannot be allocated, or
+ * -EFSCORRUPTED if a region is missing, too short, or fails validation.
+ */
+STATIC int
+xlog_recover_attri_commit_pass2(
+       struct xlog                     *log,
+       struct list_head                *buffer_list,
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       lsn)
+{
+       int                             error;
+       struct xfs_mount                *mp = log->l_mp;
+       struct xfs_attri_log_item       *attrip;
+       struct xfs_attri_log_format     *attri_formatp;
+       int                             region = 0;
+
+       /* Don't look inside the format region unless all of it was logged. */
+       if (item->ri_buf[region].i_len < (int)sizeof(*attri_formatp)) {
+               XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+       attri_formatp = item->ri_buf[region].i_addr;
+
+       /* Validate xfs_attri_log_format */
+       if (!xfs_attri_validate(mp, attri_formatp)) {
+               XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+
+       /* memory alloc failure will cause replay to abort */
+       attrip = xfs_attri_init(mp, attri_formatp->alfi_name_len,
+                               attri_formatp->alfi_value_len);
+       if (attrip == NULL)
+               return -ENOMEM;
+
+       error = xfs_attri_copy_format(&item->ri_buf[region],
+                                     &attrip->attri_format);
+       if (error)
+               goto out;
+
+       /*
+        * The name has a region of its own and so does the value, if there is
+        * one.  Make sure the log item really has that many regions before
+        * indexing into ri_buf.
+        */
+       if (item->ri_total < (attrip->attri_value_len > 0 ? 3 : 2))
+               goto out_corrupt;
+
+       /* Never copy more out of a region than was actually logged in it. */
+       region++;
+       if (item->ri_buf[region].i_len < attrip->attri_name_len)
+               goto out_corrupt;
+       memcpy(attrip->attri_name, item->ri_buf[region].i_addr,
+              attrip->attri_name_len);
+
+       if (!xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len))
+               goto out_corrupt;
+
+       if (attrip->attri_value_len > 0) {
+               region++;
+               if (item->ri_buf[region].i_len < attrip->attri_value_len)
+                       goto out_corrupt;
+               memcpy(attrip->attri_value, item->ri_buf[region].i_addr,
+                      attrip->attri_value_len);
+       }
+
+       /*
+        * The ATTRI has two references. One for the ATTRD and one for ATTRI to
+        * ensure it makes it into the AIL. Insert the ATTRI into the AIL
+        * directly and drop the ATTRI reference. Note that
+        * xfs_trans_ail_update() drops the AIL lock.
+        */
+       xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
+       xfs_attri_release(attrip);
+       return 0;
+
+out_corrupt:
+       XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+       error = -EFSCORRUPTED;
+out:
+       xfs_attri_item_free(attrip);
+       return error;
+}
+
+/*
+ * Allocate an "attr done" log item that points back at the given attr intent,
+ * record the intent's id in the done format and add the item to @tp.
+ */
+static struct xfs_attrd_log_item *
+xfs_trans_get_attrd(
+       struct xfs_trans                *tp,
+       struct xfs_attri_log_item       *attrip)
+{
+       struct xfs_attrd_log_item       *done;
+
+       ASSERT(tp != NULL);
+
+       done = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+       xfs_log_item_init(tp->t_mountp, &done->attrd_item, XFS_LI_ATTRD,
+                         &xfs_attrd_item_ops);
+
+       /* Tie the done item to the intent it completes. */
+       done->attrd_attrip = attrip;
+       done->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+       xfs_trans_add_item(tp, &done->attrd_item);
+       return done;
+}
+
+/*
+ * Get an ATTRD for the given intent so we can process all the attrs.  When
+ * there is no intent, no done item is created and NULL is returned.
+ */
+static struct xfs_log_item *
+xfs_attr_create_done(
+       struct xfs_trans                *tp,
+       struct xfs_log_item             *intent,
+       unsigned int                    count)
+{
+       struct xfs_attrd_log_item       *done;
+
+       if (intent == NULL)
+               return NULL;
+
+       done = xfs_trans_get_attrd(tp, ATTRI_ITEM(intent));
+       return &done->attrd_item;
+}
+
+/* Deferred operation type for extended attribute updates. */
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+       .create_intent  = xfs_attr_create_intent,
+       .abort_intent   = xfs_attr_abort_intent,
+       .create_done    = xfs_attr_create_done,
+       .finish_item    = xfs_attr_finish_item,
+       .cancel_item    = xfs_attr_cancel_item,
+       .max_items      = 1,
+};
+
+/*
+ * This routine is called when an ATTRD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding ATTRI if
+ * it was still in the log. To do this it searches the AIL for the ATTRI with
+ * an id equal to that in the ATTRD format structure. If we find it we drop
+ * the ATTRD reference, which removes the ATTRI from the AIL and frees it.
+ *
+ * Returns -EFSCORRUPTED if region 0 of @item is not exactly one
+ * xfs_attrd_log_format, and 0 otherwise.
+ */
+STATIC int
+xlog_recover_attrd_commit_pass2(
+       struct xlog                     *log,
+       struct list_head                *buffer_list,
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       lsn)
+{
+       struct xfs_attrd_log_format     *attrd_formatp;
+
+       /* Check the region size before looking at what it contains. */
+       if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+               /* Report against this mount, as the ATTRI recovery path does. */
+               XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+               return -EFSCORRUPTED;
+       }
+
+       attrd_formatp = item->ri_buf[0].i_addr;
+       xlog_recover_release_intent(log, XFS_LI_ATTRI,
+                                   attrd_formatp->alfd_alf_id);
+       return 0;
+}
+
+/* Log item operations for ATTRI (attr intent) items. */
+static const struct xfs_item_ops xfs_attri_item_ops = {
+       .flags          = XFS_ITEM_INTENT,
+       .iop_match      = xfs_attri_item_match,
+       .iop_size       = xfs_attri_item_size,
+       .iop_format     = xfs_attri_item_format,
+       .iop_committed  = xfs_attri_item_committed,
+       .iop_unpin      = xfs_attri_item_unpin,
+       .iop_release    = xfs_attri_item_release,
+       .iop_relog      = xfs_attri_item_relog,
+       .iop_recover    = xfs_attri_item_recover,
+};
+
+/* Log recovery operations for ATTRI items; they are handled in pass 2. */
+const struct xlog_recover_item_ops xlog_attri_item_ops = {
+       .commit_pass2   = xlog_recover_attri_commit_pass2,
+       .item_type      = XFS_LI_ATTRI,
+};
+
+/* Log item operations for ATTRD (attr done) items. */
+static const struct xfs_item_ops xfs_attrd_item_ops = {
+       .flags          = XFS_ITEM_INTENT_DONE |
+                         XFS_ITEM_RELEASE_WHEN_COMMITTED,
+       .iop_intent     = xfs_attrd_item_intent,
+       .iop_size       = xfs_attrd_item_size,
+       .iop_format     = xfs_attrd_item_format,
+       .iop_release    = xfs_attrd_item_release,
+};
+
+/* Log recovery operations for ATTRD items; they are handled in pass 2. */
+const struct xlog_recover_item_ops xlog_attrd_item_ops = {
+       .commit_pass2   = xlog_recover_attrd_commit_pass2,
+       .item_type      = XFS_LI_ATTRD,
+};
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h

new file mode 100644 (file)

index 0000000..c3b779f
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2022 Oracle.  All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+#ifndef        __XFS_ATTR_ITEM_H__
+#define        __XFS_ATTR_ITEM_H__
+
+/* kernel only ATTRI/ATTRD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * This is the "attr intention" log item.  It is used to log the fact that some
+ * extended attribute operations need to be processed.  An operation is
+ * currently a set, a replace or a remove.  These operations are described by
+ * the xfs_attr_item which may be logged to this intent.
+ *
+ * During a normal attr operation, name and value point to the name and value
+ * fields of the caller's xfs_da_args structure.  During a recovery, the name
+ * and value buffers are copied from the log, and stored in a trailing buffer
+ * attached to the xfs_attr_item until they are committed.  They are freed when
+ * the xfs_attr_item itself is freed when the work is done.
+ */
+struct xfs_attri_log_item {
+       struct xfs_log_item             attri_item;     /* embedded log item */
+       atomic_t                        attri_refcount; /* see xfs_attri_release() */
+       int                             attri_name_len; /* bytes at attri_name */
+       int                             attri_value_len; /* bytes at attri_value */
+       void                            *attri_name;    /* attr name */
+       void                            *attri_value;   /* attr value, if any */
+       struct xfs_attri_log_format     attri_format;   /* logged intent format */
+};
+
+/*
+ * This is the "attr done" log item.  It is used to log the fact that some attr
+ * operations earlier mentioned in an attri item have been processed.
+ */
+struct xfs_attrd_log_item {
+       struct xfs_log_item             attrd_item;     /* embedded log item */
+       struct xfs_attri_log_item       *attrd_attrip;  /* intent being completed */
+       struct xfs_attrd_log_format     attrd_format;   /* carries the intent's id */
+};
+
+#endif /* __XFS_ATTR_ITEM_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c

index 2d1e513..90a14e8 100644 (file)
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -15,6 +15,7 @@
  #include "xfs_inode.h"
  #include "xfs_trans.h"
  #include "xfs_bmap.h"
+#include "xfs_da_btree.h"
  #include "xfs_attr.h"
  #include "xfs_attr_sf.h"
  #include "xfs_attr_leaf.h"
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c

index 761dde1..51f66e9 100644 (file)
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -39,6 +39,7 @@ STATIC void
  xfs_bui_item_free(
         struct xfs_bui_log_item *buip)
  {
+       kmem_free(buip->bui_item.li_lv_shadow);
         kmem_cache_free(xfs_bui_cache, buip);
  }
  
@@ -54,10 +55,11 @@ xfs_bui_release(
         struct xfs_bui_log_item *buip)
  {
         ASSERT(atomic_read(&buip->bui_refcount) > 0);
-       if (atomic_dec_and_test(&buip->bui_refcount)) {
-               xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
-               xfs_bui_item_free(buip);
-       }
+       if (!atomic_dec_and_test(&buip->bui_refcount))
+               return;
+
+       xfs_trans_ail_delete(&buip->bui_item, 0);
+       xfs_bui_item_free(buip);
  }
  
  
@@ -198,14 +200,24 @@ xfs_bud_item_release(
         struct xfs_bud_log_item *budp = BUD_ITEM(lip);
  
         xfs_bui_release(budp->bud_buip);
+       kmem_free(budp->bud_item.li_lv_shadow);
         kmem_cache_free(xfs_bud_cache, budp);
  }
  
+static struct xfs_log_item *
+xfs_bud_item_intent(
+       struct xfs_log_item     *lip)
+{
+       return &BUD_ITEM(lip)->bud_buip->bui_item;
+}
+
  static const struct xfs_item_ops xfs_bud_item_ops = {
-       .flags          = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+       .flags          = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+                         XFS_ITEM_INTENT_DONE,
         .iop_size       = xfs_bud_item_size,
         .iop_format     = xfs_bud_item_format,
         .iop_release    = xfs_bud_item_release,
+       .iop_intent     = xfs_bud_item_intent,
  };
  
  static struct xfs_bud_log_item *
@@ -254,7 +266,7 @@ xfs_trans_log_finish_bmap_update(
          * 1.) releases the BUI and frees the BUD
          * 2.) shuts down the filesystem
          */
-       tp->t_flags |= XFS_TRANS_DIRTY;
+       tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
         set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
  
         return error;
@@ -506,6 +518,8 @@ xfs_bui_item_recover(
                 iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
  
         error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+       if (error == -EFBIG)
+               error = xfs_iext_count_upgrade(tp, ip, iext_delta);
         if (error)
                 goto err_cancel;
  
@@ -584,6 +598,7 @@ xfs_bui_item_relog(
  }
  
  static const struct xfs_item_ops xfs_bui_item_ops = {
+       .flags          = XFS_ITEM_INTENT,
         .iop_size       = xfs_bui_item_size,
         .iop_format     = xfs_bui_item_format,
         .iop_unpin      = xfs_bui_item_unpin,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c

index eb2e387..52be583 100644 (file)
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -119,14 +119,14 @@ retry:
          */
         ralen = ap->length / mp->m_sb.sb_rextsize;
         /*
-        * If the old value was close enough to MAXEXTLEN that
+        * If the old value was close enough to XFS_MAX_BMBT_EXTLEN that
          * we rounded up to it, cut it back so it's valid again.
          * Note that if it's a really large request (bigger than
-        * MAXEXTLEN), we don't hear about that number, and can't
+        * XFS_MAX_BMBT_EXTLEN), we don't hear about that number, and can't
          * adjust the starting point to match it.
          */
-       if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
-               ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+       if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
+               ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
  
         /*
          * Lock out modifications to both the RT bitmap and summary inodes
@@ -839,9 +839,11 @@ xfs_alloc_file_space(
                  * count, hence we need to limit the number of blocks we are
                  * trying to reserve to avoid an overflow. We can't allocate
                  * more than @nimaps extents, and an extent is limited on disk
-                * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+                * to XFS_MAX_BMBT_EXTLEN (21 bits), so use that to enforce the
+                * limit.
                  */
-               resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+               resblks = min_t(xfs_fileoff_t, (e - s),
+                               (XFS_MAX_BMBT_EXTLEN * nimaps));
                 if (unlikely(rt)) {
                         dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
                         rblocks = resblks;
@@ -857,6 +859,9 @@ xfs_alloc_file_space(
  
                 error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
                                 XFS_IEXT_ADD_NOSPLIT_CNT);
+               if (error == -EFBIG)
+                       error = xfs_iext_count_upgrade(tp, ip,
+                                       XFS_IEXT_ADD_NOSPLIT_CNT);
                 if (error)
                         goto error;
  
@@ -912,6 +917,8 @@ xfs_unmap_extent(
  
         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
                         XFS_IEXT_PUNCH_HOLE_CNT);
+       if (error == -EFBIG)
+               error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
         if (error)
                 goto out_trans_cancel;
  
@@ -1193,6 +1200,8 @@ xfs_insert_file_space(
  
         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
                         XFS_IEXT_PUNCH_HOLE_CNT);
+       if (error == -EFBIG)
+               error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
         if (error)
                 goto out_trans_cancel;
  
@@ -1421,6 +1430,9 @@ xfs_swap_extent_rmap(
                                 error = xfs_iext_count_may_overflow(ip,
                                                 XFS_DATA_FORK,
                                                 XFS_IEXT_SWAP_RMAP_CNT);
+                               if (error == -EFBIG)
+                                       error = xfs_iext_count_upgrade(tp, ip,
+                                                       XFS_IEXT_SWAP_RMAP_CNT);
                                 if (error)
                                         goto out;
                         }
@@ -1429,6 +1441,9 @@ xfs_swap_extent_rmap(
                                 error = xfs_iext_count_may_overflow(tip,
                                                 XFS_DATA_FORK,
                                                 XFS_IEXT_SWAP_RMAP_CNT);
+                               if (error == -EFBIG)
+                                       error = xfs_iext_count_upgrade(tp, ip,
+                                                       XFS_IEXT_SWAP_RMAP_CNT);
                                 if (error)
                                         goto out;
                         }
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h

index e11e9ef..4d8a6ae 100644 (file)
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -8,15 +8,18 @@
  
  /* kernel only definitions */
  
+struct xfs_buf;
+struct xfs_mount;
+
  /* buf log item flags */
-#define        XFS_BLI_HOLD            0x01
-#define        XFS_BLI_DIRTY           0x02
-#define        XFS_BLI_STALE           0x04
-#define        XFS_BLI_LOGGED          0x08
-#define        XFS_BLI_INODE_ALLOC_BUF 0x10
-#define XFS_BLI_STALE_INODE    0x20
-#define        XFS_BLI_INODE_BUF       0x40
-#define        XFS_BLI_ORDERED         0x80
+#define        XFS_BLI_HOLD            (1u << 0)
+#define        XFS_BLI_DIRTY           (1u << 1)
+#define        XFS_BLI_STALE           (1u << 2)
+#define        XFS_BLI_LOGGED          (1u << 3)
+#define        XFS_BLI_INODE_ALLOC_BUF (1u << 4)
+#define XFS_BLI_STALE_INODE    (1u << 5)
+#define        XFS_BLI_INODE_BUF       (1u << 6)
+#define        XFS_BLI_ORDERED         (1u << 7)
  
  #define XFS_BLI_FLAGS \
         { XFS_BLI_HOLD,         "HOLD" }, \
@@ -28,11 +31,6 @@
         { XFS_BLI_INODE_BUF,    "INODE_BUF" }, \
         { XFS_BLI_ORDERED,      "ORDERED" }
  
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_buf_log_item;
-
  /*
   * This is the in core log item structure used to track information
   * needed to log buffers.  It tracks how many times the lock has been
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c

index 5afedcb..5a6c3c3 100644 (file)
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -136,10 +136,7 @@ xfs_qm_adjust_res_timer(
                         res->timer = xfs_dquot_set_timeout(mp,
                                         ktime_get_real_seconds() + qlim->time);
         } else {
-               if (res->timer == 0)
-                       res->warnings = 0;
-               else
-                       res->timer = 0;
+               res->timer = 0;
         }
  }
  
@@ -322,6 +319,9 @@ xfs_dquot_disk_alloc(
  
         error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
                         XFS_IEXT_ADD_NOSPLIT_CNT);
+       if (error == -EFBIG)
+               error = xfs_iext_count_upgrade(tp, quotip,
+                               XFS_IEXT_ADD_NOSPLIT_CNT);
         if (error)
                 goto err_cancel;
  
@@ -589,10 +589,6 @@ xfs_dquot_from_disk(
         dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
         dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
  
-       dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns);
-       dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
-       dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
-
         dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
         dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
         dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
@@ -634,9 +630,9 @@ xfs_dquot_to_disk(
         ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
         ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
  
-       ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings);
-       ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
-       ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
+       ddqp->d_bwarns = 0;
+       ddqp->d_iwarns = 0;
+       ddqp->d_rtbwarns = 0;
  
         ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
         ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h

index 6b5e3cf..80c8f85 100644 (file)
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -44,14 +44,6 @@ struct xfs_dquot_res {
          * in seconds since the Unix epoch.
          */
         time64_t                timer;
-
-       /*
-        * For root dquots, this is the maximum number of warnings that will
-        * be issued for this quota type.  Otherwise, this is the number of
-        * warnings issued against this quota.  Note that none of this is
-        * implemented.
-        */
-       xfs_qwarncnt_t          warnings;
  };
  
  static inline bool
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c

index 749fd18..296faa4 100644 (file)
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -57,6 +57,9 @@ static unsigned int xfs_errortag_random_default[] = {
         XFS_RANDOM_REDUCE_MAX_IEXTENTS,
         XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
         XFS_RANDOM_AG_RESV_FAIL,
+       XFS_RANDOM_LARP,
+       XFS_RANDOM_DA_LEAF_SPLIT,
+       XFS_RANDOM_ATTR_LEAF_TO_NODE,
  };
  
  struct xfs_errortag_attr {
@@ -170,6 +173,9 @@ XFS_ERRORTAG_ATTR_RW(buf_ioerror,   XFS_ERRTAG_BUF_IOERROR);
  XFS_ERRORTAG_ATTR_RW(reduce_max_iextents,      XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
  XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
  XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
+XFS_ERRORTAG_ATTR_RW(larp,             XFS_ERRTAG_LARP);
+XFS_ERRORTAG_ATTR_RW(da_leaf_split,    XFS_ERRTAG_DA_LEAF_SPLIT);
+XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node,        XFS_ERRTAG_ATTR_LEAF_TO_NODE);
  
  static struct attribute *xfs_errortag_attrs[] = {
         XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -211,6 +217,9 @@ static struct attribute *xfs_errortag_attrs[] = {
         XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
         XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
         XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
+       XFS_ERRORTAG_ATTR_LIST(larp),
+       XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
+       XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
         NULL,
  };
  ATTRIBUTE_GROUPS(xfs_errortag);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h

index 5735d5e..5191e91 100644 (file)
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -64,16 +64,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
   * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
   *                     a panic by setting xfs_panic_mask in a sysctl.
   */
-#define                XFS_NO_PTAG                     0
-#define                XFS_PTAG_IFLUSH                 0x00000001
-#define                XFS_PTAG_LOGRES                 0x00000002
-#define                XFS_PTAG_AILDELETE              0x00000004
-#define                XFS_PTAG_ERROR_REPORT           0x00000008
-#define                XFS_PTAG_SHUTDOWN_CORRUPT       0x00000010
-#define                XFS_PTAG_SHUTDOWN_IOERROR       0x00000020
-#define                XFS_PTAG_SHUTDOWN_LOGERROR      0x00000040
-#define                XFS_PTAG_FSBLOCK_ZERO           0x00000080
-#define                XFS_PTAG_VERIFIER_ERROR         0x00000100
+#define                XFS_NO_PTAG                     0u
+#define                XFS_PTAG_IFLUSH                 (1u << 0)
+#define                XFS_PTAG_LOGRES                 (1u << 1)
+#define                XFS_PTAG_AILDELETE              (1u << 2)
+#define                XFS_PTAG_ERROR_REPORT           (1u << 3)
+#define                XFS_PTAG_SHUTDOWN_CORRUPT       (1u << 4)
+#define                XFS_PTAG_SHUTDOWN_IOERROR       (1u << 5)
+#define                XFS_PTAG_SHUTDOWN_LOGERROR      (1u << 6)
+#define                XFS_PTAG_FSBLOCK_ZERO           (1u << 7)
+#define                XFS_PTAG_VERIFIER_ERROR         (1u << 8)
  
  #define XFS_PTAG_STRINGS \
         { XFS_NO_PTAG,                  "none" }, \
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c

index 0e50f2c..765be05 100644 (file)
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -58,10 +58,11 @@ xfs_efi_release(
         struct xfs_efi_log_item *efip)
  {
         ASSERT(atomic_read(&efip->efi_refcount) > 0);
-       if (atomic_dec_and_test(&efip->efi_refcount)) {
-               xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
-               xfs_efi_item_free(efip);
-       }
+       if (!atomic_dec_and_test(&efip->efi_refcount))
+               return;
+
+       xfs_trans_ail_delete(&efip->efi_item, 0);
+       xfs_efi_item_free(efip);
  }
  
  /*
@@ -306,11 +307,20 @@ xfs_efd_item_release(
         xfs_efd_item_free(efdp);
  }
  
+static struct xfs_log_item *
+xfs_efd_item_intent(
+       struct xfs_log_item     *lip)
+{
+       return &EFD_ITEM(lip)->efd_efip->efi_item;
+}
+
  static const struct xfs_item_ops xfs_efd_item_ops = {
-       .flags          = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+       .flags          = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+                         XFS_ITEM_INTENT_DONE,
         .iop_size       = xfs_efd_item_size,
         .iop_format     = xfs_efd_item_format,
         .iop_release    = xfs_efd_item_release,
+       .iop_intent     = xfs_efd_item_intent,
  };
  
  /*
@@ -380,7 +390,7 @@ xfs_trans_free_extent(
          * 1.) releases the EFI and frees the EFD
          * 2.) shuts down the filesystem
          */
-       tp->t_flags |= XFS_TRANS_DIRTY;
+       tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
         set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
  
         next_extent = efdp->efd_next_extent;
@@ -688,6 +698,7 @@ xfs_efi_item_relog(
  }
  
  static const struct xfs_item_ops xfs_efi_item_ops = {
+       .flags          = XFS_ITEM_INTENT,
         .iop_size       = xfs_efi_item_size,
         .iop_format     = xfs_efi_item_format,
         .iop_unpin      = xfs_efi_item_unpin,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index 85c4121..a60632e 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -310,7 +310,7 @@ STATIC ssize_t
  xfs_file_write_checks(
         struct kiocb            *iocb,
         struct iov_iter         *from,
-       int                     *iolock)
+       unsigned int            *iolock)
  {
         struct file             *file = iocb->ki_filp;
         struct inode            *inode = file->f_mapping->host;
@@ -513,7 +513,7 @@ xfs_file_dio_write_aligned(
         struct kiocb            *iocb,
         struct iov_iter         *from)
  {
-       int                     iolock = XFS_IOLOCK_SHARED;
+       unsigned int            iolock = XFS_IOLOCK_SHARED;
         ssize_t                 ret;
  
         ret = xfs_ilock_iocb(iocb, iolock);
@@ -566,7 +566,7 @@ xfs_file_dio_write_unaligned(
  {
         size_t                  isize = i_size_read(VFS_I(ip));
         size_t                  count = iov_iter_count(from);
-       int                     iolock = XFS_IOLOCK_SHARED;
+       unsigned int            iolock = XFS_IOLOCK_SHARED;
         unsigned int            flags = IOMAP_DIO_OVERWRITE_ONLY;
         ssize_t                 ret;
  
@@ -655,7 +655,7 @@ xfs_file_dax_write(
  {
         struct inode            *inode = iocb->ki_filp->f_mapping->host;
         struct xfs_inode        *ip = XFS_I(inode);
-       int                     iolock = XFS_IOLOCK_EXCL;
+       unsigned int            iolock = XFS_IOLOCK_EXCL;
         ssize_t                 ret, error = 0;
         loff_t                  pos;
  
@@ -694,13 +694,11 @@ xfs_file_buffered_write(
         struct kiocb            *iocb,
         struct iov_iter         *from)
  {
-       struct file             *file = iocb->ki_filp;
-       struct address_space    *mapping = file->f_mapping;
-       struct inode            *inode = mapping->host;
+       struct inode            *inode = iocb->ki_filp->f_mapping->host;
         struct xfs_inode        *ip = XFS_I(inode);
         ssize_t                 ret;
         bool                    cleared_space = false;
-       int                     iolock;
+       unsigned int            iolock;
  
         if (iocb->ki_flags & IOCB_NOWAIT)
                 return -EOPNOTSUPP;
@@ -767,9 +765,7 @@ xfs_file_write_iter(
         struct kiocb            *iocb,
         struct iov_iter         *from)
  {
-       struct file             *file = iocb->ki_filp;
-       struct address_space    *mapping = file->f_mapping;
-       struct inode            *inode = mapping->host;
+       struct inode            *inode = iocb->ki_filp->f_mapping->host;
         struct xfs_inode        *ip = XFS_I(inode);
         ssize_t                 ret;
         size_t                  ocount = iov_iter_count(from);
@@ -1167,12 +1163,10 @@ xfs_file_open(
         struct inode    *inode,
         struct file     *file)
  {
-       if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
-               return -EFBIG;
         if (xfs_is_shutdown(XFS_M(inode->i_sb)))
                 return -EIO;
         file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
-       return 0;
+       return generic_file_open(inode, file);
  }
  
  STATIC int
@@ -1181,7 +1175,7 @@ xfs_dir_open(
         struct file     *file)
  {
         struct xfs_inode *ip = XFS_I(inode);
-       int             mode;
+       unsigned int    mode;
         int             error;
  
         error = xfs_file_open(inode, file);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c

index 6a3ce0f..be9bcf8 100644 (file)
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -128,11 +128,12 @@ xfs_filestream_pick_ag(
                 if (!pag->pagf_init) {
                         err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
                         if (err) {
-                               xfs_perag_put(pag);
-                               if (err != -EAGAIN)
+                               if (err != -EAGAIN) {
+                                       xfs_perag_put(pag);
                                         return err;
+                               }
                                 /* Couldn't lock the AGF, skip this AG. */
-                               continue;
+                               goto next_ag;
                         }
                 }
  
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c

index 10e1cb7..bb23199 100644 (file)
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -450,11 +450,11 @@ xfs_getfsmap_logdev(
  /* Transform a rtbitmap "record" into a fsmap */
  STATIC int
  xfs_getfsmap_rtdev_rtbitmap_helper(
+       struct xfs_mount                *mp,
         struct xfs_trans                *tp,
         const struct xfs_rtalloc_rec    *rec,
         void                            *priv)
  {
-       struct xfs_mount                *mp = tp->t_mountp;
         struct xfs_getfsmap_info        *info = priv;
         struct xfs_rmap_irec            irec;
         xfs_daddr_t                     rec_daddr;
@@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
         do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
         if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
                 ahigh.ar_startext++;
-       error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+       error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
                         xfs_getfsmap_rtdev_rtbitmap_helper, info);
         if (error)
                 goto err;
@@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
         info->last = true;
         ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
  
-       error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+       error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
         if (error)
                 goto err;
  err:
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c

index 68f7454..888839e 100644 (file)
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -349,10 +349,7 @@ xfs_fs_counts(
         cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
         cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
                                                 xfs_fdblocks_unavailable(mp);
-
-       spin_lock(&mp->m_sb_lock);
-       cnt->freertx = mp->m_sb.sb_frextents;
-       spin_unlock(&mp->m_sb_lock);
+       cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
  }
  
  /*
@@ -512,7 +509,7 @@ xfs_fs_goingdown(
  void
  xfs_do_force_shutdown(
         struct xfs_mount *mp,
-       int             flags,
+       uint32_t        flags,
         char            *fname,
         int             lnnum)
  {
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c

index f62fa65..4d0a98f 100644 (file)
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -41,5 +41,6 @@ struct xfs_globals xfs_globals = {
  #endif
  #ifdef DEBUG
         .pwork_threads          =       -1,     /* automatic thread detection */
+       .larp                   =       false,  /* log attribute replay */
  #endif
  };
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c

index bffd6eb..5269354 100644 (file)
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file(
         struct xfs_inode        *ip)
  {
         struct xfs_mount        *mp = ip->i_mount;
-       uint64_t                freertx;
  
         if (!XFS_IS_REALTIME_INODE(ip))
                 return false;
  
-       freertx = READ_ONCE(mp->m_sb.sb_frextents);
-       return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
+       if (__percpu_counter_compare(&mp->m_frextents,
+                               mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
+                               XFS_FDBLOCKS_BATCH) < 0)
+               return true;
+
+       return false;
  }
  #else
  # define xfs_inodegc_want_queue_rt_file(ip)    (false)
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c

index 508e184..b05314d 100644 (file)
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
  xfs_icreate_item_release(
         struct xfs_log_item     *lip)
  {
+       kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
         kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
  }
  
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 39ae53e..b287987 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -416,10 +416,12 @@ xfs_lockdep_subclass_ok(
   * parent locking. Care must be taken to ensure we don't overrun the subclass
   * storage fields in the class mask we build.
   */
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
+static inline uint
+xfs_lock_inumorder(
+       uint    lock_mode,
+       uint    subclass)
  {
-       int     class = 0;
+       uint    class = 0;
  
         ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
                               XFS_ILOCK_RTSUM)));
@@ -464,7 +466,10 @@ xfs_lock_inodes(
         int                     inodes,
         uint                    lock_mode)
  {
-       int                     attempts = 0, i, j, try_lock;
+       int                     attempts = 0;
+       uint                    i;
+       int                     j;
+       bool                    try_lock;
         struct xfs_log_item     *lp;
  
         /*
@@ -489,9 +494,9 @@ xfs_lock_inodes(
         } else if (lock_mode & XFS_MMAPLOCK_EXCL)
                 ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
  
-       try_lock = 0;
-       i = 0;
  again:
+       try_lock = false;
+       i = 0;
         for (; i < inodes; i++) {
                 ASSERT(ips[i]);
  
@@ -506,7 +511,7 @@ again:
                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
                                 lp = &ips[j]->i_itemp->ili_item;
                                 if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
-                                       try_lock++;
+                                       try_lock = true;
                         }
                 }
  
@@ -546,8 +551,6 @@ again:
                 if ((attempts % 5) == 0) {
                         delay(1); /* Don't just spin the CPU */
                 }
-               i = 0;
-               try_lock = 0;
                 goto again;
         }
  }
@@ -1024,11 +1027,6 @@ xfs_create(
         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
         unlock_dp_on_error = true;
  
-       error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
-                       XFS_IEXT_DIR_MANIP_CNT(mp));
-       if (error)
-               goto out_trans_cancel;
-
         /*
          * A newly created regular or special file just has one directory
          * entry pointing to them, but a directory also the "." entry
@@ -1242,11 +1240,6 @@ xfs_link(
         if (error)
                 goto std_return;
  
-       error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
-                       XFS_IEXT_DIR_MANIP_CNT(mp));
-       if (error)
-               goto error_return;
-
         /*
          * If we are using project inheritance, we only allow hard link
          * creation in our tree when the project IDs are the same; else
@@ -3212,35 +3205,6 @@ retry:
         /*
          * Check for expected errors before we dirty the transaction
          * so we can return an error without a transaction abort.
-        *
-        * Extent count overflow check:
-        *
-        * From the perspective of src_dp, a rename operation is essentially a
-        * directory entry remove operation. Hence the only place where we check
-        * for extent count overflow for src_dp is in
-        * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
-        * -ENOSPC when it detects a possible extent count overflow and in
-        * response, the higher layers of directory handling code do the
-        * following:
-        * 1. Data/Free blocks: XFS lets these blocks linger until a
-        *    future remove operation removes them.
-        * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
-        *    Leaf space and unmaps the last block.
-        *
-        * For target_dp, there are two cases depending on whether the
-        * destination directory entry exists or not.
-        *
-        * When destination directory entry does not exist (i.e. target_ip ==
-        * NULL), extent count overflow check is performed only when transaction
-        * has a non-zero sized space reservation associated with it.  With a
-        * zero-sized space reservation, XFS allows a rename operation to
-        * continue only when the directory has sufficient free space in its
-        * data/leaf/free space blocks to hold the new entry.
-        *
-        * When destination directory entry exists (i.e. target_ip != NULL), all
-        * we need to do is change the inode number associated with the already
-        * existing entry. Hence there is no need to perform an extent count
-        * overflow check.
          */
         if (target_ip == NULL) {
                 /*
@@ -3251,12 +3215,6 @@ retry:
                         error = xfs_dir_canenter(tp, target_dp, target_name);
                         if (error)
                                 goto out_trans_cancel;
-               } else {
-                       error = xfs_iext_count_may_overflow(target_dp,
-                                       XFS_DATA_FORK,
-                                       XFS_IEXT_DIR_MANIP_CNT(mp));
-                       if (error)
-                               goto out_trans_cancel;
                 }
         } else {
                 /*
@@ -3424,18 +3382,12 @@ retry:
          * inode number of the whiteout inode rather than removing it
          * altogether.
          */
-       if (wip) {
+       if (wip)
                 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
                                         spaceres);
-       } else {
-               /*
-                * NOTE: We don't need to check for extent count overflow here
-                * because the dir remove name code will leave the dir block in
-                * place if the extent count would overflow.
-                */
+       else
                 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
                                            spaceres);
-       }
  
         if (error)
                 goto out_trans_cancel;
@@ -3517,8 +3469,8 @@ xfs_iflush(
         if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
                                 ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
-                       "%s: detected corrupt incore inode %Lu, "
-                       "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
+                       "%s: detected corrupt incore inode %llu, "
+                       "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
                         __func__, ip->i_ino,
                         ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
                         ip->i_nblocks, ip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h

index 740ab13..7be6f8e 100644 (file)
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -218,6 +218,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
         return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
  }
  
+static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
+{
+       return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
+}
+
  /*
   * Return the buftarg used for data allocations on a given inode.
   */
@@ -278,12 +283,12 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
   * Bit ranges: 1<<1  - 1<<16-1 -- iolock/ilock modes (bitfield)
   *             1<<16 - 1<<32-1 -- lockdep annotation (integers)
   */
-#define        XFS_IOLOCK_EXCL         (1<<0)
-#define        XFS_IOLOCK_SHARED       (1<<1)
-#define        XFS_ILOCK_EXCL          (1<<2)
-#define        XFS_ILOCK_SHARED        (1<<3)
-#define        XFS_MMAPLOCK_EXCL       (1<<4)
-#define        XFS_MMAPLOCK_SHARED     (1<<5)
+#define        XFS_IOLOCK_EXCL         (1u << 0)
+#define        XFS_IOLOCK_SHARED       (1u << 1)
+#define        XFS_ILOCK_EXCL          (1u << 2)
+#define        XFS_ILOCK_SHARED        (1u << 3)
+#define        XFS_MMAPLOCK_EXCL       (1u << 4)
+#define        XFS_MMAPLOCK_SHARED     (1u << 5)
  
  #define XFS_LOCK_MASK          (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
                                 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
@@ -350,19 +355,19 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
   */
  #define XFS_IOLOCK_SHIFT               16
  #define XFS_IOLOCK_MAX_SUBCLASS                3
-#define XFS_IOLOCK_DEP_MASK            0x000f0000
+#define XFS_IOLOCK_DEP_MASK            0x000f0000u
  
  #define XFS_MMAPLOCK_SHIFT             20
  #define XFS_MMAPLOCK_NUMORDER          0
  #define XFS_MMAPLOCK_MAX_SUBCLASS      3
-#define XFS_MMAPLOCK_DEP_MASK          0x00f00000
+#define XFS_MMAPLOCK_DEP_MASK          0x00f00000u
  
  #define XFS_ILOCK_SHIFT                        24
-#define XFS_ILOCK_PARENT_VAL           5
+#define XFS_ILOCK_PARENT_VAL           5u
  #define XFS_ILOCK_MAX_SUBCLASS         (XFS_ILOCK_PARENT_VAL - 1)
-#define XFS_ILOCK_RTBITMAP_VAL         6
-#define XFS_ILOCK_RTSUM_VAL            7
-#define XFS_ILOCK_DEP_MASK             0xff000000
+#define XFS_ILOCK_RTBITMAP_VAL         6u
+#define XFS_ILOCK_RTSUM_VAL            7u
+#define XFS_ILOCK_DEP_MASK             0xff000000u
  #define        XFS_ILOCK_PARENT                (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
  #define        XFS_ILOCK_RTBITMAP              (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
  #define        XFS_ILOCK_RTSUM                 (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c

index 9e6ef55..721def0 100644 (file)
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -71,7 +71,7 @@ xfs_inode_item_data_fork_size(
         case XFS_DINODE_FMT_LOCAL:
                 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
                     ip->i_df.if_bytes > 0) {
-                       *nbytes += roundup(ip->i_df.if_bytes, 4);
+                       *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes);
                         *nvecs += 1;
                 }
                 break;
@@ -112,7 +112,7 @@ xfs_inode_item_attr_fork_size(
         case XFS_DINODE_FMT_LOCAL:
                 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
                     ip->i_afp->if_bytes > 0) {
-                       *nbytes += roundup(ip->i_afp->if_bytes, 4);
+                       *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes);
                         *nvecs += 1;
                 }
                 break;
@@ -204,17 +204,12 @@ xfs_inode_item_format_data_fork(
                         ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
                 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
                     ip->i_df.if_bytes > 0) {
-                       /*
-                        * Round i_bytes up to a word boundary.
-                        * The underlying memory is guaranteed
-                        * to be there by xfs_idata_realloc().
-                        */
-                       data_bytes = roundup(ip->i_df.if_bytes, 4);
                         ASSERT(ip->i_df.if_u1.if_data != NULL);
                         ASSERT(ip->i_disk_size > 0);
                         xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
-                                       ip->i_df.if_u1.if_data, data_bytes);
-                       ilf->ilf_dsize = (unsigned)data_bytes;
+                                       ip->i_df.if_u1.if_data,
+                                       ip->i_df.if_bytes);
+                       ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
                         ilf->ilf_size++;
                 } else {
                         iip->ili_fields &= ~XFS_ILOG_DDATA;
@@ -288,17 +283,11 @@ xfs_inode_item_format_attr_fork(
  
                 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
                     ip->i_afp->if_bytes > 0) {
-                       /*
-                        * Round i_bytes up to a word boundary.
-                        * The underlying memory is guaranteed
-                        * to be there by xfs_idata_realloc().
-                        */
-                       data_bytes = roundup(ip->i_afp->if_bytes, 4);
                         ASSERT(ip->i_afp->if_u1.if_data != NULL);
                         xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
                                         ip->i_afp->if_u1.if_data,
-                                       data_bytes);
-                       ilf->ilf_asize = (unsigned)data_bytes;
+                                       ip->i_afp->if_bytes);
+                       ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes;
                         ilf->ilf_size++;
                 } else {
                         iip->ili_fields &= ~XFS_ILOG_ADATA;
@@ -359,6 +348,21 @@ xfs_copy_dm_fields_to_log_dinode(
         }
  }
  
+static inline void
+xfs_inode_to_log_dinode_iext_counters(
+       struct xfs_inode        *ip,
+       struct xfs_log_dinode   *to)
+{
+       if (xfs_inode_has_large_extent_counts(ip)) {
+               to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
+               to->di_big_anextents = xfs_ifork_nextents(ip->i_afp);
+               to->di_nrext64_pad = 0;
+       } else {
+               to->di_nextents = xfs_ifork_nextents(&ip->i_df);
+               to->di_anextents = xfs_ifork_nextents(ip->i_afp);
+       }
+}
+
  static void
  xfs_inode_to_log_dinode(
         struct xfs_inode        *ip,
@@ -374,7 +378,6 @@ xfs_inode_to_log_dinode(
         to->di_projid_lo = ip->i_projid & 0xffff;
         to->di_projid_hi = ip->i_projid >> 16;
  
-       memset(to->di_pad, 0, sizeof(to->di_pad));
         memset(to->di_pad3, 0, sizeof(to->di_pad3));
         to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
         to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
@@ -386,8 +389,6 @@ xfs_inode_to_log_dinode(
         to->di_size = ip->i_disk_size;
         to->di_nblocks = ip->i_nblocks;
         to->di_extsize = ip->i_extsize;
-       to->di_nextents = xfs_ifork_nextents(&ip->i_df);
-       to->di_anextents = xfs_ifork_nextents(ip->i_afp);
         to->di_forkoff = ip->i_forkoff;
         to->di_aformat = xfs_ifork_format(ip->i_afp);
         to->di_flags = ip->i_diflags;
@@ -407,11 +408,14 @@ xfs_inode_to_log_dinode(
                 to->di_lsn = lsn;
                 memset(to->di_pad2, 0, sizeof(to->di_pad2));
                 uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
-               to->di_flushiter = 0;
+               to->di_v3_pad = 0;
         } else {
                 to->di_version = 2;
                 to->di_flushiter = ip->i_flushiter;
+               memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
         }
+
+       xfs_inode_to_log_dinode_iext_counters(ip, to);
  }
  
  /*
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c

index 239dd2e..d28ffae 100644 (file)
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -142,6 +142,29 @@ xfs_log_dinode_to_disk_ts(
         return ts;
  }
  
+static inline bool xfs_log_dinode_has_large_extent_counts(
+               const struct xfs_log_dinode *ld)
+{
+       return ld->di_version >= 3 &&
+              (ld->di_flags2 & XFS_DIFLAG2_NREXT64);
+}
+
+static inline void
+xfs_log_dinode_to_disk_iext_counters(
+       struct xfs_log_dinode   *from,
+       struct xfs_dinode       *to)
+{
+       if (xfs_log_dinode_has_large_extent_counts(from)) {
+               to->di_big_nextents = cpu_to_be64(from->di_big_nextents);
+               to->di_big_anextents = cpu_to_be32(from->di_big_anextents);
+               to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad);
+       } else {
+               to->di_nextents = cpu_to_be32(from->di_nextents);
+               to->di_anextents = cpu_to_be16(from->di_anextents);
+       }
+
+}
+
  STATIC void
  xfs_log_dinode_to_disk(
         struct xfs_log_dinode   *from,
@@ -158,7 +181,6 @@ xfs_log_dinode_to_disk(
         to->di_nlink = cpu_to_be32(from->di_nlink);
         to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
         to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
-       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
  
         to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
         to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
@@ -167,8 +189,6 @@ xfs_log_dinode_to_disk(
         to->di_size = cpu_to_be64(from->di_size);
         to->di_nblocks = cpu_to_be64(from->di_nblocks);
         to->di_extsize = cpu_to_be32(from->di_extsize);
-       to->di_nextents = cpu_to_be32(from->di_nextents);
-       to->di_anextents = cpu_to_be16(from->di_anextents);
         to->di_forkoff = from->di_forkoff;
         to->di_aformat = from->di_aformat;
         to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
@@ -184,12 +204,66 @@ xfs_log_dinode_to_disk(
                 to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
                 to->di_ino = cpu_to_be64(from->di_ino);
                 to->di_lsn = cpu_to_be64(lsn);
-               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+               memset(to->di_pad2, 0, sizeof(to->di_pad2));
                 uuid_copy(&to->di_uuid, &from->di_uuid);
-               to->di_flushiter = 0;
+               to->di_v3_pad = 0;
         } else {
                 to->di_flushiter = cpu_to_be16(from->di_flushiter);
+               memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
         }
+
+       xfs_log_dinode_to_disk_iext_counters(from, to);
+}
+
+STATIC int
+xlog_dinode_verify_extent_counts(
+       struct xfs_mount        *mp,
+       struct xfs_log_dinode   *ldip)
+{
+       xfs_extnum_t            nextents;
+       xfs_aextnum_t           anextents;
+
+       if (xfs_log_dinode_has_large_extent_counts(ldip)) {
+               if (!xfs_has_large_extent_counts(mp) ||
+                   (ldip->di_nrext64_pad != 0)) {
+                       XFS_CORRUPTION_ERROR(
+                               "Bad log dinode large extent count format",
+                               XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+                       xfs_alert(mp,
+                               "Bad inode 0x%llx, large extent counts %d, padding 0x%x",
+                               ldip->di_ino, xfs_has_large_extent_counts(mp),
+                               ldip->di_nrext64_pad);
+                       return -EFSCORRUPTED;
+               }
+
+               nextents = ldip->di_big_nextents;
+               anextents = ldip->di_big_anextents;
+       } else {
+               if (ldip->di_version == 3 && ldip->di_v3_pad != 0) {
+                       XFS_CORRUPTION_ERROR(
+                               "Bad log dinode di_v3_pad",
+                               XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+                       xfs_alert(mp,
+                               "Bad inode 0x%llx, di_v3_pad 0x%llx",
+                               ldip->di_ino, ldip->di_v3_pad);
+                       return -EFSCORRUPTED;
+               }
+
+               nextents = ldip->di_nextents;
+               anextents = ldip->di_anextents;
+       }
+
+       if (unlikely(nextents + anextents > ldip->di_nblocks)) {
+               XFS_CORRUPTION_ERROR("Bad log dinode extent counts",
+                               XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+               xfs_alert(mp,
+                       "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx",
+                       ldip->di_ino, xfs_has_large_extent_counts(mp), nextents,
+                       anextents, ldip->di_nblocks);
+               return -EFSCORRUPTED;
+       }
+
+       return 0;
  }
  
  STATIC int
@@ -317,13 +391,12 @@ xlog_recover_inode_commit_pass2(
         if (unlikely(S_ISREG(ldip->di_mode))) {
                 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
                     (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
-                       XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
-                                        XFS_ERRLEVEL_LOW, mp, ldip,
-                                        sizeof(*ldip));
+                       XFS_CORRUPTION_ERROR(
+                               "Bad log dinode data fork format for regular file",
+                               XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
                         xfs_alert(mp,
-               "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
-               "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
-                               __func__, item, dip, bp, in_f->ilf_ino);
+                               "Bad inode 0x%llx, data fork format 0x%x",
+                               in_f->ilf_ino, ldip->di_format);
                         error = -EFSCORRUPTED;
                         goto out_release;
                 }
@@ -331,49 +404,37 @@ xlog_recover_inode_commit_pass2(
                 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
                     (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
                     (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
-                       XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
-                                            XFS_ERRLEVEL_LOW, mp, ldip,
-                                            sizeof(*ldip));
+                       XFS_CORRUPTION_ERROR(
+                               "Bad log dinode data fork format for directory",
+                               XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
                         xfs_alert(mp,
-               "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
-               "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
-                               __func__, item, dip, bp, in_f->ilf_ino);
+                               "Bad inode 0x%llx, data fork format 0x%x",
+                               in_f->ilf_ino, ldip->di_format);
                         error = -EFSCORRUPTED;
                         goto out_release;
                 }
         }
-       if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
-               XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
-                                    XFS_ERRLEVEL_LOW, mp, ldip,
-                                    sizeof(*ldip));
-               xfs_alert(mp,
-       "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
-       "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
-                       __func__, item, dip, bp, in_f->ilf_ino,
-                       ldip->di_nextents + ldip->di_anextents,
-                       ldip->di_nblocks);
-               error = -EFSCORRUPTED;
+
+       error = xlog_dinode_verify_extent_counts(mp, ldip);
+       if (error)
                 goto out_release;
-       }
+
         if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
-               XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
-                                    XFS_ERRLEVEL_LOW, mp, ldip,
-                                    sizeof(*ldip));
+               XFS_CORRUPTION_ERROR("Bad log dinode fork offset",
+                               XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
                 xfs_alert(mp,
-       "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
-       "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
-                       item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
+                       "Bad inode 0x%llx, di_forkoff 0x%x",
+                       in_f->ilf_ino, ldip->di_forkoff);
                 error = -EFSCORRUPTED;
                 goto out_release;
         }
         isize = xfs_log_dinode_size(mp);
         if (unlikely(item->ri_buf[1].i_len > isize)) {
-               XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
-                                    XFS_ERRLEVEL_LOW, mp, ldip,
-                                    sizeof(*ldip));
+               XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
+                                    mp, ldip, sizeof(*ldip));
                 xfs_alert(mp,
-                       "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
-                       __func__, item->ri_buf[1].i_len, item);
+                       "Bad inode 0x%llx log dinode size 0x%x",
+                       in_f->ilf_ino, item->ri_buf[1].i_len);
                 error = -EFSCORRUPTED;
                 goto out_release;
         }
@@ -401,7 +462,7 @@ xlog_recover_inode_commit_pass2(
         ASSERT(in_f->ilf_size <= 4);
         ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
         ASSERT(!(fields & XFS_ILOG_DFORK) ||
-              (len == in_f->ilf_dsize));
+              (len == xlog_calc_iovec_len(in_f->ilf_dsize)));
  
         switch (fields & XFS_ILOG_DFORK) {
         case XFS_ILOG_DDATA:
@@ -436,7 +497,7 @@ xlog_recover_inode_commit_pass2(
                 }
                 len = item->ri_buf[attr_index].i_len;
                 src = item->ri_buf[attr_index].i_addr;
-               ASSERT(len == in_f->ilf_asize);
+               ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
  
                 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
                 case XFS_ILOG_ADATA:
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c

index 8348100..0e5cb79 100644 (file)
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -15,6 +15,8 @@
  #include "xfs_iwalk.h"
  #include "xfs_itable.h"
  #include "xfs_error.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
  #include "xfs_attr.h"
  #include "xfs_bmap.h"
  #include "xfs_bmap_util.h"
@@ -35,8 +37,6 @@
  #include "xfs_health.h"
  #include "xfs_reflink.h"
  #include "xfs_ioctl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
  
  #include <linux/mount.h>
  #include <linux/namei.h>
@@ -813,6 +813,9 @@ xfs_bulk_ireq_setup(
         if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
                 return -ECANCELED;
  
+       if (hdr->flags & XFS_BULK_IREQ_NREXT64)
+               breq->flags |= XFS_IBULK_NREXT64;
+
         return 0;
  }
  
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c

index ca25ed8..2f54b70 100644 (file)
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -17,6 +17,8 @@
  #include "xfs_itable.h"
  #include "xfs_fsops.h"
  #include "xfs_rtalloc.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
  #include "xfs_attr.h"
  #include "xfs_ioctl.h"
  #include "xfs_ioctl32.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c

index e552ce5..5a39325 100644 (file)
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -251,6 +251,8 @@ xfs_iomap_write_direct(
                 return error;
  
         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
+       if (error == -EFBIG)
+               error = xfs_iext_count_upgrade(tp, ip, nr_exts);
         if (error)
                 goto out_trans_cancel;
  
@@ -402,7 +404,7 @@ xfs_iomap_prealloc_size(
          */
         plen = prev.br_blockcount;
         while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
-               if (plen > MAXEXTLEN / 2 ||
+               if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
                     isnullstartblock(got.br_startblock) ||
                     got.br_startoff + got.br_blockcount != prev.br_startoff ||
                     got.br_startblock + got.br_blockcount != prev.br_startblock)
@@ -414,23 +416,23 @@ xfs_iomap_prealloc_size(
         /*
          * If the size of the extents is greater than half the maximum extent
          * length, then use the current offset as the basis.  This ensures that
-        * for large files the preallocation size always extends to MAXEXTLEN
-        * rather than falling short due to things like stripe unit/width
-        * alignment of real extents.
+        * for large files the preallocation size always extends to
+        * XFS_MAX_BMBT_EXTLEN rather than falling short due to things like stripe
+        * unit/width alignment of real extents.
          */
         alloc_blocks = plen * 2;
-       if (alloc_blocks > MAXEXTLEN)
+       if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
                 alloc_blocks = XFS_B_TO_FSB(mp, offset);
         qblocks = alloc_blocks;
  
         /*
-        * MAXEXTLEN is not a power of two value but we round the prealloc down
-        * to the nearest power of two value after throttling. To prevent the
-        * round down from unconditionally reducing the maximum supported
-        * prealloc size, we round up first, apply appropriate throttling,
-        * round down and cap the value to MAXEXTLEN.
+        * XFS_MAX_BMBT_EXTLEN is not a power of two value but we round the prealloc
+        * down to the nearest power of two value after throttling. To prevent
+        * the round down from unconditionally reducing the maximum supported
+        * prealloc size, we round up first, apply appropriate throttling, round
+        * down and cap the value to XFS_MAX_BMBT_EXTLEN.
          */
-       alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+       alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
                                        alloc_blocks);
  
         freesp = percpu_counter_read_positive(&mp->m_fdblocks);
@@ -478,14 +480,14 @@ xfs_iomap_prealloc_size(
          */
         if (alloc_blocks)
                 alloc_blocks = rounddown_pow_of_two(alloc_blocks);
-       if (alloc_blocks > MAXEXTLEN)
-               alloc_blocks = MAXEXTLEN;
+       if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
+               alloc_blocks = XFS_MAX_BMBT_EXTLEN;
  
         /*
          * If we are still trying to allocate more space than is
          * available, squash the prealloc hard. This can happen if we
          * have a large file on a small filesystem and the above
-        * lowspace thresholds are smaller than MAXEXTLEN.
+        * lowspace thresholds are smaller than XFS_MAX_BMBT_EXTLEN.
          */
         while (alloc_blocks && alloc_blocks >= freesp)
                 alloc_blocks >>= 4;
@@ -555,6 +557,9 @@ xfs_iomap_write_unwritten(
  
                 error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
                                 XFS_IEXT_WRITE_UNWRITTEN_CNT);
+               if (error == -EFBIG)
+                       error = xfs_iext_count_upgrade(tp, ip,
+                                       XFS_IEXT_WRITE_UNWRITTEN_CNT);
                 if (error)
                         goto error_on_bmapi_transaction;
  
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c

index b34e8e4..e912b7f 100644 (file)
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -13,6 +13,8 @@
  #include "xfs_inode.h"
  #include "xfs_acl.h"
  #include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
  #include "xfs_attr.h"
  #include "xfs_trans.h"
  #include "xfs_trace.h"
@@ -209,7 +211,6 @@ xfs_generic_create(
         if (unlikely(error))
                 goto out_cleanup_inode;
  
-#ifdef CONFIG_XFS_POSIX_ACL
         if (default_acl) {
                 error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
                 if (error)
@@ -220,7 +221,6 @@ xfs_generic_create(
                 if (error)
                         goto out_cleanup_inode;
         }
-#endif
  
         xfs_setup_iops(ip);
  
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c

index c08c79d..f74c9ff 100644 (file)
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -64,6 +64,7 @@ xfs_bulkstat_one_int(
         struct xfs_inode        *ip;            /* incore inode pointer */
         struct inode            *inode;
         struct xfs_bulkstat     *buf = bc->buf;
+       xfs_extnum_t            nextents;
         int                     error = -EINVAL;
  
         if (xfs_internal_inum(mp, ino))
@@ -102,7 +103,13 @@ xfs_bulkstat_one_int(
  
         buf->bs_xflags = xfs_ip2xflags(ip);
         buf->bs_extsize_blks = ip->i_extsize;
-       buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
+
+       nextents = xfs_ifork_nextents(&ip->i_df);
+       if (!(bc->breq->flags & XFS_IBULK_NREXT64))
+               buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL);
+       else
+               buf->bs_extents64 = nextents;
+
         xfs_bulkstat_health(ip, buf);
         buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
         buf->bs_forkoff = XFS_IFORK_BOFF(ip);
@@ -256,6 +263,7 @@ xfs_bulkstat(
                 .breq           = breq,
         };
         struct xfs_trans        *tp;
+       unsigned int            iwalk_flags = 0;
         int                     error;
  
         if (breq->mnt_userns != &init_user_ns) {
@@ -279,7 +287,10 @@ xfs_bulkstat(
         if (error)
                 goto out;
  
-       error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags,
+       if (breq->flags & XFS_IBULK_SAME_AG)
+               iwalk_flags |= XFS_IWALK_SAME_AG;
+
+       error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
                         xfs_bulkstat_iwalk, breq->icount, &bc);
         xfs_trans_cancel(tp);
  out:
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h

index 7078d10..e2d0eba 100644 (file)
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -17,7 +17,10 @@ struct xfs_ibulk {
  };
  
  /* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG      (XFS_IWALK_SAME_AG)
+#define XFS_IBULK_SAME_AG      (1U << 0)
+
+/* Fill out the bs_extents64 field if set. */
+#define XFS_IBULK_NREXT64      (1U << 1)
  
  /*
   * Advance the user buffer pointer by one record of the given size.  If the
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h

index 37a795f..8369908 100644 (file)
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
                 unsigned int inode_records, bool poll, void *data);
  
  /* Only iterate inodes within the same AG as @startino. */
-#define XFS_IWALK_SAME_AG      (0x1)
+#define XFS_IWALK_SAME_AG      (1U << 0)
  
  #define XFS_IWALK_FLAGS_ALL    (XFS_IWALK_SAME_AG)
  
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c

index 499e15b..9dc748a 100644 (file)
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -49,7 +49,6 @@ xlog_state_get_iclog_space(
         int                     len,
         struct xlog_in_core     **iclog,
         struct xlog_ticket      *ticket,
-       int                     *continued_write,
         int                     *logoffsetp);
  STATIC void
  xlog_grant_push_ail(
@@ -61,10 +60,6 @@ xlog_sync(
         struct xlog_in_core     *iclog);
  #if defined(DEBUG)
  STATIC void
-xlog_verify_dest_ptr(
-       struct xlog             *log,
-       void                    *ptr);
-STATIC void
  xlog_verify_grant_tail(
         struct xlog *log);
  STATIC void
@@ -77,7 +72,6 @@ xlog_verify_tail_lsn(
         struct xlog             *log,
         struct xlog_in_core     *iclog);
  #else
-#define xlog_verify_dest_ptr(a,b)
  #define xlog_verify_grant_tail(a)
  #define xlog_verify_iclog(a,b,c)
  #define xlog_verify_tail_lsn(a,b)
@@ -90,6 +84,62 @@ xlog_iclogs_empty(
  static int
  xfs_log_cover(struct xfs_mount *);
  
+/*
+ * We need to make sure the buffer pointer returned is naturally aligned for the
+ * biggest basic data type we put into it. We have already accounted for this
+ * padding when sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ *
+ * We also add space for the xlog_op_header that describes this region in the
+ * log. This prepends the data region we return to the caller to copy their data
+ * into, so do all the static initialisation of the ophdr now. Because the ophdr
+ * is not 8 byte aligned, we have to be careful to ensure that we align the
+ * start of the buffer such that the region we return to the caller is 8 byte
+ * aligned and packed against the tail of the ophdr.
+ */
+void *
+xlog_prepare_iovec(
+       struct xfs_log_vec      *lv,
+       struct xfs_log_iovec    **vecp,
+       uint                    type)
+{
+       struct xfs_log_iovec    *vec = *vecp;
+       struct xlog_op_header   *oph;
+       uint32_t                len;
+       void                    *buf;
+
+       if (vec) {
+               ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+               vec++;
+       } else {
+               vec = &lv->lv_iovecp[0];
+       }
+
+       len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+       if (!IS_ALIGNED(len, sizeof(uint64_t))) {
+               lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+                                       sizeof(struct xlog_op_header);
+       }
+
+       vec->i_type = type;
+       vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+       oph = vec->i_addr;
+       oph->oh_clientid = XFS_TRANSACTION;
+       oph->oh_res2 = 0;
+       oph->oh_flags = 0;
+
+       buf = vec->i_addr + sizeof(struct xlog_op_header);
+       ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+
+       *vecp = vec;
+       return buf;
+}
+
  static void
  xlog_grant_sub_space(
         struct xlog             *log,
@@ -322,30 +372,6 @@ xlog_grant_head_check(
         return error;
  }
  
-static void
-xlog_tic_reset_res(xlog_ticket_t *tic)
-{
-       tic->t_res_num = 0;
-       tic->t_res_arr_sum = 0;
-       tic->t_res_num_ophdrs = 0;
-}
-
-static void
-xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
-{
-       if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
-               /* add to overflow and start again */
-               tic->t_res_o_flow += tic->t_res_arr_sum;
-               tic->t_res_num = 0;
-               tic->t_res_arr_sum = 0;
-       }
-
-       tic->t_res_arr[tic->t_res_num].r_len = len;
-       tic->t_res_arr[tic->t_res_num].r_type = type;
-       tic->t_res_arr_sum += len;
-       tic->t_res_num++;
-}
-
  bool
  xfs_log_writable(
         struct xfs_mount        *mp)
@@ -395,8 +421,6 @@ xfs_log_regrant(
         xlog_grant_push_ail(log, tic->t_unit_res);
  
         tic->t_curr_res = tic->t_unit_res;
-       xlog_tic_reset_res(tic);
-
         if (tic->t_cnt > 0)
                 return 0;
  
@@ -434,10 +458,9 @@ out_error:
  int
  xfs_log_reserve(
         struct xfs_mount        *mp,
-       int                     unit_bytes,
-       int                     cnt,
+       int                     unit_bytes,
+       int                     cnt,
         struct xlog_ticket      **ticp,
-       uint8_t                 client,
         bool                    permanent)
  {
         struct xlog             *log = mp->m_log;
@@ -445,15 +468,13 @@ xfs_log_reserve(
         int                     need_bytes;
         int                     error = 0;
  
-       ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
         if (xlog_is_shutdown(log))
                 return -EIO;
  
         XFS_STATS_INC(mp, xs_try_logspace);
  
         ASSERT(*ticp == NULL);
-       tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
+       tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent);
         *ticp = tic;
  
         xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -901,12 +922,22 @@ xlog_write_unmount_record(
         struct xlog             *log,
         struct xlog_ticket      *ticket)
  {
-       struct xfs_unmount_log_format ulf = {
-               .magic = XLOG_UNMOUNT_TYPE,
+       struct  {
+               struct xlog_op_header ophdr;
+               struct xfs_unmount_log_format ulf;
+       } unmount_rec = {
+               .ophdr = {
+                       .oh_clientid = XFS_LOG,
+                       .oh_tid = cpu_to_be32(ticket->t_tid),
+                       .oh_flags = XLOG_UNMOUNT_TRANS,
+               },
+               .ulf = {
+                       .magic = XLOG_UNMOUNT_TYPE,
+               },
         };
         struct xfs_log_iovec reg = {
-               .i_addr = &ulf,
-               .i_len = sizeof(ulf),
+               .i_addr = &unmount_rec,
+               .i_len = sizeof(unmount_rec),
                 .i_type = XLOG_REG_TYPE_UNMOUNT,
         };
         struct xfs_log_vec vec = {
@@ -914,10 +945,14 @@ xlog_write_unmount_record(
                 .lv_iovecp = &reg,
         };
  
+       BUILD_BUG_ON((sizeof(struct xlog_op_header) +
+                     sizeof(struct xfs_unmount_log_format)) !=
+                                                       sizeof(unmount_rec));
+
         /* account for space used by record data */
-       ticket->t_curr_res -= sizeof(ulf);
+       ticket->t_curr_res -= sizeof(unmount_rec);
  
-       return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS);
+       return xlog_write(log, NULL, &vec, ticket, reg.i_len);
  }
  
  /*
@@ -933,7 +968,7 @@ xlog_unmount_write(
         struct xlog_ticket      *tic = NULL;
         int                     error;
  
-       error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+       error = xfs_log_reserve(mp, 600, 1, &tic, 0);
         if (error)
                 goto out_err;
  
@@ -1584,9 +1619,6 @@ xlog_alloc_log(
                                 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
                 if (!iclog->ic_data)
                         goto out_free_iclog;
-#ifdef DEBUG
-               log->l_iclog_bak[i] = &iclog->ic_header;
-#endif
                 head = &iclog->ic_header;
                 memset(head, 0, sizeof(xlog_rec_header_t));
                 head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -1602,7 +1634,7 @@ xlog_alloc_log(
                 iclog->ic_log = log;
                 atomic_set(&iclog->ic_refcnt, 0);
                 INIT_LIST_HEAD(&iclog->ic_callbacks);
-               iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
+               iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
  
                 init_waitqueue_head(&iclog->ic_force_wait);
                 init_waitqueue_head(&iclog->ic_write_wait);
@@ -2111,63 +2143,11 @@ xlog_print_tic_res(
         struct xfs_mount        *mp,
         struct xlog_ticket      *ticket)
  {
-       uint i;
-       uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
-
-       /* match with XLOG_REG_TYPE_* in xfs_log.h */
-#define REG_TYPE_STR(type, str)        [XLOG_REG_TYPE_##type] = str
-       static char *res_type_str[] = {
-           REG_TYPE_STR(BFORMAT, "bformat"),
-           REG_TYPE_STR(BCHUNK, "bchunk"),
-           REG_TYPE_STR(EFI_FORMAT, "efi_format"),
-           REG_TYPE_STR(EFD_FORMAT, "efd_format"),
-           REG_TYPE_STR(IFORMAT, "iformat"),
-           REG_TYPE_STR(ICORE, "icore"),
-           REG_TYPE_STR(IEXT, "iext"),
-           REG_TYPE_STR(IBROOT, "ibroot"),
-           REG_TYPE_STR(ILOCAL, "ilocal"),
-           REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
-           REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
-           REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
-           REG_TYPE_STR(QFORMAT, "qformat"),
-           REG_TYPE_STR(DQUOT, "dquot"),
-           REG_TYPE_STR(QUOTAOFF, "quotaoff"),
-           REG_TYPE_STR(LRHEADER, "LR header"),
-           REG_TYPE_STR(UNMOUNT, "unmount"),
-           REG_TYPE_STR(COMMIT, "commit"),
-           REG_TYPE_STR(TRANSHDR, "trans header"),
-           REG_TYPE_STR(ICREATE, "inode create"),
-           REG_TYPE_STR(RUI_FORMAT, "rui_format"),
-           REG_TYPE_STR(RUD_FORMAT, "rud_format"),
-           REG_TYPE_STR(CUI_FORMAT, "cui_format"),
-           REG_TYPE_STR(CUD_FORMAT, "cud_format"),
-           REG_TYPE_STR(BUI_FORMAT, "bui_format"),
-           REG_TYPE_STR(BUD_FORMAT, "bud_format"),
-       };
-       BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
-#undef REG_TYPE_STR
-
         xfs_warn(mp, "ticket reservation summary:");
-       xfs_warn(mp, "  unit res    = %d bytes",
-                ticket->t_unit_res);
-       xfs_warn(mp, "  current res = %d bytes",
-                ticket->t_curr_res);
-       xfs_warn(mp, "  total reg   = %u bytes (o/flow = %u bytes)",
-                ticket->t_res_arr_sum, ticket->t_res_o_flow);
-       xfs_warn(mp, "  ophdrs      = %u (ophdr space = %u bytes)",
-                ticket->t_res_num_ophdrs, ophdr_spc);
-       xfs_warn(mp, "  ophdr + reg = %u bytes",
-                ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
-       xfs_warn(mp, "  num regions = %u",
-                ticket->t_res_num);
-
-       for (i = 0; i < ticket->t_res_num; i++) {
-               uint r_type = ticket->t_res_arr[i].r_type;
-               xfs_warn(mp, "region[%u]: %s - %u bytes", i,
-                           ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
-                           "bad-rtype" : res_type_str[r_type]),
-                           ticket->t_res_arr[i].r_len);
-       }
+       xfs_warn(mp, "  unit res    = %d bytes", ticket->t_unit_res);
+       xfs_warn(mp, "  current res = %d bytes", ticket->t_curr_res);
+       xfs_warn(mp, "  original count  = %d", ticket->t_ocnt);
+       xfs_warn(mp, "  remaining count = %d", ticket->t_cnt);
  }
  
  /*
@@ -2220,187 +2200,226 @@ xlog_print_trans(
         }
  }
  
+static inline void
+xlog_write_iovec(
+       struct xlog_in_core     *iclog,
+       uint32_t                *log_offset,
+       void                    *data,
+       uint32_t                write_len,
+       int                     *bytes_left,
+       uint32_t                *record_cnt,
+       uint32_t                *data_cnt)
+{
+       ASSERT(*log_offset < iclog->ic_log->l_iclog_size);
+       ASSERT(*log_offset % sizeof(int32_t) == 0);
+       ASSERT(write_len % sizeof(int32_t) == 0);
+
+       memcpy(iclog->ic_datap + *log_offset, data, write_len);
+       *log_offset += write_len;
+       *bytes_left -= write_len;
+       (*record_cnt)++;
+       *data_cnt += write_len;
+}
+
  /*
- * Calculate the potential space needed by the log vector.  We may need a start
- * record, and each region gets its own struct xlog_op_header and may need to be
- * double word aligned.
+ * Write log vectors into a single iclog which is guaranteed by the caller
+ * to have enough space to write the entire log vector into.
   */
-static int
-xlog_write_calc_vec_length(
+static void
+xlog_write_full(
+       struct xfs_log_vec      *lv,
         struct xlog_ticket      *ticket,
-       struct xfs_log_vec      *log_vector,
-       uint                    optype)
+       struct xlog_in_core     *iclog,
+       uint32_t                *log_offset,
+       uint32_t                *len,
+       uint32_t                *record_cnt,
+       uint32_t                *data_cnt)
  {
-       struct xfs_log_vec      *lv;
-       int                     headers = 0;
-       int                     len = 0;
-       int                     i;
-
-       if (optype & XLOG_START_TRANS)
-               headers++;
-
-       for (lv = log_vector; lv; lv = lv->lv_next) {
-               /* we don't write ordered log vectors */
-               if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
-                       continue;
+       int                     index;
  
-               headers += lv->lv_niovecs;
+       ASSERT(*log_offset + *len <= iclog->ic_size ||
+               iclog->ic_state == XLOG_STATE_WANT_SYNC);
  
-               for (i = 0; i < lv->lv_niovecs; i++) {
-                       struct xfs_log_iovec    *vecp = &lv->lv_iovecp[i];
+       /*
+        * Ordered log vectors have no regions to write so this
+        * loop will naturally skip them.
+        */
+       for (index = 0; index < lv->lv_niovecs; index++) {
+               struct xfs_log_iovec    *reg = &lv->lv_iovecp[index];
+               struct xlog_op_header   *ophdr = reg->i_addr;
  
-                       len += vecp->i_len;
-                       xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
-               }
+               ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+               xlog_write_iovec(iclog, log_offset, reg->i_addr,
+                               reg->i_len, len, record_cnt, data_cnt);
         }
-
-       ticket->t_res_num_ophdrs += headers;
-       len += headers * sizeof(struct xlog_op_header);
-
-       return len;
-}
-
-static void
-xlog_write_start_rec(
-       struct xlog_op_header   *ophdr,
-       struct xlog_ticket      *ticket)
-{
-       ophdr->oh_tid   = cpu_to_be32(ticket->t_tid);
-       ophdr->oh_clientid = ticket->t_clientid;
-       ophdr->oh_len = 0;
-       ophdr->oh_flags = XLOG_START_TRANS;
-       ophdr->oh_res2 = 0;
  }
  
-static xlog_op_header_t *
-xlog_write_setup_ophdr(
-       struct xlog             *log,
-       struct xlog_op_header   *ophdr,
+static int
+xlog_write_get_more_iclog_space(
         struct xlog_ticket      *ticket,
-       uint                    flags)
+       struct xlog_in_core     **iclogp,
+       uint32_t                *log_offset,
+       uint32_t                len,
+       uint32_t                *record_cnt,
+       uint32_t                *data_cnt)
  {
-       ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
-       ophdr->oh_clientid = ticket->t_clientid;
-       ophdr->oh_res2 = 0;
-
-       /* are we copying a commit or unmount record? */
-       ophdr->oh_flags = flags;
+       struct xlog_in_core     *iclog = *iclogp;
+       struct xlog             *log = iclog->ic_log;
+       int                     error;
  
-       /*
-        * We've seen logs corrupted with bad transaction client ids.  This
-        * makes sure that XFS doesn't generate them on.  Turn this into an EIO
-        * and shut down the filesystem.
-        */
-       switch (ophdr->oh_clientid)  {
-       case XFS_TRANSACTION:
-       case XFS_VOLUME:
-       case XFS_LOG:
-               break;
-       default:
-               xfs_warn(log->l_mp,
-                       "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT,
-                       ophdr->oh_clientid, ticket);
-               return NULL;
-       }
+       spin_lock(&log->l_icloglock);
+       ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
+       xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+       error = xlog_state_release_iclog(log, iclog);
+       spin_unlock(&log->l_icloglock);
+       if (error)
+               return error;
  
-       return ophdr;
+       error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+                                       log_offset);
+       if (error)
+               return error;
+       *record_cnt = 0;
+       *data_cnt = 0;
+       *iclogp = iclog;
+       return 0;
  }
  
  /*
- * Set up the parameters of the region copy into the log. This has
- * to handle region write split across multiple log buffers - this
- * state is kept external to this function so that this code can
- * be written in an obvious, self documenting manner.
+ * Write log vectors into a single iclog which is smaller than the current chain
+ * length. We write until we cannot fit a full record into the remaining space
+ * and then stop. We return the log vector that is to be written that cannot
+ * wholly fit in the iclog.
   */
  static int
-xlog_write_setup_copy(
+xlog_write_partial(
+       struct xfs_log_vec      *lv,
         struct xlog_ticket      *ticket,
-       struct xlog_op_header   *ophdr,
-       int                     space_available,
-       int                     space_required,
-       int                     *copy_off,
-       int                     *copy_len,
-       int                     *last_was_partial_copy,
-       int                     *bytes_consumed)
-{
-       int                     still_to_copy;
-
-       still_to_copy = space_required - *bytes_consumed;
-       *copy_off = *bytes_consumed;
-
-       if (still_to_copy <= space_available) {
-               /* write of region completes here */
-               *copy_len = still_to_copy;
-               ophdr->oh_len = cpu_to_be32(*copy_len);
-               if (*last_was_partial_copy)
-                       ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
-               *last_was_partial_copy = 0;
-               *bytes_consumed = 0;
-               return 0;
-       }
+       struct xlog_in_core     **iclogp,
+       uint32_t                *log_offset,
+       uint32_t                *len,
+       uint32_t                *record_cnt,
+       uint32_t                *data_cnt)
+{
+       struct xlog_in_core     *iclog = *iclogp;
+       struct xlog_op_header   *ophdr;
+       int                     index = 0;
+       uint32_t                rlen;
+       int                     error;
  
-       /* partial write of region, needs extra log op header reservation */
-       *copy_len = space_available;
-       ophdr->oh_len = cpu_to_be32(*copy_len);
-       ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
-       if (*last_was_partial_copy)
-               ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
-       *bytes_consumed += *copy_len;
-       (*last_was_partial_copy)++;
+       /* walk the logvec, copying until we run out of space in the iclog */
+       for (index = 0; index < lv->lv_niovecs; index++) {
+               struct xfs_log_iovec    *reg = &lv->lv_iovecp[index];
+               uint32_t                reg_offset = 0;
  
-       /* account for new log op header */
-       ticket->t_curr_res -= sizeof(struct xlog_op_header);
-       ticket->t_res_num_ophdrs++;
+               /*
+                * The first region of a continuation must have a non-zero
+                * length otherwise log recovery will just skip over it and
+                * start recovering from the next opheader it finds. Because we
+                * mark the next opheader as a continuation, recovery will then
+                * incorrectly add the continuation to the previous region and
+                * that breaks stuff.
+                *
+                * Hence if there isn't space for region data after the
+                * opheader, then we need to start afresh with a new iclog.
+                */
+               if (iclog->ic_size - *log_offset <=
+                                       sizeof(struct xlog_op_header)) {
+                       error = xlog_write_get_more_iclog_space(ticket,
+                                       &iclog, log_offset, *len, record_cnt,
+                                       data_cnt);
+                       if (error)
+                               return error;
+               }
  
-       return sizeof(struct xlog_op_header);
-}
+               ophdr = reg->i_addr;
+               rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
  
-static int
-xlog_write_copy_finish(
-       struct xlog             *log,
-       struct xlog_in_core     *iclog,
-       uint                    flags,
-       int                     *record_cnt,
-       int                     *data_cnt,
-       int                     *partial_copy,
-       int                     *partial_copy_len,
-       int                     log_offset)
-{
-       int                     error;
+               ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+               ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
+               if (rlen != reg->i_len)
+                       ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+               xlog_write_iovec(iclog, log_offset, reg->i_addr,
+                               rlen, len, record_cnt, data_cnt);
+
+               /* If we wrote the whole region, move to the next. */
+               if (rlen == reg->i_len)
+                       continue;
  
-       if (*partial_copy) {
                 /*
-                * This iclog has already been marked WANT_SYNC by
-                * xlog_state_get_iclog_space.
+                * We now have a partially written iovec, but it can span
+                * multiple iclogs so we loop here. First we release the iclog
+                * we currently have, then we get a new iclog and add a new
+                * opheader. Then we continue copying from where we were until
+                * we either complete the iovec or fill the iclog. If we
+                * complete the iovec, then we increment the index and go right
+                * back to the top of the outer loop. If we fill the iclog, we
+                * run the inner loop again.
+                *
+                * This is complicated by the tail of a region using all the
+                * space in an iclog and hence requiring us to release the iclog
+                * and get a new one before returning to the outer loop. We must
+                * always guarantee that we exit this inner loop with at least
+                * space for log transaction opheaders left in the current
+                * iclog, hence we cannot just terminate the loop at the end
+                * of the continuation. So we loop while there is no
+                * space left in the current iclog, and check for the end of the
+                * continuation after getting a new iclog.
                  */
-               spin_lock(&log->l_icloglock);
-               xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
-               *record_cnt = 0;
-               *data_cnt = 0;
-               goto release_iclog;
-       }
+               do {
+                       /*
+                        * Ensure we include the continuation opheader in the
+                        * space we need in the new iclog by adding that size
+                        * to the length we require. This continuation opheader
+                        * needs to be accounted to the ticket as the space it
+                        * consumes hasn't been accounted to the lv we are
+                        * writing.
+                        */
+                       error = xlog_write_get_more_iclog_space(ticket,
+                                       &iclog, log_offset,
+                                       *len + sizeof(struct xlog_op_header),
+                                       record_cnt, data_cnt);
+                       if (error)
+                               return error;
  
-       *partial_copy = 0;
-       *partial_copy_len = 0;
+                       ophdr = iclog->ic_datap + *log_offset;
+                       ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+                       ophdr->oh_clientid = XFS_TRANSACTION;
+                       ophdr->oh_res2 = 0;
+                       ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
  
-       if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t))
-               return 0;
+                       ticket->t_curr_res -= sizeof(struct xlog_op_header);
+                       *log_offset += sizeof(struct xlog_op_header);
+                       *data_cnt += sizeof(struct xlog_op_header);
  
-       /* no more space in this iclog - push it. */
-       spin_lock(&log->l_icloglock);
-       xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
-       *record_cnt = 0;
-       *data_cnt = 0;
+                       /*
+                        * If rlen fits in the iclog, then end the region
+                        * continuation. Otherwise we're going around again.
+                        */
+                       reg_offset += rlen;
+                       rlen = reg->i_len - reg_offset;
+                       if (rlen <= iclog->ic_size - *log_offset)
+                               ophdr->oh_flags |= XLOG_END_TRANS;
+                       else
+                               ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
  
-       if (iclog->ic_state == XLOG_STATE_ACTIVE)
-               xlog_state_switch_iclogs(log, iclog, 0);
-       else
-               ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
-                       xlog_is_shutdown(log));
-release_iclog:
-       error = xlog_state_release_iclog(log, iclog);
-       spin_unlock(&log->l_icloglock);
-       return error;
+                       rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
+                       ophdr->oh_len = cpu_to_be32(rlen);
+
+                       xlog_write_iovec(iclog, log_offset,
+                                       reg->i_addr + reg_offset,
+                                       rlen, len, record_cnt, data_cnt);
+
+               } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
+       }
+
+       /*
+        * No more iovecs remain in this logvec, so hand the current iclog back
+        * to the caller so it can go back to fast path copying.
+        */
+       *iclogp = iclog;
+       return 0;
  }
  
  /*
@@ -2449,27 +2468,16 @@ xlog_write(
         struct xfs_cil_ctx      *ctx,
         struct xfs_log_vec      *log_vector,
         struct xlog_ticket      *ticket,
-       uint                    optype)
+       uint32_t                len)
+
  {
         struct xlog_in_core     *iclog = NULL;
         struct xfs_log_vec      *lv = log_vector;
-       struct xfs_log_iovec    *vecp = lv->lv_iovecp;
-       int                     index = 0;
-       int                     len;
-       int                     partial_copy = 0;
-       int                     partial_copy_len = 0;
-       int                     contwr = 0;
-       int                     record_cnt = 0;
-       int                     data_cnt = 0;
+       uint32_t                record_cnt = 0;
+       uint32_t                data_cnt = 0;
         int                     error = 0;
+       int                     log_offset;
  
-       /*
-        * If this is a commit or unmount transaction, we don't need a start
-        * record to be written.  We do, however, have to account for the
-        * commit or unmount header that gets written. Hence we always have
-        * to account for an extra xlog_op_header here.
-        */
-       ticket->t_curr_res -= sizeof(struct xlog_op_header);
         if (ticket->t_curr_res < 0) {
                 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
                      "ctx ticket reservation ran out. Need to up reservation");
@@ -2477,144 +2485,54 @@ xlog_write(
                 xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
         }
  
-       len = xlog_write_calc_vec_length(ticket, log_vector, optype);
-       while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
-               void            *ptr;
-               int             log_offset;
-
-               error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
-                                                  &contwr, &log_offset);
-               if (error)
-                       return error;
+       error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+                                          &log_offset);
+       if (error)
+               return error;
  
-               ASSERT(log_offset <= iclog->ic_size - 1);
-               ptr = iclog->ic_datap + log_offset;
+       ASSERT(log_offset <= iclog->ic_size - 1);
  
-               /*
-                * If we have a context pointer, pass it the first iclog we are
-                * writing to so it can record state needed for iclog write
-                * ordering.
-                */
-               if (ctx) {
-                       xlog_cil_set_ctx_write_state(ctx, iclog);
-                       ctx = NULL;
-               }
+       /*
+        * If we have a context pointer, pass it the first iclog we are
+        * writing to so it can record state needed for iclog write
+        * ordering.
+        */
+       if (ctx)
+               xlog_cil_set_ctx_write_state(ctx, iclog);
  
+       while (lv) {
                 /*
-                * This loop writes out as many regions as can fit in the amount
-                * of space which was allocated by xlog_state_get_iclog_space().
+                * If the entire log vec does not fit in the iclog, punt it to
+                * the partial copy loop which can handle this case.
                  */
-               while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
-                       struct xfs_log_iovec    *reg;
-                       struct xlog_op_header   *ophdr;
-                       int                     copy_len;
-                       int                     copy_off;
-                       bool                    ordered = false;
-                       bool                    wrote_start_rec = false;
-
-                       /* ordered log vectors have no regions to write */
-                       if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
-                               ASSERT(lv->lv_niovecs == 0);
-                               ordered = true;
-                               goto next_lv;
-                       }
-
-                       reg = &vecp[index];
-                       ASSERT(reg->i_len % sizeof(int32_t) == 0);
-                       ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
-
-                       /*
-                        * Before we start formatting log vectors, we need to
-                        * write a start record. Only do this for the first
-                        * iclog we write to.
-                        */
-                       if (optype & XLOG_START_TRANS) {
-                               xlog_write_start_rec(ptr, ticket);
-                               xlog_write_adv_cnt(&ptr, &len, &log_offset,
-                                               sizeof(struct xlog_op_header));
-                               optype &= ~XLOG_START_TRANS;
-                               wrote_start_rec = true;
-                       }
-
-                       ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype);
-                       if (!ophdr)
-                               return -EIO;
-
-                       xlog_write_adv_cnt(&ptr, &len, &log_offset,
-                                          sizeof(struct xlog_op_header));
-
-                       len += xlog_write_setup_copy(ticket, ophdr,
-                                                    iclog->ic_size-log_offset,
-                                                    reg->i_len,
-                                                    &copy_off, &copy_len,
-                                                    &partial_copy,
-                                                    &partial_copy_len);
-                       xlog_verify_dest_ptr(log, ptr);
-
-                       /*
-                        * Copy region.
-                        *
-                        * Unmount records just log an opheader, so can have
-                        * empty payloads with no data region to copy. Hence we
-                        * only copy the payload if the vector says it has data
-                        * to copy.
-                        */
-                       ASSERT(copy_len >= 0);
-                       if (copy_len > 0) {
-                               memcpy(ptr, reg->i_addr + copy_off, copy_len);
-                               xlog_write_adv_cnt(&ptr, &len, &log_offset,
-                                                  copy_len);
-                       }
-                       copy_len += sizeof(struct xlog_op_header);
-                       record_cnt++;
-                       if (wrote_start_rec) {
-                               copy_len += sizeof(struct xlog_op_header);
-                               record_cnt++;
-                       }
-                       data_cnt += contwr ? copy_len : 0;
-
-                       error = xlog_write_copy_finish(log, iclog, optype,
-                                                      &record_cnt, &data_cnt,
-                                                      &partial_copy,
-                                                      &partial_copy_len,
-                                                      log_offset);
-                       if (error)
+               if (lv->lv_niovecs &&
+                   lv->lv_bytes > iclog->ic_size - log_offset) {
+                       error = xlog_write_partial(lv, ticket, &iclog,
+                                       &log_offset, &len, &record_cnt,
+                                       &data_cnt);
+                       if (error) {
+                               /*
+                                * We have no iclog to release, so just return
+                                * the error immediately.
+                                */
                                 return error;
-
-                       /*
-                        * if we had a partial copy, we need to get more iclog
-                        * space but we don't want to increment the region
-                        * index because there is still more is this region to
-                        * write.
-                        *
-                        * If we completed writing this region, and we flushed
-                        * the iclog (indicated by resetting of the record
-                        * count), then we also need to get more log space. If
-                        * this was the last record, though, we are done and
-                        * can just return.
-                        */
-                       if (partial_copy)
-                               break;
-
-                       if (++index == lv->lv_niovecs) {
-next_lv:
-                               lv = lv->lv_next;
-                               index = 0;
-                               if (lv)
-                                       vecp = lv->lv_iovecp;
-                       }
-                       if (record_cnt == 0 && !ordered) {
-                               if (!lv)
-                                       return 0;
-                               break;
                         }
+               } else {
+                       xlog_write_full(lv, ticket, iclog, &log_offset,
+                                        &len, &record_cnt, &data_cnt);
                 }
+               lv = lv->lv_next;
         }
-
         ASSERT(len == 0);
  
+       /*
+        * We've already been guaranteed that the last writes will fit inside
+        * the current iclog, and hence it will already have the space used by
+        * those writes accounted to it. Hence we do not need to update the
+        * iclog with the number of bytes written here.
+        */
         spin_lock(&log->l_icloglock);
-       xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+       xlog_state_finish_copy(log, iclog, record_cnt, 0);
         error = xlog_state_release_iclog(log, iclog);
         spin_unlock(&log->l_icloglock);
  
@@ -2971,7 +2889,6 @@ xlog_state_get_iclog_space(
         int                     len,
         struct xlog_in_core     **iclogp,
         struct xlog_ticket      *ticket,
-       int                     *continued_write,
         int                     *logoffsetp)
  {
         int               log_offset;
@@ -3008,9 +2925,6 @@ restart:
          */
         if (log_offset == 0) {
                 ticket->t_curr_res -= log->l_iclog_hsize;
-               xlog_tic_add_region(ticket,
-                                   log->l_iclog_hsize,
-                                   XLOG_REG_TYPE_LRHEADER);
                 head->h_cycle = cpu_to_be32(log->l_curr_cycle);
                 head->h_lsn = cpu_to_be64(
                         xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
@@ -3052,13 +2966,10 @@ restart:
          * iclogs (to mark it taken), this particular iclog will release/sync
          * to disk in xlog_write().
          */
-       if (len <= iclog->ic_size - iclog->ic_offset) {
-               *continued_write = 0;
+       if (len <= iclog->ic_size - iclog->ic_offset)
                 iclog->ic_offset += len;
-       } else {
-               *continued_write = 1;
+       else
                 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
-       }
         *iclogp = iclog;
  
         ASSERT(iclog->ic_offset <= iclog->ic_size);
@@ -3090,7 +3001,6 @@ xfs_log_ticket_regrant(
         xlog_grant_sub_space(log, &log->l_write_head.grant,
                                         ticket->t_curr_res);
         ticket->t_curr_res = ticket->t_unit_res;
-       xlog_tic_reset_res(ticket);
  
         trace_xfs_log_ticket_regrant_sub(log, ticket);
  
@@ -3101,7 +3011,6 @@ xfs_log_ticket_regrant(
                 trace_xfs_log_ticket_regrant_exit(log, ticket);
  
                 ticket->t_curr_res = ticket->t_unit_res;
-               xlog_tic_reset_res(ticket);
         }
  
         xfs_log_ticket_put(ticket);
@@ -3591,7 +3500,6 @@ xlog_ticket_alloc(
         struct xlog             *log,
         int                     unit_bytes,
         int                     cnt,
-       char                    client,
         bool                    permanent)
  {
         struct xlog_ticket      *tic;
@@ -3609,39 +3517,13 @@ xlog_ticket_alloc(
         tic->t_cnt              = cnt;
         tic->t_ocnt             = cnt;
         tic->t_tid              = prandom_u32();
-       tic->t_clientid         = client;
         if (permanent)
                 tic->t_flags |= XLOG_TIC_PERM_RESERV;
  
-       xlog_tic_reset_res(tic);
-
         return tic;
  }
  
  #if defined(DEBUG)
-/*
- * Make sure that the destination ptr is within the valid data region of
- * one of the iclogs.  This uses backup pointers stored in a different
- * part of the log in case we trash the log structure.
- */
-STATIC void
-xlog_verify_dest_ptr(
-       struct xlog     *log,
-       void            *ptr)
-{
-       int i;
-       int good_ptr = 0;
-
-       for (i = 0; i < log->l_iclog_bufs; i++) {
-               if (ptr >= log->l_iclog_bak[i] &&
-                   ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
-                       good_ptr++;
-       }
-
-       if (!good_ptr)
-               xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
-}
-
  /*
   * Check to make sure the grant write head didn't just over lap the tail.  If
   * the cycles are the same, we can't be overlapping.  Otherwise, make sure that
@@ -3769,7 +3651,7 @@ xlog_verify_iclog(
                 if (field_offset & 0x1ff) {
                         clientid = ophead->oh_clientid;
                 } else {
-                       idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
+                       idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
                         if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
                                 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                                 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3780,11 +3662,12 @@ xlog_verify_iclog(
                                         iclog->ic_header.h_cycle_data[idx]);
                         }
                 }
-               if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+               if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
                         xfs_warn(log->l_mp,
-                               "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx",
-                               __func__, clientid, ophead,
+                               "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
+                               __func__, i, clientid, ophead,
                                 (unsigned long)field_offset);
+               }
  
                 /* check length */
                 p = &ophead->oh_len;
@@ -3792,8 +3675,7 @@ xlog_verify_iclog(
                 if (field_offset & 0x1ff) {
                         op_len = be32_to_cpu(ophead->oh_len);
                 } else {
-                       idx = BTOBBT((uintptr_t)&ophead->oh_len -
-                                   (uintptr_t)iclog->ic_datap);
+                       idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
                         if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
                                 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                                 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,7 +3711,7 @@ xlog_verify_iclog(
  bool
  xlog_force_shutdown(
         struct xlog     *log,
-       int             shutdown_flags)
+       uint32_t        shutdown_flags)
  {
         bool            log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
  
@@ -3995,3 +3877,44 @@ xlog_drop_incompat_feat(
  {
         up_read(&log->l_incompat_users);
  }
+
+/*
+ * Get permission to use log-assisted extended attribute (xattr) updates.
+ *
+ * Callers must not be running any transactions or hold any inode locks, and
+ * they must release the permission by calling xlog_drop_incompat_feat
+ * when they're done.
+ */
+int
+xfs_attr_use_log_assist(
+       struct xfs_mount        *mp)
+{
+       int                     error = 0;
+
+       /*
+        * Protect ourselves from an idle log clearing the logged xattrs log
+        * incompat feature bit.
+        */
+       xlog_use_incompat_feat(mp->m_log);
+
+       /*
+        * If log-assisted xattrs are already enabled, the caller can use the
+        * log-assisted xattr operations with the log-incompat reference we got.
+        */
+       if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+               return 0;
+
+       /* Enable log-assisted xattrs. */
+       error = xfs_add_incompat_log_feature(mp,
+                       XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+       if (error)
+               goto drop_incompat;
+
+       xfs_warn_once(mp,
+"EXPERIMENTAL logged extended attributes feature added. Use at your own risk!");
+
+       return 0;
+drop_incompat:
+       xlog_drop_incompat_feat(mp->m_log);
+       return error;
+}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h

index dc1b77b..252b098 100644 (file)
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -21,46 +21,59 @@ struct xfs_log_vec {
  
  #define XFS_LOG_VEC_ORDERED    (-1)
  
-static inline void *
-xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
-               uint type)
+/*
+ * Calculate the log iovec length for a given user buffer length. Intended to be
+ * used by ->iop_size implementations when sizing buffers of arbitrary
+ * alignments.
+ */
+static inline int
+xlog_calc_iovec_len(int len)
  {
-       struct xfs_log_iovec *vec = *vecp;
+       return roundup(len, sizeof(uint32_t));
+}
  
-       if (vec) {
-               ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
-               vec++;
-       } else {
-               vec = &lv->lv_iovecp[0];
+void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+               uint type);
+
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
+               int data_len)
+{
+       struct xlog_op_header   *oph = vec->i_addr;
+       int                     len;
+
+       /*
+        * Always round up the length to the correct alignment so callers don't
+        * need to know anything about this log vec layout requirement. This
+        * means we have to zero the area the data to be written does not cover.
+        * This is complicated by the fact that the payload region is offset
+        * into the logvec region by the opheader that tracks the payload.
+        */
+       len = xlog_calc_iovec_len(data_len);
+       if (len - data_len != 0) {
+               char    *buf = vec->i_addr + sizeof(struct xlog_op_header);
+
+               memset(buf + data_len, 0, len - data_len);
         }
  
-       vec->i_type = type;
-       vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+       /*
+        * The opheader tracks aligned payload length, whilst the logvec tracks
+        * the overall region length.
+        */
+       oph->oh_len = cpu_to_be32(len);
  
-       ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+       len += sizeof(struct xlog_op_header);
+       lv->lv_buf_len += len;
+       lv->lv_bytes += len;
+       vec->i_len = len;
  
-       *vecp = vec;
-       return vec->i_addr;
+       /* Catch buffer overruns */
+       ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
  }
  
  /*
- * We need to make sure the next buffer is naturally aligned for the biggest
- * basic data type we put into it.  We already accounted for this padding when
- * sizing the buffer.
- *
- * However, this padding does not get written into the log, and hence we have to
- * track the space used by the log vectors separately to prevent log space hangs
- * due to inaccurate accounting (i.e. a leak) of the used log space through the
- * CIL context ticket.
+ * Copy the amount of data requested by the caller into a new log iovec.
   */
-static inline void
-xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
-{
-       lv->lv_buf_len += round_up(len, sizeof(uint64_t));
-       lv->lv_bytes += len;
-       vec->i_len = len;
-}
-
  static inline void *
  xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
                 uint type, void *data, int len)
@@ -117,15 +130,11 @@ int         xfs_log_mount_finish(struct xfs_mount *mp);
  void   xfs_log_mount_cancel(struct xfs_mount *);
  xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
  xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
-void     xfs_log_space_wake(struct xfs_mount *mp);
-int      xfs_log_reserve(struct xfs_mount *mp,
-                         int              length,
-                         int              count,
-                         struct xlog_ticket **ticket,
-                         uint8_t                  clientid,
-                         bool             permanent);
-int      xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-void      xfs_log_unmount(struct xfs_mount *mp);
+void   xfs_log_space_wake(struct xfs_mount *mp);
+int    xfs_log_reserve(struct xfs_mount *mp, int length, int count,
+                       struct xlog_ticket **ticket, bool permanent);
+int    xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
+void   xfs_log_unmount(struct xfs_mount *mp);
  bool   xfs_log_writable(struct xfs_mount *mp);
  
  struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
@@ -140,9 +149,10 @@ void       xfs_log_clean(struct xfs_mount *mp);
  bool   xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
  
  xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
-bool     xlog_force_shutdown(struct xlog *log, int shutdown_flags);
+bool     xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
  
  void xlog_use_incompat_feat(struct xlog *log);
  void xlog_drop_incompat_feat(struct xlog *log);
+int xfs_attr_use_log_assist(struct xfs_mount *mp);
  
  #endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c

index c9f55e4..db6cb78 100644 (file)
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -37,7 +37,7 @@ xlog_cil_ticket_alloc(
  {
         struct xlog_ticket *tic;
  
-       tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);
+       tic = xlog_ticket_alloc(log, 0, 1, 0);
  
         /*
          * set the current reservation to zero so we know to steal the basic
@@ -47,6 +47,38 @@ xlog_cil_ticket_alloc(
         return tic;
  }
  
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+static bool
+xlog_item_in_current_chkpt(
+       struct xfs_cil          *cil,
+       struct xfs_log_item     *lip)
+{
+       if (list_empty(&lip->li_cil))
+               return false;
+
+       /*
+        * li_seq is written on the first commit of a log item to record the
+        * first checkpoint it is written to. Hence if it is different to the
+        * current sequence, we're in a new checkpoint.
+        */
+       return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+}
+
+bool
+xfs_log_item_in_current_chkpt(
+       struct xfs_log_item *lip)
+{
+       return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip);
+}
+
  /*
   * Unavoidable forward declaration - xlog_cil_push_work() calls
   * xlog_cil_ctx_alloc() itself.
@@ -102,39 +134,6 @@ xlog_cil_iovec_space(
                         sizeof(uint64_t));
  }
  
-/*
- * shadow buffers can be large, so we need to use kvmalloc() here to ensure
- * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall
- * back to vmalloc, so we can't actually do anything useful with gfp flags to
- * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do
- * direct reclaim and compaction in the slow path, both of which are
- * horrendously expensive. We just want kmalloc to fail fast and fall back to
- * vmalloc if it can't get somethign straight away from the free lists or buddy
- * allocator. Hence we have to open code kvmalloc outselves here.
- *
- * Also, we are in memalloc_nofs_save task context here, so despite the use of
- * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This
- * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets
- * just all pretend this is a GFP_KERNEL context operation....
- */
-static inline void *
-xlog_cil_kvmalloc(
-       size_t          buf_size)
-{
-       gfp_t           flags = GFP_KERNEL;
-       void            *p;
-
-       flags &= ~__GFP_DIRECT_RECLAIM;
-       flags |= __GFP_NOWARN | __GFP_NORETRY;
-       do {
-               p = kmalloc(buf_size, flags);
-               if (!p)
-                       p = vmalloc(buf_size);
-       } while (!p);
-
-       return p;
-}
-
  /*
   * Allocate or pin log vector buffers for CIL insertion.
   *
@@ -214,13 +213,20 @@ xlog_cil_alloc_shadow_bufs(
                 }
  
                 /*
-                * We 64-bit align the length of each iovec so that the start
-                * of the next one is naturally aligned.  We'll need to
-                * account for that slack space here. Then round nbytes up
-                * to 64-bit alignment so that the initial buffer alignment is
-                * easy to calculate and verify.
+                * We 64-bit align the length of each iovec so that the start of
+                * the next one is naturally aligned.  We'll need to account for
+                * that slack space here.
+                *
+                * We also add the xlog_op_header to each region when
+                * formatting, but that's not accounted to the size of the item
+                * at this point. Hence we'll need an additional number of
+                * bytes for each vector to hold an opheader.
+                *
+                * Then round nbytes up to 64-bit alignment so that the initial
+                * buffer alignment is easy to calculate and verify.
                  */
-               nbytes += niovecs * sizeof(uint64_t);
+               nbytes += niovecs *
+                       (sizeof(uint64_t) + sizeof(struct xlog_op_header));
                 nbytes = round_up(nbytes, sizeof(uint64_t));
  
                 /*
@@ -244,7 +250,7 @@ xlog_cil_alloc_shadow_bufs(
                          * storage.
                          */
                         kmem_free(lip->li_lv_shadow);
-                       lv = xlog_cil_kvmalloc(buf_size);
+                       lv = xlog_kvmalloc(buf_size);
  
                         memset(lv, 0, xlog_cil_iovec_space(niovecs));
  
@@ -277,22 +283,18 @@ xlog_cil_alloc_shadow_bufs(
  
  /*
   * Prepare the log item for insertion into the CIL. Calculate the difference in
- * log space and vectors it will consume, and if it is a new item pin it as
- * well.
+ * log space it will consume, and if it is a new item pin it as well.
   */
  STATIC void
  xfs_cil_prepare_item(
         struct xlog             *log,
         struct xfs_log_vec      *lv,
         struct xfs_log_vec      *old_lv,
-       int                     *diff_len,
-       int                     *diff_iovecs)
+       int                     *diff_len)
  {
         /* Account for the new LV being passed in */
-       if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+       if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
                 *diff_len += lv->lv_bytes;
-               *diff_iovecs += lv->lv_niovecs;
-       }
  
         /*
          * If there is no old LV, this is the first time we've seen the item in
@@ -309,7 +311,6 @@ xfs_cil_prepare_item(
                 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
  
                 *diff_len -= old_lv->lv_bytes;
-               *diff_iovecs -= old_lv->lv_niovecs;
                 lv->lv_item->li_lv_shadow = old_lv;
         }
  
@@ -358,12 +359,10 @@ static void
  xlog_cil_insert_format_items(
         struct xlog             *log,
         struct xfs_trans        *tp,
-       int                     *diff_len,
-       int                     *diff_iovecs)
+       int                     *diff_len)
  {
         struct xfs_log_item     *lip;
  
-
         /* Bail out if we didn't find a log item.  */
         if (list_empty(&tp->t_items)) {
                 ASSERT(0);
@@ -406,7 +405,6 @@ xlog_cil_insert_format_items(
                          * set the item up as though it is a new insertion so
                          * that the space reservation accounting is correct.
                          */
-                       *diff_iovecs -= lv->lv_niovecs;
                         *diff_len -= lv->lv_bytes;
  
                         /* Ensure the lv is set up according to ->iop_size */
@@ -431,7 +429,7 @@ xlog_cil_insert_format_items(
                 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
                 lip->li_ops->iop_format(lip, lv);
  insert:
-               xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
+               xfs_cil_prepare_item(log, lv, old_lv, diff_len);
         }
  }
  
@@ -445,13 +443,13 @@ insert:
  static void
  xlog_cil_insert_items(
         struct xlog             *log,
-       struct xfs_trans        *tp)
+       struct xfs_trans        *tp,
+       uint32_t                released_space)
  {
         struct xfs_cil          *cil = log->l_cilp;
         struct xfs_cil_ctx      *ctx = cil->xc_ctx;
         struct xfs_log_item     *lip;
         int                     len = 0;
-       int                     diff_iovecs = 0;
         int                     iclog_space;
         int                     iovhdr_res = 0, split_res = 0, ctx_res = 0;
  
@@ -461,15 +459,10 @@ xlog_cil_insert_items(
          * We can do this safely because the context can't checkpoint until we
          * are done so it doesn't matter exactly how we update the CIL.
          */
-       xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+       xlog_cil_insert_format_items(log, tp, &len);
  
         spin_lock(&cil->xc_cil_lock);
  
-       /* account for space used by new iovec headers  */
-       iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
-       len += iovhdr_res;
-       ctx->nvecs += diff_iovecs;
-
         /* attach the transaction to the CIL if it has any busy extents */
         if (!list_empty(&tp->t_busy))
                 list_splice_init(&tp->t_busy, &ctx->busy_extents);
@@ -500,7 +493,9 @@ xlog_cil_insert_items(
                 ASSERT(tp->t_ticket->t_curr_res >= len);
         }
         tp->t_ticket->t_curr_res -= len;
+       tp->t_ticket->t_curr_res += released_space;
         ctx->space_used += len;
+       ctx->space_used -= released_space;
  
         /*
          * If we've overrun the reservation, dump the tx details before we move
@@ -822,7 +817,8 @@ restart:
  static int
  xlog_cil_write_chain(
         struct xfs_cil_ctx      *ctx,
-       struct xfs_log_vec      *chain)
+       struct xfs_log_vec      *chain,
+       uint32_t                chain_len)
  {
         struct xlog             *log = ctx->cil->xc_log;
         int                     error;
@@ -830,7 +826,7 @@ xlog_cil_write_chain(
         error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
         if (error)
                 return error;
-       return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS);
+       return xlog_write(log, ctx, chain, ctx->ticket, chain_len);
  }
  
  /*
@@ -844,9 +840,14 @@ xlog_cil_write_commit_record(
         struct xfs_cil_ctx      *ctx)
  {
         struct xlog             *log = ctx->cil->xc_log;
+       struct xlog_op_header   ophdr = {
+               .oh_clientid = XFS_TRANSACTION,
+               .oh_tid = cpu_to_be32(ctx->ticket->t_tid),
+               .oh_flags = XLOG_COMMIT_TRANS,
+       };
         struct xfs_log_iovec    reg = {
-               .i_addr = NULL,
-               .i_len = 0,
+               .i_addr = &ophdr,
+               .i_len = sizeof(struct xlog_op_header),
                 .i_type = XLOG_REG_TYPE_COMMIT,
         };
         struct xfs_log_vec      vec = {
@@ -862,12 +863,138 @@ xlog_cil_write_commit_record(
         if (error)
                 return error;
  
-       error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
+       /* account for space used by record data */
+       ctx->ticket->t_curr_res -= reg.i_len;
+       error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len);
         if (error)
                 xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
         return error;
  }
  
+struct xlog_cil_trans_hdr {
+       struct xlog_op_header   oph[2];
+       struct xfs_trans_header thdr;
+       struct xfs_log_iovec    lhdr[2];
+};
+
+/*
+ * Build a checkpoint transaction header to begin the journal transaction.  We
+ * need to account for the space used by the transaction header here as it is
+ * not accounted for in xlog_write().
+ *
+ * This is the only place we write a transaction header, so we also build the
+ * log opheaders that indicate the start of a log transaction and wrap the
+ * transaction header. We keep the start record in its own log vector rather
+ * than compacting them into a single region as this ends up making the logic
+ * in xlog_write() for handling empty opheaders for start, commit and unmount
+ * records much simpler.
+ */
+static void
+xlog_cil_build_trans_hdr(
+       struct xfs_cil_ctx      *ctx,
+       struct xlog_cil_trans_hdr *hdr,
+       struct xfs_log_vec      *lvhdr,
+       int                     num_iovecs)
+{
+       struct xlog_ticket      *tic = ctx->ticket;
+       __be32                  tid = cpu_to_be32(tic->t_tid);
+
+       memset(hdr, 0, sizeof(*hdr));
+
+       /* Log start record */
+       hdr->oph[0].oh_tid = tid;
+       hdr->oph[0].oh_clientid = XFS_TRANSACTION;
+       hdr->oph[0].oh_flags = XLOG_START_TRANS;
+
+       /* log iovec region pointer */
+       hdr->lhdr[0].i_addr = &hdr->oph[0];
+       hdr->lhdr[0].i_len = sizeof(struct xlog_op_header);
+       hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER;
+
+       /* log opheader */
+       hdr->oph[1].oh_tid = tid;
+       hdr->oph[1].oh_clientid = XFS_TRANSACTION;
+       hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header));
+
+       /* transaction header in host byte order format */
+       hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+       hdr->thdr.th_type = XFS_TRANS_CHECKPOINT;
+       hdr->thdr.th_tid = tic->t_tid;
+       hdr->thdr.th_num_items = num_iovecs;
+
+       /* log iovec region pointer */
+       hdr->lhdr[1].i_addr = &hdr->oph[1];
+       hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) +
+                               sizeof(struct xfs_trans_header);
+       hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR;
+
+       lvhdr->lv_niovecs = 2;
+       lvhdr->lv_iovecp = &hdr->lhdr[0];
+       lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
+       lvhdr->lv_next = ctx->lv_chain;
+
+       tic->t_curr_res -= lvhdr->lv_bytes;
+}
+
+/*
+ * Pull all the log vectors off the items in the CIL, and remove the items from
+ * the CIL. We don't need the CIL lock here because it's only needed on the
+ * transaction commit side which is currently locked out by the flush lock.
+ *
+ * If a log item is marked with a whiteout, we do not need to write it to the
+ * journal and so we just move it to the whiteout list for the caller to
+ * dispose of appropriately.
+ */
+static void
+xlog_cil_build_lv_chain(
+       struct xfs_cil          *cil,
+       struct xfs_cil_ctx      *ctx,
+       struct list_head        *whiteouts,
+       uint32_t                *num_iovecs,
+       uint32_t                *num_bytes)
+{
+       struct xfs_log_vec      *lv = NULL;
+
+       while (!list_empty(&cil->xc_cil)) {
+               struct xfs_log_item     *item;
+
+               item = list_first_entry(&cil->xc_cil,
+                                       struct xfs_log_item, li_cil);
+
+               if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
+                       list_move(&item->li_cil, whiteouts);
+                       trace_xfs_cil_whiteout_skip(item);
+                       continue;
+               }
+
+               list_del_init(&item->li_cil);
+               if (!ctx->lv_chain)
+                       ctx->lv_chain = item->li_lv;
+               else
+                       lv->lv_next = item->li_lv;
+               lv = item->li_lv;
+               item->li_lv = NULL;
+               *num_iovecs += lv->lv_niovecs;
+
+               /* we don't write ordered log vectors */
+               if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+                       *num_bytes += lv->lv_bytes;
+       }
+}
+
+static void
+xlog_cil_cleanup_whiteouts(
+       struct list_head        *whiteouts)
+{
+       while (!list_empty(whiteouts)) {
+               struct xfs_log_item *item = list_first_entry(whiteouts,
+                                               struct xfs_log_item, li_cil);
+               list_del_init(&item->li_cil);
+               trace_xfs_cil_whiteout_unpin(item);
+               item->li_ops->iop_unpin(item, 1);
+       }
+}
+
  /*
   * Push the Committed Item List to the log.
   *
@@ -890,16 +1017,15 @@ xlog_cil_push_work(
                 container_of(work, struct xfs_cil_ctx, push_work);
         struct xfs_cil          *cil = ctx->cil;
         struct xlog             *log = cil->xc_log;
-       struct xfs_log_vec      *lv;
         struct xfs_cil_ctx      *new_ctx;
-       struct xlog_ticket      *tic;
-       int                     num_iovecs;
+       int                     num_iovecs = 0;
+       int                     num_bytes = 0;
         int                     error = 0;
-       struct xfs_trans_header thdr;
-       struct xfs_log_iovec    lhdr;
+       struct xlog_cil_trans_hdr thdr;
         struct xfs_log_vec      lvhdr = { NULL };
         xfs_csn_t               push_seq;
         bool                    push_commit_stable;
+       LIST_HEAD               (whiteouts);
  
         new_ctx = xlog_cil_ctx_alloc();
         new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -968,28 +1094,7 @@ xlog_cil_push_work(
         list_add(&ctx->committing, &cil->xc_committing);
         spin_unlock(&cil->xc_push_lock);
  
-       /*
-        * Pull all the log vectors off the items in the CIL, and remove the
-        * items from the CIL. We don't need the CIL lock here because it's only
-        * needed on the transaction commit side which is currently locked out
-        * by the flush lock.
-        */
-       lv = NULL;
-       num_iovecs = 0;
-       while (!list_empty(&cil->xc_cil)) {
-               struct xfs_log_item     *item;
-
-               item = list_first_entry(&cil->xc_cil,
-                                       struct xfs_log_item, li_cil);
-               list_del_init(&item->li_cil);
-               if (!ctx->lv_chain)
-                       ctx->lv_chain = item->li_lv;
-               else
-                       lv->lv_next = item->li_lv;
-               lv = item->li_lv;
-               item->li_lv = NULL;
-               num_iovecs += lv->lv_niovecs;
-       }
+       xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes);
  
         /*
          * Switch the contexts so we can drop the context lock and move out
@@ -1025,26 +1130,11 @@ xlog_cil_push_work(
          * Build a checkpoint transaction header and write it to the log to
          * begin the transaction. We need to account for the space used by the
          * transaction header here as it is not accounted for in xlog_write().
-        *
-        * The LSN we need to pass to the log items on transaction commit is
-        * the LSN reported by the first log vector write. If we use the commit
-        * record lsn then we can move the tail beyond the grant write head.
          */
-       tic = ctx->ticket;
-       thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
-       thdr.th_type = XFS_TRANS_CHECKPOINT;
-       thdr.th_tid = tic->t_tid;
-       thdr.th_num_items = num_iovecs;
-       lhdr.i_addr = &thdr;
-       lhdr.i_len = sizeof(xfs_trans_header_t);
-       lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
-       tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
-
-       lvhdr.lv_niovecs = 1;
-       lvhdr.lv_iovecp = &lhdr;
-       lvhdr.lv_next = ctx->lv_chain;
-
-       error = xlog_cil_write_chain(ctx, &lvhdr);
+       xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
+       num_bytes += lvhdr.lv_bytes;
+
+       error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes);
         if (error)
                 goto out_abort_free_ticket;
  
@@ -1052,7 +1142,7 @@ xlog_cil_push_work(
         if (error)
                 goto out_abort_free_ticket;
  
-       xfs_log_ticket_ungrant(log, tic);
+       xfs_log_ticket_ungrant(log, ctx->ticket);
  
         /*
          * If the checkpoint spans multiple iclogs, wait for all previous iclogs
@@ -1107,6 +1197,7 @@ xlog_cil_push_work(
         /* Not safe to reference ctx now! */
  
         spin_unlock(&log->l_icloglock);
+       xlog_cil_cleanup_whiteouts(&whiteouts);
         return;
  
  out_skip:
@@ -1116,8 +1207,9 @@ out_skip:
         return;
  
  out_abort_free_ticket:
-       xfs_log_ticket_ungrant(log, tic);
+       xfs_log_ticket_ungrant(log, ctx->ticket);
         ASSERT(xlog_is_shutdown(log));
+       xlog_cil_cleanup_whiteouts(&whiteouts);
         if (!ctx->commit_iclog) {
                 xlog_cil_committed(ctx);
                 return;
@@ -1266,6 +1358,43 @@ xlog_cil_empty(
         return empty;
  }
  
+/*
+ * If there are intent done items in this transaction and the related intent was
+ * committed in the current (same) CIL checkpoint, we don't need to write either
+ * the intent or intent done item to the journal as the change will be
+ * journalled atomically within this checkpoint. As we cannot remove items from
+ * the CIL here, mark the related intent with a whiteout so that the CIL push
+ * can remove it rather than writing it to the journal. Then remove the intent
+ * done item from the current transaction and release it so it doesn't get put
+ * into the CIL at all.
+ */
+static uint32_t
+xlog_cil_process_intents(
+       struct xfs_cil          *cil,
+       struct xfs_trans        *tp)
+{
+       struct xfs_log_item     *lip, *ilip, *next;
+       uint32_t                len = 0;
+
+       list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+               if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE))
+                       continue;
+
+               ilip = lip->li_ops->iop_intent(lip);
+               if (!ilip || !xlog_item_in_current_chkpt(cil, ilip))
+                       continue;
+               set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
+               trace_xfs_cil_whiteout_mark(ilip);
+               len += ilip->li_lv->lv_bytes;
+               kmem_free(ilip->li_lv);
+               ilip->li_lv = NULL;
+
+               xfs_trans_del_item(lip);
+               lip->li_ops->iop_release(lip);
+       }
+       return len;
+}
+
  /*
   * Commit a transaction with the given vector to the Committed Item List.
   *
@@ -1288,6 +1417,7 @@ xlog_cil_commit(
  {
         struct xfs_cil          *cil = log->l_cilp;
         struct xfs_log_item     *lip, *next;
+       uint32_t                released_space = 0;
  
         /*
          * Do all necessary memory allocation before we lock the CIL.
@@ -1299,7 +1429,10 @@ xlog_cil_commit(
         /* lock out background commit */
         down_read(&cil->xc_ctx_lock);
  
-       xlog_cil_insert_items(log, tp);
+       if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE)
+               released_space = xlog_cil_process_intents(cil, tp);
+
+       xlog_cil_insert_items(log, tp, released_space);
  
         if (regrant && !xlog_is_shutdown(log))
                 xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1455,32 +1588,6 @@ out_shutdown:
         return 0;
  }
  
-/*
- * Check if the current log item was first committed in this sequence.
- * We can't rely on just the log item being in the CIL, we have to check
- * the recorded commit sequence number.
- *
- * Note: for this to be used in a non-racy manner, it has to be called with
- * CIL flushing locked out. As a result, it should only be used during the
- * transaction commit process when deciding what to format into the item.
- */
-bool
-xfs_log_item_in_current_chkpt(
-       struct xfs_log_item     *lip)
-{
-       struct xfs_cil          *cil = lip->li_log->l_cilp;
-
-       if (list_empty(&lip->li_cil))
-               return false;
-
-       /*
-        * li_seq is written on the first commit of a log item to record the
-        * first checkpoint it is written to. Hence if it is different to the
-        * current sequence, we're in a new checkpoint.
-        */
-       return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
-}
-
  /*
   * Perform initial CIL structure initialisation.
   */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h

index 401cdc4..67fd978 100644 (file)
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -51,8 +51,8 @@ enum xlog_iclog_state {
  /*
   * In core log flags
   */
-#define XLOG_ICL_NEED_FLUSH    (1 << 0)        /* iclog needs REQ_PREFLUSH */
-#define XLOG_ICL_NEED_FUA      (1 << 1)        /* iclog needs REQ_FUA */
+#define XLOG_ICL_NEED_FLUSH    (1u << 0)       /* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA      (1u << 1)       /* iclog needs REQ_FUA */
  
  #define XLOG_ICL_STRINGS \
         { XLOG_ICL_NEED_FLUSH,  "XLOG_ICL_NEED_FLUSH" }, \
@@ -62,7 +62,7 @@ enum xlog_iclog_state {
  /*
   * Log ticket flags
   */
-#define XLOG_TIC_PERM_RESERV   0x1     /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV   (1u << 0)       /* permanent reservation */
  
  #define XLOG_TIC_FLAGS \
         { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
@@ -142,19 +142,6 @@ enum xlog_iclog_state {
  
  #define XLOG_COVER_OPS         5
  
-/* Ticket reservation region accounting */ 
-#define XLOG_TIC_LEN_MAX       15
-
-/*
- * Reservation region
- * As would be stored in xfs_log_iovec but without the i_addr which
- * we don't care about.
- */
-typedef struct xlog_res {
-       uint    r_len;  /* region length                :4 */
-       uint    r_type; /* region's transaction type    :4 */
-} xlog_res_t;
-
  typedef struct xlog_ticket {
         struct list_head   t_queue;      /* reserve/write queue */
         struct task_struct *t_task;      /* task that owns this ticket */
@@ -164,15 +151,7 @@ typedef struct xlog_ticket {
         int                t_unit_res;   /* unit reservation in bytes    : 4  */
         char               t_ocnt;       /* original count               : 1  */
         char               t_cnt;        /* current count                : 1  */
-       char               t_clientid;   /* who does this belong to;     : 1  */
-       char               t_flags;      /* properties of reservation    : 1  */
-
-        /* reservation array fields */
-       uint               t_res_num;                    /* num in array : 4 */
-       uint               t_res_num_ophdrs;             /* num op hdrs  : 4 */
-       uint               t_res_arr_sum;                /* array sum    : 4 */
-       uint               t_res_o_flow;                 /* sum overflow : 4 */
-       xlog_res_t         t_res_arr[XLOG_TIC_LEN_MAX];  /* array of res : 8 * 15 */ 
+       uint8_t            t_flags;      /* properties of reservation    : 1  */
  } xlog_ticket_t;
  
  /*
@@ -211,7 +190,7 @@ typedef struct xlog_in_core {
         u32                     ic_offset;
         enum xlog_iclog_state   ic_state;
         unsigned int            ic_flags;
-       char                    *ic_datap;      /* pointer to iclog data */
+       void                    *ic_datap;      /* pointer to iclog data */
         struct list_head        ic_callbacks;
  
         /* reference counts need their own cacheline */
@@ -242,7 +221,6 @@ struct xfs_cil_ctx {
         xfs_lsn_t               commit_lsn;     /* chkpt commit record lsn */
         struct xlog_in_core     *commit_iclog;
         struct xlog_ticket      *ticket;        /* chkpt ticket */
-       int                     nvecs;          /* number of regions */
         int                     space_used;     /* aggregate size of regions */
         struct list_head        busy_extents;   /* busy extents in chkpt */
         struct xfs_log_vec      *lv_chain;      /* logvecs being pushed */
@@ -441,10 +419,6 @@ struct xlog {
  
         struct xfs_kobj         l_kobj;
  
-       /* The following field are used for debugging; need to hold icloglock */
-#ifdef DEBUG
-       void                    *l_iclog_bak[XLOG_MAX_ICLOGS];
-#endif
         /* log recovery lsn tracking (for buffer submission */
         xfs_lsn_t               l_recovery_lsn;
  
@@ -509,27 +483,14 @@ extern __le32      xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
                             char *dp, int size);
  
  extern struct kmem_cache *xfs_log_ticket_cache;
-struct xlog_ticket *
-xlog_ticket_alloc(
-       struct xlog     *log,
-       int             unit_bytes,
-       int             count,
-       char            client,
-       bool            permanent);
-
-static inline void
-xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
-{
-       *ptr += bytes;
-       *len -= bytes;
-       *off += bytes;
-}
+struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
+               int count, bool permanent);
  
  void   xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
  void   xlog_print_trans(struct xfs_trans *);
  int    xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
                 struct xfs_log_vec *log_vector, struct xlog_ticket *tic,
-               uint optype);
+               uint32_t len);
  void   xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
  void   xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
  
@@ -690,4 +651,38 @@ xlog_valid_lsn(
         return valid;
  }
  
+/*
+ * Log vector and shadow buffers can be large, so we need to use kvmalloc() here
+ * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
+ * to fall back to vmalloc, so we can't actually do anything useful with gfp
+ * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
+ * will do direct reclaim and compaction in the slow path, both of which are
+ * horrendously expensive. We just want kmalloc to fail fast and fall back to
+ * vmalloc if it can't get something straight away from the free lists or
+ * buddy allocator. Hence we have to open code kvmalloc ourselves here.
+ *
+ * This assumes that the caller uses memalloc_nofs_save task context here, so
+ * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
+ * allocations. This is actually the only way to make vmalloc() do GFP_NOFS
+ * allocations, so let's just all pretend this is a GFP_KERNEL context
+ * operation....
+ */
+static inline void *
+xlog_kvmalloc(
+       size_t          buf_size)
+{
+       gfp_t           flags = GFP_KERNEL;
+       void            *p;
+
+       flags &= ~__GFP_DIRECT_RECLAIM;
+       flags |= __GFP_NOWARN | __GFP_NORETRY;
+       do {
+               p = kmalloc(buf_size, flags);
+               if (!p)
+                       p = vmalloc(buf_size);
+       } while (!p);
+
+       return p;
+}
+
  #endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c

index c4ad429..97b941c 100644 (file)
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1800,6 +1800,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
         &xlog_cud_item_ops,
         &xlog_bui_item_ops,
         &xlog_bud_item_ops,
+       &xlog_attri_item_ops,
+       &xlog_attrd_item_ops,
  };
  
  static const struct xlog_recover_item_ops *
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c

index bc66d95..8f495cc 100644 (file)
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -27,42 +27,34 @@ __xfs_printk(
         printk("%sXFS: %pV\n", level, vaf);
  }
  
-#define define_xfs_printk_level(func, kern_level)              \
-void func(const struct xfs_mount *mp, const char *fmt, ...)    \
-{                                                              \
-       struct va_format        vaf;                            \
-       va_list                 args;                           \
-       int                     level;                          \
-                                                               \
-       va_start(args, fmt);                                    \
-                                                               \
-       vaf.fmt = fmt;                                          \
-       vaf.va = &args;                                         \
-                                                               \
-       __xfs_printk(kern_level, mp, &vaf);                     \
-       va_end(args);                                           \
-                                                               \
-       if (!kstrtoint(kern_level, 0, &level) &&                \
-           level <= LOGLEVEL_ERR &&                            \
-           xfs_error_level >= XFS_ERRLEVEL_HIGH)               \
-               xfs_stack_trace();                              \
-}                                                              \
-
-define_xfs_printk_level(xfs_emerg, KERN_EMERG);
-define_xfs_printk_level(xfs_alert, KERN_ALERT);
-define_xfs_printk_level(xfs_crit, KERN_CRIT);
-define_xfs_printk_level(xfs_err, KERN_ERR);
-define_xfs_printk_level(xfs_warn, KERN_WARNING);
-define_xfs_printk_level(xfs_notice, KERN_NOTICE);
-define_xfs_printk_level(xfs_info, KERN_INFO);
-#ifdef DEBUG
-define_xfs_printk_level(xfs_debug, KERN_DEBUG);
-#endif
+void
+xfs_printk_level(
+       const char *kern_level,
+       const struct xfs_mount *mp,
+       const char *fmt, ...)
+{
+       struct va_format        vaf;
+       va_list                 args;
+       int                     level;
+
+       va_start(args, fmt);
+       vaf.fmt = fmt;
+       vaf.va = &args;
+
+       __xfs_printk(kern_level, mp, &vaf);
+
+       va_end(args);
+
+       if (!kstrtoint(kern_level, 0, &level) &&
+           level <= LOGLEVEL_ERR &&
+           xfs_error_level >= XFS_ERRLEVEL_HIGH)
+               xfs_stack_trace();
+}
  
  void
-xfs_alert_tag(
+_xfs_alert_tag(
         const struct xfs_mount  *mp,
-       int                     panic_tag,
+       uint32_t                panic_tag,
         const char              *fmt, ...)
  {
         struct va_format        vaf;
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h

index bb9860e..55ee464 100644 (file)
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -6,33 +6,46 @@
  
  struct xfs_mount;
  
-extern __printf(2, 3)
-void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
  extern __printf(3, 4)
-void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
+void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp,
+                       const char *fmt, ...);
  
+#define xfs_printk_index_wrap(kern_level, mp, fmt, ...)                \
+({                                                             \
+       printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \
+       xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__);   \
+})
+#define xfs_emerg(mp, fmt, ...) \
+       xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__)
+#define xfs_alert(mp, fmt, ...) \
+       xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__)
+#define xfs_crit(mp, fmt, ...) \
+       xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__)
+#define xfs_err(mp, fmt, ...) \
+       xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__)
+#define xfs_warn(mp, fmt, ...) \
+       xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__)
+#define xfs_notice(mp, fmt, ...) \
+       xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__)
+#define xfs_info(mp, fmt, ...) \
+       xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__)
  #ifdef DEBUG
-extern __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
+#define xfs_debug(mp, fmt, ...) \
+       xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__)
  #else
-static inline __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
-{
-}
+#define xfs_debug(mp, fmt, ...) do {} while (0)
  #endif
  
+#define xfs_alert_tag(mp, tag, fmt, ...)                       \
+({                                                             \
+       printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \
+       _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__);            \
+})
+
+extern __printf(3, 4)
+void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag,
+               const char *fmt, ...);
+
  #define xfs_printk_ratelimited(func, dev, fmt, ...)                    \
  do {                                                                   \
         static DEFINE_RATELIMIT_STATE(_rs,                              \
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c

index c5f153c..0c0bcbd 100644 (file)
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -468,6 +468,8 @@ STATIC int
  xfs_check_summary_counts(
         struct xfs_mount        *mp)
  {
+       int                     error = 0;
+
         /*
          * The AG0 superblock verifier rejects in-progress filesystems,
          * so we should never see the flag set this far into mounting.
@@ -506,11 +508,32 @@ xfs_check_summary_counts(
          * superblock to be correct and we don't need to do anything here.
          * Otherwise, recalculate the summary counters.
          */
-       if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) &&
-           !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
-               return 0;
+       if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
+           xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
+               error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+               if (error)
+                       return error;
+       }
  
-       return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+       /*
+        * Older kernels misused sb_frextents to reflect both incore
+        * reservations made by running transactions and the actual count of
+        * free rt extents in the ondisk metadata.  Transactions committed
+        * during runtime can therefore contain a superblock update that
+        * undercounts the number of free rt extents tracked in the rt bitmap.
+        * A clean unmount record will have the correct frextents value since
+        * there can be no other transactions running at that point.
+        *
+        * If we're mounting the rt volume after recovering the log, recompute
+        * frextents from the rtbitmap file to fix the inconsistency.
+        */
+       if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+               error = xfs_rtalloc_reinit_frextents(mp);
+               if (error)
+                       return error;
+       }
+
+       return 0;
  }
  
  /*
@@ -784,11 +807,6 @@ xfs_mountfs(
                 goto out_inodegc_shrinker;
         }
  
-       /* Make sure the summary counts are ok. */
-       error = xfs_check_summary_counts(mp);
-       if (error)
-               goto out_log_dealloc;
-
         /* Enable background inode inactivation workers. */
         xfs_inodegc_start(mp);
         xfs_blockgc_start(mp);
@@ -844,6 +862,11 @@ xfs_mountfs(
                 goto out_rele_rip;
         }
  
+       /* Make sure the summary counts are ok. */
+       error = xfs_check_summary_counts(mp);
+       if (error)
+               goto out_rtunmount;
+
         /*
          * If this is a read-only mount defer the superblock updates until
          * the next remount into writeable mode.  Otherwise we would never
@@ -1087,24 +1110,33 @@ xfs_fs_writable(
         return true;
  }
  
+/* Adjust m_fdblocks or m_frextents. */
  int
-xfs_mod_fdblocks(
+xfs_mod_freecounter(
         struct xfs_mount        *mp,
+       struct percpu_counter   *counter,
         int64_t                 delta,
         bool                    rsvd)
  {
         int64_t                 lcounter;
         long long               res_used;
+       uint64_t                set_aside = 0;
         s32                     batch;
-       uint64_t                set_aside;
+       bool                    has_resv_pool;
+
+       ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
+       has_resv_pool = (counter == &mp->m_fdblocks);
+       if (rsvd)
+               ASSERT(has_resv_pool);
  
         if (delta > 0) {
                 /*
                  * If the reserve pool is depleted, put blocks back into it
                  * first. Most of the time the pool is full.
                  */
-               if (likely(mp->m_resblks == mp->m_resblks_avail)) {
-                       percpu_counter_add(&mp->m_fdblocks, delta);
+               if (likely(!has_resv_pool ||
+                          mp->m_resblks == mp->m_resblks_avail)) {
+                       percpu_counter_add(counter, delta);
                         return 0;
                 }
  
@@ -1116,7 +1148,7 @@ xfs_mod_fdblocks(
                 } else {
                         delta -= res_used;
                         mp->m_resblks_avail = mp->m_resblks;
-                       percpu_counter_add(&mp->m_fdblocks, delta);
+                       percpu_counter_add(counter, delta);
                 }
                 spin_unlock(&mp->m_sb_lock);
                 return 0;
@@ -1130,7 +1162,7 @@ xfs_mod_fdblocks(
          * then make everything serialise as we are real close to
          * ENOSPC.
          */
-       if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
+       if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
                                      XFS_FDBLOCKS_BATCH) < 0)
                 batch = 1;
         else
@@ -1147,9 +1179,10 @@ xfs_mod_fdblocks(
          * problems (i.e. transaction abort, pagecache discards, etc.) than
          * slightly premature -ENOSPC.
          */
-       set_aside = xfs_fdblocks_unavailable(mp);
-       percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
-       if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
+       if (has_resv_pool)
+               set_aside = xfs_fdblocks_unavailable(mp);
+       percpu_counter_add_batch(counter, delta, batch);
+       if (__percpu_counter_compare(counter, set_aside,
                                      XFS_FDBLOCKS_BATCH) >= 0) {
                 /* we had space! */
                 return 0;
@@ -1160,8 +1193,8 @@ xfs_mod_fdblocks(
          * that took us to ENOSPC.
          */
         spin_lock(&mp->m_sb_lock);
-       percpu_counter_add(&mp->m_fdblocks, -delta);
-       if (!rsvd)
+       percpu_counter_add(counter, -delta);
+       if (!has_resv_pool || !rsvd)
                 goto fdblocks_enospc;
  
         lcounter = (long long)mp->m_resblks_avail + delta;
@@ -1178,24 +1211,6 @@ fdblocks_enospc:
         return -ENOSPC;
  }
  
-int
-xfs_mod_frextents(
-       struct xfs_mount        *mp,
-       int64_t                 delta)
-{
-       int64_t                 lcounter;
-       int                     ret = 0;
-
-       spin_lock(&mp->m_sb_lock);
-       lcounter = mp->m_sb.sb_frextents + delta;
-       if (lcounter < 0)
-               ret = -ENOSPC;
-       else
-               mp->m_sb.sb_frextents = lcounter;
-       spin_unlock(&mp->m_sb_lock);
-       return ret;
-}
-
  /*
   * Used to free the superblock along various error paths.
   */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h

index f6dc19d..8c42786 100644 (file)
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -183,6 +183,8 @@ typedef struct xfs_mount {
         struct percpu_counter   m_icount;       /* allocated inodes counter */
         struct percpu_counter   m_ifree;        /* free inodes counter */
         struct percpu_counter   m_fdblocks;     /* free block counter */
+       struct percpu_counter   m_frextents;    /* free rt extent counter */
+
         /*
          * Count of data device blocks reserved for delayed allocations,
          * including indlen blocks.  Does not include allocated CoW staging
@@ -276,6 +278,7 @@ typedef struct xfs_mount {
  #define XFS_FEAT_INOBTCNT      (1ULL << 23)    /* inobt block counts */
  #define XFS_FEAT_BIGTIME       (1ULL << 24)    /* large timestamps */
  #define XFS_FEAT_NEEDSREPAIR   (1ULL << 25)    /* needs xfs_repair */
+#define XFS_FEAT_NREXT64       (1ULL << 26)    /* large extent counters */
  
  /* Mount features */
  #define XFS_FEAT_NOATTR2       (1ULL << 48)    /* disable attr2 creation */
@@ -338,6 +341,7 @@ __XFS_HAS_FEAT(realtime, REALTIME)
  __XFS_HAS_FEAT(inobtcounts, INOBTCNT)
  __XFS_HAS_FEAT(bigtime, BIGTIME)
  __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
+__XFS_HAS_FEAT(large_extent_counts, NREXT64)
  
  /*
   * Mount features
@@ -425,16 +429,15 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
  #define XFS_MAX_IO_LOG         30      /* 1G */
  #define XFS_MIN_IO_LOG         PAGE_SHIFT
  
-#define xfs_is_shutdown(mp)            xfs_is_shutdown(mp)
-void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
+void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname,
                 int lnnum);
  #define xfs_force_shutdown(m,f)        \
         xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
  
-#define SHUTDOWN_META_IO_ERROR 0x0001  /* write attempt to metadata failed */
-#define SHUTDOWN_LOG_IO_ERROR  0x0002  /* write attempt to the log failed */
-#define SHUTDOWN_FORCE_UMOUNT  0x0004  /* shutdown from a forced unmount */
-#define SHUTDOWN_CORRUPT_INCORE        0x0008  /* corrupt in-memory data structures */
+#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR  (1u << 1) /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT  (1u << 2) /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE        (1u << 3) /* corrupt in-memory structures */
  
  #define XFS_SHUTDOWN_STRINGS \
         { SHUTDOWN_META_IO_ERROR,       "metadata_io" }, \
@@ -494,9 +497,20 @@ xfs_fdblocks_unavailable(
         return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
  }
  
-extern int     xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
-                                bool reserved);
-extern int     xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+               int64_t delta, bool rsvd);
+
+/*
+ * Apply @delta to the free block counter (m_fdblocks) through the generic
+ * xfs_mod_freecounter() helper, forwarding @reserved as its rsvd argument.
+ * Returns whatever the helper returns.
+ */
+static inline int
+xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+{
+       struct percpu_counter   *counter = &mp->m_fdblocks;
+
+       return xfs_mod_freecounter(mp, counter, delta, reserved);
+}
+
+/*
+ * Apply @delta to the free rt extent counter (m_frextents) through the
+ * generic xfs_mod_freecounter() helper.  The rsvd argument is always false.
+ * Returns whatever the helper returns.
+ */
+static inline int
+xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+{
+       struct percpu_counter   *counter = &mp->m_frextents;
+
+       return xfs_mod_freecounter(mp, counter, delta, false);
+}
  
  extern int     xfs_readsb(xfs_mount_t *, int);
  extern void    xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h

index 2599192..758702b 100644 (file)
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -132,6 +132,8 @@ xfs_check_ondisk_structs(void)
         XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format,      56);
         XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat,        20);
         XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header,          16);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format,      40);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format,      16);
  
         /*
          * The v5 superblock format extended several v4 header structures with
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c

index f165d1a..8fc813c 100644 (file)
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -582,9 +582,6 @@ xfs_qm_init_timelimits(
         defq->blk.time = XFS_QM_BTIMELIMIT;
         defq->ino.time = XFS_QM_ITIMELIMIT;
         defq->rtb.time = XFS_QM_RTBTIMELIMIT;
-       defq->blk.warn = XFS_QM_BWARNLIMIT;
-       defq->ino.warn = XFS_QM_IWARNLIMIT;
-       defq->rtb.warn = XFS_QM_RTBWARNLIMIT;
  
         /*
          * We try to get the limits from the superuser's limits fields.
@@ -608,12 +605,6 @@ xfs_qm_init_timelimits(
                 defq->ino.time = dqp->q_ino.timer;
         if (dqp->q_rtb.timer)
                 defq->rtb.time = dqp->q_rtb.timer;
-       if (dqp->q_blk.warnings)
-               defq->blk.warn = dqp->q_blk.warnings;
-       if (dqp->q_ino.warnings)
-               defq->ino.warn = dqp->q_ino.warnings;
-       if (dqp->q_rtb.warnings)
-               defq->rtb.warn = dqp->q_rtb.warnings;
  
         xfs_qm_dqdestroy(dqp);
  }
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h

index 5bb1271..9683f04 100644 (file)
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -34,7 +34,6 @@ struct xfs_quota_limits {
         xfs_qcnt_t              hard;   /* default hard limit */
         xfs_qcnt_t              soft;   /* default soft limit */
         time64_t                time;   /* limit for timers */
-       xfs_qwarncnt_t          warn;   /* limit for warnings */
  };
  
  /* Defaults for each quota type: time limits, warn limits, usage limits */
@@ -134,10 +133,6 @@ struct xfs_dquot_acct {
  #define XFS_QM_RTBTIMELIMIT    (7 * 24*60*60)          /* 1 week */
  #define XFS_QM_ITIMELIMIT      (7 * 24*60*60)          /* 1 week */
  
-#define XFS_QM_BWARNLIMIT      5
-#define XFS_QM_IWARNLIMIT      5
-#define XFS_QM_RTBWARNLIMIT    5
-
  extern void            xfs_qm_destroy_quotainfo(struct xfs_mount *);
  
  /* quota ops */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c

index 7d5a318..74ac9ca 100644 (file)
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -217,8 +217,7 @@ xfs_qm_scall_quotaon(
         return 0;
  }
  
-#define XFS_QC_MASK \
-       (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK)
  
  /*
   * Adjust limits of this quota, and the defaults if passed in.  Returns true
@@ -250,17 +249,6 @@ xfs_setqlim_limits(
         return true;
  }
  
-static inline void
-xfs_setqlim_warns(
-       struct xfs_dquot_res    *res,
-       struct xfs_quota_limits *qlim,
-       int                     warns)
-{
-       res->warnings = warns;
-       if (qlim)
-               qlim->warn = warns;
-}
-
  static inline void
  xfs_setqlim_timer(
         struct xfs_mount        *mp,
@@ -354,8 +342,6 @@ xfs_qm_scall_setqlim(
  
         if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
                 xfs_dquot_set_prealloc_limits(dqp);
-       if (newlim->d_fieldmask & QC_SPC_WARNS)
-               xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
         if (newlim->d_fieldmask & QC_SPC_TIMER)
                 xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
  
@@ -370,8 +356,6 @@ xfs_qm_scall_setqlim(
         qlim = id == 0 ? &defq->rtb : NULL;
  
         xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
-       if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
-               xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
         if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
                 xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
  
@@ -386,8 +370,6 @@ xfs_qm_scall_setqlim(
         qlim = id == 0 ? &defq->ino : NULL;
  
         xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
-       if (newlim->d_fieldmask & QC_INO_WARNS)
-               xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
         if (newlim->d_fieldmask & QC_INO_TIMER)
                 xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
  
@@ -428,13 +410,13 @@ xfs_qm_scall_getquota_fill_qc(
         dst->d_ino_count = dqp->q_ino.reserved;
         dst->d_spc_timer = dqp->q_blk.timer;
         dst->d_ino_timer = dqp->q_ino.timer;
-       dst->d_ino_warns = dqp->q_ino.warnings;
-       dst->d_spc_warns = dqp->q_blk.warnings;
+       dst->d_ino_warns = 0;
+       dst->d_spc_warns = 0;
         dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
         dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
         dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
         dst->d_rt_spc_timer = dqp->q_rtb.timer;
-       dst->d_rt_spc_warns = dqp->q_rtb.warnings;
+       dst->d_rt_spc_warns = 0;
  
         /*
          * Internally, we don't reset all the timers when quota enforcement
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c

index 07989bd..9c162e6 100644 (file)
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -40,9 +40,9 @@ xfs_qm_fill_state(
         tstate->spc_timelimit = (u32)defq->blk.time;
         tstate->ino_timelimit = (u32)defq->ino.time;
         tstate->rt_spc_timelimit = (u32)defq->rtb.time;
-       tstate->spc_warnlimit = defq->blk.warn;
-       tstate->ino_warnlimit = defq->ino.warn;
-       tstate->rt_spc_warnlimit = defq->rtb.warn;
+       tstate->spc_warnlimit = 0;
+       tstate->ino_warnlimit = 0;
+       tstate->rt_spc_warnlimit = 0;
         if (tempqip)
                 xfs_irele(ip);
  }
@@ -98,7 +98,7 @@ xfs_quota_type(int type)
         }
  }
  
-#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK)
  
  /*
   * Adjust quota timers & warnings
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c

index 0d868c9..7e97bf1 100644 (file)
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
  xfs_cui_item_free(
         struct xfs_cui_log_item *cuip)
  {
+       kmem_free(cuip->cui_item.li_lv_shadow);
         if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
                 kmem_free(cuip);
         else
@@ -53,10 +54,11 @@ xfs_cui_release(
         struct xfs_cui_log_item *cuip)
  {
         ASSERT(atomic_read(&cuip->cui_refcount) > 0);
-       if (atomic_dec_and_test(&cuip->cui_refcount)) {
-               xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
-               xfs_cui_item_free(cuip);
-       }
+       if (!atomic_dec_and_test(&cuip->cui_refcount))
+               return;
+
+       xfs_trans_ail_delete(&cuip->cui_item, 0);
+       xfs_cui_item_free(cuip);
  }
  
  
@@ -204,14 +206,24 @@ xfs_cud_item_release(
         struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
  
         xfs_cui_release(cudp->cud_cuip);
+       kmem_free(cudp->cud_item.li_lv_shadow);
         kmem_cache_free(xfs_cud_cache, cudp);
  }
  
+/*
+ * ->iop_intent handler for CUD items: return the log item embedded in the
+ * CUI (cud_cuip) that this CUD points to.
+ */
+static struct xfs_log_item *
+xfs_cud_item_intent(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
+
+       return &cudp->cud_cuip->cui_item;
+}
+
  static const struct xfs_item_ops xfs_cud_item_ops = {
-       .flags          = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+       .flags          = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+                         XFS_ITEM_INTENT_DONE,
         .iop_size       = xfs_cud_item_size,
         .iop_format     = xfs_cud_item_format,
         .iop_release    = xfs_cud_item_release,
+       .iop_intent     = xfs_cud_item_intent,
  };
  
  static struct xfs_cud_log_item *
@@ -259,7 +271,7 @@ xfs_trans_log_finish_refcount_update(
          * 1.) releases the CUI and frees the CUD
          * 2.) shuts down the filesystem
          */
-       tp->t_flags |= XFS_TRANS_DIRTY;
+       tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
         set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
  
         return error;
@@ -600,6 +612,7 @@ xfs_cui_item_relog(
  }
  
  static const struct xfs_item_ops xfs_cui_item_ops = {
+       .flags          = XFS_ITEM_INTENT,
         .iop_size       = xfs_cui_item_size,
         .iop_format     = xfs_cui_item_format,
         .iop_unpin      = xfs_cui_item_unpin,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c

index 54e68e5..e7a7c00 100644 (file)
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -586,21 +586,21 @@ out:
  STATIC int
  xfs_reflink_end_cow_extent(
         struct xfs_inode        *ip,
-       xfs_fileoff_t           offset_fsb,
-       xfs_fileoff_t           *end_fsb)
+       xfs_fileoff_t           *offset_fsb,
+       xfs_fileoff_t           end_fsb)
  {
-       struct xfs_bmbt_irec    got, del;
         struct xfs_iext_cursor  icur;
+       struct xfs_bmbt_irec    got, del, data;
         struct xfs_mount        *mp = ip->i_mount;
         struct xfs_trans        *tp;
         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-       xfs_filblks_t           rlen;
         unsigned int            resblks;
+       int                     nmaps;
         int                     error;
  
         /* No COW extents?  That's easy! */
         if (ifp->if_bytes == 0) {
-               *end_fsb = offset_fsb;
+               *offset_fsb = end_fsb;
                 return 0;
         }
  
@@ -620,6 +620,9 @@ xfs_reflink_end_cow_extent(
  
         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
                         XFS_IEXT_REFLINK_END_COW_CNT);
+       if (error == -EFBIG)
+               error = xfs_iext_count_upgrade(tp, ip,
+                               XFS_IEXT_REFLINK_END_COW_CNT);
         if (error)
                 goto out_cancel;
  
@@ -628,42 +631,66 @@ xfs_reflink_end_cow_extent(
          * left by the time I/O completes for the loser of the race.  In that
          * case we are done.
          */
-       if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
-           got.br_startoff + got.br_blockcount <= offset_fsb) {
-               *end_fsb = offset_fsb;
+       if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
+           got.br_startoff >= end_fsb) {
+               *offset_fsb = end_fsb;
                 goto out_cancel;
         }
  
-       /*
-        * Structure copy @got into @del, then trim @del to the range that we
-        * were asked to remap.  We preserve @got for the eventual CoW fork
-        * deletion; from now on @del represents the mapping that we're
-        * actually remapping.
-        */
-       del = got;
-       xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
-
-       ASSERT(del.br_blockcount > 0);
-
         /*
          * Only remap real extents that contain data.  With AIO, speculative
          * preallocations can leak into the range we are called upon, and we
-        * need to skip them.
+        * need to skip them.  Preserve @got for the eventual CoW fork
+        * deletion; from now on @del represents the mapping that we're
+        * actually remapping.
          */
-       if (!xfs_bmap_is_written_extent(&got)) {
-               *end_fsb = del.br_startoff;
-               goto out_cancel;
+       while (!xfs_bmap_is_written_extent(&got)) {
+               if (!xfs_iext_next_extent(ifp, &icur, &got) ||
+                   got.br_startoff >= end_fsb) {
+                       *offset_fsb = end_fsb;
+                       goto out_cancel;
+               }
         }
+       del = got;
  
-       /* Unmap the old blocks in the data fork. */
-       rlen = del.br_blockcount;
-       error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+       /* Grab the corresponding mapping in the data fork. */
+       nmaps = 1;
+       error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
+                       &nmaps, 0);
         if (error)
                 goto out_cancel;
  
-       /* Trim the extent to whatever got unmapped. */
-       xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
-       trace_xfs_reflink_cow_remap(ip, &del);
+       /* We can only remap the smaller of the two extent sizes. */
+       data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
+       del.br_blockcount = data.br_blockcount;
+
+       trace_xfs_reflink_cow_remap_from(ip, &del);
+       trace_xfs_reflink_cow_remap_to(ip, &data);
+
+       if (xfs_bmap_is_real_extent(&data)) {
+               /*
+                * If the extent we're remapping is backed by storage (written
+                * or not), unmap the extent and drop its refcount.
+                */
+               xfs_bmap_unmap_extent(tp, ip, &data);
+               xfs_refcount_decrease_extent(tp, &data);
+               xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+                               -data.br_blockcount);
+       } else if (data.br_startblock == DELAYSTARTBLOCK) {
+               int             done;
+
+               /*
+                * If the extent we're remapping is a delalloc reservation,
+                * we can use the regular bunmapi function to release the
+                * incore state.  Dropping the delalloc reservation takes care
+                * of the quota reservation for us.
+                */
+               error = xfs_bunmapi(NULL, ip, data.br_startoff,
+                               data.br_blockcount, 0, 1, &done);
+               if (error)
+                       goto out_cancel;
+               ASSERT(done);
+       }
  
         /* Free the CoW orphan record. */
         xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
@@ -684,7 +711,7 @@ xfs_reflink_end_cow_extent(
                 return error;
  
         /* Update the caller about how much progress we made. */
-       *end_fsb = del.br_startoff;
+       *offset_fsb = del.br_startoff + del.br_blockcount;
         return 0;
  
  out_cancel:
@@ -712,7 +739,7 @@ xfs_reflink_end_cow(
         end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
  
         /*
-        * Walk backwards until we're out of the I/O range.  The loop function
+        * Walk forwards until we've remapped the I/O range.  The loop function
          * repeatedly cycles the ILOCK to allocate one transaction per remapped
          * extent.
          *
@@ -744,7 +771,7 @@ xfs_reflink_end_cow(
          * blocks will be remapped.
          */
         while (end_fsb > offset_fsb && !error)
-               error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+               error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
  
         if (error)
                 trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
@@ -1121,6 +1148,8 @@ xfs_reflink_remap_extent(
                 ++iext_delta;
  
         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+       if (error == -EFBIG)
+               error = xfs_iext_count_upgrade(tp, ip, iext_delta);
         if (error)
                 goto out_cancel;
  
@@ -1133,7 +1162,7 @@ xfs_reflink_remap_extent(
                 xfs_refcount_decrease_extent(tp, &smap);
                 qdelta -= smap.br_blockcount;
         } else if (smap.br_startblock == DELAYSTARTBLOCK) {
-               xfs_filblks_t   len = smap.br_blockcount;
+               int             done;
  
                 /*
                  * If the extent we're unmapping is a delalloc reservation,
@@ -1141,10 +1170,11 @@ xfs_reflink_remap_extent(
                  * incore state.  Dropping the delalloc reservation takes care
                  * of the quota reservation for us.
                  */
-               error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+               error = xfs_bunmapi(NULL, ip, smap.br_startoff,
+                               smap.br_blockcount, 0, 1, &done);
                 if (error)
                         goto out_cancel;
-               ASSERT(len == 0);
+               ASSERT(done);
         }
  
         /*
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c

index a22b2d1..fef92e0 100644 (file)
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
  xfs_rui_item_free(
         struct xfs_rui_log_item *ruip)
  {
+       kmem_free(ruip->rui_item.li_lv_shadow);
         if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
                 kmem_free(ruip);
         else
@@ -53,10 +54,11 @@ xfs_rui_release(
         struct xfs_rui_log_item *ruip)
  {
         ASSERT(atomic_read(&ruip->rui_refcount) > 0);
-       if (atomic_dec_and_test(&ruip->rui_refcount)) {
-               xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
-               xfs_rui_item_free(ruip);
-       }
+       if (!atomic_dec_and_test(&ruip->rui_refcount))
+               return;
+
+       xfs_trans_ail_delete(&ruip->rui_item, 0);
+       xfs_rui_item_free(ruip);
  }
  
  STATIC void
@@ -227,14 +229,24 @@ xfs_rud_item_release(
         struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
  
         xfs_rui_release(rudp->rud_ruip);
+       kmem_free(rudp->rud_item.li_lv_shadow);
         kmem_cache_free(xfs_rud_cache, rudp);
  }
  
+/*
+ * ->iop_intent handler for RUD items: return the log item embedded in the
+ * RUI (rud_ruip) that this RUD points to.
+ */
+static struct xfs_log_item *
+xfs_rud_item_intent(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+
+       return &rudp->rud_ruip->rui_item;
+}
+
  static const struct xfs_item_ops xfs_rud_item_ops = {
-       .flags          = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+       .flags          = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+                         XFS_ITEM_INTENT_DONE,
         .iop_size       = xfs_rud_item_size,
         .iop_format     = xfs_rud_item_format,
         .iop_release    = xfs_rud_item_release,
+       .iop_intent     = xfs_rud_item_intent,
  };
  
  static struct xfs_rud_log_item *
@@ -327,7 +339,7 @@ xfs_trans_log_finish_rmap_update(
          * 1.) releases the RUI and frees the RUD
          * 2.) shuts down the filesystem
          */
-       tp->t_flags |= XFS_TRANS_DIRTY;
+       tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
         set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
  
         return error;
@@ -630,6 +642,7 @@ xfs_rui_item_relog(
  }
  
  static const struct xfs_item_ops xfs_rui_item_ops = {
+       .flags          = XFS_ITEM_INTENT,
         .iop_size       = xfs_rui_item_size,
         .iop_format     = xfs_rui_item_format,
         .iop_unpin      = xfs_rui_item_unpin,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c

index b8c79ee..292d5e5 100644 (file)
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -806,6 +806,9 @@ xfs_growfs_rt_alloc(
  
                 error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
                                 XFS_IEXT_ADD_NOSPLIT_CNT);
+               if (error == -EFBIG)
+                       error = xfs_iext_count_upgrade(tp, ip,
+                                       XFS_IEXT_ADD_NOSPLIT_CNT);
                 if (error)
                         goto out_trans_cancel;
  
@@ -1284,6 +1287,44 @@ xfs_rtmount_init(
         return 0;
  }
  
+/*
+ * xfs_rtalloc_query_range_fn callback: add the extent count of @rec to the
+ * uint64_t running total that @priv points to.  @mp and @tp are unused.
+ * Always returns 0.
+ */
+static int
+xfs_rtalloc_count_frextent(
+       struct xfs_mount                *mp,
+       struct xfs_trans                *tp,
+       const struct xfs_rtalloc_rec    *rec,
+       void                            *priv)
+{
+       uint64_t                        *total = priv;
+
+       *total = *total + rec->ar_extcount;
+       return 0;
+}
+
+/*
+ * Reinitialize the number of free realtime extents from the realtime bitmap.
+ * Callers must ensure that there is no other activity in the filesystem.
+ *
+ * Runs xfs_rtalloc_query_all() with xfs_rtalloc_count_frextent() as the
+ * callback while holding the ILOCK of m_rbmip exclusively, accumulating the
+ * extent counts into a local total.  On success the total is stored in the
+ * incore superblock (under m_sb_lock) and in the m_frextents percpu counter
+ * and 0 is returned.  If the query fails, its error is returned and neither
+ * counter is modified.
+ */
+int
+xfs_rtalloc_reinit_frextents(
+       struct xfs_mount        *mp)
+{
+       uint64_t                val = 0;
+       int                     error;
+
+       xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+       error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
+                       &val);
+       xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+       if (error)
+               return error;
+
+       spin_lock(&mp->m_sb_lock);
+       mp->m_sb.sb_frextents = val;
+       spin_unlock(&mp->m_sb_lock);
+
+       /*
+        * Seed the percpu counter from the total we just computed rather than
+        * re-reading sb_frextents after m_sb_lock has been dropped.
+        */
+       percpu_counter_set(&mp->m_frextents, val);
+       return 0;
+}
+
  /*
   * Get the bitmap and summary inodes and the summary cache into the mount
   * structure at mount time.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h

index 91b0028..62c7ad7 100644 (file)
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -22,6 +22,7 @@ struct xfs_rtalloc_rec {
  };
  
  typedef int (*xfs_rtalloc_query_range_fn)(
+       struct xfs_mount                *mp,
         struct xfs_trans                *tp,
         const struct xfs_rtalloc_rec    *rec,
         void                            *priv);
@@ -123,27 +124,29 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
  int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
                      xfs_rtblock_t start, xfs_extlen_t len,
                      struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_trans *tp,
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
                 const struct xfs_rtalloc_rec *low_rec,
                 const struct xfs_rtalloc_rec *high_rec,
                 xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_trans *tp,
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
                           xfs_rtalloc_query_range_fn fn,
                           void *priv);
  bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
  int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
                                xfs_rtblock_t start, xfs_extlen_t len,
                                bool *is_free);
+int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
  #else
  # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb)    (ENOSYS)
  # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
  # define xfs_rtpick_extent(m,t,l,rb)                    (ENOSYS)
  # define xfs_growfs_rt(mp,in)                           (ENOSYS)
  # define xfs_rtalloc_query_range(t,l,h,f,p)             (ENOSYS)
-# define xfs_rtalloc_query_all(t,f,p)                   (ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p)                 (ENOSYS)
  # define xfs_rtbuf_get(m,t,b,i,p)                       (ENOSYS)
  # define xfs_verify_rtbno(m, r)                        (false)
  # define xfs_rtalloc_extent_is_free(m,t,s,l,i)          (ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m)                (0)
  static inline int              /* error */
  xfs_rtmount_init(
         xfs_mount_t     *mp)    /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c

index a276b81..8495ef0 100644 (file)
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -843,9 +843,11 @@ xfs_fs_statfs(
  
         if (XFS_IS_REALTIME_MOUNT(mp) &&
             (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
+               s64     freertx;
+
                 statp->f_blocks = sbp->sb_rblocks;
-               statp->f_bavail = statp->f_bfree =
-                       sbp->sb_frextents * sbp->sb_rextsize;
+               freertx = percpu_counter_sum_positive(&mp->m_frextents);
+               statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
         }
  
         return 0;
@@ -1015,8 +1017,14 @@ xfs_init_percpu_counters(
         if (error)
                 goto free_fdblocks;
  
+       error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+       if (error)
+               goto free_delalloc;
+
         return 0;
  
+free_delalloc:
+       percpu_counter_destroy(&mp->m_delalloc_blks);
  free_fdblocks:
         percpu_counter_destroy(&mp->m_fdblocks);
  free_ifree:
@@ -1033,6 +1041,7 @@ xfs_reinit_percpu_counters(
         percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
         percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
         percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+       percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
  }
  
  static void
@@ -1045,6 +1054,7 @@ xfs_destroy_percpu_counters(
         ASSERT(xfs_is_shutdown(mp) ||
                percpu_counter_sum(&mp->m_delalloc_blks) == 0);
         percpu_counter_destroy(&mp->m_delalloc_blks);
+       percpu_counter_destroy(&mp->m_frextents);
  }
  
  static int
@@ -1635,6 +1645,10 @@ xfs_fs_fill_super(
                 goto out_filestream_unmount;
         }
  
+       if (xfs_has_large_extent_counts(mp))
+               xfs_warn(mp,
+       "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!");
+
         error = xfs_mountfs(mp);
         if (error)
                 goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c

index affbedf..4145ba8 100644 (file)
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -226,11 +226,6 @@ xfs_symlink(
                 goto out_trans_cancel;
         }
  
-       error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
-                       XFS_IEXT_DIR_MANIP_CNT(mp));
-       if (error)
-               goto out_trans_cancel;
-
         /*
          * Allocate an inode for the symlink.
          */
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h

index 7692e76..f78ad6b 100644 (file)
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -83,6 +83,7 @@ extern xfs_param_t    xfs_params;
  struct xfs_globals {
  #ifdef DEBUG
         int     pwork_threads;          /* parallel workqueue threads */
+       bool    larp;                   /* log attribute replay */
  #endif
         int     log_recovery_delay;     /* log recovery delay (secs) */
         int     mount_delay;            /* mount setup delay (secs) */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c

index 574b80c..f7faf6e 100644 (file)
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -228,6 +228,29 @@ pwork_threads_show(
         return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
  }
  XFS_SYSFS_ATTR_RW(pwork_threads);
+
+/*
+ * sysfs store handler for the "larp" attribute: parse @buf as a boolean with
+ * kstrtobool() directly into xfs_globals.larp.  Returns @count on success,
+ * or the negative value returned by kstrtobool().
+ */
+static ssize_t
+larp_store(
+       struct kobject  *kobject,
+       const char      *buf,
+       size_t          count)
+{
+       ssize_t         error = kstrtobool(buf, &xfs_globals.larp);
+
+       if (error >= 0)
+               return count;
+       return error;
+}
+
+/*
+ * sysfs show handler for the "larp" attribute: emit xfs_globals.larp as a
+ * decimal integer followed by a newline.  sysfs_emit() is used, matching
+ * pwork_threads_show(), so the output is bounded by the sysfs buffer
+ * without open-coding PAGE_SIZE here.
+ */
+STATIC ssize_t
+larp_show(
+       struct kobject  *kobject,
+       char            *buf)
+{
+       return sysfs_emit(buf, "%d\n", xfs_globals.larp);
+}
+XFS_SYSFS_ATTR_RW(larp);
  #endif /* DEBUG */
  
  static struct attribute *xfs_dbg_attrs[] = {
@@ -237,6 +260,7 @@ static struct attribute *xfs_dbg_attrs[] = {
         ATTR_LIST(always_cow),
  #ifdef DEBUG
         ATTR_LIST(pwork_threads),
+       ATTR_LIST(larp),
  #endif
         NULL,
  };
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index b141ef7..d320265 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -418,6 +418,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
                 __field(unsigned, lockval)
                 __field(unsigned, flags)
                 __field(unsigned long, caller_ip)
+               __field(const void *, buf_ops)
         ),
         TP_fast_assign(
                 __entry->dev = bp->b_target->bt_dev;
@@ -428,9 +429,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
                 __entry->lockval = bp->b_sema.count;
                 __entry->flags = bp->b_flags;
                 __entry->caller_ip = caller_ip;
+               __entry->buf_ops = bp->b_ops;
         ),
         TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
-                 "lock %d flags %s caller %pS",
+                 "lock %d flags %s bufops %pS caller %pS",
                   MAJOR(__entry->dev), MINOR(__entry->dev),
                   (unsigned long long)__entry->bno,
                   __entry->nblks,
@@ -438,6 +440,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
                   __entry->pincount,
                   __entry->lockval,
                   __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+                 __entry->buf_ops,
                   (void *)__entry->caller_ip)
  )
  
@@ -1096,22 +1099,6 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done);
  DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
  DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
  
-#define XFS_QMOPT_FLAGS \
-       { XFS_QMOPT_UQUOTA,             "UQUOTA" }, \
-       { XFS_QMOPT_PQUOTA,             "PQUOTA" }, \
-       { XFS_QMOPT_FORCE_RES,          "FORCE_RES" }, \
-       { XFS_QMOPT_SBVERSION,          "SBVERSION" }, \
-       { XFS_QMOPT_GQUOTA,             "GQUOTA" }, \
-       { XFS_QMOPT_INHERIT,            "INHERIT" }, \
-       { XFS_QMOPT_RES_REGBLKS,        "RES_REGBLKS" }, \
-       { XFS_QMOPT_RES_RTBLKS,         "RES_RTBLKS" }, \
-       { XFS_QMOPT_BCOUNT,             "BCOUNT" }, \
-       { XFS_QMOPT_ICOUNT,             "ICOUNT" }, \
-       { XFS_QMOPT_RTBCOUNT,           "RTBCOUNT" }, \
-       { XFS_QMOPT_DELBCOUNT,          "DELBCOUNT" }, \
-       { XFS_QMOPT_DELRTBCOUNT,        "DELRTBCOUNT" }, \
-       { XFS_QMOPT_RES_INOS,           "RES_INOS" }
-
  TRACE_EVENT(xfs_trans_mod_dquot,
         TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
                  unsigned int field, int64_t delta),
@@ -1348,6 +1335,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
  DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
  DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
  DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
  
  DECLARE_EVENT_CLASS(xfs_ail_class,
         TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
@@ -1924,7 +1914,7 @@ DECLARE_EVENT_CLASS(xfs_da_class,
                 __field(int, namelen)
                 __field(xfs_dahash_t, hashval)
                 __field(xfs_ino_t, inumber)
-               __field(int, op_flags)
+               __field(uint32_t, op_flags)
         ),
         TP_fast_assign(
                 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -1990,7 +1980,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
                 __field(xfs_dahash_t, hashval)
                 __field(unsigned int, attr_filter)
                 __field(unsigned int, attr_flags)
-               __field(int, op_flags)
+               __field(uint32_t, op_flags)
         ),
         TP_fast_assign(
                 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -2097,7 +2087,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class,
         TP_STRUCT__entry(
                 __field(dev_t, dev)
                 __field(xfs_ino_t, ino)
-               __field(int, op_flags)
+               __field(uint32_t, op_flags)
                 __field(int, idx)
         ),
         TP_fast_assign(
@@ -2128,7 +2118,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
         TP_STRUCT__entry(
                 __field(dev_t, dev)
                 __field(xfs_ino_t, ino)
-               __field(int, op_flags)
+               __field(uint32_t, op_flags)
                 __field(int, src_idx)
                 __field(int, dst_idx)
                 __field(int, count)
@@ -2169,7 +2159,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
                 __field(int, which)
                 __field(xfs_ino_t, ino)
                 __field(int, format)
-               __field(int, nex)
+               __field(xfs_extnum_t, nex)
                 __field(int, broot_size)
                 __field(int, fork_off)
         ),
@@ -2182,7 +2172,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
                 __entry->broot_size = ip->i_df.if_broot_bytes;
                 __entry->fork_off = XFS_IFORK_BOFF(ip);
         ),
-       TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+       TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
                   "broot size %d, forkoff 0x%x",
                   MAJOR(__entry->dev), MINOR(__entry->dev),
                   __entry->ino,
@@ -3418,7 +3408,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
  
  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
  
  DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
  DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -3513,7 +3504,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
  DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
  DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
  
-TRACE_EVENT(xfs_trans_resv_calc,
+DECLARE_EVENT_CLASS(xfs_trans_resv_class,
         TP_PROTO(struct xfs_mount *mp, unsigned int type,
                  struct xfs_trans_res *res),
         TP_ARGS(mp, type, res),
@@ -3537,6 +3528,33 @@ TRACE_EVENT(xfs_trans_resv_calc,
                   __entry->logres,
                   __entry->logcount,
                   __entry->logflags)
+)
+
+#define DEFINE_TRANS_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_trans_resv_class, name, \
+       TP_PROTO(struct xfs_mount *mp, unsigned int type, \
+                struct xfs_trans_res *res), \
+       TP_ARGS(mp, type, res))
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc);
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize);
+
+TRACE_EVENT(xfs_log_get_max_trans_res,
+       TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res),
+       TP_ARGS(mp, res),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(uint, logres)
+               __field(int, logcount)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->logres = res->tr_logres;
+               __entry->logcount = res->tr_logcount;
+       ),
+       TP_printk("dev %d:%d logres %u logcount %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->logres,
+                 __entry->logcount)
  );
  
  DECLARE_EVENT_CLASS(xfs_trans_class,
@@ -4111,6 +4129,27 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
  DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
  DEFINE_ICLOG_EVENT(xlog_iclog_write);
  
+TRACE_DEFINE_ENUM(XFS_DAS_UNINIT);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_DONE);
+
  DECLARE_EVENT_CLASS(xfs_das_state_class,
         TP_PROTO(int das, struct xfs_inode *ip),
         TP_ARGS(das, ip),
@@ -4122,8 +4161,9 @@ DECLARE_EVENT_CLASS(xfs_das_state_class,
                 __entry->das = das;
                 __entry->ino = ip->i_ino;
         ),
-       TP_printk("state change %d ino 0x%llx",
-                 __entry->das, __entry->ino)
+       TP_printk("state change %s ino 0x%llx",
+                 __print_symbolic(__entry->das, XFS_DAS_STRINGS),
+                 __entry->ino)
  )
  
  #define DEFINE_DAS_STATE_EVENT(name) \
@@ -4132,9 +4172,15 @@ DEFINE_EVENT(xfs_das_state_class, name, \
         TP_ARGS(das, ip))
  DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
  DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
  DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
  DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
  DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
+
  
  TRACE_EVENT(xfs_force_shutdown,
         TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c

index 0ac717a..82cf018 100644 (file)
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -32,7 +32,6 @@ static void
  xfs_trans_trace_reservations(
         struct xfs_mount        *mp)
  {
-       struct xfs_trans_res    resv;
         struct xfs_trans_res    *res;
         struct xfs_trans_res    *end_res;
         int                     i;
@@ -41,8 +40,6 @@ xfs_trans_trace_reservations(
         end_res = (struct xfs_trans_res *)(M_RES(mp) + 1);
         for (i = 0; res < end_res; i++, res++)
                 trace_xfs_trans_resv_calc(mp, i, res);
-       xfs_log_get_max_trans_res(mp, &resv);
-       trace_xfs_trans_resv_calc(mp, -1, &resv);
  }
  #else
  # define xfs_trans_trace_reservations(mp)
@@ -194,11 +191,9 @@ xfs_trans_reserve(
                         ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
                         error = xfs_log_regrant(mp, tp->t_ticket);
                 } else {
-                       error = xfs_log_reserve(mp,
-                                               resp->tr_logres,
+                       error = xfs_log_reserve(mp, resp->tr_logres,
                                                 resp->tr_logcount,
-                                               &tp->t_ticket, XFS_TRANSACTION,
-                                               permanent);
+                                               &tp->t_ticket, permanent);
                 }
  
                 if (error)
@@ -498,10 +493,31 @@ xfs_trans_apply_sb_deltas(
                         be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
         }
  
-       if (tp->t_frextents_delta)
-               be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
-       if (tp->t_res_frextents_delta)
-               be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+       /*
+        * Updating frextents requires careful handling because it does not
+        * behave like the lazysb counters because we cannot rely on log
+        * recovery in older kernels to recompute the value from the rtbitmap.
+        * This means that the ondisk frextents must be consistent with the
+        * rtbitmap.
+        *
+        * Therefore, log the frextents change to the ondisk superblock and
+        * update the incore superblock so that future calls to xfs_log_sb
+        * write the correct value ondisk.
+        *
+        * Don't touch m_frextents because it includes incore reservations,
+        * and those are handled by the unreserve function.
+        */
+       if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+               struct xfs_mount        *mp = tp->t_mountp;
+               int64_t                 rtxdelta;
+
+               rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
+
+               spin_lock(&mp->m_sb_lock);
+               be64_add_cpu(&sbp->sb_frextents, rtxdelta);
+               mp->m_sb.sb_frextents += rtxdelta;
+               spin_unlock(&mp->m_sb_lock);
+       }
  
         if (tp->t_dblocks_delta) {
                 be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
@@ -614,7 +630,12 @@ xfs_trans_unreserve_and_mod_sb(
         if (ifreedelta)
                 percpu_counter_add(&mp->m_ifree, ifreedelta);
  
-       if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+       if (rtxdelta) {
+               error = xfs_mod_frextents(mp, rtxdelta);
+               ASSERT(!error);
+       }
+
+       if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
                 return;
  
         /* apply remaining deltas */
@@ -622,7 +643,12 @@ xfs_trans_unreserve_and_mod_sb(
         mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
         mp->m_sb.sb_icount += idelta;
         mp->m_sb.sb_ifree += ifreedelta;
-       mp->m_sb.sb_frextents += rtxdelta;
+       /*
+        * Do not touch sb_frextents here because we are dealing with incore
+        * reservation.  sb_frextents is not part of the lazy sb counters so it
+        * must be consistent with the ondisk rtbitmap and must never include
+        * incore reservations.
+        */
         mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
         mp->m_sb.sb_agcount += tp->t_agcount_delta;
         mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h

index 0c82673..9561f19 100644 (file)
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -55,13 +55,15 @@ struct xfs_log_item {
  #define        XFS_LI_IN_AIL   0
  #define        XFS_LI_ABORTED  1
  #define        XFS_LI_FAILED   2
-#define        XFS_LI_DIRTY    3       /* log item dirty in transaction */
+#define        XFS_LI_DIRTY    3
+#define        XFS_LI_WHITEOUT 4
  
  #define XFS_LI_FLAGS \
-       { (1 << XFS_LI_IN_AIL),         "IN_AIL" }, \
-       { (1 << XFS_LI_ABORTED),        "ABORTED" }, \
-       { (1 << XFS_LI_FAILED),         "FAILED" }, \
-       { (1 << XFS_LI_DIRTY),          "DIRTY" }
+       { (1u << XFS_LI_IN_AIL),        "IN_AIL" }, \
+       { (1u << XFS_LI_ABORTED),       "ABORTED" }, \
+       { (1u << XFS_LI_FAILED),        "FAILED" }, \
+       { (1u << XFS_LI_DIRTY),         "DIRTY" }, \
+       { (1u << XFS_LI_WHITEOUT),      "WHITEOUT" }
  
  struct xfs_item_ops {
         unsigned flags;
@@ -78,30 +80,32 @@ struct xfs_item_ops {
         bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
         struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
                         struct xfs_trans *tp);
+       struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
  };
  
-/* Is this log item a deferred action intent? */
+/*
+ * Log item ops flags
+ */
+/*
+ * Release the log item when the journal commits instead of inserting into the
+ * AIL for writeback tracking and/or log tail pinning.
+ */
+#define XFS_ITEM_RELEASE_WHEN_COMMITTED        (1 << 0)
+#define XFS_ITEM_INTENT                        (1 << 1)
+#define XFS_ITEM_INTENT_DONE           (1 << 2)
+
  static inline bool
  xlog_item_is_intent(struct xfs_log_item *lip)
  {
-       return lip->li_ops->iop_recover != NULL &&
-              lip->li_ops->iop_match != NULL;
+       return lip->li_ops->flags & XFS_ITEM_INTENT;
  }
  
-/* Is this a log intent-done item? */
  static inline bool
  xlog_item_is_intent_done(struct xfs_log_item *lip)
  {
-       return lip->li_ops->iop_unpin == NULL &&
-              lip->li_ops->iop_push == NULL;
+       return lip->li_ops->flags & XFS_ITEM_INTENT_DONE;
  }
  
-/*
- * Release the log item as soon as committed.  This is for items just logging
- * intents that never need to be written back in place.
- */
-#define XFS_ITEM_RELEASE_WHEN_COMMITTED        (1 << 0)
-
  void   xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
                           int type, const struct xfs_item_ops *ops);
  
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c

index 9ba7e6b..aa00cf6 100644 (file)
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -597,13 +597,11 @@ xfs_dqresv_check(
         if (softlimit && total_count > softlimit) {
                 time64_t        now = ktime_get_real_seconds();
  
-               if ((res->timer != 0 && now > res->timer) ||
-                   (res->warnings != 0 && res->warnings >= qlim->warn)) {
+               if (res->timer != 0 && now > res->timer) {
                         *fatal = true;
                         return QUOTA_NL_ISOFTLONGWARN;
                 }
  
-               res->warnings++;
                 return QUOTA_NL_ISOFTWARN;
         }
  
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c

index 0d050f8..7a044af 100644 (file)
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -12,9 +12,9 @@
  #include "xfs_trans_resv.h"
  #include "xfs_mount.h"
  #include "xfs_inode.h"
+#include "xfs_da_btree.h"
  #include "xfs_attr.h"
  #include "xfs_acl.h"
-#include "xfs_da_btree.h"
  
  #include <linux/posix_acl_xattr.h>
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 26 May 2022 02:34:40 +0000 (19:34 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 26 May 2022 02:34:40 +0000 (19:34 -0700)
fs/xfs/Makefile		patch \| blob \| history
fs/xfs/libxfs/xfs_alloc.c		patch \| blob \| history
fs/xfs/libxfs/xfs_alloc.h		patch \| blob \| history
fs/xfs/libxfs/xfs_attr.c		patch \| blob \| history
fs/xfs/libxfs/xfs_attr.h		patch \| blob \| history
fs/xfs/libxfs/xfs_attr_leaf.c		patch \| blob \| history
fs/xfs/libxfs/xfs_attr_remote.c		patch \| blob \| history
fs/xfs/libxfs/xfs_attr_remote.h		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap.c		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap.h		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_btree.h		patch \| blob \| history
fs/xfs/libxfs/xfs_da_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_da_btree.h		patch \| blob \| history
fs/xfs/libxfs/xfs_da_format.h		patch \| blob \| history
fs/xfs/libxfs/xfs_defer.c		patch \| blob \| history
fs/xfs/libxfs/xfs_defer.h		patch \| blob \| history
fs/xfs/libxfs/xfs_dir2.c		patch \| blob \| history
fs/xfs/libxfs/xfs_errortag.h		patch \| blob \| history
fs/xfs/libxfs/xfs_format.h		patch \| blob \| history
fs/xfs/libxfs/xfs_fs.h		patch \| blob \| history
fs/xfs/libxfs/xfs_ialloc.c		patch \| blob \| history
fs/xfs/libxfs/xfs_ialloc.h		patch \| blob \| history
fs/xfs/libxfs/xfs_inode_buf.c		patch \| blob \| history
fs/xfs/libxfs/xfs_inode_fork.c		patch \| blob \| history
fs/xfs/libxfs/xfs_inode_fork.h		patch \| blob \| history
fs/xfs/libxfs/xfs_log_format.h		patch \| blob \| history
fs/xfs/libxfs/xfs_log_recover.h		patch \| blob \| history
fs/xfs/libxfs/xfs_log_rlimit.c		patch \| blob \| history
fs/xfs/libxfs/xfs_quota_defs.h		patch \| blob \| history
fs/xfs/libxfs/xfs_refcount.c		patch \| blob \| history
fs/xfs/libxfs/xfs_refcount.h		patch \| blob \| history
fs/xfs/libxfs/xfs_rmap.c		patch \| blob \| history
fs/xfs/libxfs/xfs_rmap.h		patch \| blob \| history
fs/xfs/libxfs/xfs_rtbitmap.c		patch \| blob \| history
fs/xfs/libxfs/xfs_sb.c		patch \| blob \| history
fs/xfs/libxfs/xfs_shared.h		patch \| blob \| history
fs/xfs/libxfs/xfs_trans_resv.c		patch \| blob \| history
fs/xfs/libxfs/xfs_trans_resv.h		patch \| blob \| history
fs/xfs/libxfs/xfs_types.h		patch \| blob \| history
fs/xfs/scrub/bmap.c		patch \| blob \| history
fs/xfs/scrub/common.c		patch \| blob \| history
fs/xfs/scrub/inode.c		patch \| blob \| history
fs/xfs/scrub/rtbitmap.c		patch \| blob \| history
fs/xfs/xfs_acl.c		patch \| blob \| history
fs/xfs/xfs_acl.h		patch \| blob \| history
fs/xfs/xfs_attr_item.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_attr_item.h	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_attr_list.c		patch \| blob \| history
fs/xfs/xfs_bmap_item.c		patch \| blob \| history
fs/xfs/xfs_bmap_util.c		patch \| blob \| history
fs/xfs/xfs_buf_item.h		patch \| blob \| history
fs/xfs/xfs_dquot.c		patch \| blob \| history
fs/xfs/xfs_dquot.h		patch \| blob \| history
fs/xfs/xfs_error.c		patch \| blob \| history
fs/xfs/xfs_error.h		patch \| blob \| history
fs/xfs/xfs_extfree_item.c		patch \| blob \| history
fs/xfs/xfs_file.c		patch \| blob \| history
fs/xfs/xfs_filestream.c		patch \| blob \| history
fs/xfs/xfs_fsmap.c		patch \| blob \| history
fs/xfs/xfs_fsops.c		patch \| blob \| history
fs/xfs/xfs_globals.c		patch \| blob \| history
fs/xfs/xfs_icache.c		patch \| blob \| history
fs/xfs/xfs_icreate_item.c		patch \| blob \| history
fs/xfs/xfs_inode.c		patch \| blob \| history
fs/xfs/xfs_inode.h		patch \| blob \| history
fs/xfs/xfs_inode_item.c		patch \| blob \| history
fs/xfs/xfs_inode_item_recover.c		patch \| blob \| history
fs/xfs/xfs_ioctl.c		patch \| blob \| history
fs/xfs/xfs_ioctl32.c		patch \| blob \| history
fs/xfs/xfs_iomap.c		patch \| blob \| history
fs/xfs/xfs_iops.c		patch \| blob \| history
fs/xfs/xfs_itable.c		patch \| blob \| history
fs/xfs/xfs_itable.h		patch \| blob \| history
fs/xfs/xfs_iwalk.h		patch \| blob \| history
fs/xfs/xfs_log.c		patch \| blob \| history
fs/xfs/xfs_log.h		patch \| blob \| history
fs/xfs/xfs_log_cil.c		patch \| blob \| history
fs/xfs/xfs_log_priv.h		patch \| blob \| history
fs/xfs/xfs_log_recover.c		patch \| blob \| history
fs/xfs/xfs_message.c		patch \| blob \| history
fs/xfs/xfs_message.h		patch \| blob \| history
fs/xfs/xfs_mount.c		patch \| blob \| history
fs/xfs/xfs_mount.h		patch \| blob \| history
fs/xfs/xfs_ondisk.h		patch \| blob \| history
fs/xfs/xfs_qm.c		patch \| blob \| history
fs/xfs/xfs_qm.h		patch \| blob \| history
fs/xfs/xfs_qm_syscalls.c		patch \| blob \| history
fs/xfs/xfs_quotaops.c		patch \| blob \| history
fs/xfs/xfs_refcount_item.c		patch \| blob \| history
fs/xfs/xfs_reflink.c		patch \| blob \| history
fs/xfs/xfs_rmap_item.c		patch \| blob \| history
fs/xfs/xfs_rtalloc.c		patch \| blob \| history
fs/xfs/xfs_rtalloc.h		patch \| blob \| history
fs/xfs/xfs_super.c		patch \| blob \| history
fs/xfs/xfs_symlink.c		patch \| blob \| history
fs/xfs/xfs_sysctl.h		patch \| blob \| history
fs/xfs/xfs_sysfs.c		patch \| blob \| history
fs/xfs/xfs_trace.h		patch \| blob \| history
fs/xfs/xfs_trans.c		patch \| blob \| history
fs/xfs/xfs_trans.h		patch \| blob \| history
fs/xfs/xfs_trans_dquot.c		patch \| blob \| history
fs/xfs/xfs_xattr.c		patch \| blob \| history