Merge tag 'xfs-5.9-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 7 Aug 2020 17:57:29 +0000 (10:57 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 7 Aug 2020 17:57:29 +0000 (10:57 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 7 Aug 2020 17:57:29 +0000 (10:57 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 7 Aug 2020 17:57:29 +0000 (10:57 -0700)
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c

index f136647..e841ed7 100644 (file)
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -115,24 +115,3 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
                 congestion_wait(BLK_RW_ASYNC, HZ/50);
         } while (1);
  }
-
-void *
-kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
-       int     retries = 0;
-       gfp_t   lflags = kmem_flags_convert(flags);
-       void    *ptr;
-
-       trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
-       do {
-               ptr = kmem_cache_alloc(zone, lflags);
-               if (ptr || (flags & KM_MAYFAIL))
-                       return ptr;
-               if (!(++retries % 100))
-                       xfs_err(NULL,
-               "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
-                               current->comm, current->pid,
-                               __func__, lflags);
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
-       } while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h

index 34cbcfd..8e85558 100644 (file)
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -85,14 +85,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
  #define kmem_zone      kmem_cache
  #define kmem_zone_t    struct kmem_cache
  
-extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
-
-static inline void *
-kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
-       return kmem_zone_alloc(zone, flags | KM_ZERO);
-}
-
  static inline struct page *
  kmem_to_page(void *addr)
  {
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c

index 9d84007..8cf73fe 100644 (file)
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -563,7 +563,8 @@ xfs_ag_get_geometry(
         error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
         if (error)
                 goto out_agi;
-       pag = xfs_perag_get(mp, agno);
+
+       pag = agi_bp->b_pag;
  
         /* Fill out form. */
         memset(ageo, 0, sizeof(*ageo));
@@ -583,7 +584,6 @@ xfs_ag_get_geometry(
         xfs_ag_geom_health(pag, ageo);
  
         /* Release resources. */
-       xfs_perag_put(pag);
         xfs_buf_relse(agf_bp);
  out_agi:
         xfs_buf_relse(agi_bp);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h

index f3fd0ee..8a8eb4b 100644 (file)
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -37,16 +37,4 @@ xfs_ag_resv_rmapbt_alloc(
         xfs_perag_put(pag);
  }
  
-static inline void
-xfs_ag_resv_rmapbt_free(
-       struct xfs_mount        *mp,
-       xfs_agnumber_t          agno)
-{
-       struct xfs_perag        *pag;
-
-       pag = xfs_perag_get(mp, agno);
-       xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
-       xfs_perag_put(pag);
-}
-
  #endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c

index 203e74f..852b536 100644 (file)
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -710,13 +710,12 @@ xfs_alloc_read_agfl(
  STATIC int
  xfs_alloc_update_counters(
         struct xfs_trans        *tp,
-       struct xfs_perag        *pag,
         struct xfs_buf          *agbp,
         long                    len)
  {
         struct xfs_agf          *agf = agbp->b_addr;
  
-       pag->pagf_freeblks += len;
+       agbp->b_pag->pagf_freeblks += len;
         be32_add_cpu(&agf->agf_freeblks, len);
  
         xfs_trans_agblocks_delta(tp, len);
@@ -1175,8 +1174,7 @@ xfs_alloc_ag_vextent(
         }
  
         if (!args->wasfromfl) {
-               error = xfs_alloc_update_counters(args->tp, args->pag,
-                                                 args->agbp,
+               error = xfs_alloc_update_counters(args->tp, args->agbp,
                                                   -((long)(args->len)));
                 if (error)
                         return error;
@@ -1887,7 +1885,6 @@ xfs_free_ag_extent(
         enum xfs_ag_resv_type           type)
  {
         struct xfs_mount                *mp;
-       struct xfs_perag                *pag;
         struct xfs_btree_cur            *bno_cur;
         struct xfs_btree_cur            *cnt_cur;
         xfs_agblock_t                   gtbno; /* start of right neighbor */
@@ -2167,10 +2164,8 @@ xfs_free_ag_extent(
         /*
          * Update the freespace totals in the ag and superblock.
          */
-       pag = xfs_perag_get(mp, agno);
-       error = xfs_alloc_update_counters(tp, pag, agbp, len);
-       xfs_ag_resv_free_extent(pag, type, tp, len);
-       xfs_perag_put(pag);
+       error = xfs_alloc_update_counters(tp, agbp, len);
+       xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
         if (error)
                 goto error0;
  
@@ -2467,7 +2462,8 @@ xfs_defer_agfl_block(
         ASSERT(xfs_bmap_free_item_zone != NULL);
         ASSERT(oinfo != NULL);
  
-       new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+       new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+                              GFP_KERNEL | __GFP_NOFAIL);
         new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
         new->xefi_blockcount = 1;
         new->xefi_oinfo = *oinfo;
@@ -2689,7 +2685,7 @@ xfs_alloc_get_freelist(
         if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
                 agf->agf_flfirst = 0;
  
-       pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+       pag = agbp->b_pag;
         ASSERT(!pag->pagf_agflreset);
         be32_add_cpu(&agf->agf_flcount, -1);
         xfs_trans_agflist_delta(tp, -1);
@@ -2701,7 +2697,6 @@ xfs_alloc_get_freelist(
                 pag->pagf_btreeblks++;
                 logflags |= XFS_AGF_BTREEBLKS;
         }
-       xfs_perag_put(pag);
  
         xfs_alloc_log_agf(tp, agbp, logflags);
         *bnop = bno;
@@ -2797,7 +2792,7 @@ xfs_alloc_put_freelist(
         if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
                 agf->agf_fllast = 0;
  
-       pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+       pag = agbp->b_pag;
         ASSERT(!pag->pagf_agflreset);
         be32_add_cpu(&agf->agf_flcount, 1);
         xfs_trans_agflist_delta(tp, 1);
@@ -2809,7 +2804,6 @@ xfs_alloc_put_freelist(
                 pag->pagf_btreeblks--;
                 logflags |= XFS_AGF_BTREEBLKS;
         }
-       xfs_perag_put(pag);
  
         xfs_alloc_log_agf(tp, agbp, logflags);
  
@@ -3006,7 +3000,7 @@ xfs_alloc_read_agf(
         ASSERT(!(*bpp)->b_error);
  
         agf = (*bpp)->b_addr;
-       pag = xfs_perag_get(mp, agno);
+       pag = (*bpp)->b_pag;
         if (!pag->pagf_init) {
                 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
                 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -3034,7 +3028,6 @@ xfs_alloc_read_agf(
                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
         }
  #endif
-       xfs_perag_put(pag);
         return 0;
  }
  
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c

index 60c453c..8e01231 100644 (file)
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -38,16 +38,14 @@ xfs_allocbt_set_root(
  {
         struct xfs_buf          *agbp = cur->bc_ag.agbp;
         struct xfs_agf          *agf = agbp->b_addr;
-       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
         int                     btnum = cur->bc_btnum;
-       struct xfs_perag        *pag = xfs_perag_get(cur->bc_mp, seqno);
+       struct xfs_perag        *pag = agbp->b_pag;
  
         ASSERT(ptr->s != 0);
  
         agf->agf_roots[btnum] = ptr->s;
         be32_add_cpu(&agf->agf_levels[btnum], inc);
         pag->pagf_levels[btnum] += inc;
-       xfs_perag_put(pag);
  
         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
  }
@@ -115,7 +113,6 @@ xfs_allocbt_update_lastrec(
         int                     reason)
  {
         struct xfs_agf          *agf = cur->bc_ag.agbp->b_addr;
-       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
         struct xfs_perag        *pag;
         __be32                  len;
         int                     numrecs;
@@ -160,9 +157,8 @@ xfs_allocbt_update_lastrec(
         }
  
         agf->agf_longest = len;
-       pag = xfs_perag_get(cur->bc_mp, seqno);
+       pag = cur->bc_ag.agbp->b_pag;
         pag->pagf_longest = be32_to_cpu(len);
-       xfs_perag_put(pag);
         xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST);
  }
  
@@ -484,7 +480,7 @@ xfs_allocbt_init_common(
  
         ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
  
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+       cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
  
         cur->bc_tp = tp;
         cur->bc_mp = mp;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c

index 3b1bd6e..2e055c0 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -46,6 +46,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
  STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
  STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
  STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
  
  /*
   * Internal routines when attribute list is more than one block.
@@ -53,6 +54,8 @@ STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
  STATIC int xfs_attr_node_get(xfs_da_args_t *args);
  STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
  STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
+                                struct xfs_da_state **state);
  STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
  STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
  
@@ -175,8 +178,13 @@ xfs_attr_try_sf_addname(
         struct xfs_da_args      *args)
  {
  
-       struct xfs_mount        *mp = dp->i_mount;
-       int                     error, error2;
+       int                     error;
+
+       /*
+        * Build initial attribute list (if required).
+        */
+       if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS)
+               xfs_attr_shortform_create(args);
  
         error = xfs_attr_shortform_addname(args);
         if (error == -ENOSPC)
@@ -189,12 +197,70 @@ xfs_attr_try_sf_addname(
         if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
                 xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
  
-       if (mp->m_flags & XFS_MOUNT_WSYNC)
+       if (dp->i_mount->m_flags & XFS_MOUNT_WSYNC)
                 xfs_trans_set_sync(args->trans);
  
-       error2 = xfs_trans_commit(args->trans);
-       args->trans = NULL;
-       return error ? error : error2;
+       return error;
+}
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform to
+ * single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+       struct xfs_inode    *ip)
+{
+       return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
+              (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+               ip->i_afp->if_nextents == 0);
+}
+
+/*
+ * Attempts to set an attr in shortform, or converts short form to leaf form if
+ * there is not enough room.  If the attr is set, the transaction is committed
+ * and set to NULL.
+ */
+STATIC int
+xfs_attr_set_shortform(
+       struct xfs_da_args      *args,
+       struct xfs_buf          **leaf_bp)
+{
+       struct xfs_inode        *dp = args->dp;
+       int                     error, error2 = 0;
+
+       /*
+        * Try to add the attr to the attribute list in the inode.
+        */
+       error = xfs_attr_try_sf_addname(dp, args);
+       if (error != -ENOSPC) {
+               error2 = xfs_trans_commit(args->trans);
+               args->trans = NULL;
+               return error ? error : error2;
+       }
+       /*
+        * It won't fit in the shortform, transform to a leaf block.  GROT:
+        * another possible req'mt for a double-split btree op.
+        */
+       error = xfs_attr_shortform_to_leaf(args, leaf_bp);
+       if (error)
+               return error;
+
+       /*
+        * Prevent the leaf buffer from being unlocked so that a concurrent AIL
+        * push cannot grab the half-baked leaf buffer and run into problems
+        * with the write verifier. Once we're done rolling the transaction we
+        * can release the hold and add the attr to the leaf.
+        */
+       xfs_trans_bhold(args->trans, *leaf_bp);
+       error = xfs_defer_finish(&args->trans);
+       xfs_trans_bhold_release(args->trans, *leaf_bp);
+       if (error) {
+               xfs_trans_brelse(args->trans, *leaf_bp);
+               return error;
+       }
+
+       return 0;
  }
  
  /*
@@ -206,60 +272,93 @@ xfs_attr_set_args(
  {
         struct xfs_inode        *dp = args->dp;
         struct xfs_buf          *leaf_bp = NULL;
-       int                     error;
+       int                     error = 0;
  
         /*
-        * If the attribute list is non-existent or a shortform list,
-        * upgrade it to a single-leaf-block attribute list.
+        * If the attribute list is already in leaf format, jump straight to
+        * leaf handling.  Otherwise, try to add the attribute to the shortform
+        * list; if there's no room then convert the list to leaf format and try
+        * again.
          */
-       if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
-           (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
-            dp->i_afp->if_nextents == 0)) {
+       if (xfs_attr_is_shortform(dp)) {
  
                 /*
-                * Build initial attribute list (if required).
+                * If the attr was successfully set in shortform, the
+                * transaction is committed and set to NULL.  Otherwise, it is
+                * converted from shortform to leaf, and the transaction is
+                * retained.
                  */
-               if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS)
-                       xfs_attr_shortform_create(args);
+               error = xfs_attr_set_shortform(args, &leaf_bp);
+               if (error || !args->trans)
+                       return error;
+       }
  
-               /*
-                * Try to add the attr to the attribute list in the inode.
-                */
-               error = xfs_attr_try_sf_addname(dp, args);
+       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+               error = xfs_attr_leaf_addname(args);
                 if (error != -ENOSPC)
                         return error;
  
                 /*
-                * It won't fit in the shortform, transform to a leaf block.
-                * GROT: another possible req'mt for a double-split btree op.
+                * Promote the attribute list to the Btree format.
                  */
-               error = xfs_attr_shortform_to_leaf(args, &leaf_bp);
+               error = xfs_attr3_leaf_to_node(args);
                 if (error)
                         return error;
  
                 /*
-                * Prevent the leaf buffer from being unlocked so that a
-                * concurrent AIL push cannot grab the half-baked leaf
-                * buffer and run into problems with the write verifier.
-                * Once we're done rolling the transaction we can release
-                * the hold and add the attr to the leaf.
+                * Finish any deferred work items and roll the transaction once
+                * more.  The goal here is to call node_addname with the inode
+                * and transaction in the same state (inode locked and joined,
+                * transaction clean) no matter how we got to this step.
                  */
-               xfs_trans_bhold(args->trans, leaf_bp);
                 error = xfs_defer_finish(&args->trans);
-               xfs_trans_bhold_release(args->trans, leaf_bp);
-               if (error) {
-                       xfs_trans_brelse(args->trans, leaf_bp);
+               if (error)
+                       return error;
+
+               /*
+                * Commit the current trans (including the inode) and
+                * start a new one.
+                */
+               error = xfs_trans_roll_inode(&args->trans, dp);
+               if (error)
                         return error;
-               }
         }
  
-       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
-               error = xfs_attr_leaf_addname(args);
-       else
-               error = xfs_attr_node_addname(args);
+       error = xfs_attr_node_addname(args);
         return error;
  }
  
+/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ */
+int
+xfs_has_attr(
+       struct xfs_da_args      *args)
+{
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_buf          *bp = NULL;
+       int                     error;
+
+       if (!xfs_inode_hasattr(dp))
+               return -ENOATTR;
+
+       if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) {
+               ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+               return xfs_attr_sf_findname(args, NULL, NULL);
+       }
+
+       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+               error = xfs_attr_leaf_hasname(args, &bp);
+
+               if (bp)
+                       xfs_trans_brelse(args->trans, bp);
+
+               return error;
+       }
+
+       return xfs_attr_node_hasname(args, NULL);
+}
+
  /*
   * Remove the attribute specified in @args.
   */
@@ -370,6 +469,15 @@ xfs_attr_set(
                                 args->total, 0, quota_flags);
                 if (error)
                         goto out_trans_cancel;
+
+               error = xfs_has_attr(args);
+               if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
+                       goto out_trans_cancel;
+               if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
+                       goto out_trans_cancel;
+               if (error != -ENOATTR && error != -EEXIST)
+                       goto out_trans_cancel;
+
                 error = xfs_attr_set_args(args);
                 if (error)
                         goto out_trans_cancel;
@@ -377,6 +485,10 @@ xfs_attr_set(
                 if (!args->trans)
                         goto out_unlock;
         } else {
+               error = xfs_has_attr(args);
+               if (error != -EEXIST)
+                       goto out_trans_cancel;
+
                 error = xfs_attr_remove_args(args);
                 if (error)
                         goto out_trans_cancel;
@@ -459,36 +571,54 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
   * External routines when attribute list is one block
   *========================================================================*/
  
+/* Store info about a remote block */
+STATIC void
+xfs_attr_save_rmt_blk(
+       struct xfs_da_args      *args)
+{
+       args->blkno2 = args->blkno;
+       args->index2 = args->index;
+       args->rmtblkno2 = args->rmtblkno;
+       args->rmtblkcnt2 = args->rmtblkcnt;
+       args->rmtvaluelen2 = args->rmtvaluelen;
+}
+
+/* Set stored info about a remote block */
+STATIC void
+xfs_attr_restore_rmt_blk(
+       struct xfs_da_args      *args)
+{
+       args->blkno = args->blkno2;
+       args->index = args->index2;
+       args->rmtblkno = args->rmtblkno2;
+       args->rmtblkcnt = args->rmtblkcnt2;
+       args->rmtvaluelen = args->rmtvaluelen2;
+}
+
  /*
- * Add a name to the leaf attribute list structure
+ * Tries to add an attribute to an inode in leaf form
   *
- * This leaf block cannot have a "remote" value, we only call this routine
- * if bmap_one_block() says there is only one block (ie: no remote blks).
+ * This function is meant to execute as part of a delayed operation and leaves
+ * the transaction handling to the caller.  On success the attribute is added
+ * and the inode and transaction are left dirty.  If there is not enough space,
+ * the attr data is converted to node format and -ENOSPC is returned. Caller is
+ * responsible for handling the dirty inode and transaction or adding the attr
+ * in node format.
   */
  STATIC int
-xfs_attr_leaf_addname(
-       struct xfs_da_args      *args)
+xfs_attr_leaf_try_add(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp)
  {
-       struct xfs_inode        *dp;
-       struct xfs_buf          *bp;
-       int                     retval, error, forkoff;
-
-       trace_xfs_attr_leaf_addname(args);
-
-       /*
-        * Read the (only) block in the attribute list in.
-        */
-       dp = args->dp;
-       args->blkno = 0;
-       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-       if (error)
-               return error;
+       int                     retval;
  
         /*
          * Look up the given attribute in the leaf block.  Figure out if
          * the given flags produce an error or call for an atomic rename.
          */
-       retval = xfs_attr3_leaf_lookup_int(bp, args);
+       retval = xfs_attr_leaf_hasname(args, &bp);
+       if (retval != -ENOATTR && retval != -EEXIST)
+               return retval;
         if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
                 goto out_brelse;
         if (retval == -EEXIST) {
@@ -499,11 +629,7 @@ xfs_attr_leaf_addname(
  
                 /* save the attribute state for later removal*/
                 args->op_flags |= XFS_DA_OP_RENAME;     /* an atomic rename */
-               args->blkno2 = args->blkno;             /* set 2nd entry info*/
-               args->index2 = args->index;
-               args->rmtblkno2 = args->rmtblkno;
-               args->rmtblkcnt2 = args->rmtblkcnt;
-               args->rmtvaluelen2 = args->rmtvaluelen;
+               xfs_attr_save_rmt_blk(args);
  
                 /*
                  * clear the remote attr state now that it is saved so that the
@@ -516,37 +642,35 @@ xfs_attr_leaf_addname(
         }
  
         /*
-        * Add the attribute to the leaf block, transitioning to a Btree
-        * if required.
+        * Add the attribute to the leaf block
          */
-       retval = xfs_attr3_leaf_add(bp, args);
-       if (retval == -ENOSPC) {
-               /*
-                * Promote the attribute list to the Btree format, then
-                * Commit that transaction so that the node_addname() call
-                * can manage its own transactions.
-                */
-               error = xfs_attr3_leaf_to_node(args);
-               if (error)
-                       return error;
-               error = xfs_defer_finish(&args->trans);
-               if (error)
-                       return error;
+       return xfs_attr3_leaf_add(bp, args);
  
-               /*
-                * Commit the current trans (including the inode) and start
-                * a new one.
-                */
-               error = xfs_trans_roll_inode(&args->trans, dp);
-               if (error)
-                       return error;
+out_brelse:
+       xfs_trans_brelse(args->trans, bp);
+       return retval;
+}
  
-               /*
-                * Fob the whole rest of the problem off on the Btree code.
-                */
-               error = xfs_attr_node_addname(args);
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_addname(
+       struct xfs_da_args      *args)
+{
+       int                     error, forkoff;
+       struct xfs_buf          *bp = NULL;
+       struct xfs_inode        *dp = args->dp;
+
+       trace_xfs_attr_leaf_addname(args);
+
+       error = xfs_attr_leaf_try_add(args, bp);
+       if (error)
                 return error;
-       }
  
         /*
          * Commit the transaction that added the attr name so that
@@ -568,75 +692,93 @@ xfs_attr_leaf_addname(
                         return error;
         }
  
-       /*
-        * If this is an atomic rename operation, we must "flip" the
-        * incomplete flags on the "new" and "old" attribute/value pairs
-        * so that one disappears and one appears atomically.  Then we
-        * must remove the "old" attribute/value pair.
-        */
-       if (args->op_flags & XFS_DA_OP_RENAME) {
+       if (!(args->op_flags & XFS_DA_OP_RENAME)) {
                 /*
-                * In a separate transaction, set the incomplete flag on the
-                * "old" attr and clear the incomplete flag on the "new" attr.
+                * Added a "remote" value, just clear the incomplete flag.
                  */
-               error = xfs_attr3_leaf_flipflags(args);
+               if (args->rmtblkno > 0)
+                       error = xfs_attr3_leaf_clearflag(args);
+
+               return error;
+       }
+
+       /*
+        * If this is an atomic rename operation, we must "flip" the incomplete
+        * flags on the "new" and "old" attribute/value pairs so that one
+        * disappears and one appears atomically.  Then we must remove the "old"
+        * attribute/value pair.
+        *
+        * In a separate transaction, set the incomplete flag on the "old" attr
+        * and clear the incomplete flag on the "new" attr.
+        */
+
+       error = xfs_attr3_leaf_flipflags(args);
+       if (error)
+               return error;
+       /*
+        * Commit the flag value change and start the next trans in series.
+        */
+       error = xfs_trans_roll_inode(&args->trans, args->dp);
+       if (error)
+               return error;
+
+       /*
+        * Dismantle the "old" attribute/value pair by removing a "remote" value
+        * (if it exists).
+        */
+       xfs_attr_restore_rmt_blk(args);
+
+       if (args->rmtblkno) {
+               error = xfs_attr_rmtval_invalidate(args);
                 if (error)
                         return error;
  
-               /*
-                * Dismantle the "old" attribute/value pair by removing
-                * a "remote" value (if it exists).
-                */
-               args->index = args->index2;
-               args->blkno = args->blkno2;
-               args->rmtblkno = args->rmtblkno2;
-               args->rmtblkcnt = args->rmtblkcnt2;
-               args->rmtvaluelen = args->rmtvaluelen2;
-               if (args->rmtblkno) {
-                       error = xfs_attr_rmtval_remove(args);
-                       if (error)
-                               return error;
-               }
-
-               /*
-                * Read in the block containing the "old" attr, then
-                * remove the "old" attr from that block (neat, huh!)
-                */
-               error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-                                          &bp);
+               error = xfs_attr_rmtval_remove(args);
                 if (error)
                         return error;
+       }
  
-               xfs_attr3_leaf_remove(bp, args);
+       /*
+        * Read in the block containing the "old" attr, then remove the "old"
+        * attr from that block (neat, huh!)
+        */
+       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+                                  &bp);
+       if (error)
+               return error;
  
-               /*
-                * If the result is small enough, shrink it all into the inode.
-                */
-               if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-                       error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-                       /* bp is gone due to xfs_da_shrink_inode */
-                       if (error)
-                               return error;
-                       error = xfs_defer_finish(&args->trans);
-                       if (error)
-                               return error;
-               }
+       xfs_attr3_leaf_remove(bp, args);
  
-               /*
-                * Commit the remove and start the next trans in series.
-                */
-               error = xfs_trans_roll_inode(&args->trans, dp);
+       /*
+        * If the result is small enough, shrink it all into the inode.
+        */
+       forkoff = xfs_attr_shortform_allfit(bp, dp);
+       if (forkoff)
+               error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+               /* bp is gone due to xfs_da_shrink_inode */
+
+       return error;
+}
+
+/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ */
+STATIC int
+xfs_attr_leaf_hasname(
+       struct xfs_da_args      *args,
+       struct xfs_buf          **bp)
+{
+       int                     error = 0;
+
+       error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+       if (error)
+               return error;
+
+       error = xfs_attr3_leaf_lookup_int(*bp, args);
+       if (error != -ENOATTR && error != -EEXIST)
+               xfs_trans_brelse(args->trans, *bp);
  
-       } else if (args->rmtblkno > 0) {
-               /*
-                * Added a "remote" value, just clear the incomplete flag.
-                */
-               error = xfs_attr3_leaf_clearflag(args);
-       }
         return error;
-out_brelse:
-       xfs_trans_brelse(args->trans, bp);
-       return retval;
  }
  
  /*
@@ -659,31 +801,25 @@ xfs_attr_leaf_removename(
          * Remove the attribute.
          */
         dp = args->dp;
-       args->blkno = 0;
-       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-       if (error)
-               return error;
  
-       error = xfs_attr3_leaf_lookup_int(bp, args);
+       error = xfs_attr_leaf_hasname(args, &bp);
+
         if (error == -ENOATTR) {
                 xfs_trans_brelse(args->trans, bp);
                 return error;
-       }
+       } else if (error != -EEXIST)
+               return error;
  
         xfs_attr3_leaf_remove(bp, args);
  
         /*
          * If the result is small enough, shrink it all into the inode.
          */
-       if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-               error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+       forkoff = xfs_attr_shortform_allfit(bp, dp);
+       if (forkoff)
+               return xfs_attr3_leaf_to_shortform(bp, args, forkoff);
                 /* bp is gone due to xfs_da_shrink_inode */
-               if (error)
-                       return error;
-               error = xfs_defer_finish(&args->trans);
-               if (error)
-                       return error;
-       }
+
         return 0;
  }
  
@@ -703,21 +839,53 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
  
         trace_xfs_attr_leaf_get(args);
  
-       args->blkno = 0;
-       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-       if (error)
-               return error;
+       error = xfs_attr_leaf_hasname(args, &bp);
  
-       error = xfs_attr3_leaf_lookup_int(bp, args);
-       if (error != -EEXIST)  {
+       if (error == -ENOATTR)  {
                 xfs_trans_brelse(args->trans, bp);
                 return error;
-       }
+       } else if (error != -EEXIST)
+               return error;
+
+
         error = xfs_attr3_leaf_getvalue(bp, args);
         xfs_trans_brelse(args->trans, bp);
         return error;
  }
  
+/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ * statep: If not null, is set to point at the found state.  Caller will
+ *         be responsible for freeing the state in this case.
+ */
+STATIC int
+xfs_attr_node_hasname(
+       struct xfs_da_args      *args,
+       struct xfs_da_state     **statep)
+{
+       struct xfs_da_state     *state;
+       int                     retval, error;
+
+       state = xfs_da_state_alloc(args);
+       if (statep != NULL)
+               *statep = NULL;
+
+       /*
+        * Search to see if name exists, and get back a pointer to it.
+        */
+       error = xfs_da3_node_lookup_int(state, &retval);
+       if (error) {
+               xfs_da_state_free(state);
+               return error;
+       }
+
+       if (statep != NULL)
+               *statep = state;
+       else
+               xfs_da_state_free(state);
+       return retval;
+}
+
  /*========================================================================
   * External routines when attribute list size > geo->blksize
   *========================================================================*/
@@ -739,7 +907,6 @@ xfs_attr_node_addname(
         struct xfs_da_state     *state;
         struct xfs_da_state_blk *blk;
         struct xfs_inode        *dp;
-       struct xfs_mount        *mp;
         int                     retval, error;
  
         trace_xfs_attr_node_addname(args);
@@ -748,19 +915,15 @@ xfs_attr_node_addname(
          * Fill in bucket of arguments/results/context to carry around.
          */
         dp = args->dp;
-       mp = dp->i_mount;
  restart:
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = mp;
-
         /*
          * Search to see if name already exists, and get back a pointer
          * to where it should go.
          */
-       error = xfs_da3_node_lookup_int(state, &retval);
-       if (error)
+       retval = xfs_attr_node_hasname(args, &state);
+       if (retval != -ENOATTR && retval != -EEXIST)
                 goto out;
+
         blk = &state->path.blk[ state->path.active-1 ];
         ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
         if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
@@ -773,11 +936,7 @@ restart:
  
                 /* save the attribute state for later removal*/
                 args->op_flags |= XFS_DA_OP_RENAME;     /* atomic rename op */
-               args->blkno2 = args->blkno;             /* set 2nd entry info*/
-               args->index2 = args->index;
-               args->rmtblkno2 = args->rmtblkno;
-               args->rmtblkcnt2 = args->rmtblkcnt;
-               args->rmtvaluelen2 = args->rmtvaluelen;
+               xfs_attr_save_rmt_blk(args);
  
                 /*
                  * clear the remote attr state now that it is saved so that the
@@ -863,82 +1022,75 @@ restart:
                         return error;
         }
  
-       /*
-        * If this is an atomic rename operation, we must "flip" the
-        * incomplete flags on the "new" and "old" attribute/value pairs
-        * so that one disappears and one appears atomically.  Then we
-        * must remove the "old" attribute/value pair.
-        */
-       if (args->op_flags & XFS_DA_OP_RENAME) {
+       if (!(args->op_flags & XFS_DA_OP_RENAME)) {
                 /*
-                * In a separate transaction, set the incomplete flag on the
-                * "old" attr and clear the incomplete flag on the "new" attr.
+                * Added a "remote" value, just clear the incomplete flag.
                  */
-               error = xfs_attr3_leaf_flipflags(args);
-               if (error)
-                       goto out;
+               if (args->rmtblkno > 0)
+                       error = xfs_attr3_leaf_clearflag(args);
+               retval = error;
+               goto out;
+       }
  
-               /*
-                * Dismantle the "old" attribute/value pair by removing
-                * a "remote" value (if it exists).
-                */
-               args->index = args->index2;
-               args->blkno = args->blkno2;
-               args->rmtblkno = args->rmtblkno2;
-               args->rmtblkcnt = args->rmtblkcnt2;
-               args->rmtvaluelen = args->rmtvaluelen2;
-               if (args->rmtblkno) {
-                       error = xfs_attr_rmtval_remove(args);
-                       if (error)
-                               return error;
-               }
+       /*
+        * If this is an atomic rename operation, we must "flip" the incomplete
+        * flags on the "new" and "old" attribute/value pairs so that one
+        * disappears and one appears atomically.  Then we must remove the "old"
+        * attribute/value pair.
+        *
+        * In a separate transaction, set the incomplete flag on the "old" attr
+        * and clear the incomplete flag on the "new" attr.
+        */
+       error = xfs_attr3_leaf_flipflags(args);
+       if (error)
+               goto out;
+       /*
+        * Commit the flag value change and start the next trans in series
+        */
+       error = xfs_trans_roll_inode(&args->trans, args->dp);
+       if (error)
+               goto out;
  
-               /*
-                * Re-find the "old" attribute entry after any split ops.
-                * The INCOMPLETE flag means that we will find the "old"
-                * attr, not the "new" one.
-                */
-               args->attr_filter |= XFS_ATTR_INCOMPLETE;
-               state = xfs_da_state_alloc();
-               state->args = args;
-               state->mp = mp;
-               state->inleaf = 0;
-               error = xfs_da3_node_lookup_int(state, &retval);
+       /*
+        * Dismantle the "old" attribute/value pair by removing a "remote" value
+        * (if it exists).
+        */
+       xfs_attr_restore_rmt_blk(args);
+
+       if (args->rmtblkno) {
+               error = xfs_attr_rmtval_invalidate(args);
                 if (error)
-                       goto out;
+                       return error;
  
-               /*
-                * Remove the name and update the hashvals in the tree.
-                */
-               blk = &state->path.blk[ state->path.active-1 ];
-               ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-               error = xfs_attr3_leaf_remove(blk->bp, args);
-               xfs_da3_fixhashpath(state, &state->path);
+               error = xfs_attr_rmtval_remove(args);
+               if (error)
+                       return error;
+       }
  
-               /*
-                * Check to see if the tree needs to be collapsed.
-                */
-               if (retval && (state->path.active > 1)) {
-                       error = xfs_da3_join(state);
-                       if (error)
-                               goto out;
-                       error = xfs_defer_finish(&args->trans);
-                       if (error)
-                               goto out;
-               }
+       /*
+        * Re-find the "old" attribute entry after any split ops. The INCOMPLETE
+        * flag means that we will find the "old" attr, not the "new" one.
+        */
+       args->attr_filter |= XFS_ATTR_INCOMPLETE;
+       state = xfs_da_state_alloc(args);
+       state->inleaf = 0;
+       error = xfs_da3_node_lookup_int(state, &retval);
+       if (error)
+               goto out;
  
-               /*
-                * Commit and start the next trans in the chain.
-                */
-               error = xfs_trans_roll_inode(&args->trans, dp);
-               if (error)
-                       goto out;
+       /*
+        * Remove the name and update the hashvals in the tree.
+        */
+       blk = &state->path.blk[state->path.active-1];
+       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+       error = xfs_attr3_leaf_remove(blk->bp, args);
+       xfs_da3_fixhashpath(state, &state->path);
  
-       } else if (args->rmtblkno > 0) {
-               /*
-                * Added a "remote" value, just clear the incomplete flag.
-                */
-               error = xfs_attr3_leaf_clearflag(args);
+       /*
+        * Check to see if the tree needs to be collapsed.
+        */
+       if (retval && (state->path.active > 1)) {
+               error = xfs_da3_join(state);
                 if (error)
                         goto out;
         }
@@ -952,6 +1104,114 @@ out:
         return retval;
  }
  
+/*
+ * Shrink an attribute from leaf to shortform
+ */
+STATIC int
+xfs_attr_node_shrink(
+       struct xfs_da_args      *args,
+       struct xfs_da_state     *state)
+{
+       struct xfs_inode        *dp = args->dp;
+       int                     error, forkoff;
+       struct xfs_buf          *bp;
+
+       /*
+        * Have to get rid of the copy of this dabuf in the state.
+        */
+       ASSERT(state->path.active == 1);
+       ASSERT(state->path.blk[0].bp);
+       state->path.blk[0].bp = NULL;
+
+       error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+       if (error)
+               return error;
+
+       forkoff = xfs_attr_shortform_allfit(bp, dp);
+       if (forkoff) {
+               error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+               /* bp is gone due to xfs_da_shrink_inode */
+       } else
+               xfs_trans_brelse(args->trans, bp);
+
+       return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+STATIC int
+xfs_attr_leaf_mark_incomplete(
+       struct xfs_da_args      *args,
+       struct xfs_da_state     *state)
+{
+       int                     error;
+
+       /*
+        * Fill in disk block numbers in the state structure
+        * so that we can get the buffers back after we commit
+        * several transactions in the following calls.
+        */
+       error = xfs_attr_fillstate(state);
+       if (error)
+               return error;
+
+       /*
+        * Mark the attribute as INCOMPLETE
+        */
+       return xfs_attr3_leaf_setflag(args);
+}
+
+/*
+ * Initial setup for xfs_attr_node_removename.  Make sure the attr is there and
+ * the blocks are valid.  Attr keys with remote blocks will be marked
+ * incomplete.
+ */
+STATIC
+int xfs_attr_node_removename_setup(
+       struct xfs_da_args      *args,
+       struct xfs_da_state     **state)
+{
+       int                     error;
+
+       error = xfs_attr_node_hasname(args, state);
+       if (error != -EEXIST)
+               return error;
+
+       ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
+       ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
+               XFS_ATTR_LEAF_MAGIC);
+
+       if (args->rmtblkno > 0) {
+               error = xfs_attr_leaf_mark_incomplete(args, *state);
+               if (error)
+                       return error;
+
+               return xfs_attr_rmtval_invalidate(args);
+       }
+
+       return 0;
+}
+
+STATIC int
+xfs_attr_node_remove_rmt(
+       struct xfs_da_args      *args,
+       struct xfs_da_state     *state)
+{
+       int                     error = 0;
+
+       error = xfs_attr_rmtval_remove(args);
+       if (error)
+               return error;
+
+       /*
+        * Refill the state structure with buffers, the prior calls released our
+        * buffers.
+        */
+       return xfs_attr_refillstate(state);
+}
+
  /*
   * Remove a name from a B-tree attribute list.
   *
@@ -965,64 +1225,22 @@ xfs_attr_node_removename(
  {
         struct xfs_da_state     *state;
         struct xfs_da_state_blk *blk;
-       struct xfs_inode        *dp;
-       struct xfs_buf          *bp;
-       int                     retval, error, forkoff;
+       int                     retval, error;
+       struct xfs_inode        *dp = args->dp;
  
         trace_xfs_attr_node_removename(args);
  
-       /*
-        * Tie a string around our finger to remind us where we are.
-        */
-       dp = args->dp;
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = dp->i_mount;
-
-       /*
-        * Search to see if name exists, and get back a pointer to it.
-        */
-       error = xfs_da3_node_lookup_int(state, &retval);
-       if (error || (retval != -EEXIST)) {
-               if (error == 0)
-                       error = retval;
+       error = xfs_attr_node_removename_setup(args, &state);
+       if (error)
                 goto out;
-       }
  
         /*
          * If there is an out-of-line value, de-allocate the blocks.
          * This is done before we remove the attribute so that we don't
          * overflow the maximum size of a transaction and/or hit a deadlock.
          */
-       blk = &state->path.blk[ state->path.active-1 ];
-       ASSERT(blk->bp != NULL);
-       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
         if (args->rmtblkno > 0) {
-               /*
-                * Fill in disk block numbers in the state structure
-                * so that we can get the buffers back after we commit
-                * several transactions in the following calls.
-                */
-               error = xfs_attr_fillstate(state);
-               if (error)
-                       goto out;
-
-               /*
-                * Mark the attribute as INCOMPLETE, then bunmapi() the
-                * remote value.
-                */
-               error = xfs_attr3_leaf_setflag(args);
-               if (error)
-                       goto out;
-               error = xfs_attr_rmtval_remove(args);
-               if (error)
-                       goto out;
-
-               /*
-                * Refill the state structure with buffers, the prior calls
-                * released our buffers.
-                */
-               error = xfs_attr_refillstate(state);
+               error = xfs_attr_node_remove_rmt(args, state);
                 if (error)
                         goto out;
         }
@@ -1056,33 +1274,12 @@ xfs_attr_node_removename(
         /*
          * If the result is small enough, push it all into the inode.
          */
-       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-               /*
-                * Have to get rid of the copy of this dabuf in the state.
-                */
-               ASSERT(state->path.active == 1);
-               ASSERT(state->path.blk[0].bp);
-               state->path.blk[0].bp = NULL;
-
-               error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
-               if (error)
-                       goto out;
-
-               if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-                       error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-                       /* bp is gone due to xfs_da_shrink_inode */
-                       if (error)
-                               goto out;
-                       error = xfs_defer_finish(&args->trans);
-                       if (error)
-                               goto out;
-               } else
-                       xfs_trans_brelse(args->trans, bp);
-       }
-       error = 0;
+       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+               error = xfs_attr_node_shrink(args, state);
  
  out:
-       xfs_da_state_free(state);
+       if (state)
+               xfs_da_state_free(state);
         return error;
  }
  
@@ -1198,47 +1395,41 @@ xfs_attr_refillstate(xfs_da_state_t *state)
   * Returns 0 on successful retrieval, otherwise an error.
   */
  STATIC int
-xfs_attr_node_get(xfs_da_args_t *args)
+xfs_attr_node_get(
+       struct xfs_da_args      *args)
  {
-       xfs_da_state_t *state;
-       xfs_da_state_blk_t *blk;
-       int error, retval;
-       int i;
+       struct xfs_da_state     *state;
+       struct xfs_da_state_blk *blk;
+       int                     i;
+       int                     error;
  
         trace_xfs_attr_node_get(args);
  
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
-
         /*
          * Search to see if name exists, and get back a pointer to it.
          */
-       error = xfs_da3_node_lookup_int(state, &retval);
-       if (error) {
-               retval = error;
-               goto out_release;
-       }
-       if (retval != -EEXIST)
+       error = xfs_attr_node_hasname(args, &state);
+       if (error != -EEXIST)
                 goto out_release;
  
         /*
          * Get the value, local or "remote"
          */
         blk = &state->path.blk[state->path.active - 1];
-       retval = xfs_attr3_leaf_getvalue(blk->bp, args);
+       error = xfs_attr3_leaf_getvalue(blk->bp, args);
  
         /*
          * If not in a transaction, we have to release all the buffers.
          */
  out_release:
-       for (i = 0; i < state->path.active; i++) {
+       for (i = 0; state != NULL && i < state->path.active; i++) {
                 xfs_trans_brelse(args->trans, state->path.blk[i].bp);
                 state->path.blk[i].bp = NULL;
         }
  
-       xfs_da_state_free(state);
-       return retval;
+       if (state)
+               xfs_da_state_free(state);
+       return error;
  }
  
  /* Returns true if the attribute entry name is valid. */
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h

index db47176..3e97a93 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -89,6 +89,7 @@ int xfs_attr_get_ilocked(struct xfs_da_args *args);
  int xfs_attr_get(struct xfs_da_args *args);
  int xfs_attr_set(struct xfs_da_args *args);
  int xfs_attr_set_args(struct xfs_da_args *args);
+int xfs_has_attr(struct xfs_da_args *args);
  int xfs_attr_remove_args(struct xfs_da_args *args);
  bool xfs_attr_namecheck(const void *name, size_t length);
  
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c

index 2f7e89e..8623c81 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -659,19 +659,66 @@ xfs_attr_shortform_create(
         xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
  }
  
+/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ * args:  args containing attribute name and namelen
+ * sfep:  If not null, pointer is set to the matching attr entry on -EEXIST.
+ *        On -ENOATTR, pointer is left just past the last entry in the list.
+ * basep: If not null, pointer is set to the byte offset of the entry in the
+ *        list on -EEXIST.  On -ENOATTR, it is set to the byte offset just
+ *        past the last entry in the list.
+ */
+int
+xfs_attr_sf_findname(
+       struct xfs_da_args       *args,
+       struct xfs_attr_sf_entry **sfep,
+       unsigned int             *basep)
+{
+       struct xfs_attr_shortform *sf;
+       struct xfs_attr_sf_entry *sfe;
+       unsigned int            base = sizeof(struct xfs_attr_sf_hdr);
+       int                     size = 0;
+       int                     end;
+       int                     i;
+
+       sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+       sfe = &sf->list[0];
+       end = sf->hdr.count;
+       for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+                            base += size, i++) {
+               size = XFS_ATTR_SF_ENTSIZE(sfe);
+               if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
+                                   sfe->flags))
+                       continue;
+               break;
+       }
+
+       if (sfep != NULL)
+               *sfep = sfe;
+
+       if (basep != NULL)
+               *basep = base;
+
+       if (i == end)
+               return -ENOATTR;
+       return -EEXIST;
+}
+
  /*
   * Add a name/value pair to the shortform attribute list.
   * Overflow from the inode has already been checked for.
   */
  void
-xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+xfs_attr_shortform_add(
+       struct xfs_da_args              *args,
+       int                             forkoff)
  {
-       xfs_attr_shortform_t *sf;
-       xfs_attr_sf_entry_t *sfe;
-       int i, offset, size;
-       xfs_mount_t *mp;
-       xfs_inode_t *dp;
-       struct xfs_ifork *ifp;
+       struct xfs_attr_shortform       *sf;
+       struct xfs_attr_sf_entry        *sfe;
+       int                             offset, size;
+       struct xfs_mount                *mp;
+       struct xfs_inode                *dp;
+       struct xfs_ifork                *ifp;
  
         trace_xfs_attr_sf_add(args);
  
@@ -682,11 +729,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
         ifp = dp->i_afp;
         ASSERT(ifp->if_flags & XFS_IFINLINE);
         sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-       sfe = &sf->list[0];
-       for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-               ASSERT(!xfs_attr_match(args, sfe->namelen, sfe->nameval,
-                       sfe->flags));
-       }
+       if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
+               ASSERT(0);
  
         offset = (char *)sfe - (char *)sf;
         size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
@@ -728,31 +772,27 @@ xfs_attr_fork_remove(
   * Remove an attribute from the shortform attribute list structure.
   */
  int
-xfs_attr_shortform_remove(xfs_da_args_t *args)
+xfs_attr_shortform_remove(
+       struct xfs_da_args              *args)
  {
-       xfs_attr_shortform_t *sf;
-       xfs_attr_sf_entry_t *sfe;
-       int base, size=0, end, totsize, i;
-       xfs_mount_t *mp;
-       xfs_inode_t *dp;
+       struct xfs_attr_shortform       *sf;
+       struct xfs_attr_sf_entry        *sfe;
+       int                             size = 0, end, totsize;
+       unsigned int                    base;
+       struct xfs_mount                *mp;
+       struct xfs_inode                *dp;
+       int                             error;
  
         trace_xfs_attr_sf_remove(args);
  
         dp = args->dp;
         mp = dp->i_mount;
-       base = sizeof(xfs_attr_sf_hdr_t);
         sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
-       sfe = &sf->list[0];
-       end = sf->hdr.count;
-       for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
-                                       base += size, i++) {
-               size = XFS_ATTR_SF_ENTSIZE(sfe);
-               if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
-                               sfe->flags))
-                       break;
-       }
-       if (i == end)
-               return -ENOATTR;
+
+       error = xfs_attr_sf_findname(args, &sfe, &base);
+       if (error != -EEXIST)
+               return error;
+       size = XFS_ATTR_SF_ENTSIZE(sfe);
  
         /*
          * Fix up the attribute fork data, covering the hole
@@ -2742,10 +2782,7 @@ xfs_attr3_leaf_clearflag(
                          XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
         }
  
-       /*
-        * Commit the flag value change and start the next trans in series.
-        */
-       return xfs_trans_roll_inode(&args->trans, args->dp);
+       return 0;
  }
  
  /*
@@ -2793,10 +2830,7 @@ xfs_attr3_leaf_setflag(
                          XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
         }
  
-       /*
-        * Commit the flag value change and start the next trans in series.
-        */
-       return xfs_trans_roll_inode(&args->trans, args->dp);
+       return 0;
  }
  
  /*
@@ -2911,10 +2945,5 @@ xfs_attr3_leaf_flipflags(
                          XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
         }
  
-       /*
-        * Commit the flag value change and start the next trans in series.
-        */
-       error = xfs_trans_roll_inode(&args->trans, args->dp);
-
-       return error;
+       return 0;
  }
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h

index 5be6be3..9b1c59f 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -52,6 +52,9 @@ int   xfs_attr_shortform_getvalue(struct xfs_da_args *args);
  int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
                         struct xfs_buf **leaf_bp);
  int    xfs_attr_shortform_remove(struct xfs_da_args *args);
+int    xfs_attr_sf_findname(struct xfs_da_args *args,
+                            struct xfs_attr_sf_entry **sfep,
+                            unsigned int *basep);
  int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
  int    xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
  xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c

index 01ad7f3..3f80ced 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -440,32 +440,23 @@ xfs_attr_rmtval_get(
  }
  
  /*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
+ * Find a "hole" in the attribute address space large enough for us to drop the
+ * new attribute's value into
   */
-int
-xfs_attr_rmtval_set(
+STATIC int
+xfs_attr_rmt_find_hole(
         struct xfs_da_args      *args)
  {
         struct xfs_inode        *dp = args->dp;
         struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_bmbt_irec    map;
-       xfs_dablk_t             lblkno;
-       xfs_fileoff_t           lfileoff = 0;
-       uint8_t                 *src = args->value;
-       int                     blkcnt;
-       int                     valuelen;
-       int                     nmap;
         int                     error;
-       int                     offset = 0;
-
-       trace_xfs_attr_rmtval_set(args);
+       int                     blkcnt;
+       xfs_fileoff_t           lfileoff = 0;
  
         /*
-        * Find a "hole" in the attribute address space large enough for
-        * us to drop the new attribute's value into. Because CRC enable
-        * attributes have headers, we can't just do a straight byte to FSB
-        * conversion and have to take the header space into account.
+        * Because CRC enable attributes have headers, we can't just do a
+        * straight byte to FSB conversion and have to take the header space
+        * into account.
          */
         blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
         error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
@@ -473,48 +464,26 @@ xfs_attr_rmtval_set(
         if (error)
                 return error;
  
-       args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+       args->rmtblkno = (xfs_dablk_t)lfileoff;
         args->rmtblkcnt = blkcnt;
  
-       /*
-        * Roll through the "value", allocating blocks on disk as required.
-        */
-       while (blkcnt > 0) {
-               /*
-                * Allocate a single extent, up to the size of the value.
-                *
-                * Note that we have to consider this a data allocation as we
-                * write the remote attribute without logging the contents.
-                * Hence we must ensure that we aren't using blocks that are on
-                * the busy list so that we don't overwrite blocks which have
-                * recently been freed but their transactions are not yet
-                * committed to disk. If we overwrite the contents of a busy
-                * extent and then crash then the block may not contain the
-                * correct metadata after log recovery occurs.
-                */
-               nmap = 1;
-               error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
-                                 blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map,
-                                 &nmap);
-               if (error)
-                       return error;
-               error = xfs_defer_finish(&args->trans);
-               if (error)
-                       return error;
-
-               ASSERT(nmap == 1);
-               ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-                      (map.br_startblock != HOLESTARTBLOCK));
-               lblkno += map.br_blockcount;
-               blkcnt -= map.br_blockcount;
+       return 0;
+}
  
-               /*
-                * Start the next trans in the chain.
-                */
-               error = xfs_trans_roll_inode(&args->trans, dp);
-               if (error)
-                       return error;
-       }
+STATIC int
+xfs_attr_rmtval_set_value(
+       struct xfs_da_args      *args)
+{
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_bmbt_irec    map;
+       xfs_dablk_t             lblkno;
+       uint8_t                 *src = args->value;
+       int                     blkcnt;
+       int                     valuelen;
+       int                     nmap;
+       int                     error;
+       int                     offset = 0;
  
         /*
          * Roll through the "value", copying the attribute value to the
@@ -594,20 +563,83 @@ xfs_attr_rmtval_stale(
         return 0;
  }
  
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+int
+xfs_attr_rmtval_set(
+       struct xfs_da_args      *args)
+{
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_bmbt_irec    map;
+       xfs_dablk_t             lblkno;
+       int                     blkcnt;
+       int                     nmap;
+       int                     error;
+
+       trace_xfs_attr_rmtval_set(args);
+
+       error = xfs_attr_rmt_find_hole(args);
+       if (error)
+               return error;
+
+       blkcnt = args->rmtblkcnt;
+       lblkno = (xfs_dablk_t)args->rmtblkno;
+       /*
+        * Roll through the "value", allocating blocks on disk as required.
+        */
+       while (blkcnt > 0) {
+               /*
+                * Allocate a single extent, up to the size of the value.
+                *
+                * Note that we have to consider this a data allocation as we
+                * write the remote attribute without logging the contents.
+                * Hence we must ensure that we aren't using blocks that are on
+                * the busy list so that we don't overwrite blocks which have
+                * recently been freed but their transactions are not yet
+                * committed to disk. If we overwrite the contents of a busy
+                * extent and then crash then the block may not contain the
+                * correct metadata after log recovery occurs.
+                */
+               nmap = 1;
+               error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+                                 blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map,
+                                 &nmap);
+               if (error)
+                       return error;
+               error = xfs_defer_finish(&args->trans);
+               if (error)
+                       return error;
+
+               ASSERT(nmap == 1);
+               ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+                      (map.br_startblock != HOLESTARTBLOCK));
+               lblkno += map.br_blockcount;
+               blkcnt -= map.br_blockcount;
+
+               /*
+                * Start the next trans in the chain.
+                */
+               error = xfs_trans_roll_inode(&args->trans, dp);
+               if (error)
+                       return error;
+       }
+
+       return xfs_attr_rmtval_set_value(args);
+}
+
  /*
   * Remove the value associated with an attribute by deleting the
   * out-of-line buffer that it is stored on.
   */
  int
-xfs_attr_rmtval_remove(
+xfs_attr_rmtval_invalidate(
         struct xfs_da_args      *args)
  {
         xfs_dablk_t             lblkno;
         int                     blkcnt;
         int                     error;
-       int                     done;
-
-       trace_xfs_attr_rmtval_remove(args);
  
         /*
          * Roll through the "value", invalidating the attribute value's blocks.
@@ -635,21 +667,29 @@ xfs_attr_rmtval_remove(
                 lblkno += map.br_blockcount;
                 blkcnt -= map.br_blockcount;
         }
+       return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_remove(
+       struct xfs_da_args      *args)
+{
+       int                     error;
+       int                     retval;
+
+       trace_xfs_attr_rmtval_remove(args);
  
         /*
          * Keep de-allocating extents until the remote-value region is gone.
          */
-       lblkno = args->rmtblkno;
-       blkcnt = args->rmtblkcnt;
-       done = 0;
-       while (!done) {
-               error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
-                                   XFS_BMAPI_ATTRFORK, 1, &done);
-               if (error)
-                       return error;
-               error = xfs_defer_finish(&args->trans);
-               if (error)
-                       return error;
+       do {
+               retval = __xfs_attr_rmtval_remove(args);
+               if (retval && retval != -EAGAIN)
+                       return retval;
  
                 /*
                  * Close out trans and start the next one in the chain.
@@ -657,6 +697,36 @@ xfs_attr_rmtval_remove(
                 error = xfs_trans_roll_inode(&args->trans, args->dp);
                 if (error)
                         return error;
-       }
+       } while (retval == -EAGAIN);
+
         return 0;
  }
+
+/*
+ * Make one attempt to remove the value associated with an attribute by
+ * unmapping the out-of-line blocks it is stored on and then calling
+ * xfs_defer_finish().  Returns -EAGAIN if xfs_bunmapi() did not report the
+ * unmap as done, in which case the caller must roll the transaction and call
+ * this function again.  Returns 0 once the unmap is done, or the error
+ * returned by xfs_bunmapi() or xfs_defer_finish().
+ */
+int
+__xfs_attr_rmtval_remove(
+       struct xfs_da_args      *args)
+{
+       int                     error, done;
+
+       /*
+        * Unmap value blocks for this attr.
+        */
+       error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno,
+                           args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done);
+       if (error)
+               return error;
+
+       error = xfs_defer_finish(&args->trans);
+       if (error)
+               return error;
+
+       if (!done)
+               return -EAGAIN;
+
+       return error;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h

index e1144f2..9eee615 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -13,5 +13,6 @@ int xfs_attr_rmtval_set(struct xfs_da_args *args);
  int xfs_attr_rmtval_remove(struct xfs_da_args *args);
  int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
                 xfs_buf_flags_t incore_flags);
-
+int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
+int __xfs_attr_rmtval_remove(struct xfs_da_args *args);
  #endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c

index 667cdd0..9c40d59 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -553,7 +553,8 @@ __xfs_bmap_add_free(
  #endif
         ASSERT(xfs_bmap_free_item_zone != NULL);
  
-       new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+       new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+                              GFP_KERNEL | __GFP_NOFAIL);
         new->xefi_startblock = bno;
         new->xefi_blockcount = (xfs_extlen_t)len;
         if (oinfo)
@@ -1098,7 +1099,10 @@ xfs_bmap_add_attrfork(
         if (error)
                 goto trans_cancel;
         ASSERT(ip->i_afp == NULL);
-       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
+
+       ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone,
+                                     GFP_KERNEL | __GFP_NOFAIL);
+
         ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS;
         ip->i_afp->if_flags = XFS_IFEXTENTS;
         logflags = 0;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h

index 6028a3c..e1bd484 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -158,17 +158,22 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
         { BMAP_ATTRFORK,        "ATTR" }, \
         { BMAP_COWFORK,         "COW" }
  
+/*
+ * Return true if the extent is an allocated extent, written or not.  Holes,
+ * delayed allocations, and extents for which isnullstartblock() is true are
+ * rejected; br_state is not examined.
+ */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+       return irec->br_startblock != HOLESTARTBLOCK &&
+               irec->br_startblock != DELAYSTARTBLOCK &&
+               !isnullstartblock(irec->br_startblock);
+}
  
  /*
   * Return true if the extent is a real, allocated extent, or false if it is  a
   * delayed allocation, and unwritten extent or a hole.
   */
-static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
  {
-       return irec->br_state != XFS_EXT_UNWRITTEN &&
-               irec->br_startblock != HOLESTARTBLOCK &&
-               irec->br_startblock != DELAYSTARTBLOCK &&
-               !isnullstartblock(irec->br_startblock);
+       return xfs_bmap_is_real_extent(irec) &&
+              irec->br_state != XFS_EXT_UNWRITTEN;
  }
  
  /*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c

index d9c63f1..ecec604 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -552,7 +552,7 @@ xfs_bmbt_init_cursor(
         struct xfs_btree_cur    *cur;
         ASSERT(whichfork != XFS_COW_FORK);
  
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+       cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
  
         cur->bc_tp = tp;
         cur->bc_mp = mp;
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h

index 643f0f9..f0d2976 100644 (file)
--- a/fs/xfs/libxfs/xfs_btree_staging.h
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -18,7 +18,7 @@ struct xbtree_afakeroot {
         unsigned int            af_blocks;
  };
  
-/* Cursor interactions with with fake roots for AG-rooted btrees. */
+/* Cursor interactions with fake roots for AG-rooted btrees. */
  void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
                 struct xbtree_afakeroot *afake);
  void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
@@ -45,7 +45,7 @@ struct xbtree_ifakeroot {
         unsigned int            if_extents;
  };
  
-/* Cursor interactions with with fake roots for inode-rooted btrees. */
+/* Cursor interactions with fake roots for inode-rooted btrees. */
  void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
                 struct xbtree_ifakeroot *ifake,
                 struct xfs_btree_ops **new_ops);
@@ -90,7 +90,7 @@ struct xfs_btree_bload {
  
         /*
          * Number of free records to leave in each leaf block.  If the caller
-        * sets this to -1, the slack value will be calculated to be be halfway
+        * sets this to -1, the slack value will be calculated to be halfway
          * between maxrecs and minrecs.  This typically leaves the block 75%
          * full.  Note that slack values are not enforced on inode root blocks.
          */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c

index 897749c..e46bc03 100644 (file)
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -78,10 +78,16 @@ kmem_zone_t *xfs_da_state_zone;     /* anchor for state struct zone */
   * Allocate a dir-state structure.
   * We don't put them on the stack since they're large.
   */
-xfs_da_state_t *
-xfs_da_state_alloc(void)
+struct xfs_da_state *
+xfs_da_state_alloc(
+       struct xfs_da_args      *args)
  {
-       return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+       struct xfs_da_state     *state;
+
+       state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL);
+       state->args = args;
+       state->mp = args->dp->i_mount;
+       return state;
  }
  
  /*
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h

index 6e25de6..ad5dd32 100644 (file)
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -219,7 +219,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
                                 const unsigned char *name, int len);
  
  
-xfs_da_state_t *xfs_da_state_alloc(void);
+struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
  void xfs_da_state_free(xfs_da_state_t *state);
  
  void   xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c

index 6ac4aad..5d51265 100644 (file)
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -2015,9 +2015,7 @@ xfs_dir2_node_addname(
         /*
          * Allocate and initialize the state (btree cursor).
          */
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
+       state = xfs_da_state_alloc(args);
         /*
          * Look up the name.  We're not supposed to find it, but
          * this gives us the insertion point.
@@ -2086,9 +2084,8 @@ xfs_dir2_node_lookup(
         /*
          * Allocate and initialize the btree cursor.
          */
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
+       state = xfs_da_state_alloc(args);
+
         /*
          * Fill in the path to the entry in the cursor.
          */
@@ -2139,9 +2136,7 @@ xfs_dir2_node_removename(
         /*
          * Allocate and initialize the btree cursor.
          */
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
+       state = xfs_da_state_alloc(args);
  
         /* Look up the entry we're deleting, set up the cursor. */
         error = xfs_da3_node_lookup_int(state, &rval);
@@ -2206,9 +2201,7 @@ xfs_dir2_node_replace(
         /*
          * Allocate and initialize the btree cursor.
          */
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
+       state = xfs_da_state_alloc(args);
  
         /*
          * We have to save new inode number and ftype since
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c

index bedc1e7..5a2db00 100644 (file)
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -37,9 +37,10 @@ xfs_failaddr_t
  xfs_dquot_verify(
         struct xfs_mount        *mp,
         struct xfs_disk_dquot   *ddq,
-       xfs_dqid_t              id,
-       uint                    type)   /* used only during quotacheck */
+       xfs_dqid_t              id)     /* -1 skips the d_id check */
  {
+       __u8                    ddq_type;
+
         /*
          * We can encounter an uninitialized dquot buffer for 2 reasons:
          * 1. If we crash while deleting the quotainode(s), and those blks got
@@ -60,11 +61,12 @@ xfs_dquot_verify(
         if (ddq->d_version != XFS_DQUOT_VERSION)
                 return __this_address;
  
-       if (type && ddq->d_flags != type)
+       if (ddq->d_type & ~XFS_DQTYPE_ANY)
                 return __this_address;
-       if (ddq->d_flags != XFS_DQ_USER &&
-           ddq->d_flags != XFS_DQ_PROJ &&
-           ddq->d_flags != XFS_DQ_GROUP)
+       ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK;
+       if (ddq_type != XFS_DQTYPE_USER &&
+           ddq_type != XFS_DQTYPE_PROJ &&
+           ddq_type != XFS_DQTYPE_GROUP)
                 return __this_address;
  
         if (id != -1 && id != be32_to_cpu(ddq->d_id))
@@ -95,14 +97,13 @@ xfs_failaddr_t
  xfs_dqblk_verify(
         struct xfs_mount        *mp,
         struct xfs_dqblk        *dqb,
-       xfs_dqid_t              id,
-       uint                    type)   /* used only during quotacheck */
+       xfs_dqid_t              id)     /* forwarded to xfs_dquot_verify() */
  {
         if (xfs_sb_version_hascrc(&mp->m_sb) &&
             !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid))
                 return __this_address;
  
-       return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type);
+       return xfs_dquot_verify(mp, &dqb->dd_diskdq, id);
  }
  
  /*
@@ -113,7 +114,7 @@ xfs_dqblk_repair(
         struct xfs_mount        *mp,
         struct xfs_dqblk        *dqb,
         xfs_dqid_t              id,
-       uint                    type)
+       xfs_dqtype_t            type)
  {
         /*
          * Typically, a repair is only requested by quotacheck.
@@ -123,7 +124,7 @@ xfs_dqblk_repair(
  
         dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
         dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
-       dqb->dd_diskdq.d_flags = type;
+       dqb->dd_diskdq.d_type = type;
         dqb->dd_diskdq.d_id = cpu_to_be32(id);
  
         if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -205,7 +206,7 @@ xfs_dquot_buf_verify(
                 if (i == 0)
                         id = be32_to_cpu(ddq->d_id);
  
-               fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0);
+               fa = xfs_dqblk_verify(mp, &dqb[i], id + i);
                 if (fa) {
                         if (!readahead)
                                 xfs_buf_verifier_error(bp, -EFSCORRUPTED,
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h

index b42a52b..31b7ece 100644 (file)
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1149,16 +1149,26 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
  #define XFS_DQUOT_MAGIC                0x4451          /* 'DQ' */
  #define XFS_DQUOT_VERSION      (uint8_t)0x01   /* latest version number */
  
+#define XFS_DQTYPE_USER                0x01            /* user dquot record */
+#define XFS_DQTYPE_PROJ                0x02            /* project dquot record */
+#define XFS_DQTYPE_GROUP       0x04            /* group dquot record */
+
+/* bitmask to determine if this is a user/group/project dquot */
+#define XFS_DQTYPE_REC_MASK    (XFS_DQTYPE_USER | \
+                                XFS_DQTYPE_PROJ | \
+                                XFS_DQTYPE_GROUP)
+
+#define XFS_DQTYPE_ANY         (XFS_DQTYPE_REC_MASK)
+
  /*
- * This is the main portion of the on-disk representation of quota
- * information for a user. This is the q_core of the struct xfs_dquot that
- * is kept in kernel memory. We pad this with some more expansion room
- * to construct the on disk structure.
+ * This is the main portion of the on-disk representation of quota information
+ * for a user.  We pad this with some more expansion room to construct the on
+ * disk structure.
   */
  struct xfs_disk_dquot {
         __be16          d_magic;        /* dquot magic = XFS_DQUOT_MAGIC */
         __u8            d_version;      /* dquot version */
-       __u8            d_flags;        /* XFS_DQ_USER/PROJ/GROUP */
+       __u8            d_type;         /* XFS_DQTYPE_USER/PROJ/GROUP */
         __be32          d_id;           /* user,project,group id */
         __be64          d_blk_hardlimit;/* absolute limit on disk blks */
         __be64          d_blk_softlimit;/* preferred limit on disk blks */
@@ -1198,6 +1208,22 @@ typedef struct xfs_dqblk {
  
  #define XFS_DQUOT_CRC_OFF      offsetof(struct xfs_dqblk, dd_crc)
  
+/*
+ * This defines the unit of allocation of dquots.
+ *
+ * Currently, it is just one file system block, and a 4K blk contains 30
+ * (136 * 30 = 4080) dquots. It's probably not worth trying to make
+ * this more dynamic.
+ *
+ * However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ *
+ * This is part of the ondisk format because the structure size is not a power
+ * of two, which leaves slack at the end of the disk block.
+ *
+ * The expansion is fully parenthesized so that the cast cannot be split apart
+ * by an adjacent operator (e.g. sizeof or a postfix operator) at a use site.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB     ((xfs_filblks_t)1)
+
  /*
   * Remote symlink format and access functions.
   */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c

index 7fcf62b..f742a96 100644 (file)
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -888,10 +888,9 @@ sparse_alloc:
          */
         be32_add_cpu(&agi->agi_count, newlen);
         be32_add_cpu(&agi->agi_freecount, newlen);
-       pag = xfs_perag_get(args.mp, agno);
+       pag = agbp->b_pag;
         pag->pagi_freecount += newlen;
         pag->pagi_count += newlen;
-       xfs_perag_put(pag);
         agi->agi_newino = cpu_to_be32(newino);
  
         /*
@@ -1134,7 +1133,7 @@ xfs_dialloc_ag_inobt(
         xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
         xfs_agnumber_t          pagno = XFS_INO_TO_AGNO(mp, parent);
         xfs_agino_t             pagino = XFS_INO_TO_AGINO(mp, parent);
-       struct xfs_perag        *pag;
+       struct xfs_perag        *pag = agbp->b_pag;
         struct xfs_btree_cur    *cur, *tcur;
         struct xfs_inobt_rec_incore rec, trec;
         xfs_ino_t               ino;
@@ -1143,8 +1142,6 @@ xfs_dialloc_ag_inobt(
         int                     i, j;
         int                     searchdistance = 10;
  
-       pag = xfs_perag_get(mp, agno);
-
         ASSERT(pag->pagi_init);
         ASSERT(pag->pagi_inodeok);
         ASSERT(pag->pagi_freecount > 0);
@@ -1384,14 +1381,12 @@ alloc_inode:
  
         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
         xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
-       xfs_perag_put(pag);
         *inop = ino;
         return 0;
  error1:
         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
  error0:
         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-       xfs_perag_put(pag);
         return error;
  }
  
@@ -1587,7 +1582,6 @@ xfs_dialloc_ag(
         xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
         xfs_agnumber_t                  pagno = XFS_INO_TO_AGNO(mp, parent);
         xfs_agino_t                     pagino = XFS_INO_TO_AGINO(mp, parent);
-       struct xfs_perag                *pag;
         struct xfs_btree_cur            *cur;   /* finobt cursor */
         struct xfs_btree_cur            *icur;  /* inobt cursor */
         struct xfs_inobt_rec_incore     rec;
@@ -1599,8 +1593,6 @@ xfs_dialloc_ag(
         if (!xfs_sb_version_hasfinobt(&mp->m_sb))
                 return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
  
-       pag = xfs_perag_get(mp, agno);
-
         /*
          * If pagino is 0 (this is the root inode allocation) use newino.
          * This must work because we've just allocated some.
@@ -1667,7 +1659,7 @@ xfs_dialloc_ag(
          */
         be32_add_cpu(&agi->agi_freecount, -1);
         xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-       pag->pagi_freecount--;
+       agbp->b_pag->pagi_freecount--;
  
         xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
  
@@ -1680,7 +1672,6 @@ xfs_dialloc_ag(
  
         xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-       xfs_perag_put(pag);
         *inop = ino;
         return 0;
  
@@ -1688,7 +1679,6 @@ error_icur:
         xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
  error_cur:
         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-       xfs_perag_put(pag);
         return error;
  }
  
@@ -1945,7 +1935,6 @@ xfs_difree_inobt(
  {
         struct xfs_agi                  *agi = agbp->b_addr;
         xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
-       struct xfs_perag                *pag;
         struct xfs_btree_cur            *cur;
         struct xfs_inobt_rec_incore     rec;
         int                             ilen;
@@ -2007,6 +1996,8 @@ xfs_difree_inobt(
         if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
             rec.ir_free == XFS_INOBT_ALL_FREE &&
             mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+               struct xfs_perag        *pag = agbp->b_pag;
+
                 xic->deleted = true;
                 xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
                 xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
@@ -2020,10 +2011,8 @@ xfs_difree_inobt(
                 be32_add_cpu(&agi->agi_count, -ilen);
                 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
                 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
-               pag = xfs_perag_get(mp, agno);
                 pag->pagi_freecount -= ilen - 1;
                 pag->pagi_count -= ilen;
-               xfs_perag_put(pag);
                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
  
@@ -2049,9 +2038,7 @@ xfs_difree_inobt(
                  */
                 be32_add_cpu(&agi->agi_freecount, 1);
                 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-               pag = xfs_perag_get(mp, agno);
-               pag->pagi_freecount++;
-               xfs_perag_put(pag);
+               agbp->b_pag->pagi_freecount++;
                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
         }
  
@@ -2661,7 +2648,7 @@ xfs_ialloc_read_agi(
                 return error;
  
         agi = (*bpp)->b_addr;
-       pag = xfs_perag_get(mp, agno);
+       pag = (*bpp)->b_pag;
         if (!pag->pagi_init) {
                 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
                 pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -2674,7 +2661,6 @@ xfs_ialloc_read_agi(
          */
         ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
                 XFS_FORCED_SHUTDOWN(mp));
-       xfs_perag_put(pag);
         return 0;
  }
  
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c

index b2c122a..3c8aebc 100644 (file)
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -411,7 +411,7 @@ xfs_inobt_init_common(
  {
         struct xfs_btree_cur    *cur;
  
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+       cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
         cur->bc_tp = tp;
         cur->bc_mp = mp;
         cur->bc_btnum = btnum;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c

index 6f84ea8..8d5dd08 100644 (file)
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -20,30 +20,6 @@
  
  #include <linux/iversion.h>
  
-/*
- * Check that none of the inode's in the buffer have a next
- * unlinked field of 0.
- */
-#if defined(DEBUG)
-void
-xfs_inobp_check(
-       xfs_mount_t     *mp,
-       xfs_buf_t       *bp)
-{
-       int             i;
-       xfs_dinode_t    *dip;
-
-       for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
-               dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
-               if (!dip->di_next_unlinked)  {
-                       xfs_alert(mp,
-       "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
-                               i, (long long)bp->b_bn);
-               }
-       }
-}
-#endif
-
  /*
   * If we are doing readahead on an inode buffer, we might be in log recovery
   * reading an inode allocation buffer that hasn't yet been replayed, and hence
@@ -53,10 +29,10 @@ xfs_inobp_check(
   * If the readahead buffer is invalid, we need to mark it with an error and
   * clear the DONE status of the buffer so that a followup read will re-read it
   * from disk. We don't report the error otherwise to avoid warnings during log
- * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
+ * recovery and we don't get unnecessary panics on debug kernels. We use EIO here
   * because all we want to do is say readahead failed; there is no-one to report
   * the error to, so this will distinguish it from a non-ra verifier failure.
- * Changes to this readahead error behavour also need to be reflected in
+ * Changes to this readahead error behaviour also need to be reflected in
   * xfs_dquot_buf_readahead_verify().
   */
  static void
@@ -176,7 +152,8 @@ xfs_imap_to_bp(
         }
  
         *bpp = bp;
-       *dipp = xfs_buf_offset(bp, imap->im_boffset);
+       if (dipp)
+               *dipp = xfs_buf_offset(bp, imap->im_boffset);
         return 0;
  }
  
@@ -203,7 +180,7 @@ xfs_inode_from_disk(
         /*
          * First get the permanent information that is needed to allocate an
          * inode. If the inode is unused, mode is zero and we shouldn't mess
-        * with the unitialized part of it.
+        * with the uninitialized part of it.
          */
         to->di_flushiter = be16_to_cpu(from->di_flushiter);
         inode->i_generation = be32_to_cpu(from->di_gen);
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h

index 865ac49..6b08b9d 100644 (file)
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -52,12 +52,6 @@ int  xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
  void   xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
                                struct xfs_dinode *to);
  
-#if defined(DEBUG)
-void   xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
-#else
-#define        xfs_inobp_check(mp, bp)
-#endif /* DEBUG */
-
  xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
                            struct xfs_dinode *dip);
  xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c

index 28b3662..0cf853d 100644 (file)
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -291,7 +291,7 @@ xfs_iformat_attr_fork(
          * Initialize the extent count early, as the per-format routines may
          * depend on it.
          */
-       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
+       ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
         ip->i_afp->if_format = dip->di_aformat;
         if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */
                 ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS;
@@ -673,8 +673,8 @@ xfs_ifork_init_cow(
         if (ip->i_cowfp)
                 return;
  
-       ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
-                                      KM_NOFS);
+       ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone,
+                                      GFP_NOFS | __GFP_NOFAIL);
         ip->i_cowfp->if_flags = XFS_IFEXTENTS;
         ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
  }
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h

index 56d9dd7..076bdc7 100644 (file)
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -18,23 +18,22 @@
  typedef uint64_t       xfs_qcnt_t;
  typedef uint16_t       xfs_qwarncnt_t;
  
+typedef uint8_t                xfs_dqtype_t;
+
+#define XFS_DQTYPE_STRINGS \
+       { XFS_DQTYPE_USER,      "USER" }, \
+       { XFS_DQTYPE_PROJ,      "PROJ" }, \
+       { XFS_DQTYPE_GROUP,     "GROUP" }
+
  /*
   * flags for q_flags field in the dquot.
   */
-#define XFS_DQ_USER            0x0001          /* a user quota */
-#define XFS_DQ_PROJ            0x0002          /* project quota */
-#define XFS_DQ_GROUP           0x0004          /* a group quota */
-#define XFS_DQ_DIRTY           0x0008          /* dquot is dirty */
-#define XFS_DQ_FREEING         0x0010          /* dquot is being torn down */
-
-#define XFS_DQ_ALLTYPES                (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
+#define XFS_DQFLAG_DIRTY       (1 << 0)        /* dquot is dirty */
+#define XFS_DQFLAG_FREEING     (1 << 1)        /* dquot is being torn down */
  
-#define XFS_DQ_FLAGS \
-       { XFS_DQ_USER,          "USER" }, \
-       { XFS_DQ_PROJ,          "PROJ" }, \
-       { XFS_DQ_GROUP,         "GROUP" }, \
-       { XFS_DQ_DIRTY,         "DIRTY" }, \
-       { XFS_DQ_FREEING,       "FREEING" }
+#define XFS_DQFLAG_STRINGS \
+       { XFS_DQFLAG_DIRTY,     "DIRTY" }, \
+       { XFS_DQFLAG_FREEING,   "FREEING" }
  
  /*
   * We have the possibility of all three quota types being active at once, and
@@ -137,11 +136,11 @@ typedef uint16_t  xfs_qwarncnt_t;
  #define XFS_QMOPT_RESBLK_MASK  (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
  
  extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
-               struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type);
+               struct xfs_disk_dquot *ddq, xfs_dqid_t id);
  extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
-               struct xfs_dqblk *dqb, xfs_dqid_t id, uint type);
+               struct xfs_dqblk *dqb, xfs_dqid_t id);
  extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
  extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
-               xfs_dqid_t id, uint type);
+               xfs_dqid_t id, xfs_dqtype_t type);
  
  #endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c

index 7fd6044..a6ac60a 100644 (file)
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -37,15 +37,13 @@ xfs_refcountbt_set_root(
  {
         struct xfs_buf          *agbp = cur->bc_ag.agbp;
         struct xfs_agf          *agf = agbp->b_addr;
-       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
-       struct xfs_perag        *pag = xfs_perag_get(cur->bc_mp, seqno);
+       struct xfs_perag        *pag = agbp->b_pag;
  
         ASSERT(ptr->s != 0);
  
         agf->agf_refcount_root = ptr->s;
         be32_add_cpu(&agf->agf_refcount_level, inc);
         pag->pagf_refcount_level += inc;
-       xfs_perag_put(pag);
  
         xfs_alloc_log_agf(cur->bc_tp, agbp,
                         XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL);
@@ -325,7 +323,7 @@ xfs_refcountbt_init_common(
         ASSERT(agno != NULLAGNUMBER);
         ASSERT(agno < mp->m_sb.sb_agcount);
  
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+       cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
         cur->bc_tp = tp;
         cur->bc_mp = mp;
         cur->bc_btnum = XFS_BTNUM_REFC;
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c

index b7c0531..beb81c8 100644 (file)
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -63,16 +63,14 @@ xfs_rmapbt_set_root(
  {
         struct xfs_buf          *agbp = cur->bc_ag.agbp;
         struct xfs_agf          *agf = agbp->b_addr;
-       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
         int                     btnum = cur->bc_btnum;
-       struct xfs_perag        *pag = xfs_perag_get(cur->bc_mp, seqno);
+       struct xfs_perag        *pag = agbp->b_pag;
  
         ASSERT(ptr->s != 0);
  
         agf->agf_roots[btnum] = ptr->s;
         be32_add_cpu(&agf->agf_levels[btnum], inc);
         pag->pagf_levels[btnum] += inc;
-       xfs_perag_put(pag);
  
         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
  }
@@ -123,6 +121,7 @@ xfs_rmapbt_free_block(
  {
         struct xfs_buf          *agbp = cur->bc_ag.agbp;
         struct xfs_agf          *agf = agbp->b_addr;
+       struct xfs_perag        *pag;
         xfs_agblock_t           bno;
         int                     error;
  
@@ -139,8 +138,8 @@ xfs_rmapbt_free_block(
                               XFS_EXTENT_BUSY_SKIP_DISCARD);
         xfs_trans_agbtree_delta(cur->bc_tp, -1);
  
-       xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_ag.agno);
-
+       pag = cur->bc_ag.agbp->b_pag;
+       xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
         return 0;
  }
  
@@ -457,7 +456,7 @@ xfs_rmapbt_init_common(
  {
         struct xfs_btree_cur    *cur;
  
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+       cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
         cur->bc_tp = tp;
         cur->bc_mp = mp;
         /* Overlapping btree; 2 keys per pointer. */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c

index 9498ced..1d9fa8a 100644 (file)
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -70,7 +70,7 @@ xfs_rtbuf_get(
         if (error)
                 return error;
  
-       if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map)))
+       if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map)))
                 return -EFSCORRUPTED;
  
         ASSERT(map.br_startblock != NULLFSBLOCK);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h

index c45acbd..708feb8 100644 (file)
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -65,6 +65,7 @@ void  xfs_log_get_max_trans_res(struct xfs_mount *mp,
  #define XFS_TRANS_DQ_DIRTY     0x10    /* at least one dquot in trx dirty */
  #define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
  #define XFS_TRANS_NO_WRITECOUNT 0x40   /* do not elevate SB writecount */
+#define XFS_TRANS_RES_FDBLKS   0x80    /* reserve newly freed blocks */
  /*
   * LOWMODE is used by the allocator to activate the lowspace algorithm - when
   * free space is running low the extent allocator may choose to allocate an
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c

index b5dfb66..e151296 100644 (file)
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -8,6 +8,8 @@
  #include "xfs_shared.h"
  #include "xfs_format.h"
  #include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
  #include "xfs_inode.h"
  #include "xfs_trans.h"
  #include "xfs_trans_priv.h"
@@ -36,6 +38,7 @@ xfs_trans_ijoin(
  
         ASSERT(iip->ili_lock_flags == 0);
         iip->ili_lock_flags = lock_flags;
+       ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
  
         /*
          * Get a log_item_desc to point at the new item.
@@ -71,24 +74,35 @@ xfs_trans_ichgtime(
  }
  
  /*
- * This is called to mark the fields indicated in fieldmask as needing
- * to be logged when the transaction is committed.  The inode must
- * already be associated with the given transaction.
+ * This is called to mark the fields indicated in fieldmask as needing to be
+ * logged when the transaction is committed.  The inode must already be
+ * associated with the given transaction.
   *
- * The values for fieldmask are defined in xfs_inode_item.h.  We always
- * log all of the core inode if any of it has changed, and we always log
- * all of the inline data/extents/b-tree root if any of them has changed.
+ * The values for fieldmask are defined in xfs_inode_item.h.  We always log all
+ * of the core inode if any of it has changed, and we always log all of the
+ * inline data/extents/b-tree root if any of them has changed.
+ *
+ * Grab and pin the cluster buffer associated with this inode to avoid RMW
+ * cycles at inode writeback time. Avoid the need to add error handling to every
+ * xfs_trans_log_inode() call by shutting down on read error.  This will cause
+ * transactions to fail and everything to error out, just like if we return a
+ * read error in a dirty transaction and cancel it.
   */
  void
  xfs_trans_log_inode(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       uint            flags)
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       uint                    flags)
  {
-       struct inode    *inode = VFS_I(ip);
+       struct xfs_inode_log_item *iip = ip->i_itemp;
+       struct inode            *inode = VFS_I(ip);
+       uint                    iversion_flags = 0;
  
-       ASSERT(ip->i_itemp != NULL);
+       ASSERT(iip);
         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
+
+       tp->t_flags |= XFS_TRANS_DIRTY;
  
         /*
          * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
@@ -102,15 +116,6 @@ xfs_trans_log_inode(
                 spin_unlock(&inode->i_lock);
         }
  
-       /*
-        * Record the specific change for fdatasync optimisation. This
-        * allows fdatasync to skip log forces for inodes that are only
-        * timestamp dirty. We do this before the change count so that
-        * the core being logged in this case does not impact on fdatasync
-        * behaviour.
-        */
-       ip->i_itemp->ili_fsync_fields |= flags;
-
         /*
          * First time we log the inode in a transaction, bump the inode change
          * counter if it is configured for this to occur. While we have the
@@ -120,23 +125,64 @@ xfs_trans_log_inode(
          * set however, then go ahead and bump the i_version counter
          * unconditionally.
          */
-       if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) &&
-           IS_I_VERSION(VFS_I(ip))) {
-               if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE))
-                       flags |= XFS_ILOG_CORE;
+       if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
+               if (IS_I_VERSION(inode) &&
+                   inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
+                       iversion_flags = XFS_ILOG_CORE;
         }
  
-       tp->t_flags |= XFS_TRANS_DIRTY;
+       /*
+        * Record the specific change for fdatasync optimisation. This allows
+        * fdatasync to skip log forces for inodes that are only timestamp
+        * dirty.
+        */
+       spin_lock(&iip->ili_lock);
+       iip->ili_fsync_fields |= flags;
+
+       if (!iip->ili_item.li_buf) {
+               struct xfs_buf  *bp;
+               int             error;
+
+               /*
+                * We hold the ILOCK here, so this inode is not going to be
+                * flushed while we are here. Further, because there is no
+                * buffer attached to the item, we know that there is no IO in
+                * progress, so nothing will clear the ili_fields while we read
+                * in the buffer. Hence we can safely drop the spin lock and
+                * read the buffer knowing that the state will not change from
+                * here.
+                */
+               spin_unlock(&iip->ili_lock);
+               error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL,
+                                       &bp, 0);
+               if (error) {
+                       xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
+                       return;
+               }
+
+               /*
+                * We need an explicit buffer reference for the log item but
+                * don't want the buffer to remain attached to the transaction.
+                * Hold the buffer but release the transaction reference once
+                * we've attached the inode log item to the buffer log item
+                * list.
+                */
+               xfs_buf_hold(bp);
+               spin_lock(&iip->ili_lock);
+               iip->ili_item.li_buf = bp;
+               bp->b_flags |= _XBF_INODES;
+               list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
+               xfs_trans_brelse(tp, bp);
+       }
  
         /*
-        * Always OR in the bits from the ili_last_fields field.
-        * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
-        * routines in the eventual clearing of the ili_fields bits.
-        * See the big comment in xfs_iflush() for an explanation of
-        * this coordination mechanism.
+        * Always OR in the bits from the ili_last_fields field.  This is to
+        * coordinate with the xfs_iflush() and xfs_iflush_done() routines in
+        * the eventual clearing of the ili_fields bits.  See the big comment in
+        * xfs_iflush() for an explanation of this coordination mechanism.
          */
-       flags |= ip->i_itemp->ili_last_fields;
-       ip->i_itemp->ili_fields |= flags;
+       iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
+       spin_unlock(&iip->ili_lock);
  }
  
  int
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h

index 88221c7..c6df01a 100644 (file)
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -57,7 +57,7 @@
         XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
  #define        XFS_IALLOC_SPACE_RES(mp)        \
         (M_IGEO(mp)->ialloc_blks + \
-        (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \
+        ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \
           (M_IGEO(mp)->inobt_maxlevels - 1)))
  
  /*
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c

index 7badd6d..955302e 100644 (file)
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -45,9 +45,27 @@ xchk_setup_inode_bmap(
          */
         if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
             sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
+               struct address_space    *mapping = VFS_I(sc->ip)->i_mapping;
+
                 inode_dio_wait(VFS_I(sc->ip));
-               error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping);
-               if (error)
+
+               /*
+                * Try to flush all incore state to disk before we examine the
+                * space mappings for the data fork.  Leave accumulated errors
+                * in the mapping for the writer threads to consume.
+                *
+                * On ENOSPC or EIO writeback errors, we continue into the
+                * extent mapping checks because write failures do not
+                * necessarily imply anything about the correctness of the file
+                * metadata.  The metadata and the file data could be on
+                * completely separate devices; a media failure might only
+                * affect a subset of the disk, etc.  We can handle delalloc
+                * extents in the scrubber, so leaving them in memory is fine.
+                */
+               error = filemap_fdatawrite(mapping);
+               if (!error)
+                       error = filemap_fdatawait_keep_errors(mapping);
+               if (error && (error != -ENOSPC && error != -EIO))
                         goto out;
         }
  
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c

index 44b1501..e56786f 100644 (file)
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -476,9 +476,7 @@ xchk_da_btree(
         ds.dargs.whichfork = whichfork;
         ds.dargs.trans = sc->tp;
         ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
-       ds.state = xfs_da_state_alloc();
-       ds.state->args = &ds.dargs;
-       ds.state->mp = mp;
+       ds.state = xfs_da_state_alloc(&ds.dargs);
         ds.sc = sc;
         ds.private = private;
         if (whichfork == XFS_ATTR_FORK) {
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c

index 905a345..e34ca20 100644 (file)
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -18,17 +18,17 @@
  #include "scrub/common.h"
  
  /* Convert a scrub type code to a DQ flag, or return 0 if error. */
-static inline uint
+static inline xfs_dqtype_t
  xchk_quota_to_dqtype(
         struct xfs_scrub        *sc)
  {
         switch (sc->sm->sm_type) {
         case XFS_SCRUB_TYPE_UQUOTA:
-               return XFS_DQ_USER;
+               return XFS_DQTYPE_USER;
         case XFS_SCRUB_TYPE_GQUOTA:
-               return XFS_DQ_GROUP;
+               return XFS_DQTYPE_GROUP;
         case XFS_SCRUB_TYPE_PQUOTA:
-               return XFS_DQ_PROJ;
+               return XFS_DQTYPE_PROJ;
         default:
                 return 0;
         }
@@ -40,7 +40,7 @@ xchk_setup_quota(
         struct xfs_scrub        *sc,
         struct xfs_inode        *ip)
  {
-       uint                    dqtype;
+       xfs_dqtype_t            dqtype;
         int                     error;
  
         if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp))
@@ -73,26 +73,15 @@ struct xchk_quota_info {
  STATIC int
  xchk_quota_item(
         struct xfs_dquot        *dq,
-       uint                    dqtype,
+       xfs_dqtype_t            dqtype,
         void                    *priv)
  {
         struct xchk_quota_info  *sqi = priv;
         struct xfs_scrub        *sc = sqi->sc;
         struct xfs_mount        *mp = sc->mp;
-       struct xfs_disk_dquot   *d = &dq->q_core;
         struct xfs_quotainfo    *qi = mp->m_quotainfo;
         xfs_fileoff_t           offset;
-       unsigned long long      bsoft;
-       unsigned long long      isoft;
-       unsigned long long      rsoft;
-       unsigned long long      bhard;
-       unsigned long long      ihard;
-       unsigned long long      rhard;
-       unsigned long long      bcount;
-       unsigned long long      icount;
-       unsigned long long      rcount;
         xfs_ino_t               fs_icount;
-       xfs_dqid_t              id = be32_to_cpu(d->d_id);
         int                     error = 0;
  
         if (xchk_should_terminate(sc, &error))
@@ -102,27 +91,11 @@ xchk_quota_item(
          * Except for the root dquot, the actual dquot we got must either have
          * the same or higher id as we saw before.
          */
-       offset = id / qi->qi_dqperchunk;
-       if (id && id <= sqi->last_id)
+       offset = dq->q_id / qi->qi_dqperchunk;
+       if (dq->q_id && dq->q_id <= sqi->last_id)
                 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
  
-       sqi->last_id = id;
-
-       /* Did we get the dquot type we wanted? */
-       if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES))
-               xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
-
-       if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0))
-               xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
-
-       /* Check the limits. */
-       bhard = be64_to_cpu(d->d_blk_hardlimit);
-       ihard = be64_to_cpu(d->d_ino_hardlimit);
-       rhard = be64_to_cpu(d->d_rtb_hardlimit);
-
-       bsoft = be64_to_cpu(d->d_blk_softlimit);
-       isoft = be64_to_cpu(d->d_ino_softlimit);
-       rsoft = be64_to_cpu(d->d_rtb_softlimit);
+       sqi->last_id = dq->q_id;
  
         /*
          * Warn if the hard limits are larger than the fs.
@@ -132,25 +105,22 @@ xchk_quota_item(
          * Complain about corruption if the soft limit is greater than
          * the hard limit.
          */
-       if (bhard > mp->m_sb.sb_dblocks)
+       if (dq->q_blk.hardlimit > mp->m_sb.sb_dblocks)
                 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
-       if (bsoft > bhard)
+       if (dq->q_blk.softlimit > dq->q_blk.hardlimit)
                 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
  
-       if (ihard > M_IGEO(mp)->maxicount)
+       if (dq->q_ino.hardlimit > M_IGEO(mp)->maxicount)
                 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
-       if (isoft > ihard)
+       if (dq->q_ino.softlimit > dq->q_ino.hardlimit)
                 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
  
-       if (rhard > mp->m_sb.sb_rblocks)
+       if (dq->q_rtb.hardlimit > mp->m_sb.sb_rblocks)
                 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
-       if (rsoft > rhard)
+       if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit)
                 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
  
         /* Check the resource counts. */
-       bcount = be64_to_cpu(d->d_bcount);
-       icount = be64_to_cpu(d->d_icount);
-       rcount = be64_to_cpu(d->d_rtbcount);
         fs_icount = percpu_counter_sum(&mp->m_icount);
  
         /*
@@ -159,15 +129,15 @@ xchk_quota_item(
          * if there are no quota limits.
          */
         if (xfs_sb_version_hasreflink(&mp->m_sb)) {
-               if (mp->m_sb.sb_dblocks < bcount)
+               if (mp->m_sb.sb_dblocks < dq->q_blk.count)
                         xchk_fblock_set_warning(sc, XFS_DATA_FORK,
                                         offset);
         } else {
-               if (mp->m_sb.sb_dblocks < bcount)
+               if (mp->m_sb.sb_dblocks < dq->q_blk.count)
                         xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
                                         offset);
         }
-       if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks)
+       if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks)
                 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
  
         /*
@@ -175,13 +145,22 @@ xchk_quota_item(
          * lower limit than the actual usage.  However, we flag it for
          * admin review.
          */
-       if (id != 0 && bhard != 0 && bcount > bhard)
+       if (dq->q_id == 0)
+               goto out;
+
+       if (dq->q_blk.hardlimit != 0 &&
+           dq->q_blk.count > dq->q_blk.hardlimit)
                 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
-       if (id != 0 && ihard != 0 && icount > ihard)
+
+       if (dq->q_ino.hardlimit != 0 &&
+           dq->q_ino.count > dq->q_ino.hardlimit)
                 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
-       if (id != 0 && rhard != 0 && rcount > rhard)
+
+       if (dq->q_rtb.hardlimit != 0 &&
+           dq->q_rtb.count > dq->q_rtb.hardlimit)
                 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
  
+out:
         if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
                 return -EFSCORRUPTED;
  
@@ -235,7 +214,7 @@ xchk_quota(
         struct xchk_quota_info  sqi;
         struct xfs_mount        *mp = sc->mp;
         struct xfs_quotainfo    *qi = mp->m_quotainfo;
-       uint                    dqtype;
+       xfs_dqtype_t            dqtype;
         int                     error = 0;
  
         dqtype = xchk_quota_to_dqtype(sc);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c

index db3cfd1..25e86c7 100644 (file)
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -899,11 +899,11 @@ xrep_find_ag_btree_roots(
  void
  xrep_force_quotacheck(
         struct xfs_scrub        *sc,
-       uint                    dqtype)
+       xfs_dqtype_t            type)
  {
         uint                    flag;
  
-       flag = xfs_quota_chkd_flag(dqtype);
+       flag = xfs_quota_chkd_flag(type);
         if (!(flag & sc->mp->m_qflags))
                 return;
  
@@ -939,11 +939,11 @@ xrep_ino_dqattach(
  "inode %llu repair encountered quota error %d, quotacheck forced.",
                                 (unsigned long long)sc->ip->i_ino, error);
                 if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
-                       xrep_force_quotacheck(sc, XFS_DQ_USER);
+                       xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
                 if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
-                       xrep_force_quotacheck(sc, XFS_DQ_GROUP);
+                       xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
                 if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
-                       xrep_force_quotacheck(sc, XFS_DQ_PROJ);
+                       xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
                 /* fall through */
         case -ESRCH:
                 error = 0;
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h

index 04a47d4..fe77de0 100644 (file)
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -6,6 +6,8 @@
  #ifndef __XFS_SCRUB_REPAIR_H__
  #define __XFS_SCRUB_REPAIR_H__
  
+#include "xfs_quota_defs.h"
+
  static inline int xrep_notsupported(struct xfs_scrub *sc)
  {
         return -EOPNOTSUPP;
@@ -49,7 +51,7 @@ struct xrep_find_ag_btree {
  
  int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
                 struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
-void xrep_force_quotacheck(struct xfs_scrub *sc, uint dqtype);
+void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
  int xrep_ino_dqattach(struct xfs_scrub *sc);
  
  /* Metadata repairers */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c

index c642bc2..76e4ffe 100644 (file)
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -13,6 +13,7 @@
  #include "xfs_trans.h"
  #include "xfs_rtalloc.h"
  #include "xfs_inode.h"
+#include "xfs_bmap.h"
  #include "scrub/scrub.h"
  #include "scrub/common.h"
  
@@ -58,6 +59,41 @@ xchk_rtbitmap_rec(
         return 0;
  }
  
+/* Make sure the entire rtbitmap file is mapped with written extents. */
+STATIC int
+xchk_rtbitmap_check_extents(
+       struct xfs_scrub        *sc)
+{
+       struct xfs_mount        *mp = sc->mp;
+       struct xfs_bmbt_irec    map;
+       xfs_rtblock_t           off;
+       int                     nmap;
+       int                     error = 0;
+
+       for (off = 0; off < mp->m_sb.sb_rbmblocks;) {
+               if (xchk_should_terminate(sc, &error) ||
+                   (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+                       break;
+
+               /* Make sure we have a written extent. */
+               nmap = 1;
+               error = xfs_bmapi_read(mp->m_rbmip, off,
+                               mp->m_sb.sb_rbmblocks - off, &map, &nmap,
+                               XFS_DATA_FORK);
+               if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+                       break;
+
+               if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) {
+                       xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+                       break;
+               }
+
+               off += map.br_blockcount;
+       }
+
+       return error;
+}
+
  /* Scrub the realtime bitmap. */
  int
  xchk_rtbitmap(
@@ -65,11 +101,22 @@ xchk_rtbitmap(
  {
         int                     error;
  
+       /* Is the size of the rtbitmap correct? */
+       if (sc->mp->m_rbmip->i_d.di_size !=
+           XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) {
+               xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+               return 0;
+       }
+
         /* Invoke the fork scrubber. */
         error = xchk_metadata_inode_forks(sc);
         if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
                 return error;
  
+       error = xchk_rtbitmap_check_extents(sc);
+       if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+               return error;
+
         error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc);
         if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
                 goto out;
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c

index 6736c5a..ec36913 100644 (file)
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -138,7 +138,7 @@ xfs_bui_init(
  {
         struct xfs_bui_log_item         *buip;
  
-       buip = kmem_zone_zalloc(xfs_bui_zone, 0);
+       buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL);
  
         xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
         buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -215,7 +215,7 @@ xfs_trans_get_bud(
  {
         struct xfs_bud_log_item         *budp;
  
-       budp = kmem_zone_zalloc(xfs_bud_zone, 0);
+       budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL);
         xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
                           &xfs_bud_item_ops);
         budp->bud_buip = buip;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c

index 3052586..73cafc8 100644 (file)
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1567,6 +1567,7 @@ xfs_swap_extents(
         int                     lock_flags;
         uint64_t                f;
         int                     resblks = 0;
+       unsigned int            flags = 0;
  
         /*
          * Lock the inodes against other IO, page faults and truncate to
@@ -1630,17 +1631,16 @@ xfs_swap_extents(
                 resblks +=  XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);
  
                 /*
-                * Handle the corner case where either inode might straddle the
-                * btree format boundary. If so, the inode could bounce between
-                * btree <-> extent format on unmap -> remap cycles, freeing and
-                * allocating a bmapbt block each time.
+                * If either inode straddles a bmapbt block allocation boundary,
+                * the rmapbt algorithm triggers repeated allocs and frees as
+                * extents are remapped. This can exhaust the block reservation
+                * prematurely and cause shutdown. Return freed blocks to the
+                * transaction reservation to counter this behavior.
                  */
-               if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1))
-                       resblks += XFS_IFORK_MAXEXT(ip, w);
-               if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1))
-                       resblks += XFS_IFORK_MAXEXT(tip, w);
+               flags |= XFS_TRANS_RES_FDBLKS;
         }
-       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
+                               &tp);
         if (error)
                 goto out_unlock;
  
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c

index 20b748f..d4cdcb6 100644 (file)
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -14,6 +14,9 @@
  #include "xfs_mount.h"
  #include "xfs_trace.h"
  #include "xfs_log.h"
+#include "xfs_log_recover.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
  #include "xfs_errortag.h"
  #include "xfs_error.h"
  
@@ -211,9 +214,7 @@ _xfs_buf_alloc(
         int                     i;
  
         *bpp = NULL;
-       bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
-       if (unlikely(!bp))
-               return -ENOMEM;
+       bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);
  
         /*
          * We don't want certain flags to appear in b_flags unless they are
@@ -655,7 +656,6 @@ found:
          */
         if (bp->b_flags & XBF_STALE) {
                 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-               ASSERT(bp->b_iodone == NULL);
                 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
                 bp->b_ops = NULL;
         }
@@ -1191,10 +1191,13 @@ xfs_buf_ioend(
         if (!bp->b_error && bp->b_io_error)
                 xfs_buf_ioerror(bp, bp->b_io_error);
  
-       /* Only validate buffers that were read without errors */
-       if (read && !bp->b_error && bp->b_ops) {
-               ASSERT(!bp->b_iodone);
-               bp->b_ops->verify_read(bp);
+       if (read) {
+               if (!bp->b_error && bp->b_ops)
+                       bp->b_ops->verify_read(bp);
+               if (!bp->b_error)
+                       bp->b_flags |= XBF_DONE;
+               xfs_buf_ioend_finish(bp);
+               return;
         }
  
         if (!bp->b_error) {
@@ -1202,12 +1205,25 @@ xfs_buf_ioend(
                 bp->b_flags |= XBF_DONE;
         }
  
-       if (bp->b_iodone)
-               (*(bp->b_iodone))(bp);
-       else if (bp->b_flags & XBF_ASYNC)
-               xfs_buf_relse(bp);
-       else
-               complete(&bp->b_iowait);
+       /*
+        * If this is a log recovery buffer, we aren't doing transactional IO
+        * yet so we need to let it handle IO completions.
+        */
+       if (bp->b_flags & _XBF_LOGRECOVERY) {
+               xlog_recover_iodone(bp);
+               return;
+       }
+
+       if (bp->b_flags & _XBF_INODES) {
+               xfs_buf_inode_iodone(bp);
+               return;
+       }
+
+       if (bp->b_flags & _XBF_DQUOTS) {
+               xfs_buf_dquot_iodone(bp);
+               return;
+       }
+       xfs_buf_iodone(bp);
  }
  
  static void
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h

index 050c53b..755b652 100644 (file)
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -18,6 +18,7 @@
  /*
   *     Base types
   */
+struct xfs_buf;
  
  #define XFS_BUF_DADDR_NULL     ((xfs_daddr_t) (-1LL))
  
@@ -30,15 +31,20 @@
  #define XBF_STALE       (1 << 6) /* buffer has been staled, do not find it */
  #define XBF_WRITE_FAIL  (1 << 7) /* async writes have failed on this buffer */
  
-/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK     (1 << 16)/* lock requested, but do not wait */
-#define XBF_UNMAPPED    (1 << 17)/* do not map the buffer */
+/* buffer type flags for write callbacks */
+#define _XBF_INODES     (1 << 16)/* inode buffer */
+#define _XBF_DQUOTS     (1 << 17)/* dquot buffer */
+#define _XBF_LOGRECOVERY        (1 << 18)/* log recovery buffer */
  
  /* flags used only internally */
  #define _XBF_PAGES      (1 << 20)/* backed by refcounted pages */
  #define _XBF_KMEM       (1 << 21)/* backed by heap memory */
  #define _XBF_DELWRI_Q   (1 << 22)/* buffer on a delwri queue */
  
+/* flags used only as arguments to access routines */
+#define XBF_TRYLOCK     (1 << 30)/* lock requested, but do not wait */
+#define XBF_UNMAPPED    (1 << 31)/* do not map the buffer */
+
  typedef unsigned int xfs_buf_flags_t;
  
  #define XFS_BUF_FLAGS \
@@ -50,12 +56,15 @@ typedef unsigned int xfs_buf_flags_t;
         { XBF_DONE,             "DONE" }, \
         { XBF_STALE,            "STALE" }, \
         { XBF_WRITE_FAIL,       "WRITE_FAIL" }, \
-       { XBF_TRYLOCK,          "TRYLOCK" },    /* should never be set */\
-       { XBF_UNMAPPED,         "UNMAPPED" },   /* ditto */\
+       { _XBF_INODES,          "INODES" }, \
+       { _XBF_DQUOTS,          "DQUOTS" }, \
+       { _XBF_LOGRECOVERY,             "LOG_RECOVERY" }, \
         { _XBF_PAGES,           "PAGES" }, \
         { _XBF_KMEM,            "KMEM" }, \
-       { _XBF_DELWRI_Q,        "DELWRI_Q" }
-
+       { _XBF_DELWRI_Q,        "DELWRI_Q" }, \
+       /* The following interface flags should never be set */ \
+       { XBF_TRYLOCK,          "TRYLOCK" }, \
+       { XBF_UNMAPPED,         "UNMAPPED" }
  
  /*
   * Internal state flags.
@@ -94,10 +103,6 @@ typedef struct xfs_buftarg {
         struct ratelimit_state  bt_ioerror_rl;
  } xfs_buftarg_t;
  
-struct xfs_buf;
-typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-
-
  #define XB_PAGES       2
  
  struct xfs_buf_map {
@@ -150,7 +155,6 @@ typedef struct xfs_buf {
         xfs_buftarg_t           *b_target;      /* buffer target (device) */
         void                    *b_addr;        /* virtual address of buffer */
         struct work_struct      b_ioend_work;
-       xfs_buf_iodone_t        b_iodone;       /* I/O completion function */
         struct completion       b_iowait;       /* queue for I/O waiters */
         struct xfs_buf_log_item *b_log_item;
         struct list_head        b_li_list;      /* Log items list head */
@@ -257,9 +261,23 @@ extern void xfs_buf_unlock(xfs_buf_t *);
  #define xfs_buf_islocked(bp) \
         ((bp)->b_sema.count <= 0)
  
+static inline void xfs_buf_relse(xfs_buf_t *bp)
+{
+       xfs_buf_unlock(bp);
+       xfs_buf_rele(bp);
+}
+
  /* Buffer Read and Write Routines */
  extern int xfs_bwrite(struct xfs_buf *bp);
  extern void xfs_buf_ioend(struct xfs_buf *bp);
+static inline void xfs_buf_ioend_finish(struct xfs_buf *bp)
+{
+       if (bp->b_flags & XBF_ASYNC)
+               xfs_buf_relse(bp);
+       else
+               complete(&bp->b_iowait);
+}
+
  extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
                 xfs_failaddr_t failaddr);
  #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
@@ -324,12 +342,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp)
         return atomic_read(&bp->b_pin_count);
  }
  
-static inline void xfs_buf_relse(xfs_buf_t *bp)
-{
-       xfs_buf_unlock(bp);
-       xfs_buf_rele(bp);
-}
-
  static inline int
  xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
  {
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c

index 9e75e8d..5bb6f22 100644 (file)
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -12,8 +12,13 @@
  #include "xfs_bit.h"
  #include "xfs_mount.h"
  #include "xfs_trans.h"
-#include "xfs_buf_item.h"
  #include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
  #include "xfs_trace.h"
  #include "xfs_log.h"
  
@@ -25,7 +30,7 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
         return container_of(lip, struct xfs_buf_log_item, bli_item);
  }
  
-STATIC void    xfs_buf_do_callbacks(struct xfs_buf *bp);
+static void xfs_buf_item_done(struct xfs_buf *bp);
  
  /* Is this log iovec plausibly large enough to contain the buffer log format? */
  bool
@@ -457,10 +462,9 @@ xfs_buf_item_unpin(
                  * the AIL lock.
                  */
                 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
-                       xfs_buf_do_callbacks(bp);
-                       bp->b_log_item = NULL;
-                       list_del_init(&bp->b_li_list);
-                       bp->b_iodone = NULL;
+                       xfs_buf_item_done(bp);
+                       xfs_iflush_done(bp);
+                       ASSERT(list_empty(&bp->b_li_list));
                 } else {
                         xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
                         xfs_buf_item_relse(bp);
@@ -734,7 +738,7 @@ xfs_buf_item_init(
                 return 0;
         }
  
-       bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
+       bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL);
         xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
         bip->bli_buf = bp;
  
@@ -936,11 +940,7 @@ xfs_buf_item_free(
  }
  
  /*
- * This is called when the buf log item is no longer needed.  It should
- * free the buf log item associated with the given buffer and clear
- * the buffer's pointer to the buf log item.  If there are no more
- * items in the list, clear the b_iodone field of the buffer (see
- * xfs_buf_attach_iodone() below).
+ * xfs_buf_item_relse() is called when the buf log item is no longer needed.
   */
  void
  xfs_buf_item_relse(
@@ -952,137 +952,28 @@ xfs_buf_item_relse(
         ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
  
         bp->b_log_item = NULL;
-       if (list_empty(&bp->b_li_list))
-               bp->b_iodone = NULL;
-
         xfs_buf_rele(bp);
         xfs_buf_item_free(bip);
  }
  
-
  /*
- * Add the given log item with its callback to the list of callbacks
- * to be called when the buffer's I/O completes.  If it is not set
- * already, set the buffer's b_iodone() routine to be
- * xfs_buf_iodone_callbacks() and link the log item into the list of
- * items rooted at b_li_list.
+ * Decide if we're going to retry the write after a failure, and prepare
+ * the buffer for retrying the write.
   */
-void
-xfs_buf_attach_iodone(
-       struct xfs_buf          *bp,
-       void                    (*cb)(struct xfs_buf *, struct xfs_log_item *),
-       struct xfs_log_item     *lip)
-{
-       ASSERT(xfs_buf_islocked(bp));
-
-       lip->li_cb = cb;
-       list_add_tail(&lip->li_bio_list, &bp->b_li_list);
-
-       ASSERT(bp->b_iodone == NULL ||
-              bp->b_iodone == xfs_buf_iodone_callbacks);
-       bp->b_iodone = xfs_buf_iodone_callbacks;
-}
-
-/*
- * We can have many callbacks on a buffer. Running the callbacks individually
- * can cause a lot of contention on the AIL lock, so we allow for a single
- * callback to be able to scan the remaining items in bp->b_li_list for other
- * items of the same type and callback to be processed in the first call.
- *
- * As a result, the loop walking the callback list below will also modify the
- * list. it removes the first item from the list and then runs the callback.
- * The loop then restarts from the new first item int the list. This allows the
- * callback to scan and modify the list attached to the buffer and we don't
- * have to care about maintaining a next item pointer.
- */
-STATIC void
-xfs_buf_do_callbacks(
-       struct xfs_buf          *bp)
-{
-       struct xfs_buf_log_item *blip = bp->b_log_item;
-       struct xfs_log_item     *lip;
-
-       /* If there is a buf_log_item attached, run its callback */
-       if (blip) {
-               lip = &blip->bli_item;
-               lip->li_cb(bp, lip);
-       }
-
-       while (!list_empty(&bp->b_li_list)) {
-               lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
-                                      li_bio_list);
-
-               /*
-                * Remove the item from the list, so we don't have any
-                * confusion if the item is added to another buf.
-                * Don't touch the log item after calling its
-                * callback, because it could have freed itself.
-                */
-               list_del_init(&lip->li_bio_list);
-               lip->li_cb(bp, lip);
-       }
-}
-
-/*
- * Invoke the error state callback for each log item affected by the failed I/O.
- *
- * If a metadata buffer write fails with a non-permanent error, the buffer is
- * eventually resubmitted and so the completion callbacks are not run. The error
- * state may need to be propagated to the log items attached to the buffer,
- * however, so the next AIL push of the item knows hot to handle it correctly.
- */
-STATIC void
-xfs_buf_do_callbacks_fail(
-       struct xfs_buf          *bp)
-{
-       struct xfs_log_item     *lip;
-       struct xfs_ail          *ailp;
-
-       /*
-        * Buffer log item errors are handled directly by xfs_buf_item_push()
-        * and xfs_buf_iodone_callback_error, and they have no IO error
-        * callbacks. Check only for items in b_li_list.
-        */
-       if (list_empty(&bp->b_li_list))
-               return;
-
-       lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
-                       li_bio_list);
-       ailp = lip->li_ailp;
-       spin_lock(&ailp->ail_lock);
-       list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
-               if (lip->li_ops->iop_error)
-                       lip->li_ops->iop_error(lip, bp);
-       }
-       spin_unlock(&ailp->ail_lock);
-}
-
  static bool
-xfs_buf_iodone_callback_error(
+xfs_buf_ioerror_fail_without_retry(
         struct xfs_buf          *bp)
  {
-       struct xfs_buf_log_item *bip = bp->b_log_item;
-       struct xfs_log_item     *lip;
-       struct xfs_mount        *mp;
+       struct xfs_mount        *mp = bp->b_mount;
         static ulong            lasttime;
         static xfs_buftarg_t    *lasttarg;
-       struct xfs_error_cfg    *cfg;
-
-       /*
-        * The failed buffer might not have a buf_log_item attached or the
-        * log_item list might be empty. Get the mp from the available
-        * xfs_log_item
-        */
-       lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item,
-                                      li_bio_list);
-       mp = lip ? lip->li_mountp : bip->bli_item.li_mountp;
  
         /*
          * If we've already decided to shutdown the filesystem because of
          * I/O errors, there's no point in giving this a retry.
          */
         if (XFS_FORCED_SHUTDOWN(mp))
-               goto out_stale;
+               return true;
  
         if (bp->b_target != lasttarg ||
             time_after(jiffies, (lasttime + 5*HZ))) {
@@ -1093,129 +984,240 @@ xfs_buf_iodone_callback_error(
  
         /* synchronous writes will have callers process the error */
         if (!(bp->b_flags & XBF_ASYNC))
-               goto out_stale;
-
-       trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-       ASSERT(bp->b_iodone != NULL);
-
-       cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+               return true;
+       return false;
+}
  
-       /*
-        * If the write was asynchronous then no one will be looking for the
-        * error.  If this is the first failure of this type, clear the error
-        * state and write the buffer out again. This means we always retry an
-        * async write failure at least once, but we also need to set the buffer
-        * up to behave correctly now for repeated failures.
-        */
-       if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
-            bp->b_last_error != bp->b_error) {
-               bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
-               bp->b_last_error = bp->b_error;
-               if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
-                   !bp->b_first_retry_time)
-                       bp->b_first_retry_time = jiffies;
+static bool
+xfs_buf_ioerror_retry(
+       struct xfs_buf          *bp,
+       struct xfs_error_cfg    *cfg)
+{
+       if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) &&
+           bp->b_last_error == bp->b_error)
+               return false;
  
-               xfs_buf_ioerror(bp, 0);
-               xfs_buf_submit(bp);
-               return true;
-       }
+       bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
+       bp->b_last_error = bp->b_error;
+       if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+           !bp->b_first_retry_time)
+               bp->b_first_retry_time = jiffies;
+       return true;
+}
  
-       /*
-        * Repeated failure on an async write. Take action according to the
-        * error configuration we have been set up to use.
-        */
+/*
+ * Account for this latest trip around the retry handler, and decide if
+ * we've failed enough times to constitute a permanent failure.
+ */
+static bool
+xfs_buf_ioerror_permanent(
+       struct xfs_buf          *bp,
+       struct xfs_error_cfg    *cfg)
+{
+       struct xfs_mount        *mp = bp->b_mount;
  
         if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
             ++bp->b_retries > cfg->max_retries)
-                       goto permanent_error;
+               return true;
         if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
             time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
-                       goto permanent_error;
+               return true;
  
         /* At unmount we may treat errors differently */
         if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
-               goto permanent_error;
+               return true;
  
-       /*
-        * Still a transient error, run IO completion failure callbacks and let
-        * the higher layers retry the buffer.
-        */
-       xfs_buf_do_callbacks_fail(bp);
-       xfs_buf_ioerror(bp, 0);
-       xfs_buf_relse(bp);
-       return true;
+       return false;
+}
+
+/*
+ * On a sync write or shutdown we just want to stale the buffer and let the
+ * caller handle the error in bp->b_error appropriately.
+ *
+ * If the write was asynchronous then no one will be looking for the error.  If
+ * this is the first failure of this type, clear the error state and write the
+ * buffer out again. This means we always retry an async write failure at least
+ * once, but we also need to set the buffer up to behave correctly now for
+ * repeated failures.
+ *
+ * If we get repeated async write failures, then we take action according to the
+ * error configuration we have been set up to use.
+ *
+ * Multi-state return value:
+ *
+ * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions
+ * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions
+ * XBF_IOERROR_FAIL: transient error, run failure callback completions and then
+ *    release the buffer
+ */
+enum {
+       XBF_IOERROR_FINISH,
+       XBF_IOERROR_DONE,
+       XBF_IOERROR_FAIL,
+};
+
+static int
+xfs_buf_iodone_error(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_mount;
+       struct xfs_error_cfg    *cfg;
+
+       if (xfs_buf_ioerror_fail_without_retry(bp))
+               goto out_stale;
+
+       trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+
+       cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+       if (xfs_buf_ioerror_retry(bp, cfg)) {
+               xfs_buf_ioerror(bp, 0);
+               xfs_buf_submit(bp);
+               return XBF_IOERROR_DONE;
+       }
  
         /*
          * Permanent error - we need to trigger a shutdown if we haven't already
          * to indicate that inconsistency will result from this action.
          */
-permanent_error:
-       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       if (xfs_buf_ioerror_permanent(bp, cfg)) {
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+               goto out_stale;
+       }
+
+       /* Still considered a transient error. Caller will schedule retries. */
+       return XBF_IOERROR_FAIL;
+
  out_stale:
         xfs_buf_stale(bp);
         bp->b_flags |= XBF_DONE;
         trace_xfs_buf_error_relse(bp, _RET_IP_);
-       return false;
+       return XBF_IOERROR_FINISH;
  }
  
-/*
- * This is the iodone() function for buffers which have had callbacks attached
- * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
- * callback list, mark the buffer as having no more callbacks and then push the
- * buffer through IO completion processing.
- */
-void
-xfs_buf_iodone_callbacks(
+static void
+xfs_buf_item_done(
         struct xfs_buf          *bp)
  {
-       /*
-        * If there is an error, process it. Some errors require us
-        * to run callbacks after failure processing is done so we
-        * detect that and take appropriate action.
-        */
-       if (bp->b_error && xfs_buf_iodone_callback_error(bp))
+       struct xfs_buf_log_item *bip = bp->b_log_item;
+
+       if (!bip)
                 return;
  
         /*
-        * Successful IO or permanent error. Either way, we can clear the
-        * retry state here in preparation for the next error that may occur.
+        * If we are forcibly shutting down, this may well be off the AIL
+        * already. That's because we simulate the log-committed callbacks to
+        * unpin these buffers. Or we may never have put this item on AIL
+        * because the transaction was aborted forcibly.
+        * xfs_trans_ail_delete() takes care of these.
+        *
+        * Either way, AIL is useless if we're forcing a shutdown.
          */
+       xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE);
+       bp->b_log_item = NULL;
+       xfs_buf_item_free(bip);
+       xfs_buf_rele(bp);
+}
+
+static inline void
+xfs_buf_clear_ioerror_retry_state(
+       struct xfs_buf          *bp)
+{
         bp->b_last_error = 0;
         bp->b_retries = 0;
         bp->b_first_retry_time = 0;
+}
  
-       xfs_buf_do_callbacks(bp);
-       bp->b_log_item = NULL;
-       list_del_init(&bp->b_li_list);
-       bp->b_iodone = NULL;
-       xfs_buf_ioend(bp);
+/*
+ * Inode buffer iodone callback function.
+ */
+void
+xfs_buf_inode_iodone(
+       struct xfs_buf          *bp)
+{
+       if (bp->b_error) {
+               struct xfs_log_item *lip;
+               int ret = xfs_buf_iodone_error(bp);
+
+               if (ret == XBF_IOERROR_FINISH)
+                       goto finish_iodone;
+               if (ret == XBF_IOERROR_DONE)
+                       return;
+               ASSERT(ret == XBF_IOERROR_FAIL);
+               list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+                       set_bit(XFS_LI_FAILED, &lip->li_flags);
+               }
+               xfs_buf_ioerror(bp, 0);
+               xfs_buf_relse(bp);
+               return;
+       }
+
+finish_iodone:
+       xfs_buf_clear_ioerror_retry_state(bp);
+       xfs_buf_item_done(bp);
+       xfs_iflush_done(bp);
+       xfs_buf_ioend_finish(bp);
  }
  
  /*
- * This is the iodone() function for buffers which have been
- * logged.  It is called when they are eventually flushed out.
- * It should remove the buf item from the AIL, and free the buf item.
- * It is called by xfs_buf_iodone_callbacks() above which will take
- * care of cleaning up the buffer itself.
+ * Dquot buffer iodone callback function.
   */
  void
-xfs_buf_iodone(
-       struct xfs_buf          *bp,
-       struct xfs_log_item     *lip)
+xfs_buf_dquot_iodone(
+       struct xfs_buf          *bp)
  {
-       ASSERT(BUF_ITEM(lip)->bli_buf == bp);
+       if (bp->b_error) {
+               struct xfs_log_item *lip;
+               int ret = xfs_buf_iodone_error(bp);
+
+               if (ret == XBF_IOERROR_FINISH)
+                       goto finish_iodone;
+               if (ret == XBF_IOERROR_DONE)
+                       return;
+               ASSERT(ret == XBF_IOERROR_FAIL);
+               spin_lock(&bp->b_mount->m_ail->ail_lock);
+               list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+                       xfs_set_li_failed(lip, bp);
+               }
+               spin_unlock(&bp->b_mount->m_ail->ail_lock);
+               xfs_buf_ioerror(bp, 0);
+               xfs_buf_relse(bp);
+               return;
+       }
  
-       xfs_buf_rele(bp);
+finish_iodone:
+       xfs_buf_clear_ioerror_retry_state(bp);
+       /* a newly allocated dquot buffer might have a log item attached */
+       xfs_buf_item_done(bp);
+       xfs_dquot_done(bp);
+       xfs_buf_ioend_finish(bp);
+}
  
-       /*
-        * If we are forcibly shutting down, this may well be off the AIL
-        * already. That's because we simulate the log-committed callbacks to
-        * unpin these buffers. Or we may never have put this item on AIL
-        * because of the transaction was aborted forcibly.
-        * xfs_trans_ail_delete() takes care of these.
-        *
-        * Either way, AIL is useless if we're forcing a shutdown.
-        */
-       xfs_trans_ail_delete(lip, SHUTDOWN_CORRUPT_INCORE);
-       xfs_buf_item_free(BUF_ITEM(lip));
+/*
+ * Dirty buffer iodone callback function.
+ *
+ * Note that for things like remote attribute buffers, there may not be a buffer
+ * log item here, so processing the buffer log item must remain optional.
+ */
+void
+xfs_buf_iodone(
+       struct xfs_buf          *bp)
+{
+       if (bp->b_error) {
+               int ret = xfs_buf_iodone_error(bp);
+
+               if (ret == XBF_IOERROR_FINISH)
+                       goto finish_iodone;
+               if (ret == XBF_IOERROR_DONE)
+                       return;
+               ASSERT(ret == XBF_IOERROR_FAIL);
+               ASSERT(list_empty(&bp->b_li_list));
+               xfs_buf_ioerror(bp, 0);
+               xfs_buf_relse(bp);
+               return;
+       }
+
+finish_iodone:
+       xfs_buf_clear_ioerror_retry_state(bp);
+       xfs_buf_item_done(bp);
+       xfs_buf_ioend_finish(bp);
  }
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h

index c9c57e2..23507cb 100644 (file)
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -54,11 +54,9 @@ void xfs_buf_item_relse(struct xfs_buf *);
  bool   xfs_buf_item_put(struct xfs_buf_log_item *);
  void   xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
  bool   xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
-void   xfs_buf_attach_iodone(struct xfs_buf *,
-                             void(*)(struct xfs_buf *, struct xfs_log_item *),
-                             struct xfs_log_item *);
-void   xfs_buf_iodone_callbacks(struct xfs_buf *);
-void   xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
+void   xfs_buf_inode_iodone(struct xfs_buf *);
+void   xfs_buf_dquot_iodone(struct xfs_buf *);
+void   xfs_buf_iodone(struct xfs_buf *);
  bool   xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
  
  extern kmem_zone_t     *xfs_buf_item_zone;
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c

index 04faa73..d480f11 100644 (file)
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -419,8 +419,7 @@ xlog_recover_validate_buf_type(
         if (bp->b_ops) {
                 struct xfs_buf_log_item *bip;
  
-               ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
-               bp->b_iodone = xlog_recover_iodone;
+               bp->b_flags |= _XBF_LOGRECOVERY;
                 xfs_buf_item_init(bp, mp);
                 bip = bp->b_log_item;
                 bip->bli_item.li_lsn = current_lsn;
@@ -494,8 +493,7 @@ xlog_recover_do_reg_buffer(
                                         item->ri_buf[i].i_len, __func__);
                                 goto next;
                         }
-                       fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
-                                              -1, 0);
+                       fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
                         if (fa) {
                                 xfs_alert(mp,
         "dquot corrupt at %pS trying to replay into block 0x%llx",
@@ -548,11 +546,11 @@ xlog_recover_do_dquot_buffer(
  
         type = 0;
         if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
-               type |= XFS_DQ_USER;
+               type |= XFS_DQTYPE_USER;
         if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
-               type |= XFS_DQ_PROJ;
+               type |= XFS_DQTYPE_PROJ;
         if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
-               type |= XFS_DQ_GROUP;
+               type |= XFS_DQTYPE_GROUP;
         /*
          * This type of quotas was turned off, so ignore this buffer
          */
@@ -963,7 +961,7 @@ xlog_recover_buf_commit_pass2(
                 error = xfs_bwrite(bp);
         } else {
                 ASSERT(bp->b_mount == mp);
-               bp->b_iodone = xlog_recover_iodone;
+               bp->b_flags |= _XBF_LOGRECOVERY;
                 xfs_buf_delwri_queue(bp, buffer_list);
         }
  
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c

index d5b7f03..04dc2be 100644 (file)
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -23,6 +23,7 @@
  #include "xfs_trace.h"
  #include "xfs_log.h"
  #include "xfs_bmap_btree.h"
+#include "xfs_error.h"
  
  /*
   * Lock order:
@@ -66,38 +67,60 @@ xfs_qm_dqdestroy(
   */
  void
  xfs_qm_adjust_dqlimits(
-       struct xfs_mount        *mp,
         struct xfs_dquot        *dq)
  {
+       struct xfs_mount        *mp = dq->q_mount;
         struct xfs_quotainfo    *q = mp->m_quotainfo;
-       struct xfs_disk_dquot   *d = &dq->q_core;
         struct xfs_def_quota    *defq;
         int                     prealloc = 0;
  
-       ASSERT(d->d_id);
+       ASSERT(dq->q_id);
         defq = xfs_get_defquota(q, xfs_dquot_type(dq));
  
-       if (defq->bsoftlimit && !d->d_blk_softlimit) {
-               d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit);
+       if (!dq->q_blk.softlimit) {
+               dq->q_blk.softlimit = defq->blk.soft;
                 prealloc = 1;
         }
-       if (defq->bhardlimit && !d->d_blk_hardlimit) {
-               d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit);
+       if (!dq->q_blk.hardlimit) {
+               dq->q_blk.hardlimit = defq->blk.hard;
                 prealloc = 1;
         }
-       if (defq->isoftlimit && !d->d_ino_softlimit)
-               d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit);
-       if (defq->ihardlimit && !d->d_ino_hardlimit)
-               d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit);
-       if (defq->rtbsoftlimit && !d->d_rtb_softlimit)
-               d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit);
-       if (defq->rtbhardlimit && !d->d_rtb_hardlimit)
-               d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit);
+       if (!dq->q_ino.softlimit)
+               dq->q_ino.softlimit = defq->ino.soft;
+       if (!dq->q_ino.hardlimit)
+               dq->q_ino.hardlimit = defq->ino.hard;
+       if (!dq->q_rtb.softlimit)
+               dq->q_rtb.softlimit = defq->rtb.soft;
+       if (!dq->q_rtb.hardlimit)
+               dq->q_rtb.hardlimit = defq->rtb.hard;
  
         if (prealloc)
                 xfs_dquot_set_prealloc_limits(dq);
  }
  
+/*
+ * Determine if this quota counter is over either limit and set the quota
+ * timers as appropriate.
+ */
+static inline void
+xfs_qm_adjust_res_timer(
+       struct xfs_dquot_res    *res,
+       struct xfs_quota_limits *qlim)
+{
+       ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit);
+
+       if ((res->softlimit && res->count > res->softlimit) ||
+           (res->hardlimit && res->count > res->hardlimit)) {
+               if (res->timer == 0)
+                       res->timer = ktime_get_real_seconds() + qlim->time;
+       } else {
+               if (res->timer == 0)
+                       res->warnings = 0;
+               else
+                       res->timer = 0;
+       }
+}
+
  /*
   * Check the limits and timers of a dquot and start or reset timers
   * if necessary.
@@ -113,96 +136,18 @@ xfs_qm_adjust_dqlimits(
   */
  void
  xfs_qm_adjust_dqtimers(
-       struct xfs_mount        *mp,
         struct xfs_dquot        *dq)
  {
+       struct xfs_mount        *mp = dq->q_mount;
         struct xfs_quotainfo    *qi = mp->m_quotainfo;
-       struct xfs_disk_dquot   *d = &dq->q_core;
         struct xfs_def_quota    *defq;
  
-       ASSERT(d->d_id);
+       ASSERT(dq->q_id);
         defq = xfs_get_defquota(qi, xfs_dquot_type(dq));
  
-#ifdef DEBUG
-       if (d->d_blk_hardlimit)
-               ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
-                      be64_to_cpu(d->d_blk_hardlimit));
-       if (d->d_ino_hardlimit)
-               ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
-                      be64_to_cpu(d->d_ino_hardlimit));
-       if (d->d_rtb_hardlimit)
-               ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
-                      be64_to_cpu(d->d_rtb_hardlimit));
-#endif
-
-       if (!d->d_btimer) {
-               if ((d->d_blk_softlimit &&
-                    (be64_to_cpu(d->d_bcount) >
-                     be64_to_cpu(d->d_blk_softlimit))) ||
-                   (d->d_blk_hardlimit &&
-                    (be64_to_cpu(d->d_bcount) >
-                     be64_to_cpu(d->d_blk_hardlimit)))) {
-                       d->d_btimer = cpu_to_be32(ktime_get_real_seconds() +
-                                       defq->btimelimit);
-               } else {
-                       d->d_bwarns = 0;
-               }
-       } else {
-               if ((!d->d_blk_softlimit ||
-                    (be64_to_cpu(d->d_bcount) <=
-                     be64_to_cpu(d->d_blk_softlimit))) &&
-                   (!d->d_blk_hardlimit ||
-                   (be64_to_cpu(d->d_bcount) <=
-                    be64_to_cpu(d->d_blk_hardlimit)))) {
-                       d->d_btimer = 0;
-               }
-       }
-
-       if (!d->d_itimer) {
-               if ((d->d_ino_softlimit &&
-                    (be64_to_cpu(d->d_icount) >
-                     be64_to_cpu(d->d_ino_softlimit))) ||
-                   (d->d_ino_hardlimit &&
-                    (be64_to_cpu(d->d_icount) >
-                     be64_to_cpu(d->d_ino_hardlimit)))) {
-                       d->d_itimer = cpu_to_be32(ktime_get_real_seconds() +
-                                       defq->itimelimit);
-               } else {
-                       d->d_iwarns = 0;
-               }
-       } else {
-               if ((!d->d_ino_softlimit ||
-                    (be64_to_cpu(d->d_icount) <=
-                     be64_to_cpu(d->d_ino_softlimit)))  &&
-                   (!d->d_ino_hardlimit ||
-                    (be64_to_cpu(d->d_icount) <=
-                     be64_to_cpu(d->d_ino_hardlimit)))) {
-                       d->d_itimer = 0;
-               }
-       }
-
-       if (!d->d_rtbtimer) {
-               if ((d->d_rtb_softlimit &&
-                    (be64_to_cpu(d->d_rtbcount) >
-                     be64_to_cpu(d->d_rtb_softlimit))) ||
-                   (d->d_rtb_hardlimit &&
-                    (be64_to_cpu(d->d_rtbcount) >
-                     be64_to_cpu(d->d_rtb_hardlimit)))) {
-                       d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() +
-                                       defq->rtbtimelimit);
-               } else {
-                       d->d_rtbwarns = 0;
-               }
-       } else {
-               if ((!d->d_rtb_softlimit ||
-                    (be64_to_cpu(d->d_rtbcount) <=
-                     be64_to_cpu(d->d_rtb_softlimit))) &&
-                   (!d->d_rtb_hardlimit ||
-                    (be64_to_cpu(d->d_rtbcount) <=
-                     be64_to_cpu(d->d_rtb_hardlimit)))) {
-                       d->d_rtbtimer = 0;
-               }
-       }
+       xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk);
+       xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino);
+       xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb);
  }
  
  /*
@@ -213,7 +158,7 @@ xfs_qm_init_dquot_blk(
         struct xfs_trans        *tp,
         struct xfs_mount        *mp,
         xfs_dqid_t              id,
-       uint                    type,
+       xfs_dqtype_t            type,
         struct xfs_buf          *bp)
  {
         struct xfs_quotainfo    *q = mp->m_quotainfo;
@@ -226,6 +171,24 @@ xfs_qm_init_dquot_blk(
         ASSERT(tp);
         ASSERT(xfs_buf_islocked(bp));
  
+       switch (type) {
+       case XFS_DQTYPE_USER:
+               qflag = XFS_UQUOTA_CHKD;
+               blftype = XFS_BLF_UDQUOT_BUF;
+               break;
+       case XFS_DQTYPE_PROJ:
+               qflag = XFS_PQUOTA_CHKD;
+               blftype = XFS_BLF_PDQUOT_BUF;
+               break;
+       case XFS_DQTYPE_GROUP:
+               qflag = XFS_GQUOTA_CHKD;
+               blftype = XFS_BLF_GDQUOT_BUF;
+               break;
+       default:
+               ASSERT(0);
+               return;
+       }
+
         d = bp->b_addr;
  
         /*
@@ -237,7 +200,7 @@ xfs_qm_init_dquot_blk(
                 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
                 d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
                 d->dd_diskdq.d_id = cpu_to_be32(curid);
-               d->dd_diskdq.d_flags = type;
+               d->dd_diskdq.d_type = type;
                 if (xfs_sb_version_hascrc(&mp->m_sb)) {
                         uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
                         xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
@@ -245,17 +208,6 @@ xfs_qm_init_dquot_blk(
                 }
         }
  
-       if (type & XFS_DQ_USER) {
-               qflag = XFS_UQUOTA_CHKD;
-               blftype = XFS_BLF_UDQUOT_BUF;
-       } else if (type & XFS_DQ_PROJ) {
-               qflag = XFS_PQUOTA_CHKD;
-               blftype = XFS_BLF_PDQUOT_BUF;
-       } else {
-               qflag = XFS_GQUOTA_CHKD;
-               blftype = XFS_BLF_GDQUOT_BUF;
-       }
-
         xfs_trans_dquot_buf(tp, bp, blftype);
  
         /*
@@ -290,8 +242,8 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
  {
         uint64_t space;
  
-       dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
-       dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+       dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit;
+       dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit;
         if (!dqp->q_prealloc_lo_wmark) {
                 dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
                 do_div(dqp->q_prealloc_lo_wmark, 100);
@@ -321,14 +273,15 @@ xfs_dquot_disk_alloc(
         struct xfs_trans        *tp = *tpp;
         struct xfs_mount        *mp = tp->t_mountp;
         struct xfs_buf          *bp;
-       struct xfs_inode        *quotip = xfs_quota_inode(mp, dqp->dq_flags);
+       xfs_dqtype_t            qtype = xfs_dquot_type(dqp);
+       struct xfs_inode        *quotip = xfs_quota_inode(mp, qtype);
         int                     nmaps = 1;
         int                     error;
  
         trace_xfs_dqalloc(dqp);
  
         xfs_ilock(quotip, XFS_ILOCK_EXCL);
-       if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+       if (!xfs_this_quota_on(dqp->q_mount, qtype)) {
                 /*
                  * Return if this type of quotas is turned off while we didn't
                  * have an inode lock
@@ -365,8 +318,7 @@ xfs_dquot_disk_alloc(
          * Make a chunk of dquots out of this buffer and log
          * the entire thing.
          */
-       xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
-                             dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+       xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp);
         xfs_buf_set_ref(bp, XFS_DQUOT_REF);
  
         /*
@@ -413,13 +365,14 @@ xfs_dquot_disk_read(
  {
         struct xfs_bmbt_irec    map;
         struct xfs_buf          *bp;
-       struct xfs_inode        *quotip = xfs_quota_inode(mp, dqp->dq_flags);
+       xfs_dqtype_t            qtype = xfs_dquot_type(dqp);
+       struct xfs_inode        *quotip = xfs_quota_inode(mp, qtype);
         uint                    lock_mode;
         int                     nmaps = 1;
         int                     error;
  
         lock_mode = xfs_ilock_data_map_shared(quotip);
-       if (!xfs_this_quota_on(mp, dqp->dq_flags)) {
+       if (!xfs_this_quota_on(mp, qtype)) {
                 /*
                  * Return if this type of quotas is turned off while we
                  * didn't have the quota inode lock.
@@ -471,14 +424,14 @@ STATIC struct xfs_dquot *
  xfs_dquot_alloc(
         struct xfs_mount        *mp,
         xfs_dqid_t              id,
-       uint                    type)
+       xfs_dqtype_t            type)
  {
         struct xfs_dquot        *dqp;
  
-       dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
+       dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL);
  
-       dqp->dq_flags = type;
-       dqp->q_core.d_id = cpu_to_be32(id);
+       dqp->q_type = type;
+       dqp->q_id = id;
         dqp->q_mount = mp;
         INIT_LIST_HEAD(&dqp->q_lru);
         mutex_init(&dqp->q_qlock);
@@ -503,13 +456,13 @@ xfs_dquot_alloc(
          * quotas.
          */
         switch (type) {
-       case XFS_DQ_USER:
+       case XFS_DQTYPE_USER:
                 /* uses the default lock class */
                 break;
-       case XFS_DQ_GROUP:
+       case XFS_DQTYPE_GROUP:
                 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
                 break;
-       case XFS_DQ_PROJ:
+       case XFS_DQTYPE_PROJ:
                 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
                 break;
         default:
@@ -524,26 +477,91 @@ xfs_dquot_alloc(
  }
  
  /* Copy the in-core quota fields in from the on-disk buffer. */
-STATIC void
+STATIC int
  xfs_dquot_from_disk(
         struct xfs_dquot        *dqp,
         struct xfs_buf          *bp)
  {
         struct xfs_disk_dquot   *ddqp = bp->b_addr + dqp->q_bufoffset;
  
+       /*
+        * Ensure that we got the type and ID we were looking for.
+        * Everything else was checked by the dquot buffer verifier.
+        */
+       if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) ||
+           be32_to_cpu(ddqp->d_id) != dqp->q_id) {
+               xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
+                         "Metadata corruption detected at %pS, quota %u",
+                         __this_address, dqp->q_id);
+               xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
+               return -EFSCORRUPTED;
+       }
+
         /* copy everything from disk dquot to the incore dquot */
-       memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot));
+       dqp->q_type = ddqp->d_type;
+       dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+       dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+       dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+       dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+       dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+       dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+
+       dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount);
+       dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
+       dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
+
+       dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns);
+       dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
+       dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
+
+       dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer);
+       dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer);
+       dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer);
  
         /*
          * Reservation counters are defined as reservation plus current usage
          * to avoid having to add every time.
          */
-       dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
-       dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
-       dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+       dqp->q_blk.reserved = dqp->q_blk.count;
+       dqp->q_ino.reserved = dqp->q_ino.count;
+       dqp->q_rtb.reserved = dqp->q_rtb.count;
  
         /* initialize the dquot speculative prealloc thresholds */
         xfs_dquot_set_prealloc_limits(dqp);
+       return 0;
+}
+
+/* Copy the in-core quota fields into the on-disk buffer. */
+void
+xfs_dquot_to_disk(
+       struct xfs_disk_dquot   *ddqp,
+       struct xfs_dquot        *dqp)
+{
+       ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+       ddqp->d_version = XFS_DQUOT_VERSION;
+       ddqp->d_type = dqp->q_type;
+       ddqp->d_id = cpu_to_be32(dqp->q_id);
+       ddqp->d_pad0 = 0;
+       ddqp->d_pad = 0;
+
+       ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit);
+       ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit);
+       ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit);
+       ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit);
+       ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit);
+       ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit);
+
+       ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count);
+       ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
+       ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
+
+       ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings);
+       ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
+       ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
+
+       ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer);
+       ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer);
+       ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer);
  }
  
  /* Allocate and initialize the dquot buffer for this in-core dquot. */
@@ -592,7 +610,7 @@ static int
  xfs_qm_dqread(
         struct xfs_mount        *mp,
         xfs_dqid_t              id,
-       uint                    type,
+       xfs_dqtype_t            type,
         bool                    can_alloc,
         struct xfs_dquot        **dqpp)
  {
@@ -617,9 +635,11 @@ xfs_qm_dqread(
          * further.
          */
         ASSERT(xfs_buf_islocked(bp));
-       xfs_dquot_from_disk(dqp, bp);
-
+       error = xfs_dquot_from_disk(dqp, bp);
         xfs_buf_relse(bp);
+       if (error)
+               goto err;
+
         *dqpp = dqp;
         return error;
  
@@ -638,7 +658,7 @@ err:
  static int
  xfs_dq_get_next_id(
         struct xfs_mount        *mp,
-       uint                    type,
+       xfs_dqtype_t            type,
         xfs_dqid_t              *id)
  {
         struct xfs_inode        *quotip = xfs_quota_inode(mp, type);
@@ -706,7 +726,7 @@ restart:
         }
  
         xfs_dqlock(dqp);
-       if (dqp->dq_flags & XFS_DQ_FREEING) {
+       if (dqp->q_flags & XFS_DQFLAG_FREEING) {
                 xfs_dqunlock(dqp);
                 mutex_unlock(&qi->qi_tree_lock);
                 trace_xfs_dqget_freeing(dqp);
@@ -762,21 +782,21 @@ xfs_qm_dqget_cache_insert(
  static int
  xfs_qm_dqget_checks(
         struct xfs_mount        *mp,
-       uint                    type)
+       xfs_dqtype_t            type)
  {
         if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp)))
                 return -ESRCH;
  
         switch (type) {
-       case XFS_DQ_USER:
+       case XFS_DQTYPE_USER:
                 if (!XFS_IS_UQUOTA_ON(mp))
                         return -ESRCH;
                 return 0;
-       case XFS_DQ_GROUP:
+       case XFS_DQTYPE_GROUP:
                 if (!XFS_IS_GQUOTA_ON(mp))
                         return -ESRCH;
                 return 0;
-       case XFS_DQ_PROJ:
+       case XFS_DQTYPE_PROJ:
                 if (!XFS_IS_PQUOTA_ON(mp))
                         return -ESRCH;
                 return 0;
@@ -794,7 +814,7 @@ int
  xfs_qm_dqget(
         struct xfs_mount        *mp,
         xfs_dqid_t              id,
-       uint                    type,
+       xfs_dqtype_t            type,
         bool                    can_alloc,
         struct xfs_dquot        **O_dqpp)
  {
@@ -844,7 +864,7 @@ int
  xfs_qm_dqget_uncached(
         struct xfs_mount        *mp,
         xfs_dqid_t              id,
-       uint                    type,
+       xfs_dqtype_t            type,
         struct xfs_dquot        **dqpp)
  {
         int                     error;
@@ -860,14 +880,14 @@ xfs_qm_dqget_uncached(
  xfs_dqid_t
  xfs_qm_id_for_quotatype(
         struct xfs_inode        *ip,
-       uint                    type)
+       xfs_dqtype_t            type)
  {
         switch (type) {
-       case XFS_DQ_USER:
+       case XFS_DQTYPE_USER:
                 return i_uid_read(VFS_I(ip));
-       case XFS_DQ_GROUP:
+       case XFS_DQTYPE_GROUP:
                 return i_gid_read(VFS_I(ip));
-       case XFS_DQ_PROJ:
+       case XFS_DQTYPE_PROJ:
                 return ip->i_d.di_projid;
         }
         ASSERT(0);
@@ -882,7 +902,7 @@ xfs_qm_id_for_quotatype(
  int
  xfs_qm_dqget_inode(
         struct xfs_inode        *ip,
-       uint                    type,
+       xfs_dqtype_t            type,
         bool                    can_alloc,
         struct xfs_dquot        **O_dqpp)
  {
@@ -968,7 +988,7 @@ int
  xfs_qm_dqget_next(
         struct xfs_mount        *mp,
         xfs_dqid_t              id,
-       uint                    type,
+       xfs_dqtype_t            type,
         struct xfs_dquot        **dqpp)
  {
         struct xfs_dquot        *dqp;
@@ -1048,9 +1068,8 @@ xfs_qm_dqrele(
   * from the AIL if it has not been re-logged, and unlocking the dquot's
   * flush lock. This behavior is very similar to that of inodes..
   */
-STATIC void
+static void
  xfs_qm_dqflush_done(
-       struct xfs_buf          *bp,
         struct xfs_log_item     *lip)
  {
         struct xfs_dq_logitem   *qip = (struct xfs_dq_logitem *)lip;
@@ -1071,16 +1090,12 @@ xfs_qm_dqflush_done(
              test_bit(XFS_LI_FAILED, &lip->li_flags))) {
  
                 spin_lock(&ailp->ail_lock);
+               xfs_clear_li_failed(lip);
                 if (lip->li_lsn == qip->qli_flush_lsn) {
                         /* xfs_ail_update_finish() drops the AIL lock */
                         tail_lsn = xfs_ail_delete_one(ailp, lip);
                         xfs_ail_update_finish(ailp, tail_lsn);
                 } else {
-                       /*
-                        * Clear the failed state since we are about to drop the
-                        * flush lock
-                        */
-                       xfs_clear_li_failed(lip);
                         spin_unlock(&ailp->ail_lock);
                 }
         }
@@ -1091,6 +1106,48 @@ xfs_qm_dqflush_done(
         xfs_dqfunlock(dqp);
  }
  
+void
+xfs_dquot_done(
+       struct xfs_buf          *bp)
+{
+       struct xfs_log_item     *lip, *n;
+
+       list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+               list_del_init(&lip->li_bio_list);
+               xfs_qm_dqflush_done(lip);
+       }
+}
+
+/* Check incore dquot for errors before we flush. */
+static xfs_failaddr_t
+xfs_qm_dqflush_check(
+       struct xfs_dquot        *dqp)
+{
+       xfs_dqtype_t            type = xfs_dquot_type(dqp);
+
+       if (type != XFS_DQTYPE_USER &&
+           type != XFS_DQTYPE_GROUP &&
+           type != XFS_DQTYPE_PROJ)
+               return __this_address;
+
+       if (dqp->q_id == 0)
+               return NULL;
+
+       if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit &&
+           !dqp->q_blk.timer)
+               return __this_address;
+
+       if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit &&
+           !dqp->q_ino.timer)
+               return __this_address;
+
+       if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit &&
+           !dqp->q_rtb.timer)
+               return __this_address;
+
+       return NULL;
+}
+
  /*
   * Write a modified dquot to disk.
   * The dquot must be locked and the flush lock too taken by caller.
@@ -1107,8 +1164,7 @@ xfs_qm_dqflush(
         struct xfs_mount        *mp = dqp->q_mount;
         struct xfs_log_item     *lip = &dqp->q_logitem.qli_item;
         struct xfs_buf          *bp;
-       struct xfs_dqblk        *dqb;
-       struct xfs_disk_dquot   *ddqp;
+       struct xfs_dqblk        *dqblk;
         xfs_failaddr_t          fa;
         int                     error;
  
@@ -1132,30 +1188,23 @@ xfs_qm_dqflush(
         if (error)
                 goto out_abort;
  
-       /*
-        * Calculate the location of the dquot inside the buffer.
-        */
-       dqb = bp->b_addr + dqp->q_bufoffset;
-       ddqp = &dqb->dd_diskdq;
-
-       /* sanity check the in-core structure before we flush */
-       fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(dqp->q_core.d_id),
-                             0);
+       fa = xfs_qm_dqflush_check(dqp);
         if (fa) {
                 xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
-                               be32_to_cpu(dqp->q_core.d_id), fa);
+                               dqp->q_id, fa);
                 xfs_buf_relse(bp);
                 error = -EFSCORRUPTED;
                 goto out_abort;
         }
  
-       /* This is the only portion of data that needs to persist */
-       memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot));
+       /* Flush the incore dquot to the ondisk buffer. */
+       dqblk = bp->b_addr + dqp->q_bufoffset;
+       xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp);
  
         /*
          * Clear the dirty field and remember the flush lsn for later use.
          */
-       dqp->dq_flags &= ~XFS_DQ_DIRTY;
+       dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
  
         xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
                                         &dqp->q_logitem.qli_item.li_lsn);
@@ -1170,17 +1219,17 @@ xfs_qm_dqflush(
          * of a dquot without an up-to-date CRC getting to disk.
          */
         if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
-               xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+               dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+               xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk),
                                  XFS_DQUOT_CRC_OFF);
         }
  
         /*
-        * Attach an iodone routine so that we can remove this dquot from the
-        * AIL and release the flush lock once the dquot is synced to disk.
+        * Attach the dquot to the buffer so that we can remove this dquot from
+        * the AIL and release the flush lock once the dquot is synced to disk.
          */
-       xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
-                                 &dqp->q_logitem.qli_item);
+       bp->b_flags |= _XBF_DQUOTS;
+       list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list);
  
         /*
          * If the buffer is pinned then push on the log so we won't
@@ -1196,7 +1245,7 @@ xfs_qm_dqflush(
         return 0;
  
  out_abort:
-       dqp->dq_flags &= ~XFS_DQ_DIRTY;
+       dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
         xfs_trans_ail_delete(lip, 0);
         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
  out_unlock:
@@ -1217,8 +1266,7 @@ xfs_dqlock2(
  {
         if (d1 && d2) {
                 ASSERT(d1 != d2);
-               if (be32_to_cpu(d1->q_core.d_id) >
-                   be32_to_cpu(d2->q_core.d_id)) {
+               if (d1->q_id > d2->q_id) {
                         mutex_lock(&d2->q_qlock);
                         mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
                 } else {
@@ -1270,7 +1318,7 @@ xfs_qm_exit(void)
  int
  xfs_qm_dqiterate(
         struct xfs_mount        *mp,
-       uint                    dqtype,
+       xfs_dqtype_t            type,
         xfs_qm_dqiterate_fn     iter_fn,
         void                    *priv)
  {
@@ -1279,16 +1327,19 @@ xfs_qm_dqiterate(
         int                     error;
  
         do {
-               error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
+               error = xfs_qm_dqget_next(mp, id, type, &dq);
                 if (error == -ENOENT)
                         return 0;
                 if (error)
                         return error;
  
-               error = iter_fn(dq, dqtype, priv);
-               id = be32_to_cpu(dq->q_core.d_id);
+               error = iter_fn(dq, type, priv);
+               /*
+                * Advance past the dquot just visited.  Reusing q_id as-is
+                * would restart the next xfs_qm_dqget_next() at the same ID.
+                */
+               id = dq->q_id + 1;
                 xfs_qm_dqput(dq);
-               id++;
         } while (error == 0 && id != 0);
  
         return error;
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h

index 71e36c8..282a65d 100644 (file)
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -27,26 +27,53 @@ enum {
         XFS_QLOWSP_MAX
  };
  
+struct xfs_dquot_res {
+       /* Total resources allocated and reserved. */
+       xfs_qcnt_t              reserved;
+
+       /* Total resources allocated. */
+       xfs_qcnt_t              count;
+
+       /* Absolute and preferred limits. */
+       xfs_qcnt_t              hardlimit;
+       xfs_qcnt_t              softlimit;
+
+       /*
+        * For root dquots, this is the default grace period, in seconds.
+        * Otherwise, this is when the quota grace period expires,
+        * in seconds since the Unix epoch.
+        */
+       time64_t                timer;
+
+       /*
+        * For root dquots, this is the maximum number of warnings that will
+        * be issued for this quota type.  Otherwise, this is the number of
+        * warnings issued against this quota.  Note that none of this is
+        * implemented.
+        */
+       xfs_qwarncnt_t          warnings;
+};
+
  /*
   * The incore dquot structure
   */
  struct xfs_dquot {
-       uint                    dq_flags;
         struct list_head        q_lru;
         struct xfs_mount        *q_mount;
+       xfs_dqtype_t            q_type;
+       uint16_t                q_flags;
+       xfs_dqid_t              q_id;
         uint                    q_nrefs;
-       xfs_daddr_t             q_blkno;
         int                     q_bufoffset;
+       xfs_daddr_t             q_blkno;
         xfs_fileoff_t           q_fileoffset;
  
-       struct xfs_disk_dquot   q_core;
+       struct xfs_dquot_res    q_blk;  /* regular blocks */
+       struct xfs_dquot_res    q_ino;  /* inodes */
+       struct xfs_dquot_res    q_rtb;  /* realtime blocks */
+
         struct xfs_dq_logitem   q_logitem;
-       /* total regular nblks used+reserved */
-       xfs_qcnt_t              q_res_bcount;
-       /* total inos allocd+reserved */
-       xfs_qcnt_t              q_res_icount;
-       /* total realtime blks used+reserved */
-       xfs_qcnt_t              q_res_rtbcount;
+
         xfs_qcnt_t              q_prealloc_lo_wmark;
         xfs_qcnt_t              q_prealloc_hi_wmark;
         int64_t                 q_low_space[XFS_QLOWSP_MAX];
@@ -101,34 +128,59 @@ static inline void xfs_dqunlock(struct xfs_dquot *dqp)
         mutex_unlock(&dqp->q_qlock);
  }
  
-static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+static inline int
+xfs_dquot_type(const struct xfs_dquot *dqp)
  {
-       switch (type & XFS_DQ_ALLTYPES) {
-       case XFS_DQ_USER:
+       return dqp->q_type & XFS_DQTYPE_REC_MASK;
+}
+
+static inline int xfs_this_quota_on(struct xfs_mount *mp, xfs_dqtype_t type)
+{
+       switch (type) {
+       case XFS_DQTYPE_USER:
                 return XFS_IS_UQUOTA_ON(mp);
-       case XFS_DQ_GROUP:
+       case XFS_DQTYPE_GROUP:
                 return XFS_IS_GQUOTA_ON(mp);
-       case XFS_DQ_PROJ:
+       case XFS_DQTYPE_PROJ:
                 return XFS_IS_PQUOTA_ON(mp);
         default:
                 return 0;
         }
  }
  
-static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type)
+static inline struct xfs_dquot *xfs_inode_dquot(
+       struct xfs_inode        *ip,
+       xfs_dqtype_t            type)
  {
-       switch (type & XFS_DQ_ALLTYPES) {
-       case XFS_DQ_USER:
+       switch (type) {
+       case XFS_DQTYPE_USER:
                 return ip->i_udquot;
-       case XFS_DQ_GROUP:
+       case XFS_DQTYPE_GROUP:
                 return ip->i_gdquot;
-       case XFS_DQ_PROJ:
+       case XFS_DQTYPE_PROJ:
                 return ip->i_pdquot;
         default:
                 return NULL;
         }
  }
  
+/* Decide if the dquot's limits are actually being enforced. */
+static inline bool
+xfs_dquot_is_enforced(
+       const struct xfs_dquot  *dqp)
+{
+       switch (xfs_dquot_type(dqp)) {
+       case XFS_DQTYPE_USER:
+               return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount);
+       case XFS_DQTYPE_GROUP:
+               return XFS_IS_GQUOTA_ENFORCED(dqp->q_mount);
+       case XFS_DQTYPE_PROJ:
+               return XFS_IS_PQUOTA_ENFORCED(dqp->q_mount);
+       }
+       ASSERT(0);
+       return false;
+}
+
  /*
   * Check whether a dquot is under low free space conditions. We assume the quota
   * is enabled and enforced.
@@ -137,38 +189,35 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
  {
         int64_t freesp;
  
-       freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount;
+       freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved;
         if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
                 return true;
  
         return false;
  }
  
+void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp);
+
  #define XFS_DQ_IS_LOCKED(dqp)  (mutex_is_locked(&((dqp)->q_qlock)))
-#define XFS_DQ_IS_DIRTY(dqp)   ((dqp)->dq_flags & XFS_DQ_DIRTY)
-#define XFS_QM_ISUDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_USER)
-#define XFS_QM_ISPDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_PROJ)
-#define XFS_QM_ISGDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_GROUP)
+#define XFS_DQ_IS_DIRTY(dqp)   ((dqp)->q_flags & XFS_DQFLAG_DIRTY)
  
  void           xfs_qm_dqdestroy(struct xfs_dquot *dqp);
  int            xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp);
  void           xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
-void           xfs_qm_adjust_dqtimers(struct xfs_mount *mp,
-                                               struct xfs_dquot *d);
-void           xfs_qm_adjust_dqlimits(struct xfs_mount *mp,
-                                               struct xfs_dquot *d);
-xfs_dqid_t     xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type);
+void           xfs_qm_adjust_dqtimers(struct xfs_dquot *d);
+void           xfs_qm_adjust_dqlimits(struct xfs_dquot *d);
+xfs_dqid_t     xfs_qm_id_for_quotatype(struct xfs_inode *ip,
+                               xfs_dqtype_t type);
  int            xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
-                                       uint type, bool can_alloc,
-                                       struct xfs_dquot **dqpp);
-int            xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
-                                               bool can_alloc,
-                                               struct xfs_dquot **dqpp);
+                               xfs_dqtype_t type, bool can_alloc,
+                               struct xfs_dquot **dqpp);
+int            xfs_qm_dqget_inode(struct xfs_inode *ip, xfs_dqtype_t type,
+                               bool can_alloc, struct xfs_dquot **dqpp);
  int            xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
-                                       uint type, struct xfs_dquot **dqpp);
+                               xfs_dqtype_t type, struct xfs_dquot **dqpp);
  int            xfs_qm_dqget_uncached(struct xfs_mount *mp,
-                                               xfs_dqid_t id, uint type,
-                                               struct xfs_dquot **dqpp);
+                               xfs_dqid_t id, xfs_dqtype_t type,
+                               struct xfs_dquot **dqpp);
  void           xfs_qm_dqput(struct xfs_dquot *dqp);
  
  void           xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
@@ -183,9 +232,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
         return dqp;
  }
  
-typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype,
-               void *priv);
-int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype,
+typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
+               xfs_dqtype_t type, void *priv);
+int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
                 xfs_qm_dqiterate_fn iter_fn, void *priv);
  
  #endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c

index 349c92d..8c1fdf3 100644 (file)
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -45,6 +45,7 @@ xfs_qm_dquot_logitem_format(
         struct xfs_log_item     *lip,
         struct xfs_log_vec      *lv)
  {
+       struct xfs_disk_dquot   ddq;
         struct xfs_dq_logitem   *qlip = DQUOT_ITEM(lip);
         struct xfs_log_iovec    *vecp = NULL;
         struct xfs_dq_logformat *qlf;
@@ -52,14 +53,15 @@ xfs_qm_dquot_logitem_format(
         qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
         qlf->qlf_type = XFS_LI_DQUOT;
         qlf->qlf_size = 2;
-       qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+       qlf->qlf_id = qlip->qli_dquot->q_id;
         qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
         qlf->qlf_len = 1;
         qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
         xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
  
-       xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
-                       &qlip->qli_dquot->q_core,
+       xfs_dquot_to_disk(&ddq, qlip->qli_dquot);
+
+       xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &ddq,
                         sizeof(struct xfs_disk_dquot));
  }
  
@@ -113,23 +115,6 @@ xfs_qm_dqunpin_wait(
         wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
  }
  
-/*
- * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
- * have been failed during writeback
- *
- * this informs the AIL that the dquot is already flush locked on the next push,
- * and acquires a hold on the buffer to ensure that it isn't reclaimed before
- * dirty data makes it to disk.
- */
-STATIC void
-xfs_dquot_item_error(
-       struct xfs_log_item     *lip,
-       struct xfs_buf          *bp)
-{
-       ASSERT(!completion_done(&DQUOT_ITEM(lip)->qli_dquot->q_flush));
-       xfs_set_li_failed(lip, bp);
-}
-
  STATIC uint
  xfs_qm_dquot_logitem_push(
         struct xfs_log_item     *lip,
@@ -216,7 +201,6 @@ static const struct xfs_item_ops xfs_dquot_item_ops = {
         .iop_release    = xfs_qm_dquot_logitem_release,
         .iop_committing = xfs_qm_dquot_logitem_committing,
         .iop_push       = xfs_qm_dquot_logitem_push,
-       .iop_error      = xfs_dquot_item_error
  };
  
  /*
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c

index 3400be4..5875c7e 100644 (file)
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -39,7 +39,7 @@ xlog_recover_dquot_ra_pass2(
         if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
                 return;
  
-       type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+       type = recddq->d_type & XFS_DQTYPE_REC_MASK;
         ASSERT(type);
         if (log->l_quotaoffs_flag & type)
                 return;
@@ -91,7 +91,7 @@ xlog_recover_dquot_commit_pass2(
         /*
          * This type of quotas was turned off, so ignore this record.
          */
-       type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+       type = recddq->d_type & XFS_DQTYPE_REC_MASK;
         ASSERT(type);
         if (log->l_quotaoffs_flag & type)
                 return 0;
@@ -108,7 +108,7 @@ xlog_recover_dquot_commit_pass2(
          */
         dq_f = item->ri_buf[0].i_addr;
         ASSERT(dq_f);
-       fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
+       fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id);
         if (fa) {
                 xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
                                 dq_f->qlf_id, fa);
@@ -153,7 +153,7 @@ xlog_recover_dquot_commit_pass2(
  
         ASSERT(dq_f->qlf_size == 2);
         ASSERT(bp->b_mount == mp);
-       bp->b_iodone = xlog_recover_iodone;
+       bp->b_flags |= _XBF_LOGRECOVERY;
         xfs_buf_delwri_queue(bp, buffer_list);
  
  out_release:
@@ -185,11 +185,11 @@ xlog_recover_quotaoff_commit_pass1(
          * group/project quotaoff or both.
          */
         if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
-               log->l_quotaoffs_flag |= XFS_DQ_USER;
+               log->l_quotaoffs_flag |= XFS_DQTYPE_USER;
         if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
-               log->l_quotaoffs_flag |= XFS_DQ_PROJ;
+               log->l_quotaoffs_flag |= XFS_DQTYPE_PROJ;
         if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
-               log->l_quotaoffs_flag |= XFS_DQ_GROUP;
+               log->l_quotaoffs_flag |= XFS_DQTYPE_GROUP;
  
         return 0;
  }
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c

index b9c333b..6cb8cd1 100644 (file)
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -161,7 +161,8 @@ xfs_efi_init(
                         ((nextents - 1) * sizeof(xfs_extent_t)));
                 efip = kmem_zalloc(size, 0);
         } else {
-               efip = kmem_zone_zalloc(xfs_efi_zone, 0);
+               efip = kmem_cache_zalloc(xfs_efi_zone,
+                                        GFP_KERNEL | __GFP_NOFAIL);
         }
  
         xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -332,7 +333,8 @@ xfs_trans_get_efd(
                                 (nextents - 1) * sizeof(struct xfs_extent),
                                 0);
         } else {
-               efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
+               efdp = kmem_cache_zalloc(xfs_efd_zone,
+                                       GFP_KERNEL | __GFP_NOFAIL);
         }
  
         xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index d538411..c31cd3b 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -94,6 +94,7 @@ xfs_file_fsync(
  {
         struct inode            *inode = file->f_mapping->host;
         struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_inode_log_item *iip = ip->i_itemp;
         struct xfs_mount        *mp = ip->i_mount;
         int                     error = 0;
         int                     log_flushed = 0;
@@ -137,13 +138,15 @@ xfs_file_fsync(
         xfs_ilock(ip, XFS_ILOCK_SHARED);
         if (xfs_ipincount(ip)) {
                 if (!datasync ||
-                   (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
-                       lsn = ip->i_itemp->ili_last_lsn;
+                   (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+                       lsn = iip->ili_last_lsn;
         }
  
         if (lsn) {
                 error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
-               ip->i_itemp->ili_fsync_fields = 0;
+               spin_lock(&iip->ili_lock);
+               iip->ili_fsync_fields = 0;
+               spin_unlock(&iip->ili_lock);
         }
         xfs_iunlock(ip, XFS_ILOCK_SHARED);
  
@@ -1035,7 +1038,7 @@ xfs_file_remap_range(
         /* Prepare and then clone file data. */
         ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
                         &len, remap_flags);
-       if (ret < 0 || len == 0)
+       if (ret || len == 0)
                 return ret;
  
         trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
@@ -1065,7 +1068,7 @@ xfs_file_remap_range(
         if (mp->m_flags & XFS_MOUNT_WSYNC)
                 xfs_log_force_inode(dest);
  out_unlock:
-       xfs_reflink_remap_unlock(file_in, file_out);
+       xfs_iunlock2_io_mmap(src, dest);
         if (ret)
                 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
         return remapped > 0 ? remapped : ret;
@@ -1263,10 +1266,23 @@ xfs_filemap_pfn_mkwrite(
         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
  }
  
+static void
+xfs_filemap_map_pages(
+       struct vm_fault         *vmf,
+       pgoff_t                 start_pgoff,
+       pgoff_t                 end_pgoff)
+{
+       struct inode            *inode = file_inode(vmf->vma->vm_file);
+
+       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       filemap_map_pages(vmf, start_pgoff, end_pgoff);
+       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+}
+
  static const struct vm_operations_struct xfs_file_vm_ops = {
         .fault          = xfs_filemap_fault,
         .huge_fault     = xfs_filemap_huge_fault,
-       .map_pages      = filemap_map_pages,
+       .map_pages      = xfs_filemap_map_pages,
         .page_mkwrite   = xfs_filemap_page_mkwrite,
         .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
  };
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c

index 5daef65..101028e 100644 (file)
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -37,13 +37,11 @@ xfs_inode_alloc(
         struct xfs_inode        *ip;
  
         /*
-        * if this didn't occur in transactions, we could use
-        * KM_MAYFAIL and return NULL here on ENOMEM. Set the
-        * code up to do this anyway.
+        * XXX: If this didn't occur in transactions, we could drop __GFP_NOFAIL
+        * and return NULL here on ENOMEM.
          */
-       ip = kmem_zone_alloc(xfs_inode_zone, 0);
-       if (!ip)
-               return NULL;
+       ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
+
         if (inode_init_always(mp->m_super, VFS_I(ip))) {
                 kmem_cache_free(xfs_inode_zone, ip);
                 return NULL;
@@ -115,6 +113,7 @@ __xfs_inode_free(
  {
         /* asserts to verify all state is correct here */
         ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
         XFS_STATS_DEC(ip->i_mount, vn_active);
  
         call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
@@ -141,11 +140,8 @@ xfs_inode_free(
  }
  
  /*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
+ * Queue background inode reclaim work if there are reclaimable inodes and there
+ * isn't reclaim work already scheduled or in progress.
   */
  static void
  xfs_reclaim_work_queue(
@@ -160,24 +156,6 @@ xfs_reclaim_work_queue(
         rcu_read_unlock();
  }
  
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(to_delayed_work(work),
-                                       struct xfs_mount, m_reclaim_work);
-
-       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-       xfs_reclaim_work_queue(mp);
-}
-
  static void
  xfs_perag_set_reclaim_tag(
         struct xfs_perag        *pag)
@@ -618,48 +596,31 @@ out_destroy:
  }
  
  /*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
+ * Look up an inode by number in the given file system.  The inode is looked up
+ * in the cache held in each AG.  If the inode is found in the cache, initialise
+ * the vfs inode if necessary.
   *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
+ * If it is not in core, read it in from the file system's device, add it to the
+ * cache and initialise the vfs inode.
   *
   * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *              for xfs_ilock() for a list of valid values.
+ * Inode lookup is only done during metadata operations and not as part of the
+ * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
   */
  int
  xfs_iget(
-       xfs_mount_t     *mp,
-       xfs_trans_t     *tp,
-       xfs_ino_t       ino,
-       uint            flags,
-       uint            lock_flags,
-       xfs_inode_t     **ipp)
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       xfs_ino_t               ino,
+       uint                    flags,
+       uint                    lock_flags,
+       struct xfs_inode        **ipp)
  {
-       xfs_inode_t     *ip;
-       int             error;
-       xfs_perag_t     *pag;
-       xfs_agino_t     agino;
+       struct xfs_inode        *ip;
+       struct xfs_perag        *pag;
+       xfs_agino_t             agino;
+       int                     error;
  
-       /*
-        * xfs_reclaim_inode() uses the ILOCK to ensure an inode
-        * doesn't get freed while it's being referenced during a
-        * radix tree traversal here.  It assumes this function
-        * aqcuires only the ILOCK (and therefore it has no need to
-        * involve the IOLOCK in this synchronization).
-        */
         ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
  
         /* reject inode numbers outside existing AGs */
@@ -776,15 +737,7 @@ xfs_inode_walk_ag_grab(
  
         ASSERT(rcu_read_lock_held());
  
-       /*
-        * check for stale RCU freed inode
-        *
-        * If the inode has been reallocated, it doesn't matter if it's not in
-        * the AG we are walking - we are walking for writeback, so if it
-        * passes all the "valid inode" checks and is dirty, then we'll write
-        * it back anyway.  If it has been reallocated and still being
-        * initialised, the XFS_INEW check below will catch it.
-        */
+       /* Check for stale RCU freed inode */
         spin_lock(&ip->i_flags_lock);
         if (!ip->i_ino)
                 goto out_unlock_noent;
@@ -1028,107 +981,62 @@ xfs_cowblocks_worker(
  
  /*
   * Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
+ *
+ * We have found this inode via a lookup under RCU, so the inode may have
+ * already been freed, or it may be in the process of being recycled by
+ * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
+ * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
+ * will not be set. Hence we need to check for both these flag conditions to
+ * avoid inodes that are no longer reclaim candidates.
+ *
+ * Note: checking for other state flags here, under the i_flags_lock or not, is
+ * racy and should be avoided. Those races should be resolved only after we have
+ * ensured that we are able to reclaim this inode and the world can see that we
+ * are going to reclaim it.
+ *
+ * Return true if we grabbed it, false otherwise.
   */
-STATIC int
+static bool
  xfs_reclaim_inode_grab(
-       struct xfs_inode        *ip,
-       int                     flags)
+       struct xfs_inode        *ip)
  {
         ASSERT(rcu_read_lock_held());
  
-       /* quick check for stale RCU freed inode */
-       if (!ip->i_ino)
-               return 1;
-
-       /*
-        * If we are asked for non-blocking operation, do unlocked checks to
-        * see if the inode already is being flushed or in reclaim to avoid
-        * lock traffic.
-        */
-       if ((flags & SYNC_TRYLOCK) &&
-           __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
-               return 1;
-
-       /*
-        * The radix tree lock here protects a thread in xfs_iget from racing
-        * with us starting reclaim on the inode.  Once we have the
-        * XFS_IRECLAIM flag set it will not touch us.
-        *
-        * Due to RCU lookup, we may find inodes that have been freed and only
-        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
-        * aren't candidates for reclaim at all, so we must check the
-        * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
-        */
         spin_lock(&ip->i_flags_lock);
         if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
             __xfs_iflags_test(ip, XFS_IRECLAIM)) {
                 /* not a reclaim candidate. */
                 spin_unlock(&ip->i_flags_lock);
-               return 1;
+               return false;
         }
         __xfs_iflags_set(ip, XFS_IRECLAIM);
         spin_unlock(&ip->i_flags_lock);
-       return 0;
+       return true;
  }
  
  /*
- * Inodes in different states need to be treated differently. The following
- * table lists the inode states and the reclaim actions necessary:
- *
- *     inode state          iflush ret         required action
- *      ---------------      ----------         ---------------
- *     bad                     -               reclaim
- *     shutdown                EIO             unpin and reclaim
- *     clean, unpinned         0               reclaim
- *     stale, unpinned         0               reclaim
- *     clean, pinned(*)        0               requeue
- *     stale, pinned           EAGAIN          requeue
- *     dirty, async            -               requeue
- *     dirty, sync             0               reclaim
+ * Inode reclaim is non-blocking, so the default action if progress cannot be
+ * made is to "requeue" the inode for reclaim by unlocking it and clearing the
+ * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
+ * blocking anymore and hence we can wait for the inode to be unpinned before
+ * reclaiming it.
   *
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean.
- *
- * Note that because the inode is flushed delayed write by AIL pushing, the
- * flush lock may already be held here and waiting on it can result in very
- * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
- * the caller should push the AIL first before trying to reclaim inodes to
- * minimise the amount of time spent waiting.  For background relaim, we only
- * bother to reclaim clean inodes anyway.
- *
- * Hence the order of actions after gaining the locks should be:
- *     bad             => reclaim
- *     shutdown        => unpin and reclaim
- *     pinned, async   => requeue
- *     pinned, sync    => unpin
- *     stale           => reclaim
- *     clean           => reclaim
- *     dirty, async    => requeue
- *     dirty, sync     => flush, wait and reclaim
+ * We do no IO here - if callers require inodes to be cleaned they must push the
+ * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
+ * done in the background in a non-blocking manner, and enables memory reclaim
+ * to make progress without blocking.
   */
-STATIC int
+static void
  xfs_reclaim_inode(
         struct xfs_inode        *ip,
-       struct xfs_perag        *pag,
-       int                     sync_mode)
+       struct xfs_perag        *pag)
  {
-       struct xfs_buf          *bp = NULL;
         xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
-       int                     error;
  
-restart:
-       error = 0;
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       if (!xfs_iflock_nowait(ip)) {
-               if (!(sync_mode & SYNC_WAIT))
-                       goto out;
-               xfs_iflock(ip);
-       }
+       if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+               goto out;
+       if (!xfs_iflock_nowait(ip))
+               goto out_iunlock;
  
         if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                 xfs_iunpin_wait(ip);
@@ -1136,52 +1044,12 @@ restart:
                 xfs_iflush_abort(ip);
                 goto reclaim;
         }
-       if (xfs_ipincount(ip)) {
-               if (!(sync_mode & SYNC_WAIT))
-                       goto out_ifunlock;
-               xfs_iunpin_wait(ip);
-       }
-       if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
-               xfs_ifunlock(ip);
-               goto reclaim;
-       }
-
-       /*
-        * Never flush out dirty data during non-blocking reclaim, as it would
-        * just contend with AIL pushing trying to do the same job.
-        */
-       if (!(sync_mode & SYNC_WAIT))
+       if (xfs_ipincount(ip))
+               goto out_ifunlock;
+       if (!xfs_inode_clean(ip))
                 goto out_ifunlock;
  
-       /*
-        * Now we have an inode that needs flushing.
-        *
-        * Note that xfs_iflush will never block on the inode buffer lock, as
-        * xfs_ifree_cluster() can lock the inode buffer before it locks the
-        * ip->i_lock, and we are doing the exact opposite here.  As a result,
-        * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
-        * result in an ABBA deadlock with xfs_ifree_cluster().
-        *
-        * As xfs_ifree_cluser() must gather all inodes that are active in the
-        * cache to mark them stale, if we hit this case we don't actually want
-        * to do IO here - we want the inode marked stale so we can simply
-        * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
-        * inode, back off and try again.  Hopefully the next pass through will
-        * see the stale flag set on the inode.
-        */
-       error = xfs_iflush(ip, &bp);
-       if (error == -EAGAIN) {
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               /* backoff longer than in xfs_ifree_cluster */
-               delay(2);
-               goto restart;
-       }
-
-       if (!error) {
-               error = xfs_bwrite(bp);
-               xfs_buf_relse(bp);
-       }
-
+       xfs_ifunlock(ip);
  reclaim:
         ASSERT(!xfs_isiflocked(ip));
  
@@ -1228,23 +1096,17 @@ reclaim:
         xfs_ilock(ip, XFS_ILOCK_EXCL);
         xfs_qm_dqdetach(ip);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       ASSERT(xfs_inode_clean(ip));
  
         __xfs_inode_free(ip);
-       return error;
+       return;
  
  out_ifunlock:
         xfs_ifunlock(ip);
+out_iunlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
  out:
         xfs_iflags_clear(ip, XFS_IRECLAIM);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       /*
-        * We could return -EAGAIN here to make reclaim rescan the inode tree in
-        * a short while. However, this just burns CPU time scanning the tree
-        * waiting for IO to complete and the reclaim work never goes back to
-        * the idle state. Instead, return 0 to let the next scheduled
-        * background reclaim attempt to reclaim the inode again.
-        */
-       return 0;
  }
  
  /*
@@ -1252,23 +1114,19 @@ out:
   * corrupted, we still want to try to reclaim all the inodes. If we don't,
   * then a shut down during filesystem unmount reclaim walk leak all the
   * unreclaimed inodes.
+ *
+ * Nothing is returned: inodes that cannot be reclaimed right now are skipped,
+ * so callers that need every inode reclaimed must loop until no AG carries
+ * the reclaim tag, as xfs_reclaim_inodes() does.
   */
-STATIC int
+static void
  xfs_reclaim_inodes_ag(
         struct xfs_mount        *mp,
-       int                     flags,
         int                     *nr_to_scan)
  {
         struct xfs_perag        *pag;
-       int                     error = 0;
-       int                     last_error = 0;
-       xfs_agnumber_t          ag;
-       int                     trylock = flags & SYNC_TRYLOCK;
-       int                     skipped;
+       xfs_agnumber_t          ag = 0;
  
-restart:
-       ag = 0;
-       skipped = 0;
         while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
                 unsigned long   first_index = 0;
                 int             done = 0;
@@ -1276,16 +1134,7 @@ restart:
  
                 ag = pag->pag_agno + 1;
  
-               if (trylock) {
-                       if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
-                               skipped++;
-                               xfs_perag_put(pag);
-                               continue;
-                       }
-                       first_index = pag->pag_ici_reclaim_cursor;
-               } else
-                       mutex_lock(&pag->pag_ici_reclaim_lock);
-
+               first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
                 do {
                         struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                         int     i;
@@ -1309,7 +1158,7 @@ restart:
                         for (i = 0; i < nr_found; i++) {
                                 struct xfs_inode *ip = batch[i];
  
-                               if (done || xfs_reclaim_inode_grab(ip, flags))
+                               if (done || !xfs_reclaim_inode_grab(ip))
                                         batch[i] = NULL;
  
                                 /*
@@ -1338,59 +1187,39 @@ restart:
                         rcu_read_unlock();
  
                         for (i = 0; i < nr_found; i++) {
-                               if (!batch[i])
-                                       continue;
-                               error = xfs_reclaim_inode(batch[i], pag, flags);
-                               if (error && last_error != -EFSCORRUPTED)
-                                       last_error = error;
+                               if (batch[i])
+                                       xfs_reclaim_inode(batch[i], pag);
                         }
  
                         *nr_to_scan -= XFS_LOOKUP_BATCH;
-
                         cond_resched();
-
                 } while (nr_found && !done && *nr_to_scan > 0);
  
-               if (trylock && !done)
-                       pag->pag_ici_reclaim_cursor = first_index;
-               else
-                       pag->pag_ici_reclaim_cursor = 0;
-               mutex_unlock(&pag->pag_ici_reclaim_lock);
+               if (done)
+                       first_index = 0;
+               WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
                 xfs_perag_put(pag);
         }
-
-       /*
-        * if we skipped any AG, and we still have scan count remaining, do
-        * another pass this time using blocking reclaim semantics (i.e
-        * waiting on the reclaim locks and ignoring the reclaim cursors). This
-        * ensure that when we get more reclaimers than AGs we block rather
-        * than spin trying to execute reclaim.
-        */
-       if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
-               trylock = 0;
-               goto restart;
-       }
-       return last_error;
  }
  
-int
+void
  xfs_reclaim_inodes(
-       xfs_mount_t     *mp,
-       int             mode)
+       struct xfs_mount        *mp)
  {
         int             nr_to_scan = INT_MAX;
  
-       return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+       while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+               xfs_ail_push_all_sync(mp->m_ail);
+               xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+       };
  }
  
  /*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
+ * The shrinker infrastructure determines how many inodes we should scan for
+ * reclaim. We want as many clean inodes ready to reclaim as possible, so we
+ * push the AIL here. We also want to proactively free up memory if we can to
+ * minimise the amount of work memory reclaim has to do so we kick the
+ * background reclaim if it isn't already scheduled.
   */
  long
  xfs_reclaim_inodes_nr(
@@ -1401,7 +1230,8 @@ xfs_reclaim_inodes_nr(
         xfs_reclaim_work_queue(mp);
         xfs_ail_push_all(mp->m_ail);
  
-       return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+       xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+       return 0;
  }
  
  /*
@@ -1498,6 +1328,24 @@ xfs_inode_matches_eofb(
         return true;
  }
  
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low.
+ */
+void
+xfs_reclaim_worker(
+       struct work_struct *work)
+{
+       struct xfs_mount *mp = container_of(to_delayed_work(work),
+                                       struct xfs_mount, m_reclaim_work);
+       int             nr_to_scan = INT_MAX;
+
+       xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+       xfs_reclaim_work_queue(mp);
+}
+
  STATIC int
  xfs_inode_free_eofblocks(
         struct xfs_inode        *ip,
@@ -1574,7 +1422,7 @@ __xfs_inode_free_quota_eofblocks(
         eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
  
         if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
-               dq = xfs_inode_dquot(ip, XFS_DQ_USER);
+               dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER);
                 if (dq && xfs_dquot_lowsp(dq)) {
                         eofb.eof_uid = VFS_I(ip)->i_uid;
                         eofb.eof_flags |= XFS_EOF_FLAGS_UID;
@@ -1583,7 +1431,7 @@ __xfs_inode_free_quota_eofblocks(
         }
  
         if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
-               dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
+               dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP);
                 if (dq && xfs_dquot_lowsp(dq)) {
                         eofb.eof_gid = VFS_I(ip)->i_gid;
                         eofb.eof_flags |= XFS_EOF_FLAGS_GID;
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h

index 93b54e7..3a4c8b3 100644 (file)
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -17,9 +17,6 @@ struct xfs_eofblocks {
         __u64           eof_min_file_size;
  };
  
-#define SYNC_WAIT              0x0001  /* wait for i/o to complete */
-#define SYNC_TRYLOCK           0x0002  /* only try to lock inodes */
-
  /*
   * tags for inode radix tree
   */
@@ -51,7 +48,7 @@ void xfs_inode_free(struct xfs_inode *ip);
  
  void xfs_reclaim_worker(struct work_struct *work);
  
-int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+void xfs_reclaim_inodes(struct xfs_mount *mp);
  int xfs_reclaim_inodes_count(struct xfs_mount *mp);
  long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
  
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c

index 287a9e5..9b3994b 100644 (file)
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -97,7 +97,7 @@ xfs_icreate_log(
  {
         struct xfs_icreate_item *icp;
  
-       icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
+       icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL);
  
         xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
                           &xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 9aea7d6..407d629 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -44,7 +44,6 @@ kmem_zone_t *xfs_inode_zone;
   */
  #define        XFS_ITRUNC_MAX_EXTENTS  2
  
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
  STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
  STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
  
@@ -1740,10 +1739,31 @@ xfs_inactive_ifree(
                 return error;
         }
  
+       /*
+        * We do not hold the inode locked across the entire rolling transaction
+        * here. We only need to hold it for the first transaction that
+        * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
+        * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
+        * here breaks the relationship between cluster buffer invalidation and
+        * stale inode invalidation on cluster buffer item journal commit
+        * completion, and can result in leaving dirty stale inodes hanging
+        * around in memory.
+        *
+        * We have no need for serialising this inode operation against other
+        * operations - we freed the inode and hence reallocation is required
+        * and that will serialise on reallocating the space the deferops need
+        * to free. Hence we can unlock the inode on the first commit of
+        * the transaction rather than roll it right through the deferops. This
+        * avoids relogging the XFS_ISTALE inode.
+        *
+        * We check that xfs_ifree() hasn't grown an internal transaction roll
+        * by asserting that the inode is still locked when it returns.
+        */
         xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, 0);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
  
         error = xfs_ifree(tp, ip);
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
         if (error) {
                 /*
                  * If we fail to free the inode, shut down.  The cancel
@@ -1756,7 +1776,6 @@ xfs_inactive_ifree(
                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                 }
                 xfs_trans_cancel(tp);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
                 return error;
         }
  
@@ -1774,7 +1793,6 @@ xfs_inactive_ifree(
                 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
                         __func__, error);
  
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
         return 0;
  }
  
@@ -2147,7 +2165,6 @@ xfs_iunlink_update_dinode(
         xfs_dinode_calc_crc(mp, dip);
         xfs_trans_inode_buf(tp, ibp);
         xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
-       xfs_inobp_check(mp, ibp);
  }
  
  /* Set an in-core inode's unlinked pointer and return the old value. */
@@ -2248,7 +2265,6 @@ xfs_iunlink(
         }
  
         if (next_agino != NULLAGINO) {
-               struct xfs_perag        *pag;
                 xfs_agino_t             old_agino;
  
                 /*
@@ -2265,9 +2281,7 @@ xfs_iunlink(
                  * agino has been unlinked, add a backref from the next inode
                  * back to agino.
                  */
-               pag = xfs_perag_get(mp, agno);
-               error = xfs_iunlink_add_backref(pag, agino, next_agino);
-               xfs_perag_put(pag);
+               error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
                 if (error)
                         return error;
         }
@@ -2403,7 +2417,6 @@ xfs_iunlink_remove(
         struct xfs_buf          *agibp;
         struct xfs_buf          *last_ibp;
         struct xfs_dinode       *last_dip = NULL;
-       struct xfs_perag        *pag = NULL;
         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
         xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
         xfs_agino_t             next_agino;
@@ -2447,32 +2460,22 @@ xfs_iunlink_remove(
          * this inode's backref to point from the next inode.
          */
         if (next_agino != NULLAGINO) {
-               pag = xfs_perag_get(mp, agno);
-               error = xfs_iunlink_change_backref(pag, next_agino,
+               error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
                                 NULLAGINO);
                 if (error)
-                       goto out;
+                       return error;
         }
  
-       if (head_agino == agino) {
-               /* Point the head of the list to the next unlinked inode. */
-               error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
-                               next_agino);
-               if (error)
-                       goto out;
-       } else {
+       if (head_agino != agino) {
                 struct xfs_imap imap;
                 xfs_agino_t     prev_agino;
  
-               if (!pag)
-                       pag = xfs_perag_get(mp, agno);
-
                 /* We need to search the list for the inode being freed. */
                 error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
                                 &prev_agino, &imap, &last_dip, &last_ibp,
-                               pag);
+                               agibp->b_pag);
                 if (error)
-                       goto out;
+                       return error;
  
                 /* Point the previous inode on the list to the next inode. */
                 xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
@@ -2486,29 +2489,29 @@ xfs_iunlink_remove(
                  * change_backref takes care of deleting the backref if
                  * next_agino is NULLAGINO.
                  */
-               error = xfs_iunlink_change_backref(pag, agino, next_agino);
-               if (error)
-                       goto out;
+               return xfs_iunlink_change_backref(agibp->b_pag, agino,
+                               next_agino);
         }
  
-out:
-       if (pag)
-               xfs_perag_put(pag);
-       return error;
+       /* Point the head of the list to the next unlinked inode. */
+       return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+                       next_agino);
  }
  
  /*
- * Look up the inode number specified and mark it stale if it is found. If it is
- * dirty, return the inode so it can be attached to the cluster buffer so it can
- * be processed appropriately when the cluster free transaction completes.
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
+ * mark it stale. We should only find clean inodes in this lookup that aren't
+ * already stale.
   */
-static struct xfs_inode *
-xfs_ifree_get_one_inode(
-       struct xfs_perag        *pag,
+static void
+xfs_ifree_mark_inode_stale(
+       struct xfs_buf          *bp,
         struct xfs_inode        *free_ip,
         xfs_ino_t               inum)
  {
-       struct xfs_mount        *mp = pag->pag_mount;
+       struct xfs_mount        *mp = bp->b_mount;
+       struct xfs_perag        *pag = bp->b_pag;
+       struct xfs_inode_log_item *iip;
         struct xfs_inode        *ip;
  
  retry:
@@ -2516,8 +2519,10 @@ retry:
         ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
  
         /* Inode not in memory, nothing to do */
-       if (!ip)
-               goto out_rcu_unlock;
+       if (!ip) {
+               rcu_read_unlock();
+               return;
+       }
  
         /*
          * because this is an RCU protected lookup, we could find a recently
@@ -2528,9 +2533,9 @@ retry:
         spin_lock(&ip->i_flags_lock);
         if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
                 spin_unlock(&ip->i_flags_lock);
-               goto out_rcu_unlock;
+               rcu_read_unlock();
+               return;
         }
-       spin_unlock(&ip->i_flags_lock);
  
         /*
          * Don't try to lock/unlock the current inode, but we _cannot_ skip the
@@ -2540,43 +2545,50 @@ retry:
          */
         if (ip != free_ip) {
                 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+                       spin_unlock(&ip->i_flags_lock);
                         rcu_read_unlock();
                         delay(1);
                         goto retry;
                 }
-
-               /*
-                * Check the inode number again in case we're racing with
-                * freeing in xfs_reclaim_inode().  See the comments in that
-                * function for more information as to why the initial check is
-                * not sufficient.
-                */
-               if (ip->i_ino != inum) {
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                       goto out_rcu_unlock;
-               }
         }
+       ip->i_flags |= XFS_ISTALE;
+       spin_unlock(&ip->i_flags_lock);
         rcu_read_unlock();
  
-       xfs_iflock(ip);
-       xfs_iflags_set(ip, XFS_ISTALE);
+       /*
+        * If we can't get the flush lock, the inode is already attached.  All
+        * we needed to do here is mark the inode stale so buffer IO completion
+        * will remove it from the AIL.
+        */
+       iip = ip->i_itemp;
+       if (!xfs_iflock_nowait(ip)) {
+               ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+               ASSERT(iip->ili_last_fields);
+               goto out_iunlock;
+       }
  
         /*
-        * We don't need to attach clean inodes or those only with unlogged
-        * changes (which we throw away, anyway).
+        * Inodes not attached to the buffer can be released immediately.
+        * Everything else has to go through xfs_iflush_abort() on journal
+        * commit as the flock synchronises removal of the inode from the
+        * cluster buffer against inode reclaim.
          */
-       if (!ip->i_itemp || xfs_inode_clean(ip)) {
-               ASSERT(ip != free_ip);
+       if (!iip || list_empty(&iip->ili_item.li_bio_list)) {
                 xfs_ifunlock(ip);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               goto out_no_inode;
+               goto out_iunlock;
         }
-       return ip;
  
-out_rcu_unlock:
-       rcu_read_unlock();
-out_no_inode:
-       return NULL;
+       /* we have a dirty inode in memory that has not yet been flushed. */
+       spin_lock(&iip->ili_lock);
+       iip->ili_last_fields = iip->ili_fields;
+       iip->ili_fields = 0;
+       iip->ili_fsync_fields = 0;
+       spin_unlock(&iip->ili_lock);
+       ASSERT(iip->ili_last_fields);
+
+out_iunlock:
+       if (ip != free_ip)
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
  }
  
  /*
@@ -2586,26 +2598,20 @@ out_no_inode:
   */
  STATIC int
  xfs_ifree_cluster(
-       xfs_inode_t             *free_ip,
-       xfs_trans_t             *tp,
+       struct xfs_inode        *free_ip,
+       struct xfs_trans        *tp,
         struct xfs_icluster     *xic)
  {
-       xfs_mount_t             *mp = free_ip->i_mount;
+       struct xfs_mount        *mp = free_ip->i_mount;
+       struct xfs_ino_geometry *igeo = M_IGEO(mp);
+       struct xfs_buf          *bp;
+       xfs_daddr_t             blkno;
+       xfs_ino_t               inum = xic->first_ino;
         int                     nbufs;
         int                     i, j;
         int                     ioffset;
-       xfs_daddr_t             blkno;
-       xfs_buf_t               *bp;
-       xfs_inode_t             *ip;
-       struct xfs_inode_log_item *iip;
-       struct xfs_log_item     *lip;
-       struct xfs_perag        *pag;
-       struct xfs_ino_geometry *igeo = M_IGEO(mp);
-       xfs_ino_t               inum;
         int                     error;
  
-       inum = xic->first_ino;
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
         nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
  
         for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
@@ -2634,10 +2640,8 @@ xfs_ifree_cluster(
                 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
                                 mp->m_bsize * igeo->blocks_per_cluster,
                                 XBF_UNMAPPED, &bp);
-               if (error) {
-                       xfs_perag_put(pag);
+               if (error)
                         return error;
-               }
  
                 /*
                  * This buffer may not have been correctly initialised as we
@@ -2651,60 +2655,16 @@ xfs_ifree_cluster(
                 bp->b_ops = &xfs_inode_buf_ops;
  
                 /*
-                * Walk the inodes already attached to the buffer and mark them
-                * stale. These will all have the flush locks held, so an
-                * in-memory inode walk can't lock them. By marking them all
-                * stale first, we will not attempt to lock them in the loop
-                * below as the XFS_ISTALE flag will be set.
+                * Now we need to set all the cached clean inodes as XFS_ISTALE,
+                * too. This requires lookups, and will skip inodes that we've
+                * already marked XFS_ISTALE.
                  */
-               list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
-                       if (lip->li_type == XFS_LI_INODE) {
-                               iip = (struct xfs_inode_log_item *)lip;
-                               ASSERT(iip->ili_logged == 1);
-                               lip->li_cb = xfs_istale_done;
-                               xfs_trans_ail_copy_lsn(mp->m_ail,
-                                                       &iip->ili_flush_lsn,
-                                                       &iip->ili_item.li_lsn);
-                               xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-                       }
-               }
-
-
-               /*
-                * For each inode in memory attempt to add it to the inode
-                * buffer and set it up for being staled on buffer IO
-                * completion.  This is safe as we've locked out tail pushing
-                * and flushing by locking the buffer.
-                *
-                * We have already marked every inode that was part of a
-                * transaction stale above, which means there is no point in
-                * even trying to lock them.
-                */
-               for (i = 0; i < igeo->inodes_per_cluster; i++) {
-                       ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
-                       if (!ip)
-                               continue;
-
-                       iip = ip->i_itemp;
-                       iip->ili_last_fields = iip->ili_fields;
-                       iip->ili_fields = 0;
-                       iip->ili_fsync_fields = 0;
-                       iip->ili_logged = 1;
-                       xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
-                                               &iip->ili_item.li_lsn);
-
-                       xfs_buf_attach_iodone(bp, xfs_istale_done,
-                                                 &iip->ili_item);
-
-                       if (ip != free_ip)
-                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               }
+               for (i = 0; i < igeo->inodes_per_cluster; i++)
+                       xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
  
                 xfs_trans_stale_inode_buf(tp, bp);
                 xfs_trans_binval(tp, bp);
         }
-
-       xfs_perag_put(pag);
         return 0;
  }
  
@@ -2725,6 +2685,7 @@ xfs_ifree(
  {
         int                     error;
         struct xfs_icluster     xic = { 0 };
+       struct xfs_inode_log_item *iip = ip->i_itemp;
  
         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
         ASSERT(VFS_I(ip)->i_nlink == 0);
@@ -2762,7 +2723,9 @@ xfs_ifree(
         ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
  
         /* Don't attempt to replay owner changes for a deleted inode */
-       ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
+       spin_lock(&iip->ili_lock);
+       iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
+       spin_unlock(&iip->ili_lock);
  
         /*
          * Bump the generation count so no one will be confused
@@ -3469,231 +3432,8 @@ out_release_wip:
         return error;
  }
  
-STATIC int
-xfs_iflush_cluster(
-       struct xfs_inode        *ip,
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_perag        *pag;
-       unsigned long           first_index, mask;
-       int                     cilist_size;
-       struct xfs_inode        **cilist;
-       struct xfs_inode        *cip;
-       struct xfs_ino_geometry *igeo = M_IGEO(mp);
-       int                     error = 0;
-       int                     nr_found;
-       int                     clcount = 0;
-       int                     i;
-
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
-       cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
-       cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
-       if (!cilist)
-               goto out_put;
-
-       mask = ~(igeo->inodes_per_cluster - 1);
-       first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-       rcu_read_lock();
-       /* really need a gang lookup range call here */
-       nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
-                                       first_index, igeo->inodes_per_cluster);
-       if (nr_found == 0)
-               goto out_free;
-
-       for (i = 0; i < nr_found; i++) {
-               cip = cilist[i];
-               if (cip == ip)
-                       continue;
-
-               /*
-                * because this is an RCU protected lookup, we could find a
-                * recently freed or even reallocated inode during the lookup.
-                * We need to check under the i_flags_lock for a valid inode
-                * here. Skip it if it is not valid or the wrong inode.
-                */
-               spin_lock(&cip->i_flags_lock);
-               if (!cip->i_ino ||
-                   __xfs_iflags_test(cip, XFS_ISTALE)) {
-                       spin_unlock(&cip->i_flags_lock);
-                       continue;
-               }
-
-               /*
-                * Once we fall off the end of the cluster, no point checking
-                * any more inodes in the list because they will also all be
-                * outside the cluster.
-                */
-               if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
-                       spin_unlock(&cip->i_flags_lock);
-                       break;
-               }
-               spin_unlock(&cip->i_flags_lock);
-
-               /*
-                * Do an un-protected check to see if the inode is dirty and
-                * is a candidate for flushing.  These checks will be repeated
-                * later after the appropriate locks are acquired.
-                */
-               if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
-                       continue;
-
-               /*
-                * Try to get locks.  If any are unavailable or it is pinned,
-                * then this inode cannot be flushed and is skipped.
-                */
-
-               if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
-                       continue;
-               if (!xfs_iflock_nowait(cip)) {
-                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
-                       continue;
-               }
-               if (xfs_ipincount(cip)) {
-                       xfs_ifunlock(cip);
-                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
-                       continue;
-               }
-
-
-               /*
-                * Check the inode number again, just to be certain we are not
-                * racing with freeing in xfs_reclaim_inode(). See the comments
-                * in that function for more information as to why the initial
-                * check is not sufficient.
-                */
-               if (!cip->i_ino) {
-                       xfs_ifunlock(cip);
-                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
-                       continue;
-               }
-
-               /*
-                * arriving here means that this inode can be flushed.  First
-                * re-check that it's dirty before flushing.
-                */
-               if (!xfs_inode_clean(cip)) {
-                       error = xfs_iflush_int(cip, bp);
-                       if (error) {
-                               xfs_iunlock(cip, XFS_ILOCK_SHARED);
-                               goto out_free;
-                       }
-                       clcount++;
-               } else {
-                       xfs_ifunlock(cip);
-               }
-               xfs_iunlock(cip, XFS_ILOCK_SHARED);
-       }
-
-       if (clcount) {
-               XFS_STATS_INC(mp, xs_icluster_flushcnt);
-               XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
-       }
-
-out_free:
-       rcu_read_unlock();
-       kmem_free(cilist);
-out_put:
-       xfs_perag_put(pag);
-       return error;
-}
-
-/*
- * Flush dirty inode metadata into the backing buffer.
- *
- * The caller must have the inode lock and the inode flush lock held.  The
- * inode lock will still be held upon return to the caller, and the inode
- * flush lock will be released after the inode has reached the disk.
- *
- * The caller must write out the buffer returned in *bpp and release it.
- */
-int
+static int
  xfs_iflush(
-       struct xfs_inode        *ip,
-       struct xfs_buf          **bpp)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_buf          *bp = NULL;
-       struct xfs_dinode       *dip;
-       int                     error;
-
-       XFS_STATS_INC(mp, xs_iflush_count);
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-       ASSERT(xfs_isiflocked(ip));
-       ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
-              ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-
-       *bpp = NULL;
-
-       xfs_iunpin_wait(ip);
-
-       /*
-        * For stale inodes we cannot rely on the backing buffer remaining
-        * stale in cache for the remaining life of the stale inode and so
-        * xfs_imap_to_bp() below may give us a buffer that no longer contains
-        * inodes below. We have to check this after ensuring the inode is
-        * unpinned so that it is safe to reclaim the stale inode after the
-        * flush call.
-        */
-       if (xfs_iflags_test(ip, XFS_ISTALE)) {
-               xfs_ifunlock(ip);
-               return 0;
-       }
-
-       /*
-        * Get the buffer containing the on-disk inode. We are doing a try-lock
-        * operation here, so we may get an EAGAIN error. In that case, return
-        * leaving the inode dirty.
-        *
-        * If we get any other error, we effectively have a corruption situation
-        * and we cannot flush the inode. Abort the flush and shut down.
-        */
-       error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK);
-       if (error == -EAGAIN) {
-               xfs_ifunlock(ip);
-               return error;
-       }
-       if (error)
-               goto abort;
-
-       /*
-        * If the buffer is pinned then push on the log now so we won't
-        * get stuck waiting in the write for too long.
-        */
-       if (xfs_buf_ispinned(bp))
-               xfs_log_force(mp, 0);
-
-       /*
-        * Flush the provided inode then attempt to gather others from the
-        * cluster into the write.
-        *
-        * Note: Once we attempt to flush an inode, we must run buffer
-        * completion callbacks on any failure. If this fails, simulate an I/O
-        * failure on the buffer and shut down.
-        */
-       error = xfs_iflush_int(ip, bp);
-       if (!error)
-               error = xfs_iflush_cluster(ip, bp);
-       if (error) {
-               bp->b_flags |= XBF_ASYNC;
-               xfs_buf_ioend_fail(bp);
-               goto shutdown;
-       }
-
-       *bpp = bp;
-       return 0;
-
-abort:
-       xfs_iflush_abort(ip);
-shutdown:
-       xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-       return error;
-}
-
-STATIC int
-xfs_iflush_int(
         struct xfs_inode        *ip,
         struct xfs_buf          *bp)
  {
@@ -3706,7 +3446,7 @@ xfs_iflush_int(
         ASSERT(xfs_isiflocked(ip));
         ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
                ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-       ASSERT(iip != NULL && iip->ili_fields != 0);
+       ASSERT(iip->ili_item.li_buf == bp);
  
         dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
  
@@ -3801,7 +3541,6 @@ xfs_iflush_int(
         xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
         if (XFS_IFORK_Q(ip))
                 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
-       xfs_inobp_check(mp, bp);
  
         /*
          * We've recorded everything logged in the inode, so we'd like to clear
@@ -3818,40 +3557,148 @@ xfs_iflush_int(
          * know that the information those bits represent is permanently on
          * disk.  As long as the flush completes before the inode is logged
          * again, then both ili_fields and ili_last_fields will be cleared.
-        *
-        * We can play with the ili_fields bits here, because the inode lock
-        * must be held exclusively in order to set bits there and the flush
-        * lock protects the ili_last_fields bits.  Set ili_logged so the flush
-        * done routine can tell whether or not to look in the AIL.  Also, store
-        * the current LSN of the inode so that we can tell whether the item has
-        * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
-        * need the AIL lock, because it is a 64 bit value that cannot be read
-        * atomically.
          */
         error = 0;
  flush_out:
+       spin_lock(&iip->ili_lock);
         iip->ili_last_fields = iip->ili_fields;
         iip->ili_fields = 0;
         iip->ili_fsync_fields = 0;
-       iip->ili_logged = 1;
+       spin_unlock(&iip->ili_lock);
  
+       /*
+        * Store the current LSN of the inode so that we can tell whether the
+        * item has moved in the AIL from xfs_iflush_done().
+        */
         xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
                                 &iip->ili_item.li_lsn);
  
+       /* generate the checksum. */
+       xfs_dinode_calc_crc(mp, dip);
+       return error;
+}
+
+/*
+ * Non-blocking flush of dirty inode metadata into the backing buffer.
+ *
+ * The caller must have a reference to the inode and hold the cluster buffer
+ * locked. The function will walk across all the inodes on the cluster buffer it
+ * can find and lock without blocking, and flush them to the cluster buffer.
+ *
+ * On successful flushing of at least one inode, 0 is returned and the caller
+ * must write out the buffer and release it. If no inode gets through the walk,
+ * -EAGAIN will be returned and the caller needs to release the buffer. On
+ * failure, the filesystem will be shut down, the buffer will have been unlocked
+ * and released, and the error will be returned: -EIO when the
+ * abort-on-shutdown path below ran, otherwise whatever xfs_iflush() returned
+ * (this comment originally named EFSCORRUPTED as the failure value).
+ *
+ * NOTE(review): clcount is also incremented for inodes that turn out to be
+ * clean once they are locked, so a return of 0 does not strictly imply that
+ * inode data was copied into the buffer - confirm this is intended.
+ */
+int
+xfs_iflush_cluster(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_mount;
+       struct xfs_log_item     *lip, *n;
+       struct xfs_inode        *ip;
+       struct xfs_inode_log_item *iip;
+       int                     clcount = 0;
+       int                     error = 0;
+
         /*
-        * Attach the inode item callback to the buffer whether the flush
-        * succeeded or not. If not, the caller will shut down and fail I/O
-        * completion on the buffer to remove the inode from the AIL and release
-        * the flush lock.
+        * We must use the safe variant here as on shutdown xfs_iflush_abort()
+        * can remove itself from the list.
          */
-       xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+       list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+               iip = (struct xfs_inode_log_item *)lip;
+               ip = iip->ili_inode;
  
-       /* generate the checksum. */
-       xfs_dinode_calc_crc(mp, dip);
+               /*
+                * Quick and dirty check to avoid locks if possible. This
+                * unlocked peek is only an optimisation: the flags are tested
+                * again under i_flags_lock below, and the pin count is tested
+                * again once the inode has been flush locked.
+                */
+               if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+                       continue;
+               if (xfs_ipincount(ip))
+                       continue;
+
+               /*
+                * The inode is still attached to the buffer, which means it is
+                * dirty but reclaim might try to grab it. Check carefully for
+                * that, and grab the ilock while still holding the i_flags_lock
+                * to guarantee reclaim will not be able to reclaim this inode
+                * once we drop the i_flags_lock.
+                */
+               spin_lock(&ip->i_flags_lock);
+               ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+               if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
+                       spin_unlock(&ip->i_flags_lock);
+                       continue;
+               }
+
+               /*
+                * ILOCK will pin the inode against reclaim and prevent
+                * concurrent transactions modifying the inode while we are
+                * flushing the inode.
+                */
+               if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+                       spin_unlock(&ip->i_flags_lock);
+                       continue;
+               }
+               spin_unlock(&ip->i_flags_lock);
+
+               /*
+                * Skip inodes that are already flush locked as they have
+                * already been written to the buffer.
+                */
+               if (!xfs_iflock_nowait(ip)) {
+                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+                       continue;
+               }
+
+               /*
+                * Abort flushing this inode if we are shut down because the
+                * inode may not currently be in the AIL. This can occur when
+                * log I/O failure unpins the inode without inserting into the
+                * AIL, leaving a dirty/unpinned inode attached to the buffer
+                * that otherwise looks like it should be flushed.
+                *
+                * -EIO is recorded and the walk continues rather than breaking
+                * out, so the remaining inodes on the buffer are still visited
+                * before the buffer is failed after the loop.
+                */
+               if (XFS_FORCED_SHUTDOWN(mp)) {
+                       xfs_iunpin_wait(ip);
+                       /* xfs_iflush_abort() drops the flush lock */
+                       xfs_iflush_abort(ip);
+                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+                       error = -EIO;
+                       continue;
+               }
+
+               /* don't block waiting on a log force to unpin dirty inodes */
+               if (xfs_ipincount(ip)) {
+                       xfs_ifunlock(ip);
+                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+                       continue;
+               }
+
+               if (!xfs_inode_clean(ip))
+                       error = xfs_iflush(ip, bp);
+               else
+                       xfs_ifunlock(ip);
+               xfs_iunlock(ip, XFS_ILOCK_SHARED);
+               if (error)
+                       break;
+               clcount++;
+       }
+
+       if (error) {
+               bp->b_flags |= XBF_ASYNC;
+               xfs_buf_ioend_fail(bp);
+               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               return error;
+       }
+
+       if (!clcount)
+               return -EAGAIN;
+
+       XFS_STATS_INC(mp, xs_icluster_flushcnt);
+       XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
+       return 0;
  
-       ASSERT(!list_empty(&bp->b_li_list));
-       ASSERT(bp->b_iodone != NULL);
-       return error;
  }
  
  /* Release an inode. */
@@ -3881,3 +3728,96 @@ xfs_log_force_inode(
                 return 0;
         return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
  }
+
+/*
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding.  The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
+ *
+ * Note that src and dest are swapped locally when src has the higher address,
+ * so inside the function "src" simply names the inode that is locked first.
+ *
+ * Returns 0 with inode_lock() held on both inodes (taken only once when
+ * src == dest).  On any other return value the error comes from
+ * break_layout() and every lock taken here has already been dropped.  An
+ * -EWOULDBLOCK from the break_layout(..., false) calls made under the lock is
+ * never returned to the caller; it restarts the sequence at the retry label.
+ */
+static int
+xfs_iolock_two_inodes_and_break_layout(
+       struct inode            *src,
+       struct inode            *dest)
+{
+       int                     error;
+
+       if (src > dest)
+               swap(src, dest);
+
+retry:
+       /*
+        * Wait to break both inodes' layouts before we start locking.  No
+        * inode lock is held at this point, and dest is skipped when it is
+        * the same inode as src.
+        */
+       error = break_layout(src, true);
+       if (error)
+               return error;
+       if (src != dest) {
+               error = break_layout(dest, true);
+               if (error)
+                       return error;
+       }
+
+       /*
+        * Lock one inode and make sure nobody got in and leased it.  If the
+        * recheck fails the lock is dropped again; -EWOULDBLOCK starts over
+        * from the top, any other error is returned.
+        */
+       inode_lock(src);
+       error = break_layout(src, false);
+       if (error) {
+               inode_unlock(src);
+               if (error == -EWOULDBLOCK)
+                       goto retry;
+               return error;
+       }
+
+       if (src == dest)
+               return 0;
+
+       /*
+        * Lock the other inode and make sure nobody got in and leased it.
+        * On failure both locks are released before retrying or returning.
+        */
+       inode_lock_nested(dest, I_MUTEX_NONDIR2);
+       error = break_layout(dest, false);
+       if (error) {
+               inode_unlock(src);
+               inode_unlock(dest);
+               if (error == -EWOULDBLOCK)
+                       goto retry;
+               return error;
+       }
+
+       return 0;
+}
+
+/*
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
+ * mmap activity.
+ *
+ * The VFS inode locks are taken first (with layouts broken) through
+ * xfs_iolock_two_inodes_and_break_layout(), then XFS_MMAPLOCK_EXCL is taken
+ * on each inode.  ip1 and ip2 may be the same inode, in which case the
+ * MMAPLOCK is taken once on ip1.
+ *
+ * Returns 0 with all locks held, or the error from the iolock step, in which
+ * case no MMAPLOCK has been taken.
+ */
+int
+xfs_ilock2_io_mmap(
+       struct xfs_inode        *ip1,
+       struct xfs_inode        *ip2)
+{
+       int                     ret;
+
+       ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+       if (ret)
+               return ret;
+       if (ip1 == ip2)
+               xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+       else
+               xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
+                                   ip2, XFS_MMAPLOCK_EXCL);
+       return 0;
+}
+
+/*
+ * Unlock both inodes to allow IO and mmap activity.
+ *
+ * Drops XFS_MMAPLOCK_EXCL on the inodes first and the VFS inode locks after
+ * that.  When ip1 and ip2 are the same inode, each lock is released exactly
+ * once, through ip2.
+ */
+void
+xfs_iunlock2_io_mmap(
+       struct xfs_inode        *ip1,
+       struct xfs_inode        *ip2)
+{
+       bool                    same_inode = (ip1 == ip2);
+
+       xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+       if (!same_inode)
+               xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+       inode_unlock(VFS_I(ip2));
+       if (!same_inode)
+               inode_unlock(VFS_I(ip1));
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h

index 47d3b39..e9a8bb1 100644 (file)
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -426,7 +426,7 @@ int         xfs_log_force_inode(struct xfs_inode *ip);
  void           xfs_iunpin_wait(xfs_inode_t *);
  #define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
  
-int            xfs_iflush(struct xfs_inode *, struct xfs_buf **);
+int            xfs_iflush_cluster(struct xfs_buf *);
  void           xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
                                 struct xfs_inode *ip1, uint ip1_mode);
  
@@ -499,4 +499,7 @@ void xfs_iunlink_destroy(struct xfs_perag *pag);
  
  void xfs_end_io(struct work_struct *work);
  
+int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+
  #endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c

index ba47bf6..895f61b 100644 (file)
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -439,6 +439,7 @@ xfs_inode_item_pin(
         struct xfs_inode        *ip = INODE_ITEM(lip)->ili_inode;
  
         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(lip->li_buf);
  
         trace_xfs_inode_pin(ip, _RET_IP_);
         atomic_inc(&ip->i_pincount);
@@ -450,6 +451,12 @@ xfs_inode_item_pin(
   * item which was previously pinned with a call to xfs_inode_item_pin().
   *
   * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
+ *
+ * Note that unpin can race with inode cluster buffer freeing marking the buffer
+ * stale. In that case, flush completions are run from the buffer unpin call,
+ * which may happen before the inode is unpinned. If we lose the race, there
+ * will be no buffer attached to the log item, but the inode will be marked
+ * XFS_ISTALE.
   */
  STATIC void
  xfs_inode_item_unpin(
@@ -459,28 +466,12 @@ xfs_inode_item_unpin(
         struct xfs_inode        *ip = INODE_ITEM(lip)->ili_inode;
  
         trace_xfs_inode_unpin(ip, _RET_IP_);
+       ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
         ASSERT(atomic_read(&ip->i_pincount) > 0);
         if (atomic_dec_and_test(&ip->i_pincount))
                 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
  }
  
-/*
- * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
- * have been failed during writeback
- *
- * This informs the AIL that the inode is already flush locked on the next push,
- * and acquires a hold on the buffer to ensure that it isn't reclaimed before
- * dirty data makes it to disk.
- */
-STATIC void
-xfs_inode_item_error(
-       struct xfs_log_item     *lip,
-       struct xfs_buf          *bp)
-{
-       ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
-       xfs_set_li_failed(lip, bp);
-}
-
  STATIC uint
  xfs_inode_item_push(
         struct xfs_log_item     *lip,
@@ -494,55 +485,44 @@ xfs_inode_item_push(
         uint                    rval = XFS_ITEM_SUCCESS;
         int                     error;
  
-       if (xfs_ipincount(ip) > 0)
+       ASSERT(iip->ili_item.li_buf);
+
+       if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) ||
+           (ip->i_flags & XFS_ISTALE))
                 return XFS_ITEM_PINNED;
  
-       if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
-               return XFS_ITEM_LOCKED;
+       /* If the inode is already flush locked, we're already flushing. */
+       if (xfs_isiflocked(ip))
+               return XFS_ITEM_FLUSHING;
  
-       /*
-        * Re-check the pincount now that we stabilized the value by
-        * taking the ilock.
-        */
-       if (xfs_ipincount(ip) > 0) {
-               rval = XFS_ITEM_PINNED;
-               goto out_unlock;
-       }
+       if (!xfs_buf_trylock(bp))
+               return XFS_ITEM_LOCKED;
  
-       /*
-        * Stale inode items should force out the iclog.
-        */
-       if (ip->i_flags & XFS_ISTALE) {
-               rval = XFS_ITEM_PINNED;
-               goto out_unlock;
-       }
+       spin_unlock(&lip->li_ailp->ail_lock);
  
         /*
-        * Someone else is already flushing the inode.  Nothing we can do
-        * here but wait for the flush to finish and remove the item from
-        * the AIL.
+        * We need to hold a reference for flushing the cluster buffer as it may
+        * fail the buffer without IO submission. In which case, we better get a
+        * reference for that completion because otherwise we don't get a
+        * reference for IO until we queue the buffer for delwri submission.
          */
-       if (!xfs_iflock_nowait(ip)) {
-               rval = XFS_ITEM_FLUSHING;
-               goto out_unlock;
-       }
-
-       ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
-       ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
-
-       spin_unlock(&lip->li_ailp->ail_lock);
-
-       error = xfs_iflush(ip, &bp);
+       xfs_buf_hold(bp);
+       error = xfs_iflush_cluster(bp);
         if (!error) {
                 if (!xfs_buf_delwri_queue(bp, buffer_list))
                         rval = XFS_ITEM_FLUSHING;
                 xfs_buf_relse(bp);
-       } else if (error == -EAGAIN)
+       } else {
+               /*
+                * Release the buffer if we were unable to flush anything. On
+                * any other error, the buffer has already been released.
+                */
+               if (error == -EAGAIN)
+                       xfs_buf_relse(bp);
                 rval = XFS_ITEM_LOCKED;
+       }
  
         spin_lock(&lip->li_ailp->ail_lock);
-out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
         return rval;
  }
  
@@ -621,7 +601,6 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
         .iop_committed  = xfs_inode_item_committed,
         .iop_push       = xfs_inode_item_push,
         .iop_committing = xfs_inode_item_committing,
-       .iop_error      = xfs_inode_item_error
  };
  
  
@@ -636,9 +615,11 @@ xfs_inode_item_init(
         struct xfs_inode_log_item *iip;
  
         ASSERT(ip->i_itemp == NULL);
-       iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
+       iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone,
+                                             GFP_KERNEL | __GFP_NOFAIL);
  
         iip->ili_inode = ip;
+       spin_lock_init(&iip->ili_lock);
         xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
                                                 &xfs_inode_item_ops);
  }
@@ -648,110 +629,129 @@ xfs_inode_item_init(
   */
  void
  xfs_inode_item_destroy(
-       xfs_inode_t     *ip)
+       struct xfs_inode        *ip)
  {
-       kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
-       kmem_cache_free(xfs_ili_zone, ip->i_itemp);
+       struct xfs_inode_log_item *iip = ip->i_itemp;
+
+       ASSERT(iip->ili_item.li_buf == NULL);
+
+       ip->i_itemp = NULL;
+       kmem_free(iip->ili_item.li_lv_shadow);
+       kmem_cache_free(xfs_ili_zone, iip);
  }
  
  
  /*
- * This is the inode flushing I/O completion routine.  It is called
- * from interrupt level when the buffer containing the inode is
- * flushed to disk.  It is responsible for removing the inode item
- * from the AIL if it has not been re-logged, and unlocking the inode's
- * flush lock.
- *
- * To reduce AIL lock traffic as much as possible, we scan the buffer log item
- * list for other inodes that will run this function. We remove them from the
- * buffer list so we can process all the inode IO completions in one AIL lock
- * traversal.
+ * We only want to pull the item from the AIL if it is actually there
+ * and its location in the log has not changed since we started the
+ * flush.  Thus, we only bother if the inode's lsn has not changed.
   */
-void
-xfs_iflush_done(
-       struct xfs_buf          *bp,
-       struct xfs_log_item     *lip)
+static void
+xfs_iflush_ail_updates(
+       struct xfs_ail          *ailp,
+       struct list_head        *list)
  {
-       struct xfs_inode_log_item *iip;
-       struct xfs_log_item     *blip, *n;
-       struct xfs_ail          *ailp = lip->li_ailp;
-       int                     need_ail = 0;
-       LIST_HEAD(tmp);
+       struct xfs_log_item     *lip;
+       xfs_lsn_t               tail_lsn = 0;
  
-       /*
-        * Scan the buffer IO completions for other inodes being completed and
-        * attach them to the current inode log item.
-        */
+       /* this is an opencoded batch version of xfs_trans_ail_delete */
+       spin_lock(&ailp->ail_lock);
+       list_for_each_entry(lip, list, li_bio_list) {
+               xfs_lsn_t       lsn;
  
-       list_add_tail(&lip->li_bio_list, &tmp);
-
-       list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) {
-               if (lip->li_cb != xfs_iflush_done)
+               clear_bit(XFS_LI_FAILED, &lip->li_flags);
+               if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn)
                         continue;
  
-               list_move_tail(&blip->li_bio_list, &tmp);
+               lsn = xfs_ail_delete_one(ailp, lip);
+               if (!tail_lsn && lsn)
+                       tail_lsn = lsn;
+       }
+       xfs_ail_update_finish(ailp, tail_lsn);
+}
+
+/*
+ * Walk the list of inodes that have completed their IOs. If they are clean
+ * remove them from the list and dissociate them from the buffer. Inodes that
+ * are still dirty remain linked to the buffer and on the list. Caller must
+ * handle them appropriately.
+ */
+static void
+xfs_iflush_finish(
+       struct xfs_buf          *bp,
+       struct list_head        *list)
+{
+       struct xfs_log_item     *lip, *n;
+
+       list_for_each_entry_safe(lip, n, list, li_bio_list) {
+               struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+               bool    drop_buffer = false;
+
+               spin_lock(&iip->ili_lock);
+
                 /*
-                * while we have the item, do the unlocked check for needing
-                * the AIL lock.
+                * Remove the reference to the cluster buffer if the inode is
+                * clean in memory and drop the buffer reference once we've
+                * dropped the locks we hold.
                  */
-               iip = INODE_ITEM(blip);
-               if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
-                   test_bit(XFS_LI_FAILED, &blip->li_flags))
-                       need_ail++;
+               ASSERT(iip->ili_item.li_buf == bp);
+               if (!iip->ili_fields) {
+                       iip->ili_item.li_buf = NULL;
+                       list_del_init(&lip->li_bio_list);
+                       drop_buffer = true;
+               }
+               iip->ili_last_fields = 0;
+               iip->ili_flush_lsn = 0;
+               spin_unlock(&iip->ili_lock);
+               xfs_ifunlock(iip->ili_inode);
+               if (drop_buffer)
+                       xfs_buf_rele(bp);
         }
+}
  
-       /* make sure we capture the state of the initial inode. */
-       iip = INODE_ITEM(lip);
-       if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
-           test_bit(XFS_LI_FAILED, &lip->li_flags))
-               need_ail++;
+/*
+ * Inode buffer IO completion routine.  It is responsible for removing inodes
+ * attached to the buffer from the AIL if they have not been re-logged, as well
+ * as completing the flush and unlocking the inode.
+ */
+void
+xfs_iflush_done(
+       struct xfs_buf          *bp)
+{
+       struct xfs_log_item     *lip, *n;
+       LIST_HEAD(flushed_inodes);
+       LIST_HEAD(ail_updates);
  
         /*
-        * We only want to pull the item from the AIL if it is
-        * actually there and its location in the log has not
-        * changed since we started the flush.  Thus, we only bother
-        * if the ili_logged flag is set and the inode's lsn has not
-        * changed.  First we check the lsn outside
-        * the lock since it's cheaper, and then we recheck while
-        * holding the lock before removing the inode from the AIL.
+        * Pull the attached inodes from the buffer one at a time and take the
+        * appropriate action on them.
          */
-       if (need_ail) {
-               xfs_lsn_t       tail_lsn = 0;
-
-               /* this is an opencoded batch version of xfs_trans_ail_delete */
-               spin_lock(&ailp->ail_lock);
-               list_for_each_entry(blip, &tmp, li_bio_list) {
-                       if (INODE_ITEM(blip)->ili_logged &&
-                           blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) {
-                               /*
-                                * xfs_ail_update_finish() only cares about the
-                                * lsn of the first tail item removed, any
-                                * others will be at the same or higher lsn so
-                                * we just ignore them.
-                                */
-                               xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip);
-                               if (!tail_lsn && lsn)
-                                       tail_lsn = lsn;
-                       } else {
-                               xfs_clear_li_failed(blip);
-                       }
+       list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+               struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+               if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) {
+                       xfs_iflush_abort(iip->ili_inode);
+                       continue;
                 }
-               xfs_ail_update_finish(ailp, tail_lsn);
+               if (!iip->ili_last_fields)
+                       continue;
+
+               /* Do an unlocked check for needing the AIL lock. */
+               if (iip->ili_flush_lsn == lip->li_lsn ||
+                   test_bit(XFS_LI_FAILED, &lip->li_flags))
+                       list_move_tail(&lip->li_bio_list, &ail_updates);
+               else
+                       list_move_tail(&lip->li_bio_list, &flushed_inodes);
         }
  
-       /*
-        * clean up and unlock the flush lock now we are done. We can clear the
-        * ili_last_fields bits now that we know that the data corresponding to
-        * them is safely on disk.
-        */
-       list_for_each_entry_safe(blip, n, &tmp, li_bio_list) {
-               list_del_init(&blip->li_bio_list);
-               iip = INODE_ITEM(blip);
-               iip->ili_logged = 0;
-               iip->ili_last_fields = 0;
-               xfs_ifunlock(iip->ili_inode);
+       if (!list_empty(&ail_updates)) {
+               xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates);
+               list_splice_tail(&ail_updates, &flushed_inodes);
         }
-       list_del(&tmp);
+
+       xfs_iflush_finish(bp, &flushed_inodes);
+       if (!list_empty(&flushed_inodes))
+               list_splice_tail(&flushed_inodes, &bp->b_li_list);
  }
  
  /*
@@ -762,37 +762,37 @@ xfs_iflush_done(
   */
  void
  xfs_iflush_abort(
-       struct xfs_inode                *ip)
+       struct xfs_inode        *ip)
  {
-       struct xfs_inode_log_item       *iip = ip->i_itemp;
+       struct xfs_inode_log_item *iip = ip->i_itemp;
+       struct xfs_buf          *bp = NULL;
  
         if (iip) {
-               xfs_trans_ail_delete(&iip->ili_item, 0);
-               iip->ili_logged = 0;
                 /*
-                * Clear the ili_last_fields bits now that we know that the
-                * data corresponding to them is safely on disk.
+                * Clear the failed bit before removing the item from the AIL so
+                * xfs_trans_ail_delete() doesn't try to clear and release the
+                * buffer attached to the log item before we are done with it.
                  */
-               iip->ili_last_fields = 0;
+               clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
+               xfs_trans_ail_delete(&iip->ili_item, 0);
+
                 /*
                  * Clear the inode logging fields so no more flushes are
                  * attempted.
                  */
+               spin_lock(&iip->ili_lock);
+               iip->ili_last_fields = 0;
                 iip->ili_fields = 0;
                 iip->ili_fsync_fields = 0;
+               iip->ili_flush_lsn = 0;
+               bp = iip->ili_item.li_buf;
+               iip->ili_item.li_buf = NULL;
+               list_del_init(&iip->ili_item.li_bio_list);
+               spin_unlock(&iip->ili_lock);
         }
-       /*
-        * Release the inode's flush lock since we're done with it.
-        */
         xfs_ifunlock(ip);
-}
-
-void
-xfs_istale_done(
-       struct xfs_buf          *bp,
-       struct xfs_log_item     *lip)
-{
-       xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
+       if (bp)
+               xfs_buf_rele(bp);
  }
  
  /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h

index 60b34bb..048b5e7 100644 (file)
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -16,24 +16,34 @@ struct xfs_mount;
  struct xfs_inode_log_item {
         struct xfs_log_item     ili_item;          /* common portion */
         struct xfs_inode        *ili_inode;        /* inode ptr */
-       xfs_lsn_t               ili_flush_lsn;     /* lsn at last flush */
-       xfs_lsn_t               ili_last_lsn;      /* lsn at last transaction */
-       unsigned short          ili_lock_flags;    /* lock flags */
-       unsigned short          ili_logged;        /* flushed logged data */
+       unsigned short          ili_lock_flags;    /* inode lock flags */
+       /*
+        * The ili_lock protects the interactions between the dirty state and
+        * the flush state of the inode log item. This allows us to do atomic
+        * modifications of multiple state fields without having to hold a
+        * specific inode lock to serialise them.
+        *
+        * We need atomic changes between inode dirtying, inode flushing and
+        * inode completion, but these all hold different combinations of
+        * ILOCK and iflock and hence we need some other method of serialising
+        * updates to the flush state.
+        */
+       spinlock_t              ili_lock;          /* flush state lock */
         unsigned int            ili_last_fields;   /* fields when flushed */
         unsigned int            ili_fields;        /* fields to be logged */
         unsigned int            ili_fsync_fields;  /* logged since last fsync */
+       xfs_lsn_t               ili_flush_lsn;     /* lsn at last flush */
+       xfs_lsn_t               ili_last_lsn;      /* lsn at last transaction */
  };
  
-static inline int xfs_inode_clean(xfs_inode_t *ip)
+static inline int xfs_inode_clean(struct xfs_inode *ip)
  {
         return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
  }
  
  extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
  extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
-extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_iflush_done(struct xfs_buf *);
  extern void xfs_iflush_abort(struct xfs_inode *);
  extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
                                          struct xfs_inode_log_format *);
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c

index dc3e26f..5e0d291 100644 (file)
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -376,7 +376,7 @@ out_owner_change:
         xfs_dinode_calc_crc(log->l_mp, dip);
  
         ASSERT(bp->b_mount == mp);
-       bp->b_iodone = xlog_recover_iodone;
+       bp->b_flags |= _XBF_LOGRECOVERY;
         xfs_buf_delwri_queue(bp, buffer_list);
  
  out_release:
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c

index a190212..6f22a66 100644 (file)
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1075,13 +1075,18 @@ xfs_merge_ioc_xflags(
                 xflags |= FS_XFLAG_NODUMP;
         else
                 xflags &= ~FS_XFLAG_NODUMP;
+       if (flags & FS_DAX_FL)
+               xflags |= FS_XFLAG_DAX;
+       else
+               xflags &= ~FS_XFLAG_DAX;
  
         return xflags;
  }
  
  STATIC unsigned int
  xfs_di2lxflags(
-       uint16_t        di_flags)
+       uint16_t        di_flags,
+       uint64_t        di_flags2)
  {
         unsigned int    flags = 0;
  
@@ -1095,6 +1100,9 @@ xfs_di2lxflags(
                 flags |= FS_NOATIME_FL;
         if (di_flags & XFS_DIFLAG_NODUMP)
                 flags |= FS_NODUMP_FL;
+       if (di_flags2 & XFS_DIFLAG2_DAX) {
+               flags |= FS_DAX_FL;
+       }
         return flags;
  }
  
@@ -1565,7 +1573,7 @@ xfs_ioc_getxflags(
  {
         unsigned int            flags;
  
-       flags = xfs_di2lxflags(ip->i_d.di_flags);
+       flags = xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_flags2);
         if (copy_to_user(arg, &flags, sizeof(flags)))
                 return -EFAULT;
         return 0;
@@ -1588,7 +1596,7 @@ xfs_ioc_setxflags(
  
         if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
                       FS_NOATIME_FL | FS_NODUMP_FL | \
-                     FS_SYNC_FL))
+                     FS_SYNC_FL | FS_DAX_FL))
                 return -EOPNOTSUPP;
  
         fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c

index b9a8c37..0e3f62c 100644 (file)
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -293,11 +293,11 @@ out_trans_cancel:
  
  STATIC bool
  xfs_quota_need_throttle(
-       struct xfs_inode *ip,
-       int type,
-       xfs_fsblock_t alloc_blocks)
+       struct xfs_inode        *ip,
+       xfs_dqtype_t            type,
+       xfs_fsblock_t           alloc_blocks)
  {
-       struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+       struct xfs_dquot        *dq = xfs_inode_dquot(ip, type);
  
         if (!dq || !xfs_this_quota_on(ip->i_mount, type))
                 return false;
@@ -307,7 +307,7 @@ xfs_quota_need_throttle(
                 return false;
  
         /* under the lo watermark, no throttle */
-       if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
+       if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
                 return false;
  
         return true;
@@ -315,24 +315,24 @@ xfs_quota_need_throttle(
  
  STATIC void
  xfs_quota_calc_throttle(
-       struct xfs_inode *ip,
-       int type,
-       xfs_fsblock_t *qblocks,
-       int *qshift,
-       int64_t *qfreesp)
+       struct xfs_inode        *ip,
+       xfs_dqtype_t            type,
+       xfs_fsblock_t           *qblocks,
+       int                     *qshift,
+       int64_t                 *qfreesp)
  {
-       int64_t freesp;
-       int shift = 0;
-       struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+       struct xfs_dquot        *dq = xfs_inode_dquot(ip, type);
+       int64_t                 freesp;
+       int                     shift = 0;
  
         /* no dq, or over hi wmark, squash the prealloc completely */
-       if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+       if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
                 *qblocks = 0;
                 *qfreesp = 0;
                 return;
         }
  
-       freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
+       freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
         if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
                 shift = 2;
                 if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
@@ -450,14 +450,14 @@ xfs_iomap_prealloc_size(
          * Check each quota to cap the prealloc size, provide a shift value to
          * throttle with and adjust amount of available space.
          */
-       if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
-               xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
+       if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks))
+               xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift,
                                         &freesp);
-       if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
-               xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
+       if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks))
+               xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift,
                                         &freesp);
-       if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
-               xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
+       if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks))
+               xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift,
                                         &freesp);
  
         /*
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h

index 9f70d2f..ab737fe 100644 (file)
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -102,12 +102,8 @@ typedef __u32                      xfs_nlink_t;
  #define xfs_cowb_secs          xfs_params.cowb_timer.val
  
  #define current_cpu()          (raw_smp_processor_id())
-#define current_pid()          (current->pid)
-#define current_test_flags(f)  (current->flags & (f))
  #define current_set_flags_nested(sp, f)                \
                 (*(sp) = current->flags, current->flags |= (f))
-#define current_clear_flags_nested(sp, f)      \
-               (*(sp) = current->flags, current->flags &= ~(f))
  #define current_restore_flags_nested(sp, f)    \
                 (current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
  
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c

index 00fda2e..ad0c69e 100644 (file)
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -433,7 +433,7 @@ xfs_log_reserve(
         XFS_STATS_INC(mp, xs_try_logspace);
  
         ASSERT(*ticp == NULL);
-       tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
+       tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
         *ticp = tic;
  
         xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -3408,15 +3408,12 @@ xlog_ticket_alloc(
         int                     unit_bytes,
         int                     cnt,
         char                    client,
-       bool                    permanent,
-       xfs_km_flags_t          alloc_flags)
+       bool                    permanent)
  {
         struct xlog_ticket      *tic;
         int                     unit_res;
  
-       tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
-       if (!tic)
-               return NULL;
+       tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
  
         unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
  
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c

index 9ed9036..56c32ee 100644 (file)
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -37,8 +37,7 @@ xlog_cil_ticket_alloc(
  {
         struct xlog_ticket *tic;
  
-       tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
-                               KM_NOFS);
+       tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);
  
         /*
          * set the current reservation to zero so we know to steal the basic
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h

index 75a6287..1c6fdbf 100644 (file)
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -464,9 +464,7 @@ xlog_ticket_alloc(
         int             unit_bytes,
         int             count,
         char            client,
-       bool            permanent,
-       xfs_km_flags_t  alloc_flags);
-
+       bool            permanent);
  
  static inline void
  xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c

index ec015df..52a65a7 100644 (file)
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -287,9 +287,8 @@ xlog_recover_iodone(
         if (bp->b_log_item)
                 xfs_buf_item_relse(bp);
         ASSERT(bp->b_log_item == NULL);
-
-       bp->b_iodone = NULL;
-       xfs_buf_ioend(bp);
+       bp->b_flags &= ~_XBF_LOGRECOVERY;
+       xfs_buf_ioend_finish(bp);
  }
  
  /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c

index d5dcf98..c8ae49a 100644 (file)
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -148,7 +148,6 @@ xfs_free_perag(
                 ASSERT(atomic_read(&pag->pag_ref) == 0);
                 xfs_iunlink_destroy(pag);
                 xfs_buf_hash_destroy(pag);
-               mutex_destroy(&pag->pag_ici_reclaim_lock);
                 call_rcu(&pag->rcu_head, __xfs_free_perag);
         }
  }
@@ -200,7 +199,6 @@ xfs_initialize_perag(
                 pag->pag_agno = index;
                 pag->pag_mount = mp;
                 spin_lock_init(&pag->pag_ici_lock);
-               mutex_init(&pag->pag_ici_reclaim_lock);
                 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
                 if (xfs_buf_hash_init(pag))
                         goto out_free_pag;
@@ -242,7 +240,6 @@ xfs_initialize_perag(
  out_hash_destroy:
         xfs_buf_hash_destroy(pag);
  out_free_pag:
-       mutex_destroy(&pag->pag_ici_reclaim_lock);
         kmem_free(pag);
  out_unwind_new_pags:
         /* unwind any prior newly initialized pags */
@@ -252,7 +249,6 @@ out_unwind_new_pags:
                         break;
                 xfs_buf_hash_destroy(pag);
                 xfs_iunlink_destroy(pag);
-               mutex_destroy(&pag->pag_ici_reclaim_lock);
                 kmem_free(pag);
         }
         return error;
@@ -1015,7 +1011,7 @@ xfs_mountfs(
          * quota inodes.
          */
         cancel_delayed_work_sync(&mp->m_reclaim_work);
-       xfs_reclaim_inodes(mp, SYNC_WAIT);
+       xfs_reclaim_inodes(mp);
         xfs_health_unmount(mp);
   out_log_dealloc:
         mp->m_flags |= XFS_MOUNT_UNMOUNTING;
@@ -1092,13 +1088,12 @@ xfs_unmountfs(
         xfs_ail_push_all_sync(mp->m_ail);
  
         /*
-        * And reclaim all inodes.  At this point there should be no dirty
-        * inodes and none should be pinned or locked, but use synchronous
-        * reclaim just to be sure. We can stop background inode reclaim
-        * here as well if it is still running.
+        * Reclaim all inodes. At this point there should be no dirty inodes and
+        * none should be pinned or locked. Stop background inode reclaim here
+        * if it is still running.
          */
         cancel_delayed_work_sync(&mp->m_reclaim_work);
-       xfs_reclaim_inodes(mp, SYNC_WAIT);
+       xfs_reclaim_inodes(mp);
         xfs_health_unmount(mp);
  
         xfs_qm_unmount(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h

index 3725d25..a72cfca 100644 (file)
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -354,7 +354,6 @@ typedef struct xfs_perag {
         spinlock_t      pag_ici_lock;   /* incore inode cache lock */
         struct radix_tree_root pag_ici_root;    /* incore inode cache root */
         int             pag_ici_reclaimable;    /* reclaimable inodes */
-       struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
         unsigned long   pag_ici_reclaim_cursor; /* reclaim restart point */
  
         /* buffer cache index */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c

index d6cd833..be67570 100644 (file)
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -47,7 +47,7 @@ STATIC void   xfs_qm_dqfree_one(struct xfs_dquot *dqp);
  STATIC int
  xfs_qm_dquot_walk(
         struct xfs_mount        *mp,
-       int                     type,
+       xfs_dqtype_t            type,
         int                     (*execute)(struct xfs_dquot *dqp, void *data),
         void                    *data)
  {
@@ -79,7 +79,7 @@ restart:
                 for (i = 0; i < nr_found; i++) {
                         struct xfs_dquot *dqp = batch[i];
  
-                       next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
+                       next_index = dqp->q_id + 1;
  
                         error = execute(batch[i], data);
                         if (error == -EAGAIN) {
@@ -124,10 +124,10 @@ xfs_qm_dqpurge(
         int                     error = -EAGAIN;
  
         xfs_dqlock(dqp);
-       if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0)
+       if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0)
                 goto out_unlock;
  
-       dqp->dq_flags |= XFS_DQ_FREEING;
+       dqp->q_flags |= XFS_DQFLAG_FREEING;
  
         xfs_dqflock(dqp);
  
@@ -148,6 +148,7 @@ xfs_qm_dqpurge(
                         error = xfs_bwrite(bp);
                         xfs_buf_relse(bp);
                 } else if (error == -EAGAIN) {
+                       dqp->q_flags &= ~XFS_DQFLAG_FREEING;
                         goto out_unlock;
                 }
                 xfs_dqflock(dqp);
@@ -160,8 +161,7 @@ xfs_qm_dqpurge(
         xfs_dqfunlock(dqp);
         xfs_dqunlock(dqp);
  
-       radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
-                         be32_to_cpu(dqp->q_core.d_id));
+       radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
         qi->qi_dquots--;
  
         /*
@@ -189,11 +189,11 @@ xfs_qm_dqpurge_all(
         uint                    flags)
  {
         if (flags & XFS_QMOPT_UQUOTA)
-               xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
+               xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL);
         if (flags & XFS_QMOPT_GQUOTA)
-               xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
+               xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL);
         if (flags & XFS_QMOPT_PQUOTA)
-               xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
+               xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL);
  }
  
  /*
@@ -250,7 +250,7 @@ STATIC int
  xfs_qm_dqattach_one(
         struct xfs_inode        *ip,
         xfs_dqid_t              id,
-       uint                    type,
+       xfs_dqtype_t            type,
         bool                    doalloc,
         struct xfs_dquot        **IO_idqpp)
  {
@@ -331,7 +331,7 @@ xfs_qm_dqattach_locked(
  
         if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
                 error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)),
-                               XFS_DQ_USER, doalloc, &ip->i_udquot);
+                               XFS_DQTYPE_USER, doalloc, &ip->i_udquot);
                 if (error)
                         goto done;
                 ASSERT(ip->i_udquot);
@@ -339,14 +339,14 @@ xfs_qm_dqattach_locked(
  
         if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
                 error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)),
-                               XFS_DQ_GROUP, doalloc, &ip->i_gdquot);
+                               XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot);
                 if (error)
                         goto done;
                 ASSERT(ip->i_gdquot);
         }
  
         if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
-               error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
+               error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ,
                                 doalloc, &ip->i_pdquot);
                 if (error)
                         goto done;
@@ -473,7 +473,7 @@ xfs_qm_dquot_isolate(
         /*
          * Prevent lookups now that we are past the point of no return.
          */
-       dqp->dq_flags |= XFS_DQ_FREEING;
+       dqp->q_flags |= XFS_DQFLAG_FREEING;
         xfs_dqunlock(dqp);
  
         ASSERT(dqp->q_nrefs == 0);
@@ -545,31 +545,29 @@ xfs_qm_shrink_count(
  STATIC void
  xfs_qm_set_defquota(
         struct xfs_mount        *mp,
-       uint                    type,
+       xfs_dqtype_t            type,
         struct xfs_quotainfo    *qinf)
  {
         struct xfs_dquot        *dqp;
         struct xfs_def_quota    *defq;
-       struct xfs_disk_dquot   *ddqp;
         int                     error;
  
         error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
         if (error)
                 return;
  
-       ddqp = &dqp->q_core;
         defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp));
  
         /*
          * Timers and warnings have been already set, let's just set the
          * default limits for this quota type
          */
-       defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
-       defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
-       defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
-       defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
-       defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
-       defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+       defq->blk.hard = dqp->q_blk.hardlimit;
+       defq->blk.soft = dqp->q_blk.softlimit;
+       defq->ino.hard = dqp->q_ino.hardlimit;
+       defq->ino.soft = dqp->q_ino.softlimit;
+       defq->rtb.hard = dqp->q_rtb.hardlimit;
+       defq->rtb.soft = dqp->q_rtb.softlimit;
         xfs_qm_dqdestroy(dqp);
  }
  
@@ -577,22 +575,21 @@ xfs_qm_set_defquota(
  static void
  xfs_qm_init_timelimits(
         struct xfs_mount        *mp,
-       uint                    type)
+       xfs_dqtype_t            type)
  {
         struct xfs_quotainfo    *qinf = mp->m_quotainfo;
         struct xfs_def_quota    *defq;
-       struct xfs_disk_dquot   *ddqp;
         struct xfs_dquot        *dqp;
         int                     error;
  
         defq = xfs_get_defquota(qinf, type);
  
-       defq->btimelimit = XFS_QM_BTIMELIMIT;
-       defq->itimelimit = XFS_QM_ITIMELIMIT;
-       defq->rtbtimelimit = XFS_QM_RTBTIMELIMIT;
-       defq->bwarnlimit = XFS_QM_BWARNLIMIT;
-       defq->iwarnlimit = XFS_QM_IWARNLIMIT;
-       defq->rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+       defq->blk.time = XFS_QM_BTIMELIMIT;
+       defq->ino.time = XFS_QM_ITIMELIMIT;
+       defq->rtb.time = XFS_QM_RTBTIMELIMIT;
+       defq->blk.warn = XFS_QM_BWARNLIMIT;
+       defq->ino.warn = XFS_QM_IWARNLIMIT;
+       defq->rtb.warn = XFS_QM_RTBWARNLIMIT;
  
         /*
          * We try to get the limits from the superuser's limits fields.
@@ -605,25 +602,23 @@ xfs_qm_init_timelimits(
         if (error)
                 return;
  
-       ddqp = &dqp->q_core;
-
         /*
          * The warnings and timers set the grace period given to
          * a user or group before he or she can not perform any
          * more writing. If it is zero, a default is used.
          */
-       if (ddqp->d_btimer)
-               defq->btimelimit = be32_to_cpu(ddqp->d_btimer);
-       if (ddqp->d_itimer)
-               defq->itimelimit = be32_to_cpu(ddqp->d_itimer);
-       if (ddqp->d_rtbtimer)
-               defq->rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer);
-       if (ddqp->d_bwarns)
-               defq->bwarnlimit = be16_to_cpu(ddqp->d_bwarns);
-       if (ddqp->d_iwarns)
-               defq->iwarnlimit = be16_to_cpu(ddqp->d_iwarns);
-       if (ddqp->d_rtbwarns)
-               defq->rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns);
+       if (dqp->q_blk.timer)
+               defq->blk.time = dqp->q_blk.timer;
+       if (dqp->q_ino.timer)
+               defq->ino.time = dqp->q_ino.timer;
+       if (dqp->q_rtb.timer)
+               defq->rtb.time = dqp->q_rtb.timer;
+       if (dqp->q_blk.warnings)
+               defq->blk.warn = dqp->q_blk.warnings;
+       if (dqp->q_ino.warnings)
+               defq->ino.warn = dqp->q_ino.warnings;
+       if (dqp->q_rtb.warnings)
+               defq->rtb.warn = dqp->q_rtb.warnings;
  
         xfs_qm_dqdestroy(dqp);
  }
@@ -669,16 +664,16 @@ xfs_qm_init_quotainfo(
  
         mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
  
-       xfs_qm_init_timelimits(mp, XFS_DQ_USER);
-       xfs_qm_init_timelimits(mp, XFS_DQ_GROUP);
-       xfs_qm_init_timelimits(mp, XFS_DQ_PROJ);
+       xfs_qm_init_timelimits(mp, XFS_DQTYPE_USER);
+       xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP);
+       xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ);
  
         if (XFS_IS_UQUOTA_RUNNING(mp))
-               xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
+               xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf);
         if (XFS_IS_GQUOTA_RUNNING(mp))
-               xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf);
+               xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf);
         if (XFS_IS_PQUOTA_RUNNING(mp))
-               xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
+               xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
  
         qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
         qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
@@ -828,14 +823,13 @@ xfs_qm_qino_alloc(
  
  STATIC void
  xfs_qm_reset_dqcounts(
-       xfs_mount_t     *mp,
-       xfs_buf_t       *bp,
-       xfs_dqid_t      id,
-       uint            type)
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp,
+       xfs_dqid_t              id,
+       xfs_dqtype_t            type)
  {
         struct xfs_dqblk        *dqb;
         int                     j;
-       xfs_failaddr_t          fa;
  
         trace_xfs_reset_dqcounts(bp, _RET_IP_);
  
@@ -860,15 +854,15 @@ xfs_qm_reset_dqcounts(
                  * find uninitialised dquot blks. See comment in
                  * xfs_dquot_verify.
                  */
-               fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type);
-               if (fa)
+               if (xfs_dqblk_verify(mp, &dqb[j], id + j) ||
+                   (dqb[j].dd_diskdq.d_type & XFS_DQTYPE_REC_MASK) != type)
                         xfs_dqblk_repair(mp, &dqb[j], id + j, type);
  
                 /*
                  * Reset type in case we are reusing group quota file for
                  * project quotas or vice versa
                  */
-               ddq->d_flags = type;
+               ddq->d_type = type;
                 ddq->d_bcount = 0;
                 ddq->d_icount = 0;
                 ddq->d_rtbcount = 0;
@@ -901,17 +895,13 @@ xfs_qm_reset_dqcounts_all(
         xfs_dqid_t              firstid,
         xfs_fsblock_t           bno,
         xfs_filblks_t           blkcnt,
-       uint                    flags,
+       xfs_dqtype_t            type,
         struct list_head        *buffer_list)
  {
         struct xfs_buf          *bp;
-       int                     error;
-       int                     type;
+       int                     error = 0;
  
         ASSERT(blkcnt > 0);
-       type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
-               (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
-       error = 0;
  
         /*
          * Blkcnt arg can be a very big number, and might even be
@@ -971,7 +961,7 @@ STATIC int
  xfs_qm_reset_dqcounts_buf(
         struct xfs_mount        *mp,
         struct xfs_inode        *qip,
-       uint                    flags,
+       xfs_dqtype_t            type,
         struct list_head        *buffer_list)
  {
         struct xfs_bmbt_irec    *map;
@@ -1047,7 +1037,7 @@ xfs_qm_reset_dqcounts_buf(
                         error = xfs_qm_reset_dqcounts_all(mp, firstid,
                                                    map[i].br_startblock,
                                                    map[i].br_blockcount,
-                                                  flags, buffer_list);
+                                                  type, buffer_list);
                         if (error)
                                 goto out;
                 }
@@ -1069,7 +1059,7 @@ out:
  STATIC int
  xfs_qm_quotacheck_dqadjust(
         struct xfs_inode        *ip,
-       uint                    type,
+       xfs_dqtype_t            type,
         xfs_qcnt_t              nblks,
         xfs_qcnt_t              rtblks)
  {
@@ -1095,15 +1085,15 @@ xfs_qm_quotacheck_dqadjust(
          * Adjust the inode count and the block count to reflect this inode's
          * resource usage.
          */
-       be64_add_cpu(&dqp->q_core.d_icount, 1);
-       dqp->q_res_icount++;
+       dqp->q_ino.count++;
+       dqp->q_ino.reserved++;
         if (nblks) {
-               be64_add_cpu(&dqp->q_core.d_bcount, nblks);
-               dqp->q_res_bcount += nblks;
+               dqp->q_blk.count += nblks;
+               dqp->q_blk.reserved += nblks;
         }
         if (rtblks) {
-               be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
-               dqp->q_res_rtbcount += rtblks;
+               dqp->q_rtb.count += rtblks;
+               dqp->q_rtb.reserved += rtblks;
         }
  
         /*
@@ -1111,12 +1101,12 @@ xfs_qm_quotacheck_dqadjust(
          *
          * There are no timers for the default values set in the root dquot.
          */
-       if (dqp->q_core.d_id) {
-               xfs_qm_adjust_dqlimits(mp, dqp);
-               xfs_qm_adjust_dqtimers(mp, dqp);
+       if (dqp->q_id) {
+               xfs_qm_adjust_dqlimits(dqp);
+               xfs_qm_adjust_dqtimers(dqp);
         }
  
-       dqp->dq_flags |= XFS_DQ_DIRTY;
+       dqp->q_flags |= XFS_DQFLAG_DIRTY;
         xfs_qm_dqput(dqp);
         return 0;
  }
@@ -1186,21 +1176,21 @@ xfs_qm_dqusage_adjust(
          * and quotaoffs don't race. (Quotachecks happen at mount time only).
          */
         if (XFS_IS_UQUOTA_ON(mp)) {
-               error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks,
+               error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_USER, nblks,
                                 rtblks);
                 if (error)
                         goto error0;
         }
  
         if (XFS_IS_GQUOTA_ON(mp)) {
-               error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks,
+               error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_GROUP, nblks,
                                 rtblks);
                 if (error)
                         goto error0;
         }
  
         if (XFS_IS_PQUOTA_ON(mp)) {
-               error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks,
+               error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_PROJ, nblks,
                                 rtblks);
                 if (error)
                         goto error0;
@@ -1222,7 +1212,7 @@ xfs_qm_flush_one(
         int                     error = 0;
  
         xfs_dqlock(dqp);
-       if (dqp->dq_flags & XFS_DQ_FREEING)
+       if (dqp->q_flags & XFS_DQFLAG_FREEING)
                 goto out_unlock;
         if (!XFS_DQ_IS_DIRTY(dqp))
                 goto out_unlock;
@@ -1291,7 +1281,7 @@ xfs_qm_quotacheck(
          * We don't log our changes till later.
          */
         if (uip) {
-               error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA,
+               error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQTYPE_USER,
                                          &buffer_list);
                 if (error)
                         goto error_return;
@@ -1299,7 +1289,7 @@ xfs_qm_quotacheck(
         }
  
         if (gip) {
-               error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA,
+               error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQTYPE_GROUP,
                                          &buffer_list);
                 if (error)
                         goto error_return;
@@ -1307,7 +1297,7 @@ xfs_qm_quotacheck(
         }
  
         if (pip) {
-               error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA,
+               error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQTYPE_PROJ,
                                          &buffer_list);
                 if (error)
                         goto error_return;
@@ -1324,17 +1314,17 @@ xfs_qm_quotacheck(
          * down to disk buffers if everything was updated successfully.
          */
         if (XFS_IS_UQUOTA_ON(mp)) {
-               error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
+               error = xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_flush_one,
                                           &buffer_list);
         }
         if (XFS_IS_GQUOTA_ON(mp)) {
-               error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
+               error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_flush_one,
                                            &buffer_list);
                 if (!error)
                         error = error2;
         }
         if (XFS_IS_PQUOTA_ON(mp)) {
-               error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
+               error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_flush_one,
                                            &buffer_list);
                 if (!error)
                         error = error2;
@@ -1597,8 +1587,7 @@ xfs_qm_dqfree_one(
         struct xfs_quotainfo    *qi = mp->m_quotainfo;
  
         mutex_lock(&qi->qi_tree_lock);
-       radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
-                         be32_to_cpu(dqp->q_core.d_id));
+       radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
  
         qi->qi_dquots--;
         mutex_unlock(&qi->qi_tree_lock);
@@ -1673,7 +1662,7 @@ xfs_qm_vop_dqalloc(
                          */
                         xfs_iunlock(ip, lockflags);
                         error = xfs_qm_dqget(mp, from_kuid(user_ns, uid),
-                                       XFS_DQ_USER, true, &uq);
+                                       XFS_DQTYPE_USER, true, &uq);
                         if (error) {
                                 ASSERT(error != -ENOENT);
                                 return error;
@@ -1697,7 +1686,7 @@ xfs_qm_vop_dqalloc(
                 if (!gid_eq(inode->i_gid, gid)) {
                         xfs_iunlock(ip, lockflags);
                         error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
-                                       XFS_DQ_GROUP, true, &gq);
+                                       XFS_DQTYPE_GROUP, true, &gq);
                         if (error) {
                                 ASSERT(error != -ENOENT);
                                 goto error_rele;
@@ -1713,8 +1702,8 @@ xfs_qm_vop_dqalloc(
         if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
                 if (ip->i_d.di_projid != prid) {
                         xfs_iunlock(ip, lockflags);
-                       error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
-                                       true, &pq);
+                       error = xfs_qm_dqget(mp, (xfs_dqid_t)prid,
+                                       XFS_DQTYPE_PROJ, true, &pq);
                         if (error) {
                                 ASSERT(error != -ENOENT);
                                 goto error_rele;
@@ -1822,7 +1811,7 @@ xfs_qm_vop_chown_reserve(
                         XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
  
         if (XFS_IS_UQUOTA_ON(mp) && udqp &&
-           i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) {
+           i_uid_read(VFS_I(ip)) != udqp->q_id) {
                 udq_delblks = udqp;
                 /*
                  * If there are delayed allocation blocks, then we have to
@@ -1835,7 +1824,7 @@ xfs_qm_vop_chown_reserve(
                 }
         }
         if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
-           i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) {
+           i_gid_read(VFS_I(ip)) != gdqp->q_id) {
                 gdq_delblks = gdqp;
                 if (delblks) {
                         ASSERT(ip->i_gdquot);
@@ -1844,7 +1833,7 @@ xfs_qm_vop_chown_reserve(
         }
  
         if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
-           ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) {
+           ip->i_d.di_projid != pdqp->q_id) {
                 pdq_delblks = pdqp;
                 if (delblks) {
                         ASSERT(ip->i_pdquot);
@@ -1928,21 +1917,21 @@ xfs_qm_vop_create_dqattach(
  
         if (udqp && XFS_IS_UQUOTA_ON(mp)) {
                 ASSERT(ip->i_udquot == NULL);
-               ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id));
+               ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id);
  
                 ip->i_udquot = xfs_qm_dqhold(udqp);
                 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
         }
         if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
                 ASSERT(ip->i_gdquot == NULL);
-               ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id));
+               ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id);
  
                 ip->i_gdquot = xfs_qm_dqhold(gdqp);
                 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
         }
         if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
                 ASSERT(ip->i_pdquot == NULL);
-               ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id));
+               ASSERT(ip->i_d.di_projid == pdqp->q_id);
  
                 ip->i_pdquot = xfs_qm_dqhold(pdqp);
                 xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h

index 7b0e771..9c078c3 100644 (file)
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -20,41 +20,28 @@ extern struct kmem_zone     *xfs_qm_dqtrxzone;
  #define XFS_DQITER_MAP_SIZE    10
  
  #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
-       !dqp->q_core.d_blk_hardlimit && \
-       !dqp->q_core.d_blk_softlimit && \
-       !dqp->q_core.d_rtb_hardlimit && \
-       !dqp->q_core.d_rtb_softlimit && \
-       !dqp->q_core.d_ino_hardlimit && \
-       !dqp->q_core.d_ino_softlimit && \
-       !dqp->q_core.d_bcount && \
-       !dqp->q_core.d_rtbcount && \
-       !dqp->q_core.d_icount)
-
-/*
- * This defines the unit of allocation of dquots.
- * Currently, it is just one file system block, and a 4K blk contains 30
- * (136 * 30 = 4080) dquots. It's probably not worth trying to make
- * this more dynamic.
- * XXXsup However, if this number is changed, we have to make sure that we don't
- * implicitly assume that we do allocations in chunks of a single filesystem
- * block in the dquot/xqm code.
- */
-#define XFS_DQUOT_CLUSTER_SIZE_FSB     (xfs_filblks_t)1
+       !dqp->q_blk.hardlimit && \
+       !dqp->q_blk.softlimit && \
+       !dqp->q_rtb.hardlimit && \
+       !dqp->q_rtb.softlimit && \
+       !dqp->q_ino.hardlimit && \
+       !dqp->q_ino.softlimit && \
+       !dqp->q_blk.count && \
+       !dqp->q_rtb.count && \
+       !dqp->q_ino.count)
+
+struct xfs_quota_limits {
+       xfs_qcnt_t              hard;   /* default hard limit */
+       xfs_qcnt_t              soft;   /* default soft limit */
+       time64_t                time;   /* limit for timers */
+       xfs_qwarncnt_t          warn;   /* limit for warnings */
+};
  
  /* Defaults for each quota type: time limits, warn limits, usage limits */
  struct xfs_def_quota {
-       time64_t        btimelimit;     /* limit for blks timer */
-       time64_t        itimelimit;     /* limit for inodes timer */
-       time64_t        rtbtimelimit;   /* limit for rt blks timer */
-       xfs_qwarncnt_t  bwarnlimit;     /* limit for blks warnings */
-       xfs_qwarncnt_t  iwarnlimit;     /* limit for inodes warnings */
-       xfs_qwarncnt_t  rtbwarnlimit;   /* limit for rt blks warnings */
-       xfs_qcnt_t      bhardlimit;     /* default data blk hard limit */
-       xfs_qcnt_t      bsoftlimit;     /* default data blk soft limit */
-       xfs_qcnt_t      ihardlimit;     /* default inode count hard limit */
-       xfs_qcnt_t      isoftlimit;     /* default inode count soft limit */
-       xfs_qcnt_t      rtbhardlimit;   /* default realtime blk hard limit */
-       xfs_qcnt_t      rtbsoftlimit;   /* default realtime blk soft limit */
+       struct xfs_quota_limits blk;
+       struct xfs_quota_limits ino;
+       struct xfs_quota_limits rtb;
  };
  
  /*
@@ -83,14 +70,14 @@ struct xfs_quotainfo {
  static inline struct radix_tree_root *
  xfs_dquot_tree(
         struct xfs_quotainfo    *qi,
-       int                     type)
+       xfs_dqtype_t            type)
  {
         switch (type) {
-       case XFS_DQ_USER:
+       case XFS_DQTYPE_USER:
                 return &qi->qi_uquota_tree;
-       case XFS_DQ_GROUP:
+       case XFS_DQTYPE_GROUP:
                 return &qi->qi_gquota_tree;
-       case XFS_DQ_PROJ:
+       case XFS_DQTYPE_PROJ:
                 return &qi->qi_pquota_tree;
         default:
                 ASSERT(0);
@@ -99,14 +86,14 @@ xfs_dquot_tree(
  }
  
  static inline struct xfs_inode *
-xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
+xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
  {
-       switch (dq_flags & XFS_DQ_ALLTYPES) {
-       case XFS_DQ_USER:
+       switch (type) {
+       case XFS_DQTYPE_USER:
                 return mp->m_quotainfo->qi_uquotaip;
-       case XFS_DQ_GROUP:
+       case XFS_DQTYPE_GROUP:
                 return mp->m_quotainfo->qi_gquotaip;
-       case XFS_DQ_PROJ:
+       case XFS_DQTYPE_PROJ:
                 return mp->m_quotainfo->qi_pquotaip;
         default:
                 ASSERT(0);
@@ -114,17 +101,6 @@ xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
         return NULL;
  }
  
-static inline int
-xfs_dquot_type(struct xfs_dquot *dqp)
-{
-       if (XFS_QM_ISUDQ(dqp))
-               return XFS_DQ_USER;
-       if (XFS_QM_ISGDQ(dqp))
-               return XFS_DQ_GROUP;
-       ASSERT(XFS_QM_ISPDQ(dqp));
-       return XFS_DQ_PROJ;
-}
-
  extern void    xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
                                     uint field, int64_t delta);
  extern void    xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
@@ -166,24 +142,30 @@ extern void               xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
  
  /* quota ops */
  extern int             xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int             xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
-                                       uint, struct qc_dqblk *);
-extern int             xfs_qm_scall_getquota_next(struct xfs_mount *,
-                                       xfs_dqid_t *, uint, struct qc_dqblk *);
-extern int             xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
-                                       struct qc_dqblk *);
+extern int             xfs_qm_scall_getquota(struct xfs_mount *mp,
+                                       xfs_dqid_t id,
+                                       xfs_dqtype_t type,
+                                       struct qc_dqblk *dst);
+extern int             xfs_qm_scall_getquota_next(struct xfs_mount *mp,
+                                       xfs_dqid_t *id,
+                                       xfs_dqtype_t type,
+                                       struct qc_dqblk *dst);
+extern int             xfs_qm_scall_setqlim(struct xfs_mount *mp,
+                                       xfs_dqid_t id,
+                                       xfs_dqtype_t type,
+                                       struct qc_dqblk *newlim);
  extern int             xfs_qm_scall_quotaon(struct xfs_mount *, uint);
  extern int             xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
  
  static inline struct xfs_def_quota *
-xfs_get_defquota(struct xfs_quotainfo *qi, int type)
+xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type)
  {
         switch (type) {
-       case XFS_DQ_USER:
+       case XFS_DQTYPE_USER:
                 return &qi->qi_usr_default;
-       case XFS_DQ_GROUP:
+       case XFS_DQTYPE_GROUP:
                 return &qi->qi_grp_default;
-       case XFS_DQ_PROJ:
+       case XFS_DQTYPE_PROJ:
                 return &qi->qi_prj_default;
         default:
                 ASSERT(0);
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c

index fc2fa41..6393980 100644 (file)
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -23,24 +23,24 @@ xfs_fill_statvfs_from_dquot(
  {
         uint64_t                limit;
  
-       limit = dqp->q_core.d_blk_softlimit ?
-               be64_to_cpu(dqp->q_core.d_blk_softlimit) :
-               be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+       limit = dqp->q_blk.softlimit ?
+               dqp->q_blk.softlimit :
+               dqp->q_blk.hardlimit;
         if (limit && statp->f_blocks > limit) {
                 statp->f_blocks = limit;
                 statp->f_bfree = statp->f_bavail =
-                       (statp->f_blocks > dqp->q_res_bcount) ?
-                        (statp->f_blocks - dqp->q_res_bcount) : 0;
+                       (statp->f_blocks > dqp->q_blk.reserved) ?
+                        (statp->f_blocks - dqp->q_blk.reserved) : 0;
         }
  
-       limit = dqp->q_core.d_ino_softlimit ?
-               be64_to_cpu(dqp->q_core.d_ino_softlimit) :
-               be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+       limit = dqp->q_ino.softlimit ?
+               dqp->q_ino.softlimit :
+               dqp->q_ino.hardlimit;
         if (limit && statp->f_files > limit) {
                 statp->f_files = limit;
                 statp->f_ffree =
-                       (statp->f_files > dqp->q_res_icount) ?
-                        (statp->f_files - dqp->q_res_icount) : 0;
+                       (statp->f_files > dqp->q_ino.reserved) ?
+                        (statp->f_files - dqp->q_ino.reserved) : 0;
         }
  }
  
@@ -60,7 +60,7 @@ xfs_qm_statvfs(
         struct xfs_mount        *mp = ip->i_mount;
         struct xfs_dquot        *dqp;
  
-       if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) {
+       if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQTYPE_PROJ, false, &dqp)) {
                 xfs_fill_statvfs_from_dquot(statp, dqp);
                 xfs_qm_dqput(dqp);
         }
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c

index 7effd7a..1c542b4 100644 (file)
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -322,23 +322,23 @@ xfs_qm_scall_trunc_qfiles(
         int             error = -EINVAL;
  
         if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
-           (flags & ~XFS_DQ_ALLTYPES)) {
+           (flags & ~XFS_QMOPT_QUOTALL)) {
                 xfs_debug(mp, "%s: flags=%x m_qflags=%x",
                         __func__, flags, mp->m_qflags);
                 return -EINVAL;
         }
  
-       if (flags & XFS_DQ_USER) {
+       if (flags & XFS_QMOPT_UQUOTA) {
                 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
                 if (error)
                         return error;
         }
-       if (flags & XFS_DQ_GROUP) {
+       if (flags & XFS_QMOPT_GQUOTA) {
                 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
                 if (error)
                         return error;
         }
-       if (flags & XFS_DQ_PROJ)
+       if (flags & XFS_QMOPT_PQUOTA)
                 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
  
         return error;
@@ -436,6 +436,58 @@ xfs_qm_scall_quotaon(
  #define XFS_QC_MASK \
         (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
  
+/*
+ * Adjust limits of this quota, and the defaults if passed in.  Returns true
+ * if the new limits made sense and were applied, false otherwise.
+ */
+static inline bool
+xfs_setqlim_limits(
+       struct xfs_mount        *mp,
+       struct xfs_dquot_res    *res,
+       struct xfs_quota_limits *qlim,
+       xfs_qcnt_t              hard,
+       xfs_qcnt_t              soft,
+       const char              *tag)
+{
+       /* The hard limit can't be less than the soft limit. */
+       if (hard != 0 && hard < soft) {
+               xfs_debug(mp, "%shard %lld < %ssoft %lld", tag, hard, tag,
+                               soft);
+               return false;
+       }
+
+       res->hardlimit = hard;
+       res->softlimit = soft;
+       if (qlim) {
+               qlim->hard = hard;
+               qlim->soft = soft;
+       }
+
+       return true;
+}
+
+static inline void
+xfs_setqlim_warns(
+       struct xfs_dquot_res    *res,
+       struct xfs_quota_limits *qlim,
+       int                     warns)
+{
+       res->warnings = warns;
+       if (qlim)
+               qlim->warn = warns;
+}
+
+static inline void
+xfs_setqlim_timer(
+       struct xfs_dquot_res    *res,
+       struct xfs_quota_limits *qlim,
+       s64                     timer)
+{
+       res->timer = timer;
+       if (qlim)
+               qlim->time = timer;
+}
+
  /*
   * Adjust quota limits, and start/stop timers accordingly.
   */
@@ -443,14 +495,15 @@ int
  xfs_qm_scall_setqlim(
         struct xfs_mount        *mp,
         xfs_dqid_t              id,
-       uint                    type,
+       xfs_dqtype_t            type,
         struct qc_dqblk         *newlim)
  {
         struct xfs_quotainfo    *q = mp->m_quotainfo;
-       struct xfs_disk_dquot   *ddq;
         struct xfs_dquot        *dqp;
         struct xfs_trans        *tp;
         struct xfs_def_quota    *defq;
+       struct xfs_dquot_res    *res;
+       struct xfs_quota_limits *qlim;
         int                     error;
         xfs_qcnt_t              hard, soft;
  
@@ -488,105 +541,72 @@ xfs_qm_scall_setqlim(
  
         xfs_dqlock(dqp);
         xfs_trans_dqjoin(tp, dqp);
-       ddq = &dqp->q_core;
  
         /*
+        * Update quota limits, warnings, and timers, and the defaults
+        * if we're touching id == 0.
+        *
          * Make sure that hardlimits are >= soft limits before changing.
+        *
+        * Update warnings counter(s) if requested.
+        *
+        * Timelimits for the super user set the relative time the other users
+        * can be over quota for this file system. If it is zero a default is
+        * used.  Ditto for the default soft and hard limit values and for
+        * warnings, all of which are updated per resource below.
+        *
+        * For other IDs, userspace can bump out the grace period if over
+        * the soft limit.
          */
+
+       /* Blocks on the data device. */
         hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
                 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
-                       be64_to_cpu(ddq->d_blk_hardlimit);
+                       dqp->q_blk.hardlimit;
         soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
                 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
-                       be64_to_cpu(ddq->d_blk_softlimit);
-       if (hard == 0 || hard >= soft) {
-               ddq->d_blk_hardlimit = cpu_to_be64(hard);
-               ddq->d_blk_softlimit = cpu_to_be64(soft);
+                       dqp->q_blk.softlimit;
+       res = &dqp->q_blk;
+       qlim = id == 0 ? &defq->blk : NULL;
+
+       if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
                 xfs_dquot_set_prealloc_limits(dqp);
-               if (id == 0) {
-                       defq->bhardlimit = hard;
-                       defq->bsoftlimit = soft;
-               }
-       } else {
-               xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
-       }
+       if (newlim->d_fieldmask & QC_SPC_WARNS)
+               xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
+       if (newlim->d_fieldmask & QC_SPC_TIMER)
+               xfs_setqlim_timer(res, qlim, newlim->d_spc_timer);
+
+       /* Blocks on the realtime device. */
         hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
                 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
-                       be64_to_cpu(ddq->d_rtb_hardlimit);
+                       dqp->q_rtb.hardlimit;
         soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
                 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
-                       be64_to_cpu(ddq->d_rtb_softlimit);
-       if (hard == 0 || hard >= soft) {
-               ddq->d_rtb_hardlimit = cpu_to_be64(hard);
-               ddq->d_rtb_softlimit = cpu_to_be64(soft);
-               if (id == 0) {
-                       defq->rtbhardlimit = hard;
-                       defq->rtbsoftlimit = soft;
-               }
-       } else {
-               xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
-       }
+                       dqp->q_rtb.softlimit;
+       res = &dqp->q_rtb;
+       qlim = id == 0 ? &defq->rtb : NULL;
  
+       xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
+       if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
+               xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
+       if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
+               xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer);
+
+       /* Inodes */
         hard = (newlim->d_fieldmask & QC_INO_HARD) ?
                 (xfs_qcnt_t) newlim->d_ino_hardlimit :
-                       be64_to_cpu(ddq->d_ino_hardlimit);
+                       dqp->q_ino.hardlimit;
         soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
                 (xfs_qcnt_t) newlim->d_ino_softlimit :
-                       be64_to_cpu(ddq->d_ino_softlimit);
-       if (hard == 0 || hard >= soft) {
-               ddq->d_ino_hardlimit = cpu_to_be64(hard);
-               ddq->d_ino_softlimit = cpu_to_be64(soft);
-               if (id == 0) {
-                       defq->ihardlimit = hard;
-                       defq->isoftlimit = soft;
-               }
-       } else {
-               xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
-       }
+                       dqp->q_ino.softlimit;
+       res = &dqp->q_ino;
+       qlim = id == 0 ? &defq->ino : NULL;
  
-       /*
-        * Update warnings counter(s) if requested
-        */
-       if (newlim->d_fieldmask & QC_SPC_WARNS)
-               ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns);
+       xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
         if (newlim->d_fieldmask & QC_INO_WARNS)
-               ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns);
-       if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
-               ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns);
-
-       if (id == 0) {
-               if (newlim->d_fieldmask & QC_SPC_WARNS)
-                       defq->bwarnlimit = newlim->d_spc_warns;
-               if (newlim->d_fieldmask & QC_INO_WARNS)
-                       defq->iwarnlimit = newlim->d_ino_warns;
-               if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
-                       defq->rtbwarnlimit = newlim->d_rt_spc_warns;
-       }
-
-       /*
-        * Timelimits for the super user set the relative time the other users
-        * can be over quota for this file system. If it is zero a default is
-        * used.  Ditto for the default soft and hard limit values (already
-        * done, above), and for warnings.
-        *
-        * For other IDs, userspace can bump out the grace period if over
-        * the soft limit.
-        */
-       if (newlim->d_fieldmask & QC_SPC_TIMER)
-               ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer);
+               xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
         if (newlim->d_fieldmask & QC_INO_TIMER)
-               ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer);
-       if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
-               ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer);
-
-       if (id == 0) {
-               if (newlim->d_fieldmask & QC_SPC_TIMER)
-                       defq->btimelimit = newlim->d_spc_timer;
-               if (newlim->d_fieldmask & QC_INO_TIMER)
-                       defq->itimelimit = newlim->d_ino_timer;
-               if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
-                       defq->rtbtimelimit = newlim->d_rt_spc_timer;
-       }
+               xfs_setqlim_timer(res, qlim, newlim->d_ino_timer);
  
         if (id != 0) {
                 /*
@@ -596,9 +616,9 @@ xfs_qm_scall_setqlim(
                  * is on or off. We don't really want to bother with iterating
                  * over all ondisk dquots and turning the timers on/off.
                  */
-               xfs_qm_adjust_dqtimers(mp, dqp);
+               xfs_qm_adjust_dqtimers(dqp);
         }
-       dqp->dq_flags |= XFS_DQ_DIRTY;
+       dqp->q_flags |= XFS_DQFLAG_DIRTY;
         xfs_trans_log_dquot(tp, dqp);
  
         error = xfs_trans_commit(tp);
@@ -614,58 +634,46 @@ out_unlock:
  static void
  xfs_qm_scall_getquota_fill_qc(
         struct xfs_mount        *mp,
-       uint                    type,
+       xfs_dqtype_t            type,
         const struct xfs_dquot  *dqp,
         struct qc_dqblk         *dst)
  {
         memset(dst, 0, sizeof(*dst));
-       dst->d_spc_hardlimit =
-               XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
-       dst->d_spc_softlimit =
-               XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
-       dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
-       dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
-       dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount);
-       dst->d_ino_count = dqp->q_res_icount;
-       dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer);
-       dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer);
-       dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns);
-       dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns);
-       dst->d_rt_spc_hardlimit =
-               XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
-       dst->d_rt_spc_softlimit =
-               XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
-       dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount);
-       dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
-       dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+       dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_blk.hardlimit);
+       dst->d_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_blk.softlimit);
+       dst->d_ino_hardlimit = dqp->q_ino.hardlimit;
+       dst->d_ino_softlimit = dqp->q_ino.softlimit;
+       dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved);
+       dst->d_ino_count = dqp->q_ino.reserved;
+       dst->d_spc_timer = dqp->q_blk.timer;
+       dst->d_ino_timer = dqp->q_ino.timer;
+       dst->d_ino_warns = dqp->q_ino.warnings;
+       dst->d_spc_warns = dqp->q_blk.warnings;
+       dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
+       dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
+       dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
+       dst->d_rt_spc_timer = dqp->q_rtb.timer;
+       dst->d_rt_spc_warns = dqp->q_rtb.warnings;
  
         /*
          * Internally, we don't reset all the timers when quota enforcement
          * gets turned off. No need to confuse the user level code,
          * so return zeroes in that case.
          */
-       if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
-            dqp->q_core.d_flags == XFS_DQ_USER) ||
-           (!XFS_IS_GQUOTA_ENFORCED(mp) &&
-            dqp->q_core.d_flags == XFS_DQ_GROUP) ||
-           (!XFS_IS_PQUOTA_ENFORCED(mp) &&
-            dqp->q_core.d_flags == XFS_DQ_PROJ)) {
+       if (!xfs_dquot_is_enforced(dqp)) {
                 dst->d_spc_timer = 0;
                 dst->d_ino_timer = 0;
                 dst->d_rt_spc_timer = 0;
         }
  
  #ifdef DEBUG
-       if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
-            (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
-            (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
-           dqp->q_core.d_id != 0) {
+       if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) {
                 if ((dst->d_space > dst->d_spc_softlimit) &&
                     (dst->d_spc_softlimit > 0)) {
                         ASSERT(dst->d_spc_timer != 0);
                 }
-               if ((dst->d_ino_count > dst->d_ino_softlimit) &&
-                   (dst->d_ino_softlimit > 0)) {
+               if ((dst->d_ino_count > dqp->q_ino.softlimit) &&
+                   (dqp->q_ino.softlimit > 0)) {
                         ASSERT(dst->d_ino_timer != 0);
                 }
         }
@@ -677,7 +685,7 @@ int
  xfs_qm_scall_getquota(
         struct xfs_mount        *mp,
         xfs_dqid_t              id,
-       uint                    type,
+       xfs_dqtype_t            type,
         struct qc_dqblk         *dst)
  {
         struct xfs_dquot        *dqp;
@@ -715,7 +723,7 @@ int
  xfs_qm_scall_getquota_next(
         struct xfs_mount        *mp,
         xfs_dqid_t              *id,
-       uint                    type,
+       xfs_dqtype_t            type,
         struct qc_dqblk         *dst)
  {
         struct xfs_dquot        *dqp;
@@ -726,7 +734,7 @@ xfs_qm_scall_getquota_next(
                 return error;
  
         /* Fill in the ID we actually read from disk */
-       *id = be32_to_cpu(dqp->q_core.d_id);
+       *id = dqp->q_id;
  
         xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
  
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h

index aa8fc1f..06b22e3 100644 (file)
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -13,6 +13,7 @@
   */
  
  struct xfs_trans;
+struct xfs_buf;
  
  /*
   * This check is done typically without holding the inode lock;
@@ -38,14 +39,14 @@ struct xfs_trans;
  
  static inline uint
  xfs_quota_chkd_flag(
-       uint            dqtype)
+       xfs_dqtype_t            type)
  {
-       switch (dqtype) {
-       case XFS_DQ_USER:
+       switch (type) {
+       case XFS_DQTYPE_USER:
                 return XFS_UQUOTA_CHKD;
-       case XFS_DQ_GROUP:
+       case XFS_DQTYPE_GROUP:
                 return XFS_GQUOTA_CHKD;
-       case XFS_DQ_PROJ:
+       case XFS_DQTYPE_PROJ:
                 return XFS_PQUOTA_CHKD;
         default:
                 return 0;
@@ -107,6 +108,8 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *);
  extern void xfs_qm_unmount(struct xfs_mount *);
  extern void xfs_qm_unmount_quotas(struct xfs_mount *);
  
+void           xfs_dquot_done(struct xfs_buf *);
+
  #else
  static inline int
  xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -148,6 +151,12 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
  #define xfs_qm_mount_quotas(mp)
  #define xfs_qm_unmount(mp)
  #define xfs_qm_unmount_quotas(mp)
+
+static inline void xfs_dquot_done(struct xfs_buf *bp)
+{
+       return;
+}
+
  #endif /* CONFIG_XFS_QUOTA */
  
  #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c

index bf809b7..d27c0e8 100644 (file)
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -37,12 +37,12 @@ xfs_qm_fill_state(
         tstate->flags |= QCI_SYSFILE;
         tstate->blocks = ip->i_d.di_nblocks;
         tstate->nextents = ip->i_df.if_nextents;
-       tstate->spc_timelimit = (u32)defq->btimelimit;
-       tstate->ino_timelimit = (u32)defq->itimelimit;
-       tstate->rt_spc_timelimit = (u32)defq->rtbtimelimit;
-       tstate->spc_warnlimit = defq->bwarnlimit;
-       tstate->ino_warnlimit = defq->iwarnlimit;
-       tstate->rt_spc_warnlimit = defq->rtbwarnlimit;
+       tstate->spc_timelimit = (u32)defq->blk.time;
+       tstate->ino_timelimit = (u32)defq->ino.time;
+       tstate->rt_spc_timelimit = (u32)defq->rtb.time;
+       tstate->spc_warnlimit = defq->blk.warn;
+       tstate->ino_warnlimit = defq->ino.warn;
+       tstate->rt_spc_warnlimit = defq->rtb.warn;
         if (tempqip)
                 xfs_irele(ip);
  }
@@ -85,16 +85,16 @@ xfs_fs_get_quota_state(
         return 0;
  }
  
-STATIC int
+STATIC xfs_dqtype_t
  xfs_quota_type(int type)
  {
         switch (type) {
         case USRQUOTA:
-               return XFS_DQ_USER;
+               return XFS_DQTYPE_USER;
         case GRPQUOTA:
-               return XFS_DQ_GROUP;
+               return XFS_DQTYPE_GROUP;
         default:
-               return XFS_DQ_PROJ;
+               return XFS_DQTYPE_PROJ;
         }
  }
  
@@ -205,11 +205,11 @@ xfs_fs_rm_xquota(
                 return -EINVAL;
  
         if (uflags & FS_USER_QUOTA)
-               flags |= XFS_DQ_USER;
+               flags |= XFS_QMOPT_UQUOTA;
         if (uflags & FS_GROUP_QUOTA)
-               flags |= XFS_DQ_GROUP;
+               flags |= XFS_QMOPT_GQUOTA;
         if (uflags & FS_PROJ_QUOTA)
-               flags |= XFS_DQ_PROJ;
+               flags |= XFS_QMOPT_PQUOTA;
  
         return xfs_qm_scall_trunc_qfiles(mp, flags);
  }
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c

index c816398..7b2c72b 100644 (file)
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -143,7 +143,8 @@ xfs_cui_init(
                 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
                                 0);
         else
-               cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
+               cuip = kmem_cache_zalloc(xfs_cui_zone,
+                                        GFP_KERNEL | __GFP_NOFAIL);
  
         xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
         cuip->cui_format.cui_nextents = nextents;
@@ -220,7 +221,7 @@ xfs_trans_get_cud(
  {
         struct xfs_cud_log_item         *cudp;
  
-       cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
+       cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL);
         xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
                           &xfs_cud_item_ops);
         cudp->cud_cuip = cuip;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c

index 107bf2a..aac83f9 100644 (file)
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -179,7 +179,7 @@ xfs_reflink_trim_around_shared(
         int                     error = 0;
  
         /* Holes, unwritten, and delalloc extents cannot be shared */
-       if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+       if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
                 *shared = false;
                 return 0;
         }
@@ -655,7 +655,7 @@ xfs_reflink_end_cow_extent(
          * preallocations can leak into the range we are called upon, and we
          * need to skip them.
          */
-       if (!xfs_bmap_is_real_extent(&got)) {
+       if (!xfs_bmap_is_written_extent(&got)) {
                 *end_fsb = del.br_startoff;
                 goto out_cancel;
         }
@@ -984,40 +984,28 @@ xfs_reflink_ag_has_free_space(
  }
  
  /*
- * Unmap a range of blocks from a file, then map other blocks into the hole.
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
- * The extent irec is mapped into dest at irec->br_startoff.
+ * Remap the given extent into the file.  The dmap blockcount will be set to
+ * the number of blocks that were actually remapped.
   */
  STATIC int
  xfs_reflink_remap_extent(
         struct xfs_inode        *ip,
-       struct xfs_bmbt_irec    *irec,
-       xfs_fileoff_t           destoff,
+       struct xfs_bmbt_irec    *dmap,
         xfs_off_t               new_isize)
  {
+       struct xfs_bmbt_irec    smap;
         struct xfs_mount        *mp = ip->i_mount;
-       bool                    real_extent = xfs_bmap_is_real_extent(irec);
         struct xfs_trans        *tp;
-       unsigned int            resblks;
-       struct xfs_bmbt_irec    uirec;
-       xfs_filblks_t           rlen;
-       xfs_filblks_t           unmap_len;
         xfs_off_t               newlen;
+       int64_t                 qres, qdelta;
+       unsigned int            resblks;
+       bool                    smap_real;
+       bool                    dmap_written = xfs_bmap_is_written_extent(dmap);
+       int                     nimaps;
         int                     error;
  
-       unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
-       trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
-
-       /* No reflinking if we're low on space */
-       if (real_extent) {
-               error = xfs_reflink_ag_has_free_space(mp,
-                               XFS_FSB_TO_AGNO(mp, irec->br_startblock));
-               if (error)
-                       goto out;
-       }
-
         /* Start a rolling transaction to switch the mappings */
-       resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+       resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
         if (error)
                 goto out;
@@ -1025,87 +1013,147 @@ xfs_reflink_remap_extent(
         xfs_ilock(ip, XFS_ILOCK_EXCL);
         xfs_trans_ijoin(tp, ip, 0);
  
-       /* If we're not just clearing space, then do we have enough quota? */
-       if (real_extent) {
-               error = xfs_trans_reserve_quota_nblks(tp, ip,
-                               irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
+       /*
+        * Read what's currently mapped in the destination file into smap.
+        * If smap isn't a hole, we will have to remove it before we can add
+        * dmap to the destination file.
+        */
+       nimaps = 1;
+       error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+                       &smap, &nimaps, 0);
+       if (error)
+               goto out_cancel;
+       ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+       smap_real = xfs_bmap_is_real_extent(&smap);
+
+       /*
+        * We can only remap as many blocks as the smaller of the two extent
+        * maps, because we can only remap one extent at a time.
+        */
+       dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+       ASSERT(dmap->br_blockcount == smap.br_blockcount);
+
+       trace_xfs_reflink_remap_extent_dest(ip, &smap);
+
+       /*
+        * Two extents mapped to the same physical block must not have
+        * different states; that's filesystem corruption.  Move on to the next
+        * extent if they're both holes or both the same physical extent.
+        */
+       if (dmap->br_startblock == smap.br_startblock) {
+               if (dmap->br_state != smap.br_state)
+                       error = -EFSCORRUPTED;
+               goto out_cancel;
+       }
+
+       /* If both extents are unwritten, leave them alone. */
+       if (dmap->br_state == XFS_EXT_UNWRITTEN &&
+           smap.br_state == XFS_EXT_UNWRITTEN)
+               goto out_cancel;
+
+       /* No reflinking if the AG of the dest mapping is low on space. */
+       if (dmap_written) {
+               error = xfs_reflink_ag_has_free_space(mp,
+                               XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
                 if (error)
                         goto out_cancel;
         }
  
-       trace_xfs_reflink_remap(ip, irec->br_startoff,
-                               irec->br_blockcount, irec->br_startblock);
-
-       /* Unmap the old blocks in the data fork. */
-       rlen = unmap_len;
-       while (rlen) {
-               ASSERT(tp->t_firstblock == NULLFSBLOCK);
-               error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
+       /*
+        * Compute quota reservation if we think the quota block counter for
+        * this file could increase.
+        *
+        * Adding a written extent to the extent map can cause a bmbt split,
+        * and so can removing a mapped extent from the extent map.  The two
+        * operations cannot both cause a split since they operate on the
+        * same index in the bmap btree, so we only need a reservation for
+        * one bmbt split if either thing is happening.
+        *
+        * If we are mapping a written extent into the file, we need to have
+        * enough quota block count reservation to handle the blocks in that
+        * extent.  We log only the delta to the quota block counts, so if the
+        * extent we're unmapping also has blocks allocated to it, we don't
+        * need a quota reservation for the extent itself.
+        *
+        * Note that if we're replacing a delalloc reservation with a written
+        * extent, we have to take the full quota reservation because removing
+        * the delalloc reservation gives the block count back to the quota
+        * count.  This is suboptimal, but the VFS flushed the dest range
+        * before we started.  That should have removed all the delalloc
+        * reservations, but we code defensively.
+        */
+       qres = qdelta = 0;
+       if (smap_real || dmap_written)
+               qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+       if (!smap_real && dmap_written)
+               qres += dmap->br_blockcount;
+       if (qres > 0) {
+               error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
+                               XFS_QMOPT_RES_REGBLKS);
                 if (error)
                         goto out_cancel;
+       }
  
+       if (smap_real) {
                 /*
-                * Trim the extent to whatever got unmapped.
-                * Remember, bunmapi works backwards.
+                * If the extent we're unmapping is backed by storage (written
+                * or not), unmap the extent and drop its refcount.
                  */
-               uirec.br_startblock = irec->br_startblock + rlen;
-               uirec.br_startoff = irec->br_startoff + rlen;
-               uirec.br_blockcount = unmap_len - rlen;
-               uirec.br_state = irec->br_state;
-               unmap_len = rlen;
-
-               /* If this isn't a real mapping, we're done. */
-               if (!real_extent || uirec.br_blockcount == 0)
-                       goto next_extent;
+               xfs_bmap_unmap_extent(tp, ip, &smap);
+               xfs_refcount_decrease_extent(tp, &smap);
+               qdelta -= smap.br_blockcount;
+       } else if (smap.br_startblock == DELAYSTARTBLOCK) {
+               xfs_filblks_t   len = smap.br_blockcount;
  
-               trace_xfs_reflink_remap(ip, uirec.br_startoff,
-                               uirec.br_blockcount, uirec.br_startblock);
-
-               /* Update the refcount tree */
-               xfs_refcount_increase_extent(tp, &uirec);
-
-               /* Map the new blocks into the data fork. */
-               xfs_bmap_map_extent(tp, ip, &uirec);
+               /*
+                * If the extent we're unmapping is a delalloc reservation,
+                * we can use the regular bunmapi function to release the
+                * incore state.  Dropping the delalloc reservation takes care
+                * of the quota reservation for us.
+                */
+               error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+               if (error)
+                       goto out_cancel;
+               ASSERT(len == 0);
+       }
  
-               /* Update quota accounting. */
-               xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
-                               uirec.br_blockcount);
+       /*
+        * If the extent we're sharing is backed by written storage, increase
+        * its refcount and map it into the file.
+        */
+       if (dmap_written) {
+               xfs_refcount_increase_extent(tp, dmap);
+               xfs_bmap_map_extent(tp, ip, dmap);
+               qdelta += dmap->br_blockcount;
+       }
  
-               /* Update dest isize if needed. */
-               newlen = XFS_FSB_TO_B(mp,
-                               uirec.br_startoff + uirec.br_blockcount);
-               newlen = min_t(xfs_off_t, newlen, new_isize);
-               if (newlen > i_size_read(VFS_I(ip))) {
-                       trace_xfs_reflink_update_inode_size(ip, newlen);
-                       i_size_write(VFS_I(ip), newlen);
-                       ip->i_d.di_size = newlen;
-                       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               }
+       xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
  
-next_extent:
-               /* Process all the deferred stuff. */
-               error = xfs_defer_finish(&tp);
-               if (error)
-                       goto out_cancel;
+       /* Update dest isize if needed. */
+       newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+       newlen = min_t(xfs_off_t, newlen, new_isize);
+       if (newlen > i_size_read(VFS_I(ip))) {
+               trace_xfs_reflink_update_inode_size(ip, newlen);
+               i_size_write(VFS_I(ip), newlen);
+               ip->i_d.di_size = newlen;
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
         }
  
+       /* Commit everything and unlock. */
         error = xfs_trans_commit(tp);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       if (error)
-               goto out;
-       return 0;
+       goto out_unlock;
  
  out_cancel:
         xfs_trans_cancel(tp);
+out_unlock:
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
  out:
-       trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+       if (error)
+               trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
         return error;
  }
  
-/*
- * Iteratively remap one file's extents (and holes) to another's.
- */
+/* Remap a range of one file to the other. */
  int
  xfs_reflink_remap_blocks(
         struct xfs_inode        *src,
@@ -1116,25 +1164,22 @@ xfs_reflink_remap_blocks(
         loff_t                  *remapped)
  {
         struct xfs_bmbt_irec    imap;
-       xfs_fileoff_t           srcoff;
-       xfs_fileoff_t           destoff;
+       struct xfs_mount        *mp = src->i_mount;
+       xfs_fileoff_t           srcoff = XFS_B_TO_FSBT(mp, pos_in);
+       xfs_fileoff_t           destoff = XFS_B_TO_FSBT(mp, pos_out);
         xfs_filblks_t           len;
-       xfs_filblks_t           range_len;
         xfs_filblks_t           remapped_len = 0;
         xfs_off_t               new_isize = pos_out + remap_len;
         int                     nimaps;
         int                     error = 0;
  
-       destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
-       srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
-       len = XFS_B_TO_FSB(src->i_mount, remap_len);
+       len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+                       XFS_MAX_FILEOFF);
  
-       /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
-       while (len) {
-               uint            lock_mode;
+       trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
  
-               trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
-                               dest, destoff);
+       while (len > 0) {
+               unsigned int    lock_mode;
  
                 /* Read extent from the source file */
                 nimaps = 1;
@@ -1143,18 +1188,25 @@ xfs_reflink_remap_blocks(
                 xfs_iunlock(src, lock_mode);
                 if (error)
                         break;
-               ASSERT(nimaps == 1);
-
-               trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
-                               &imap);
+               /*
+                * The caller supposedly flushed all dirty pages in the source
+                * file range, which means that writeback should have allocated
+                * or deleted all delalloc reservations in that range.  If we
+                * find one, that's a good sign that something is seriously
+                * wrong here.
+                */
+               ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+               if (imap.br_startblock == DELAYSTARTBLOCK) {
+                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+                       error = -EFSCORRUPTED;
+                       break;
+               }
  
-               /* Translate imap into the destination file. */
-               range_len = imap.br_startoff + imap.br_blockcount - srcoff;
-               imap.br_startoff += destoff - srcoff;
+               trace_xfs_reflink_remap_extent_src(src, &imap);
  
-               /* Clear dest from destoff to the end of imap and map it in. */
-               error = xfs_reflink_remap_extent(dest, &imap, destoff,
-                               new_isize);
+               /* Remap into the destination file at the given offset. */
+               imap.br_startoff = destoff;
+               error = xfs_reflink_remap_extent(dest, &imap, new_isize);
                 if (error)
                         break;
  
@@ -1164,10 +1216,10 @@ xfs_reflink_remap_blocks(
                 }
  
                 /* Advance drange/srange */
-               srcoff += range_len;
-               destoff += range_len;
-               len -= range_len;
-               remapped_len += range_len;
+               srcoff += imap.br_blockcount;
+               destoff += imap.br_blockcount;
+               len -= imap.br_blockcount;
+               remapped_len += imap.br_blockcount;
         }
  
         if (error)
@@ -1177,81 +1229,6 @@ xfs_reflink_remap_blocks(
         return error;
  }
  
-/*
- * Grab the exclusive iolock for a data copy from src to dest, making sure to
- * abide vfs locking order (lowest pointer value goes first) and breaking the
- * layout leases before proceeding.  The loop is needed because we cannot call
- * the blocking break_layout() with the iolocks held, and therefore have to
- * back out both locks.
- */
-static int
-xfs_iolock_two_inodes_and_break_layout(
-       struct inode            *src,
-       struct inode            *dest)
-{
-       int                     error;
-
-       if (src > dest)
-               swap(src, dest);
-
-retry:
-       /* Wait to break both inodes' layouts before we start locking. */
-       error = break_layout(src, true);
-       if (error)
-               return error;
-       if (src != dest) {
-               error = break_layout(dest, true);
-               if (error)
-                       return error;
-       }
-
-       /* Lock one inode and make sure nobody got in and leased it. */
-       inode_lock(src);
-       error = break_layout(src, false);
-       if (error) {
-               inode_unlock(src);
-               if (error == -EWOULDBLOCK)
-                       goto retry;
-               return error;
-       }
-
-       if (src == dest)
-               return 0;
-
-       /* Lock the other inode and make sure nobody got in and leased it. */
-       inode_lock_nested(dest, I_MUTEX_NONDIR2);
-       error = break_layout(dest, false);
-       if (error) {
-               inode_unlock(src);
-               inode_unlock(dest);
-               if (error == -EWOULDBLOCK)
-                       goto retry;
-               return error;
-       }
-
-       return 0;
-}
-
-/* Unlock both inodes after they've been prepped for a range clone. */
-void
-xfs_reflink_remap_unlock(
-       struct file             *file_in,
-       struct file             *file_out)
-{
-       struct inode            *inode_in = file_inode(file_in);
-       struct xfs_inode        *src = XFS_I(inode_in);
-       struct inode            *inode_out = file_inode(file_out);
-       struct xfs_inode        *dest = XFS_I(inode_out);
-       bool                    same_inode = (inode_in == inode_out);
-
-       xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
-       if (!same_inode)
-               xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
-       inode_unlock(inode_out);
-       if (!same_inode)
-               inode_unlock(inode_in);
-}
-
  /*
   * If we're reflinking to a point past the destination file's EOF, we must
   * zero any speculative post-EOF preallocations that sit between the old EOF
@@ -1314,18 +1291,12 @@ xfs_reflink_remap_prep(
         struct xfs_inode        *src = XFS_I(inode_in);
         struct inode            *inode_out = file_inode(file_out);
         struct xfs_inode        *dest = XFS_I(inode_out);
-       bool                    same_inode = (inode_in == inode_out);
-       ssize_t                 ret;
+       int                     ret;
  
         /* Lock both files against IO */
-       ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
+       ret = xfs_ilock2_io_mmap(src, dest);
         if (ret)
                 return ret;
-       if (same_inode)
-               xfs_ilock(src, XFS_MMAPLOCK_EXCL);
-       else
-               xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
-                               XFS_MMAPLOCK_EXCL);
  
         /* Check file eligibility and prepare for block sharing. */
         ret = -EINVAL;
@@ -1339,7 +1310,7 @@ xfs_reflink_remap_prep(
  
         ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
                         len, remap_flags);
-       if (ret < 0 || *len == 0)
+       if (ret || *len == 0)
                 goto out_unlock;
  
         /* Attach dquots to dest inode before changing block map */
@@ -1374,9 +1345,9 @@ xfs_reflink_remap_prep(
         if (ret)
                 goto out_unlock;
  
-       return 1;
+       return 0;
  out_unlock:
-       xfs_reflink_remap_unlock(file_in, file_out);
+       xfs_iunlock2_io_mmap(src, dest);
         return ret;
  }
  
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h

index 3e4fd46..487b004 100644 (file)
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -56,7 +56,5 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
                 loff_t *remapped);
  extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
                 xfs_extlen_t cowextsize, unsigned int remap_flags);
-extern void xfs_reflink_remap_unlock(struct file *file_in,
-               struct file *file_out);
  
  #endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c

index a86599d..dc5b075 100644 (file)
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -141,7 +141,8 @@ xfs_rui_init(
         if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
                 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
         else
-               ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
+               ruip = kmem_cache_zalloc(xfs_rui_zone,
+                                        GFP_KERNEL | __GFP_NOFAIL);
  
         xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
         ruip->rui_format.rui_nextents = nextents;
@@ -243,7 +244,7 @@ xfs_trans_get_rud(
  {
         struct xfs_rud_log_item         *rudp;
  
-       rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
+       rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL);
         xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
                           &xfs_rud_item_ops);
         rudp->rud_ruip = ruip;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c

index 379cbff..71ac6c1 100644 (file)
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -890,9 +890,6 @@ xfs_quiesce_attr(
         /* force the log to unpin objects from the now complete transactions */
         xfs_log_force(mp, XFS_LOG_SYNC);
  
-       /* reclaim inodes to do any IO before the freeze completes */
-       xfs_reclaim_inodes(mp, 0);
-       xfs_reclaim_inodes(mp, SYNC_WAIT);
  
         /* Push the superblock and write an unmount record */
         error = xfs_log_sbcount(mp);
@@ -913,11 +910,21 @@ xfs_fs_freeze(
         struct super_block      *sb)
  {
         struct xfs_mount        *mp = XFS_M(sb);
+       unsigned int            flags;
+       int                     ret;
  
+       /*
+        * The filesystem is now frozen far enough that memory reclaim
+        * cannot safely operate on the filesystem. Hence we need to
+        * set a GFP_NOFS context here to avoid recursion deadlocks.
+        */
+       flags = memalloc_nofs_save();
         xfs_stop_block_reaping(mp);
         xfs_save_resvblks(mp);
         xfs_quiesce_attr(mp);
-       return xfs_sync_sb(mp, true);
+       ret = xfs_sync_sb(mp, true);
+       memalloc_nofs_restore(flags);
+       return ret;
  }
  
  STATIC int
@@ -1714,6 +1721,10 @@ xfs_fc_reconfigure(
         int                     flags = fc->sb_flags;
         int                     error;
  
+       /* version 5 superblocks always support version counters. */
+       if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+               fc->sb_flags |= SB_I_VERSION;
+
         error = xfs_fc_validate_params(new_mp);
         if (error)
                 return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index 4601366..abb1d85 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -36,6 +36,7 @@ struct xfs_owner_info;
  struct xfs_trans_res;
  struct xfs_inobt_rec_incore;
  union xfs_btree_ptr;
+struct xfs_dqtrx;
  
  #define XFS_ATTR_FILTER_FLAGS \
         { XFS_ATTR_ROOT,        "ROOT" }, \
@@ -864,44 +865,65 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
         TP_STRUCT__entry(
                 __field(dev_t, dev)
                 __field(u32, id)
+               __field(xfs_dqtype_t, type)
                 __field(unsigned, flags)
                 __field(unsigned, nrefs)
                 __field(unsigned long long, res_bcount)
+               __field(unsigned long long, res_rtbcount)
+               __field(unsigned long long, res_icount)
+
                 __field(unsigned long long, bcount)
+               __field(unsigned long long, rtbcount)
                 __field(unsigned long long, icount)
+
                 __field(unsigned long long, blk_hardlimit)
                 __field(unsigned long long, blk_softlimit)
+               __field(unsigned long long, rtb_hardlimit)
+               __field(unsigned long long, rtb_softlimit)
                 __field(unsigned long long, ino_hardlimit)
                 __field(unsigned long long, ino_softlimit)
-       ), \
+       ),
         TP_fast_assign(
                 __entry->dev = dqp->q_mount->m_super->s_dev;
-               __entry->id = be32_to_cpu(dqp->q_core.d_id);
-               __entry->flags = dqp->dq_flags;
+               __entry->id = dqp->q_id;
+               __entry->type = dqp->q_type;
+               __entry->flags = dqp->q_flags;
                 __entry->nrefs = dqp->q_nrefs;
-               __entry->res_bcount = dqp->q_res_bcount;
-               __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
-               __entry->icount = be64_to_cpu(dqp->q_core.d_icount);
-               __entry->blk_hardlimit =
-                       be64_to_cpu(dqp->q_core.d_blk_hardlimit);
-               __entry->blk_softlimit =
-                       be64_to_cpu(dqp->q_core.d_blk_softlimit);
-               __entry->ino_hardlimit =
-                       be64_to_cpu(dqp->q_core.d_ino_hardlimit);
-               __entry->ino_softlimit =
-                       be64_to_cpu(dqp->q_core.d_ino_softlimit);
-       ),
-       TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
+
+               __entry->res_bcount = dqp->q_blk.reserved;
+               __entry->res_rtbcount = dqp->q_rtb.reserved;
+               __entry->res_icount = dqp->q_ino.reserved;
+
+               __entry->bcount = dqp->q_blk.count;
+               __entry->rtbcount = dqp->q_rtb.count;
+               __entry->icount = dqp->q_ino.count;
+
+               __entry->blk_hardlimit = dqp->q_blk.hardlimit;
+               __entry->blk_softlimit = dqp->q_blk.softlimit;
+               __entry->rtb_hardlimit = dqp->q_rtb.hardlimit;
+               __entry->rtb_softlimit = dqp->q_rtb.softlimit;
+               __entry->ino_hardlimit = dqp->q_ino.hardlimit;
+               __entry->ino_softlimit = dqp->q_ino.softlimit;
+       ),
+       TP_printk("dev %d:%d id 0x%x type %s flags %s nrefs %u "
+                 "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx "
                   "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
+                 "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx "
                   "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
                   MAJOR(__entry->dev), MINOR(__entry->dev),
                   __entry->id,
-                 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
+                 __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+                 __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
                   __entry->nrefs,
                   __entry->res_bcount,
+                 __entry->res_rtbcount,
+                 __entry->res_icount,
                   __entry->bcount,
                   __entry->blk_hardlimit,
                   __entry->blk_softlimit,
+                 __entry->rtbcount,
+                 __entry->rtb_hardlimit,
+                 __entry->rtb_softlimit,
                   __entry->icount,
                   __entry->ino_hardlimit,
                   __entry->ino_softlimit)
@@ -932,6 +954,125 @@ DEFINE_DQUOT_EVENT(xfs_dqrele);
  DEFINE_DQUOT_EVENT(xfs_dqflush);
  DEFINE_DQUOT_EVENT(xfs_dqflush_force);
  DEFINE_DQUOT_EVENT(xfs_dqflush_done);
+DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
+DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
+/* Names for the XFS_QMOPT_* bits, consumed by __print_flags(). */
+#define XFS_QMOPT_FLAGS \
+       { XFS_QMOPT_UQUOTA,             "UQUOTA" }, \
+       { XFS_QMOPT_PQUOTA,             "PQUOTA" }, \
+       { XFS_QMOPT_FORCE_RES,          "FORCE_RES" }, \
+       { XFS_QMOPT_SBVERSION,          "SBVERSION" }, \
+       { XFS_QMOPT_GQUOTA,             "GQUOTA" }, \
+       { XFS_QMOPT_INHERIT,            "INHERIT" }, \
+       { XFS_QMOPT_RES_REGBLKS,        "RES_REGBLKS" }, \
+       { XFS_QMOPT_RES_RTBLKS,         "RES_RTBLKS" }, \
+       { XFS_QMOPT_BCOUNT,             "BCOUNT" }, \
+       { XFS_QMOPT_ICOUNT,             "ICOUNT" }, \
+       { XFS_QMOPT_RTBCOUNT,           "RTBCOUNT" }, \
+       { XFS_QMOPT_DELBCOUNT,          "DELBCOUNT" }, \
+       { XFS_QMOPT_DELRTBCOUNT,        "DELRTBCOUNT" }, \
+       { XFS_QMOPT_RES_INOS,           "RES_INOS" }
+/* Record the dquot identity plus the field selector and delta it is given. */
+TRACE_EVENT(xfs_trans_mod_dquot,
+       TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
+                unsigned int field, int64_t delta),
+       TP_ARGS(tp, dqp, field, delta),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_dqtype_t, type)
+               __field(unsigned int, flags)
+               __field(unsigned int, dqid)
+               __field(unsigned int, field)
+               __field(int64_t, delta)
+       ),
+       TP_fast_assign(
+               __entry->dev = tp->t_mountp->m_super->s_dev;
+               __entry->type = dqp->q_type;
+               __entry->flags = dqp->q_flags;
+               __entry->dqid = dqp->q_id;
+               __entry->field = field;
+               __entry->delta = delta;
+       ),
+       TP_printk("dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->dqid,
+                 __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+                 __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
+                 __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS),
+                 __entry->delta)
+);
+/* Trace the per-transaction quota deltas held in one struct xfs_dqtrx. */
+DECLARE_EVENT_CLASS(xfs_dqtrx_class,
+       TP_PROTO(struct xfs_dqtrx *qtrx),
+       TP_ARGS(qtrx),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_dqtype_t, type)
+               __field(unsigned int, flags)
+               __field(u32, dqid)
+               /* blk: reservation plus bcount/delbcnt deltas */
+               __field(uint64_t, blk_res)
+               __field(int64_t,  bcount_delta)
+               __field(int64_t,  delbcnt_delta)
+               /* rtblk: reservation, amount used, and deltas */
+               __field(uint64_t, rtblk_res)
+               __field(uint64_t, rtblk_res_used)
+               __field(int64_t,  rtbcount_delta)
+               __field(int64_t,  delrtb_delta)
+               /* ino: reservation, amount used, and icount delta */
+               __field(uint64_t, ino_res)
+               __field(uint64_t, ino_res_used)
+               __field(int64_t,  icount_delta)
+       ),
+       TP_fast_assign(
+               __entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev;
+               __entry->type = qtrx->qt_dquot->q_type;
+               __entry->flags = qtrx->qt_dquot->q_flags;
+               __entry->dqid = qtrx->qt_dquot->q_id;
+
+               __entry->blk_res = qtrx->qt_blk_res;
+               __entry->bcount_delta = qtrx->qt_bcount_delta;
+               __entry->delbcnt_delta = qtrx->qt_delbcnt_delta;
+
+               __entry->rtblk_res = qtrx->qt_rtblk_res;
+               __entry->rtblk_res_used = qtrx->qt_rtblk_res_used;
+               __entry->rtbcount_delta = qtrx->qt_rtbcount_delta;
+               __entry->delrtb_delta = qtrx->qt_delrtb_delta;
+
+               __entry->ino_res = qtrx->qt_ino_res;
+               __entry->ino_res_used = qtrx->qt_ino_res_used;
+               __entry->icount_delta = qtrx->qt_icount_delta;
+       ),
+       TP_printk("dev %d:%d dquot id 0x%x type %s flags %s "
+                 "blk_res %llu bcount_delta %lld delbcnt_delta %lld "
+                 "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld "
+                 "ino_res %llu ino_res_used %llu icount_delta %lld",
+               MAJOR(__entry->dev), MINOR(__entry->dev),
+               __entry->dqid,
+                 __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+                 __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
+
+               __entry->blk_res,
+               __entry->bcount_delta,
+               __entry->delbcnt_delta,
+
+               __entry->rtblk_res,
+               __entry->rtblk_res_used,
+               __entry->rtbcount_delta,
+               __entry->delrtb_delta,
+
+               __entry->ino_res,
+               __entry->ino_res_used,
+               __entry->icount_delta)
+)
+/* Define a tracepoint called @name that uses the xfs_dqtrx_class layout. */
+#define DEFINE_DQTRX_EVENT(name) \
+DEFINE_EVENT(xfs_dqtrx_class, name, \
+       TP_PROTO(struct xfs_dqtrx *qtrx), \
+       TP_ARGS(qtrx))
+DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after);
  
  DECLARE_EVENT_CLASS(xfs_loggrant_class,
         TP_PROTO(struct xlog *log, struct xlog_ticket *tic),
@@ -3052,8 +3193,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
  DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
  DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
  DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
-TRACE_EVENT(xfs_reflink_remap_blocks_loop,
+TRACE_EVENT(xfs_reflink_remap_blocks,
         TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
                  xfs_filblks_t len, struct xfs_inode *dest,
                  xfs_fileoff_t doffset),
@@ -3084,59 +3224,14 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loop,
                   __entry->dest_ino,
                   __entry->dest_lblk)
  );
-TRACE_EVENT(xfs_reflink_punch_range,
-       TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
-                xfs_extlen_t len),
-       TP_ARGS(ip, lblk, len),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_fileoff_t, lblk)
-               __field(xfs_extlen_t, len)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->lblk = lblk;
-               __entry->len = len;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->lblk,
-                 __entry->len)
-);
-TRACE_EVENT(xfs_reflink_remap,
-       TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
-                xfs_extlen_t len, xfs_fsblock_t new_pblk),
-       TP_ARGS(ip, lblk, len, new_pblk),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_fileoff_t, lblk)
-               __field(xfs_extlen_t, len)
-               __field(xfs_fsblock_t, new_pblk)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->lblk = lblk;
-               __entry->len = len;
-               __entry->new_pblk = new_pblk;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->lblk,
-                 __entry->len,
-                 __entry->new_pblk)
-);
  DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
  DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
  DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
  DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
  DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
  DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest);
  
  /* dedupe tracepoints */
  DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
@@ -3582,7 +3677,6 @@ DEFINE_KMEM_EVENT(kmem_alloc);
  DEFINE_KMEM_EVENT(kmem_alloc_io);
  DEFINE_KMEM_EVENT(kmem_alloc_large);
  DEFINE_KMEM_EVENT(kmem_realloc);
-DEFINE_KMEM_EVENT(kmem_zone_alloc);
  
  TRACE_EVENT(xfs_check_new_dalign,
         TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c

index 3c94e5f..ed72867 100644 (file)
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -90,7 +90,7 @@ xfs_trans_dup(
  
         trace_xfs_trans_dup(tp, _RET_IP_);
  
-       ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
+       ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
  
         /*
          * Initialize the new transaction structure.
@@ -107,7 +107,8 @@ xfs_trans_dup(
  
         ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
                        (tp->t_flags & XFS_TRANS_RESERVE) |
-                      (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
+                      (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) |
+                      (tp->t_flags & XFS_TRANS_RES_FDBLKS);
         /* We gave our writer reference to the new transaction */
         tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
         ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
@@ -262,7 +263,7 @@ xfs_trans_alloc(
          * GFP_NOFS allocation context so that we avoid lockdep false positives
          * by doing GFP_KERNEL allocations inside sb_start_intwrite().
          */
-       tp = kmem_zone_zalloc(xfs_trans_zone, 0);
+       tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
         if (!(flags & XFS_TRANS_NO_WRITECOUNT))
                 sb_start_intwrite(mp->m_super);
  
@@ -272,6 +273,8 @@ xfs_trans_alloc(
          */
         WARN_ON(resp->tr_logres > 0 &&
                 mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+       ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
+              xfs_sb_version_haslazysbcount(&mp->m_sb));
  
         tp->t_magic = XFS_TRANS_HEADER_MAGIC;
         tp->t_flags = flags;
@@ -365,6 +368,20 @@ xfs_trans_mod_sb(
                         tp->t_blk_res_used += (uint)-delta;
                         if (tp->t_blk_res_used > tp->t_blk_res)
                                 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) {
+                       int64_t blkres_delta;
+
+                       /*
+                        * Return freed blocks directly to the reservation
+                        * instead of the global pool, being careful not to
+                        * overflow the trans counter. This is used to preserve
+                        * reservation across chains of transaction rolls that
+                        * repeatedly free and allocate blocks.
+                        */
+                       blkres_delta = min_t(int64_t, delta,
+                                            UINT_MAX - tp->t_blk_res);
+                       tp->t_blk_res += blkres_delta;
+                       delta -= blkres_delta;
                 }
                 tp->t_fdblocks_delta += delta;
                 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h

index 8308bf6..b752501 100644 (file)
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -37,10 +37,6 @@ struct xfs_log_item {
         unsigned long                   li_flags;       /* misc flags */
         struct xfs_buf                  *li_buf;        /* real buffer pointer */
         struct list_head                li_bio_list;    /* buffer item list */
-       void                            (*li_cb)(struct xfs_buf *,
-                                                struct xfs_log_item *);
-                                                       /* buffer item iodone */
-                                                       /* callback func */
         const struct xfs_item_ops       *li_ops;        /* function list */
  
         /* delayed logging */
@@ -78,7 +74,6 @@ struct xfs_item_ops {
         void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
         void (*iop_release)(struct xfs_log_item *);
         xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
-       void (*iop_error)(struct xfs_log_item *, xfs_buf_t *);
         int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp);
         bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
  };
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c

index ac50193..0c783d3 100644 (file)
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -377,8 +377,12 @@ xfsaild_resubmit_item(
         }
  
         /* protected by ail_lock */
-       list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
-               xfs_clear_li_failed(lip);
+       list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+               if (bp->b_flags & _XBF_INODES)
+                       clear_bit(XFS_LI_FAILED, &lip->li_flags);
+               else
+                       xfs_clear_li_failed(lip);
+       }
  
         xfs_buf_unlock(bp);
         return XFS_ITEM_SUCCESS;
@@ -444,16 +448,10 @@ xfsaild_push(
         target = ailp->ail_target;
         ailp->ail_target_prev = target;
  
+       /* we're done if the AIL is empty or our push has reached the end */
         lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
-       if (!lip) {
-               /*
-                * If the AIL is empty or our push has reached the end we are
-                * done now.
-                */
-               xfs_trans_ail_cursor_done(&cur);
-               spin_unlock(&ailp->ail_lock);
+       if (!lip)
                 goto out_done;
-       }
  
         XFS_STATS_INC(mp, xs_push_ail);
  
@@ -535,6 +533,8 @@ xfsaild_push(
                         break;
                 lsn = lip->li_lsn;
         }
+
+out_done:
         xfs_trans_ail_cursor_done(&cur);
         spin_unlock(&ailp->ail_lock);
  
@@ -542,7 +542,6 @@ xfsaild_push(
                 ailp->ail_log_flush++;
  
         if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
-out_done:
                 /*
                  * We reached the target or the AIL is empty, so wait a bit
                  * longer for I/O to complete and remove pushed items from the
@@ -634,7 +633,8 @@ xfsaild(
                  */
                 smp_rmb();
                 if (!xfs_ail_min(ailp) &&
-                   ailp->ail_target == ailp->ail_target_prev) {
+                   ailp->ail_target == ailp->ail_target_prev &&
+                   list_empty(&ailp->ail_buf_list)) {
                         spin_unlock(&ailp->ail_lock);
                         freezable_schedule();
                         tout = 0;
@@ -843,7 +843,6 @@ xfs_ail_delete_one(
  
         trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
         xfs_ail_delete(ailp, lip);
-       xfs_clear_li_failed(lip);
         clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
         lip->li_lsn = 0;
  
@@ -874,6 +873,7 @@ xfs_trans_ail_delete(
         }
  
         /* xfs_ail_update_finish() drops the AIL lock */
+       xfs_clear_li_failed(lip);
         tail_lsn = xfs_ail_delete_one(ailp, lip);
         xfs_ail_update_finish(ailp, tail_lsn);
  }
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c

index 08174ff..11cd666 100644 (file)
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -465,24 +465,16 @@ xfs_trans_dirty_buf(
  
         ASSERT(bp->b_transp == tp);
         ASSERT(bip != NULL);
-       ASSERT(bp->b_iodone == NULL ||
-              bp->b_iodone == xfs_buf_iodone_callbacks);
  
         /*
          * Mark the buffer as needing to be written out eventually,
          * and set its iodone function to remove the buffer's buf log
          * item from the AIL and free it when the buffer is flushed
-        * to disk.  See xfs_buf_attach_iodone() for more details
-        * on li_cb and xfs_buf_iodone_callbacks().
-        * If we end up aborting this transaction, we trap this buffer
-        * inside the b_bdstrat callback so that this won't get written to
-        * disk.
+        * to disk.
          */
         bp->b_flags |= XBF_DONE;
  
         ASSERT(atomic_read(&bip->bli_refcount) > 0);
-       bp->b_iodone = xfs_buf_iodone_callbacks;
-       bip->bli_item.li_cb = xfs_buf_iodone;
  
         /*
          * If we invalidated the buffer within this transaction, then
@@ -626,6 +618,7 @@ xfs_trans_inode_buf(
         ASSERT(atomic_read(&bip->bli_refcount) > 0);
  
         bip->bli_flags |= XFS_BLI_INODE_BUF;
+       bp->b_flags |= _XBF_INODES;
         xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
  }
  
@@ -650,7 +643,7 @@ xfs_trans_stale_inode_buf(
         ASSERT(atomic_read(&bip->bli_refcount) > 0);
  
         bip->bli_flags |= XFS_BLI_STALE_INODE;
-       bip->bli_item.li_cb = xfs_buf_iodone;
+       bp->b_flags |= _XBF_INODES;
         xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
  }
  
@@ -675,6 +668,7 @@ xfs_trans_inode_alloc_buf(
         ASSERT(atomic_read(&bip->bli_refcount) > 0);
  
         bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+       bp->b_flags |= _XBF_INODES;
         xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
  }
  
@@ -785,5 +779,6 @@ xfs_trans_dquot_buf(
                 break;
         }
  
+       bp->b_flags |= _XBF_DQUOTS;
         xfs_trans_buf_set_type(tp, bp, type);
  }
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c

index c0f73b8..c6ba7ef 100644 (file)
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -15,6 +15,7 @@
  #include "xfs_trans_priv.h"
  #include "xfs_quota.h"
  #include "xfs_qm.h"
+#include "xfs_trace.h"
  
  STATIC void    xfs_trans_alloc_dqinfo(xfs_trans_t *);
  
@@ -155,14 +156,19 @@ xfs_trans_get_dqtrx(
         int                     i;
         struct xfs_dqtrx        *qa;
  
-       if (XFS_QM_ISUDQ(dqp))
+       switch (xfs_dquot_type(dqp)) {
+       case XFS_DQTYPE_USER:
                 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
-       else if (XFS_QM_ISGDQ(dqp))
+               break;
+       case XFS_DQTYPE_GROUP:
                 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
-       else if (XFS_QM_ISPDQ(dqp))
+               break;
+       case XFS_DQTYPE_PROJ:
                 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
-       else
+               break;
+       default:
                 return NULL;
+       }
  
         for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
                 if (qa[i].qt_dquot == NULL ||
@@ -203,6 +209,11 @@ xfs_trans_mod_dquot(
         if (qtrx->qt_dquot == NULL)
                 qtrx->qt_dquot = dqp;
  
+       if (delta) {
+               trace_xfs_trans_mod_dquot_before(qtrx);
+               trace_xfs_trans_mod_dquot(tp, dqp, field, delta);
+       }
+
         switch (field) {
  
                 /*
@@ -266,6 +277,10 @@ xfs_trans_mod_dquot(
               default:
                 ASSERT(0);
         }
+
+       if (delta)
+               trace_xfs_trans_mod_dquot_after(qtrx);
+
         tp->t_flags |= XFS_TRANS_DQ_DIRTY;
  }
  
@@ -293,6 +308,37 @@ xfs_trans_dqlockedjoin(
         }
  }
  
+/* Adjust @res->reserved using one resource's dqtrx reservation and deltas. */
+static inline void
+xfs_apply_quota_reservation_deltas(
+       struct xfs_dquot_res    *res,
+       uint64_t                reserved,
+       int64_t                 res_used,
+       int64_t                 count_delta)
+{
+       if (reserved != 0) {
+               /*
+                * Subtle math here: If reserved > res_used (the normal case),
+                * we're simply subtracting the unused transaction quota
+                * reservation from the dquot reservation.
+                *
+                * If, however, res_used > reserved, then we have allocated
+                * more quota blocks than were reserved for the transaction.
+                * We must add that excess to the dquot reservation since it
+                * tracks (usage + resv) and by definition we didn't reserve
+                * that excess.
+                */
+               res->reserved -= abs(reserved - res_used);
+       } else if (count_delta != 0) {
+               /*
+                * These blks were never reserved, either inside a transaction
+                * or outside one (in a delayed allocation). Also, this isn't
+                * always a negative number since we sometimes deliberately
+                * skip quota reservations.
+                */
+               res->reserved += count_delta;
+       }
+}
  
  /*
   * Called by xfs_trans_commit() and similar in spirit to
@@ -309,7 +355,6 @@ xfs_trans_apply_dquot_deltas(
         int                     i, j;
         struct xfs_dquot        *dqp;
         struct xfs_dqtrx        *qtrx, *qa;
-       struct xfs_disk_dquot   *d;
         int64_t                 totalbdelta;
         int64_t                 totalrtbdelta;
  
@@ -328,6 +373,8 @@ xfs_trans_apply_dquot_deltas(
                 xfs_trans_dqlockedjoin(tp, qa);
  
                 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+                       uint64_t        blk_res_used;
+
                         qtrx = &qa[i];
                         /*
                          * The array of dquots is filled
@@ -341,7 +388,6 @@ xfs_trans_apply_dquot_deltas(
                         /*
                          * adjust the actual number of blocks used
                          */
-                       d = &dqp->q_core;
  
                         /*
                          * The issue here is - sometimes we don't make a blkquota
@@ -360,38 +406,46 @@ xfs_trans_apply_dquot_deltas(
                                 qtrx->qt_delbcnt_delta;
                         totalrtbdelta = qtrx->qt_rtbcount_delta +
                                 qtrx->qt_delrtb_delta;
+
+                       if (totalbdelta != 0 || totalrtbdelta != 0 ||
+                           qtrx->qt_icount_delta != 0) {
+                               trace_xfs_trans_apply_dquot_deltas_before(dqp);
+                               trace_xfs_trans_apply_dquot_deltas(qtrx);
+                       }
+
  #ifdef DEBUG
                         if (totalbdelta < 0)
-                               ASSERT(be64_to_cpu(d->d_bcount) >=
-                                      -totalbdelta);
+                               ASSERT(dqp->q_blk.count >= -totalbdelta);
  
                         if (totalrtbdelta < 0)
-                               ASSERT(be64_to_cpu(d->d_rtbcount) >=
-                                      -totalrtbdelta);
+                               ASSERT(dqp->q_rtb.count >= -totalrtbdelta);
  
                         if (qtrx->qt_icount_delta < 0)
-                               ASSERT(be64_to_cpu(d->d_icount) >=
-                                      -qtrx->qt_icount_delta);
+                               ASSERT(dqp->q_ino.count >= -qtrx->qt_icount_delta);
  #endif
                         if (totalbdelta)
-                               be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
+                               dqp->q_blk.count += totalbdelta;
  
                         if (qtrx->qt_icount_delta)
-                               be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
+                               dqp->q_ino.count += qtrx->qt_icount_delta;
  
                         if (totalrtbdelta)
-                               be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
+                               dqp->q_rtb.count += totalrtbdelta;
+
+                       if (totalbdelta != 0 || totalrtbdelta != 0 ||
+                           qtrx->qt_icount_delta != 0)
+                               trace_xfs_trans_apply_dquot_deltas_after(dqp);
  
                         /*
                          * Get any default limits in use.
                          * Start/reset the timer(s) if needed.
                          */
-                       if (d->d_id) {
-                               xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
-                               xfs_qm_adjust_dqtimers(tp->t_mountp, dqp);
+                       if (dqp->q_id) {
+                               xfs_qm_adjust_dqlimits(dqp);
+                               xfs_qm_adjust_dqtimers(dqp);
                         }
  
-                       dqp->dq_flags |= XFS_DQ_DIRTY;
+                       dqp->q_flags |= XFS_DQFLAG_DIRTY;
                         /*
                          * add this to the list of items to get logged
                          */
@@ -401,78 +455,31 @@ xfs_trans_apply_dquot_deltas(
                          * In case of delayed allocations, there's no
                          * reservation that a transaction structure knows of.
                          */
-                       if (qtrx->qt_blk_res != 0) {
-                               uint64_t        blk_res_used = 0;
-
-                               if (qtrx->qt_bcount_delta > 0)
-                                       blk_res_used = qtrx->qt_bcount_delta;
-
-                               if (qtrx->qt_blk_res != blk_res_used) {
-                                       if (qtrx->qt_blk_res > blk_res_used)
-                                               dqp->q_res_bcount -= (xfs_qcnt_t)
-                                                       (qtrx->qt_blk_res -
-                                                        blk_res_used);
-                                       else
-                                               dqp->q_res_bcount -= (xfs_qcnt_t)
-                                                       (blk_res_used -
-                                                        qtrx->qt_blk_res);
-                               }
-                       } else {
-                               /*
-                                * These blks were never reserved, either inside
-                                * a transaction or outside one (in a delayed
-                                * allocation). Also, this isn't always a
-                                * negative number since we sometimes
-                                * deliberately skip quota reservations.
-                                */
-                               if (qtrx->qt_bcount_delta) {
-                                       dqp->q_res_bcount +=
-                                             (xfs_qcnt_t)qtrx->qt_bcount_delta;
-                               }
-                       }
+                       blk_res_used = max_t(int64_t, 0, qtrx->qt_bcount_delta);
+                       xfs_apply_quota_reservation_deltas(&dqp->q_blk,
+                                       qtrx->qt_blk_res, blk_res_used,
+                                       qtrx->qt_bcount_delta);
+
                         /*
                          * Adjust the RT reservation.
                          */
-                       if (qtrx->qt_rtblk_res != 0) {
-                               if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
-                                       if (qtrx->qt_rtblk_res >
-                                           qtrx->qt_rtblk_res_used)
-                                              dqp->q_res_rtbcount -= (xfs_qcnt_t)
-                                                      (qtrx->qt_rtblk_res -
-                                                       qtrx->qt_rtblk_res_used);
-                                       else
-                                              dqp->q_res_rtbcount -= (xfs_qcnt_t)
-                                                      (qtrx->qt_rtblk_res_used -
-                                                       qtrx->qt_rtblk_res);
-                               }
-                       } else {
-                               if (qtrx->qt_rtbcount_delta)
-                                       dqp->q_res_rtbcount +=
-                                           (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
-                       }
+                       xfs_apply_quota_reservation_deltas(&dqp->q_rtb,
+                                       qtrx->qt_rtblk_res,
+                                       qtrx->qt_rtblk_res_used,
+                                       qtrx->qt_rtbcount_delta);
  
                         /*
                          * Adjust the inode reservation.
                          */
-                       if (qtrx->qt_ino_res != 0) {
-                               ASSERT(qtrx->qt_ino_res >=
-                                      qtrx->qt_ino_res_used);
-                               if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
-                                       dqp->q_res_icount -= (xfs_qcnt_t)
-                                               (qtrx->qt_ino_res -
-                                                qtrx->qt_ino_res_used);
-                       } else {
-                               if (qtrx->qt_icount_delta)
-                                       dqp->q_res_icount +=
-                                           (xfs_qcnt_t)qtrx->qt_icount_delta;
-                       }
-
-                       ASSERT(dqp->q_res_bcount >=
-                               be64_to_cpu(dqp->q_core.d_bcount));
-                       ASSERT(dqp->q_res_icount >=
-                               be64_to_cpu(dqp->q_core.d_icount));
-                       ASSERT(dqp->q_res_rtbcount >=
-                               be64_to_cpu(dqp->q_core.d_rtbcount));
+                       ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+                       xfs_apply_quota_reservation_deltas(&dqp->q_ino,
+                                       qtrx->qt_ino_res,
+                                       qtrx->qt_ino_res_used,
+                                       qtrx->qt_icount_delta);
+
+                       ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+                       ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
+                       ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
                 }
         }
  }
@@ -516,7 +523,7 @@ xfs_trans_unreserve_and_mod_dquots(
                         if (qtrx->qt_blk_res) {
                                 xfs_dqlock(dqp);
                                 locked = true;
-                               dqp->q_res_bcount -=
+                               dqp->q_blk.reserved -=
                                         (xfs_qcnt_t)qtrx->qt_blk_res;
                         }
                         if (qtrx->qt_ino_res) {
@@ -524,7 +531,7 @@ xfs_trans_unreserve_and_mod_dquots(
                                         xfs_dqlock(dqp);
                                         locked = true;
                                 }
-                               dqp->q_res_icount -=
+                               dqp->q_ino.reserved -=
                                         (xfs_qcnt_t)qtrx->qt_ino_res;
                         }
  
@@ -533,7 +540,7 @@ xfs_trans_unreserve_and_mod_dquots(
                                         xfs_dqlock(dqp);
                                         locked = true;
                                 }
-                               dqp->q_res_rtbcount -=
+                               dqp->q_rtb.reserved -=
                                         (xfs_qcnt_t)qtrx->qt_rtblk_res;
                         }
                         if (locked)
@@ -549,20 +556,79 @@ xfs_quota_warn(
         struct xfs_dquot        *dqp,
         int                     type)
  {
-       enum quota_type qtype;
+       enum quota_type         qtype;
  
-       if (dqp->dq_flags & XFS_DQ_PROJ)
+       switch (xfs_dquot_type(dqp)) {
+       case XFS_DQTYPE_PROJ:
                 qtype = PRJQUOTA;
-       else if (dqp->dq_flags & XFS_DQ_USER)
+               break;
+       case XFS_DQTYPE_USER:
                 qtype = USRQUOTA;
-       else
+               break;
+       case XFS_DQTYPE_GROUP:
                 qtype = GRPQUOTA;
+               break;
+       default:
+               return;
+       }
  
-       quota_send_warning(make_kqid(&init_user_ns, qtype,
-                                    be32_to_cpu(dqp->q_core.d_id)),
+       quota_send_warning(make_kqid(&init_user_ns, qtype, dqp->q_id),
                            mp->m_super->s_dev, type);
  }
  
+/*
+ * Decide if we can make an additional reservation against a quota resource.
+ * Returns an inode QUOTA_NL_ warning code and sets *fatal to say whether the
+ * overrun is fatal. A non-fatal soft limit overrun also bumps res->warnings.
+ *
+ * The BUILD_BUG_ONs check block code == inode code + 3 for all three codes.
+ * A delta <= 0 returns QUOTA_NL_NOWARN, so the *BELOW codes never come up.
+ */
+static inline int
+xfs_dqresv_check(
+       struct xfs_dquot_res    *res,
+       struct xfs_quota_limits *qlim,
+       int64_t                 delta,
+       bool                    *fatal)
+{
+       xfs_qcnt_t              hardlimit = res->hardlimit;
+       xfs_qcnt_t              softlimit = res->softlimit;
+       xfs_qcnt_t              total_count = res->reserved + delta;
+
+       BUILD_BUG_ON(QUOTA_NL_BHARDWARN     != QUOTA_NL_IHARDWARN + 3);
+       BUILD_BUG_ON(QUOTA_NL_BSOFTLONGWARN != QUOTA_NL_ISOFTLONGWARN + 3);
+       BUILD_BUG_ON(QUOTA_NL_BSOFTWARN     != QUOTA_NL_ISOFTWARN + 3);
+
+       *fatal = false;
+       if (delta <= 0)
+               return QUOTA_NL_NOWARN;
+
+       if (!hardlimit)
+               hardlimit = qlim->hard;
+       if (!softlimit)
+               softlimit = qlim->soft;
+
+       if (hardlimit && total_count > hardlimit) {
+               *fatal = true;
+               return QUOTA_NL_IHARDWARN;
+       }
+
+       if (softlimit && total_count > softlimit) {
+               time64_t        now = ktime_get_real_seconds();
+
+               if ((res->timer != 0 && now > res->timer) ||
+                   (res->warnings != 0 && res->warnings >= qlim->warn)) {
+                       *fatal = true;
+                       return QUOTA_NL_ISOFTLONGWARN;
+               }
+
+               res->warnings++;
+               return QUOTA_NL_ISOFTWARN;
+       }
+
+       return QUOTA_NL_NOWARN;
+}
+
  /*
   * This reserves disk blocks and inodes against a dquot.
   * Flags indicate if the dquot is to be locked here and also
@@ -578,110 +644,58 @@ xfs_trans_dqresv(
         long                    ninos,
         uint                    flags)
  {
-       xfs_qcnt_t              hardlimit;
-       xfs_qcnt_t              softlimit;
-       time64_t                timer;
-       xfs_qwarncnt_t          warns;
-       xfs_qwarncnt_t          warnlimit;
-       xfs_qcnt_t              total_count;
-       xfs_qcnt_t              *resbcountp;
         struct xfs_quotainfo    *q = mp->m_quotainfo;
         struct xfs_def_quota    *defq;
-
+       struct xfs_dquot_res    *blkres;
+       struct xfs_quota_limits *qlim;
  
         xfs_dqlock(dqp);
  
         defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
  
         if (flags & XFS_TRANS_DQ_RES_BLKS) {
-               hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
-               if (!hardlimit)
-                       hardlimit = defq->bhardlimit;
-               softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
-               if (!softlimit)
-                       softlimit = defq->bsoftlimit;
-               timer = be32_to_cpu(dqp->q_core.d_btimer);
-               warns = be16_to_cpu(dqp->q_core.d_bwarns);
-               warnlimit = defq->bwarnlimit;
-               resbcountp = &dqp->q_res_bcount;
+               blkres = &dqp->q_blk;
+               qlim = &defq->blk;
         } else {
-               ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
-               hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
-               if (!hardlimit)
-                       hardlimit = defq->rtbhardlimit;
-               softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
-               if (!softlimit)
-                       softlimit = defq->rtbsoftlimit;
-               timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
-               warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
-               warnlimit = defq->rtbwarnlimit;
-               resbcountp = &dqp->q_res_rtbcount;
+               blkres = &dqp->q_rtb;
+               qlim = &defq->rtb;
         }
  
-       if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
-           dqp->q_core.d_id &&
-           ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
-            (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
-            (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
-               if (nblks > 0) {
+       if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id &&
+           xfs_dquot_is_enforced(dqp)) {
+               int             quota_nl;
+               bool            fatal;
+
+               /*
+                * dquot is locked already. See if we'd go over the hardlimit
+                * or exceed the timelimit if we'd reserve resources.
+                */
+               quota_nl = xfs_dqresv_check(blkres, qlim, nblks, &fatal);
+               if (quota_nl != QUOTA_NL_NOWARN) {
                         /*
-                        * dquot is locked already. See if we'd go over the
-                        * hardlimit or exceed the timelimit if we allocate
-                        * nblks.
+                        * Quota block warning codes are 3 more than the inode
+                        * codes, which we check above.
                          */
-                       total_count = *resbcountp + nblks;
-                       if (hardlimit && total_count > hardlimit) {
-                               xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
+                       xfs_quota_warn(mp, dqp, quota_nl + 3);
+                       if (fatal)
                                 goto error_return;
-                       }
-                       if (softlimit && total_count > softlimit) {
-                               if ((timer != 0 &&
-                                    ktime_get_real_seconds() > timer) ||
-                                   (warns != 0 && warns >= warnlimit)) {
-                                       xfs_quota_warn(mp, dqp,
-                                                      QUOTA_NL_BSOFTLONGWARN);
-                                       goto error_return;
-                               }
-
-                               xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
-                       }
                 }
-               if (ninos > 0) {
-                       total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
-                       timer = be32_to_cpu(dqp->q_core.d_itimer);
-                       warns = be16_to_cpu(dqp->q_core.d_iwarns);
-                       warnlimit = defq->iwarnlimit;
-                       hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
-                       if (!hardlimit)
-                               hardlimit = defq->ihardlimit;
-                       softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
-                       if (!softlimit)
-                               softlimit = defq->isoftlimit;
-
-                       if (hardlimit && total_count > hardlimit) {
-                               xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
+
+               quota_nl = xfs_dqresv_check(&dqp->q_ino, &defq->ino, ninos,
+                               &fatal);
+               if (quota_nl != QUOTA_NL_NOWARN) {
+                       xfs_quota_warn(mp, dqp, quota_nl);
+                       if (fatal)
                                 goto error_return;
-                       }
-                       if (softlimit && total_count > softlimit) {
-                               if  ((timer != 0 &&
-                                     ktime_get_real_seconds() > timer) ||
-                                    (warns != 0 && warns >= warnlimit)) {
-                                       xfs_quota_warn(mp, dqp,
-                                                      QUOTA_NL_ISOFTLONGWARN);
-                                       goto error_return;
-                               }
-                               xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
-                       }
                 }
         }
  
         /*
          * Change the reservation, but not the actual usage.
-        * Note that q_res_bcount = q_core.d_bcount + resv
+        * Note that q_blk.reserved = q_blk.count + resv
          */
-       (*resbcountp) += (xfs_qcnt_t)nblks;
-       if (ninos != 0)
-               dqp->q_res_icount += (xfs_qcnt_t)ninos;
+       blkres->reserved += (xfs_qcnt_t)nblks;
+       dqp->q_ino.reserved += (xfs_qcnt_t)ninos;
  
         /*
          * note the reservation amt in the trans struct too,
@@ -702,16 +716,16 @@ xfs_trans_dqresv(
                                             XFS_TRANS_DQ_RES_INOS,
                                             ninos);
         }
-       ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
-       ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
-       ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+       ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+       ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+       ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
  
         xfs_dqunlock(dqp);
         return 0;
  
  error_return:
         xfs_dqunlock(dqp);
-       if (XFS_QM_ISPDQ(dqp))
+       if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ)
                 return -ENOSPC;
         return -EDQUOT;
  }
@@ -860,7 +874,8 @@ STATIC void
  xfs_trans_alloc_dqinfo(
         xfs_trans_t     *tp)
  {
-       tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
+       tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone,
+                                        GFP_KERNEL | __GFP_NOFAIL);
  }
  
  void
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 7 Aug 2020 17:57:29 +0000 (10:57 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 7 Aug 2020 17:57:29 +0000 (10:57 -0700)
fs/xfs/kmem.c		patch \| blob \| history
fs/xfs/kmem.h		patch \| blob \| history
fs/xfs/libxfs/xfs_ag.c		patch \| blob \| history
fs/xfs/libxfs/xfs_ag_resv.h		patch \| blob \| history
fs/xfs/libxfs/xfs_alloc.c		patch \| blob \| history
fs/xfs/libxfs/xfs_alloc_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_attr.c		patch \| blob \| history
fs/xfs/libxfs/xfs_attr.h		patch \| blob \| history
fs/xfs/libxfs/xfs_attr_leaf.c		patch \| blob \| history
fs/xfs/libxfs/xfs_attr_leaf.h		patch \| blob \| history
fs/xfs/libxfs/xfs_attr_remote.c		patch \| blob \| history
fs/xfs/libxfs/xfs_attr_remote.h		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap.c		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap.h		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_btree_staging.h		patch \| blob \| history
fs/xfs/libxfs/xfs_da_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_da_btree.h		patch \| blob \| history
fs/xfs/libxfs/xfs_dir2_node.c		patch \| blob \| history
fs/xfs/libxfs/xfs_dquot_buf.c		patch \| blob \| history
fs/xfs/libxfs/xfs_format.h		patch \| blob \| history
fs/xfs/libxfs/xfs_ialloc.c		patch \| blob \| history
fs/xfs/libxfs/xfs_ialloc_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_inode_buf.c		patch \| blob \| history
fs/xfs/libxfs/xfs_inode_buf.h		patch \| blob \| history
fs/xfs/libxfs/xfs_inode_fork.c		patch \| blob \| history
fs/xfs/libxfs/xfs_quota_defs.h		patch \| blob \| history
fs/xfs/libxfs/xfs_refcount_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_rmap_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_rtbitmap.c		patch \| blob \| history
fs/xfs/libxfs/xfs_shared.h		patch \| blob \| history
fs/xfs/libxfs/xfs_trans_inode.c		patch \| blob \| history
fs/xfs/libxfs/xfs_trans_space.h		patch \| blob \| history
fs/xfs/scrub/bmap.c		patch \| blob \| history
fs/xfs/scrub/dabtree.c		patch \| blob \| history
fs/xfs/scrub/quota.c		patch \| blob \| history
fs/xfs/scrub/repair.c		patch \| blob \| history
fs/xfs/scrub/repair.h		patch \| blob \| history
fs/xfs/scrub/rtbitmap.c		patch \| blob \| history
fs/xfs/xfs_bmap_item.c		patch \| blob \| history
fs/xfs/xfs_bmap_util.c		patch \| blob \| history
fs/xfs/xfs_buf.c		patch \| blob \| history
fs/xfs/xfs_buf.h		patch \| blob \| history
fs/xfs/xfs_buf_item.c		patch \| blob \| history
fs/xfs/xfs_buf_item.h		patch \| blob \| history
fs/xfs/xfs_buf_item_recover.c		patch \| blob \| history
fs/xfs/xfs_dquot.c		patch \| blob \| history
fs/xfs/xfs_dquot.h		patch \| blob \| history
fs/xfs/xfs_dquot_item.c		patch \| blob \| history
fs/xfs/xfs_dquot_item_recover.c		patch \| blob \| history
fs/xfs/xfs_extfree_item.c		patch \| blob \| history
fs/xfs/xfs_file.c		patch \| blob \| history
fs/xfs/xfs_icache.c		patch \| blob \| history
fs/xfs/xfs_icache.h		patch \| blob \| history
fs/xfs/xfs_icreate_item.c		patch \| blob \| history
fs/xfs/xfs_inode.c		patch \| blob \| history
fs/xfs/xfs_inode.h		patch \| blob \| history
fs/xfs/xfs_inode_item.c		patch \| blob \| history
fs/xfs/xfs_inode_item.h		patch \| blob \| history
fs/xfs/xfs_inode_item_recover.c		patch \| blob \| history
fs/xfs/xfs_ioctl.c		patch \| blob \| history
fs/xfs/xfs_iomap.c		patch \| blob \| history
fs/xfs/xfs_linux.h		patch \| blob \| history
fs/xfs/xfs_log.c		patch \| blob \| history
fs/xfs/xfs_log_cil.c		patch \| blob \| history
fs/xfs/xfs_log_priv.h		patch \| blob \| history
fs/xfs/xfs_log_recover.c		patch \| blob \| history
fs/xfs/xfs_mount.c		patch \| blob \| history
fs/xfs/xfs_mount.h		patch \| blob \| history
fs/xfs/xfs_qm.c		patch \| blob \| history
fs/xfs/xfs_qm.h		patch \| blob \| history
fs/xfs/xfs_qm_bhv.c		patch \| blob \| history
fs/xfs/xfs_qm_syscalls.c		patch \| blob \| history
fs/xfs/xfs_quota.h		patch \| blob \| history
fs/xfs/xfs_quotaops.c		patch \| blob \| history
fs/xfs/xfs_refcount_item.c		patch \| blob \| history
fs/xfs/xfs_reflink.c		patch \| blob \| history
fs/xfs/xfs_reflink.h		patch \| blob \| history
fs/xfs/xfs_rmap_item.c		patch \| blob \| history
fs/xfs/xfs_super.c		patch \| blob \| history
fs/xfs/xfs_trace.h		patch \| blob \| history
fs/xfs/xfs_trans.c		patch \| blob \| history
fs/xfs/xfs_trans.h		patch \| blob \| history
fs/xfs/xfs_trans_ail.c		patch \| blob \| history
fs/xfs/xfs_trans_buf.c		patch \| blob \| history
fs/xfs/xfs_trans_dquot.c		patch \| blob \| history