xfs: support in-memory btrees
authorDarrick J. Wong <djwong@kernel.org>
Thu, 22 Feb 2024 20:43:35 +0000 (12:43 -0800)
committerDarrick J. Wong <djwong@kernel.org>
Thu, 22 Feb 2024 20:43:35 +0000 (12:43 -0800)
Adapt the generic btree cursor code to be able to create a btree whose
buffers come from a (presumably in-memory) buftarg with a header block
that's specific to in-memory btrees.  We'll connect this to other parts
of online scrub in the next patches.

Note that in-memory btrees always have a block size matching the system
memory page size for efficiency reasons.  There are also a few things we
need to do to finalize a btree update; that's covered in the next patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
14 files changed:
Documentation/filesystems/xfs/xfs-online-fsck-design.rst
fs/xfs/Kconfig
fs/xfs/Makefile
fs/xfs/libxfs/xfs_btree.c
fs/xfs/libxfs/xfs_btree.h
fs/xfs/libxfs/xfs_btree_mem.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_btree_mem.h [new file with mode: 0644]
fs/xfs/scrub/scrub.c
fs/xfs/scrub/scrub.h
fs/xfs/xfs_buf_mem.c
fs/xfs/xfs_buf_mem.h
fs/xfs/xfs_health.c
fs/xfs/xfs_trace.c
fs/xfs/xfs_trace.h

index d034802..6333697 100644 (file)
@@ -2270,13 +2270,12 @@ follows:
    pointing to the xfile.
 
 3. Pass the buffer cache target, buffer ops, and other information to
-   ``xfbtree_create`` to write an initial tree header and root block to the
-   xfile.
+   ``xfbtree_init`` to initialize the passed in ``struct xfbtree`` and write an
+   initial root block to the xfile.
    Each btree type should define a wrapper that passes necessary arguments to
    the creation function.
    For example, rmap btrees define ``xfs_rmapbt_mem_create`` to take care of
    all the necessary details for callers.
-   A ``struct xfbtree`` object will be returned.
 
 4. Pass the xfbtree object to the btree cursor creation function for the
    btree type.
index 7017ea0..d41edd3 100644 (file)
@@ -131,6 +131,9 @@ config XFS_LIVE_HOOKS
 config XFS_MEMORY_BUFS
        bool
 
+config XFS_BTREE_IN_MEM
+       bool
+
 config XFS_ONLINE_SCRUB
        bool "XFS online metadata check support"
        default n
@@ -173,6 +176,7 @@ config XFS_ONLINE_REPAIR
        bool "XFS online metadata repair support"
        default n
        depends on XFS_FS && XFS_ONLINE_SCRUB
+       select XFS_BTREE_IN_MEM
        help
          If you say Y here you will be able to repair metadata on a
          mounted XFS filesystem.  This feature is intended to reduce
index 0458742..9a6574d 100644 (file)
@@ -138,6 +138,7 @@ endif
 xfs-$(CONFIG_XFS_DRAIN_INTENTS)        += xfs_drain.o
 xfs-$(CONFIG_XFS_LIVE_HOOKS)   += xfs_hooks.o
 xfs-$(CONFIG_XFS_MEMORY_BUFS)  += xfs_buf_mem.o
+xfs-$(CONFIG_XFS_BTREE_IN_MEM) += libxfs/xfs_btree_mem.o
 
 # online scrub/repair
 ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
index aa60a46..d295475 100644 (file)
@@ -28,6 +28,8 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_health.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
 
 /*
  * Btree magic numbers.
@@ -75,6 +77,25 @@ xfs_btree_check_fsblock_siblings(
        return NULL;
 }
 
+static inline xfs_failaddr_t
+xfs_btree_check_memblock_siblings(
+       struct xfs_buftarg      *btp,
+       xfbno_t                 bno,
+       __be64                  dsibling)
+{
+       xfbno_t                 sibling;
+
+       if (dsibling == cpu_to_be64(NULLFSBLOCK))
+               return NULL;
+
+       sibling = be64_to_cpu(dsibling);
+       if (sibling == bno)
+               return __this_address;
+       if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
+               return __this_address;
+       return NULL;
+}
+
 static inline xfs_failaddr_t
 xfs_btree_check_agblock_siblings(
        struct xfs_perag        *pag,
@@ -164,6 +185,34 @@ __xfs_btree_check_fsblock(
        return fa;
 }
 
+/*
+ * Check an in-memory btree block header.  Return the address of the failing
+ * check, or NULL if everything is ok.
+ */
+static xfs_failaddr_t
+__xfs_btree_check_memblock(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       int                     level,
+       struct xfs_buf          *bp)
+{
+       struct xfs_buftarg      *btp = cur->bc_mem.xfbtree->target;
+       xfs_failaddr_t          fa;
+       xfbno_t                 bno;
+
+       fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
+       if (fa)
+               return fa;
+
+       bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+       fa = xfs_btree_check_memblock_siblings(btp, bno,
+                       block->bb_u.l.bb_leftsib);
+       if (!fa)
+               fa = xfs_btree_check_memblock_siblings(btp, bno,
+                               block->bb_u.l.bb_rightsib);
+       return fa;
+}
+
 /*
  * Check a short btree block header.  Return the address of the failing check,
  * or NULL if everything is ok.
@@ -216,9 +265,17 @@ __xfs_btree_check_block(
        int                     level,
        struct xfs_buf          *bp)
 {
-       if (cur->bc_ops->type == XFS_BTREE_TYPE_AG)
+       switch (cur->bc_ops->type) {
+       case XFS_BTREE_TYPE_MEM:
+               return __xfs_btree_check_memblock(cur, block, level, bp);
+       case XFS_BTREE_TYPE_AG:
                return __xfs_btree_check_agblock(cur, block, level, bp);
-       return __xfs_btree_check_fsblock(cur, block, level, bp);
+       case XFS_BTREE_TYPE_INODE:
+               return __xfs_btree_check_fsblock(cur, block, level, bp);
+       default:
+               ASSERT(0);
+               return __this_address;
+       }
 }
 
 static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
@@ -262,14 +319,22 @@ __xfs_btree_check_ptr(
        if (level <= 0)
                return -EFSCORRUPTED;
 
-       if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
+       switch (cur->bc_ops->type) {
+       case XFS_BTREE_TYPE_MEM:
+               if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
+                               be64_to_cpu((&ptr->l)[index])))
+                       return -EFSCORRUPTED;
+               break;
+       case XFS_BTREE_TYPE_INODE:
                if (!xfs_verify_fsbno(cur->bc_mp,
                                be64_to_cpu((&ptr->l)[index])))
                        return -EFSCORRUPTED;
-       } else {
+               break;
+       case XFS_BTREE_TYPE_AG:
                if (!xfs_verify_agbno(cur->bc_ag.pag,
                                be32_to_cpu((&ptr->s)[index])))
                        return -EFSCORRUPTED;
+               break;
        }
 
        return 0;
@@ -290,17 +355,26 @@ xfs_btree_check_ptr(
 
        error = __xfs_btree_check_ptr(cur, ptr, index, level);
        if (error) {
-               if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
+               switch (cur->bc_ops->type) {
+               case XFS_BTREE_TYPE_MEM:
+                       xfs_err(cur->bc_mp,
+"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
+                               cur->bc_ops->name, cur->bc_flags, level, index,
+                               __this_address);
+                       break;
+               case XFS_BTREE_TYPE_INODE:
                        xfs_err(cur->bc_mp,
 "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
                                cur->bc_ino.ip->i_ino,
                                cur->bc_ino.whichfork, cur->bc_ops->name,
                                level, index);
-               } else {
+                       break;
+               case XFS_BTREE_TYPE_AG:
                        xfs_err(cur->bc_mp,
 "AG %u: Corrupt %sbt pointer at level %d index %d.",
                                cur->bc_ag.pag->pag_agno, cur->bc_ops->name,
                                level, index);
+                       break;
                }
                xfs_btree_mark_sick(cur);
        }
@@ -457,11 +531,35 @@ xfs_btree_del_cursor(
        case XFS_BTREE_TYPE_INODE:
                /* nothing to do */
                break;
+       case XFS_BTREE_TYPE_MEM:
+               if (cur->bc_mem.pag)
+                       xfs_perag_put(cur->bc_mem.pag);
+               break;
        }
 
        kmem_cache_free(cur->bc_cache, cur);
 }
 
+/* Return the buffer target for this btree's buffer. */
+static inline struct xfs_buftarg *
+xfs_btree_buftarg(
+       struct xfs_btree_cur    *cur)
+{
+       if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+               return cur->bc_mem.xfbtree->target;
+       return cur->bc_mp->m_ddev_targp;
+}
+
+/* Return the block size (in units of 512b sectors) for this btree. */
+static inline unsigned int
+xfs_btree_bbsize(
+       struct xfs_btree_cur    *cur)
+{
+       if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+               return XFBNO_BBSIZE;
+       return cur->bc_mp->m_bsize;
+}
+
 /*
  * Duplicate the btree cursor.
  * Allocate a new one, copy the record, re-get the buffers.
@@ -505,10 +603,11 @@ xfs_btree_dup_cursor(
                new->bc_levels[i].ra = cur->bc_levels[i].ra;
                bp = cur->bc_levels[i].bp;
                if (bp) {
-                       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-                                                  xfs_buf_daddr(bp), mp->m_bsize,
-                                                  0, &bp,
-                                                  cur->bc_ops->buf_ops);
+                       error = xfs_trans_read_buf(mp, tp,
+                                       xfs_btree_buftarg(cur),
+                                       xfs_buf_daddr(bp),
+                                       xfs_btree_bbsize(cur), 0, &bp,
+                                       cur->bc_ops->buf_ops);
                        if (xfs_metadata_is_sick(error))
                                xfs_btree_mark_sick(new);
                        if (error) {
@@ -885,6 +984,32 @@ xfs_btree_readahead_fsblock(
        return rval;
 }
 
+STATIC int
+xfs_btree_readahead_memblock(
+       struct xfs_btree_cur    *cur,
+       int                     lr,
+       struct xfs_btree_block  *block)
+{
+       struct xfs_buftarg      *btp = cur->bc_mem.xfbtree->target;
+       xfbno_t                 left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+       xfbno_t                 right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+       int                     rval = 0;
+
+       if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
+               xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
+                               cur->bc_ops->buf_ops);
+               rval++;
+       }
+
+       if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
+               xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
+                               cur->bc_ops->buf_ops);
+               rval++;
+       }
+
+       return rval;
+}
+
 STATIC int
 xfs_btree_readahead_agblock(
        struct xfs_btree_cur    *cur,
@@ -939,9 +1064,17 @@ xfs_btree_readahead(
        cur->bc_levels[lev].ra |= lr;
        block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
 
-       if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
+       switch (cur->bc_ops->type) {
+       case XFS_BTREE_TYPE_AG:
+               return xfs_btree_readahead_agblock(cur, lr, block);
+       case XFS_BTREE_TYPE_INODE:
                return xfs_btree_readahead_fsblock(cur, lr, block);
-       return xfs_btree_readahead_agblock(cur, lr, block);
+       case XFS_BTREE_TYPE_MEM:
+               return xfs_btree_readahead_memblock(cur, lr, block);
+       default:
+               ASSERT(0);
+               return 0;
+       }
 }
 
 STATIC int
@@ -950,23 +1083,24 @@ xfs_btree_ptr_to_daddr(
        const union xfs_btree_ptr       *ptr,
        xfs_daddr_t                     *daddr)
 {
-       xfs_fsblock_t           fsbno;
-       xfs_agblock_t           agbno;
        int                     error;
 
        error = xfs_btree_check_ptr(cur, ptr, 0, 1);
        if (error)
                return error;
 
-       if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
-               fsbno = be64_to_cpu(ptr->l);
-               *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
-       } else {
-               agbno = be32_to_cpu(ptr->s);
+       switch (cur->bc_ops->type) {
+       case XFS_BTREE_TYPE_AG:
                *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-                               agbno);
+                               be32_to_cpu(ptr->s));
+               break;
+       case XFS_BTREE_TYPE_INODE:
+               *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+               break;
+       case XFS_BTREE_TYPE_MEM:
+               *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
+               break;
        }
-
        return 0;
 }
 
@@ -986,8 +1120,9 @@ xfs_btree_readahead_ptr(
 
        if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
                return;
-       xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
-                         cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+       xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
+                       xfs_btree_bbsize(cur) * count,
+                       cur->bc_ops->buf_ops);
 }
 
 /*
@@ -1172,9 +1307,17 @@ static inline __u64
 xfs_btree_owner(
        struct xfs_btree_cur    *cur)
 {
-       if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
+       switch (cur->bc_ops->type) {
+       case XFS_BTREE_TYPE_MEM:
+               return cur->bc_mem.xfbtree->owner;
+       case XFS_BTREE_TYPE_INODE:
                return cur->bc_ino.ip->i_ino;
-       return cur->bc_ag.pag->pag_agno;
+       case XFS_BTREE_TYPE_AG:
+               return cur->bc_ag.pag->pag_agno;
+       default:
+               ASSERT(0);
+               return 0;
+       }
 }
 
 void
@@ -1218,12 +1361,18 @@ xfs_btree_buf_to_ptr(
        struct xfs_buf          *bp,
        union xfs_btree_ptr     *ptr)
 {
-       if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
-               ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
-                                       xfs_buf_daddr(bp)));
-       else {
+       switch (cur->bc_ops->type) {
+       case XFS_BTREE_TYPE_AG:
                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
                                        xfs_buf_daddr(bp)));
+               break;
+       case XFS_BTREE_TYPE_INODE:
+               ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+                                       xfs_buf_daddr(bp)));
+               break;
+       case XFS_BTREE_TYPE_MEM:
+               ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
+               break;
        }
 }
 
@@ -1242,15 +1391,14 @@ xfs_btree_get_buf_block(
        struct xfs_btree_block          **block,
        struct xfs_buf                  **bpp)
 {
-       struct xfs_mount        *mp = cur->bc_mp;
-       xfs_daddr_t             d;
-       int                     error;
+       xfs_daddr_t                     d;
+       int                             error;
 
        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
        if (error)
                return error;
-       error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
-                       0, bpp);
+       error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
+                       xfs_btree_bbsize(cur), 0, bpp);
        if (error)
                return error;
 
@@ -1281,9 +1429,9 @@ xfs_btree_read_buf_block(
        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
        if (error)
                return error;
-       error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
-                                  mp->m_bsize, flags, bpp,
-                                  cur->bc_ops->buf_ops);
+       error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
+                       xfs_btree_bbsize(cur), flags, bpp,
+                       cur->bc_ops->buf_ops);
        if (xfs_metadata_is_sick(error))
                xfs_btree_mark_sick(cur);
        if (error)
@@ -4582,6 +4730,8 @@ xfs_btree_fsblock_verify(
        xfs_fsblock_t           fsb;
        xfs_failaddr_t          fa;
 
+       ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
        /* numrecs verification */
        if (be16_to_cpu(block->bb_numrecs) > max_recs)
                return __this_address;
@@ -4596,6 +4746,36 @@ xfs_btree_fsblock_verify(
        return fa;
 }
 
+/* Verify an in-memory btree block. */
+xfs_failaddr_t
+xfs_btree_memblock_verify(
+       struct xfs_buf          *bp,
+       unsigned int            max_recs)
+{
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_buftarg      *btp = bp->b_target;
+       xfs_failaddr_t          fa;
+       xfbno_t                 bno;
+
+       ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+       /* numrecs verification */
+       if (be16_to_cpu(block->bb_numrecs) > max_recs)
+               return __this_address;
+
+       /* sibling pointer verification */
+       bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+       fa = xfs_btree_check_memblock_siblings(btp, bno,
+                       block->bb_u.l.bb_leftsib);
+       if (fa)
+               return fa;
+       fa = xfs_btree_check_memblock_siblings(btp, bno,
+                       block->bb_u.l.bb_rightsib);
+       if (fa)
+               return fa;
+
+       return NULL;
+}
 /**
  * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
  *                                   btree block
@@ -4637,6 +4817,8 @@ xfs_btree_agblock_verify(
        xfs_agblock_t           agbno;
        xfs_failaddr_t          fa;
 
+       ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
        /* numrecs verification */
        if (be16_to_cpu(block->bb_numrecs) > max_recs)
                return __this_address;
index bacd67c..f933742 100644 (file)
@@ -112,6 +112,7 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
 enum xfs_btree_type {
        XFS_BTREE_TYPE_AG,
        XFS_BTREE_TYPE_INODE,
+       XFS_BTREE_TYPE_MEM,
 };
 
 struct xfs_btree_ops {
@@ -281,6 +282,10 @@ struct xfs_btree_cur
                        struct xfs_buf          *agbp;
                        struct xbtree_afakeroot *afake; /* for staging cursor */
                } bc_ag;
+               struct {
+                       struct xfbtree          *xfbtree;
+                       struct xfs_perag        *pag;
+               } bc_mem;
        };
 
        /* per-format private data */
@@ -455,6 +460,8 @@ xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp,
                uint64_t owner);
 xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp,
                unsigned int max_recs);
+xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp,
+               unsigned int max_recs);
 
 unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
                unsigned long long records);
diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c
new file mode 100644 (file)
index 0000000..cb156e9
--- /dev/null
@@ -0,0 +1,228 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_ag.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+
+/* Set the root of an in-memory btree. */
+void
+xfbtree_set_root(
+       struct xfs_btree_cur            *cur,
+       const union xfs_btree_ptr       *ptr,
+       int                             inc)
+{
+       ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+       cur->bc_mem.xfbtree->root = *ptr;
+       cur->bc_mem.xfbtree->nlevels += inc;
+}
+
+/* Initialize a pointer from the in-memory btree header. */
+void
+xfbtree_init_ptr_from_cur(
+       struct xfs_btree_cur            *cur,
+       union xfs_btree_ptr             *ptr)
+{
+       ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+       *ptr = cur->bc_mem.xfbtree->root;
+}
+
+/* Duplicate an in-memory btree cursor. */
+struct xfs_btree_cur *
+xfbtree_dup_cursor(
+       struct xfs_btree_cur            *cur)
+{
+       struct xfs_btree_cur            *ncur;
+
+       ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+       ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops,
+                       cur->bc_maxlevels, cur->bc_cache);
+       ncur->bc_flags = cur->bc_flags;
+       ncur->bc_nlevels = cur->bc_nlevels;
+       ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree;
+
+       if (cur->bc_mem.pag)
+               ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
+
+       return ncur;
+}
+
+/* Close the btree xfile and release all resources. */
+void
+xfbtree_destroy(
+       struct xfbtree          *xfbt)
+{
+       xfs_buftarg_drain(xfbt->target);
+}
+
+/* Compute the number of bytes available for records. */
+static inline unsigned int
+xfbtree_rec_bytes(
+       struct xfs_mount                *mp,
+       const struct xfs_btree_ops      *ops)
+{
+       return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+}
+
+/* Initialize an empty leaf block as the btree root. */
+STATIC int
+xfbtree_init_leaf_block(
+       struct xfs_mount                *mp,
+       struct xfbtree                  *xfbt,
+       const struct xfs_btree_ops      *ops)
+{
+       struct xfs_buf                  *bp;
+       xfbno_t                         bno = xfbt->highest_bno++;
+       int                             error;
+
+       error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE,
+                       &bp);
+       if (error)
+               return error;
+
+       trace_xfbtree_create_root_buf(xfbt, bp);
+
+       bp->b_ops = ops->buf_ops;
+       xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner);
+       xfs_buf_relse(bp);
+
+       xfbt->root.l = cpu_to_be64(bno);
+       return 0;
+}
+
+/*
+ * Create an in-memory btree root that can be used with the given xmbuf.
+ * Callers must set xfbt->owner.
+ */
+int
+xfbtree_init(
+       struct xfs_mount                *mp,
+       struct xfbtree                  *xfbt,
+       struct xfs_buftarg              *btp,
+       const struct xfs_btree_ops      *ops)
+{
+       unsigned int                    blocklen = xfbtree_rec_bytes(mp, ops);
+       unsigned int                    keyptr_len;
+       int                             error;
+
+       /* Requires a long-format CRC-format btree */
+       if (!xfs_has_crc(mp)) {
+               ASSERT(xfs_has_crc(mp));
+               return -EINVAL;
+       }
+       if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) {
+               ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN);
+               return -EINVAL;
+       }
+
+       memset(xfbt, 0, sizeof(*xfbt));
+       xfbt->target = btp;
+
+       /* Set up min/maxrecs for this btree. */
+       keyptr_len = ops->key_len + sizeof(__be64);
+       xfbt->maxrecs[0] = blocklen / ops->rec_len;
+       xfbt->maxrecs[1] = blocklen / keyptr_len;
+       xfbt->minrecs[0] = xfbt->maxrecs[0] / 2;
+       xfbt->minrecs[1] = xfbt->maxrecs[1] / 2;
+       xfbt->highest_bno = 0;
+       xfbt->nlevels = 1;
+
+       /* Initialize the empty btree. */
+       error = xfbtree_init_leaf_block(mp, xfbt, ops);
+       if (error)
+               goto err_freesp;
+
+       trace_xfbtree_init(mp, xfbt, ops);
+
+       return 0;
+
+err_freesp:
+       xfs_buftarg_drain(xfbt->target);
+       return error;
+}
+
+/* Allocate a block to our in-memory btree. */
+int
+xfbtree_alloc_block(
+       struct xfs_btree_cur            *cur,
+       const union xfs_btree_ptr       *start,
+       union xfs_btree_ptr             *new,
+       int                             *stat)
+{
+       struct xfbtree                  *xfbt = cur->bc_mem.xfbtree;
+       xfbno_t                         bno = xfbt->highest_bno++;
+
+       ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+       trace_xfbtree_alloc_block(xfbt, cur, bno);
+
+       /* Fail if the block address exceeds the maximum for the buftarg. */
+       if (!xfbtree_verify_bno(xfbt, bno)) {
+               ASSERT(xfbtree_verify_bno(xfbt, bno));
+               *stat = 0;
+               return 0;
+       }
+
+       new->l = cpu_to_be64(bno);
+       *stat = 1;
+       return 0;
+}
+
+/* Free a block from our in-memory btree. */
+int
+xfbtree_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       struct xfbtree          *xfbt = cur->bc_mem.xfbtree;
+       xfs_daddr_t             daddr = xfs_buf_daddr(bp);
+       xfbno_t                 bno = xfs_daddr_to_xfbno(daddr);
+
+       ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+       trace_xfbtree_free_block(xfbt, cur, bno);
+
+       if (bno + 1 == xfbt->highest_bno)
+               xfbt->highest_bno--;
+
+       return 0;
+}
+
+/* Return the minimum number of records for a btree block. */
+int
+xfbtree_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       struct xfbtree          *xfbt = cur->bc_mem.xfbtree;
+
+       return xfbt->minrecs[level != 0];
+}
+
+/* Return the maximum number of records for a btree block. */
+int
+xfbtree_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       struct xfbtree          *xfbt = cur->bc_mem.xfbtree;
+
+       return xfbt->maxrecs[level != 0];
+}
diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h
new file mode 100644 (file)
index 0000000..ecc2cea
--- /dev/null
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_BTREE_MEM_H__
+#define __XFS_BTREE_MEM_H__
+
+typedef uint64_t xfbno_t;
+
+#define XFBNO_BLOCKSIZE                        (XMBUF_BLOCKSIZE)
+#define XFBNO_BBSHIFT                  (XMBUF_BLOCKSHIFT - BBSHIFT)
+#define XFBNO_BBSIZE                   (XFBNO_BLOCKSIZE >> BBSHIFT)
+
+static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno)
+{
+       return blkno << XFBNO_BBSHIFT;
+}
+
+static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr)
+{
+       return daddr >> XFBNO_BBSHIFT;
+}
+
+struct xfbtree {
+       /* buffer cache target for this in-memory btree */
+       struct xfs_buftarg              *target;
+
+       /* Highest block number that has been written to. */
+       xfbno_t                         highest_bno;
+
+       /* Owner of this btree. */
+       unsigned long long              owner;
+
+       /* Btree header */
+       union xfs_btree_ptr             root;
+       unsigned int                    nlevels;
+
+       /* Minimum and maximum records per block. */
+       unsigned int                    maxrecs[2];
+       unsigned int                    minrecs[2];
+};
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno)
+{
+       return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno));
+}
+
+void xfbtree_set_root(struct xfs_btree_cur *cur,
+               const union xfs_btree_ptr *ptr, int inc);
+void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur,
+               union xfs_btree_ptr *ptr);
+struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur);
+
+int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level);
+int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level);
+
+int xfbtree_alloc_block(struct xfs_btree_cur *cur,
+               const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr,
+               int *stat);
+int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+/* Callers must set xfbt->target and xfbt->owner before calling this */
+int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt,
+               struct xfs_buftarg *btp, const struct xfs_btree_ops *ops);
+void xfbtree_destroy(struct xfbtree *xfbt);
+#else
+# define xfbtree_verify_bno(...)       (false)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_BTREE_MEM_H__ */
index aeac9ca..6828e72 100644 (file)
@@ -15,6 +15,7 @@
 #include "xfs_quota.h"
 #include "xfs_qm.h"
 #include "xfs_scrub.h"
+#include "xfs_buf_mem.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -190,6 +191,10 @@ xchk_teardown(
                sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
                mnt_drop_write_file(sc->file);
        }
+       if (sc->xmbtp) {
+               xmbuf_free(sc->xmbtp);
+               sc->xmbtp = NULL;
+       }
        if (sc->xfile) {
                xfile_destroy(sc->xfile);
                sc->xfile = NULL;
index f99a3c2..1247284 100644 (file)
@@ -99,6 +99,9 @@ struct xfs_scrub {
        /* xfile used by the scrubbers; freed at teardown. */
        struct xfile                    *xfile;
 
+       /* buffer target for in-memory btrees; also freed at teardown. */
+       struct xfs_buftarg              *xmbtp;
+
        /* Lock flags for @ip. */
        uint                            ilock_flags;
 
index be71ba1..73caa2e 100644 (file)
@@ -187,3 +187,16 @@ xmbuf_unmap_page(
        bp->b_pages = NULL;
        bp->b_page_count = 0;
 }
+
+/* Is this a valid daddr within the buftarg? */
+bool
+xmbuf_verify_daddr(
+       struct xfs_buftarg      *btp,
+       xfs_daddr_t             daddr)
+{
+       struct inode            *inode = file_inode(btp->bt_file);
+
+       ASSERT(xfs_buftarg_is_mem(btp));
+
+       return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
+}
index 945f4b6..dfb3029 100644 (file)
@@ -21,10 +21,12 @@ void xmbuf_free(struct xfs_buftarg *btp);
 
 int xmbuf_map_page(struct xfs_buf *bp);
 void xmbuf_unmap_page(struct xfs_buf *bp);
+bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
 #else
 # define xfs_buftarg_is_mem(...)       (false)
 # define xmbuf_map_page(...)           (-ENOMEM)
 # define xmbuf_unmap_page(...)         ((void)0)
+# define xmbuf_verify_daddr(...)       (false)
 #endif /* CONFIG_XFS_MEMORY_BUFS */
 
 #endif /* __XFS_BUF_MEM_H__ */
index b28546b..b39f959 100644 (file)
@@ -527,6 +527,9 @@ xfs_btree_mark_sick(
        struct xfs_btree_cur            *cur)
 {
        switch (cur->bc_ops->type) {
+       case XFS_BTREE_TYPE_MEM:
+               /* no health state tracking for ephemeral btrees */
+               return;
        case XFS_BTREE_TYPE_AG:
                ASSERT(cur->bc_ops->sick_mask);
                xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask);
index ae5be6b..3f25388 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/iomap.h>
 #include "xfs_iomap.h"
 #include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
 
 /*
  * We include this last to have the helpers above available for the trace
index 14cb875..4971d65 100644 (file)
@@ -79,6 +79,8 @@ union xfs_btree_ptr;
 struct xfs_dqtrx;
 struct xfs_icwalk;
 struct xfs_perag;
+struct xfbtree;
+struct xfs_btree_ops;
 
 #define XFS_ATTR_FILTER_FLAGS \
        { XFS_ATTR_ROOT,        "ROOT" }, \
@@ -2499,12 +2501,19 @@ TRACE_EVENT(xfs_btree_alloc_block,
        ),
        TP_fast_assign(
                __entry->dev = cur->bc_mp->m_super->s_dev;
-               if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
+               switch (cur->bc_ops->type) {
+               case XFS_BTREE_TYPE_INODE:
                        __entry->agno = 0;
                        __entry->ino = cur->bc_ino.ip->i_ino;
-               } else {
+                       break;
+               case XFS_BTREE_TYPE_AG:
                        __entry->agno = cur->bc_ag.pag->pag_agno;
                        __entry->ino = 0;
+                       break;
+               case XFS_BTREE_TYPE_MEM:
+                       __entry->agno = 0;
+                       __entry->ino = 0;
+                       break;
                }
                __assign_str(name, cur->bc_ops->name);
                __entry->error = error;
@@ -4563,6 +4572,110 @@ TRACE_EVENT(xmbuf_free,
 );
 #endif /* CONFIG_XFS_MEMORY_BUFS */
 
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+TRACE_EVENT(xfbtree_init,
+       TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt,
+                const struct xfs_btree_ops *ops),
+       TP_ARGS(mp, xfbt, ops),
+       TP_STRUCT__entry(
+               __field(const void *, btree_ops)
+               __field(unsigned long, xfino)
+               __field(unsigned int, leaf_mxr)
+               __field(unsigned int, leaf_mnr)
+               __field(unsigned int, node_mxr)
+               __field(unsigned int, node_mnr)
+               __field(unsigned long long, owner)
+       ),
+       TP_fast_assign(
+               __entry->btree_ops = ops;
+               __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+               __entry->leaf_mxr = xfbt->maxrecs[0];
+               __entry->node_mxr = xfbt->maxrecs[1];
+               __entry->leaf_mnr = xfbt->minrecs[0];
+               __entry->node_mnr = xfbt->minrecs[1];
+               __entry->owner = xfbt->owner;
+       ),
+       TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u",
+                 __entry->xfino,
+                 __entry->btree_ops,
+                 __entry->owner,
+                 __entry->leaf_mxr,
+                 __entry->leaf_mnr,
+                 __entry->node_mxr,
+                 __entry->node_mnr)
+);
+
+DECLARE_EVENT_CLASS(xfbtree_buf_class,
+       TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp),
+       TP_ARGS(xfbt, bp),
+       TP_STRUCT__entry(
+               __field(unsigned long, xfino)
+               __field(xfs_daddr_t, bno)
+               __field(int, nblks)
+               __field(int, hold)
+               __field(int, pincount)
+               __field(unsigned int, lockval)
+               __field(unsigned int, flags)
+       ),
+       TP_fast_assign(
+               __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+               __entry->bno = xfs_buf_daddr(bp);
+               __entry->nblks = bp->b_length;
+               __entry->hold = atomic_read(&bp->b_hold);
+               __entry->pincount = atomic_read(&bp->b_pin_count);
+               __entry->lockval = bp->b_sema.count;
+               __entry->flags = bp->b_flags;
+       ),
+       TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags %s",
+                 __entry->xfino,
+                 (unsigned long long)__entry->bno,
+                 __entry->nblks,
+                 __entry->hold,
+                 __entry->pincount,
+                 __entry->lockval,
+                 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS))
+)
+
+#define DEFINE_XFBTREE_BUF_EVENT(name) \
+DEFINE_EVENT(xfbtree_buf_class, name, \
+       TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \
+       TP_ARGS(xfbt, bp))
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf);
+
+DECLARE_EVENT_CLASS(xfbtree_freesp_class,
+       TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur,
+                xfs_fileoff_t fileoff),
+       TP_ARGS(xfbt, cur, fileoff),
+       TP_STRUCT__entry(
+               __field(unsigned long, xfino)
+               __string(btname, cur->bc_ops->name)
+               __field(int, nlevels)
+               __field(xfs_fileoff_t, fileoff)
+       ),
+       TP_fast_assign(
+               __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+               __assign_str(btname, cur->bc_ops->name);
+               __entry->nlevels = cur->bc_nlevels;
+               __entry->fileoff = fileoff;
+       ),
+       TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx",
+                 __entry->xfino,
+                 __get_str(btname),
+                 __entry->nlevels,
+                 (unsigned long long)__entry->fileoff)
+)
+
+#define DEFINE_XFBTREE_FREESP_EVENT(name) \
+DEFINE_EVENT(xfbtree_freesp_class, name, \
+       TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \
+                xfs_fileoff_t fileoff), \
+       TP_ARGS(xfbt, cur, fileoff))
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block);
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block);
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH