xfs: remove unused flag arguments
[linux-2.6-microblaze.git] / fs / xfs / libxfs / xfs_btree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_defer.h"
15 #include "xfs_inode.h"
16 #include "xfs_trans.h"
17 #include "xfs_inode_item.h"
18 #include "xfs_buf_item.h"
19 #include "xfs_btree.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_trace.h"
23 #include "xfs_cksum.h"
24 #include "xfs_alloc.h"
25 #include "xfs_log.h"
26
27 /*
28  * Cursor allocation zone.
29  */
30 kmem_zone_t     *xfs_btree_cur_zone;
31
32 /*
33  * Btree magic numbers.
34  */
35 static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
36         { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
37           XFS_FIBT_MAGIC, 0 },
38         { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
39           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
40           XFS_REFC_CRC_MAGIC }
41 };
42
43 uint32_t
44 xfs_btree_magic(
45         int                     crc,
46         xfs_btnum_t             btnum)
47 {
48         uint32_t                magic = xfs_magics[crc][btnum];
49
50         /* Ensure we asked for crc for crc-only magics. */
51         ASSERT(magic != 0);
52         return magic;
53 }
54
55 /*
56  * Check a long btree block header.  Return the address of the failing check,
57  * or NULL if everything is ok.
58  */
59 xfs_failaddr_t
60 __xfs_btree_check_lblock(
61         struct xfs_btree_cur    *cur,
62         struct xfs_btree_block  *block,
63         int                     level,
64         struct xfs_buf          *bp)
65 {
66         struct xfs_mount        *mp = cur->bc_mp;
67         xfs_btnum_t             btnum = cur->bc_btnum;
68         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
69
70         if (crc) {
71                 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
72                         return __this_address;
73                 if (block->bb_u.l.bb_blkno !=
74                     cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
75                         return __this_address;
76                 if (block->bb_u.l.bb_pad != cpu_to_be32(0))
77                         return __this_address;
78         }
79
80         if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
81                 return __this_address;
82         if (be16_to_cpu(block->bb_level) != level)
83                 return __this_address;
84         if (be16_to_cpu(block->bb_numrecs) >
85             cur->bc_ops->get_maxrecs(cur, level))
86                 return __this_address;
87         if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
88             !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
89                         level + 1))
90                 return __this_address;
91         if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
92             !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
93                         level + 1))
94                 return __this_address;
95
96         return NULL;
97 }
98
99 /* Check a long btree block header. */
100 static int
101 xfs_btree_check_lblock(
102         struct xfs_btree_cur    *cur,
103         struct xfs_btree_block  *block,
104         int                     level,
105         struct xfs_buf          *bp)
106 {
107         struct xfs_mount        *mp = cur->bc_mp;
108         xfs_failaddr_t          fa;
109
110         fa = __xfs_btree_check_lblock(cur, block, level, bp);
111         if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
112                         XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
113                 if (bp)
114                         trace_xfs_btree_corrupt(bp, _RET_IP_);
115                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
116                 return -EFSCORRUPTED;
117         }
118         return 0;
119 }
120
121 /*
122  * Check a short btree block header.  Return the address of the failing check,
123  * or NULL if everything is ok.
124  */
125 xfs_failaddr_t
126 __xfs_btree_check_sblock(
127         struct xfs_btree_cur    *cur,
128         struct xfs_btree_block  *block,
129         int                     level,
130         struct xfs_buf          *bp)
131 {
132         struct xfs_mount        *mp = cur->bc_mp;
133         xfs_btnum_t             btnum = cur->bc_btnum;
134         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
135
136         if (crc) {
137                 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
138                         return __this_address;
139                 if (block->bb_u.s.bb_blkno !=
140                     cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
141                         return __this_address;
142         }
143
144         if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
145                 return __this_address;
146         if (be16_to_cpu(block->bb_level) != level)
147                 return __this_address;
148         if (be16_to_cpu(block->bb_numrecs) >
149             cur->bc_ops->get_maxrecs(cur, level))
150                 return __this_address;
151         if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
152             !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
153                         level + 1))
154                 return __this_address;
155         if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
156             !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
157                         level + 1))
158                 return __this_address;
159
160         return NULL;
161 }
162
163 /* Check a short btree block header. */
164 STATIC int
165 xfs_btree_check_sblock(
166         struct xfs_btree_cur    *cur,
167         struct xfs_btree_block  *block,
168         int                     level,
169         struct xfs_buf          *bp)
170 {
171         struct xfs_mount        *mp = cur->bc_mp;
172         xfs_failaddr_t          fa;
173
174         fa = __xfs_btree_check_sblock(cur, block, level, bp);
175         if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
176                         XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
177                 if (bp)
178                         trace_xfs_btree_corrupt(bp, _RET_IP_);
179                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
180                 return -EFSCORRUPTED;
181         }
182         return 0;
183 }
184
185 /*
186  * Debug routine: check that block header is ok.
187  */
188 int
189 xfs_btree_check_block(
190         struct xfs_btree_cur    *cur,   /* btree cursor */
191         struct xfs_btree_block  *block, /* generic btree block pointer */
192         int                     level,  /* level of the btree block */
193         struct xfs_buf          *bp)    /* buffer containing block, if any */
194 {
195         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
196                 return xfs_btree_check_lblock(cur, block, level, bp);
197         else
198                 return xfs_btree_check_sblock(cur, block, level, bp);
199 }
200
201 /* Check that this long pointer is valid and points within the fs. */
202 bool
203 xfs_btree_check_lptr(
204         struct xfs_btree_cur    *cur,
205         xfs_fsblock_t           fsbno,
206         int                     level)
207 {
208         if (level <= 0)
209                 return false;
210         return xfs_verify_fsbno(cur->bc_mp, fsbno);
211 }
212
213 /* Check that this short pointer is valid and points within the AG. */
214 bool
215 xfs_btree_check_sptr(
216         struct xfs_btree_cur    *cur,
217         xfs_agblock_t           agbno,
218         int                     level)
219 {
220         if (level <= 0)
221                 return false;
222         return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
223 }
224
225 /*
226  * Check that a given (indexed) btree pointer at a certain level of a
227  * btree is valid and doesn't point past where it should.
228  */
229 static int
230 xfs_btree_check_ptr(
231         struct xfs_btree_cur    *cur,
232         union xfs_btree_ptr     *ptr,
233         int                     index,
234         int                     level)
235 {
236         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
237                 if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
238                                 level))
239                         return 0;
240                 xfs_err(cur->bc_mp,
241 "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
242                                 cur->bc_private.b.ip->i_ino,
243                                 cur->bc_private.b.whichfork, cur->bc_btnum,
244                                 level, index);
245         } else {
246                 if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
247                                 level))
248                         return 0;
249                 xfs_err(cur->bc_mp,
250 "AG %u: Corrupt btree %d pointer at level %d index %d.",
251                                 cur->bc_private.a.agno, cur->bc_btnum,
252                                 level, index);
253         }
254
255         return -EFSCORRUPTED;
256 }
257
258 #ifdef DEBUG
259 # define xfs_btree_debug_check_ptr      xfs_btree_check_ptr
260 #else
261 # define xfs_btree_debug_check_ptr(...) (0)
262 #endif
263
264 /*
265  * Calculate CRC on the whole btree block and stuff it into the
266  * long-form btree header.
267  *
268  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
269  * it into the buffer so recovery knows what the last modification was that made
270  * it to disk.
271  */
272 void
273 xfs_btree_lblock_calc_crc(
274         struct xfs_buf          *bp)
275 {
276         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
277         struct xfs_buf_log_item *bip = bp->b_log_item;
278
279         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
280                 return;
281         if (bip)
282                 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
283         xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
284 }
285
286 bool
287 xfs_btree_lblock_verify_crc(
288         struct xfs_buf          *bp)
289 {
290         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
291         struct xfs_mount        *mp = bp->b_target->bt_mount;
292
293         if (xfs_sb_version_hascrc(&mp->m_sb)) {
294                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
295                         return false;
296                 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
297         }
298
299         return true;
300 }
301
302 /*
303  * Calculate CRC on the whole btree block and stuff it into the
304  * short-form btree header.
305  *
306  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
307  * it into the buffer so recovery knows what the last modification was that made
308  * it to disk.
309  */
310 void
311 xfs_btree_sblock_calc_crc(
312         struct xfs_buf          *bp)
313 {
314         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
315         struct xfs_buf_log_item *bip = bp->b_log_item;
316
317         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
318                 return;
319         if (bip)
320                 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
321         xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
322 }
323
324 bool
325 xfs_btree_sblock_verify_crc(
326         struct xfs_buf          *bp)
327 {
328         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
329         struct xfs_mount        *mp = bp->b_target->bt_mount;
330
331         if (xfs_sb_version_hascrc(&mp->m_sb)) {
332                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
333                         return false;
334                 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
335         }
336
337         return true;
338 }
339
340 static int
341 xfs_btree_free_block(
342         struct xfs_btree_cur    *cur,
343         struct xfs_buf          *bp)
344 {
345         int                     error;
346
347         error = cur->bc_ops->free_block(cur, bp);
348         if (!error) {
349                 xfs_trans_binval(cur->bc_tp, bp);
350                 XFS_BTREE_STATS_INC(cur, free);
351         }
352         return error;
353 }
354
355 /*
356  * Delete the btree cursor.
357  */
358 void
359 xfs_btree_del_cursor(
360         xfs_btree_cur_t *cur,           /* btree cursor */
361         int             error)          /* del because of error */
362 {
363         int             i;              /* btree level */
364
365         /*
366          * Clear the buffer pointers, and release the buffers.
367          * If we're doing this in the face of an error, we
368          * need to make sure to inspect all of the entries
369          * in the bc_bufs array for buffers to be unlocked.
370          * This is because some of the btree code works from
371          * level n down to 0, and if we get an error along
372          * the way we won't have initialized all the entries
373          * down to 0.
374          */
375         for (i = 0; i < cur->bc_nlevels; i++) {
376                 if (cur->bc_bufs[i])
377                         xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
378                 else if (!error)
379                         break;
380         }
381         /*
382          * Can't free a bmap cursor without having dealt with the
383          * allocated indirect blocks' accounting.
384          */
385         ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
386                cur->bc_private.b.allocated == 0);
387         /*
388          * Free the cursor.
389          */
390         kmem_zone_free(xfs_btree_cur_zone, cur);
391 }
392
393 /*
394  * Duplicate the btree cursor.
395  * Allocate a new one, copy the record, re-get the buffers.
396  */
397 int                                     /* error */
398 xfs_btree_dup_cursor(
399         xfs_btree_cur_t *cur,           /* input cursor */
400         xfs_btree_cur_t **ncur)         /* output cursor */
401 {
402         xfs_buf_t       *bp;            /* btree block's buffer pointer */
403         int             error;          /* error return value */
404         int             i;              /* level number of btree block */
405         xfs_mount_t     *mp;            /* mount structure for filesystem */
406         xfs_btree_cur_t *new;           /* new cursor value */
407         xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
408
409         tp = cur->bc_tp;
410         mp = cur->bc_mp;
411
412         /*
413          * Allocate a new cursor like the old one.
414          */
415         new = cur->bc_ops->dup_cursor(cur);
416
417         /*
418          * Copy the record currently in the cursor.
419          */
420         new->bc_rec = cur->bc_rec;
421
422         /*
423          * For each level current, re-get the buffer and copy the ptr value.
424          */
425         for (i = 0; i < new->bc_nlevels; i++) {
426                 new->bc_ptrs[i] = cur->bc_ptrs[i];
427                 new->bc_ra[i] = cur->bc_ra[i];
428                 bp = cur->bc_bufs[i];
429                 if (bp) {
430                         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
431                                                    XFS_BUF_ADDR(bp), mp->m_bsize,
432                                                    0, &bp,
433                                                    cur->bc_ops->buf_ops);
434                         if (error) {
435                                 xfs_btree_del_cursor(new, error);
436                                 *ncur = NULL;
437                                 return error;
438                         }
439                 }
440                 new->bc_bufs[i] = bp;
441         }
442         *ncur = new;
443         return 0;
444 }
445
446 /*
447  * XFS btree block layout and addressing:
448  *
449  * There are two types of blocks in the btree: leaf and non-leaf blocks.
450  *
451  * The leaf record start with a header then followed by records containing
452  * the values.  A non-leaf block also starts with the same header, and
453  * then first contains lookup keys followed by an equal number of pointers
454  * to the btree blocks at the previous level.
455  *
456  *              +--------+-------+-------+-------+-------+-------+-------+
457  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
458  *              +--------+-------+-------+-------+-------+-------+-------+
459  *
460  *              +--------+-------+-------+-------+-------+-------+-------+
461  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
462  *              +--------+-------+-------+-------+-------+-------+-------+
463  *
464  * The header is called struct xfs_btree_block for reasons better left unknown
465  * and comes in different versions for short (32bit) and long (64bit) block
466  * pointers.  The record and key structures are defined by the btree instances
467  * and opaque to the btree core.  The block pointers are simple disk endian
468  * integers, available in a short (32bit) and long (64bit) variant.
469  *
470  * The helpers below calculate the offset of a given record, key or pointer
471  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
472  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
473  * inside the btree block is done using indices starting at one, not zero!
474  *
475  * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
476  * overlapping intervals.  In such a tree, records are still sorted lowest to
477  * highest and indexed by the smallest key value that refers to the record.
478  * However, nodes are different: each pointer has two associated keys -- one
479  * indexing the lowest key available in the block(s) below (the same behavior
480  * as the key in a regular btree) and another indexing the highest key
481  * available in the block(s) below.  Because records are /not/ sorted by the
482  * highest key, all leaf block updates require us to compute the highest key
483  * that matches any record in the leaf and to recursively update the high keys
484  * in the nodes going further up in the tree, if necessary.  Nodes look like
485  * this:
486  *
487  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
488  * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
489  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
490  *
491  * To perform an interval query on an overlapped tree, perform the usual
492  * depth-first search and use the low and high keys to decide if we can skip
493  * that particular node.  If a leaf node is reached, return the records that
494  * intersect the interval.  Note that an interval query may return numerous
495  * entries.  For a non-overlapped tree, simply search for the record associated
496  * with the lowest key and iterate forward until a non-matching record is
497  * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
498  * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
499  * more detail.
500  *
501  * Why do we care about overlapping intervals?  Let's say you have a bunch of
502  * reverse mapping records on a reflink filesystem:
503  *
504  * 1: +- file A startblock B offset C length D -----------+
505  * 2:      +- file E startblock F offset G length H --------------+
506  * 3:      +- file I startblock F offset J length K --+
507  * 4:                                                        +- file L... --+
508  *
509  * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
510  * we'd simply increment the length of record 1.  But how do we find the record
511  * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
512  * record 3 because the keys are ordered first by startblock.  An interval
513  * query would return records 1 and 2 because they both overlap (B+D-1), and
514  * from that we can pick out record 1 as the appropriate left neighbor.
515  *
516  * In the non-overlapped case you can do a LE lookup and decrement the cursor
517  * because a record's interval must end before the next record.
518  */
519
520 /*
521  * Return size of the btree block header for this btree instance.
522  */
523 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
524 {
525         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
526                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
527                         return XFS_BTREE_LBLOCK_CRC_LEN;
528                 return XFS_BTREE_LBLOCK_LEN;
529         }
530         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
531                 return XFS_BTREE_SBLOCK_CRC_LEN;
532         return XFS_BTREE_SBLOCK_LEN;
533 }
534
535 /*
536  * Return size of btree block pointers for this btree instance.
537  */
538 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
539 {
540         return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
541                 sizeof(__be64) : sizeof(__be32);
542 }
543
544 /*
545  * Calculate offset of the n-th record in a btree block.
546  */
547 STATIC size_t
548 xfs_btree_rec_offset(
549         struct xfs_btree_cur    *cur,
550         int                     n)
551 {
552         return xfs_btree_block_len(cur) +
553                 (n - 1) * cur->bc_ops->rec_len;
554 }
555
556 /*
557  * Calculate offset of the n-th key in a btree block.
558  */
559 STATIC size_t
560 xfs_btree_key_offset(
561         struct xfs_btree_cur    *cur,
562         int                     n)
563 {
564         return xfs_btree_block_len(cur) +
565                 (n - 1) * cur->bc_ops->key_len;
566 }
567
568 /*
569  * Calculate offset of the n-th high key in a btree block.
570  */
571 STATIC size_t
572 xfs_btree_high_key_offset(
573         struct xfs_btree_cur    *cur,
574         int                     n)
575 {
576         return xfs_btree_block_len(cur) +
577                 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
578 }
579
580 /*
581  * Calculate offset of the n-th block pointer in a btree block.
582  */
583 STATIC size_t
584 xfs_btree_ptr_offset(
585         struct xfs_btree_cur    *cur,
586         int                     n,
587         int                     level)
588 {
589         return xfs_btree_block_len(cur) +
590                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
591                 (n - 1) * xfs_btree_ptr_len(cur);
592 }
593
594 /*
595  * Return a pointer to the n-th record in the btree block.
596  */
597 union xfs_btree_rec *
598 xfs_btree_rec_addr(
599         struct xfs_btree_cur    *cur,
600         int                     n,
601         struct xfs_btree_block  *block)
602 {
603         return (union xfs_btree_rec *)
604                 ((char *)block + xfs_btree_rec_offset(cur, n));
605 }
606
607 /*
608  * Return a pointer to the n-th key in the btree block.
609  */
610 union xfs_btree_key *
611 xfs_btree_key_addr(
612         struct xfs_btree_cur    *cur,
613         int                     n,
614         struct xfs_btree_block  *block)
615 {
616         return (union xfs_btree_key *)
617                 ((char *)block + xfs_btree_key_offset(cur, n));
618 }
619
620 /*
621  * Return a pointer to the n-th high key in the btree block.
622  */
623 union xfs_btree_key *
624 xfs_btree_high_key_addr(
625         struct xfs_btree_cur    *cur,
626         int                     n,
627         struct xfs_btree_block  *block)
628 {
629         return (union xfs_btree_key *)
630                 ((char *)block + xfs_btree_high_key_offset(cur, n));
631 }
632
633 /*
634  * Return a pointer to the n-th block pointer in the btree block.
635  */
636 union xfs_btree_ptr *
637 xfs_btree_ptr_addr(
638         struct xfs_btree_cur    *cur,
639         int                     n,
640         struct xfs_btree_block  *block)
641 {
642         int                     level = xfs_btree_get_level(block);
643
644         ASSERT(block->bb_level != 0);
645
646         return (union xfs_btree_ptr *)
647                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
648 }
649
650 /*
651  * Get the root block which is stored in the inode.
652  *
653  * For now this btree implementation assumes the btree root is always
654  * stored in the if_broot field of an inode fork.
655  */
656 STATIC struct xfs_btree_block *
657 xfs_btree_get_iroot(
658         struct xfs_btree_cur    *cur)
659 {
660         struct xfs_ifork        *ifp;
661
662         ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
663         return (struct xfs_btree_block *)ifp->if_broot;
664 }
665
666 /*
667  * Retrieve the block pointer from the cursor at the given level.
668  * This may be an inode btree root or from a buffer.
669  */
670 struct xfs_btree_block *                /* generic btree block pointer */
671 xfs_btree_get_block(
672         struct xfs_btree_cur    *cur,   /* btree cursor */
673         int                     level,  /* level in btree */
674         struct xfs_buf          **bpp)  /* buffer containing the block */
675 {
676         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
677             (level == cur->bc_nlevels - 1)) {
678                 *bpp = NULL;
679                 return xfs_btree_get_iroot(cur);
680         }
681
682         *bpp = cur->bc_bufs[level];
683         return XFS_BUF_TO_BLOCK(*bpp);
684 }
685
686 /*
687  * Get a buffer for the block, return it with no data read.
688  * Long-form addressing.
689  */
690 xfs_buf_t *                             /* buffer for fsbno */
691 xfs_btree_get_bufl(
692         xfs_mount_t     *mp,            /* file system mount point */
693         xfs_trans_t     *tp,            /* transaction pointer */
694         xfs_fsblock_t   fsbno)          /* file system block number */
695 {
696         xfs_daddr_t             d;              /* real disk block address */
697
698         ASSERT(fsbno != NULLFSBLOCK);
699         d = XFS_FSB_TO_DADDR(mp, fsbno);
700         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
701 }
702
703 /*
704  * Get a buffer for the block, return it with no data read.
705  * Short-form addressing.
706  */
707 xfs_buf_t *                             /* buffer for agno/agbno */
708 xfs_btree_get_bufs(
709         xfs_mount_t     *mp,            /* file system mount point */
710         xfs_trans_t     *tp,            /* transaction pointer */
711         xfs_agnumber_t  agno,           /* allocation group number */
712         xfs_agblock_t   agbno)          /* allocation group block number */
713 {
714         xfs_daddr_t             d;              /* real disk block address */
715
716         ASSERT(agno != NULLAGNUMBER);
717         ASSERT(agbno != NULLAGBLOCK);
718         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
719         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
720 }
721
722 /*
723  * Check for the cursor referring to the last block at the given level.
724  */
725 int                                     /* 1=is last block, 0=not last block */
726 xfs_btree_islastblock(
727         xfs_btree_cur_t         *cur,   /* btree cursor */
728         int                     level)  /* level to check */
729 {
730         struct xfs_btree_block  *block; /* generic btree block pointer */
731         xfs_buf_t               *bp;    /* buffer containing block */
732
733         block = xfs_btree_get_block(cur, level, &bp);
734         xfs_btree_check_block(cur, block, level, bp);
735         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
736                 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
737         else
738                 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
739 }
740
741 /*
742  * Change the cursor to point to the first record at the given level.
743  * Other levels are unaffected.
744  */
745 STATIC int                              /* success=1, failure=0 */
746 xfs_btree_firstrec(
747         xfs_btree_cur_t         *cur,   /* btree cursor */
748         int                     level)  /* level to change */
749 {
750         struct xfs_btree_block  *block; /* generic btree block pointer */
751         xfs_buf_t               *bp;    /* buffer containing block */
752
753         /*
754          * Get the block pointer for this level.
755          */
756         block = xfs_btree_get_block(cur, level, &bp);
757         if (xfs_btree_check_block(cur, block, level, bp))
758                 return 0;
759         /*
760          * It's empty, there is no such record.
761          */
762         if (!block->bb_numrecs)
763                 return 0;
764         /*
765          * Set the ptr value to 1, that's the first record/key.
766          */
767         cur->bc_ptrs[level] = 1;
768         return 1;
769 }
770
771 /*
772  * Change the cursor to point to the last record in the current block
773  * at the given level.  Other levels are unaffected.
774  */
775 STATIC int                              /* success=1, failure=0 */
776 xfs_btree_lastrec(
777         xfs_btree_cur_t         *cur,   /* btree cursor */
778         int                     level)  /* level to change */
779 {
780         struct xfs_btree_block  *block; /* generic btree block pointer */
781         xfs_buf_t               *bp;    /* buffer containing block */
782
783         /*
784          * Get the block pointer for this level.
785          */
786         block = xfs_btree_get_block(cur, level, &bp);
787         if (xfs_btree_check_block(cur, block, level, bp))
788                 return 0;
789         /*
790          * It's empty, there is no such record.
791          */
792         if (!block->bb_numrecs)
793                 return 0;
794         /*
795          * Set the ptr value to numrecs, that's the last record/key.
796          */
797         cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
798         return 1;
799 }
800
801 /*
802  * Compute first and last byte offsets for the fields given.
803  * Interprets the offsets table, which contains struct field offsets.
804  */
805 void
806 xfs_btree_offsets(
807         int64_t         fields,         /* bitmask of fields */
808         const short     *offsets,       /* table of field offsets */
809         int             nbits,          /* number of bits to inspect */
810         int             *first,         /* output: first byte offset */
811         int             *last)          /* output: last byte offset */
812 {
813         int             i;              /* current bit number */
814         int64_t         imask;          /* mask for current bit number */
815
816         ASSERT(fields != 0);
817         /*
818          * Find the lowest bit, so the first byte offset.
819          */
820         for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
821                 if (imask & fields) {
822                         *first = offsets[i];
823                         break;
824                 }
825         }
826         /*
827          * Find the highest bit, so the last byte offset.
828          */
829         for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
830                 if (imask & fields) {
831                         *last = offsets[i + 1] - 1;
832                         break;
833                 }
834         }
835 }
836
837 /*
838  * Get a buffer for the block, return it read in.
839  * Long-form addressing.
840  */
841 int
842 xfs_btree_read_bufl(
843         struct xfs_mount        *mp,            /* file system mount point */
844         struct xfs_trans        *tp,            /* transaction pointer */
845         xfs_fsblock_t           fsbno,          /* file system block number */
846         struct xfs_buf          **bpp,          /* buffer for fsbno */
847         int                     refval,         /* ref count value for buffer */
848         const struct xfs_buf_ops *ops)
849 {
850         struct xfs_buf          *bp;            /* return value */
851         xfs_daddr_t             d;              /* real disk block address */
852         int                     error;
853
854         if (!xfs_verify_fsbno(mp, fsbno))
855                 return -EFSCORRUPTED;
856         d = XFS_FSB_TO_DADDR(mp, fsbno);
857         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
858                                    mp->m_bsize, 0, &bp, ops);
859         if (error)
860                 return error;
861         if (bp)
862                 xfs_buf_set_ref(bp, refval);
863         *bpp = bp;
864         return 0;
865 }
866
867 /*
868  * Read-ahead the block, don't wait for it, don't return a buffer.
869  * Long-form addressing.
870  */
871 /* ARGSUSED */
872 void
873 xfs_btree_reada_bufl(
874         struct xfs_mount        *mp,            /* file system mount point */
875         xfs_fsblock_t           fsbno,          /* file system block number */
876         xfs_extlen_t            count,          /* count of filesystem blocks */
877         const struct xfs_buf_ops *ops)
878 {
879         xfs_daddr_t             d;
880
881         ASSERT(fsbno != NULLFSBLOCK);
882         d = XFS_FSB_TO_DADDR(mp, fsbno);
883         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
884 }
885
886 /*
887  * Read-ahead the block, don't wait for it, don't return a buffer.
888  * Short-form addressing.
889  */
890 /* ARGSUSED */
891 void
892 xfs_btree_reada_bufs(
893         struct xfs_mount        *mp,            /* file system mount point */
894         xfs_agnumber_t          agno,           /* allocation group number */
895         xfs_agblock_t           agbno,          /* allocation group block number */
896         xfs_extlen_t            count,          /* count of filesystem blocks */
897         const struct xfs_buf_ops *ops)
898 {
899         xfs_daddr_t             d;
900
901         ASSERT(agno != NULLAGNUMBER);
902         ASSERT(agbno != NULLAGBLOCK);
903         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
904         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
905 }
906
907 STATIC int
908 xfs_btree_readahead_lblock(
909         struct xfs_btree_cur    *cur,
910         int                     lr,
911         struct xfs_btree_block  *block)
912 {
913         int                     rval = 0;
914         xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
915         xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
916
917         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
918                 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
919                                      cur->bc_ops->buf_ops);
920                 rval++;
921         }
922
923         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
924                 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
925                                      cur->bc_ops->buf_ops);
926                 rval++;
927         }
928
929         return rval;
930 }
931
932 STATIC int
933 xfs_btree_readahead_sblock(
934         struct xfs_btree_cur    *cur,
935         int                     lr,
936         struct xfs_btree_block *block)
937 {
938         int                     rval = 0;
939         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
940         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
941
942
943         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
944                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
945                                      left, 1, cur->bc_ops->buf_ops);
946                 rval++;
947         }
948
949         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
950                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
951                                      right, 1, cur->bc_ops->buf_ops);
952                 rval++;
953         }
954
955         return rval;
956 }
957
958 /*
959  * Read-ahead btree blocks, at the given level.
960  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
961  */
962 STATIC int
963 xfs_btree_readahead(
964         struct xfs_btree_cur    *cur,           /* btree cursor */
965         int                     lev,            /* level in btree */
966         int                     lr)             /* left/right bits */
967 {
968         struct xfs_btree_block  *block;
969
970         /*
971          * No readahead needed if we are at the root level and the
972          * btree root is stored in the inode.
973          */
974         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
975             (lev == cur->bc_nlevels - 1))
976                 return 0;
977
978         if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
979                 return 0;
980
981         cur->bc_ra[lev] |= lr;
982         block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
983
984         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
985                 return xfs_btree_readahead_lblock(cur, lr, block);
986         return xfs_btree_readahead_sblock(cur, lr, block);
987 }
988
989 STATIC int
990 xfs_btree_ptr_to_daddr(
991         struct xfs_btree_cur    *cur,
992         union xfs_btree_ptr     *ptr,
993         xfs_daddr_t             *daddr)
994 {
995         xfs_fsblock_t           fsbno;
996         xfs_agblock_t           agbno;
997         int                     error;
998
999         error = xfs_btree_check_ptr(cur, ptr, 0, 1);
1000         if (error)
1001                 return error;
1002
1003         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1004                 fsbno = be64_to_cpu(ptr->l);
1005                 *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
1006         } else {
1007                 agbno = be32_to_cpu(ptr->s);
1008                 *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
1009                                 agbno);
1010         }
1011
1012         return 0;
1013 }
1014
1015 /*
1016  * Readahead @count btree blocks at the given @ptr location.
1017  *
1018  * We don't need to care about long or short form btrees here as we have a
1019  * method of converting the ptr directly to a daddr available to us.
1020  */
1021 STATIC void
1022 xfs_btree_readahead_ptr(
1023         struct xfs_btree_cur    *cur,
1024         union xfs_btree_ptr     *ptr,
1025         xfs_extlen_t            count)
1026 {
1027         xfs_daddr_t             daddr;
1028
1029         if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
1030                 return;
1031         xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
1032                           cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
1033 }
1034
1035 /*
1036  * Set the buffer for level "lev" in the cursor to bp, releasing
1037  * any previous buffer.
1038  */
1039 STATIC void
1040 xfs_btree_setbuf(
1041         xfs_btree_cur_t         *cur,   /* btree cursor */
1042         int                     lev,    /* level in btree */
1043         xfs_buf_t               *bp)    /* new buffer to set */
1044 {
1045         struct xfs_btree_block  *b;     /* btree block */
1046
1047         if (cur->bc_bufs[lev])
1048                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
1049         cur->bc_bufs[lev] = bp;
1050         cur->bc_ra[lev] = 0;
1051
1052         b = XFS_BUF_TO_BLOCK(bp);
1053         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1054                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1055                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1056                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1057                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1058         } else {
1059                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1060                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1061                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1062                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1063         }
1064 }
1065
1066 bool
1067 xfs_btree_ptr_is_null(
1068         struct xfs_btree_cur    *cur,
1069         union xfs_btree_ptr     *ptr)
1070 {
1071         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1072                 return ptr->l == cpu_to_be64(NULLFSBLOCK);
1073         else
1074                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
1075 }
1076
1077 STATIC void
1078 xfs_btree_set_ptr_null(
1079         struct xfs_btree_cur    *cur,
1080         union xfs_btree_ptr     *ptr)
1081 {
1082         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1083                 ptr->l = cpu_to_be64(NULLFSBLOCK);
1084         else
1085                 ptr->s = cpu_to_be32(NULLAGBLOCK);
1086 }
1087
1088 /*
1089  * Get/set/init sibling pointers
1090  */
1091 void
1092 xfs_btree_get_sibling(
1093         struct xfs_btree_cur    *cur,
1094         struct xfs_btree_block  *block,
1095         union xfs_btree_ptr     *ptr,
1096         int                     lr)
1097 {
1098         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1099
1100         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1101                 if (lr == XFS_BB_RIGHTSIB)
1102                         ptr->l = block->bb_u.l.bb_rightsib;
1103                 else
1104                         ptr->l = block->bb_u.l.bb_leftsib;
1105         } else {
1106                 if (lr == XFS_BB_RIGHTSIB)
1107                         ptr->s = block->bb_u.s.bb_rightsib;
1108                 else
1109                         ptr->s = block->bb_u.s.bb_leftsib;
1110         }
1111 }
1112
1113 STATIC void
1114 xfs_btree_set_sibling(
1115         struct xfs_btree_cur    *cur,
1116         struct xfs_btree_block  *block,
1117         union xfs_btree_ptr     *ptr,
1118         int                     lr)
1119 {
1120         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1121
1122         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1123                 if (lr == XFS_BB_RIGHTSIB)
1124                         block->bb_u.l.bb_rightsib = ptr->l;
1125                 else
1126                         block->bb_u.l.bb_leftsib = ptr->l;
1127         } else {
1128                 if (lr == XFS_BB_RIGHTSIB)
1129                         block->bb_u.s.bb_rightsib = ptr->s;
1130                 else
1131                         block->bb_u.s.bb_leftsib = ptr->s;
1132         }
1133 }
1134
1135 void
1136 xfs_btree_init_block_int(
1137         struct xfs_mount        *mp,
1138         struct xfs_btree_block  *buf,
1139         xfs_daddr_t             blkno,
1140         xfs_btnum_t             btnum,
1141         __u16                   level,
1142         __u16                   numrecs,
1143         __u64                   owner,
1144         unsigned int            flags)
1145 {
1146         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
1147         __u32                   magic = xfs_btree_magic(crc, btnum);
1148
1149         buf->bb_magic = cpu_to_be32(magic);
1150         buf->bb_level = cpu_to_be16(level);
1151         buf->bb_numrecs = cpu_to_be16(numrecs);
1152
1153         if (flags & XFS_BTREE_LONG_PTRS) {
1154                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1155                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1156                 if (crc) {
1157                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1158                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1159                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1160                         buf->bb_u.l.bb_pad = 0;
1161                         buf->bb_u.l.bb_lsn = 0;
1162                 }
1163         } else {
1164                 /* owner is a 32 bit value on short blocks */
1165                 __u32 __owner = (__u32)owner;
1166
1167                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1168                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1169                 if (crc) {
1170                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1171                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1172                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1173                         buf->bb_u.s.bb_lsn = 0;
1174                 }
1175         }
1176 }
1177
1178 void
1179 xfs_btree_init_block(
1180         struct xfs_mount *mp,
1181         struct xfs_buf  *bp,
1182         xfs_btnum_t     btnum,
1183         __u16           level,
1184         __u16           numrecs,
1185         __u64           owner)
1186 {
1187         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1188                                  btnum, level, numrecs, owner, 0);
1189 }
1190
1191 STATIC void
1192 xfs_btree_init_block_cur(
1193         struct xfs_btree_cur    *cur,
1194         struct xfs_buf          *bp,
1195         int                     level,
1196         int                     numrecs)
1197 {
1198         __u64                   owner;
1199
1200         /*
1201          * we can pull the owner from the cursor right now as the different
1202          * owners align directly with the pointer size of the btree. This may
1203          * change in future, but is safe for current users of the generic btree
1204          * code.
1205          */
1206         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1207                 owner = cur->bc_private.b.ip->i_ino;
1208         else
1209                 owner = cur->bc_private.a.agno;
1210
1211         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1212                                  cur->bc_btnum, level, numrecs,
1213                                  owner, cur->bc_flags);
1214 }
1215
1216 /*
1217  * Return true if ptr is the last record in the btree and
1218  * we need to track updates to this record.  The decision
1219  * will be further refined in the update_lastrec method.
1220  */
1221 STATIC int
1222 xfs_btree_is_lastrec(
1223         struct xfs_btree_cur    *cur,
1224         struct xfs_btree_block  *block,
1225         int                     level)
1226 {
1227         union xfs_btree_ptr     ptr;
1228
1229         if (level > 0)
1230                 return 0;
1231         if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1232                 return 0;
1233
1234         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1235         if (!xfs_btree_ptr_is_null(cur, &ptr))
1236                 return 0;
1237         return 1;
1238 }
1239
1240 STATIC void
1241 xfs_btree_buf_to_ptr(
1242         struct xfs_btree_cur    *cur,
1243         struct xfs_buf          *bp,
1244         union xfs_btree_ptr     *ptr)
1245 {
1246         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1247                 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1248                                         XFS_BUF_ADDR(bp)));
1249         else {
1250                 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1251                                         XFS_BUF_ADDR(bp)));
1252         }
1253 }
1254
1255 STATIC void
1256 xfs_btree_set_refs(
1257         struct xfs_btree_cur    *cur,
1258         struct xfs_buf          *bp)
1259 {
1260         switch (cur->bc_btnum) {
1261         case XFS_BTNUM_BNO:
1262         case XFS_BTNUM_CNT:
1263                 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1264                 break;
1265         case XFS_BTNUM_INO:
1266         case XFS_BTNUM_FINO:
1267                 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1268                 break;
1269         case XFS_BTNUM_BMAP:
1270                 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1271                 break;
1272         case XFS_BTNUM_RMAP:
1273                 xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1274                 break;
1275         case XFS_BTNUM_REFC:
1276                 xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1277                 break;
1278         default:
1279                 ASSERT(0);
1280         }
1281 }
1282
1283 STATIC int
1284 xfs_btree_get_buf_block(
1285         struct xfs_btree_cur    *cur,
1286         union xfs_btree_ptr     *ptr,
1287         struct xfs_btree_block  **block,
1288         struct xfs_buf          **bpp)
1289 {
1290         struct xfs_mount        *mp = cur->bc_mp;
1291         xfs_daddr_t             d;
1292         int                     error;
1293
1294         error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1295         if (error)
1296                 return error;
1297         *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1298                                  mp->m_bsize, 0);
1299
1300         if (!*bpp)
1301                 return -ENOMEM;
1302
1303         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1304         *block = XFS_BUF_TO_BLOCK(*bpp);
1305         return 0;
1306 }
1307
1308 /*
1309  * Read in the buffer at the given ptr and return the buffer and
1310  * the block pointer within the buffer.
1311  */
1312 STATIC int
1313 xfs_btree_read_buf_block(
1314         struct xfs_btree_cur    *cur,
1315         union xfs_btree_ptr     *ptr,
1316         int                     flags,
1317         struct xfs_btree_block  **block,
1318         struct xfs_buf          **bpp)
1319 {
1320         struct xfs_mount        *mp = cur->bc_mp;
1321         xfs_daddr_t             d;
1322         int                     error;
1323
1324         /* need to sort out how callers deal with failures first */
1325         ASSERT(!(flags & XBF_TRYLOCK));
1326
1327         error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1328         if (error)
1329                 return error;
1330         error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1331                                    mp->m_bsize, flags, bpp,
1332                                    cur->bc_ops->buf_ops);
1333         if (error)
1334                 return error;
1335
1336         xfs_btree_set_refs(cur, *bpp);
1337         *block = XFS_BUF_TO_BLOCK(*bpp);
1338         return 0;
1339 }
1340
1341 /*
1342  * Copy keys from one btree block to another.
1343  */
1344 STATIC void
1345 xfs_btree_copy_keys(
1346         struct xfs_btree_cur    *cur,
1347         union xfs_btree_key     *dst_key,
1348         union xfs_btree_key     *src_key,
1349         int                     numkeys)
1350 {
1351         ASSERT(numkeys >= 0);
1352         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1353 }
1354
1355 /*
1356  * Copy records from one btree block to another.
1357  */
1358 STATIC void
1359 xfs_btree_copy_recs(
1360         struct xfs_btree_cur    *cur,
1361         union xfs_btree_rec     *dst_rec,
1362         union xfs_btree_rec     *src_rec,
1363         int                     numrecs)
1364 {
1365         ASSERT(numrecs >= 0);
1366         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1367 }
1368
1369 /*
1370  * Copy block pointers from one btree block to another.
1371  */
1372 STATIC void
1373 xfs_btree_copy_ptrs(
1374         struct xfs_btree_cur    *cur,
1375         union xfs_btree_ptr     *dst_ptr,
1376         union xfs_btree_ptr     *src_ptr,
1377         int                     numptrs)
1378 {
1379         ASSERT(numptrs >= 0);
1380         memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1381 }
1382
1383 /*
1384  * Shift keys one index left/right inside a single btree block.
1385  */
1386 STATIC void
1387 xfs_btree_shift_keys(
1388         struct xfs_btree_cur    *cur,
1389         union xfs_btree_key     *key,
1390         int                     dir,
1391         int                     numkeys)
1392 {
1393         char                    *dst_key;
1394
1395         ASSERT(numkeys >= 0);
1396         ASSERT(dir == 1 || dir == -1);
1397
1398         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1399         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1400 }
1401
1402 /*
1403  * Shift records one index left/right inside a single btree block.
1404  */
1405 STATIC void
1406 xfs_btree_shift_recs(
1407         struct xfs_btree_cur    *cur,
1408         union xfs_btree_rec     *rec,
1409         int                     dir,
1410         int                     numrecs)
1411 {
1412         char                    *dst_rec;
1413
1414         ASSERT(numrecs >= 0);
1415         ASSERT(dir == 1 || dir == -1);
1416
1417         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1418         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1419 }
1420
1421 /*
1422  * Shift block pointers one index left/right inside a single btree block.
1423  */
1424 STATIC void
1425 xfs_btree_shift_ptrs(
1426         struct xfs_btree_cur    *cur,
1427         union xfs_btree_ptr     *ptr,
1428         int                     dir,
1429         int                     numptrs)
1430 {
1431         char                    *dst_ptr;
1432
1433         ASSERT(numptrs >= 0);
1434         ASSERT(dir == 1 || dir == -1);
1435
1436         dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1437         memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1438 }
1439
1440 /*
1441  * Log key values from the btree block.
1442  */
1443 STATIC void
1444 xfs_btree_log_keys(
1445         struct xfs_btree_cur    *cur,
1446         struct xfs_buf          *bp,
1447         int                     first,
1448         int                     last)
1449 {
1450
1451         if (bp) {
1452                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1453                 xfs_trans_log_buf(cur->bc_tp, bp,
1454                                   xfs_btree_key_offset(cur, first),
1455                                   xfs_btree_key_offset(cur, last + 1) - 1);
1456         } else {
1457                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1458                                 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1459         }
1460 }
1461
1462 /*
1463  * Log record values from the btree block.
1464  */
1465 void
1466 xfs_btree_log_recs(
1467         struct xfs_btree_cur    *cur,
1468         struct xfs_buf          *bp,
1469         int                     first,
1470         int                     last)
1471 {
1472
1473         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1474         xfs_trans_log_buf(cur->bc_tp, bp,
1475                           xfs_btree_rec_offset(cur, first),
1476                           xfs_btree_rec_offset(cur, last + 1) - 1);
1477
1478 }
1479
1480 /*
1481  * Log block pointer fields from a btree block (nonleaf).
1482  */
1483 STATIC void
1484 xfs_btree_log_ptrs(
1485         struct xfs_btree_cur    *cur,   /* btree cursor */
1486         struct xfs_buf          *bp,    /* buffer containing btree block */
1487         int                     first,  /* index of first pointer to log */
1488         int                     last)   /* index of last pointer to log */
1489 {
1490
1491         if (bp) {
1492                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1493                 int                     level = xfs_btree_get_level(block);
1494
1495                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1496                 xfs_trans_log_buf(cur->bc_tp, bp,
1497                                 xfs_btree_ptr_offset(cur, first, level),
1498                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1499         } else {
1500                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1501                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1502         }
1503
1504 }
1505
1506 /*
1507  * Log fields from a btree block header.
1508  */
1509 void
1510 xfs_btree_log_block(
1511         struct xfs_btree_cur    *cur,   /* btree cursor */
1512         struct xfs_buf          *bp,    /* buffer containing btree block */
1513         int                     fields) /* mask of fields: XFS_BB_... */
1514 {
1515         int                     first;  /* first byte offset logged */
1516         int                     last;   /* last byte offset logged */
1517         static const short      soffsets[] = {  /* table of offsets (short) */
1518                 offsetof(struct xfs_btree_block, bb_magic),
1519                 offsetof(struct xfs_btree_block, bb_level),
1520                 offsetof(struct xfs_btree_block, bb_numrecs),
1521                 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1522                 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1523                 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1524                 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1525                 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1526                 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1527                 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1528                 XFS_BTREE_SBLOCK_CRC_LEN
1529         };
1530         static const short      loffsets[] = {  /* table of offsets (long) */
1531                 offsetof(struct xfs_btree_block, bb_magic),
1532                 offsetof(struct xfs_btree_block, bb_level),
1533                 offsetof(struct xfs_btree_block, bb_numrecs),
1534                 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1535                 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1536                 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1537                 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1538                 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1539                 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1540                 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1541                 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1542                 XFS_BTREE_LBLOCK_CRC_LEN
1543         };
1544
1545         if (bp) {
1546                 int nbits;
1547
1548                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1549                         /*
1550                          * We don't log the CRC when updating a btree
1551                          * block but instead recreate it during log
1552                          * recovery.  As the log buffers have checksums
1553                          * of their own this is safe and avoids logging a crc
1554                          * update in a lot of places.
1555                          */
1556                         if (fields == XFS_BB_ALL_BITS)
1557                                 fields = XFS_BB_ALL_BITS_CRC;
1558                         nbits = XFS_BB_NUM_BITS_CRC;
1559                 } else {
1560                         nbits = XFS_BB_NUM_BITS;
1561                 }
1562                 xfs_btree_offsets(fields,
1563                                   (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1564                                         loffsets : soffsets,
1565                                   nbits, &first, &last);
1566                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1567                 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1568         } else {
1569                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1570                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1571         }
1572 }
1573
1574 /*
1575  * Increment cursor by one record at the level.
1576  * For nonzero levels the leaf-ward information is untouched.
1577  */
1578 int                                             /* error */
1579 xfs_btree_increment(
1580         struct xfs_btree_cur    *cur,
1581         int                     level,
1582         int                     *stat)          /* success/failure */
1583 {
1584         struct xfs_btree_block  *block;
1585         union xfs_btree_ptr     ptr;
1586         struct xfs_buf          *bp;
1587         int                     error;          /* error return value */
1588         int                     lev;
1589
1590         ASSERT(level < cur->bc_nlevels);
1591
1592         /* Read-ahead to the right at this level. */
1593         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1594
1595         /* Get a pointer to the btree block. */
1596         block = xfs_btree_get_block(cur, level, &bp);
1597
1598 #ifdef DEBUG
1599         error = xfs_btree_check_block(cur, block, level, bp);
1600         if (error)
1601                 goto error0;
1602 #endif
1603
1604         /* We're done if we remain in the block after the increment. */
1605         if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1606                 goto out1;
1607
1608         /* Fail if we just went off the right edge of the tree. */
1609         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1610         if (xfs_btree_ptr_is_null(cur, &ptr))
1611                 goto out0;
1612
1613         XFS_BTREE_STATS_INC(cur, increment);
1614
1615         /*
1616          * March up the tree incrementing pointers.
1617          * Stop when we don't go off the right edge of a block.
1618          */
1619         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1620                 block = xfs_btree_get_block(cur, lev, &bp);
1621
1622 #ifdef DEBUG
1623                 error = xfs_btree_check_block(cur, block, lev, bp);
1624                 if (error)
1625                         goto error0;
1626 #endif
1627
1628                 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1629                         break;
1630
1631                 /* Read-ahead the right block for the next loop. */
1632                 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1633         }
1634
1635         /*
1636          * If we went off the root then we are either seriously
1637          * confused or have the tree root in an inode.
1638          */
1639         if (lev == cur->bc_nlevels) {
1640                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1641                         goto out0;
1642                 ASSERT(0);
1643                 error = -EFSCORRUPTED;
1644                 goto error0;
1645         }
1646         ASSERT(lev < cur->bc_nlevels);
1647
1648         /*
1649          * Now walk back down the tree, fixing up the cursor's buffer
1650          * pointers and key numbers.
1651          */
1652         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1653                 union xfs_btree_ptr     *ptrp;
1654
1655                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1656                 --lev;
1657                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1658                 if (error)
1659                         goto error0;
1660
1661                 xfs_btree_setbuf(cur, lev, bp);
1662                 cur->bc_ptrs[lev] = 1;
1663         }
1664 out1:
1665         *stat = 1;
1666         return 0;
1667
1668 out0:
1669         *stat = 0;
1670         return 0;
1671
1672 error0:
1673         return error;
1674 }
1675
1676 /*
1677  * Decrement cursor by one record at the level.
1678  * For nonzero levels the leaf-ward information is untouched.
1679  */
1680 int                                             /* error */
1681 xfs_btree_decrement(
1682         struct xfs_btree_cur    *cur,
1683         int                     level,
1684         int                     *stat)          /* success/failure */
1685 {
1686         struct xfs_btree_block  *block;
1687         xfs_buf_t               *bp;
1688         int                     error;          /* error return value */
1689         int                     lev;
1690         union xfs_btree_ptr     ptr;
1691
1692         ASSERT(level < cur->bc_nlevels);
1693
1694         /* Read-ahead to the left at this level. */
1695         xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1696
1697         /* We're done if we remain in the block after the decrement. */
1698         if (--cur->bc_ptrs[level] > 0)
1699                 goto out1;
1700
1701         /* Get a pointer to the btree block. */
1702         block = xfs_btree_get_block(cur, level, &bp);
1703
1704 #ifdef DEBUG
1705         error = xfs_btree_check_block(cur, block, level, bp);
1706         if (error)
1707                 goto error0;
1708 #endif
1709
1710         /* Fail if we just went off the left edge of the tree. */
1711         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1712         if (xfs_btree_ptr_is_null(cur, &ptr))
1713                 goto out0;
1714
1715         XFS_BTREE_STATS_INC(cur, decrement);
1716
1717         /*
1718          * March up the tree decrementing pointers.
1719          * Stop when we don't go off the left edge of a block.
1720          */
1721         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1722                 if (--cur->bc_ptrs[lev] > 0)
1723                         break;
1724                 /* Read-ahead the left block for the next loop. */
1725                 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1726         }
1727
1728         /*
1729          * If we went off the root then we are seriously confused.
1730          * or the root of the tree is in an inode.
1731          */
1732         if (lev == cur->bc_nlevels) {
1733                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1734                         goto out0;
1735                 ASSERT(0);
1736                 error = -EFSCORRUPTED;
1737                 goto error0;
1738         }
1739         ASSERT(lev < cur->bc_nlevels);
1740
1741         /*
1742          * Now walk back down the tree, fixing up the cursor's buffer
1743          * pointers and key numbers.
1744          */
1745         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1746                 union xfs_btree_ptr     *ptrp;
1747
1748                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1749                 --lev;
1750                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1751                 if (error)
1752                         goto error0;
1753                 xfs_btree_setbuf(cur, lev, bp);
1754                 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1755         }
1756 out1:
1757         *stat = 1;
1758         return 0;
1759
1760 out0:
1761         *stat = 0;
1762         return 0;
1763
1764 error0:
1765         return error;
1766 }
1767
1768 int
1769 xfs_btree_lookup_get_block(
1770         struct xfs_btree_cur    *cur,   /* btree cursor */
1771         int                     level,  /* level in the btree */
1772         union xfs_btree_ptr     *pp,    /* ptr to btree block */
1773         struct xfs_btree_block  **blkp) /* return btree block */
1774 {
1775         struct xfs_buf          *bp;    /* buffer pointer for btree block */
1776         xfs_daddr_t             daddr;
1777         int                     error = 0;
1778
1779         /* special case the root block if in an inode */
1780         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1781             (level == cur->bc_nlevels - 1)) {
1782                 *blkp = xfs_btree_get_iroot(cur);
1783                 return 0;
1784         }
1785
1786         /*
1787          * If the old buffer at this level for the disk address we are
1788          * looking for re-use it.
1789          *
1790          * Otherwise throw it away and get a new one.
1791          */
1792         bp = cur->bc_bufs[level];
1793         error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1794         if (error)
1795                 return error;
1796         if (bp && XFS_BUF_ADDR(bp) == daddr) {
1797                 *blkp = XFS_BUF_TO_BLOCK(bp);
1798                 return 0;
1799         }
1800
1801         error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1802         if (error)
1803                 return error;
1804
1805         /* Check the inode owner since the verifiers don't. */
1806         if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1807             !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
1808             (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1809             be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1810                         cur->bc_private.b.ip->i_ino)
1811                 goto out_bad;
1812
1813         /* Did we get the level we were looking for? */
1814         if (be16_to_cpu((*blkp)->bb_level) != level)
1815                 goto out_bad;
1816
1817         /* Check that internal nodes have at least one record. */
1818         if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1819                 goto out_bad;
1820
1821         xfs_btree_setbuf(cur, level, bp);
1822         return 0;
1823
1824 out_bad:
1825         *blkp = NULL;
1826         xfs_trans_brelse(cur->bc_tp, bp);
1827         return -EFSCORRUPTED;
1828 }
1829
1830 /*
1831  * Get current search key.  For level 0 we don't actually have a key
1832  * structure so we make one up from the record.  For all other levels
1833  * we just return the right key.
1834  */
1835 STATIC union xfs_btree_key *
1836 xfs_lookup_get_search_key(
1837         struct xfs_btree_cur    *cur,
1838         int                     level,
1839         int                     keyno,
1840         struct xfs_btree_block  *block,
1841         union xfs_btree_key     *kp)
1842 {
1843         if (level == 0) {
1844                 cur->bc_ops->init_key_from_rec(kp,
1845                                 xfs_btree_rec_addr(cur, keyno, block));
1846                 return kp;
1847         }
1848
1849         return xfs_btree_key_addr(cur, keyno, block);
1850 }
1851
1852 /*
1853  * Lookup the record.  The cursor is made to point to it, based on dir.
1854  * stat is set to 0 if can't find any such record, 1 for success.
1855  */
1856 int                                     /* error */
1857 xfs_btree_lookup(
1858         struct xfs_btree_cur    *cur,   /* btree cursor */
1859         xfs_lookup_t            dir,    /* <=, ==, or >= */
1860         int                     *stat)  /* success/failure */
1861 {
1862         struct xfs_btree_block  *block; /* current btree block */
1863         int64_t                 diff;   /* difference for the current key */
1864         int                     error;  /* error return value */
1865         int                     keyno;  /* current key number */
1866         int                     level;  /* level in the btree */
1867         union xfs_btree_ptr     *pp;    /* ptr to btree block */
1868         union xfs_btree_ptr     ptr;    /* ptr to btree block */
1869
1870         XFS_BTREE_STATS_INC(cur, lookup);
1871
1872         /* No such thing as a zero-level tree. */
1873         if (cur->bc_nlevels == 0)
1874                 return -EFSCORRUPTED;
1875
1876         block = NULL;
1877         keyno = 0;
1878
1879         /* initialise start pointer from cursor */
1880         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1881         pp = &ptr;
1882
1883         /*
1884          * Iterate over each level in the btree, starting at the root.
1885          * For each level above the leaves, find the key we need, based
1886          * on the lookup record, then follow the corresponding block
1887          * pointer down to the next level.
1888          */
1889         for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1890                 /* Get the block we need to do the lookup on. */
1891                 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1892                 if (error)
1893                         goto error0;
1894
1895                 if (diff == 0) {
1896                         /*
1897                          * If we already had a key match at a higher level, we
1898                          * know we need to use the first entry in this block.
1899                          */
1900                         keyno = 1;
1901                 } else {
1902                         /* Otherwise search this block. Do a binary search. */
1903
1904                         int     high;   /* high entry number */
1905                         int     low;    /* low entry number */
1906
1907                         /* Set low and high entry numbers, 1-based. */
1908                         low = 1;
1909                         high = xfs_btree_get_numrecs(block);
1910                         if (!high) {
1911                                 /* Block is empty, must be an empty leaf. */
1912                                 if (level != 0 || cur->bc_nlevels != 1) {
1913                                         XFS_CORRUPTION_ERROR(__func__,
1914                                                         XFS_ERRLEVEL_LOW,
1915                                                         cur->bc_mp, block,
1916                                                         sizeof(*block));
1917                                         return -EFSCORRUPTED;
1918                                 }
1919
1920                                 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1921                                 *stat = 0;
1922                                 return 0;
1923                         }
1924
1925                         /* Binary search the block. */
1926                         while (low <= high) {
1927                                 union xfs_btree_key     key;
1928                                 union xfs_btree_key     *kp;
1929
1930                                 XFS_BTREE_STATS_INC(cur, compare);
1931
1932                                 /* keyno is average of low and high. */
1933                                 keyno = (low + high) >> 1;
1934
1935                                 /* Get current search key */
1936                                 kp = xfs_lookup_get_search_key(cur, level,
1937                                                 keyno, block, &key);
1938
1939                                 /*
1940                                  * Compute difference to get next direction:
1941                                  *  - less than, move right
1942                                  *  - greater than, move left
1943                                  *  - equal, we're done
1944                                  */
1945                                 diff = cur->bc_ops->key_diff(cur, kp);
1946                                 if (diff < 0)
1947                                         low = keyno + 1;
1948                                 else if (diff > 0)
1949                                         high = keyno - 1;
1950                                 else
1951                                         break;
1952                         }
1953                 }
1954
1955                 /*
1956                  * If there are more levels, set up for the next level
1957                  * by getting the block number and filling in the cursor.
1958                  */
1959                 if (level > 0) {
1960                         /*
1961                          * If we moved left, need the previous key number,
1962                          * unless there isn't one.
1963                          */
1964                         if (diff > 0 && --keyno < 1)
1965                                 keyno = 1;
1966                         pp = xfs_btree_ptr_addr(cur, keyno, block);
1967
1968                         error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
1969                         if (error)
1970                                 goto error0;
1971
1972                         cur->bc_ptrs[level] = keyno;
1973                 }
1974         }
1975
1976         /* Done with the search. See if we need to adjust the results. */
1977         if (dir != XFS_LOOKUP_LE && diff < 0) {
1978                 keyno++;
1979                 /*
1980                  * If ge search and we went off the end of the block, but it's
1981                  * not the last block, we're in the wrong block.
1982                  */
1983                 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1984                 if (dir == XFS_LOOKUP_GE &&
1985                     keyno > xfs_btree_get_numrecs(block) &&
1986                     !xfs_btree_ptr_is_null(cur, &ptr)) {
1987                         int     i;
1988
1989                         cur->bc_ptrs[0] = keyno;
1990                         error = xfs_btree_increment(cur, 0, &i);
1991                         if (error)
1992                                 goto error0;
1993                         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1994                         *stat = 1;
1995                         return 0;
1996                 }
1997         } else if (dir == XFS_LOOKUP_LE && diff > 0)
1998                 keyno--;
1999         cur->bc_ptrs[0] = keyno;
2000
2001         /* Return if we succeeded or not. */
2002         if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
2003                 *stat = 0;
2004         else if (dir != XFS_LOOKUP_EQ || diff == 0)
2005                 *stat = 1;
2006         else
2007                 *stat = 0;
2008         return 0;
2009
2010 error0:
2011         return error;
2012 }
2013
2014 /* Find the high key storage area from a regular key. */
2015 union xfs_btree_key *
2016 xfs_btree_high_key_from_key(
2017         struct xfs_btree_cur    *cur,
2018         union xfs_btree_key     *key)
2019 {
2020         ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2021         return (union xfs_btree_key *)((char *)key +
2022                         (cur->bc_ops->key_len / 2));
2023 }
2024
2025 /* Determine the low (and high if overlapped) keys of a leaf block */
2026 STATIC void
2027 xfs_btree_get_leaf_keys(
2028         struct xfs_btree_cur    *cur,
2029         struct xfs_btree_block  *block,
2030         union xfs_btree_key     *key)
2031 {
2032         union xfs_btree_key     max_hkey;
2033         union xfs_btree_key     hkey;
2034         union xfs_btree_rec     *rec;
2035         union xfs_btree_key     *high;
2036         int                     n;
2037
2038         rec = xfs_btree_rec_addr(cur, 1, block);
2039         cur->bc_ops->init_key_from_rec(key, rec);
2040
2041         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2042
2043                 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2044                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2045                         rec = xfs_btree_rec_addr(cur, n, block);
2046                         cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2047                         if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2048                                         > 0)
2049                                 max_hkey = hkey;
2050                 }
2051
2052                 high = xfs_btree_high_key_from_key(cur, key);
2053                 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2054         }
2055 }
2056
2057 /* Determine the low (and high if overlapped) keys of a node block */
2058 STATIC void
2059 xfs_btree_get_node_keys(
2060         struct xfs_btree_cur    *cur,
2061         struct xfs_btree_block  *block,
2062         union xfs_btree_key     *key)
2063 {
2064         union xfs_btree_key     *hkey;
2065         union xfs_btree_key     *max_hkey;
2066         union xfs_btree_key     *high;
2067         int                     n;
2068
2069         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2070                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2071                                 cur->bc_ops->key_len / 2);
2072
2073                 max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2074                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2075                         hkey = xfs_btree_high_key_addr(cur, n, block);
2076                         if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2077                                 max_hkey = hkey;
2078                 }
2079
2080                 high = xfs_btree_high_key_from_key(cur, key);
2081                 memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2082         } else {
2083                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2084                                 cur->bc_ops->key_len);
2085         }
2086 }
2087
2088 /* Derive the keys for any btree block. */
2089 void
2090 xfs_btree_get_keys(
2091         struct xfs_btree_cur    *cur,
2092         struct xfs_btree_block  *block,
2093         union xfs_btree_key     *key)
2094 {
2095         if (be16_to_cpu(block->bb_level) == 0)
2096                 xfs_btree_get_leaf_keys(cur, block, key);
2097         else
2098                 xfs_btree_get_node_keys(cur, block, key);
2099 }
2100
2101 /*
2102  * Decide if we need to update the parent keys of a btree block.  For
2103  * a standard btree this is only necessary if we're updating the first
2104  * record/key.  For an overlapping btree, we must always update the
2105  * keys because the highest key can be in any of the records or keys
2106  * in the block.
2107  */
2108 static inline bool
2109 xfs_btree_needs_key_update(
2110         struct xfs_btree_cur    *cur,
2111         int                     ptr)
2112 {
2113         return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2114 }
2115
2116 /*
2117  * Update the low and high parent keys of the given level, progressing
2118  * towards the root.  If force_all is false, stop if the keys for a given
2119  * level do not need updating.
2120  */
2121 STATIC int
2122 __xfs_btree_updkeys(
2123         struct xfs_btree_cur    *cur,
2124         int                     level,
2125         struct xfs_btree_block  *block,
2126         struct xfs_buf          *bp0,
2127         bool                    force_all)
2128 {
2129         union xfs_btree_key     key;    /* keys from current level */
2130         union xfs_btree_key     *lkey;  /* keys from the next level up */
2131         union xfs_btree_key     *hkey;
2132         union xfs_btree_key     *nlkey; /* keys from the next level up */
2133         union xfs_btree_key     *nhkey;
2134         struct xfs_buf          *bp;
2135         int                     ptr;
2136
2137         ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2138
2139         /* Exit if there aren't any parent levels to update. */
2140         if (level + 1 >= cur->bc_nlevels)
2141                 return 0;
2142
2143         trace_xfs_btree_updkeys(cur, level, bp0);
2144
2145         lkey = &key;
2146         hkey = xfs_btree_high_key_from_key(cur, lkey);
2147         xfs_btree_get_keys(cur, block, lkey);
2148         for (level++; level < cur->bc_nlevels; level++) {
2149 #ifdef DEBUG
2150                 int             error;
2151 #endif
2152                 block = xfs_btree_get_block(cur, level, &bp);
2153                 trace_xfs_btree_updkeys(cur, level, bp);
2154 #ifdef DEBUG
2155                 error = xfs_btree_check_block(cur, block, level, bp);
2156                 if (error)
2157                         return error;
2158 #endif
2159                 ptr = cur->bc_ptrs[level];
2160                 nlkey = xfs_btree_key_addr(cur, ptr, block);
2161                 nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2162                 if (!force_all &&
2163                     !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2164                       cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2165                         break;
2166                 xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2167                 xfs_btree_log_keys(cur, bp, ptr, ptr);
2168                 if (level + 1 >= cur->bc_nlevels)
2169                         break;
2170                 xfs_btree_get_node_keys(cur, block, lkey);
2171         }
2172
2173         return 0;
2174 }
2175
2176 /* Update all the keys from some level in cursor back to the root. */
2177 STATIC int
2178 xfs_btree_updkeys_force(
2179         struct xfs_btree_cur    *cur,
2180         int                     level)
2181 {
2182         struct xfs_buf          *bp;
2183         struct xfs_btree_block  *block;
2184
2185         block = xfs_btree_get_block(cur, level, &bp);
2186         return __xfs_btree_updkeys(cur, level, block, bp, true);
2187 }
2188
2189 /*
2190  * Update the parent keys of the given level, progressing towards the root.
2191  */
2192 STATIC int
2193 xfs_btree_update_keys(
2194         struct xfs_btree_cur    *cur,
2195         int                     level)
2196 {
2197         struct xfs_btree_block  *block;
2198         struct xfs_buf          *bp;
2199         union xfs_btree_key     *kp;
2200         union xfs_btree_key     key;
2201         int                     ptr;
2202
2203         ASSERT(level >= 0);
2204
2205         block = xfs_btree_get_block(cur, level, &bp);
2206         if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2207                 return __xfs_btree_updkeys(cur, level, block, bp, false);
2208
2209         /*
2210          * Go up the tree from this level toward the root.
2211          * At each level, update the key value to the value input.
2212          * Stop when we reach a level where the cursor isn't pointing
2213          * at the first entry in the block.
2214          */
2215         xfs_btree_get_keys(cur, block, &key);
2216         for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2217 #ifdef DEBUG
2218                 int             error;
2219 #endif
2220                 block = xfs_btree_get_block(cur, level, &bp);
2221 #ifdef DEBUG
2222                 error = xfs_btree_check_block(cur, block, level, bp);
2223                 if (error)
2224                         return error;
2225 #endif
2226                 ptr = cur->bc_ptrs[level];
2227                 kp = xfs_btree_key_addr(cur, ptr, block);
2228                 xfs_btree_copy_keys(cur, kp, &key, 1);
2229                 xfs_btree_log_keys(cur, bp, ptr, ptr);
2230         }
2231
2232         return 0;
2233 }
2234
2235 /*
2236  * Update the record referred to by cur to the value in the
2237  * given record. This either works (return 0) or gets an
2238  * EFSCORRUPTED error.
2239  */
2240 int
2241 xfs_btree_update(
2242         struct xfs_btree_cur    *cur,
2243         union xfs_btree_rec     *rec)
2244 {
2245         struct xfs_btree_block  *block;
2246         struct xfs_buf          *bp;
2247         int                     error;
2248         int                     ptr;
2249         union xfs_btree_rec     *rp;
2250
2251         /* Pick up the current block. */
2252         block = xfs_btree_get_block(cur, 0, &bp);
2253
2254 #ifdef DEBUG
2255         error = xfs_btree_check_block(cur, block, 0, bp);
2256         if (error)
2257                 goto error0;
2258 #endif
2259         /* Get the address of the rec to be updated. */
2260         ptr = cur->bc_ptrs[0];
2261         rp = xfs_btree_rec_addr(cur, ptr, block);
2262
2263         /* Fill in the new contents and log them. */
2264         xfs_btree_copy_recs(cur, rp, rec, 1);
2265         xfs_btree_log_recs(cur, bp, ptr, ptr);
2266
2267         /*
2268          * If we are tracking the last record in the tree and
2269          * we are at the far right edge of the tree, update it.
2270          */
2271         if (xfs_btree_is_lastrec(cur, block, 0)) {
2272                 cur->bc_ops->update_lastrec(cur, block, rec,
2273                                             ptr, LASTREC_UPDATE);
2274         }
2275
2276         /* Pass new key value up to our parent. */
2277         if (xfs_btree_needs_key_update(cur, ptr)) {
2278                 error = xfs_btree_update_keys(cur, 0);
2279                 if (error)
2280                         goto error0;
2281         }
2282
2283         return 0;
2284
2285 error0:
2286         return error;
2287 }
2288
2289 /*
2290  * Move 1 record left from cur/level if possible.
2291  * Update cur to reflect the new path.
2292  */
2293 STATIC int                                      /* error */
2294 xfs_btree_lshift(
2295         struct xfs_btree_cur    *cur,
2296         int                     level,
2297         int                     *stat)          /* success/failure */
2298 {
2299         struct xfs_buf          *lbp;           /* left buffer pointer */
2300         struct xfs_btree_block  *left;          /* left btree block */
2301         int                     lrecs;          /* left record count */
2302         struct xfs_buf          *rbp;           /* right buffer pointer */
2303         struct xfs_btree_block  *right;         /* right btree block */
2304         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2305         int                     rrecs;          /* right record count */
2306         union xfs_btree_ptr     lptr;           /* left btree pointer */
2307         union xfs_btree_key     *rkp = NULL;    /* right btree key */
2308         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2309         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2310         int                     error;          /* error return value */
2311         int                     i;
2312
2313         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2314             level == cur->bc_nlevels - 1)
2315                 goto out0;
2316
2317         /* Set up variables for this block as "right". */
2318         right = xfs_btree_get_block(cur, level, &rbp);
2319
2320 #ifdef DEBUG
2321         error = xfs_btree_check_block(cur, right, level, rbp);
2322         if (error)
2323                 goto error0;
2324 #endif
2325
2326         /* If we've got no left sibling then we can't shift an entry left. */
2327         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2328         if (xfs_btree_ptr_is_null(cur, &lptr))
2329                 goto out0;
2330
2331         /*
2332          * If the cursor entry is the one that would be moved, don't
2333          * do it... it's too complicated.
2334          */
2335         if (cur->bc_ptrs[level] <= 1)
2336                 goto out0;
2337
2338         /* Set up the left neighbor as "left". */
2339         error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2340         if (error)
2341                 goto error0;
2342
2343         /* If it's full, it can't take another entry. */
2344         lrecs = xfs_btree_get_numrecs(left);
2345         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2346                 goto out0;
2347
2348         rrecs = xfs_btree_get_numrecs(right);
2349
2350         /*
2351          * We add one entry to the left side and remove one for the right side.
2352          * Account for it here, the changes will be updated on disk and logged
2353          * later.
2354          */
2355         lrecs++;
2356         rrecs--;
2357
2358         XFS_BTREE_STATS_INC(cur, lshift);
2359         XFS_BTREE_STATS_ADD(cur, moves, 1);
2360
2361         /*
2362          * If non-leaf, copy a key and a ptr to the left block.
2363          * Log the changes to the left block.
2364          */
2365         if (level > 0) {
2366                 /* It's a non-leaf.  Move keys and pointers. */
2367                 union xfs_btree_key     *lkp;   /* left btree key */
2368                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2369
2370                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2371                 rkp = xfs_btree_key_addr(cur, 1, right);
2372
2373                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2374                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2375
2376                 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2377                 if (error)
2378                         goto error0;
2379
2380                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2381                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2382
2383                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2384                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2385
2386                 ASSERT(cur->bc_ops->keys_inorder(cur,
2387                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2388         } else {
2389                 /* It's a leaf.  Move records.  */
2390                 union xfs_btree_rec     *lrp;   /* left record pointer */
2391
2392                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2393                 rrp = xfs_btree_rec_addr(cur, 1, right);
2394
2395                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2396                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2397
2398                 ASSERT(cur->bc_ops->recs_inorder(cur,
2399                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2400         }
2401
2402         xfs_btree_set_numrecs(left, lrecs);
2403         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2404
2405         xfs_btree_set_numrecs(right, rrecs);
2406         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2407
2408         /*
2409          * Slide the contents of right down one entry.
2410          */
2411         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2412         if (level > 0) {
2413                 /* It's a nonleaf. operate on keys and ptrs */
2414                 int                     i;              /* loop index */
2415
2416                 for (i = 0; i < rrecs; i++) {
2417                         error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2418                         if (error)
2419                                 goto error0;
2420                 }
2421
2422                 xfs_btree_shift_keys(cur,
2423                                 xfs_btree_key_addr(cur, 2, right),
2424                                 -1, rrecs);
2425                 xfs_btree_shift_ptrs(cur,
2426                                 xfs_btree_ptr_addr(cur, 2, right),
2427                                 -1, rrecs);
2428
2429                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2430                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2431         } else {
2432                 /* It's a leaf. operate on records */
2433                 xfs_btree_shift_recs(cur,
2434                         xfs_btree_rec_addr(cur, 2, right),
2435                         -1, rrecs);
2436                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2437         }
2438
2439         /*
2440          * Using a temporary cursor, update the parent key values of the
2441          * block on the left.
2442          */
2443         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2444                 error = xfs_btree_dup_cursor(cur, &tcur);
2445                 if (error)
2446                         goto error0;
2447                 i = xfs_btree_firstrec(tcur, level);
2448                 XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2449
2450                 error = xfs_btree_decrement(tcur, level, &i);
2451                 if (error)
2452                         goto error1;
2453
2454                 /* Update the parent high keys of the left block, if needed. */
2455                 error = xfs_btree_update_keys(tcur, level);
2456                 if (error)
2457                         goto error1;
2458
2459                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2460         }
2461
2462         /* Update the parent keys of the right block. */
2463         error = xfs_btree_update_keys(cur, level);
2464         if (error)
2465                 goto error0;
2466
2467         /* Slide the cursor value left one. */
2468         cur->bc_ptrs[level]--;
2469
2470         *stat = 1;
2471         return 0;
2472
2473 out0:
2474         *stat = 0;
2475         return 0;
2476
2477 error0:
2478         return error;
2479
2480 error1:
2481         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2482         return error;
2483 }
2484
2485 /*
2486  * Move 1 record right from cur/level if possible.
2487  * Update cur to reflect the new path.
2488  */
2489 STATIC int                                      /* error */
2490 xfs_btree_rshift(
2491         struct xfs_btree_cur    *cur,
2492         int                     level,
2493         int                     *stat)          /* success/failure */
2494 {
2495         struct xfs_buf          *lbp;           /* left buffer pointer */
2496         struct xfs_btree_block  *left;          /* left btree block */
2497         struct xfs_buf          *rbp;           /* right buffer pointer */
2498         struct xfs_btree_block  *right;         /* right btree block */
2499         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2500         union xfs_btree_ptr     rptr;           /* right block pointer */
2501         union xfs_btree_key     *rkp;           /* right btree key */
2502         int                     rrecs;          /* right record count */
2503         int                     lrecs;          /* left record count */
2504         int                     error;          /* error return value */
2505         int                     i;              /* loop counter */
2506
2507         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2508             (level == cur->bc_nlevels - 1))
2509                 goto out0;
2510
2511         /* Set up variables for this block as "left". */
2512         left = xfs_btree_get_block(cur, level, &lbp);
2513
2514 #ifdef DEBUG
2515         error = xfs_btree_check_block(cur, left, level, lbp);
2516         if (error)
2517                 goto error0;
2518 #endif
2519
2520         /* If we've got no right sibling then we can't shift an entry right. */
2521         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2522         if (xfs_btree_ptr_is_null(cur, &rptr))
2523                 goto out0;
2524
2525         /*
2526          * If the cursor entry is the one that would be moved, don't
2527          * do it... it's too complicated.
2528          */
2529         lrecs = xfs_btree_get_numrecs(left);
2530         if (cur->bc_ptrs[level] >= lrecs)
2531                 goto out0;
2532
2533         /* Set up the right neighbor as "right". */
2534         error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2535         if (error)
2536                 goto error0;
2537
2538         /* If it's full, it can't take another entry. */
2539         rrecs = xfs_btree_get_numrecs(right);
2540         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2541                 goto out0;
2542
2543         XFS_BTREE_STATS_INC(cur, rshift);
2544         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2545
2546         /*
2547          * Make a hole at the start of the right neighbor block, then
2548          * copy the last left block entry to the hole.
2549          */
2550         if (level > 0) {
2551                 /* It's a nonleaf. make a hole in the keys and ptrs */
2552                 union xfs_btree_key     *lkp;
2553                 union xfs_btree_ptr     *lpp;
2554                 union xfs_btree_ptr     *rpp;
2555
2556                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2557                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2558                 rkp = xfs_btree_key_addr(cur, 1, right);
2559                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2560
2561                 for (i = rrecs - 1; i >= 0; i--) {
2562                         error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2563                         if (error)
2564                                 goto error0;
2565                 }
2566
2567                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2568                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2569
2570                 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2571                 if (error)
2572                         goto error0;
2573
2574                 /* Now put the new data in, and log it. */
2575                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2576                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2577
2578                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2579                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2580
2581                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2582                         xfs_btree_key_addr(cur, 2, right)));
2583         } else {
2584                 /* It's a leaf. make a hole in the records */
2585                 union xfs_btree_rec     *lrp;
2586                 union xfs_btree_rec     *rrp;
2587
2588                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2589                 rrp = xfs_btree_rec_addr(cur, 1, right);
2590
2591                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2592
2593                 /* Now put the new data in, and log it. */
2594                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2595                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2596         }
2597
2598         /*
2599          * Decrement and log left's numrecs, bump and log right's numrecs.
2600          */
2601         xfs_btree_set_numrecs(left, --lrecs);
2602         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2603
2604         xfs_btree_set_numrecs(right, ++rrecs);
2605         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2606
2607         /*
2608          * Using a temporary cursor, update the parent key values of the
2609          * block on the right.
2610          */
2611         error = xfs_btree_dup_cursor(cur, &tcur);
2612         if (error)
2613                 goto error0;
2614         i = xfs_btree_lastrec(tcur, level);
2615         XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2616
2617         error = xfs_btree_increment(tcur, level, &i);
2618         if (error)
2619                 goto error1;
2620
2621         /* Update the parent high keys of the left block, if needed. */
2622         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2623                 error = xfs_btree_update_keys(cur, level);
2624                 if (error)
2625                         goto error1;
2626         }
2627
2628         /* Update the parent keys of the right block. */
2629         error = xfs_btree_update_keys(tcur, level);
2630         if (error)
2631                 goto error1;
2632
2633         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2634
2635         *stat = 1;
2636         return 0;
2637
2638 out0:
2639         *stat = 0;
2640         return 0;
2641
2642 error0:
2643         return error;
2644
2645 error1:
2646         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2647         return error;
2648 }
2649
2650 /*
2651  * Split cur/level block in half.
2652  * Return new block number and the key to its first
2653  * record (to be inserted into parent).
2654  */
2655 STATIC int                                      /* error */
2656 __xfs_btree_split(
2657         struct xfs_btree_cur    *cur,
2658         int                     level,
2659         union xfs_btree_ptr     *ptrp,
2660         union xfs_btree_key     *key,
2661         struct xfs_btree_cur    **curp,
2662         int                     *stat)          /* success/failure */
2663 {
2664         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2665         struct xfs_buf          *lbp;           /* left buffer pointer */
2666         struct xfs_btree_block  *left;          /* left btree block */
2667         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2668         struct xfs_buf          *rbp;           /* right buffer pointer */
2669         struct xfs_btree_block  *right;         /* right btree block */
2670         union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2671         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2672         struct xfs_btree_block  *rrblock;       /* right-right btree block */
2673         int                     lrecs;
2674         int                     rrecs;
2675         int                     src_index;
2676         int                     error;          /* error return value */
2677         int                     i;
2678
2679         XFS_BTREE_STATS_INC(cur, split);
2680
2681         /* Set up left block (current one). */
2682         left = xfs_btree_get_block(cur, level, &lbp);
2683
2684 #ifdef DEBUG
2685         error = xfs_btree_check_block(cur, left, level, lbp);
2686         if (error)
2687                 goto error0;
2688 #endif
2689
2690         xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2691
2692         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2693         error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2694         if (error)
2695                 goto error0;
2696         if (*stat == 0)
2697                 goto out0;
2698         XFS_BTREE_STATS_INC(cur, alloc);
2699
2700         /* Set up the new block as "right". */
2701         error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2702         if (error)
2703                 goto error0;
2704
2705         /* Fill in the btree header for the new right block. */
2706         xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2707
2708         /*
2709          * Split the entries between the old and the new block evenly.
2710          * Make sure that if there's an odd number of entries now, that
2711          * each new block will have the same number of entries.
2712          */
2713         lrecs = xfs_btree_get_numrecs(left);
2714         rrecs = lrecs / 2;
2715         if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2716                 rrecs++;
2717         src_index = (lrecs - rrecs + 1);
2718
2719         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2720
2721         /* Adjust numrecs for the later get_*_keys() calls. */
2722         lrecs -= rrecs;
2723         xfs_btree_set_numrecs(left, lrecs);
2724         xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2725
2726         /*
2727          * Copy btree block entries from the left block over to the
2728          * new block, the right. Update the right block and log the
2729          * changes.
2730          */
2731         if (level > 0) {
2732                 /* It's a non-leaf.  Move keys and pointers. */
2733                 union xfs_btree_key     *lkp;   /* left btree key */
2734                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2735                 union xfs_btree_key     *rkp;   /* right btree key */
2736                 union xfs_btree_ptr     *rpp;   /* right address pointer */
2737
2738                 lkp = xfs_btree_key_addr(cur, src_index, left);
2739                 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2740                 rkp = xfs_btree_key_addr(cur, 1, right);
2741                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2742
2743                 for (i = src_index; i < rrecs; i++) {
2744                         error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2745                         if (error)
2746                                 goto error0;
2747                 }
2748
2749                 /* Copy the keys & pointers to the new block. */
2750                 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2751                 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2752
2753                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2754                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2755
2756                 /* Stash the keys of the new block for later insertion. */
2757                 xfs_btree_get_node_keys(cur, right, key);
2758         } else {
2759                 /* It's a leaf.  Move records.  */
2760                 union xfs_btree_rec     *lrp;   /* left record pointer */
2761                 union xfs_btree_rec     *rrp;   /* right record pointer */
2762
2763                 lrp = xfs_btree_rec_addr(cur, src_index, left);
2764                 rrp = xfs_btree_rec_addr(cur, 1, right);
2765
2766                 /* Copy records to the new block. */
2767                 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2768                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2769
2770                 /* Stash the keys of the new block for later insertion. */
2771                 xfs_btree_get_leaf_keys(cur, right, key);
2772         }
2773
2774         /*
2775          * Find the left block number by looking in the buffer.
2776          * Adjust sibling pointers.
2777          */
2778         xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2779         xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2780         xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2781         xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2782
2783         xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2784         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2785
2786         /*
2787          * If there's a block to the new block's right, make that block
2788          * point back to right instead of to left.
2789          */
2790         if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2791                 error = xfs_btree_read_buf_block(cur, &rrptr,
2792                                                         0, &rrblock, &rrbp);
2793                 if (error)
2794                         goto error0;
2795                 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2796                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2797         }
2798
2799         /* Update the parent high keys of the left block, if needed. */
2800         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2801                 error = xfs_btree_update_keys(cur, level);
2802                 if (error)
2803                         goto error0;
2804         }
2805
2806         /*
2807          * If the cursor is really in the right block, move it there.
2808          * If it's just pointing past the last entry in left, then we'll
2809          * insert there, so don't change anything in that case.
2810          */
2811         if (cur->bc_ptrs[level] > lrecs + 1) {
2812                 xfs_btree_setbuf(cur, level, rbp);
2813                 cur->bc_ptrs[level] -= lrecs;
2814         }
2815         /*
2816          * If there are more levels, we'll need another cursor which refers
2817          * the right block, no matter where this cursor was.
2818          */
2819         if (level + 1 < cur->bc_nlevels) {
2820                 error = xfs_btree_dup_cursor(cur, curp);
2821                 if (error)
2822                         goto error0;
2823                 (*curp)->bc_ptrs[level + 1]++;
2824         }
2825         *ptrp = rptr;
2826         *stat = 1;
2827         return 0;
2828 out0:
2829         *stat = 0;
2830         return 0;
2831
2832 error0:
2833         return error;
2834 }
2835
2836 struct xfs_btree_split_args {
2837         struct xfs_btree_cur    *cur;
2838         int                     level;
2839         union xfs_btree_ptr     *ptrp;
2840         union xfs_btree_key     *key;
2841         struct xfs_btree_cur    **curp;
2842         int                     *stat;          /* success/failure */
2843         int                     result;
2844         bool                    kswapd; /* allocation in kswapd context */
2845         struct completion       *done;
2846         struct work_struct      work;
2847 };
2848
2849 /*
2850  * Stack switching interfaces for allocation
2851  */
2852 static void
2853 xfs_btree_split_worker(
2854         struct work_struct      *work)
2855 {
2856         struct xfs_btree_split_args     *args = container_of(work,
2857                                                 struct xfs_btree_split_args, work);
2858         unsigned long           pflags;
2859         unsigned long           new_pflags = PF_MEMALLOC_NOFS;
2860
2861         /*
2862          * we are in a transaction context here, but may also be doing work
2863          * in kswapd context, and hence we may need to inherit that state
2864          * temporarily to ensure that we don't block waiting for memory reclaim
2865          * in any way.
2866          */
2867         if (args->kswapd)
2868                 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2869
2870         current_set_flags_nested(&pflags, new_pflags);
2871
2872         args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2873                                          args->key, args->curp, args->stat);
2874         complete(args->done);
2875
2876         current_restore_flags_nested(&pflags, new_pflags);
2877 }
2878
2879 /*
2880  * BMBT split requests often come in with little stack to work on. Push
2881  * them off to a worker thread so there is lots of stack to use. For the other
2882  * btree types, just call directly to avoid the context switch overhead here.
2883  */
2884 STATIC int                                      /* error */
2885 xfs_btree_split(
2886         struct xfs_btree_cur    *cur,
2887         int                     level,
2888         union xfs_btree_ptr     *ptrp,
2889         union xfs_btree_key     *key,
2890         struct xfs_btree_cur    **curp,
2891         int                     *stat)          /* success/failure */
2892 {
2893         struct xfs_btree_split_args     args;
2894         DECLARE_COMPLETION_ONSTACK(done);
2895
2896         if (cur->bc_btnum != XFS_BTNUM_BMAP)
2897                 return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2898
2899         args.cur = cur;
2900         args.level = level;
2901         args.ptrp = ptrp;
2902         args.key = key;
2903         args.curp = curp;
2904         args.stat = stat;
2905         args.done = &done;
2906         args.kswapd = current_is_kswapd();
2907         INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2908         queue_work(xfs_alloc_wq, &args.work);
2909         wait_for_completion(&done);
2910         destroy_work_on_stack(&args.work);
2911         return args.result;
2912 }
2913
2914
2915 /*
2916  * Copy the old inode root contents into a real block and make the
2917  * broot point to it.
2918  */
2919 int                                             /* error */
2920 xfs_btree_new_iroot(
2921         struct xfs_btree_cur    *cur,           /* btree cursor */
2922         int                     *logflags,      /* logging flags for inode */
2923         int                     *stat)          /* return status - 0 fail */
2924 {
2925         struct xfs_buf          *cbp;           /* buffer for cblock */
2926         struct xfs_btree_block  *block;         /* btree block */
2927         struct xfs_btree_block  *cblock;        /* child btree block */
2928         union xfs_btree_key     *ckp;           /* child key pointer */
2929         union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2930         union xfs_btree_key     *kp;            /* pointer to btree key */
2931         union xfs_btree_ptr     *pp;            /* pointer to block addr */
2932         union xfs_btree_ptr     nptr;           /* new block addr */
2933         int                     level;          /* btree level */
2934         int                     error;          /* error return code */
2935         int                     i;              /* loop counter */
2936
2937         XFS_BTREE_STATS_INC(cur, newroot);
2938
2939         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2940
2941         level = cur->bc_nlevels - 1;
2942
2943         block = xfs_btree_get_iroot(cur);
2944         pp = xfs_btree_ptr_addr(cur, 1, block);
2945
2946         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2947         error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2948         if (error)
2949                 goto error0;
2950         if (*stat == 0)
2951                 return 0;
2952
2953         XFS_BTREE_STATS_INC(cur, alloc);
2954
2955         /* Copy the root into a real block. */
2956         error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
2957         if (error)
2958                 goto error0;
2959
2960         /*
2961          * we can't just memcpy() the root in for CRC enabled btree blocks.
2962          * In that case have to also ensure the blkno remains correct
2963          */
2964         memcpy(cblock, block, xfs_btree_block_len(cur));
2965         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2966                 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2967                         cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2968                 else
2969                         cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2970         }
2971
2972         be16_add_cpu(&block->bb_level, 1);
2973         xfs_btree_set_numrecs(block, 1);
2974         cur->bc_nlevels++;
2975         cur->bc_ptrs[level + 1] = 1;
2976
2977         kp = xfs_btree_key_addr(cur, 1, block);
2978         ckp = xfs_btree_key_addr(cur, 1, cblock);
2979         xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2980
2981         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2982         for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2983                 error = xfs_btree_debug_check_ptr(cur, pp, i, level);
2984                 if (error)
2985                         goto error0;
2986         }
2987
2988         xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2989
2990         error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
2991         if (error)
2992                 goto error0;
2993
2994         xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2995
2996         xfs_iroot_realloc(cur->bc_private.b.ip,
2997                           1 - xfs_btree_get_numrecs(cblock),
2998                           cur->bc_private.b.whichfork);
2999
3000         xfs_btree_setbuf(cur, level, cbp);
3001
3002         /*
3003          * Do all this logging at the end so that
3004          * the root is at the right level.
3005          */
3006         xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3007         xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3008         xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3009
3010         *logflags |=
3011                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
3012         *stat = 1;
3013         return 0;
3014 error0:
3015         return error;
3016 }
3017
3018 /*
3019  * Allocate a new root block, fill it in.
3020  */
3021 STATIC int                              /* error */
3022 xfs_btree_new_root(
3023         struct xfs_btree_cur    *cur,   /* btree cursor */
3024         int                     *stat)  /* success/failure */
3025 {
3026         struct xfs_btree_block  *block; /* one half of the old root block */
3027         struct xfs_buf          *bp;    /* buffer containing block */
3028         int                     error;  /* error return value */
3029         struct xfs_buf          *lbp;   /* left buffer pointer */
3030         struct xfs_btree_block  *left;  /* left btree block */
3031         struct xfs_buf          *nbp;   /* new (root) buffer */
3032         struct xfs_btree_block  *new;   /* new (root) btree block */
3033         int                     nptr;   /* new value for key index, 1 or 2 */
3034         struct xfs_buf          *rbp;   /* right buffer pointer */
3035         struct xfs_btree_block  *right; /* right btree block */
3036         union xfs_btree_ptr     rptr;
3037         union xfs_btree_ptr     lptr;
3038
3039         XFS_BTREE_STATS_INC(cur, newroot);
3040
3041         /* initialise our start point from the cursor */
3042         cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3043
3044         /* Allocate the new block. If we can't do it, we're toast. Give up. */
3045         error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3046         if (error)
3047                 goto error0;
3048         if (*stat == 0)
3049                 goto out0;
3050         XFS_BTREE_STATS_INC(cur, alloc);
3051
3052         /* Set up the new block. */
3053         error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3054         if (error)
3055                 goto error0;
3056
3057         /* Set the root in the holding structure  increasing the level by 1. */
3058         cur->bc_ops->set_root(cur, &lptr, 1);
3059
3060         /*
3061          * At the previous root level there are now two blocks: the old root,
3062          * and the new block generated when it was split.  We don't know which
3063          * one the cursor is pointing at, so we set up variables "left" and
3064          * "right" for each case.
3065          */
3066         block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3067
3068 #ifdef DEBUG
3069         error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3070         if (error)
3071                 goto error0;
3072 #endif
3073
3074         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3075         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3076                 /* Our block is left, pick up the right block. */
3077                 lbp = bp;
3078                 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3079                 left = block;
3080                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3081                 if (error)
3082                         goto error0;
3083                 bp = rbp;
3084                 nptr = 1;
3085         } else {
3086                 /* Our block is right, pick up the left block. */
3087                 rbp = bp;
3088                 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3089                 right = block;
3090                 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3091                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3092                 if (error)
3093                         goto error0;
3094                 bp = lbp;
3095                 nptr = 2;
3096         }
3097
3098         /* Fill in the new block's btree header and log it. */
3099         xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3100         xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3101         ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3102                         !xfs_btree_ptr_is_null(cur, &rptr));
3103
3104         /* Fill in the key data in the new root. */
3105         if (xfs_btree_get_level(left) > 0) {
3106                 /*
3107                  * Get the keys for the left block's keys and put them directly
3108                  * in the parent block.  Do the same for the right block.
3109                  */
3110                 xfs_btree_get_node_keys(cur, left,
3111                                 xfs_btree_key_addr(cur, 1, new));
3112                 xfs_btree_get_node_keys(cur, right,
3113                                 xfs_btree_key_addr(cur, 2, new));
3114         } else {
3115                 /*
3116                  * Get the keys for the left block's records and put them
3117                  * directly in the parent block.  Do the same for the right
3118                  * block.
3119                  */
3120                 xfs_btree_get_leaf_keys(cur, left,
3121                         xfs_btree_key_addr(cur, 1, new));
3122                 xfs_btree_get_leaf_keys(cur, right,
3123                         xfs_btree_key_addr(cur, 2, new));
3124         }
3125         xfs_btree_log_keys(cur, nbp, 1, 2);
3126
3127         /* Fill in the pointer data in the new root. */
3128         xfs_btree_copy_ptrs(cur,
3129                 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3130         xfs_btree_copy_ptrs(cur,
3131                 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3132         xfs_btree_log_ptrs(cur, nbp, 1, 2);
3133
3134         /* Fix up the cursor. */
3135         xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3136         cur->bc_ptrs[cur->bc_nlevels] = nptr;
3137         cur->bc_nlevels++;
3138         *stat = 1;
3139         return 0;
3140 error0:
3141         return error;
3142 out0:
3143         *stat = 0;
3144         return 0;
3145 }
3146
3147 STATIC int
3148 xfs_btree_make_block_unfull(
3149         struct xfs_btree_cur    *cur,   /* btree cursor */
3150         int                     level,  /* btree level */
3151         int                     numrecs,/* # of recs in block */
3152         int                     *oindex,/* old tree index */
3153         int                     *index, /* new tree index */
3154         union xfs_btree_ptr     *nptr,  /* new btree ptr */
3155         struct xfs_btree_cur    **ncur, /* new btree cursor */
3156         union xfs_btree_key     *key,   /* key of new block */
3157         int                     *stat)
3158 {
3159         int                     error = 0;
3160
3161         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3162             level == cur->bc_nlevels - 1) {
3163                 struct xfs_inode *ip = cur->bc_private.b.ip;
3164
3165                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3166                         /* A root block that can be made bigger. */
3167                         xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
3168                         *stat = 1;
3169                 } else {
3170                         /* A root block that needs replacing */
3171                         int     logflags = 0;
3172
3173                         error = xfs_btree_new_iroot(cur, &logflags, stat);
3174                         if (error || *stat == 0)
3175                                 return error;
3176
3177                         xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3178                 }
3179
3180                 return 0;
3181         }
3182
3183         /* First, try shifting an entry to the right neighbor. */
3184         error = xfs_btree_rshift(cur, level, stat);
3185         if (error || *stat)
3186                 return error;
3187
3188         /* Next, try shifting an entry to the left neighbor. */
3189         error = xfs_btree_lshift(cur, level, stat);
3190         if (error)
3191                 return error;
3192
3193         if (*stat) {
3194                 *oindex = *index = cur->bc_ptrs[level];
3195                 return 0;
3196         }
3197
3198         /*
3199          * Next, try splitting the current block in half.
3200          *
3201          * If this works we have to re-set our variables because we
3202          * could be in a different block now.
3203          */
3204         error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3205         if (error || *stat == 0)
3206                 return error;
3207
3208
3209         *index = cur->bc_ptrs[level];
3210         return 0;
3211 }
3212
3213 /*
3214  * Insert one record/level.  Return information to the caller
3215  * allowing the next level up to proceed if necessary.
3216  */
3217 STATIC int
3218 xfs_btree_insrec(
3219         struct xfs_btree_cur    *cur,   /* btree cursor */
3220         int                     level,  /* level to insert record at */
3221         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3222         union xfs_btree_rec     *rec,   /* record to insert */
3223         union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3224         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3225         int                     *stat)  /* success/failure */
3226 {
3227         struct xfs_btree_block  *block; /* btree block */
3228         struct xfs_buf          *bp;    /* buffer for block */
3229         union xfs_btree_ptr     nptr;   /* new block ptr */
3230         struct xfs_btree_cur    *ncur;  /* new btree cursor */
3231         union xfs_btree_key     nkey;   /* new block key */
3232         union xfs_btree_key     *lkey;
3233         int                     optr;   /* old key/record index */
3234         int                     ptr;    /* key/record index */
3235         int                     numrecs;/* number of records */
3236         int                     error;  /* error return value */
3237         int                     i;
3238         xfs_daddr_t             old_bn;
3239
3240         ncur = NULL;
3241         lkey = &nkey;
3242
3243         /*
3244          * If we have an external root pointer, and we've made it to the
3245          * root level, allocate a new root block and we're done.
3246          */
3247         if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3248             (level >= cur->bc_nlevels)) {
3249                 error = xfs_btree_new_root(cur, stat);
3250                 xfs_btree_set_ptr_null(cur, ptrp);
3251
3252                 return error;
3253         }
3254
3255         /* If we're off the left edge, return failure. */
3256         ptr = cur->bc_ptrs[level];
3257         if (ptr == 0) {
3258                 *stat = 0;
3259                 return 0;
3260         }
3261
3262         optr = ptr;
3263
3264         XFS_BTREE_STATS_INC(cur, insrec);
3265
3266         /* Get pointers to the btree buffer and block. */
3267         block = xfs_btree_get_block(cur, level, &bp);
3268         old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3269         numrecs = xfs_btree_get_numrecs(block);
3270
3271 #ifdef DEBUG
3272         error = xfs_btree_check_block(cur, block, level, bp);
3273         if (error)
3274                 goto error0;
3275
3276         /* Check that the new entry is being inserted in the right place. */
3277         if (ptr <= numrecs) {
3278                 if (level == 0) {
3279                         ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3280                                 xfs_btree_rec_addr(cur, ptr, block)));
3281                 } else {
3282                         ASSERT(cur->bc_ops->keys_inorder(cur, key,
3283                                 xfs_btree_key_addr(cur, ptr, block)));
3284                 }
3285         }
3286 #endif
3287
3288         /*
3289          * If the block is full, we can't insert the new entry until we
3290          * make the block un-full.
3291          */
3292         xfs_btree_set_ptr_null(cur, &nptr);
3293         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3294                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
3295                                         &optr, &ptr, &nptr, &ncur, lkey, stat);
3296                 if (error || *stat == 0)
3297                         goto error0;
3298         }
3299
3300         /*
3301          * The current block may have changed if the block was
3302          * previously full and we have just made space in it.
3303          */
3304         block = xfs_btree_get_block(cur, level, &bp);
3305         numrecs = xfs_btree_get_numrecs(block);
3306
3307 #ifdef DEBUG
3308         error = xfs_btree_check_block(cur, block, level, bp);
3309         if (error)
3310                 return error;
3311 #endif
3312
3313         /*
3314          * At this point we know there's room for our new entry in the block
3315          * we're pointing at.
3316          */
3317         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3318
3319         if (level > 0) {
3320                 /* It's a nonleaf. make a hole in the keys and ptrs */
3321                 union xfs_btree_key     *kp;
3322                 union xfs_btree_ptr     *pp;
3323
3324                 kp = xfs_btree_key_addr(cur, ptr, block);
3325                 pp = xfs_btree_ptr_addr(cur, ptr, block);
3326
3327                 for (i = numrecs - ptr; i >= 0; i--) {
3328                         error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3329                         if (error)
3330                                 return error;
3331                 }
3332
3333                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3334                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3335
3336                 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3337                 if (error)
3338                         goto error0;
3339
3340                 /* Now put the new data in, bump numrecs and log it. */
3341                 xfs_btree_copy_keys(cur, kp, key, 1);
3342                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3343                 numrecs++;
3344                 xfs_btree_set_numrecs(block, numrecs);
3345                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3346                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
3347 #ifdef DEBUG
3348                 if (ptr < numrecs) {
3349                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3350                                 xfs_btree_key_addr(cur, ptr + 1, block)));
3351                 }
3352 #endif
3353         } else {
3354                 /* It's a leaf. make a hole in the records */
3355                 union xfs_btree_rec             *rp;
3356
3357                 rp = xfs_btree_rec_addr(cur, ptr, block);
3358
3359                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3360
3361                 /* Now put the new data in, bump numrecs and log it. */
3362                 xfs_btree_copy_recs(cur, rp, rec, 1);
3363                 xfs_btree_set_numrecs(block, ++numrecs);
3364                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
3365 #ifdef DEBUG
3366                 if (ptr < numrecs) {
3367                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3368                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
3369                 }
3370 #endif
3371         }
3372
3373         /* Log the new number of records in the btree header. */
3374         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3375
3376         /*
3377          * If we just inserted into a new tree block, we have to
3378          * recalculate nkey here because nkey is out of date.
3379          *
3380          * Otherwise we're just updating an existing block (having shoved
3381          * some records into the new tree block), so use the regular key
3382          * update mechanism.
3383          */
3384         if (bp && bp->b_bn != old_bn) {
3385                 xfs_btree_get_keys(cur, block, lkey);
3386         } else if (xfs_btree_needs_key_update(cur, optr)) {
3387                 error = xfs_btree_update_keys(cur, level);
3388                 if (error)
3389                         goto error0;
3390         }
3391
3392         /*
3393          * If we are tracking the last record in the tree and
3394          * we are at the far right edge of the tree, update it.
3395          */
3396         if (xfs_btree_is_lastrec(cur, block, level)) {
3397                 cur->bc_ops->update_lastrec(cur, block, rec,
3398                                             ptr, LASTREC_INSREC);
3399         }
3400
3401         /*
3402          * Return the new block number, if any.
3403          * If there is one, give back a record value and a cursor too.
3404          */
3405         *ptrp = nptr;
3406         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3407                 xfs_btree_copy_keys(cur, key, lkey, 1);
3408                 *curp = ncur;
3409         }
3410
3411         *stat = 1;
3412         return 0;
3413
3414 error0:
3415         return error;
3416 }
3417
3418 /*
3419  * Insert the record at the point referenced by cur.
3420  *
3421  * A multi-level split of the tree on insert will invalidate the original
3422  * cursor.  All callers of this function should assume that the cursor is
3423  * no longer valid and revalidate it.
3424  */
3425 int
3426 xfs_btree_insert(
3427         struct xfs_btree_cur    *cur,
3428         int                     *stat)
3429 {
3430         int                     error;  /* error return value */
3431         int                     i;      /* result value, 0 for failure */
3432         int                     level;  /* current level number in btree */
3433         union xfs_btree_ptr     nptr;   /* new block number (split result) */
3434         struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3435         struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3436         union xfs_btree_key     bkey;   /* key of block to insert */
3437         union xfs_btree_key     *key;
3438         union xfs_btree_rec     rec;    /* record to insert */
3439
3440         level = 0;
3441         ncur = NULL;
3442         pcur = cur;
3443         key = &bkey;
3444
3445         xfs_btree_set_ptr_null(cur, &nptr);
3446
3447         /* Make a key out of the record data to be inserted, and save it. */
3448         cur->bc_ops->init_rec_from_cur(cur, &rec);
3449         cur->bc_ops->init_key_from_rec(key, &rec);
3450
3451         /*
3452          * Loop going up the tree, starting at the leaf level.
3453          * Stop when we don't get a split block, that must mean that
3454          * the insert is finished with this level.
3455          */
3456         do {
3457                 /*
3458                  * Insert nrec/nptr into this level of the tree.
3459                  * Note if we fail, nptr will be null.
3460                  */
3461                 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3462                                 &ncur, &i);
3463                 if (error) {
3464                         if (pcur != cur)
3465                                 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3466                         goto error0;
3467                 }
3468
3469                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3470                 level++;
3471
3472                 /*
3473                  * See if the cursor we just used is trash.
3474                  * Can't trash the caller's cursor, but otherwise we should
3475                  * if ncur is a new cursor or we're about to be done.
3476                  */
3477                 if (pcur != cur &&
3478                     (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3479                         /* Save the state from the cursor before we trash it */
3480                         if (cur->bc_ops->update_cursor)
3481                                 cur->bc_ops->update_cursor(pcur, cur);
3482                         cur->bc_nlevels = pcur->bc_nlevels;
3483                         xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3484                 }
3485                 /* If we got a new cursor, switch to it. */
3486                 if (ncur) {
3487                         pcur = ncur;
3488                         ncur = NULL;
3489                 }
3490         } while (!xfs_btree_ptr_is_null(cur, &nptr));
3491
3492         *stat = i;
3493         return 0;
3494 error0:
3495         return error;
3496 }
3497
3498 /*
3499  * Try to merge a non-leaf block back into the inode root.
3500  *
3501  * Note: the killroot names comes from the fact that we're effectively
3502  * killing the old root block.  But because we can't just delete the
3503  * inode we have to copy the single block it was pointing to into the
3504  * inode.
3505  */
3506 STATIC int
3507 xfs_btree_kill_iroot(
3508         struct xfs_btree_cur    *cur)
3509 {
3510         int                     whichfork = cur->bc_private.b.whichfork;
3511         struct xfs_inode        *ip = cur->bc_private.b.ip;
3512         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3513         struct xfs_btree_block  *block;
3514         struct xfs_btree_block  *cblock;
3515         union xfs_btree_key     *kp;
3516         union xfs_btree_key     *ckp;
3517         union xfs_btree_ptr     *pp;
3518         union xfs_btree_ptr     *cpp;
3519         struct xfs_buf          *cbp;
3520         int                     level;
3521         int                     index;
3522         int                     numrecs;
3523         int                     error;
3524 #ifdef DEBUG
3525         union xfs_btree_ptr     ptr;
3526 #endif
3527         int                     i;
3528
3529         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3530         ASSERT(cur->bc_nlevels > 1);
3531
3532         /*
3533          * Don't deal with the root block needs to be a leaf case.
3534          * We're just going to turn the thing back into extents anyway.
3535          */
3536         level = cur->bc_nlevels - 1;
3537         if (level == 1)
3538                 goto out0;
3539
3540         /*
3541          * Give up if the root has multiple children.
3542          */
3543         block = xfs_btree_get_iroot(cur);
3544         if (xfs_btree_get_numrecs(block) != 1)
3545                 goto out0;
3546
3547         cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3548         numrecs = xfs_btree_get_numrecs(cblock);
3549
3550         /*
3551          * Only do this if the next level will fit.
3552          * Then the data must be copied up to the inode,
3553          * instead of freeing the root you free the next level.
3554          */
3555         if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3556                 goto out0;
3557
3558         XFS_BTREE_STATS_INC(cur, killroot);
3559
3560 #ifdef DEBUG
3561         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3562         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3563         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3564         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3565 #endif
3566
3567         index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3568         if (index) {
3569                 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3570                                   cur->bc_private.b.whichfork);
3571                 block = ifp->if_broot;
3572         }
3573
3574         be16_add_cpu(&block->bb_numrecs, index);
3575         ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3576
3577         kp = xfs_btree_key_addr(cur, 1, block);
3578         ckp = xfs_btree_key_addr(cur, 1, cblock);
3579         xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3580
3581         pp = xfs_btree_ptr_addr(cur, 1, block);
3582         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3583
3584         for (i = 0; i < numrecs; i++) {
3585                 error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3586                 if (error)
3587                         return error;
3588         }
3589
3590         xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3591
3592         error = xfs_btree_free_block(cur, cbp);
3593         if (error)
3594                 return error;
3595
3596         cur->bc_bufs[level - 1] = NULL;
3597         be16_add_cpu(&block->bb_level, -1);
3598         xfs_trans_log_inode(cur->bc_tp, ip,
3599                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3600         cur->bc_nlevels--;
3601 out0:
3602         return 0;
3603 }
3604
3605 /*
3606  * Kill the current root node, and replace it with it's only child node.
3607  */
3608 STATIC int
3609 xfs_btree_kill_root(
3610         struct xfs_btree_cur    *cur,
3611         struct xfs_buf          *bp,
3612         int                     level,
3613         union xfs_btree_ptr     *newroot)
3614 {
3615         int                     error;
3616
3617         XFS_BTREE_STATS_INC(cur, killroot);
3618
3619         /*
3620          * Update the root pointer, decreasing the level by 1 and then
3621          * free the old root.
3622          */
3623         cur->bc_ops->set_root(cur, newroot, -1);
3624
3625         error = xfs_btree_free_block(cur, bp);
3626         if (error)
3627                 return error;
3628
3629         cur->bc_bufs[level] = NULL;
3630         cur->bc_ra[level] = 0;
3631         cur->bc_nlevels--;
3632
3633         return 0;
3634 }
3635
3636 STATIC int
3637 xfs_btree_dec_cursor(
3638         struct xfs_btree_cur    *cur,
3639         int                     level,
3640         int                     *stat)
3641 {
3642         int                     error;
3643         int                     i;
3644
3645         if (level > 0) {
3646                 error = xfs_btree_decrement(cur, level, &i);
3647                 if (error)
3648                         return error;
3649         }
3650
3651         *stat = 1;
3652         return 0;
3653 }
3654
3655 /*
3656  * Single level of the btree record deletion routine.
3657  * Delete record pointed to by cur/level.
3658  * Remove the record from its block then rebalance the tree.
3659  * Return 0 for error, 1 for done, 2 to go on to the next level.
3660  */
3661 STATIC int                                      /* error */
3662 xfs_btree_delrec(
3663         struct xfs_btree_cur    *cur,           /* btree cursor */
3664         int                     level,          /* level removing record from */
3665         int                     *stat)          /* fail/done/go-on */
3666 {
3667         struct xfs_btree_block  *block;         /* btree block */
3668         union xfs_btree_ptr     cptr;           /* current block ptr */
3669         struct xfs_buf          *bp;            /* buffer for block */
3670         int                     error;          /* error return value */
3671         int                     i;              /* loop counter */
3672         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3673         struct xfs_buf          *lbp;           /* left buffer pointer */
3674         struct xfs_btree_block  *left;          /* left btree block */
3675         int                     lrecs = 0;      /* left record count */
3676         int                     ptr;            /* key/record index */
3677         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3678         struct xfs_buf          *rbp;           /* right buffer pointer */
3679         struct xfs_btree_block  *right;         /* right btree block */
3680         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3681         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3682         int                     rrecs = 0;      /* right record count */
3683         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3684         int                     numrecs;        /* temporary numrec count */
3685
3686         tcur = NULL;
3687
3688         /* Get the index of the entry being deleted, check for nothing there. */
3689         ptr = cur->bc_ptrs[level];
3690         if (ptr == 0) {
3691                 *stat = 0;
3692                 return 0;
3693         }
3694
3695         /* Get the buffer & block containing the record or key/ptr. */
3696         block = xfs_btree_get_block(cur, level, &bp);
3697         numrecs = xfs_btree_get_numrecs(block);
3698
3699 #ifdef DEBUG
3700         error = xfs_btree_check_block(cur, block, level, bp);
3701         if (error)
3702                 goto error0;
3703 #endif
3704
3705         /* Fail if we're off the end of the block. */
3706         if (ptr > numrecs) {
3707                 *stat = 0;
3708                 return 0;
3709         }
3710
3711         XFS_BTREE_STATS_INC(cur, delrec);
3712         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3713
3714         /* Excise the entries being deleted. */
3715         if (level > 0) {
3716                 /* It's a nonleaf. operate on keys and ptrs */
3717                 union xfs_btree_key     *lkp;
3718                 union xfs_btree_ptr     *lpp;
3719
3720                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3721                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3722
3723                 for (i = 0; i < numrecs - ptr; i++) {
3724                         error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3725                         if (error)
3726                                 goto error0;
3727                 }
3728
3729                 if (ptr < numrecs) {
3730                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3731                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3732                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3733                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3734                 }
3735         } else {
3736                 /* It's a leaf. operate on records */
3737                 if (ptr < numrecs) {
3738                         xfs_btree_shift_recs(cur,
3739                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3740                                 -1, numrecs - ptr);
3741                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3742                 }
3743         }
3744
3745         /*
3746          * Decrement and log the number of entries in the block.
3747          */
3748         xfs_btree_set_numrecs(block, --numrecs);
3749         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3750
3751         /*
3752          * If we are tracking the last record in the tree and
3753          * we are at the far right edge of the tree, update it.
3754          */
3755         if (xfs_btree_is_lastrec(cur, block, level)) {
3756                 cur->bc_ops->update_lastrec(cur, block, NULL,
3757                                             ptr, LASTREC_DELREC);
3758         }
3759
3760         /*
3761          * We're at the root level.  First, shrink the root block in-memory.
3762          * Try to get rid of the next level down.  If we can't then there's
3763          * nothing left to do.
3764          */
3765         if (level == cur->bc_nlevels - 1) {
3766                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3767                         xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3768                                           cur->bc_private.b.whichfork);
3769
3770                         error = xfs_btree_kill_iroot(cur);
3771                         if (error)
3772                                 goto error0;
3773
3774                         error = xfs_btree_dec_cursor(cur, level, stat);
3775                         if (error)
3776                                 goto error0;
3777                         *stat = 1;
3778                         return 0;
3779                 }
3780
3781                 /*
3782                  * If this is the root level, and there's only one entry left,
3783                  * and it's NOT the leaf level, then we can get rid of this
3784                  * level.
3785                  */
3786                 if (numrecs == 1 && level > 0) {
3787                         union xfs_btree_ptr     *pp;
3788                         /*
3789                          * pp is still set to the first pointer in the block.
3790                          * Make it the new root of the btree.
3791                          */
3792                         pp = xfs_btree_ptr_addr(cur, 1, block);
3793                         error = xfs_btree_kill_root(cur, bp, level, pp);
3794                         if (error)
3795                                 goto error0;
3796                 } else if (level > 0) {
3797                         error = xfs_btree_dec_cursor(cur, level, stat);
3798                         if (error)
3799                                 goto error0;
3800                 }
3801                 *stat = 1;
3802                 return 0;
3803         }
3804
3805         /*
3806          * If we deleted the leftmost entry in the block, update the
3807          * key values above us in the tree.
3808          */
3809         if (xfs_btree_needs_key_update(cur, ptr)) {
3810                 error = xfs_btree_update_keys(cur, level);
3811                 if (error)
3812                         goto error0;
3813         }
3814
3815         /*
3816          * If the number of records remaining in the block is at least
3817          * the minimum, we're done.
3818          */
3819         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3820                 error = xfs_btree_dec_cursor(cur, level, stat);
3821                 if (error)
3822                         goto error0;
3823                 return 0;
3824         }
3825
3826         /*
3827          * Otherwise, we have to move some records around to keep the
3828          * tree balanced.  Look at the left and right sibling blocks to
3829          * see if we can re-balance by moving only one record.
3830          */
3831         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3832         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3833
3834         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3835                 /*
3836                  * One child of root, need to get a chance to copy its contents
3837                  * into the root and delete it. Can't go up to next level,
3838                  * there's nothing to delete there.
3839                  */
3840                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3841                     xfs_btree_ptr_is_null(cur, &lptr) &&
3842                     level == cur->bc_nlevels - 2) {
3843                         error = xfs_btree_kill_iroot(cur);
3844                         if (!error)
3845                                 error = xfs_btree_dec_cursor(cur, level, stat);
3846                         if (error)
3847                                 goto error0;
3848                         return 0;
3849                 }
3850         }
3851
3852         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3853                !xfs_btree_ptr_is_null(cur, &lptr));
3854
3855         /*
3856          * Duplicate the cursor so our btree manipulations here won't
3857          * disrupt the next level up.
3858          */
3859         error = xfs_btree_dup_cursor(cur, &tcur);
3860         if (error)
3861                 goto error0;
3862
3863         /*
3864          * If there's a right sibling, see if it's ok to shift an entry
3865          * out of it.
3866          */
3867         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3868                 /*
3869                  * Move the temp cursor to the last entry in the next block.
3870                  * Actually any entry but the first would suffice.
3871                  */
3872                 i = xfs_btree_lastrec(tcur, level);
3873                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3874
3875                 error = xfs_btree_increment(tcur, level, &i);
3876                 if (error)
3877                         goto error0;
3878                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3879
3880                 i = xfs_btree_lastrec(tcur, level);
3881                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3882
3883                 /* Grab a pointer to the block. */
3884                 right = xfs_btree_get_block(tcur, level, &rbp);
3885 #ifdef DEBUG
3886                 error = xfs_btree_check_block(tcur, right, level, rbp);
3887                 if (error)
3888                         goto error0;
3889 #endif
3890                 /* Grab the current block number, for future use. */
3891                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3892
3893                 /*
3894                  * If right block is full enough so that removing one entry
3895                  * won't make it too empty, and left-shifting an entry out
3896                  * of right to us works, we're done.
3897                  */
3898                 if (xfs_btree_get_numrecs(right) - 1 >=
3899                     cur->bc_ops->get_minrecs(tcur, level)) {
3900                         error = xfs_btree_lshift(tcur, level, &i);
3901                         if (error)
3902                                 goto error0;
3903                         if (i) {
3904                                 ASSERT(xfs_btree_get_numrecs(block) >=
3905                                        cur->bc_ops->get_minrecs(tcur, level));
3906
3907                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3908                                 tcur = NULL;
3909
3910                                 error = xfs_btree_dec_cursor(cur, level, stat);
3911                                 if (error)
3912                                         goto error0;
3913                                 return 0;
3914                         }
3915                 }
3916
3917                 /*
3918                  * Otherwise, grab the number of records in right for
3919                  * future reference, and fix up the temp cursor to point
3920                  * to our block again (last record).
3921                  */
3922                 rrecs = xfs_btree_get_numrecs(right);
3923                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3924                         i = xfs_btree_firstrec(tcur, level);
3925                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3926
3927                         error = xfs_btree_decrement(tcur, level, &i);
3928                         if (error)
3929                                 goto error0;
3930                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3931                 }
3932         }
3933
3934         /*
3935          * If there's a left sibling, see if it's ok to shift an entry
3936          * out of it.
3937          */
3938         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3939                 /*
3940                  * Move the temp cursor to the first entry in the
3941                  * previous block.
3942                  */
3943                 i = xfs_btree_firstrec(tcur, level);
3944                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3945
3946                 error = xfs_btree_decrement(tcur, level, &i);
3947                 if (error)
3948                         goto error0;
3949                 i = xfs_btree_firstrec(tcur, level);
3950                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3951
3952                 /* Grab a pointer to the block. */
3953                 left = xfs_btree_get_block(tcur, level, &lbp);
3954 #ifdef DEBUG
3955                 error = xfs_btree_check_block(cur, left, level, lbp);
3956                 if (error)
3957                         goto error0;
3958 #endif
3959                 /* Grab the current block number, for future use. */
3960                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3961
3962                 /*
3963                  * If left block is full enough so that removing one entry
3964                  * won't make it too empty, and right-shifting an entry out
3965                  * of left to us works, we're done.
3966                  */
3967                 if (xfs_btree_get_numrecs(left) - 1 >=
3968                     cur->bc_ops->get_minrecs(tcur, level)) {
3969                         error = xfs_btree_rshift(tcur, level, &i);
3970                         if (error)
3971                                 goto error0;
3972                         if (i) {
3973                                 ASSERT(xfs_btree_get_numrecs(block) >=
3974                                        cur->bc_ops->get_minrecs(tcur, level));
3975                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3976                                 tcur = NULL;
3977                                 if (level == 0)
3978                                         cur->bc_ptrs[0]++;
3979
3980                                 *stat = 1;
3981                                 return 0;
3982                         }
3983                 }
3984
3985                 /*
3986                  * Otherwise, grab the number of records in right for
3987                  * future reference.
3988                  */
3989                 lrecs = xfs_btree_get_numrecs(left);
3990         }
3991
3992         /* Delete the temp cursor, we're done with it. */
3993         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3994         tcur = NULL;
3995
3996         /* If here, we need to do a join to keep the tree balanced. */
3997         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3998
3999         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
4000             lrecs + xfs_btree_get_numrecs(block) <=
4001                         cur->bc_ops->get_maxrecs(cur, level)) {
4002                 /*
4003                  * Set "right" to be the starting block,
4004                  * "left" to be the left neighbor.
4005                  */
4006                 rptr = cptr;
4007                 right = block;
4008                 rbp = bp;
4009                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4010                 if (error)
4011                         goto error0;
4012
4013         /*
4014          * If that won't work, see if we can join with the right neighbor block.
4015          */
4016         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4017                    rrecs + xfs_btree_get_numrecs(block) <=
4018                         cur->bc_ops->get_maxrecs(cur, level)) {
4019                 /*
4020                  * Set "left" to be the starting block,
4021                  * "right" to be the right neighbor.
4022                  */
4023                 lptr = cptr;
4024                 left = block;
4025                 lbp = bp;
4026                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4027                 if (error)
4028                         goto error0;
4029
4030         /*
4031          * Otherwise, we can't fix the imbalance.
4032          * Just return.  This is probably a logic error, but it's not fatal.
4033          */
4034         } else {
4035                 error = xfs_btree_dec_cursor(cur, level, stat);
4036                 if (error)
4037                         goto error0;
4038                 return 0;
4039         }
4040
4041         rrecs = xfs_btree_get_numrecs(right);
4042         lrecs = xfs_btree_get_numrecs(left);
4043
4044         /*
4045          * We're now going to join "left" and "right" by moving all the stuff
4046          * in "right" to "left" and deleting "right".
4047          */
4048         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4049         if (level > 0) {
4050                 /* It's a non-leaf.  Move keys and pointers. */
4051                 union xfs_btree_key     *lkp;   /* left btree key */
4052                 union xfs_btree_ptr     *lpp;   /* left address pointer */
4053                 union xfs_btree_key     *rkp;   /* right btree key */
4054                 union xfs_btree_ptr     *rpp;   /* right address pointer */
4055
4056                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4057                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4058                 rkp = xfs_btree_key_addr(cur, 1, right);
4059                 rpp = xfs_btree_ptr_addr(cur, 1, right);
4060
4061                 for (i = 1; i < rrecs; i++) {
4062                         error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4063                         if (error)
4064                                 goto error0;
4065                 }
4066
4067                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4068                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4069
4070                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4071                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4072         } else {
4073                 /* It's a leaf.  Move records.  */
4074                 union xfs_btree_rec     *lrp;   /* left record pointer */
4075                 union xfs_btree_rec     *rrp;   /* right record pointer */
4076
4077                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4078                 rrp = xfs_btree_rec_addr(cur, 1, right);
4079
4080                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4081                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4082         }
4083
4084         XFS_BTREE_STATS_INC(cur, join);
4085
4086         /*
4087          * Fix up the number of records and right block pointer in the
4088          * surviving block, and log it.
4089          */
4090         xfs_btree_set_numrecs(left, lrecs + rrecs);
4091         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
4092         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4093         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4094
4095         /* If there is a right sibling, point it to the remaining block. */
4096         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4097         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4098                 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4099                 if (error)
4100                         goto error0;
4101                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4102                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4103         }
4104
4105         /* Free the deleted block. */
4106         error = xfs_btree_free_block(cur, rbp);
4107         if (error)
4108                 goto error0;
4109
4110         /*
4111          * If we joined with the left neighbor, set the buffer in the
4112          * cursor to the left block, and fix up the index.
4113          */
4114         if (bp != lbp) {
4115                 cur->bc_bufs[level] = lbp;
4116                 cur->bc_ptrs[level] += lrecs;
4117                 cur->bc_ra[level] = 0;
4118         }
4119         /*
4120          * If we joined with the right neighbor and there's a level above
4121          * us, increment the cursor at that level.
4122          */
4123         else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4124                    (level + 1 < cur->bc_nlevels)) {
4125                 error = xfs_btree_increment(cur, level + 1, &i);
4126                 if (error)
4127                         goto error0;
4128         }
4129
4130         /*
4131          * Readjust the ptr at this level if it's not a leaf, since it's
4132          * still pointing at the deletion point, which makes the cursor
4133          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4134          * We can't use decrement because it would change the next level up.
4135          */
4136         if (level > 0)
4137                 cur->bc_ptrs[level]--;
4138
4139         /*
4140          * We combined blocks, so we have to update the parent keys if the
4141          * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4142          * points to the old block so that the caller knows which record to
4143          * delete.  Therefore, the caller must be savvy enough to call updkeys
4144          * for us if we return stat == 2.  The other exit points from this
4145          * function don't require deletions further up the tree, so they can
4146          * call updkeys directly.
4147          */
4148
4149         /* Return value means the next level up has something to do. */
4150         *stat = 2;
4151         return 0;
4152
4153 error0:
4154         if (tcur)
4155                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4156         return error;
4157 }
4158
4159 /*
4160  * Delete the record pointed to by cur.
4161  * The cursor refers to the place where the record was (could be inserted)
4162  * when the operation returns.
4163  */
4164 int                                     /* error */
4165 xfs_btree_delete(
4166         struct xfs_btree_cur    *cur,
4167         int                     *stat)  /* success/failure */
4168 {
4169         int                     error;  /* error return value */
4170         int                     level;
4171         int                     i;
4172         bool                    joined = false;
4173
4174         /*
4175          * Go up the tree, starting at leaf level.
4176          *
4177          * If 2 is returned then a join was done; go to the next level.
4178          * Otherwise we are done.
4179          */
4180         for (level = 0, i = 2; i == 2; level++) {
4181                 error = xfs_btree_delrec(cur, level, &i);
4182                 if (error)
4183                         goto error0;
4184                 if (i == 2)
4185                         joined = true;
4186         }
4187
4188         /*
4189          * If we combined blocks as part of deleting the record, delrec won't
4190          * have updated the parent high keys so we have to do that here.
4191          */
4192         if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4193                 error = xfs_btree_updkeys_force(cur, 0);
4194                 if (error)
4195                         goto error0;
4196         }
4197
4198         if (i == 0) {
4199                 for (level = 1; level < cur->bc_nlevels; level++) {
4200                         if (cur->bc_ptrs[level] == 0) {
4201                                 error = xfs_btree_decrement(cur, level, &i);
4202                                 if (error)
4203                                         goto error0;
4204                                 break;
4205                         }
4206                 }
4207         }
4208
4209         *stat = i;
4210         return 0;
4211 error0:
4212         return error;
4213 }
4214
4215 /*
4216  * Get the data from the pointed-to record.
4217  */
4218 int                                     /* error */
4219 xfs_btree_get_rec(
4220         struct xfs_btree_cur    *cur,   /* btree cursor */
4221         union xfs_btree_rec     **recp, /* output: btree record */
4222         int                     *stat)  /* output: success/failure */
4223 {
4224         struct xfs_btree_block  *block; /* btree block */
4225         struct xfs_buf          *bp;    /* buffer pointer */
4226         int                     ptr;    /* record number */
4227 #ifdef DEBUG
4228         int                     error;  /* error return value */
4229 #endif
4230
4231         ptr = cur->bc_ptrs[0];
4232         block = xfs_btree_get_block(cur, 0, &bp);
4233
4234 #ifdef DEBUG
4235         error = xfs_btree_check_block(cur, block, 0, bp);
4236         if (error)
4237                 return error;
4238 #endif
4239
4240         /*
4241          * Off the right end or left end, return failure.
4242          */
4243         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4244                 *stat = 0;
4245                 return 0;
4246         }
4247
4248         /*
4249          * Point to the record and extract its data.
4250          */
4251         *recp = xfs_btree_rec_addr(cur, ptr, block);
4252         *stat = 1;
4253         return 0;
4254 }
4255
4256 /* Visit a block in a btree. */
4257 STATIC int
4258 xfs_btree_visit_block(
4259         struct xfs_btree_cur            *cur,
4260         int                             level,
4261         xfs_btree_visit_blocks_fn       fn,
4262         void                            *data)
4263 {
4264         struct xfs_btree_block          *block;
4265         struct xfs_buf                  *bp;
4266         union xfs_btree_ptr             rptr;
4267         int                             error;
4268
4269         /* do right sibling readahead */
4270         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4271         block = xfs_btree_get_block(cur, level, &bp);
4272
4273         /* process the block */
4274         error = fn(cur, level, data);
4275         if (error)
4276                 return error;
4277
4278         /* now read rh sibling block for next iteration */
4279         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4280         if (xfs_btree_ptr_is_null(cur, &rptr))
4281                 return -ENOENT;
4282
4283         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4284 }
4285
4286
4287 /* Visit every block in a btree. */
4288 int
4289 xfs_btree_visit_blocks(
4290         struct xfs_btree_cur            *cur,
4291         xfs_btree_visit_blocks_fn       fn,
4292         void                            *data)
4293 {
4294         union xfs_btree_ptr             lptr;
4295         int                             level;
4296         struct xfs_btree_block          *block = NULL;
4297         int                             error = 0;
4298
4299         cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4300
4301         /* for each level */
4302         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4303                 /* grab the left hand block */
4304                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4305                 if (error)
4306                         return error;
4307
4308                 /* readahead the left most block for the next level down */
4309                 if (level > 0) {
4310                         union xfs_btree_ptr     *ptr;
4311
4312                         ptr = xfs_btree_ptr_addr(cur, 1, block);
4313                         xfs_btree_readahead_ptr(cur, ptr, 1);
4314
4315                         /* save for the next iteration of the loop */
4316                         xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4317                 }
4318
4319                 /* for each buffer in the level */
4320                 do {
4321                         error = xfs_btree_visit_block(cur, level, fn, data);
4322                 } while (!error);
4323
4324                 if (error != -ENOENT)
4325                         return error;
4326         }
4327
4328         return 0;
4329 }
4330
4331 /*
4332  * Change the owner of a btree.
4333  *
4334  * The mechanism we use here is ordered buffer logging. Because we don't know
4335  * how many buffers were are going to need to modify, we don't really want to
4336  * have to make transaction reservations for the worst case of every buffer in a
4337  * full size btree as that may be more space that we can fit in the log....
4338  *
4339  * We do the btree walk in the most optimal manner possible - we have sibling
4340  * pointers so we can just walk all the blocks on each level from left to right
4341  * in a single pass, and then move to the next level and do the same. We can
4342  * also do readahead on the sibling pointers to get IO moving more quickly,
4343  * though for slow disks this is unlikely to make much difference to performance
4344  * as the amount of CPU work we have to do before moving to the next block is
4345  * relatively small.
4346  *
4347  * For each btree block that we load, modify the owner appropriately, set the
4348  * buffer as an ordered buffer and log it appropriately. We need to ensure that
4349  * we mark the region we change dirty so that if the buffer is relogged in
4350  * a subsequent transaction the changes we make here as an ordered buffer are
4351  * correctly relogged in that transaction.  If we are in recovery context, then
4352  * just queue the modified buffer as delayed write buffer so the transaction
4353  * recovery completion writes the changes to disk.
4354  */
4355 struct xfs_btree_block_change_owner_info {
4356         uint64_t                new_owner;
4357         struct list_head        *buffer_list;
4358 };
4359
4360 static int
4361 xfs_btree_block_change_owner(
4362         struct xfs_btree_cur    *cur,
4363         int                     level,
4364         void                    *data)
4365 {
4366         struct xfs_btree_block_change_owner_info        *bbcoi = data;
4367         struct xfs_btree_block  *block;
4368         struct xfs_buf          *bp;
4369
4370         /* modify the owner */
4371         block = xfs_btree_get_block(cur, level, &bp);
4372         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4373                 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4374                         return 0;
4375                 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4376         } else {
4377                 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4378                         return 0;
4379                 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4380         }
4381
4382         /*
4383          * If the block is a root block hosted in an inode, we might not have a
4384          * buffer pointer here and we shouldn't attempt to log the change as the
4385          * information is already held in the inode and discarded when the root
4386          * block is formatted into the on-disk inode fork. We still change it,
4387          * though, so everything is consistent in memory.
4388          */
4389         if (!bp) {
4390                 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4391                 ASSERT(level == cur->bc_nlevels - 1);
4392                 return 0;
4393         }
4394
4395         if (cur->bc_tp) {
4396                 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4397                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4398                         return -EAGAIN;
4399                 }
4400         } else {
4401                 xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4402         }
4403
4404         return 0;
4405 }
4406
4407 int
4408 xfs_btree_change_owner(
4409         struct xfs_btree_cur    *cur,
4410         uint64_t                new_owner,
4411         struct list_head        *buffer_list)
4412 {
4413         struct xfs_btree_block_change_owner_info        bbcoi;
4414
4415         bbcoi.new_owner = new_owner;
4416         bbcoi.buffer_list = buffer_list;
4417
4418         return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4419                         &bbcoi);
4420 }
4421
4422 /* Verify the v5 fields of a long-format btree block. */
4423 xfs_failaddr_t
4424 xfs_btree_lblock_v5hdr_verify(
4425         struct xfs_buf          *bp,
4426         uint64_t                owner)
4427 {
4428         struct xfs_mount        *mp = bp->b_target->bt_mount;
4429         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4430
4431         if (!xfs_sb_version_hascrc(&mp->m_sb))
4432                 return __this_address;
4433         if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4434                 return __this_address;
4435         if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
4436                 return __this_address;
4437         if (owner != XFS_RMAP_OWN_UNKNOWN &&
4438             be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4439                 return __this_address;
4440         return NULL;
4441 }
4442
4443 /* Verify a long-format btree block. */
4444 xfs_failaddr_t
4445 xfs_btree_lblock_verify(
4446         struct xfs_buf          *bp,
4447         unsigned int            max_recs)
4448 {
4449         struct xfs_mount        *mp = bp->b_target->bt_mount;
4450         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4451
4452         /* numrecs verification */
4453         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4454                 return __this_address;
4455
4456         /* sibling pointer verification */
4457         if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4458             !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4459                 return __this_address;
4460         if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4461             !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4462                 return __this_address;
4463
4464         return NULL;
4465 }
4466
4467 /**
4468  * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4469  *                                    btree block
4470  *
4471  * @bp: buffer containing the btree block
4472  * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4473  * @pag_max_level: pointer to the per-ag max level field
4474  */
4475 xfs_failaddr_t
4476 xfs_btree_sblock_v5hdr_verify(
4477         struct xfs_buf          *bp)
4478 {
4479         struct xfs_mount        *mp = bp->b_target->bt_mount;
4480         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4481         struct xfs_perag        *pag = bp->b_pag;
4482
4483         if (!xfs_sb_version_hascrc(&mp->m_sb))
4484                 return __this_address;
4485         if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4486                 return __this_address;
4487         if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4488                 return __this_address;
4489         if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4490                 return __this_address;
4491         return NULL;
4492 }
4493
4494 /**
4495  * xfs_btree_sblock_verify() -- verify a short-format btree block
4496  *
4497  * @bp: buffer containing the btree block
4498  * @max_recs: maximum records allowed in this btree node
4499  */
4500 xfs_failaddr_t
4501 xfs_btree_sblock_verify(
4502         struct xfs_buf          *bp,
4503         unsigned int            max_recs)
4504 {
4505         struct xfs_mount        *mp = bp->b_target->bt_mount;
4506         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4507         xfs_agblock_t           agno;
4508
4509         /* numrecs verification */
4510         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4511                 return __this_address;
4512
4513         /* sibling pointer verification */
4514         agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
4515         if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4516             !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4517                 return __this_address;
4518         if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4519             !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4520                 return __this_address;
4521
4522         return NULL;
4523 }
4524
4525 /*
4526  * Calculate the number of btree levels needed to store a given number of
4527  * records in a short-format btree.
4528  */
4529 uint
4530 xfs_btree_compute_maxlevels(
4531         uint                    *limits,
4532         unsigned long           len)
4533 {
4534         uint                    level;
4535         unsigned long           maxblocks;
4536
4537         maxblocks = (len + limits[0] - 1) / limits[0];
4538         for (level = 1; maxblocks > 1; level++)
4539                 maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4540         return level;
4541 }
4542
4543 /*
4544  * Query a regular btree for all records overlapping a given interval.
4545  * Start with a LE lookup of the key of low_rec and return all records
4546  * until we find a record with a key greater than the key of high_rec.
4547  */
4548 STATIC int
4549 xfs_btree_simple_query_range(
4550         struct xfs_btree_cur            *cur,
4551         union xfs_btree_key             *low_key,
4552         union xfs_btree_key             *high_key,
4553         xfs_btree_query_range_fn        fn,
4554         void                            *priv)
4555 {
4556         union xfs_btree_rec             *recp;
4557         union xfs_btree_key             rec_key;
4558         int64_t                         diff;
4559         int                             stat;
4560         bool                            firstrec = true;
4561         int                             error;
4562
4563         ASSERT(cur->bc_ops->init_high_key_from_rec);
4564         ASSERT(cur->bc_ops->diff_two_keys);
4565
4566         /*
4567          * Find the leftmost record.  The btree cursor must be set
4568          * to the low record used to generate low_key.
4569          */
4570         stat = 0;
4571         error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4572         if (error)
4573                 goto out;
4574
4575         /* Nothing?  See if there's anything to the right. */
4576         if (!stat) {
4577                 error = xfs_btree_increment(cur, 0, &stat);
4578                 if (error)
4579                         goto out;
4580         }
4581
4582         while (stat) {
4583                 /* Find the record. */
4584                 error = xfs_btree_get_rec(cur, &recp, &stat);
4585                 if (error || !stat)
4586                         break;
4587
4588                 /* Skip if high_key(rec) < low_key. */
4589                 if (firstrec) {
4590                         cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4591                         firstrec = false;
4592                         diff = cur->bc_ops->diff_two_keys(cur, low_key,
4593                                         &rec_key);
4594                         if (diff > 0)
4595                                 goto advloop;
4596                 }
4597
4598                 /* Stop if high_key < low_key(rec). */
4599                 cur->bc_ops->init_key_from_rec(&rec_key, recp);
4600                 diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4601                 if (diff > 0)
4602                         break;
4603
4604                 /* Callback */
4605                 error = fn(cur, recp, priv);
4606                 if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
4607                         break;
4608
4609 advloop:
4610                 /* Move on to the next record. */
4611                 error = xfs_btree_increment(cur, 0, &stat);
4612                 if (error)
4613                         break;
4614         }
4615
4616 out:
4617         return error;
4618 }
4619
4620 /*
4621  * Query an overlapped interval btree for all records overlapping a given
4622  * interval.  This function roughly follows the algorithm given in
4623  * "Interval Trees" of _Introduction to Algorithms_, which is section
4624  * 14.3 in the 2nd and 3rd editions.
4625  *
4626  * First, generate keys for the low and high records passed in.
4627  *
4628  * For any leaf node, generate the high and low keys for the record.
4629  * If the record keys overlap with the query low/high keys, pass the
4630  * record to the function iterator.
4631  *
4632  * For any internal node, compare the low and high keys of each
4633  * pointer against the query low/high keys.  If there's an overlap,
4634  * follow the pointer.
4635  *
4636  * As an optimization, we stop scanning a block when we find a low key
4637  * that is greater than the query's high key.
4638  */
4639 STATIC int
4640 xfs_btree_overlapped_query_range(
4641         struct xfs_btree_cur            *cur,
4642         union xfs_btree_key             *low_key,
4643         union xfs_btree_key             *high_key,
4644         xfs_btree_query_range_fn        fn,
4645         void                            *priv)
4646 {
4647         union xfs_btree_ptr             ptr;
4648         union xfs_btree_ptr             *pp;
4649         union xfs_btree_key             rec_key;
4650         union xfs_btree_key             rec_hkey;
4651         union xfs_btree_key             *lkp;
4652         union xfs_btree_key             *hkp;
4653         union xfs_btree_rec             *recp;
4654         struct xfs_btree_block          *block;
4655         int64_t                         ldiff;
4656         int64_t                         hdiff;
4657         int                             level;
4658         struct xfs_buf                  *bp;
4659         int                             i;
4660         int                             error;
4661
4662         /* Load the root of the btree. */
4663         level = cur->bc_nlevels - 1;
4664         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4665         error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4666         if (error)
4667                 return error;
4668         xfs_btree_get_block(cur, level, &bp);
4669         trace_xfs_btree_overlapped_query_range(cur, level, bp);
4670 #ifdef DEBUG
4671         error = xfs_btree_check_block(cur, block, level, bp);
4672         if (error)
4673                 goto out;
4674 #endif
4675         cur->bc_ptrs[level] = 1;
4676
4677         while (level < cur->bc_nlevels) {
4678                 block = xfs_btree_get_block(cur, level, &bp);
4679
4680                 /* End of node, pop back towards the root. */
4681                 if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4682 pop_up:
4683                         if (level < cur->bc_nlevels - 1)
4684                                 cur->bc_ptrs[level + 1]++;
4685                         level++;
4686                         continue;
4687                 }
4688
4689                 if (level == 0) {
4690                         /* Handle a leaf node. */
4691                         recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4692
4693                         cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4694                         ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4695                                         low_key);
4696
4697                         cur->bc_ops->init_key_from_rec(&rec_key, recp);
4698                         hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4699                                         &rec_key);
4700
4701                         /*
4702                          * If (record's high key >= query's low key) and
4703                          *    (query's high key >= record's low key), then
4704                          * this record overlaps the query range; callback.
4705                          */
4706                         if (ldiff >= 0 && hdiff >= 0) {
4707                                 error = fn(cur, recp, priv);
4708                                 if (error < 0 ||
4709                                     error == XFS_BTREE_QUERY_RANGE_ABORT)
4710                                         break;
4711                         } else if (hdiff < 0) {
4712                                 /* Record is larger than high key; pop. */
4713                                 goto pop_up;
4714                         }
4715                         cur->bc_ptrs[level]++;
4716                         continue;
4717                 }
4718
4719                 /* Handle an internal node. */
4720                 lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4721                 hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4722                 pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4723
4724                 ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4725                 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4726
4727                 /*
4728                  * If (pointer's high key >= query's low key) and
4729                  *    (query's high key >= pointer's low key), then
4730                  * this record overlaps the query range; follow pointer.
4731                  */
4732                 if (ldiff >= 0 && hdiff >= 0) {
4733                         level--;
4734                         error = xfs_btree_lookup_get_block(cur, level, pp,
4735                                         &block);
4736                         if (error)
4737                                 goto out;
4738                         xfs_btree_get_block(cur, level, &bp);
4739                         trace_xfs_btree_overlapped_query_range(cur, level, bp);
4740 #ifdef DEBUG
4741                         error = xfs_btree_check_block(cur, block, level, bp);
4742                         if (error)
4743                                 goto out;
4744 #endif
4745                         cur->bc_ptrs[level] = 1;
4746                         continue;
4747                 } else if (hdiff < 0) {
4748                         /* The low key is larger than the upper range; pop. */
4749                         goto pop_up;
4750                 }
4751                 cur->bc_ptrs[level]++;
4752         }
4753
4754 out:
4755         /*
4756          * If we don't end this function with the cursor pointing at a record
4757          * block, a subsequent non-error cursor deletion will not release
4758          * node-level buffers, causing a buffer leak.  This is quite possible
4759          * with a zero-results range query, so release the buffers if we
4760          * failed to return any results.
4761          */
4762         if (cur->bc_bufs[0] == NULL) {
4763                 for (i = 0; i < cur->bc_nlevels; i++) {
4764                         if (cur->bc_bufs[i]) {
4765                                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4766                                 cur->bc_bufs[i] = NULL;
4767                                 cur->bc_ptrs[i] = 0;
4768                                 cur->bc_ra[i] = 0;
4769                         }
4770                 }
4771         }
4772
4773         return error;
4774 }
4775
4776 /*
4777  * Query a btree for all records overlapping a given interval of keys.  The
4778  * supplied function will be called with each record found; return one of the
4779  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4780  * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
4781  * negative error code.
4782  */
4783 int
4784 xfs_btree_query_range(
4785         struct xfs_btree_cur            *cur,
4786         union xfs_btree_irec            *low_rec,
4787         union xfs_btree_irec            *high_rec,
4788         xfs_btree_query_range_fn        fn,
4789         void                            *priv)
4790 {
4791         union xfs_btree_rec             rec;
4792         union xfs_btree_key             low_key;
4793         union xfs_btree_key             high_key;
4794
4795         /* Find the keys of both ends of the interval. */
4796         cur->bc_rec = *high_rec;
4797         cur->bc_ops->init_rec_from_cur(cur, &rec);
4798         cur->bc_ops->init_key_from_rec(&high_key, &rec);
4799
4800         cur->bc_rec = *low_rec;
4801         cur->bc_ops->init_rec_from_cur(cur, &rec);
4802         cur->bc_ops->init_key_from_rec(&low_key, &rec);
4803
4804         /* Enforce low key < high key. */
4805         if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4806                 return -EINVAL;
4807
4808         if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4809                 return xfs_btree_simple_query_range(cur, &low_key,
4810                                 &high_key, fn, priv);
4811         return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4812                         fn, priv);
4813 }
4814
4815 /* Query a btree for all records. */
4816 int
4817 xfs_btree_query_all(
4818         struct xfs_btree_cur            *cur,
4819         xfs_btree_query_range_fn        fn,
4820         void                            *priv)
4821 {
4822         union xfs_btree_key             low_key;
4823         union xfs_btree_key             high_key;
4824
4825         memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4826         memset(&low_key, 0, sizeof(low_key));
4827         memset(&high_key, 0xFF, sizeof(high_key));
4828
4829         return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4830 }
4831
4832 /*
4833  * Calculate the number of blocks needed to store a given number of records
4834  * in a short-format (per-AG metadata) btree.
4835  */
4836 unsigned long long
4837 xfs_btree_calc_size(
4838         uint                    *limits,
4839         unsigned long long      len)
4840 {
4841         int                     level;
4842         int                     maxrecs;
4843         unsigned long long      rval;
4844
4845         maxrecs = limits[0];
4846         for (level = 0, rval = 0; len > 1; level++) {
4847                 len += maxrecs - 1;
4848                 do_div(len, maxrecs);
4849                 maxrecs = limits[1];
4850                 rval += len;
4851         }
4852         return rval;
4853 }
4854
4855 static int
4856 xfs_btree_count_blocks_helper(
4857         struct xfs_btree_cur    *cur,
4858         int                     level,
4859         void                    *data)
4860 {
4861         xfs_extlen_t            *blocks = data;
4862         (*blocks)++;
4863
4864         return 0;
4865 }
4866
4867 /* Count the blocks in a btree and return the result in *blocks. */
4868 int
4869 xfs_btree_count_blocks(
4870         struct xfs_btree_cur    *cur,
4871         xfs_extlen_t            *blocks)
4872 {
4873         *blocks = 0;
4874         return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4875                         blocks);
4876 }
4877
4878 /* Compare two btree pointers. */
4879 int64_t
4880 xfs_btree_diff_two_ptrs(
4881         struct xfs_btree_cur            *cur,
4882         const union xfs_btree_ptr       *a,
4883         const union xfs_btree_ptr       *b)
4884 {
4885         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4886                 return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4887         return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4888 }
4889
4890 /* If there's an extent, we're done. */
4891 STATIC int
4892 xfs_btree_has_record_helper(
4893         struct xfs_btree_cur            *cur,
4894         union xfs_btree_rec             *rec,
4895         void                            *priv)
4896 {
4897         return XFS_BTREE_QUERY_RANGE_ABORT;
4898 }
4899
4900 /* Is there a record covering a given range of keys? */
4901 int
4902 xfs_btree_has_record(
4903         struct xfs_btree_cur    *cur,
4904         union xfs_btree_irec    *low,
4905         union xfs_btree_irec    *high,
4906         bool                    *exists)
4907 {
4908         int                     error;
4909
4910         error = xfs_btree_query_range(cur, low, high,
4911                         &xfs_btree_has_record_helper, NULL);
4912         if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
4913                 *exists = true;
4914                 return 0;
4915         }
4916         *exists = false;
4917         return error;
4918 }
4919
4920 /* Are there more records in this btree? */
4921 bool
4922 xfs_btree_has_more_records(
4923         struct xfs_btree_cur    *cur)
4924 {
4925         struct xfs_btree_block  *block;
4926         struct xfs_buf          *bp;
4927
4928         block = xfs_btree_get_block(cur, 0, &bp);
4929
4930         /* There are still records in this block. */
4931         if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
4932                 return true;
4933
4934         /* There are more record blocks. */
4935         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4936                 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4937         else
4938                 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4939 }