758f3dc9c1ff8f8aceb728fdf2feb02062fdbc68
[linux-2.6-microblaze.git] / fs / xfs / libxfs / xfs_btree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_inode.h"
15 #include "xfs_trans.h"
16 #include "xfs_buf_item.h"
17 #include "xfs_btree.h"
18 #include "xfs_errortag.h"
19 #include "xfs_error.h"
20 #include "xfs_trace.h"
21 #include "xfs_alloc.h"
22 #include "xfs_log.h"
23 #include "xfs_btree_staging.h"
24 #include "xfs_ag.h"
25
26 /*
27  * Cursor allocation zone.
28  */
29 kmem_zone_t     *xfs_btree_cur_zone;
30
31 /*
32  * Btree magic numbers.
33  */
34 static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
35         { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
36           XFS_FIBT_MAGIC, 0 },
37         { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
38           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
39           XFS_REFC_CRC_MAGIC }
40 };
41
42 uint32_t
43 xfs_btree_magic(
44         int                     crc,
45         xfs_btnum_t             btnum)
46 {
47         uint32_t                magic = xfs_magics[crc][btnum];
48
49         /* Ensure we asked for crc for crc-only magics. */
50         ASSERT(magic != 0);
51         return magic;
52 }
53
54 /*
55  * Check a long btree block header.  Return the address of the failing check,
56  * or NULL if everything is ok.
57  */
58 xfs_failaddr_t
59 __xfs_btree_check_lblock(
60         struct xfs_btree_cur    *cur,
61         struct xfs_btree_block  *block,
62         int                     level,
63         struct xfs_buf          *bp)
64 {
65         struct xfs_mount        *mp = cur->bc_mp;
66         xfs_btnum_t             btnum = cur->bc_btnum;
67         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
68
69         if (crc) {
70                 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
71                         return __this_address;
72                 if (block->bb_u.l.bb_blkno !=
73                     cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
74                         return __this_address;
75                 if (block->bb_u.l.bb_pad != cpu_to_be32(0))
76                         return __this_address;
77         }
78
79         if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
80                 return __this_address;
81         if (be16_to_cpu(block->bb_level) != level)
82                 return __this_address;
83         if (be16_to_cpu(block->bb_numrecs) >
84             cur->bc_ops->get_maxrecs(cur, level))
85                 return __this_address;
86         if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
87             !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
88                         level + 1))
89                 return __this_address;
90         if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
91             !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
92                         level + 1))
93                 return __this_address;
94
95         return NULL;
96 }
97
98 /* Check a long btree block header. */
99 static int
100 xfs_btree_check_lblock(
101         struct xfs_btree_cur    *cur,
102         struct xfs_btree_block  *block,
103         int                     level,
104         struct xfs_buf          *bp)
105 {
106         struct xfs_mount        *mp = cur->bc_mp;
107         xfs_failaddr_t          fa;
108
109         fa = __xfs_btree_check_lblock(cur, block, level, bp);
110         if (XFS_IS_CORRUPT(mp, fa != NULL) ||
111             XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
112                 if (bp)
113                         trace_xfs_btree_corrupt(bp, _RET_IP_);
114                 return -EFSCORRUPTED;
115         }
116         return 0;
117 }
118
119 /*
120  * Check a short btree block header.  Return the address of the failing check,
121  * or NULL if everything is ok.
122  */
123 xfs_failaddr_t
124 __xfs_btree_check_sblock(
125         struct xfs_btree_cur    *cur,
126         struct xfs_btree_block  *block,
127         int                     level,
128         struct xfs_buf          *bp)
129 {
130         struct xfs_mount        *mp = cur->bc_mp;
131         xfs_btnum_t             btnum = cur->bc_btnum;
132         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
133
134         if (crc) {
135                 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
136                         return __this_address;
137                 if (block->bb_u.s.bb_blkno !=
138                     cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
139                         return __this_address;
140         }
141
142         if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
143                 return __this_address;
144         if (be16_to_cpu(block->bb_level) != level)
145                 return __this_address;
146         if (be16_to_cpu(block->bb_numrecs) >
147             cur->bc_ops->get_maxrecs(cur, level))
148                 return __this_address;
149         if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
150             !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
151                         level + 1))
152                 return __this_address;
153         if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
154             !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
155                         level + 1))
156                 return __this_address;
157
158         return NULL;
159 }
160
161 /* Check a short btree block header. */
162 STATIC int
163 xfs_btree_check_sblock(
164         struct xfs_btree_cur    *cur,
165         struct xfs_btree_block  *block,
166         int                     level,
167         struct xfs_buf          *bp)
168 {
169         struct xfs_mount        *mp = cur->bc_mp;
170         xfs_failaddr_t          fa;
171
172         fa = __xfs_btree_check_sblock(cur, block, level, bp);
173         if (XFS_IS_CORRUPT(mp, fa != NULL) ||
174             XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
175                 if (bp)
176                         trace_xfs_btree_corrupt(bp, _RET_IP_);
177                 return -EFSCORRUPTED;
178         }
179         return 0;
180 }
181
182 /*
183  * Debug routine: check that block header is ok.
184  */
185 int
186 xfs_btree_check_block(
187         struct xfs_btree_cur    *cur,   /* btree cursor */
188         struct xfs_btree_block  *block, /* generic btree block pointer */
189         int                     level,  /* level of the btree block */
190         struct xfs_buf          *bp)    /* buffer containing block, if any */
191 {
192         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
193                 return xfs_btree_check_lblock(cur, block, level, bp);
194         else
195                 return xfs_btree_check_sblock(cur, block, level, bp);
196 }
197
198 /* Check that this long pointer is valid and points within the fs. */
199 bool
200 xfs_btree_check_lptr(
201         struct xfs_btree_cur    *cur,
202         xfs_fsblock_t           fsbno,
203         int                     level)
204 {
205         if (level <= 0)
206                 return false;
207         return xfs_verify_fsbno(cur->bc_mp, fsbno);
208 }
209
210 /* Check that this short pointer is valid and points within the AG. */
211 bool
212 xfs_btree_check_sptr(
213         struct xfs_btree_cur    *cur,
214         xfs_agblock_t           agbno,
215         int                     level)
216 {
217         if (level <= 0)
218                 return false;
219         return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno);
220 }
221
222 /*
223  * Check that a given (indexed) btree pointer at a certain level of a
224  * btree is valid and doesn't point past where it should.
225  */
226 static int
227 xfs_btree_check_ptr(
228         struct xfs_btree_cur            *cur,
229         const union xfs_btree_ptr       *ptr,
230         int                             index,
231         int                             level)
232 {
233         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
234                 if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
235                                 level))
236                         return 0;
237                 xfs_err(cur->bc_mp,
238 "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
239                                 cur->bc_ino.ip->i_ino,
240                                 cur->bc_ino.whichfork, cur->bc_btnum,
241                                 level, index);
242         } else {
243                 if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
244                                 level))
245                         return 0;
246                 xfs_err(cur->bc_mp,
247 "AG %u: Corrupt btree %d pointer at level %d index %d.",
248                                 cur->bc_ag.pag->pag_agno, cur->bc_btnum,
249                                 level, index);
250         }
251
252         return -EFSCORRUPTED;
253 }
254
255 #ifdef DEBUG
256 # define xfs_btree_debug_check_ptr      xfs_btree_check_ptr
257 #else
258 # define xfs_btree_debug_check_ptr(...) (0)
259 #endif
260
261 /*
262  * Calculate CRC on the whole btree block and stuff it into the
263  * long-form btree header.
264  *
265  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
266  * it into the buffer so recovery knows what the last modification was that made
267  * it to disk.
268  */
269 void
270 xfs_btree_lblock_calc_crc(
271         struct xfs_buf          *bp)
272 {
273         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
274         struct xfs_buf_log_item *bip = bp->b_log_item;
275
276         if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
277                 return;
278         if (bip)
279                 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
280         xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
281 }
282
283 bool
284 xfs_btree_lblock_verify_crc(
285         struct xfs_buf          *bp)
286 {
287         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
288         struct xfs_mount        *mp = bp->b_mount;
289
290         if (xfs_sb_version_hascrc(&mp->m_sb)) {
291                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
292                         return false;
293                 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
294         }
295
296         return true;
297 }
298
299 /*
300  * Calculate CRC on the whole btree block and stuff it into the
301  * short-form btree header.
302  *
303  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
304  * it into the buffer so recovery knows what the last modification was that made
305  * it to disk.
306  */
307 void
308 xfs_btree_sblock_calc_crc(
309         struct xfs_buf          *bp)
310 {
311         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
312         struct xfs_buf_log_item *bip = bp->b_log_item;
313
314         if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
315                 return;
316         if (bip)
317                 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
318         xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
319 }
320
321 bool
322 xfs_btree_sblock_verify_crc(
323         struct xfs_buf          *bp)
324 {
325         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
326         struct xfs_mount        *mp = bp->b_mount;
327
328         if (xfs_sb_version_hascrc(&mp->m_sb)) {
329                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
330                         return false;
331                 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
332         }
333
334         return true;
335 }
336
337 static int
338 xfs_btree_free_block(
339         struct xfs_btree_cur    *cur,
340         struct xfs_buf          *bp)
341 {
342         int                     error;
343
344         error = cur->bc_ops->free_block(cur, bp);
345         if (!error) {
346                 xfs_trans_binval(cur->bc_tp, bp);
347                 XFS_BTREE_STATS_INC(cur, free);
348         }
349         return error;
350 }
351
352 /*
353  * Delete the btree cursor.
354  */
355 void
356 xfs_btree_del_cursor(
357         struct xfs_btree_cur    *cur,           /* btree cursor */
358         int                     error)          /* del because of error */
359 {
360         int                     i;              /* btree level */
361
362         /*
363          * Clear the buffer pointers and release the buffers. If we're doing
364          * this because of an error, inspect all of the entries in the bc_bufs
365          * array for buffers to be unlocked. This is because some of the btree
366          * code works from level n down to 0, and if we get an error along the
367          * way we won't have initialized all the entries down to 0.
368          */
369         for (i = 0; i < cur->bc_nlevels; i++) {
370                 if (cur->bc_bufs[i])
371                         xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
372                 else if (!error)
373                         break;
374         }
375
376         ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
377                XFS_FORCED_SHUTDOWN(cur->bc_mp));
378         if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
379                 kmem_free(cur->bc_ops);
380         if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
381                 xfs_perag_put(cur->bc_ag.pag);
382         kmem_cache_free(xfs_btree_cur_zone, cur);
383 }
384
385 /*
386  * Duplicate the btree cursor.
387  * Allocate a new one, copy the record, re-get the buffers.
388  */
389 int                                     /* error */
390 xfs_btree_dup_cursor(
391         xfs_btree_cur_t *cur,           /* input cursor */
392         xfs_btree_cur_t **ncur)         /* output cursor */
393 {
394         struct xfs_buf  *bp;            /* btree block's buffer pointer */
395         int             error;          /* error return value */
396         int             i;              /* level number of btree block */
397         xfs_mount_t     *mp;            /* mount structure for filesystem */
398         xfs_btree_cur_t *new;           /* new cursor value */
399         xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
400
401         tp = cur->bc_tp;
402         mp = cur->bc_mp;
403
404         /*
405          * Allocate a new cursor like the old one.
406          */
407         new = cur->bc_ops->dup_cursor(cur);
408
409         /*
410          * Copy the record currently in the cursor.
411          */
412         new->bc_rec = cur->bc_rec;
413
414         /*
415          * For each level current, re-get the buffer and copy the ptr value.
416          */
417         for (i = 0; i < new->bc_nlevels; i++) {
418                 new->bc_ptrs[i] = cur->bc_ptrs[i];
419                 new->bc_ra[i] = cur->bc_ra[i];
420                 bp = cur->bc_bufs[i];
421                 if (bp) {
422                         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
423                                                    XFS_BUF_ADDR(bp), mp->m_bsize,
424                                                    0, &bp,
425                                                    cur->bc_ops->buf_ops);
426                         if (error) {
427                                 xfs_btree_del_cursor(new, error);
428                                 *ncur = NULL;
429                                 return error;
430                         }
431                 }
432                 new->bc_bufs[i] = bp;
433         }
434         *ncur = new;
435         return 0;
436 }
437
438 /*
439  * XFS btree block layout and addressing:
440  *
441  * There are two types of blocks in the btree: leaf and non-leaf blocks.
442  *
443  * The leaf record start with a header then followed by records containing
444  * the values.  A non-leaf block also starts with the same header, and
445  * then first contains lookup keys followed by an equal number of pointers
446  * to the btree blocks at the previous level.
447  *
448  *              +--------+-------+-------+-------+-------+-------+-------+
449  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
450  *              +--------+-------+-------+-------+-------+-------+-------+
451  *
452  *              +--------+-------+-------+-------+-------+-------+-------+
453  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
454  *              +--------+-------+-------+-------+-------+-------+-------+
455  *
456  * The header is called struct xfs_btree_block for reasons better left unknown
457  * and comes in different versions for short (32bit) and long (64bit) block
458  * pointers.  The record and key structures are defined by the btree instances
459  * and opaque to the btree core.  The block pointers are simple disk endian
460  * integers, available in a short (32bit) and long (64bit) variant.
461  *
462  * The helpers below calculate the offset of a given record, key or pointer
463  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
464  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
465  * inside the btree block is done using indices starting at one, not zero!
466  *
467  * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
468  * overlapping intervals.  In such a tree, records are still sorted lowest to
469  * highest and indexed by the smallest key value that refers to the record.
470  * However, nodes are different: each pointer has two associated keys -- one
471  * indexing the lowest key available in the block(s) below (the same behavior
472  * as the key in a regular btree) and another indexing the highest key
473  * available in the block(s) below.  Because records are /not/ sorted by the
474  * highest key, all leaf block updates require us to compute the highest key
475  * that matches any record in the leaf and to recursively update the high keys
476  * in the nodes going further up in the tree, if necessary.  Nodes look like
477  * this:
478  *
479  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
480  * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
481  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
482  *
483  * To perform an interval query on an overlapped tree, perform the usual
484  * depth-first search and use the low and high keys to decide if we can skip
485  * that particular node.  If a leaf node is reached, return the records that
486  * intersect the interval.  Note that an interval query may return numerous
487  * entries.  For a non-overlapped tree, simply search for the record associated
488  * with the lowest key and iterate forward until a non-matching record is
489  * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
490  * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
491  * more detail.
492  *
493  * Why do we care about overlapping intervals?  Let's say you have a bunch of
494  * reverse mapping records on a reflink filesystem:
495  *
496  * 1: +- file A startblock B offset C length D -----------+
497  * 2:      +- file E startblock F offset G length H --------------+
498  * 3:      +- file I startblock F offset J length K --+
499  * 4:                                                        +- file L... --+
500  *
501  * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
502  * we'd simply increment the length of record 1.  But how do we find the record
503  * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
504  * record 3 because the keys are ordered first by startblock.  An interval
505  * query would return records 1 and 2 because they both overlap (B+D-1), and
506  * from that we can pick out record 1 as the appropriate left neighbor.
507  *
508  * In the non-overlapped case you can do a LE lookup and decrement the cursor
509  * because a record's interval must end before the next record.
510  */
511
512 /*
513  * Return size of the btree block header for this btree instance.
514  */
515 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
516 {
517         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
518                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
519                         return XFS_BTREE_LBLOCK_CRC_LEN;
520                 return XFS_BTREE_LBLOCK_LEN;
521         }
522         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
523                 return XFS_BTREE_SBLOCK_CRC_LEN;
524         return XFS_BTREE_SBLOCK_LEN;
525 }
526
527 /*
528  * Return size of btree block pointers for this btree instance.
529  */
530 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
531 {
532         return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
533                 sizeof(__be64) : sizeof(__be32);
534 }
535
536 /*
537  * Calculate offset of the n-th record in a btree block.
538  */
539 STATIC size_t
540 xfs_btree_rec_offset(
541         struct xfs_btree_cur    *cur,
542         int                     n)
543 {
544         return xfs_btree_block_len(cur) +
545                 (n - 1) * cur->bc_ops->rec_len;
546 }
547
548 /*
549  * Calculate offset of the n-th key in a btree block.
550  */
551 STATIC size_t
552 xfs_btree_key_offset(
553         struct xfs_btree_cur    *cur,
554         int                     n)
555 {
556         return xfs_btree_block_len(cur) +
557                 (n - 1) * cur->bc_ops->key_len;
558 }
559
560 /*
561  * Calculate offset of the n-th high key in a btree block.
562  */
563 STATIC size_t
564 xfs_btree_high_key_offset(
565         struct xfs_btree_cur    *cur,
566         int                     n)
567 {
568         return xfs_btree_block_len(cur) +
569                 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
570 }
571
572 /*
573  * Calculate offset of the n-th block pointer in a btree block.
574  */
575 STATIC size_t
576 xfs_btree_ptr_offset(
577         struct xfs_btree_cur    *cur,
578         int                     n,
579         int                     level)
580 {
581         return xfs_btree_block_len(cur) +
582                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
583                 (n - 1) * xfs_btree_ptr_len(cur);
584 }
585
586 /*
587  * Return a pointer to the n-th record in the btree block.
588  */
589 union xfs_btree_rec *
590 xfs_btree_rec_addr(
591         struct xfs_btree_cur    *cur,
592         int                     n,
593         struct xfs_btree_block  *block)
594 {
595         return (union xfs_btree_rec *)
596                 ((char *)block + xfs_btree_rec_offset(cur, n));
597 }
598
599 /*
600  * Return a pointer to the n-th key in the btree block.
601  */
602 union xfs_btree_key *
603 xfs_btree_key_addr(
604         struct xfs_btree_cur    *cur,
605         int                     n,
606         struct xfs_btree_block  *block)
607 {
608         return (union xfs_btree_key *)
609                 ((char *)block + xfs_btree_key_offset(cur, n));
610 }
611
612 /*
613  * Return a pointer to the n-th high key in the btree block.
614  */
615 union xfs_btree_key *
616 xfs_btree_high_key_addr(
617         struct xfs_btree_cur    *cur,
618         int                     n,
619         struct xfs_btree_block  *block)
620 {
621         return (union xfs_btree_key *)
622                 ((char *)block + xfs_btree_high_key_offset(cur, n));
623 }
624
625 /*
626  * Return a pointer to the n-th block pointer in the btree block.
627  */
628 union xfs_btree_ptr *
629 xfs_btree_ptr_addr(
630         struct xfs_btree_cur    *cur,
631         int                     n,
632         struct xfs_btree_block  *block)
633 {
634         int                     level = xfs_btree_get_level(block);
635
636         ASSERT(block->bb_level != 0);
637
638         return (union xfs_btree_ptr *)
639                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
640 }
641
642 struct xfs_ifork *
643 xfs_btree_ifork_ptr(
644         struct xfs_btree_cur    *cur)
645 {
646         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
647
648         if (cur->bc_flags & XFS_BTREE_STAGING)
649                 return cur->bc_ino.ifake->if_fork;
650         return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
651 }
652
653 /*
654  * Get the root block which is stored in the inode.
655  *
656  * For now this btree implementation assumes the btree root is always
657  * stored in the if_broot field of an inode fork.
658  */
659 STATIC struct xfs_btree_block *
660 xfs_btree_get_iroot(
661         struct xfs_btree_cur    *cur)
662 {
663         struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
664
665         return (struct xfs_btree_block *)ifp->if_broot;
666 }
667
668 /*
669  * Retrieve the block pointer from the cursor at the given level.
670  * This may be an inode btree root or from a buffer.
671  */
672 struct xfs_btree_block *                /* generic btree block pointer */
673 xfs_btree_get_block(
674         struct xfs_btree_cur    *cur,   /* btree cursor */
675         int                     level,  /* level in btree */
676         struct xfs_buf          **bpp)  /* buffer containing the block */
677 {
678         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
679             (level == cur->bc_nlevels - 1)) {
680                 *bpp = NULL;
681                 return xfs_btree_get_iroot(cur);
682         }
683
684         *bpp = cur->bc_bufs[level];
685         return XFS_BUF_TO_BLOCK(*bpp);
686 }
687
688 /*
689  * Change the cursor to point to the first record at the given level.
690  * Other levels are unaffected.
691  */
692 STATIC int                              /* success=1, failure=0 */
693 xfs_btree_firstrec(
694         xfs_btree_cur_t         *cur,   /* btree cursor */
695         int                     level)  /* level to change */
696 {
697         struct xfs_btree_block  *block; /* generic btree block pointer */
698         struct xfs_buf          *bp;    /* buffer containing block */
699
700         /*
701          * Get the block pointer for this level.
702          */
703         block = xfs_btree_get_block(cur, level, &bp);
704         if (xfs_btree_check_block(cur, block, level, bp))
705                 return 0;
706         /*
707          * It's empty, there is no such record.
708          */
709         if (!block->bb_numrecs)
710                 return 0;
711         /*
712          * Set the ptr value to 1, that's the first record/key.
713          */
714         cur->bc_ptrs[level] = 1;
715         return 1;
716 }
717
718 /*
719  * Change the cursor to point to the last record in the current block
720  * at the given level.  Other levels are unaffected.
721  */
722 STATIC int                              /* success=1, failure=0 */
723 xfs_btree_lastrec(
724         xfs_btree_cur_t         *cur,   /* btree cursor */
725         int                     level)  /* level to change */
726 {
727         struct xfs_btree_block  *block; /* generic btree block pointer */
728         struct xfs_buf          *bp;    /* buffer containing block */
729
730         /*
731          * Get the block pointer for this level.
732          */
733         block = xfs_btree_get_block(cur, level, &bp);
734         if (xfs_btree_check_block(cur, block, level, bp))
735                 return 0;
736         /*
737          * It's empty, there is no such record.
738          */
739         if (!block->bb_numrecs)
740                 return 0;
741         /*
742          * Set the ptr value to numrecs, that's the last record/key.
743          */
744         cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
745         return 1;
746 }
747
748 /*
749  * Compute first and last byte offsets for the fields given.
750  * Interprets the offsets table, which contains struct field offsets.
751  */
752 void
753 xfs_btree_offsets(
754         int64_t         fields,         /* bitmask of fields */
755         const short     *offsets,       /* table of field offsets */
756         int             nbits,          /* number of bits to inspect */
757         int             *first,         /* output: first byte offset */
758         int             *last)          /* output: last byte offset */
759 {
760         int             i;              /* current bit number */
761         int64_t         imask;          /* mask for current bit number */
762
763         ASSERT(fields != 0);
764         /*
765          * Find the lowest bit, so the first byte offset.
766          */
767         for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
768                 if (imask & fields) {
769                         *first = offsets[i];
770                         break;
771                 }
772         }
773         /*
774          * Find the highest bit, so the last byte offset.
775          */
776         for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
777                 if (imask & fields) {
778                         *last = offsets[i + 1] - 1;
779                         break;
780                 }
781         }
782 }
783
784 /*
785  * Get a buffer for the block, return it read in.
786  * Long-form addressing.
787  */
788 int
789 xfs_btree_read_bufl(
790         struct xfs_mount        *mp,            /* file system mount point */
791         struct xfs_trans        *tp,            /* transaction pointer */
792         xfs_fsblock_t           fsbno,          /* file system block number */
793         struct xfs_buf          **bpp,          /* buffer for fsbno */
794         int                     refval,         /* ref count value for buffer */
795         const struct xfs_buf_ops *ops)
796 {
797         struct xfs_buf          *bp;            /* return value */
798         xfs_daddr_t             d;              /* real disk block address */
799         int                     error;
800
801         if (!xfs_verify_fsbno(mp, fsbno))
802                 return -EFSCORRUPTED;
803         d = XFS_FSB_TO_DADDR(mp, fsbno);
804         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
805                                    mp->m_bsize, 0, &bp, ops);
806         if (error)
807                 return error;
808         if (bp)
809                 xfs_buf_set_ref(bp, refval);
810         *bpp = bp;
811         return 0;
812 }
813
814 /*
815  * Read-ahead the block, don't wait for it, don't return a buffer.
816  * Long-form addressing.
817  */
818 /* ARGSUSED */
819 void
820 xfs_btree_reada_bufl(
821         struct xfs_mount        *mp,            /* file system mount point */
822         xfs_fsblock_t           fsbno,          /* file system block number */
823         xfs_extlen_t            count,          /* count of filesystem blocks */
824         const struct xfs_buf_ops *ops)
825 {
826         xfs_daddr_t             d;
827
828         ASSERT(fsbno != NULLFSBLOCK);
829         d = XFS_FSB_TO_DADDR(mp, fsbno);
830         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
831 }
832
833 /*
834  * Read-ahead the block, don't wait for it, don't return a buffer.
835  * Short-form addressing.
836  */
837 /* ARGSUSED */
838 void
839 xfs_btree_reada_bufs(
840         struct xfs_mount        *mp,            /* file system mount point */
841         xfs_agnumber_t          agno,           /* allocation group number */
842         xfs_agblock_t           agbno,          /* allocation group block number */
843         xfs_extlen_t            count,          /* count of filesystem blocks */
844         const struct xfs_buf_ops *ops)
845 {
846         xfs_daddr_t             d;
847
848         ASSERT(agno != NULLAGNUMBER);
849         ASSERT(agbno != NULLAGBLOCK);
850         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
851         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
852 }
853
854 STATIC int
855 xfs_btree_readahead_lblock(
856         struct xfs_btree_cur    *cur,
857         int                     lr,
858         struct xfs_btree_block  *block)
859 {
860         int                     rval = 0;
861         xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
862         xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
863
864         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
865                 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
866                                      cur->bc_ops->buf_ops);
867                 rval++;
868         }
869
870         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
871                 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
872                                      cur->bc_ops->buf_ops);
873                 rval++;
874         }
875
876         return rval;
877 }
878
879 STATIC int
880 xfs_btree_readahead_sblock(
881         struct xfs_btree_cur    *cur,
882         int                     lr,
883         struct xfs_btree_block *block)
884 {
885         int                     rval = 0;
886         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
887         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
888
889
890         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
891                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
892                                      left, 1, cur->bc_ops->buf_ops);
893                 rval++;
894         }
895
896         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
897                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
898                                      right, 1, cur->bc_ops->buf_ops);
899                 rval++;
900         }
901
902         return rval;
903 }
904
905 /*
906  * Read-ahead btree blocks, at the given level.
907  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
908  */
909 STATIC int
910 xfs_btree_readahead(
911         struct xfs_btree_cur    *cur,           /* btree cursor */
912         int                     lev,            /* level in btree */
913         int                     lr)             /* left/right bits */
914 {
915         struct xfs_btree_block  *block;
916
917         /*
918          * No readahead needed if we are at the root level and the
919          * btree root is stored in the inode.
920          */
921         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
922             (lev == cur->bc_nlevels - 1))
923                 return 0;
924
925         if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
926                 return 0;
927
928         cur->bc_ra[lev] |= lr;
929         block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
930
931         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
932                 return xfs_btree_readahead_lblock(cur, lr, block);
933         return xfs_btree_readahead_sblock(cur, lr, block);
934 }
935
936 STATIC int
937 xfs_btree_ptr_to_daddr(
938         struct xfs_btree_cur            *cur,
939         const union xfs_btree_ptr       *ptr,
940         xfs_daddr_t                     *daddr)
941 {
942         xfs_fsblock_t           fsbno;
943         xfs_agblock_t           agbno;
944         int                     error;
945
946         error = xfs_btree_check_ptr(cur, ptr, 0, 1);
947         if (error)
948                 return error;
949
950         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
951                 fsbno = be64_to_cpu(ptr->l);
952                 *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
953         } else {
954                 agbno = be32_to_cpu(ptr->s);
955                 *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
956                                 agbno);
957         }
958
959         return 0;
960 }
961
962 /*
963  * Readahead @count btree blocks at the given @ptr location.
964  *
965  * We don't need to care about long or short form btrees here as we have a
966  * method of converting the ptr directly to a daddr available to us.
967  */
968 STATIC void
969 xfs_btree_readahead_ptr(
970         struct xfs_btree_cur    *cur,
971         union xfs_btree_ptr     *ptr,
972         xfs_extlen_t            count)
973 {
974         xfs_daddr_t             daddr;
975
976         if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
977                 return;
978         xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
979                           cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
980 }
981
982 /*
983  * Set the buffer for level "lev" in the cursor to bp, releasing
984  * any previous buffer.
985  */
986 STATIC void
987 xfs_btree_setbuf(
988         xfs_btree_cur_t         *cur,   /* btree cursor */
989         int                     lev,    /* level in btree */
990         struct xfs_buf          *bp)    /* new buffer to set */
991 {
992         struct xfs_btree_block  *b;     /* btree block */
993
994         if (cur->bc_bufs[lev])
995                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
996         cur->bc_bufs[lev] = bp;
997         cur->bc_ra[lev] = 0;
998
999         b = XFS_BUF_TO_BLOCK(bp);
1000         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1001                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1002                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1003                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1004                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1005         } else {
1006                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1007                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1008                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1009                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1010         }
1011 }
1012
1013 bool
1014 xfs_btree_ptr_is_null(
1015         struct xfs_btree_cur            *cur,
1016         const union xfs_btree_ptr       *ptr)
1017 {
1018         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1019                 return ptr->l == cpu_to_be64(NULLFSBLOCK);
1020         else
1021                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
1022 }
1023
1024 void
1025 xfs_btree_set_ptr_null(
1026         struct xfs_btree_cur    *cur,
1027         union xfs_btree_ptr     *ptr)
1028 {
1029         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1030                 ptr->l = cpu_to_be64(NULLFSBLOCK);
1031         else
1032                 ptr->s = cpu_to_be32(NULLAGBLOCK);
1033 }
1034
1035 /*
1036  * Get/set/init sibling pointers
1037  */
1038 void
1039 xfs_btree_get_sibling(
1040         struct xfs_btree_cur    *cur,
1041         struct xfs_btree_block  *block,
1042         union xfs_btree_ptr     *ptr,
1043         int                     lr)
1044 {
1045         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1046
1047         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1048                 if (lr == XFS_BB_RIGHTSIB)
1049                         ptr->l = block->bb_u.l.bb_rightsib;
1050                 else
1051                         ptr->l = block->bb_u.l.bb_leftsib;
1052         } else {
1053                 if (lr == XFS_BB_RIGHTSIB)
1054                         ptr->s = block->bb_u.s.bb_rightsib;
1055                 else
1056                         ptr->s = block->bb_u.s.bb_leftsib;
1057         }
1058 }
1059
1060 void
1061 xfs_btree_set_sibling(
1062         struct xfs_btree_cur            *cur,
1063         struct xfs_btree_block          *block,
1064         const union xfs_btree_ptr       *ptr,
1065         int                             lr)
1066 {
1067         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1068
1069         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1070                 if (lr == XFS_BB_RIGHTSIB)
1071                         block->bb_u.l.bb_rightsib = ptr->l;
1072                 else
1073                         block->bb_u.l.bb_leftsib = ptr->l;
1074         } else {
1075                 if (lr == XFS_BB_RIGHTSIB)
1076                         block->bb_u.s.bb_rightsib = ptr->s;
1077                 else
1078                         block->bb_u.s.bb_leftsib = ptr->s;
1079         }
1080 }
1081
1082 void
1083 xfs_btree_init_block_int(
1084         struct xfs_mount        *mp,
1085         struct xfs_btree_block  *buf,
1086         xfs_daddr_t             blkno,
1087         xfs_btnum_t             btnum,
1088         __u16                   level,
1089         __u16                   numrecs,
1090         __u64                   owner,
1091         unsigned int            flags)
1092 {
1093         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
1094         __u32                   magic = xfs_btree_magic(crc, btnum);
1095
1096         buf->bb_magic = cpu_to_be32(magic);
1097         buf->bb_level = cpu_to_be16(level);
1098         buf->bb_numrecs = cpu_to_be16(numrecs);
1099
1100         if (flags & XFS_BTREE_LONG_PTRS) {
1101                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1102                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1103                 if (crc) {
1104                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1105                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1106                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1107                         buf->bb_u.l.bb_pad = 0;
1108                         buf->bb_u.l.bb_lsn = 0;
1109                 }
1110         } else {
1111                 /* owner is a 32 bit value on short blocks */
1112                 __u32 __owner = (__u32)owner;
1113
1114                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1115                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1116                 if (crc) {
1117                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1118                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1119                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1120                         buf->bb_u.s.bb_lsn = 0;
1121                 }
1122         }
1123 }
1124
1125 void
1126 xfs_btree_init_block(
1127         struct xfs_mount *mp,
1128         struct xfs_buf  *bp,
1129         xfs_btnum_t     btnum,
1130         __u16           level,
1131         __u16           numrecs,
1132         __u64           owner)
1133 {
1134         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1135                                  btnum, level, numrecs, owner, 0);
1136 }
1137
1138 void
1139 xfs_btree_init_block_cur(
1140         struct xfs_btree_cur    *cur,
1141         struct xfs_buf          *bp,
1142         int                     level,
1143         int                     numrecs)
1144 {
1145         __u64                   owner;
1146
1147         /*
1148          * we can pull the owner from the cursor right now as the different
1149          * owners align directly with the pointer size of the btree. This may
1150          * change in future, but is safe for current users of the generic btree
1151          * code.
1152          */
1153         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1154                 owner = cur->bc_ino.ip->i_ino;
1155         else
1156                 owner = cur->bc_ag.pag->pag_agno;
1157
1158         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1159                                  cur->bc_btnum, level, numrecs,
1160                                  owner, cur->bc_flags);
1161 }
1162
1163 /*
1164  * Return true if ptr is the last record in the btree and
1165  * we need to track updates to this record.  The decision
1166  * will be further refined in the update_lastrec method.
1167  */
1168 STATIC int
1169 xfs_btree_is_lastrec(
1170         struct xfs_btree_cur    *cur,
1171         struct xfs_btree_block  *block,
1172         int                     level)
1173 {
1174         union xfs_btree_ptr     ptr;
1175
1176         if (level > 0)
1177                 return 0;
1178         if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1179                 return 0;
1180
1181         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1182         if (!xfs_btree_ptr_is_null(cur, &ptr))
1183                 return 0;
1184         return 1;
1185 }
1186
1187 STATIC void
1188 xfs_btree_buf_to_ptr(
1189         struct xfs_btree_cur    *cur,
1190         struct xfs_buf          *bp,
1191         union xfs_btree_ptr     *ptr)
1192 {
1193         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1194                 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1195                                         XFS_BUF_ADDR(bp)));
1196         else {
1197                 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1198                                         XFS_BUF_ADDR(bp)));
1199         }
1200 }
1201
1202 STATIC void
1203 xfs_btree_set_refs(
1204         struct xfs_btree_cur    *cur,
1205         struct xfs_buf          *bp)
1206 {
1207         switch (cur->bc_btnum) {
1208         case XFS_BTNUM_BNO:
1209         case XFS_BTNUM_CNT:
1210                 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1211                 break;
1212         case XFS_BTNUM_INO:
1213         case XFS_BTNUM_FINO:
1214                 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1215                 break;
1216         case XFS_BTNUM_BMAP:
1217                 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1218                 break;
1219         case XFS_BTNUM_RMAP:
1220                 xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1221                 break;
1222         case XFS_BTNUM_REFC:
1223                 xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1224                 break;
1225         default:
1226                 ASSERT(0);
1227         }
1228 }
1229
1230 int
1231 xfs_btree_get_buf_block(
1232         struct xfs_btree_cur            *cur,
1233         const union xfs_btree_ptr       *ptr,
1234         struct xfs_btree_block          **block,
1235         struct xfs_buf                  **bpp)
1236 {
1237         struct xfs_mount        *mp = cur->bc_mp;
1238         xfs_daddr_t             d;
1239         int                     error;
1240
1241         error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1242         if (error)
1243                 return error;
1244         error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
1245                         0, bpp);
1246         if (error)
1247                 return error;
1248
1249         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1250         *block = XFS_BUF_TO_BLOCK(*bpp);
1251         return 0;
1252 }
1253
1254 /*
1255  * Read in the buffer at the given ptr and return the buffer and
1256  * the block pointer within the buffer.
1257  */
1258 STATIC int
1259 xfs_btree_read_buf_block(
1260         struct xfs_btree_cur            *cur,
1261         const union xfs_btree_ptr       *ptr,
1262         int                             flags,
1263         struct xfs_btree_block          **block,
1264         struct xfs_buf                  **bpp)
1265 {
1266         struct xfs_mount        *mp = cur->bc_mp;
1267         xfs_daddr_t             d;
1268         int                     error;
1269
1270         /* need to sort out how callers deal with failures first */
1271         ASSERT(!(flags & XBF_TRYLOCK));
1272
1273         error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1274         if (error)
1275                 return error;
1276         error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1277                                    mp->m_bsize, flags, bpp,
1278                                    cur->bc_ops->buf_ops);
1279         if (error)
1280                 return error;
1281
1282         xfs_btree_set_refs(cur, *bpp);
1283         *block = XFS_BUF_TO_BLOCK(*bpp);
1284         return 0;
1285 }
1286
1287 /*
1288  * Copy keys from one btree block to another.
1289  */
1290 void
1291 xfs_btree_copy_keys(
1292         struct xfs_btree_cur            *cur,
1293         union xfs_btree_key             *dst_key,
1294         const union xfs_btree_key       *src_key,
1295         int                             numkeys)
1296 {
1297         ASSERT(numkeys >= 0);
1298         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1299 }
1300
1301 /*
1302  * Copy records from one btree block to another.
1303  */
1304 STATIC void
1305 xfs_btree_copy_recs(
1306         struct xfs_btree_cur    *cur,
1307         union xfs_btree_rec     *dst_rec,
1308         union xfs_btree_rec     *src_rec,
1309         int                     numrecs)
1310 {
1311         ASSERT(numrecs >= 0);
1312         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1313 }
1314
1315 /*
1316  * Copy block pointers from one btree block to another.
1317  */
1318 void
1319 xfs_btree_copy_ptrs(
1320         struct xfs_btree_cur    *cur,
1321         union xfs_btree_ptr     *dst_ptr,
1322         const union xfs_btree_ptr *src_ptr,
1323         int                     numptrs)
1324 {
1325         ASSERT(numptrs >= 0);
1326         memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1327 }
1328
1329 /*
1330  * Shift keys one index left/right inside a single btree block.
1331  */
1332 STATIC void
1333 xfs_btree_shift_keys(
1334         struct xfs_btree_cur    *cur,
1335         union xfs_btree_key     *key,
1336         int                     dir,
1337         int                     numkeys)
1338 {
1339         char                    *dst_key;
1340
1341         ASSERT(numkeys >= 0);
1342         ASSERT(dir == 1 || dir == -1);
1343
1344         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1345         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1346 }
1347
1348 /*
1349  * Shift records one index left/right inside a single btree block.
1350  */
1351 STATIC void
1352 xfs_btree_shift_recs(
1353         struct xfs_btree_cur    *cur,
1354         union xfs_btree_rec     *rec,
1355         int                     dir,
1356         int                     numrecs)
1357 {
1358         char                    *dst_rec;
1359
1360         ASSERT(numrecs >= 0);
1361         ASSERT(dir == 1 || dir == -1);
1362
1363         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1364         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1365 }
1366
1367 /*
1368  * Shift block pointers one index left/right inside a single btree block.
1369  */
1370 STATIC void
1371 xfs_btree_shift_ptrs(
1372         struct xfs_btree_cur    *cur,
1373         union xfs_btree_ptr     *ptr,
1374         int                     dir,
1375         int                     numptrs)
1376 {
1377         char                    *dst_ptr;
1378
1379         ASSERT(numptrs >= 0);
1380         ASSERT(dir == 1 || dir == -1);
1381
1382         dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1383         memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1384 }
1385
1386 /*
1387  * Log key values from the btree block.
1388  */
1389 STATIC void
1390 xfs_btree_log_keys(
1391         struct xfs_btree_cur    *cur,
1392         struct xfs_buf          *bp,
1393         int                     first,
1394         int                     last)
1395 {
1396
1397         if (bp) {
1398                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1399                 xfs_trans_log_buf(cur->bc_tp, bp,
1400                                   xfs_btree_key_offset(cur, first),
1401                                   xfs_btree_key_offset(cur, last + 1) - 1);
1402         } else {
1403                 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1404                                 xfs_ilog_fbroot(cur->bc_ino.whichfork));
1405         }
1406 }
1407
1408 /*
1409  * Log record values from the btree block.
1410  */
1411 void
1412 xfs_btree_log_recs(
1413         struct xfs_btree_cur    *cur,
1414         struct xfs_buf          *bp,
1415         int                     first,
1416         int                     last)
1417 {
1418
1419         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1420         xfs_trans_log_buf(cur->bc_tp, bp,
1421                           xfs_btree_rec_offset(cur, first),
1422                           xfs_btree_rec_offset(cur, last + 1) - 1);
1423
1424 }
1425
1426 /*
1427  * Log block pointer fields from a btree block (nonleaf).
1428  */
1429 STATIC void
1430 xfs_btree_log_ptrs(
1431         struct xfs_btree_cur    *cur,   /* btree cursor */
1432         struct xfs_buf          *bp,    /* buffer containing btree block */
1433         int                     first,  /* index of first pointer to log */
1434         int                     last)   /* index of last pointer to log */
1435 {
1436
1437         if (bp) {
1438                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1439                 int                     level = xfs_btree_get_level(block);
1440
1441                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1442                 xfs_trans_log_buf(cur->bc_tp, bp,
1443                                 xfs_btree_ptr_offset(cur, first, level),
1444                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1445         } else {
1446                 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1447                         xfs_ilog_fbroot(cur->bc_ino.whichfork));
1448         }
1449
1450 }
1451
1452 /*
1453  * Log fields from a btree block header.
1454  */
1455 void
1456 xfs_btree_log_block(
1457         struct xfs_btree_cur    *cur,   /* btree cursor */
1458         struct xfs_buf          *bp,    /* buffer containing btree block */
1459         int                     fields) /* mask of fields: XFS_BB_... */
1460 {
1461         int                     first;  /* first byte offset logged */
1462         int                     last;   /* last byte offset logged */
1463         static const short      soffsets[] = {  /* table of offsets (short) */
1464                 offsetof(struct xfs_btree_block, bb_magic),
1465                 offsetof(struct xfs_btree_block, bb_level),
1466                 offsetof(struct xfs_btree_block, bb_numrecs),
1467                 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1468                 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1469                 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1470                 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1471                 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1472                 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1473                 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1474                 XFS_BTREE_SBLOCK_CRC_LEN
1475         };
1476         static const short      loffsets[] = {  /* table of offsets (long) */
1477                 offsetof(struct xfs_btree_block, bb_magic),
1478                 offsetof(struct xfs_btree_block, bb_level),
1479                 offsetof(struct xfs_btree_block, bb_numrecs),
1480                 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1481                 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1482                 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1483                 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1484                 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1485                 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1486                 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1487                 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1488                 XFS_BTREE_LBLOCK_CRC_LEN
1489         };
1490
1491         if (bp) {
1492                 int nbits;
1493
1494                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1495                         /*
1496                          * We don't log the CRC when updating a btree
1497                          * block but instead recreate it during log
1498                          * recovery.  As the log buffers have checksums
1499                          * of their own this is safe and avoids logging a crc
1500                          * update in a lot of places.
1501                          */
1502                         if (fields == XFS_BB_ALL_BITS)
1503                                 fields = XFS_BB_ALL_BITS_CRC;
1504                         nbits = XFS_BB_NUM_BITS_CRC;
1505                 } else {
1506                         nbits = XFS_BB_NUM_BITS;
1507                 }
1508                 xfs_btree_offsets(fields,
1509                                   (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1510                                         loffsets : soffsets,
1511                                   nbits, &first, &last);
1512                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1513                 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1514         } else {
1515                 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1516                         xfs_ilog_fbroot(cur->bc_ino.whichfork));
1517         }
1518 }
1519
1520 /*
1521  * Increment cursor by one record at the level.
1522  * For nonzero levels the leaf-ward information is untouched.
1523  */
1524 int                                             /* error */
1525 xfs_btree_increment(
1526         struct xfs_btree_cur    *cur,
1527         int                     level,
1528         int                     *stat)          /* success/failure */
1529 {
1530         struct xfs_btree_block  *block;
1531         union xfs_btree_ptr     ptr;
1532         struct xfs_buf          *bp;
1533         int                     error;          /* error return value */
1534         int                     lev;
1535
1536         ASSERT(level < cur->bc_nlevels);
1537
1538         /* Read-ahead to the right at this level. */
1539         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1540
1541         /* Get a pointer to the btree block. */
1542         block = xfs_btree_get_block(cur, level, &bp);
1543
1544 #ifdef DEBUG
1545         error = xfs_btree_check_block(cur, block, level, bp);
1546         if (error)
1547                 goto error0;
1548 #endif
1549
1550         /* We're done if we remain in the block after the increment. */
1551         if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1552                 goto out1;
1553
1554         /* Fail if we just went off the right edge of the tree. */
1555         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1556         if (xfs_btree_ptr_is_null(cur, &ptr))
1557                 goto out0;
1558
1559         XFS_BTREE_STATS_INC(cur, increment);
1560
1561         /*
1562          * March up the tree incrementing pointers.
1563          * Stop when we don't go off the right edge of a block.
1564          */
1565         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1566                 block = xfs_btree_get_block(cur, lev, &bp);
1567
1568 #ifdef DEBUG
1569                 error = xfs_btree_check_block(cur, block, lev, bp);
1570                 if (error)
1571                         goto error0;
1572 #endif
1573
1574                 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1575                         break;
1576
1577                 /* Read-ahead the right block for the next loop. */
1578                 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1579         }
1580
1581         /*
1582          * If we went off the root then we are either seriously
1583          * confused or have the tree root in an inode.
1584          */
1585         if (lev == cur->bc_nlevels) {
1586                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1587                         goto out0;
1588                 ASSERT(0);
1589                 error = -EFSCORRUPTED;
1590                 goto error0;
1591         }
1592         ASSERT(lev < cur->bc_nlevels);
1593
1594         /*
1595          * Now walk back down the tree, fixing up the cursor's buffer
1596          * pointers and key numbers.
1597          */
1598         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1599                 union xfs_btree_ptr     *ptrp;
1600
1601                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1602                 --lev;
1603                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1604                 if (error)
1605                         goto error0;
1606
1607                 xfs_btree_setbuf(cur, lev, bp);
1608                 cur->bc_ptrs[lev] = 1;
1609         }
1610 out1:
1611         *stat = 1;
1612         return 0;
1613
1614 out0:
1615         *stat = 0;
1616         return 0;
1617
1618 error0:
1619         return error;
1620 }
1621
1622 /*
1623  * Decrement cursor by one record at the level.
1624  * For nonzero levels the leaf-ward information is untouched.
1625  */
1626 int                                             /* error */
1627 xfs_btree_decrement(
1628         struct xfs_btree_cur    *cur,
1629         int                     level,
1630         int                     *stat)          /* success/failure */
1631 {
1632         struct xfs_btree_block  *block;
1633         struct xfs_buf          *bp;
1634         int                     error;          /* error return value */
1635         int                     lev;
1636         union xfs_btree_ptr     ptr;
1637
1638         ASSERT(level < cur->bc_nlevels);
1639
1640         /* Read-ahead to the left at this level. */
1641         xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1642
1643         /* We're done if we remain in the block after the decrement. */
1644         if (--cur->bc_ptrs[level] > 0)
1645                 goto out1;
1646
1647         /* Get a pointer to the btree block. */
1648         block = xfs_btree_get_block(cur, level, &bp);
1649
1650 #ifdef DEBUG
1651         error = xfs_btree_check_block(cur, block, level, bp);
1652         if (error)
1653                 goto error0;
1654 #endif
1655
1656         /* Fail if we just went off the left edge of the tree. */
1657         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1658         if (xfs_btree_ptr_is_null(cur, &ptr))
1659                 goto out0;
1660
1661         XFS_BTREE_STATS_INC(cur, decrement);
1662
1663         /*
1664          * March up the tree decrementing pointers.
1665          * Stop when we don't go off the left edge of a block.
1666          */
1667         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1668                 if (--cur->bc_ptrs[lev] > 0)
1669                         break;
1670                 /* Read-ahead the left block for the next loop. */
1671                 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1672         }
1673
1674         /*
1675          * If we went off the root then we are seriously confused.
1676          * or the root of the tree is in an inode.
1677          */
1678         if (lev == cur->bc_nlevels) {
1679                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1680                         goto out0;
1681                 ASSERT(0);
1682                 error = -EFSCORRUPTED;
1683                 goto error0;
1684         }
1685         ASSERT(lev < cur->bc_nlevels);
1686
1687         /*
1688          * Now walk back down the tree, fixing up the cursor's buffer
1689          * pointers and key numbers.
1690          */
1691         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1692                 union xfs_btree_ptr     *ptrp;
1693
1694                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1695                 --lev;
1696                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1697                 if (error)
1698                         goto error0;
1699                 xfs_btree_setbuf(cur, lev, bp);
1700                 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1701         }
1702 out1:
1703         *stat = 1;
1704         return 0;
1705
1706 out0:
1707         *stat = 0;
1708         return 0;
1709
1710 error0:
1711         return error;
1712 }
1713
1714 int
1715 xfs_btree_lookup_get_block(
1716         struct xfs_btree_cur            *cur,   /* btree cursor */
1717         int                             level,  /* level in the btree */
1718         const union xfs_btree_ptr       *pp,    /* ptr to btree block */
1719         struct xfs_btree_block          **blkp) /* return btree block */
1720 {
1721         struct xfs_buf          *bp;    /* buffer pointer for btree block */
1722         xfs_daddr_t             daddr;
1723         int                     error = 0;
1724
1725         /* special case the root block if in an inode */
1726         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1727             (level == cur->bc_nlevels - 1)) {
1728                 *blkp = xfs_btree_get_iroot(cur);
1729                 return 0;
1730         }
1731
1732         /*
1733          * If the old buffer at this level for the disk address we are
1734          * looking for re-use it.
1735          *
1736          * Otherwise throw it away and get a new one.
1737          */
1738         bp = cur->bc_bufs[level];
1739         error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1740         if (error)
1741                 return error;
1742         if (bp && XFS_BUF_ADDR(bp) == daddr) {
1743                 *blkp = XFS_BUF_TO_BLOCK(bp);
1744                 return 0;
1745         }
1746
1747         error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1748         if (error)
1749                 return error;
1750
1751         /* Check the inode owner since the verifiers don't. */
1752         if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1753             !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
1754             (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1755             be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1756                         cur->bc_ino.ip->i_ino)
1757                 goto out_bad;
1758
1759         /* Did we get the level we were looking for? */
1760         if (be16_to_cpu((*blkp)->bb_level) != level)
1761                 goto out_bad;
1762
1763         /* Check that internal nodes have at least one record. */
1764         if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1765                 goto out_bad;
1766
1767         xfs_btree_setbuf(cur, level, bp);
1768         return 0;
1769
1770 out_bad:
1771         *blkp = NULL;
1772         xfs_buf_mark_corrupt(bp);
1773         xfs_trans_brelse(cur->bc_tp, bp);
1774         return -EFSCORRUPTED;
1775 }
1776
1777 /*
1778  * Get current search key.  For level 0 we don't actually have a key
1779  * structure so we make one up from the record.  For all other levels
1780  * we just return the right key.
1781  */
1782 STATIC union xfs_btree_key *
1783 xfs_lookup_get_search_key(
1784         struct xfs_btree_cur    *cur,
1785         int                     level,
1786         int                     keyno,
1787         struct xfs_btree_block  *block,
1788         union xfs_btree_key     *kp)
1789 {
1790         if (level == 0) {
1791                 cur->bc_ops->init_key_from_rec(kp,
1792                                 xfs_btree_rec_addr(cur, keyno, block));
1793                 return kp;
1794         }
1795
1796         return xfs_btree_key_addr(cur, keyno, block);
1797 }
1798
1799 /*
1800  * Lookup the record.  The cursor is made to point to it, based on dir.
1801  * stat is set to 0 if can't find any such record, 1 for success.
1802  */
1803 int                                     /* error */
1804 xfs_btree_lookup(
1805         struct xfs_btree_cur    *cur,   /* btree cursor */
1806         xfs_lookup_t            dir,    /* <=, ==, or >= */
1807         int                     *stat)  /* success/failure */
1808 {
1809         struct xfs_btree_block  *block; /* current btree block */
1810         int64_t                 diff;   /* difference for the current key */
1811         int                     error;  /* error return value */
1812         int                     keyno;  /* current key number */
1813         int                     level;  /* level in the btree */
1814         union xfs_btree_ptr     *pp;    /* ptr to btree block */
1815         union xfs_btree_ptr     ptr;    /* ptr to btree block */
1816
1817         XFS_BTREE_STATS_INC(cur, lookup);
1818
1819         /* No such thing as a zero-level tree. */
1820         if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
1821                 return -EFSCORRUPTED;
1822
1823         block = NULL;
1824         keyno = 0;
1825
1826         /* initialise start pointer from cursor */
1827         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1828         pp = &ptr;
1829
1830         /*
1831          * Iterate over each level in the btree, starting at the root.
1832          * For each level above the leaves, find the key we need, based
1833          * on the lookup record, then follow the corresponding block
1834          * pointer down to the next level.
1835          */
1836         for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1837                 /* Get the block we need to do the lookup on. */
1838                 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1839                 if (error)
1840                         goto error0;
1841
1842                 if (diff == 0) {
1843                         /*
1844                          * If we already had a key match at a higher level, we
1845                          * know we need to use the first entry in this block.
1846                          */
1847                         keyno = 1;
1848                 } else {
1849                         /* Otherwise search this block. Do a binary search. */
1850
1851                         int     high;   /* high entry number */
1852                         int     low;    /* low entry number */
1853
1854                         /* Set low and high entry numbers, 1-based. */
1855                         low = 1;
1856                         high = xfs_btree_get_numrecs(block);
1857                         if (!high) {
1858                                 /* Block is empty, must be an empty leaf. */
1859                                 if (level != 0 || cur->bc_nlevels != 1) {
1860                                         XFS_CORRUPTION_ERROR(__func__,
1861                                                         XFS_ERRLEVEL_LOW,
1862                                                         cur->bc_mp, block,
1863                                                         sizeof(*block));
1864                                         return -EFSCORRUPTED;
1865                                 }
1866
1867                                 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1868                                 *stat = 0;
1869                                 return 0;
1870                         }
1871
1872                         /* Binary search the block. */
1873                         while (low <= high) {
1874                                 union xfs_btree_key     key;
1875                                 union xfs_btree_key     *kp;
1876
1877                                 XFS_BTREE_STATS_INC(cur, compare);
1878
1879                                 /* keyno is average of low and high. */
1880                                 keyno = (low + high) >> 1;
1881
1882                                 /* Get current search key */
1883                                 kp = xfs_lookup_get_search_key(cur, level,
1884                                                 keyno, block, &key);
1885
1886                                 /*
1887                                  * Compute difference to get next direction:
1888                                  *  - less than, move right
1889                                  *  - greater than, move left
1890                                  *  - equal, we're done
1891                                  */
1892                                 diff = cur->bc_ops->key_diff(cur, kp);
1893                                 if (diff < 0)
1894                                         low = keyno + 1;
1895                                 else if (diff > 0)
1896                                         high = keyno - 1;
1897                                 else
1898                                         break;
1899                         }
1900                 }
1901
1902                 /*
1903                  * If there are more levels, set up for the next level
1904                  * by getting the block number and filling in the cursor.
1905                  */
1906                 if (level > 0) {
1907                         /*
1908                          * If we moved left, need the previous key number,
1909                          * unless there isn't one.
1910                          */
1911                         if (diff > 0 && --keyno < 1)
1912                                 keyno = 1;
1913                         pp = xfs_btree_ptr_addr(cur, keyno, block);
1914
1915                         error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
1916                         if (error)
1917                                 goto error0;
1918
1919                         cur->bc_ptrs[level] = keyno;
1920                 }
1921         }
1922
1923         /* Done with the search. See if we need to adjust the results. */
1924         if (dir != XFS_LOOKUP_LE && diff < 0) {
1925                 keyno++;
1926                 /*
1927                  * If ge search and we went off the end of the block, but it's
1928                  * not the last block, we're in the wrong block.
1929                  */
1930                 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1931                 if (dir == XFS_LOOKUP_GE &&
1932                     keyno > xfs_btree_get_numrecs(block) &&
1933                     !xfs_btree_ptr_is_null(cur, &ptr)) {
1934                         int     i;
1935
1936                         cur->bc_ptrs[0] = keyno;
1937                         error = xfs_btree_increment(cur, 0, &i);
1938                         if (error)
1939                                 goto error0;
1940                         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1941                                 return -EFSCORRUPTED;
1942                         *stat = 1;
1943                         return 0;
1944                 }
1945         } else if (dir == XFS_LOOKUP_LE && diff > 0)
1946                 keyno--;
1947         cur->bc_ptrs[0] = keyno;
1948
1949         /* Return if we succeeded or not. */
1950         if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1951                 *stat = 0;
1952         else if (dir != XFS_LOOKUP_EQ || diff == 0)
1953                 *stat = 1;
1954         else
1955                 *stat = 0;
1956         return 0;
1957
1958 error0:
1959         return error;
1960 }
1961
1962 /* Find the high key storage area from a regular key. */
1963 union xfs_btree_key *
1964 xfs_btree_high_key_from_key(
1965         struct xfs_btree_cur    *cur,
1966         union xfs_btree_key     *key)
1967 {
1968         ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
1969         return (union xfs_btree_key *)((char *)key +
1970                         (cur->bc_ops->key_len / 2));
1971 }
1972
1973 /* Determine the low (and high if overlapped) keys of a leaf block */
1974 STATIC void
1975 xfs_btree_get_leaf_keys(
1976         struct xfs_btree_cur    *cur,
1977         struct xfs_btree_block  *block,
1978         union xfs_btree_key     *key)
1979 {
1980         union xfs_btree_key     max_hkey;
1981         union xfs_btree_key     hkey;
1982         union xfs_btree_rec     *rec;
1983         union xfs_btree_key     *high;
1984         int                     n;
1985
1986         rec = xfs_btree_rec_addr(cur, 1, block);
1987         cur->bc_ops->init_key_from_rec(key, rec);
1988
1989         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
1990
1991                 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
1992                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
1993                         rec = xfs_btree_rec_addr(cur, n, block);
1994                         cur->bc_ops->init_high_key_from_rec(&hkey, rec);
1995                         if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
1996                                         > 0)
1997                                 max_hkey = hkey;
1998                 }
1999
2000                 high = xfs_btree_high_key_from_key(cur, key);
2001                 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2002         }
2003 }
2004
2005 /* Determine the low (and high if overlapped) keys of a node block */
2006 STATIC void
2007 xfs_btree_get_node_keys(
2008         struct xfs_btree_cur    *cur,
2009         struct xfs_btree_block  *block,
2010         union xfs_btree_key     *key)
2011 {
2012         union xfs_btree_key     *hkey;
2013         union xfs_btree_key     *max_hkey;
2014         union xfs_btree_key     *high;
2015         int                     n;
2016
2017         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2018                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2019                                 cur->bc_ops->key_len / 2);
2020
2021                 max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2022                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2023                         hkey = xfs_btree_high_key_addr(cur, n, block);
2024                         if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2025                                 max_hkey = hkey;
2026                 }
2027
2028                 high = xfs_btree_high_key_from_key(cur, key);
2029                 memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2030         } else {
2031                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2032                                 cur->bc_ops->key_len);
2033         }
2034 }
2035
2036 /* Derive the keys for any btree block. */
2037 void
2038 xfs_btree_get_keys(
2039         struct xfs_btree_cur    *cur,
2040         struct xfs_btree_block  *block,
2041         union xfs_btree_key     *key)
2042 {
2043         if (be16_to_cpu(block->bb_level) == 0)
2044                 xfs_btree_get_leaf_keys(cur, block, key);
2045         else
2046                 xfs_btree_get_node_keys(cur, block, key);
2047 }
2048
2049 /*
2050  * Decide if we need to update the parent keys of a btree block.  For
2051  * a standard btree this is only necessary if we're updating the first
2052  * record/key.  For an overlapping btree, we must always update the
2053  * keys because the highest key can be in any of the records or keys
2054  * in the block.
2055  */
2056 static inline bool
2057 xfs_btree_needs_key_update(
2058         struct xfs_btree_cur    *cur,
2059         int                     ptr)
2060 {
2061         return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2062 }
2063
2064 /*
2065  * Update the low and high parent keys of the given level, progressing
2066  * towards the root.  If force_all is false, stop if the keys for a given
2067  * level do not need updating.
2068  */
2069 STATIC int
2070 __xfs_btree_updkeys(
2071         struct xfs_btree_cur    *cur,
2072         int                     level,
2073         struct xfs_btree_block  *block,
2074         struct xfs_buf          *bp0,
2075         bool                    force_all)
2076 {
2077         union xfs_btree_key     key;    /* keys from current level */
2078         union xfs_btree_key     *lkey;  /* keys from the next level up */
2079         union xfs_btree_key     *hkey;
2080         union xfs_btree_key     *nlkey; /* keys from the next level up */
2081         union xfs_btree_key     *nhkey;
2082         struct xfs_buf          *bp;
2083         int                     ptr;
2084
2085         ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2086
2087         /* Exit if there aren't any parent levels to update. */
2088         if (level + 1 >= cur->bc_nlevels)
2089                 return 0;
2090
2091         trace_xfs_btree_updkeys(cur, level, bp0);
2092
2093         lkey = &key;
2094         hkey = xfs_btree_high_key_from_key(cur, lkey);
2095         xfs_btree_get_keys(cur, block, lkey);
2096         for (level++; level < cur->bc_nlevels; level++) {
2097 #ifdef DEBUG
2098                 int             error;
2099 #endif
2100                 block = xfs_btree_get_block(cur, level, &bp);
2101                 trace_xfs_btree_updkeys(cur, level, bp);
2102 #ifdef DEBUG
2103                 error = xfs_btree_check_block(cur, block, level, bp);
2104                 if (error)
2105                         return error;
2106 #endif
2107                 ptr = cur->bc_ptrs[level];
2108                 nlkey = xfs_btree_key_addr(cur, ptr, block);
2109                 nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2110                 if (!force_all &&
2111                     !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2112                       cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2113                         break;
2114                 xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2115                 xfs_btree_log_keys(cur, bp, ptr, ptr);
2116                 if (level + 1 >= cur->bc_nlevels)
2117                         break;
2118                 xfs_btree_get_node_keys(cur, block, lkey);
2119         }
2120
2121         return 0;
2122 }
2123
2124 /* Update all the keys from some level in cursor back to the root. */
2125 STATIC int
2126 xfs_btree_updkeys_force(
2127         struct xfs_btree_cur    *cur,
2128         int                     level)
2129 {
2130         struct xfs_buf          *bp;
2131         struct xfs_btree_block  *block;
2132
2133         block = xfs_btree_get_block(cur, level, &bp);
2134         return __xfs_btree_updkeys(cur, level, block, bp, true);
2135 }
2136
2137 /*
2138  * Update the parent keys of the given level, progressing towards the root.
2139  */
2140 STATIC int
2141 xfs_btree_update_keys(
2142         struct xfs_btree_cur    *cur,
2143         int                     level)
2144 {
2145         struct xfs_btree_block  *block;
2146         struct xfs_buf          *bp;
2147         union xfs_btree_key     *kp;
2148         union xfs_btree_key     key;
2149         int                     ptr;
2150
2151         ASSERT(level >= 0);
2152
2153         block = xfs_btree_get_block(cur, level, &bp);
2154         if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2155                 return __xfs_btree_updkeys(cur, level, block, bp, false);
2156
2157         /*
2158          * Go up the tree from this level toward the root.
2159          * At each level, update the key value to the value input.
2160          * Stop when we reach a level where the cursor isn't pointing
2161          * at the first entry in the block.
2162          */
2163         xfs_btree_get_keys(cur, block, &key);
2164         for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2165 #ifdef DEBUG
2166                 int             error;
2167 #endif
2168                 block = xfs_btree_get_block(cur, level, &bp);
2169 #ifdef DEBUG
2170                 error = xfs_btree_check_block(cur, block, level, bp);
2171                 if (error)
2172                         return error;
2173 #endif
2174                 ptr = cur->bc_ptrs[level];
2175                 kp = xfs_btree_key_addr(cur, ptr, block);
2176                 xfs_btree_copy_keys(cur, kp, &key, 1);
2177                 xfs_btree_log_keys(cur, bp, ptr, ptr);
2178         }
2179
2180         return 0;
2181 }
2182
2183 /*
2184  * Update the record referred to by cur to the value in the
2185  * given record. This either works (return 0) or gets an
2186  * EFSCORRUPTED error.
2187  */
2188 int
2189 xfs_btree_update(
2190         struct xfs_btree_cur    *cur,
2191         union xfs_btree_rec     *rec)
2192 {
2193         struct xfs_btree_block  *block;
2194         struct xfs_buf          *bp;
2195         int                     error;
2196         int                     ptr;
2197         union xfs_btree_rec     *rp;
2198
2199         /* Pick up the current block. */
2200         block = xfs_btree_get_block(cur, 0, &bp);
2201
2202 #ifdef DEBUG
2203         error = xfs_btree_check_block(cur, block, 0, bp);
2204         if (error)
2205                 goto error0;
2206 #endif
2207         /* Get the address of the rec to be updated. */
2208         ptr = cur->bc_ptrs[0];
2209         rp = xfs_btree_rec_addr(cur, ptr, block);
2210
2211         /* Fill in the new contents and log them. */
2212         xfs_btree_copy_recs(cur, rp, rec, 1);
2213         xfs_btree_log_recs(cur, bp, ptr, ptr);
2214
2215         /*
2216          * If we are tracking the last record in the tree and
2217          * we are at the far right edge of the tree, update it.
2218          */
2219         if (xfs_btree_is_lastrec(cur, block, 0)) {
2220                 cur->bc_ops->update_lastrec(cur, block, rec,
2221                                             ptr, LASTREC_UPDATE);
2222         }
2223
2224         /* Pass new key value up to our parent. */
2225         if (xfs_btree_needs_key_update(cur, ptr)) {
2226                 error = xfs_btree_update_keys(cur, 0);
2227                 if (error)
2228                         goto error0;
2229         }
2230
2231         return 0;
2232
2233 error0:
2234         return error;
2235 }
2236
2237 /*
2238  * Move 1 record left from cur/level if possible.
2239  * Update cur to reflect the new path.
2240  */
2241 STATIC int                                      /* error */
2242 xfs_btree_lshift(
2243         struct xfs_btree_cur    *cur,
2244         int                     level,
2245         int                     *stat)          /* success/failure */
2246 {
2247         struct xfs_buf          *lbp;           /* left buffer pointer */
2248         struct xfs_btree_block  *left;          /* left btree block */
2249         int                     lrecs;          /* left record count */
2250         struct xfs_buf          *rbp;           /* right buffer pointer */
2251         struct xfs_btree_block  *right;         /* right btree block */
2252         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2253         int                     rrecs;          /* right record count */
2254         union xfs_btree_ptr     lptr;           /* left btree pointer */
2255         union xfs_btree_key     *rkp = NULL;    /* right btree key */
2256         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2257         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2258         int                     error;          /* error return value */
2259         int                     i;
2260
2261         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2262             level == cur->bc_nlevels - 1)
2263                 goto out0;
2264
2265         /* Set up variables for this block as "right". */
2266         right = xfs_btree_get_block(cur, level, &rbp);
2267
2268 #ifdef DEBUG
2269         error = xfs_btree_check_block(cur, right, level, rbp);
2270         if (error)
2271                 goto error0;
2272 #endif
2273
2274         /* If we've got no left sibling then we can't shift an entry left. */
2275         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2276         if (xfs_btree_ptr_is_null(cur, &lptr))
2277                 goto out0;
2278
2279         /*
2280          * If the cursor entry is the one that would be moved, don't
2281          * do it... it's too complicated.
2282          */
2283         if (cur->bc_ptrs[level] <= 1)
2284                 goto out0;
2285
2286         /* Set up the left neighbor as "left". */
2287         error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2288         if (error)
2289                 goto error0;
2290
2291         /* If it's full, it can't take another entry. */
2292         lrecs = xfs_btree_get_numrecs(left);
2293         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2294                 goto out0;
2295
2296         rrecs = xfs_btree_get_numrecs(right);
2297
2298         /*
2299          * We add one entry to the left side and remove one for the right side.
2300          * Account for it here, the changes will be updated on disk and logged
2301          * later.
2302          */
2303         lrecs++;
2304         rrecs--;
2305
2306         XFS_BTREE_STATS_INC(cur, lshift);
2307         XFS_BTREE_STATS_ADD(cur, moves, 1);
2308
2309         /*
2310          * If non-leaf, copy a key and a ptr to the left block.
2311          * Log the changes to the left block.
2312          */
2313         if (level > 0) {
2314                 /* It's a non-leaf.  Move keys and pointers. */
2315                 union xfs_btree_key     *lkp;   /* left btree key */
2316                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2317
2318                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2319                 rkp = xfs_btree_key_addr(cur, 1, right);
2320
2321                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2322                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2323
2324                 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2325                 if (error)
2326                         goto error0;
2327
2328                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2329                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2330
2331                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2332                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2333
2334                 ASSERT(cur->bc_ops->keys_inorder(cur,
2335                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2336         } else {
2337                 /* It's a leaf.  Move records.  */
2338                 union xfs_btree_rec     *lrp;   /* left record pointer */
2339
2340                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2341                 rrp = xfs_btree_rec_addr(cur, 1, right);
2342
2343                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2344                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2345
2346                 ASSERT(cur->bc_ops->recs_inorder(cur,
2347                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2348         }
2349
2350         xfs_btree_set_numrecs(left, lrecs);
2351         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2352
2353         xfs_btree_set_numrecs(right, rrecs);
2354         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2355
2356         /*
2357          * Slide the contents of right down one entry.
2358          */
2359         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2360         if (level > 0) {
2361                 /* It's a nonleaf. operate on keys and ptrs */
2362                 for (i = 0; i < rrecs; i++) {
2363                         error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2364                         if (error)
2365                                 goto error0;
2366                 }
2367
2368                 xfs_btree_shift_keys(cur,
2369                                 xfs_btree_key_addr(cur, 2, right),
2370                                 -1, rrecs);
2371                 xfs_btree_shift_ptrs(cur,
2372                                 xfs_btree_ptr_addr(cur, 2, right),
2373                                 -1, rrecs);
2374
2375                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2376                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2377         } else {
2378                 /* It's a leaf. operate on records */
2379                 xfs_btree_shift_recs(cur,
2380                         xfs_btree_rec_addr(cur, 2, right),
2381                         -1, rrecs);
2382                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2383         }
2384
2385         /*
2386          * Using a temporary cursor, update the parent key values of the
2387          * block on the left.
2388          */
2389         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2390                 error = xfs_btree_dup_cursor(cur, &tcur);
2391                 if (error)
2392                         goto error0;
2393                 i = xfs_btree_firstrec(tcur, level);
2394                 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2395                         error = -EFSCORRUPTED;
2396                         goto error0;
2397                 }
2398
2399                 error = xfs_btree_decrement(tcur, level, &i);
2400                 if (error)
2401                         goto error1;
2402
2403                 /* Update the parent high keys of the left block, if needed. */
2404                 error = xfs_btree_update_keys(tcur, level);
2405                 if (error)
2406                         goto error1;
2407
2408                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2409         }
2410
2411         /* Update the parent keys of the right block. */
2412         error = xfs_btree_update_keys(cur, level);
2413         if (error)
2414                 goto error0;
2415
2416         /* Slide the cursor value left one. */
2417         cur->bc_ptrs[level]--;
2418
2419         *stat = 1;
2420         return 0;
2421
2422 out0:
2423         *stat = 0;
2424         return 0;
2425
2426 error0:
2427         return error;
2428
2429 error1:
2430         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2431         return error;
2432 }
2433
2434 /*
2435  * Move 1 record right from cur/level if possible.
2436  * Update cur to reflect the new path.
2437  */
2438 STATIC int                                      /* error */
2439 xfs_btree_rshift(
2440         struct xfs_btree_cur    *cur,
2441         int                     level,
2442         int                     *stat)          /* success/failure */
2443 {
2444         struct xfs_buf          *lbp;           /* left buffer pointer */
2445         struct xfs_btree_block  *left;          /* left btree block */
2446         struct xfs_buf          *rbp;           /* right buffer pointer */
2447         struct xfs_btree_block  *right;         /* right btree block */
2448         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2449         union xfs_btree_ptr     rptr;           /* right block pointer */
2450         union xfs_btree_key     *rkp;           /* right btree key */
2451         int                     rrecs;          /* right record count */
2452         int                     lrecs;          /* left record count */
2453         int                     error;          /* error return value */
2454         int                     i;              /* loop counter */
2455
2456         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2457             (level == cur->bc_nlevels - 1))
2458                 goto out0;
2459
2460         /* Set up variables for this block as "left". */
2461         left = xfs_btree_get_block(cur, level, &lbp);
2462
2463 #ifdef DEBUG
2464         error = xfs_btree_check_block(cur, left, level, lbp);
2465         if (error)
2466                 goto error0;
2467 #endif
2468
2469         /* If we've got no right sibling then we can't shift an entry right. */
2470         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2471         if (xfs_btree_ptr_is_null(cur, &rptr))
2472                 goto out0;
2473
2474         /*
2475          * If the cursor entry is the one that would be moved, don't
2476          * do it... it's too complicated.
2477          */
2478         lrecs = xfs_btree_get_numrecs(left);
2479         if (cur->bc_ptrs[level] >= lrecs)
2480                 goto out0;
2481
2482         /* Set up the right neighbor as "right". */
2483         error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2484         if (error)
2485                 goto error0;
2486
2487         /* If it's full, it can't take another entry. */
2488         rrecs = xfs_btree_get_numrecs(right);
2489         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2490                 goto out0;
2491
2492         XFS_BTREE_STATS_INC(cur, rshift);
2493         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2494
2495         /*
2496          * Make a hole at the start of the right neighbor block, then
2497          * copy the last left block entry to the hole.
2498          */
2499         if (level > 0) {
2500                 /* It's a nonleaf. make a hole in the keys and ptrs */
2501                 union xfs_btree_key     *lkp;
2502                 union xfs_btree_ptr     *lpp;
2503                 union xfs_btree_ptr     *rpp;
2504
2505                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2506                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2507                 rkp = xfs_btree_key_addr(cur, 1, right);
2508                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2509
2510                 for (i = rrecs - 1; i >= 0; i--) {
2511                         error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2512                         if (error)
2513                                 goto error0;
2514                 }
2515
2516                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2517                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2518
2519                 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2520                 if (error)
2521                         goto error0;
2522
2523                 /* Now put the new data in, and log it. */
2524                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2525                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2526
2527                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2528                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2529
2530                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2531                         xfs_btree_key_addr(cur, 2, right)));
2532         } else {
2533                 /* It's a leaf. make a hole in the records */
2534                 union xfs_btree_rec     *lrp;
2535                 union xfs_btree_rec     *rrp;
2536
2537                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2538                 rrp = xfs_btree_rec_addr(cur, 1, right);
2539
2540                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2541
2542                 /* Now put the new data in, and log it. */
2543                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2544                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2545         }
2546
2547         /*
2548          * Decrement and log left's numrecs, bump and log right's numrecs.
2549          */
2550         xfs_btree_set_numrecs(left, --lrecs);
2551         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2552
2553         xfs_btree_set_numrecs(right, ++rrecs);
2554         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2555
2556         /*
2557          * Using a temporary cursor, update the parent key values of the
2558          * block on the right.
2559          */
2560         error = xfs_btree_dup_cursor(cur, &tcur);
2561         if (error)
2562                 goto error0;
2563         i = xfs_btree_lastrec(tcur, level);
2564         if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2565                 error = -EFSCORRUPTED;
2566                 goto error0;
2567         }
2568
2569         error = xfs_btree_increment(tcur, level, &i);
2570         if (error)
2571                 goto error1;
2572
2573         /* Update the parent high keys of the left block, if needed. */
2574         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2575                 error = xfs_btree_update_keys(cur, level);
2576                 if (error)
2577                         goto error1;
2578         }
2579
2580         /* Update the parent keys of the right block. */
2581         error = xfs_btree_update_keys(tcur, level);
2582         if (error)
2583                 goto error1;
2584
2585         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2586
2587         *stat = 1;
2588         return 0;
2589
2590 out0:
2591         *stat = 0;
2592         return 0;
2593
2594 error0:
2595         return error;
2596
2597 error1:
2598         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2599         return error;
2600 }
2601
2602 /*
2603  * Split cur/level block in half.
2604  * Return new block number and the key to its first
2605  * record (to be inserted into parent).
2606  */
2607 STATIC int                                      /* error */
2608 __xfs_btree_split(
2609         struct xfs_btree_cur    *cur,
2610         int                     level,
2611         union xfs_btree_ptr     *ptrp,
2612         union xfs_btree_key     *key,
2613         struct xfs_btree_cur    **curp,
2614         int                     *stat)          /* success/failure */
2615 {
2616         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2617         struct xfs_buf          *lbp;           /* left buffer pointer */
2618         struct xfs_btree_block  *left;          /* left btree block */
2619         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2620         struct xfs_buf          *rbp;           /* right buffer pointer */
2621         struct xfs_btree_block  *right;         /* right btree block */
2622         union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2623         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2624         struct xfs_btree_block  *rrblock;       /* right-right btree block */
2625         int                     lrecs;
2626         int                     rrecs;
2627         int                     src_index;
2628         int                     error;          /* error return value */
2629         int                     i;
2630
2631         XFS_BTREE_STATS_INC(cur, split);
2632
2633         /* Set up left block (current one). */
2634         left = xfs_btree_get_block(cur, level, &lbp);
2635
2636 #ifdef DEBUG
2637         error = xfs_btree_check_block(cur, left, level, lbp);
2638         if (error)
2639                 goto error0;
2640 #endif
2641
2642         xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2643
2644         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2645         error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2646         if (error)
2647                 goto error0;
2648         if (*stat == 0)
2649                 goto out0;
2650         XFS_BTREE_STATS_INC(cur, alloc);
2651
2652         /* Set up the new block as "right". */
2653         error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2654         if (error)
2655                 goto error0;
2656
2657         /* Fill in the btree header for the new right block. */
2658         xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2659
2660         /*
2661          * Split the entries between the old and the new block evenly.
2662          * Make sure that if there's an odd number of entries now, that
2663          * each new block will have the same number of entries.
2664          */
2665         lrecs = xfs_btree_get_numrecs(left);
2666         rrecs = lrecs / 2;
2667         if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2668                 rrecs++;
2669         src_index = (lrecs - rrecs + 1);
2670
2671         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2672
2673         /* Adjust numrecs for the later get_*_keys() calls. */
2674         lrecs -= rrecs;
2675         xfs_btree_set_numrecs(left, lrecs);
2676         xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2677
2678         /*
2679          * Copy btree block entries from the left block over to the
2680          * new block, the right. Update the right block and log the
2681          * changes.
2682          */
2683         if (level > 0) {
2684                 /* It's a non-leaf.  Move keys and pointers. */
2685                 union xfs_btree_key     *lkp;   /* left btree key */
2686                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2687                 union xfs_btree_key     *rkp;   /* right btree key */
2688                 union xfs_btree_ptr     *rpp;   /* right address pointer */
2689
2690                 lkp = xfs_btree_key_addr(cur, src_index, left);
2691                 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2692                 rkp = xfs_btree_key_addr(cur, 1, right);
2693                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2694
2695                 for (i = src_index; i < rrecs; i++) {
2696                         error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2697                         if (error)
2698                                 goto error0;
2699                 }
2700
2701                 /* Copy the keys & pointers to the new block. */
2702                 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2703                 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2704
2705                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2706                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2707
2708                 /* Stash the keys of the new block for later insertion. */
2709                 xfs_btree_get_node_keys(cur, right, key);
2710         } else {
2711                 /* It's a leaf.  Move records.  */
2712                 union xfs_btree_rec     *lrp;   /* left record pointer */
2713                 union xfs_btree_rec     *rrp;   /* right record pointer */
2714
2715                 lrp = xfs_btree_rec_addr(cur, src_index, left);
2716                 rrp = xfs_btree_rec_addr(cur, 1, right);
2717
2718                 /* Copy records to the new block. */
2719                 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2720                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2721
2722                 /* Stash the keys of the new block for later insertion. */
2723                 xfs_btree_get_leaf_keys(cur, right, key);
2724         }
2725
2726         /*
2727          * Find the left block number by looking in the buffer.
2728          * Adjust sibling pointers.
2729          */
2730         xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2731         xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2732         xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2733         xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2734
2735         xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2736         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2737
2738         /*
2739          * If there's a block to the new block's right, make that block
2740          * point back to right instead of to left.
2741          */
2742         if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2743                 error = xfs_btree_read_buf_block(cur, &rrptr,
2744                                                         0, &rrblock, &rrbp);
2745                 if (error)
2746                         goto error0;
2747                 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2748                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2749         }
2750
2751         /* Update the parent high keys of the left block, if needed. */
2752         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2753                 error = xfs_btree_update_keys(cur, level);
2754                 if (error)
2755                         goto error0;
2756         }
2757
2758         /*
2759          * If the cursor is really in the right block, move it there.
2760          * If it's just pointing past the last entry in left, then we'll
2761          * insert there, so don't change anything in that case.
2762          */
2763         if (cur->bc_ptrs[level] > lrecs + 1) {
2764                 xfs_btree_setbuf(cur, level, rbp);
2765                 cur->bc_ptrs[level] -= lrecs;
2766         }
2767         /*
2768          * If there are more levels, we'll need another cursor which refers
2769          * the right block, no matter where this cursor was.
2770          */
2771         if (level + 1 < cur->bc_nlevels) {
2772                 error = xfs_btree_dup_cursor(cur, curp);
2773                 if (error)
2774                         goto error0;
2775                 (*curp)->bc_ptrs[level + 1]++;
2776         }
2777         *ptrp = rptr;
2778         *stat = 1;
2779         return 0;
2780 out0:
2781         *stat = 0;
2782         return 0;
2783
2784 error0:
2785         return error;
2786 }
2787
2788 struct xfs_btree_split_args {
2789         struct xfs_btree_cur    *cur;
2790         int                     level;
2791         union xfs_btree_ptr     *ptrp;
2792         union xfs_btree_key     *key;
2793         struct xfs_btree_cur    **curp;
2794         int                     *stat;          /* success/failure */
2795         int                     result;
2796         bool                    kswapd; /* allocation in kswapd context */
2797         struct completion       *done;
2798         struct work_struct      work;
2799 };
2800
2801 /*
2802  * Stack switching interfaces for allocation
2803  */
2804 static void
2805 xfs_btree_split_worker(
2806         struct work_struct      *work)
2807 {
2808         struct xfs_btree_split_args     *args = container_of(work,
2809                                                 struct xfs_btree_split_args, work);
2810         unsigned long           pflags;
2811         unsigned long           new_pflags = 0;
2812
2813         /*
2814          * we are in a transaction context here, but may also be doing work
2815          * in kswapd context, and hence we may need to inherit that state
2816          * temporarily to ensure that we don't block waiting for memory reclaim
2817          * in any way.
2818          */
2819         if (args->kswapd)
2820                 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2821
2822         current_set_flags_nested(&pflags, new_pflags);
2823         xfs_trans_set_context(args->cur->bc_tp);
2824
2825         args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2826                                          args->key, args->curp, args->stat);
2827
2828         xfs_trans_clear_context(args->cur->bc_tp);
2829         current_restore_flags_nested(&pflags, new_pflags);
2830
2831         /*
2832          * Do not access args after complete() has run here. We don't own args
2833          * and the owner may run and free args before we return here.
2834          */
2835         complete(args->done);
2836
2837 }
2838
2839 /*
2840  * BMBT split requests often come in with little stack to work on. Push
2841  * them off to a worker thread so there is lots of stack to use. For the other
2842  * btree types, just call directly to avoid the context switch overhead here.
2843  */
2844 STATIC int                                      /* error */
2845 xfs_btree_split(
2846         struct xfs_btree_cur    *cur,
2847         int                     level,
2848         union xfs_btree_ptr     *ptrp,
2849         union xfs_btree_key     *key,
2850         struct xfs_btree_cur    **curp,
2851         int                     *stat)          /* success/failure */
2852 {
2853         struct xfs_btree_split_args     args;
2854         DECLARE_COMPLETION_ONSTACK(done);
2855
2856         if (cur->bc_btnum != XFS_BTNUM_BMAP)
2857                 return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2858
2859         args.cur = cur;
2860         args.level = level;
2861         args.ptrp = ptrp;
2862         args.key = key;
2863         args.curp = curp;
2864         args.stat = stat;
2865         args.done = &done;
2866         args.kswapd = current_is_kswapd();
2867         INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2868         queue_work(xfs_alloc_wq, &args.work);
2869         wait_for_completion(&done);
2870         destroy_work_on_stack(&args.work);
2871         return args.result;
2872 }
2873
2874
2875 /*
2876  * Copy the old inode root contents into a real block and make the
2877  * broot point to it.
2878  */
2879 int                                             /* error */
2880 xfs_btree_new_iroot(
2881         struct xfs_btree_cur    *cur,           /* btree cursor */
2882         int                     *logflags,      /* logging flags for inode */
2883         int                     *stat)          /* return status - 0 fail */
2884 {
2885         struct xfs_buf          *cbp;           /* buffer for cblock */
2886         struct xfs_btree_block  *block;         /* btree block */
2887         struct xfs_btree_block  *cblock;        /* child btree block */
2888         union xfs_btree_key     *ckp;           /* child key pointer */
2889         union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2890         union xfs_btree_key     *kp;            /* pointer to btree key */
2891         union xfs_btree_ptr     *pp;            /* pointer to block addr */
2892         union xfs_btree_ptr     nptr;           /* new block addr */
2893         int                     level;          /* btree level */
2894         int                     error;          /* error return code */
2895         int                     i;              /* loop counter */
2896
2897         XFS_BTREE_STATS_INC(cur, newroot);
2898
2899         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2900
2901         level = cur->bc_nlevels - 1;
2902
2903         block = xfs_btree_get_iroot(cur);
2904         pp = xfs_btree_ptr_addr(cur, 1, block);
2905
2906         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2907         error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2908         if (error)
2909                 goto error0;
2910         if (*stat == 0)
2911                 return 0;
2912
2913         XFS_BTREE_STATS_INC(cur, alloc);
2914
2915         /* Copy the root into a real block. */
2916         error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
2917         if (error)
2918                 goto error0;
2919
2920         /*
2921          * we can't just memcpy() the root in for CRC enabled btree blocks.
2922          * In that case have to also ensure the blkno remains correct
2923          */
2924         memcpy(cblock, block, xfs_btree_block_len(cur));
2925         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2926                 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2927                         cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2928                 else
2929                         cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2930         }
2931
2932         be16_add_cpu(&block->bb_level, 1);
2933         xfs_btree_set_numrecs(block, 1);
2934         cur->bc_nlevels++;
2935         cur->bc_ptrs[level + 1] = 1;
2936
2937         kp = xfs_btree_key_addr(cur, 1, block);
2938         ckp = xfs_btree_key_addr(cur, 1, cblock);
2939         xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2940
2941         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2942         for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2943                 error = xfs_btree_debug_check_ptr(cur, pp, i, level);
2944                 if (error)
2945                         goto error0;
2946         }
2947
2948         xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2949
2950         error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
2951         if (error)
2952                 goto error0;
2953
2954         xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2955
2956         xfs_iroot_realloc(cur->bc_ino.ip,
2957                           1 - xfs_btree_get_numrecs(cblock),
2958                           cur->bc_ino.whichfork);
2959
2960         xfs_btree_setbuf(cur, level, cbp);
2961
2962         /*
2963          * Do all this logging at the end so that
2964          * the root is at the right level.
2965          */
2966         xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2967         xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2968         xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2969
2970         *logflags |=
2971                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
2972         *stat = 1;
2973         return 0;
2974 error0:
2975         return error;
2976 }
2977
2978 /*
2979  * Allocate a new root block, fill it in.
2980  */
2981 STATIC int                              /* error */
2982 xfs_btree_new_root(
2983         struct xfs_btree_cur    *cur,   /* btree cursor */
2984         int                     *stat)  /* success/failure */
2985 {
2986         struct xfs_btree_block  *block; /* one half of the old root block */
2987         struct xfs_buf          *bp;    /* buffer containing block */
2988         int                     error;  /* error return value */
2989         struct xfs_buf          *lbp;   /* left buffer pointer */
2990         struct xfs_btree_block  *left;  /* left btree block */
2991         struct xfs_buf          *nbp;   /* new (root) buffer */
2992         struct xfs_btree_block  *new;   /* new (root) btree block */
2993         int                     nptr;   /* new value for key index, 1 or 2 */
2994         struct xfs_buf          *rbp;   /* right buffer pointer */
2995         struct xfs_btree_block  *right; /* right btree block */
2996         union xfs_btree_ptr     rptr;
2997         union xfs_btree_ptr     lptr;
2998
2999         XFS_BTREE_STATS_INC(cur, newroot);
3000
3001         /* initialise our start point from the cursor */
3002         cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3003
3004         /* Allocate the new block. If we can't do it, we're toast. Give up. */
3005         error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3006         if (error)
3007                 goto error0;
3008         if (*stat == 0)
3009                 goto out0;
3010         XFS_BTREE_STATS_INC(cur, alloc);
3011
3012         /* Set up the new block. */
3013         error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3014         if (error)
3015                 goto error0;
3016
3017         /* Set the root in the holding structure  increasing the level by 1. */
3018         cur->bc_ops->set_root(cur, &lptr, 1);
3019
3020         /*
3021          * At the previous root level there are now two blocks: the old root,
3022          * and the new block generated when it was split.  We don't know which
3023          * one the cursor is pointing at, so we set up variables "left" and
3024          * "right" for each case.
3025          */
3026         block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3027
3028 #ifdef DEBUG
3029         error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3030         if (error)
3031                 goto error0;
3032 #endif
3033
3034         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3035         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3036                 /* Our block is left, pick up the right block. */
3037                 lbp = bp;
3038                 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3039                 left = block;
3040                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3041                 if (error)
3042                         goto error0;
3043                 bp = rbp;
3044                 nptr = 1;
3045         } else {
3046                 /* Our block is right, pick up the left block. */
3047                 rbp = bp;
3048                 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3049                 right = block;
3050                 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3051                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3052                 if (error)
3053                         goto error0;
3054                 bp = lbp;
3055                 nptr = 2;
3056         }
3057
3058         /* Fill in the new block's btree header and log it. */
3059         xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3060         xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3061         ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3062                         !xfs_btree_ptr_is_null(cur, &rptr));
3063
3064         /* Fill in the key data in the new root. */
3065         if (xfs_btree_get_level(left) > 0) {
3066                 /*
3067                  * Get the keys for the left block's keys and put them directly
3068                  * in the parent block.  Do the same for the right block.
3069                  */
3070                 xfs_btree_get_node_keys(cur, left,
3071                                 xfs_btree_key_addr(cur, 1, new));
3072                 xfs_btree_get_node_keys(cur, right,
3073                                 xfs_btree_key_addr(cur, 2, new));
3074         } else {
3075                 /*
3076                  * Get the keys for the left block's records and put them
3077                  * directly in the parent block.  Do the same for the right
3078                  * block.
3079                  */
3080                 xfs_btree_get_leaf_keys(cur, left,
3081                         xfs_btree_key_addr(cur, 1, new));
3082                 xfs_btree_get_leaf_keys(cur, right,
3083                         xfs_btree_key_addr(cur, 2, new));
3084         }
3085         xfs_btree_log_keys(cur, nbp, 1, 2);
3086
3087         /* Fill in the pointer data in the new root. */
3088         xfs_btree_copy_ptrs(cur,
3089                 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3090         xfs_btree_copy_ptrs(cur,
3091                 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3092         xfs_btree_log_ptrs(cur, nbp, 1, 2);
3093
3094         /* Fix up the cursor. */
3095         xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3096         cur->bc_ptrs[cur->bc_nlevels] = nptr;
3097         cur->bc_nlevels++;
3098         *stat = 1;
3099         return 0;
3100 error0:
3101         return error;
3102 out0:
3103         *stat = 0;
3104         return 0;
3105 }
3106
3107 STATIC int
3108 xfs_btree_make_block_unfull(
3109         struct xfs_btree_cur    *cur,   /* btree cursor */
3110         int                     level,  /* btree level */
3111         int                     numrecs,/* # of recs in block */
3112         int                     *oindex,/* old tree index */
3113         int                     *index, /* new tree index */
3114         union xfs_btree_ptr     *nptr,  /* new btree ptr */
3115         struct xfs_btree_cur    **ncur, /* new btree cursor */
3116         union xfs_btree_key     *key,   /* key of new block */
3117         int                     *stat)
3118 {
3119         int                     error = 0;
3120
3121         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3122             level == cur->bc_nlevels - 1) {
3123                 struct xfs_inode *ip = cur->bc_ino.ip;
3124
3125                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3126                         /* A root block that can be made bigger. */
3127                         xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
3128                         *stat = 1;
3129                 } else {
3130                         /* A root block that needs replacing */
3131                         int     logflags = 0;
3132
3133                         error = xfs_btree_new_iroot(cur, &logflags, stat);
3134                         if (error || *stat == 0)
3135                                 return error;
3136
3137                         xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3138                 }
3139
3140                 return 0;
3141         }
3142
3143         /* First, try shifting an entry to the right neighbor. */
3144         error = xfs_btree_rshift(cur, level, stat);
3145         if (error || *stat)
3146                 return error;
3147
3148         /* Next, try shifting an entry to the left neighbor. */
3149         error = xfs_btree_lshift(cur, level, stat);
3150         if (error)
3151                 return error;
3152
3153         if (*stat) {
3154                 *oindex = *index = cur->bc_ptrs[level];
3155                 return 0;
3156         }
3157
3158         /*
3159          * Next, try splitting the current block in half.
3160          *
3161          * If this works we have to re-set our variables because we
3162          * could be in a different block now.
3163          */
3164         error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3165         if (error || *stat == 0)
3166                 return error;
3167
3168
3169         *index = cur->bc_ptrs[level];
3170         return 0;
3171 }
3172
3173 /*
3174  * Insert one record/level.  Return information to the caller
3175  * allowing the next level up to proceed if necessary.
3176  */
3177 STATIC int
3178 xfs_btree_insrec(
3179         struct xfs_btree_cur    *cur,   /* btree cursor */
3180         int                     level,  /* level to insert record at */
3181         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3182         union xfs_btree_rec     *rec,   /* record to insert */
3183         union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3184         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3185         int                     *stat)  /* success/failure */
3186 {
3187         struct xfs_btree_block  *block; /* btree block */
3188         struct xfs_buf          *bp;    /* buffer for block */
3189         union xfs_btree_ptr     nptr;   /* new block ptr */
3190         struct xfs_btree_cur    *ncur;  /* new btree cursor */
3191         union xfs_btree_key     nkey;   /* new block key */
3192         union xfs_btree_key     *lkey;
3193         int                     optr;   /* old key/record index */
3194         int                     ptr;    /* key/record index */
3195         int                     numrecs;/* number of records */
3196         int                     error;  /* error return value */
3197         int                     i;
3198         xfs_daddr_t             old_bn;
3199
3200         ncur = NULL;
3201         lkey = &nkey;
3202
3203         /*
3204          * If we have an external root pointer, and we've made it to the
3205          * root level, allocate a new root block and we're done.
3206          */
3207         if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3208             (level >= cur->bc_nlevels)) {
3209                 error = xfs_btree_new_root(cur, stat);
3210                 xfs_btree_set_ptr_null(cur, ptrp);
3211
3212                 return error;
3213         }
3214
3215         /* If we're off the left edge, return failure. */
3216         ptr = cur->bc_ptrs[level];
3217         if (ptr == 0) {
3218                 *stat = 0;
3219                 return 0;
3220         }
3221
3222         optr = ptr;
3223
3224         XFS_BTREE_STATS_INC(cur, insrec);
3225
3226         /* Get pointers to the btree buffer and block. */
3227         block = xfs_btree_get_block(cur, level, &bp);
3228         old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3229         numrecs = xfs_btree_get_numrecs(block);
3230
3231 #ifdef DEBUG
3232         error = xfs_btree_check_block(cur, block, level, bp);
3233         if (error)
3234                 goto error0;
3235
3236         /* Check that the new entry is being inserted in the right place. */
3237         if (ptr <= numrecs) {
3238                 if (level == 0) {
3239                         ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3240                                 xfs_btree_rec_addr(cur, ptr, block)));
3241                 } else {
3242                         ASSERT(cur->bc_ops->keys_inorder(cur, key,
3243                                 xfs_btree_key_addr(cur, ptr, block)));
3244                 }
3245         }
3246 #endif
3247
3248         /*
3249          * If the block is full, we can't insert the new entry until we
3250          * make the block un-full.
3251          */
3252         xfs_btree_set_ptr_null(cur, &nptr);
3253         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3254                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
3255                                         &optr, &ptr, &nptr, &ncur, lkey, stat);
3256                 if (error || *stat == 0)
3257                         goto error0;
3258         }
3259
3260         /*
3261          * The current block may have changed if the block was
3262          * previously full and we have just made space in it.
3263          */
3264         block = xfs_btree_get_block(cur, level, &bp);
3265         numrecs = xfs_btree_get_numrecs(block);
3266
3267 #ifdef DEBUG
3268         error = xfs_btree_check_block(cur, block, level, bp);
3269         if (error)
3270                 return error;
3271 #endif
3272
3273         /*
3274          * At this point we know there's room for our new entry in the block
3275          * we're pointing at.
3276          */
3277         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3278
3279         if (level > 0) {
3280                 /* It's a nonleaf. make a hole in the keys and ptrs */
3281                 union xfs_btree_key     *kp;
3282                 union xfs_btree_ptr     *pp;
3283
3284                 kp = xfs_btree_key_addr(cur, ptr, block);
3285                 pp = xfs_btree_ptr_addr(cur, ptr, block);
3286
3287                 for (i = numrecs - ptr; i >= 0; i--) {
3288                         error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3289                         if (error)
3290                                 return error;
3291                 }
3292
3293                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3294                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3295
3296                 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3297                 if (error)
3298                         goto error0;
3299
3300                 /* Now put the new data in, bump numrecs and log it. */
3301                 xfs_btree_copy_keys(cur, kp, key, 1);
3302                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3303                 numrecs++;
3304                 xfs_btree_set_numrecs(block, numrecs);
3305                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3306                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
3307 #ifdef DEBUG
3308                 if (ptr < numrecs) {
3309                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3310                                 xfs_btree_key_addr(cur, ptr + 1, block)));
3311                 }
3312 #endif
3313         } else {
3314                 /* It's a leaf. make a hole in the records */
3315                 union xfs_btree_rec             *rp;
3316
3317                 rp = xfs_btree_rec_addr(cur, ptr, block);
3318
3319                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3320
3321                 /* Now put the new data in, bump numrecs and log it. */
3322                 xfs_btree_copy_recs(cur, rp, rec, 1);
3323                 xfs_btree_set_numrecs(block, ++numrecs);
3324                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
3325 #ifdef DEBUG
3326                 if (ptr < numrecs) {
3327                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3328                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
3329                 }
3330 #endif
3331         }
3332
3333         /* Log the new number of records in the btree header. */
3334         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3335
3336         /*
3337          * If we just inserted into a new tree block, we have to
3338          * recalculate nkey here because nkey is out of date.
3339          *
3340          * Otherwise we're just updating an existing block (having shoved
3341          * some records into the new tree block), so use the regular key
3342          * update mechanism.
3343          */
3344         if (bp && bp->b_bn != old_bn) {
3345                 xfs_btree_get_keys(cur, block, lkey);
3346         } else if (xfs_btree_needs_key_update(cur, optr)) {
3347                 error = xfs_btree_update_keys(cur, level);
3348                 if (error)
3349                         goto error0;
3350         }
3351
3352         /*
3353          * If we are tracking the last record in the tree and
3354          * we are at the far right edge of the tree, update it.
3355          */
3356         if (xfs_btree_is_lastrec(cur, block, level)) {
3357                 cur->bc_ops->update_lastrec(cur, block, rec,
3358                                             ptr, LASTREC_INSREC);
3359         }
3360
3361         /*
3362          * Return the new block number, if any.
3363          * If there is one, give back a record value and a cursor too.
3364          */
3365         *ptrp = nptr;
3366         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3367                 xfs_btree_copy_keys(cur, key, lkey, 1);
3368                 *curp = ncur;
3369         }
3370
3371         *stat = 1;
3372         return 0;
3373
3374 error0:
3375         return error;
3376 }
3377
3378 /*
3379  * Insert the record at the point referenced by cur.
3380  *
3381  * A multi-level split of the tree on insert will invalidate the original
3382  * cursor.  All callers of this function should assume that the cursor is
3383  * no longer valid and revalidate it.
3384  */
3385 int
3386 xfs_btree_insert(
3387         struct xfs_btree_cur    *cur,
3388         int                     *stat)
3389 {
3390         int                     error;  /* error return value */
3391         int                     i;      /* result value, 0 for failure */
3392         int                     level;  /* current level number in btree */
3393         union xfs_btree_ptr     nptr;   /* new block number (split result) */
3394         struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3395         struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3396         union xfs_btree_key     bkey;   /* key of block to insert */
3397         union xfs_btree_key     *key;
3398         union xfs_btree_rec     rec;    /* record to insert */
3399
3400         level = 0;
3401         ncur = NULL;
3402         pcur = cur;
3403         key = &bkey;
3404
3405         xfs_btree_set_ptr_null(cur, &nptr);
3406
3407         /* Make a key out of the record data to be inserted, and save it. */
3408         cur->bc_ops->init_rec_from_cur(cur, &rec);
3409         cur->bc_ops->init_key_from_rec(key, &rec);
3410
3411         /*
3412          * Loop going up the tree, starting at the leaf level.
3413          * Stop when we don't get a split block, that must mean that
3414          * the insert is finished with this level.
3415          */
3416         do {
3417                 /*
3418                  * Insert nrec/nptr into this level of the tree.
3419                  * Note if we fail, nptr will be null.
3420                  */
3421                 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3422                                 &ncur, &i);
3423                 if (error) {
3424                         if (pcur != cur)
3425                                 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3426                         goto error0;
3427                 }
3428
3429                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3430                         error = -EFSCORRUPTED;
3431                         goto error0;
3432                 }
3433                 level++;
3434
3435                 /*
3436                  * See if the cursor we just used is trash.
3437                  * Can't trash the caller's cursor, but otherwise we should
3438                  * if ncur is a new cursor or we're about to be done.
3439                  */
3440                 if (pcur != cur &&
3441                     (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3442                         /* Save the state from the cursor before we trash it */
3443                         if (cur->bc_ops->update_cursor)
3444                                 cur->bc_ops->update_cursor(pcur, cur);
3445                         cur->bc_nlevels = pcur->bc_nlevels;
3446                         xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3447                 }
3448                 /* If we got a new cursor, switch to it. */
3449                 if (ncur) {
3450                         pcur = ncur;
3451                         ncur = NULL;
3452                 }
3453         } while (!xfs_btree_ptr_is_null(cur, &nptr));
3454
3455         *stat = i;
3456         return 0;
3457 error0:
3458         return error;
3459 }
3460
3461 /*
3462  * Try to merge a non-leaf block back into the inode root.
3463  *
3464  * Note: the killroot names comes from the fact that we're effectively
3465  * killing the old root block.  But because we can't just delete the
3466  * inode we have to copy the single block it was pointing to into the
3467  * inode.
3468  */
3469 STATIC int
3470 xfs_btree_kill_iroot(
3471         struct xfs_btree_cur    *cur)
3472 {
3473         int                     whichfork = cur->bc_ino.whichfork;
3474         struct xfs_inode        *ip = cur->bc_ino.ip;
3475         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3476         struct xfs_btree_block  *block;
3477         struct xfs_btree_block  *cblock;
3478         union xfs_btree_key     *kp;
3479         union xfs_btree_key     *ckp;
3480         union xfs_btree_ptr     *pp;
3481         union xfs_btree_ptr     *cpp;
3482         struct xfs_buf          *cbp;
3483         int                     level;
3484         int                     index;
3485         int                     numrecs;
3486         int                     error;
3487 #ifdef DEBUG
3488         union xfs_btree_ptr     ptr;
3489 #endif
3490         int                     i;
3491
3492         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3493         ASSERT(cur->bc_nlevels > 1);
3494
3495         /*
3496          * Don't deal with the root block needs to be a leaf case.
3497          * We're just going to turn the thing back into extents anyway.
3498          */
3499         level = cur->bc_nlevels - 1;
3500         if (level == 1)
3501                 goto out0;
3502
3503         /*
3504          * Give up if the root has multiple children.
3505          */
3506         block = xfs_btree_get_iroot(cur);
3507         if (xfs_btree_get_numrecs(block) != 1)
3508                 goto out0;
3509
3510         cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3511         numrecs = xfs_btree_get_numrecs(cblock);
3512
3513         /*
3514          * Only do this if the next level will fit.
3515          * Then the data must be copied up to the inode,
3516          * instead of freeing the root you free the next level.
3517          */
3518         if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3519                 goto out0;
3520
3521         XFS_BTREE_STATS_INC(cur, killroot);
3522
3523 #ifdef DEBUG
3524         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3525         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3526         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3527         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3528 #endif
3529
3530         index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3531         if (index) {
3532                 xfs_iroot_realloc(cur->bc_ino.ip, index,
3533                                   cur->bc_ino.whichfork);
3534                 block = ifp->if_broot;
3535         }
3536
3537         be16_add_cpu(&block->bb_numrecs, index);
3538         ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3539
3540         kp = xfs_btree_key_addr(cur, 1, block);
3541         ckp = xfs_btree_key_addr(cur, 1, cblock);
3542         xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3543
3544         pp = xfs_btree_ptr_addr(cur, 1, block);
3545         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3546
3547         for (i = 0; i < numrecs; i++) {
3548                 error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3549                 if (error)
3550                         return error;
3551         }
3552
3553         xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3554
3555         error = xfs_btree_free_block(cur, cbp);
3556         if (error)
3557                 return error;
3558
3559         cur->bc_bufs[level - 1] = NULL;
3560         be16_add_cpu(&block->bb_level, -1);
3561         xfs_trans_log_inode(cur->bc_tp, ip,
3562                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
3563         cur->bc_nlevels--;
3564 out0:
3565         return 0;
3566 }
3567
3568 /*
3569  * Kill the current root node, and replace it with it's only child node.
3570  */
3571 STATIC int
3572 xfs_btree_kill_root(
3573         struct xfs_btree_cur    *cur,
3574         struct xfs_buf          *bp,
3575         int                     level,
3576         union xfs_btree_ptr     *newroot)
3577 {
3578         int                     error;
3579
3580         XFS_BTREE_STATS_INC(cur, killroot);
3581
3582         /*
3583          * Update the root pointer, decreasing the level by 1 and then
3584          * free the old root.
3585          */
3586         cur->bc_ops->set_root(cur, newroot, -1);
3587
3588         error = xfs_btree_free_block(cur, bp);
3589         if (error)
3590                 return error;
3591
3592         cur->bc_bufs[level] = NULL;
3593         cur->bc_ra[level] = 0;
3594         cur->bc_nlevels--;
3595
3596         return 0;
3597 }
3598
3599 STATIC int
3600 xfs_btree_dec_cursor(
3601         struct xfs_btree_cur    *cur,
3602         int                     level,
3603         int                     *stat)
3604 {
3605         int                     error;
3606         int                     i;
3607
3608         if (level > 0) {
3609                 error = xfs_btree_decrement(cur, level, &i);
3610                 if (error)
3611                         return error;
3612         }
3613
3614         *stat = 1;
3615         return 0;
3616 }
3617
3618 /*
3619  * Single level of the btree record deletion routine.
3620  * Delete record pointed to by cur/level.
3621  * Remove the record from its block then rebalance the tree.
3622  * Return 0 for error, 1 for done, 2 to go on to the next level.
3623  */
3624 STATIC int                                      /* error */
3625 xfs_btree_delrec(
3626         struct xfs_btree_cur    *cur,           /* btree cursor */
3627         int                     level,          /* level removing record from */
3628         int                     *stat)          /* fail/done/go-on */
3629 {
3630         struct xfs_btree_block  *block;         /* btree block */
3631         union xfs_btree_ptr     cptr;           /* current block ptr */
3632         struct xfs_buf          *bp;            /* buffer for block */
3633         int                     error;          /* error return value */
3634         int                     i;              /* loop counter */
3635         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3636         struct xfs_buf          *lbp;           /* left buffer pointer */
3637         struct xfs_btree_block  *left;          /* left btree block */
3638         int                     lrecs = 0;      /* left record count */
3639         int                     ptr;            /* key/record index */
3640         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3641         struct xfs_buf          *rbp;           /* right buffer pointer */
3642         struct xfs_btree_block  *right;         /* right btree block */
3643         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3644         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3645         int                     rrecs = 0;      /* right record count */
3646         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3647         int                     numrecs;        /* temporary numrec count */
3648
3649         tcur = NULL;
3650
3651         /* Get the index of the entry being deleted, check for nothing there. */
3652         ptr = cur->bc_ptrs[level];
3653         if (ptr == 0) {
3654                 *stat = 0;
3655                 return 0;
3656         }
3657
3658         /* Get the buffer & block containing the record or key/ptr. */
3659         block = xfs_btree_get_block(cur, level, &bp);
3660         numrecs = xfs_btree_get_numrecs(block);
3661
3662 #ifdef DEBUG
3663         error = xfs_btree_check_block(cur, block, level, bp);
3664         if (error)
3665                 goto error0;
3666 #endif
3667
3668         /* Fail if we're off the end of the block. */
3669         if (ptr > numrecs) {
3670                 *stat = 0;
3671                 return 0;
3672         }
3673
3674         XFS_BTREE_STATS_INC(cur, delrec);
3675         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3676
3677         /* Excise the entries being deleted. */
3678         if (level > 0) {
3679                 /* It's a nonleaf. operate on keys and ptrs */
3680                 union xfs_btree_key     *lkp;
3681                 union xfs_btree_ptr     *lpp;
3682
3683                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3684                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3685
3686                 for (i = 0; i < numrecs - ptr; i++) {
3687                         error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3688                         if (error)
3689                                 goto error0;
3690                 }
3691
3692                 if (ptr < numrecs) {
3693                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3694                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3695                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3696                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3697                 }
3698         } else {
3699                 /* It's a leaf. operate on records */
3700                 if (ptr < numrecs) {
3701                         xfs_btree_shift_recs(cur,
3702                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3703                                 -1, numrecs - ptr);
3704                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3705                 }
3706         }
3707
3708         /*
3709          * Decrement and log the number of entries in the block.
3710          */
3711         xfs_btree_set_numrecs(block, --numrecs);
3712         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3713
3714         /*
3715          * If we are tracking the last record in the tree and
3716          * we are at the far right edge of the tree, update it.
3717          */
3718         if (xfs_btree_is_lastrec(cur, block, level)) {
3719                 cur->bc_ops->update_lastrec(cur, block, NULL,
3720                                             ptr, LASTREC_DELREC);
3721         }
3722
3723         /*
3724          * We're at the root level.  First, shrink the root block in-memory.
3725          * Try to get rid of the next level down.  If we can't then there's
3726          * nothing left to do.
3727          */
3728         if (level == cur->bc_nlevels - 1) {
3729                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3730                         xfs_iroot_realloc(cur->bc_ino.ip, -1,
3731                                           cur->bc_ino.whichfork);
3732
3733                         error = xfs_btree_kill_iroot(cur);
3734                         if (error)
3735                                 goto error0;
3736
3737                         error = xfs_btree_dec_cursor(cur, level, stat);
3738                         if (error)
3739                                 goto error0;
3740                         *stat = 1;
3741                         return 0;
3742                 }
3743
3744                 /*
3745                  * If this is the root level, and there's only one entry left,
3746                  * and it's NOT the leaf level, then we can get rid of this
3747                  * level.
3748                  */
3749                 if (numrecs == 1 && level > 0) {
3750                         union xfs_btree_ptr     *pp;
3751                         /*
3752                          * pp is still set to the first pointer in the block.
3753                          * Make it the new root of the btree.
3754                          */
3755                         pp = xfs_btree_ptr_addr(cur, 1, block);
3756                         error = xfs_btree_kill_root(cur, bp, level, pp);
3757                         if (error)
3758                                 goto error0;
3759                 } else if (level > 0) {
3760                         error = xfs_btree_dec_cursor(cur, level, stat);
3761                         if (error)
3762                                 goto error0;
3763                 }
3764                 *stat = 1;
3765                 return 0;
3766         }
3767
3768         /*
3769          * If we deleted the leftmost entry in the block, update the
3770          * key values above us in the tree.
3771          */
3772         if (xfs_btree_needs_key_update(cur, ptr)) {
3773                 error = xfs_btree_update_keys(cur, level);
3774                 if (error)
3775                         goto error0;
3776         }
3777
3778         /*
3779          * If the number of records remaining in the block is at least
3780          * the minimum, we're done.
3781          */
3782         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3783                 error = xfs_btree_dec_cursor(cur, level, stat);
3784                 if (error)
3785                         goto error0;
3786                 return 0;
3787         }
3788
3789         /*
3790          * Otherwise, we have to move some records around to keep the
3791          * tree balanced.  Look at the left and right sibling blocks to
3792          * see if we can re-balance by moving only one record.
3793          */
3794         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3795         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3796
3797         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3798                 /*
3799                  * One child of root, need to get a chance to copy its contents
3800                  * into the root and delete it. Can't go up to next level,
3801                  * there's nothing to delete there.
3802                  */
3803                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3804                     xfs_btree_ptr_is_null(cur, &lptr) &&
3805                     level == cur->bc_nlevels - 2) {
3806                         error = xfs_btree_kill_iroot(cur);
3807                         if (!error)
3808                                 error = xfs_btree_dec_cursor(cur, level, stat);
3809                         if (error)
3810                                 goto error0;
3811                         return 0;
3812                 }
3813         }
3814
3815         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3816                !xfs_btree_ptr_is_null(cur, &lptr));
3817
3818         /*
3819          * Duplicate the cursor so our btree manipulations here won't
3820          * disrupt the next level up.
3821          */
3822         error = xfs_btree_dup_cursor(cur, &tcur);
3823         if (error)
3824                 goto error0;
3825
3826         /*
3827          * If there's a right sibling, see if it's ok to shift an entry
3828          * out of it.
3829          */
3830         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3831                 /*
3832                  * Move the temp cursor to the last entry in the next block.
3833                  * Actually any entry but the first would suffice.
3834                  */
3835                 i = xfs_btree_lastrec(tcur, level);
3836                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3837                         error = -EFSCORRUPTED;
3838                         goto error0;
3839                 }
3840
3841                 error = xfs_btree_increment(tcur, level, &i);
3842                 if (error)
3843                         goto error0;
3844                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3845                         error = -EFSCORRUPTED;
3846                         goto error0;
3847                 }
3848
3849                 i = xfs_btree_lastrec(tcur, level);
3850                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3851                         error = -EFSCORRUPTED;
3852                         goto error0;
3853                 }
3854
3855                 /* Grab a pointer to the block. */
3856                 right = xfs_btree_get_block(tcur, level, &rbp);
3857 #ifdef DEBUG
3858                 error = xfs_btree_check_block(tcur, right, level, rbp);
3859                 if (error)
3860                         goto error0;
3861 #endif
3862                 /* Grab the current block number, for future use. */
3863                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3864
3865                 /*
3866                  * If right block is full enough so that removing one entry
3867                  * won't make it too empty, and left-shifting an entry out
3868                  * of right to us works, we're done.
3869                  */
3870                 if (xfs_btree_get_numrecs(right) - 1 >=
3871                     cur->bc_ops->get_minrecs(tcur, level)) {
3872                         error = xfs_btree_lshift(tcur, level, &i);
3873                         if (error)
3874                                 goto error0;
3875                         if (i) {
3876                                 ASSERT(xfs_btree_get_numrecs(block) >=
3877                                        cur->bc_ops->get_minrecs(tcur, level));
3878
3879                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3880                                 tcur = NULL;
3881
3882                                 error = xfs_btree_dec_cursor(cur, level, stat);
3883                                 if (error)
3884                                         goto error0;
3885                                 return 0;
3886                         }
3887                 }
3888
3889                 /*
3890                  * Otherwise, grab the number of records in right for
3891                  * future reference, and fix up the temp cursor to point
3892                  * to our block again (last record).
3893                  */
3894                 rrecs = xfs_btree_get_numrecs(right);
3895                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3896                         i = xfs_btree_firstrec(tcur, level);
3897                         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3898                                 error = -EFSCORRUPTED;
3899                                 goto error0;
3900                         }
3901
3902                         error = xfs_btree_decrement(tcur, level, &i);
3903                         if (error)
3904                                 goto error0;
3905                         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3906                                 error = -EFSCORRUPTED;
3907                                 goto error0;
3908                         }
3909                 }
3910         }
3911
3912         /*
3913          * If there's a left sibling, see if it's ok to shift an entry
3914          * out of it.
3915          */
3916         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3917                 /*
3918                  * Move the temp cursor to the first entry in the
3919                  * previous block.
3920                  */
3921                 i = xfs_btree_firstrec(tcur, level);
3922                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3923                         error = -EFSCORRUPTED;
3924                         goto error0;
3925                 }
3926
3927                 error = xfs_btree_decrement(tcur, level, &i);
3928                 if (error)
3929                         goto error0;
3930                 i = xfs_btree_firstrec(tcur, level);
3931                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3932                         error = -EFSCORRUPTED;
3933                         goto error0;
3934                 }
3935
3936                 /* Grab a pointer to the block. */
3937                 left = xfs_btree_get_block(tcur, level, &lbp);
3938 #ifdef DEBUG
3939                 error = xfs_btree_check_block(cur, left, level, lbp);
3940                 if (error)
3941                         goto error0;
3942 #endif
3943                 /* Grab the current block number, for future use. */
3944                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3945
3946                 /*
3947                  * If left block is full enough so that removing one entry
3948                  * won't make it too empty, and right-shifting an entry out
3949                  * of left to us works, we're done.
3950                  */
3951                 if (xfs_btree_get_numrecs(left) - 1 >=
3952                     cur->bc_ops->get_minrecs(tcur, level)) {
3953                         error = xfs_btree_rshift(tcur, level, &i);
3954                         if (error)
3955                                 goto error0;
3956                         if (i) {
3957                                 ASSERT(xfs_btree_get_numrecs(block) >=
3958                                        cur->bc_ops->get_minrecs(tcur, level));
3959                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3960                                 tcur = NULL;
3961                                 if (level == 0)
3962                                         cur->bc_ptrs[0]++;
3963
3964                                 *stat = 1;
3965                                 return 0;
3966                         }
3967                 }
3968
3969                 /*
3970                  * Otherwise, grab the number of records in right for
3971                  * future reference.
3972                  */
3973                 lrecs = xfs_btree_get_numrecs(left);
3974         }
3975
3976         /* Delete the temp cursor, we're done with it. */
3977         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3978         tcur = NULL;
3979
3980         /* If here, we need to do a join to keep the tree balanced. */
3981         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3982
3983         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3984             lrecs + xfs_btree_get_numrecs(block) <=
3985                         cur->bc_ops->get_maxrecs(cur, level)) {
3986                 /*
3987                  * Set "right" to be the starting block,
3988                  * "left" to be the left neighbor.
3989                  */
3990                 rptr = cptr;
3991                 right = block;
3992                 rbp = bp;
3993                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3994                 if (error)
3995                         goto error0;
3996
3997         /*
3998          * If that won't work, see if we can join with the right neighbor block.
3999          */
4000         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4001                    rrecs + xfs_btree_get_numrecs(block) <=
4002                         cur->bc_ops->get_maxrecs(cur, level)) {
4003                 /*
4004                  * Set "left" to be the starting block,
4005                  * "right" to be the right neighbor.
4006                  */
4007                 lptr = cptr;
4008                 left = block;
4009                 lbp = bp;
4010                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4011                 if (error)
4012                         goto error0;
4013
4014         /*
4015          * Otherwise, we can't fix the imbalance.
4016          * Just return.  This is probably a logic error, but it's not fatal.
4017          */
4018         } else {
4019                 error = xfs_btree_dec_cursor(cur, level, stat);
4020                 if (error)
4021                         goto error0;
4022                 return 0;
4023         }
4024
4025         rrecs = xfs_btree_get_numrecs(right);
4026         lrecs = xfs_btree_get_numrecs(left);
4027
4028         /*
4029          * We're now going to join "left" and "right" by moving all the stuff
4030          * in "right" to "left" and deleting "right".
4031          */
4032         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4033         if (level > 0) {
4034                 /* It's a non-leaf.  Move keys and pointers. */
4035                 union xfs_btree_key     *lkp;   /* left btree key */
4036                 union xfs_btree_ptr     *lpp;   /* left address pointer */
4037                 union xfs_btree_key     *rkp;   /* right btree key */
4038                 union xfs_btree_ptr     *rpp;   /* right address pointer */
4039
4040                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4041                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4042                 rkp = xfs_btree_key_addr(cur, 1, right);
4043                 rpp = xfs_btree_ptr_addr(cur, 1, right);
4044
4045                 for (i = 1; i < rrecs; i++) {
4046                         error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4047                         if (error)
4048                                 goto error0;
4049                 }
4050
4051                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4052                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4053
4054                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4055                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4056         } else {
4057                 /* It's a leaf.  Move records.  */
4058                 union xfs_btree_rec     *lrp;   /* left record pointer */
4059                 union xfs_btree_rec     *rrp;   /* right record pointer */
4060
4061                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4062                 rrp = xfs_btree_rec_addr(cur, 1, right);
4063
4064                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4065                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4066         }
4067
4068         XFS_BTREE_STATS_INC(cur, join);
4069
4070         /*
4071          * Fix up the number of records and right block pointer in the
4072          * surviving block, and log it.
4073          */
4074         xfs_btree_set_numrecs(left, lrecs + rrecs);
4075         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
4076         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4077         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4078
4079         /* If there is a right sibling, point it to the remaining block. */
4080         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4081         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4082                 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4083                 if (error)
4084                         goto error0;
4085                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4086                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4087         }
4088
4089         /* Free the deleted block. */
4090         error = xfs_btree_free_block(cur, rbp);
4091         if (error)
4092                 goto error0;
4093
4094         /*
4095          * If we joined with the left neighbor, set the buffer in the
4096          * cursor to the left block, and fix up the index.
4097          */
4098         if (bp != lbp) {
4099                 cur->bc_bufs[level] = lbp;
4100                 cur->bc_ptrs[level] += lrecs;
4101                 cur->bc_ra[level] = 0;
4102         }
4103         /*
4104          * If we joined with the right neighbor and there's a level above
4105          * us, increment the cursor at that level.
4106          */
4107         else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4108                    (level + 1 < cur->bc_nlevels)) {
4109                 error = xfs_btree_increment(cur, level + 1, &i);
4110                 if (error)
4111                         goto error0;
4112         }
4113
4114         /*
4115          * Readjust the ptr at this level if it's not a leaf, since it's
4116          * still pointing at the deletion point, which makes the cursor
4117          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4118          * We can't use decrement because it would change the next level up.
4119          */
4120         if (level > 0)
4121                 cur->bc_ptrs[level]--;
4122
4123         /*
4124          * We combined blocks, so we have to update the parent keys if the
4125          * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4126          * points to the old block so that the caller knows which record to
4127          * delete.  Therefore, the caller must be savvy enough to call updkeys
4128          * for us if we return stat == 2.  The other exit points from this
4129          * function don't require deletions further up the tree, so they can
4130          * call updkeys directly.
4131          */
4132
4133         /* Return value means the next level up has something to do. */
4134         *stat = 2;
4135         return 0;
4136
4137 error0:
4138         if (tcur)
4139                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4140         return error;
4141 }
4142
4143 /*
4144  * Delete the record pointed to by cur.
4145  * The cursor refers to the place where the record was (could be inserted)
4146  * when the operation returns.
4147  */
4148 int                                     /* error */
4149 xfs_btree_delete(
4150         struct xfs_btree_cur    *cur,
4151         int                     *stat)  /* success/failure */
4152 {
4153         int                     error;  /* error return value */
4154         int                     level;
4155         int                     i;
4156         bool                    joined = false;
4157
4158         /*
4159          * Go up the tree, starting at leaf level.
4160          *
4161          * If 2 is returned then a join was done; go to the next level.
4162          * Otherwise we are done.
4163          */
4164         for (level = 0, i = 2; i == 2; level++) {
4165                 error = xfs_btree_delrec(cur, level, &i);
4166                 if (error)
4167                         goto error0;
4168                 if (i == 2)
4169                         joined = true;
4170         }
4171
4172         /*
4173          * If we combined blocks as part of deleting the record, delrec won't
4174          * have updated the parent high keys so we have to do that here.
4175          */
4176         if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4177                 error = xfs_btree_updkeys_force(cur, 0);
4178                 if (error)
4179                         goto error0;
4180         }
4181
4182         if (i == 0) {
4183                 for (level = 1; level < cur->bc_nlevels; level++) {
4184                         if (cur->bc_ptrs[level] == 0) {
4185                                 error = xfs_btree_decrement(cur, level, &i);
4186                                 if (error)
4187                                         goto error0;
4188                                 break;
4189                         }
4190                 }
4191         }
4192
4193         *stat = i;
4194         return 0;
4195 error0:
4196         return error;
4197 }
4198
4199 /*
4200  * Get the data from the pointed-to record.
4201  */
4202 int                                     /* error */
4203 xfs_btree_get_rec(
4204         struct xfs_btree_cur    *cur,   /* btree cursor */
4205         union xfs_btree_rec     **recp, /* output: btree record */
4206         int                     *stat)  /* output: success/failure */
4207 {
4208         struct xfs_btree_block  *block; /* btree block */
4209         struct xfs_buf          *bp;    /* buffer pointer */
4210         int                     ptr;    /* record number */
4211 #ifdef DEBUG
4212         int                     error;  /* error return value */
4213 #endif
4214
4215         ptr = cur->bc_ptrs[0];
4216         block = xfs_btree_get_block(cur, 0, &bp);
4217
4218 #ifdef DEBUG
4219         error = xfs_btree_check_block(cur, block, 0, bp);
4220         if (error)
4221                 return error;
4222 #endif
4223
4224         /*
4225          * Off the right end or left end, return failure.
4226          */
4227         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4228                 *stat = 0;
4229                 return 0;
4230         }
4231
4232         /*
4233          * Point to the record and extract its data.
4234          */
4235         *recp = xfs_btree_rec_addr(cur, ptr, block);
4236         *stat = 1;
4237         return 0;
4238 }
4239
4240 /* Visit a block in a btree. */
4241 STATIC int
4242 xfs_btree_visit_block(
4243         struct xfs_btree_cur            *cur,
4244         int                             level,
4245         xfs_btree_visit_blocks_fn       fn,
4246         void                            *data)
4247 {
4248         struct xfs_btree_block          *block;
4249         struct xfs_buf                  *bp;
4250         union xfs_btree_ptr             rptr;
4251         int                             error;
4252
4253         /* do right sibling readahead */
4254         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4255         block = xfs_btree_get_block(cur, level, &bp);
4256
4257         /* process the block */
4258         error = fn(cur, level, data);
4259         if (error)
4260                 return error;
4261
4262         /* now read rh sibling block for next iteration */
4263         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4264         if (xfs_btree_ptr_is_null(cur, &rptr))
4265                 return -ENOENT;
4266
4267         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4268 }
4269
4270
4271 /* Visit every block in a btree. */
4272 int
4273 xfs_btree_visit_blocks(
4274         struct xfs_btree_cur            *cur,
4275         xfs_btree_visit_blocks_fn       fn,
4276         unsigned int                    flags,
4277         void                            *data)
4278 {
4279         union xfs_btree_ptr             lptr;
4280         int                             level;
4281         struct xfs_btree_block          *block = NULL;
4282         int                             error = 0;
4283
4284         cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4285
4286         /* for each level */
4287         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4288                 /* grab the left hand block */
4289                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4290                 if (error)
4291                         return error;
4292
4293                 /* readahead the left most block for the next level down */
4294                 if (level > 0) {
4295                         union xfs_btree_ptr     *ptr;
4296
4297                         ptr = xfs_btree_ptr_addr(cur, 1, block);
4298                         xfs_btree_readahead_ptr(cur, ptr, 1);
4299
4300                         /* save for the next iteration of the loop */
4301                         xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4302
4303                         if (!(flags & XFS_BTREE_VISIT_LEAVES))
4304                                 continue;
4305                 } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
4306                         continue;
4307                 }
4308
4309                 /* for each buffer in the level */
4310                 do {
4311                         error = xfs_btree_visit_block(cur, level, fn, data);
4312                 } while (!error);
4313
4314                 if (error != -ENOENT)
4315                         return error;
4316         }
4317
4318         return 0;
4319 }
4320
4321 /*
4322  * Change the owner of a btree.
4323  *
4324  * The mechanism we use here is ordered buffer logging. Because we don't know
4325  * how many buffers were are going to need to modify, we don't really want to
4326  * have to make transaction reservations for the worst case of every buffer in a
4327  * full size btree as that may be more space that we can fit in the log....
4328  *
4329  * We do the btree walk in the most optimal manner possible - we have sibling
4330  * pointers so we can just walk all the blocks on each level from left to right
4331  * in a single pass, and then move to the next level and do the same. We can
4332  * also do readahead on the sibling pointers to get IO moving more quickly,
4333  * though for slow disks this is unlikely to make much difference to performance
4334  * as the amount of CPU work we have to do before moving to the next block is
4335  * relatively small.
4336  *
4337  * For each btree block that we load, modify the owner appropriately, set the
4338  * buffer as an ordered buffer and log it appropriately. We need to ensure that
4339  * we mark the region we change dirty so that if the buffer is relogged in
4340  * a subsequent transaction the changes we make here as an ordered buffer are
4341  * correctly relogged in that transaction.  If we are in recovery context, then
4342  * just queue the modified buffer as delayed write buffer so the transaction
4343  * recovery completion writes the changes to disk.
4344  */
4345 struct xfs_btree_block_change_owner_info {
4346         uint64_t                new_owner;
4347         struct list_head        *buffer_list;
4348 };
4349
4350 static int
4351 xfs_btree_block_change_owner(
4352         struct xfs_btree_cur    *cur,
4353         int                     level,
4354         void                    *data)
4355 {
4356         struct xfs_btree_block_change_owner_info        *bbcoi = data;
4357         struct xfs_btree_block  *block;
4358         struct xfs_buf          *bp;
4359
4360         /* modify the owner */
4361         block = xfs_btree_get_block(cur, level, &bp);
4362         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4363                 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4364                         return 0;
4365                 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4366         } else {
4367                 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4368                         return 0;
4369                 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4370         }
4371
4372         /*
4373          * If the block is a root block hosted in an inode, we might not have a
4374          * buffer pointer here and we shouldn't attempt to log the change as the
4375          * information is already held in the inode and discarded when the root
4376          * block is formatted into the on-disk inode fork. We still change it,
4377          * though, so everything is consistent in memory.
4378          */
4379         if (!bp) {
4380                 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4381                 ASSERT(level == cur->bc_nlevels - 1);
4382                 return 0;
4383         }
4384
4385         if (cur->bc_tp) {
4386                 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4387                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4388                         return -EAGAIN;
4389                 }
4390         } else {
4391                 xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4392         }
4393
4394         return 0;
4395 }
4396
4397 int
4398 xfs_btree_change_owner(
4399         struct xfs_btree_cur    *cur,
4400         uint64_t                new_owner,
4401         struct list_head        *buffer_list)
4402 {
4403         struct xfs_btree_block_change_owner_info        bbcoi;
4404
4405         bbcoi.new_owner = new_owner;
4406         bbcoi.buffer_list = buffer_list;
4407
4408         return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4409                         XFS_BTREE_VISIT_ALL, &bbcoi);
4410 }
4411
4412 /* Verify the v5 fields of a long-format btree block. */
4413 xfs_failaddr_t
4414 xfs_btree_lblock_v5hdr_verify(
4415         struct xfs_buf          *bp,
4416         uint64_t                owner)
4417 {
4418         struct xfs_mount        *mp = bp->b_mount;
4419         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4420
4421         if (!xfs_sb_version_hascrc(&mp->m_sb))
4422                 return __this_address;
4423         if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4424                 return __this_address;
4425         if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
4426                 return __this_address;
4427         if (owner != XFS_RMAP_OWN_UNKNOWN &&
4428             be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4429                 return __this_address;
4430         return NULL;
4431 }
4432
4433 /* Verify a long-format btree block. */
4434 xfs_failaddr_t
4435 xfs_btree_lblock_verify(
4436         struct xfs_buf          *bp,
4437         unsigned int            max_recs)
4438 {
4439         struct xfs_mount        *mp = bp->b_mount;
4440         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4441
4442         /* numrecs verification */
4443         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4444                 return __this_address;
4445
4446         /* sibling pointer verification */
4447         if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4448             !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4449                 return __this_address;
4450         if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4451             !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4452                 return __this_address;
4453
4454         return NULL;
4455 }
4456
4457 /**
4458  * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4459  *                                    btree block
4460  *
4461  * @bp: buffer containing the btree block
4462  */
4463 xfs_failaddr_t
4464 xfs_btree_sblock_v5hdr_verify(
4465         struct xfs_buf          *bp)
4466 {
4467         struct xfs_mount        *mp = bp->b_mount;
4468         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4469         struct xfs_perag        *pag = bp->b_pag;
4470
4471         if (!xfs_sb_version_hascrc(&mp->m_sb))
4472                 return __this_address;
4473         if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4474                 return __this_address;
4475         if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4476                 return __this_address;
4477         if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4478                 return __this_address;
4479         return NULL;
4480 }
4481
4482 /**
4483  * xfs_btree_sblock_verify() -- verify a short-format btree block
4484  *
4485  * @bp: buffer containing the btree block
4486  * @max_recs: maximum records allowed in this btree node
4487  */
4488 xfs_failaddr_t
4489 xfs_btree_sblock_verify(
4490         struct xfs_buf          *bp,
4491         unsigned int            max_recs)
4492 {
4493         struct xfs_mount        *mp = bp->b_mount;
4494         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4495         xfs_agblock_t           agno;
4496
4497         /* numrecs verification */
4498         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4499                 return __this_address;
4500
4501         /* sibling pointer verification */
4502         agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
4503         if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4504             !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4505                 return __this_address;
4506         if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4507             !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4508                 return __this_address;
4509
4510         return NULL;
4511 }
4512
4513 /*
4514  * Calculate the number of btree levels needed to store a given number of
4515  * records in a short-format btree.
4516  */
4517 uint
4518 xfs_btree_compute_maxlevels(
4519         uint                    *limits,
4520         unsigned long           len)
4521 {
4522         uint                    level;
4523         unsigned long           maxblocks;
4524
4525         maxblocks = (len + limits[0] - 1) / limits[0];
4526         for (level = 1; maxblocks > 1; level++)
4527                 maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4528         return level;
4529 }
4530
4531 /*
4532  * Query a regular btree for all records overlapping a given interval.
4533  * Start with a LE lookup of the key of low_rec and return all records
4534  * until we find a record with a key greater than the key of high_rec.
4535  */
4536 STATIC int
4537 xfs_btree_simple_query_range(
4538         struct xfs_btree_cur            *cur,
4539         const union xfs_btree_key       *low_key,
4540         const union xfs_btree_key       *high_key,
4541         xfs_btree_query_range_fn        fn,
4542         void                            *priv)
4543 {
4544         union xfs_btree_rec             *recp;
4545         union xfs_btree_key             rec_key;
4546         int64_t                         diff;
4547         int                             stat;
4548         bool                            firstrec = true;
4549         int                             error;
4550
4551         ASSERT(cur->bc_ops->init_high_key_from_rec);
4552         ASSERT(cur->bc_ops->diff_two_keys);
4553
4554         /*
4555          * Find the leftmost record.  The btree cursor must be set
4556          * to the low record used to generate low_key.
4557          */
4558         stat = 0;
4559         error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4560         if (error)
4561                 goto out;
4562
4563         /* Nothing?  See if there's anything to the right. */
4564         if (!stat) {
4565                 error = xfs_btree_increment(cur, 0, &stat);
4566                 if (error)
4567                         goto out;
4568         }
4569
4570         while (stat) {
4571                 /* Find the record. */
4572                 error = xfs_btree_get_rec(cur, &recp, &stat);
4573                 if (error || !stat)
4574                         break;
4575
4576                 /* Skip if high_key(rec) < low_key. */
4577                 if (firstrec) {
4578                         cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4579                         firstrec = false;
4580                         diff = cur->bc_ops->diff_two_keys(cur, low_key,
4581                                         &rec_key);
4582                         if (diff > 0)
4583                                 goto advloop;
4584                 }
4585
4586                 /* Stop if high_key < low_key(rec). */
4587                 cur->bc_ops->init_key_from_rec(&rec_key, recp);
4588                 diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4589                 if (diff > 0)
4590                         break;
4591
4592                 /* Callback */
4593                 error = fn(cur, recp, priv);
4594                 if (error)
4595                         break;
4596
4597 advloop:
4598                 /* Move on to the next record. */
4599                 error = xfs_btree_increment(cur, 0, &stat);
4600                 if (error)
4601                         break;
4602         }
4603
4604 out:
4605         return error;
4606 }
4607
4608 /*
4609  * Query an overlapped interval btree for all records overlapping a given
4610  * interval.  This function roughly follows the algorithm given in
4611  * "Interval Trees" of _Introduction to Algorithms_, which is section
4612  * 14.3 in the 2nd and 3rd editions.
4613  *
4614  * First, generate keys for the low and high records passed in.
4615  *
4616  * For any leaf node, generate the high and low keys for the record.
4617  * If the record keys overlap with the query low/high keys, pass the
4618  * record to the function iterator.
4619  *
4620  * For any internal node, compare the low and high keys of each
4621  * pointer against the query low/high keys.  If there's an overlap,
4622  * follow the pointer.
4623  *
4624  * As an optimization, we stop scanning a block when we find a low key
4625  * that is greater than the query's high key.
4626  */
4627 STATIC int
4628 xfs_btree_overlapped_query_range(
4629         struct xfs_btree_cur            *cur,
4630         const union xfs_btree_key       *low_key,
4631         const union xfs_btree_key       *high_key,
4632         xfs_btree_query_range_fn        fn,
4633         void                            *priv)
4634 {
4635         union xfs_btree_ptr             ptr;
4636         union xfs_btree_ptr             *pp;
4637         union xfs_btree_key             rec_key;
4638         union xfs_btree_key             rec_hkey;
4639         union xfs_btree_key             *lkp;
4640         union xfs_btree_key             *hkp;
4641         union xfs_btree_rec             *recp;
4642         struct xfs_btree_block          *block;
4643         int64_t                         ldiff;
4644         int64_t                         hdiff;
4645         int                             level;
4646         struct xfs_buf                  *bp;
4647         int                             i;
4648         int                             error;
4649
4650         /* Load the root of the btree. */
4651         level = cur->bc_nlevels - 1;
4652         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4653         error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4654         if (error)
4655                 return error;
4656         xfs_btree_get_block(cur, level, &bp);
4657         trace_xfs_btree_overlapped_query_range(cur, level, bp);
4658 #ifdef DEBUG
4659         error = xfs_btree_check_block(cur, block, level, bp);
4660         if (error)
4661                 goto out;
4662 #endif
4663         cur->bc_ptrs[level] = 1;
4664
4665         while (level < cur->bc_nlevels) {
4666                 block = xfs_btree_get_block(cur, level, &bp);
4667
4668                 /* End of node, pop back towards the root. */
4669                 if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4670 pop_up:
4671                         if (level < cur->bc_nlevels - 1)
4672                                 cur->bc_ptrs[level + 1]++;
4673                         level++;
4674                         continue;
4675                 }
4676
4677                 if (level == 0) {
4678                         /* Handle a leaf node. */
4679                         recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4680
4681                         cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4682                         ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4683                                         low_key);
4684
4685                         cur->bc_ops->init_key_from_rec(&rec_key, recp);
4686                         hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4687                                         &rec_key);
4688
4689                         /*
4690                          * If (record's high key >= query's low key) and
4691                          *    (query's high key >= record's low key), then
4692                          * this record overlaps the query range; callback.
4693                          */
4694                         if (ldiff >= 0 && hdiff >= 0) {
4695                                 error = fn(cur, recp, priv);
4696                                 if (error)
4697                                         break;
4698                         } else if (hdiff < 0) {
4699                                 /* Record is larger than high key; pop. */
4700                                 goto pop_up;
4701                         }
4702                         cur->bc_ptrs[level]++;
4703                         continue;
4704                 }
4705
4706                 /* Handle an internal node. */
4707                 lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4708                 hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4709                 pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4710
4711                 ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4712                 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4713
4714                 /*
4715                  * If (pointer's high key >= query's low key) and
4716                  *    (query's high key >= pointer's low key), then
4717                  * this record overlaps the query range; follow pointer.
4718                  */
4719                 if (ldiff >= 0 && hdiff >= 0) {
4720                         level--;
4721                         error = xfs_btree_lookup_get_block(cur, level, pp,
4722                                         &block);
4723                         if (error)
4724                                 goto out;
4725                         xfs_btree_get_block(cur, level, &bp);
4726                         trace_xfs_btree_overlapped_query_range(cur, level, bp);
4727 #ifdef DEBUG
4728                         error = xfs_btree_check_block(cur, block, level, bp);
4729                         if (error)
4730                                 goto out;
4731 #endif
4732                         cur->bc_ptrs[level] = 1;
4733                         continue;
4734                 } else if (hdiff < 0) {
4735                         /* The low key is larger than the upper range; pop. */
4736                         goto pop_up;
4737                 }
4738                 cur->bc_ptrs[level]++;
4739         }
4740
4741 out:
4742         /*
4743          * If we don't end this function with the cursor pointing at a record
4744          * block, a subsequent non-error cursor deletion will not release
4745          * node-level buffers, causing a buffer leak.  This is quite possible
4746          * with a zero-results range query, so release the buffers if we
4747          * failed to return any results.
4748          */
4749         if (cur->bc_bufs[0] == NULL) {
4750                 for (i = 0; i < cur->bc_nlevels; i++) {
4751                         if (cur->bc_bufs[i]) {
4752                                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4753                                 cur->bc_bufs[i] = NULL;
4754                                 cur->bc_ptrs[i] = 0;
4755                                 cur->bc_ra[i] = 0;
4756                         }
4757                 }
4758         }
4759
4760         return error;
4761 }
4762
4763 /*
4764  * Query a btree for all records overlapping a given interval of keys.  The
4765  * supplied function will be called with each record found; return one of the
4766  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4767  * code.  This function returns -ECANCELED, zero, or a negative error code.
4768  */
4769 int
4770 xfs_btree_query_range(
4771         struct xfs_btree_cur            *cur,
4772         const union xfs_btree_irec      *low_rec,
4773         const union xfs_btree_irec      *high_rec,
4774         xfs_btree_query_range_fn        fn,
4775         void                            *priv)
4776 {
4777         union xfs_btree_rec             rec;
4778         union xfs_btree_key             low_key;
4779         union xfs_btree_key             high_key;
4780
4781         /* Find the keys of both ends of the interval. */
4782         cur->bc_rec = *high_rec;
4783         cur->bc_ops->init_rec_from_cur(cur, &rec);
4784         cur->bc_ops->init_key_from_rec(&high_key, &rec);
4785
4786         cur->bc_rec = *low_rec;
4787         cur->bc_ops->init_rec_from_cur(cur, &rec);
4788         cur->bc_ops->init_key_from_rec(&low_key, &rec);
4789
4790         /* Enforce low key < high key. */
4791         if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4792                 return -EINVAL;
4793
4794         if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4795                 return xfs_btree_simple_query_range(cur, &low_key,
4796                                 &high_key, fn, priv);
4797         return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4798                         fn, priv);
4799 }
4800
4801 /* Query a btree for all records. */
4802 int
4803 xfs_btree_query_all(
4804         struct xfs_btree_cur            *cur,
4805         xfs_btree_query_range_fn        fn,
4806         void                            *priv)
4807 {
4808         union xfs_btree_key             low_key;
4809         union xfs_btree_key             high_key;
4810
4811         memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4812         memset(&low_key, 0, sizeof(low_key));
4813         memset(&high_key, 0xFF, sizeof(high_key));
4814
4815         return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4816 }
4817
4818 /*
4819  * Calculate the number of blocks needed to store a given number of records
4820  * in a short-format (per-AG metadata) btree.
4821  */
4822 unsigned long long
4823 xfs_btree_calc_size(
4824         uint                    *limits,
4825         unsigned long long      len)
4826 {
4827         int                     level;
4828         int                     maxrecs;
4829         unsigned long long      rval;
4830
4831         maxrecs = limits[0];
4832         for (level = 0, rval = 0; len > 1; level++) {
4833                 len += maxrecs - 1;
4834                 do_div(len, maxrecs);
4835                 maxrecs = limits[1];
4836                 rval += len;
4837         }
4838         return rval;
4839 }
4840
4841 static int
4842 xfs_btree_count_blocks_helper(
4843         struct xfs_btree_cur    *cur,
4844         int                     level,
4845         void                    *data)
4846 {
4847         xfs_extlen_t            *blocks = data;
4848         (*blocks)++;
4849
4850         return 0;
4851 }
4852
4853 /* Count the blocks in a btree and return the result in *blocks. */
4854 int
4855 xfs_btree_count_blocks(
4856         struct xfs_btree_cur    *cur,
4857         xfs_extlen_t            *blocks)
4858 {
4859         *blocks = 0;
4860         return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4861                         XFS_BTREE_VISIT_ALL, blocks);
4862 }
4863
4864 /* Compare two btree pointers. */
4865 int64_t
4866 xfs_btree_diff_two_ptrs(
4867         struct xfs_btree_cur            *cur,
4868         const union xfs_btree_ptr       *a,
4869         const union xfs_btree_ptr       *b)
4870 {
4871         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4872                 return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4873         return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4874 }
4875
4876 /* If there's an extent, we're done. */
4877 STATIC int
4878 xfs_btree_has_record_helper(
4879         struct xfs_btree_cur            *cur,
4880         const union xfs_btree_rec       *rec,
4881         void                            *priv)
4882 {
4883         return -ECANCELED;
4884 }
4885
4886 /* Is there a record covering a given range of keys? */
4887 int
4888 xfs_btree_has_record(
4889         struct xfs_btree_cur            *cur,
4890         const union xfs_btree_irec      *low,
4891         const union xfs_btree_irec      *high,
4892         bool                            *exists)
4893 {
4894         int                             error;
4895
4896         error = xfs_btree_query_range(cur, low, high,
4897                         &xfs_btree_has_record_helper, NULL);
4898         if (error == -ECANCELED) {
4899                 *exists = true;
4900                 return 0;
4901         }
4902         *exists = false;
4903         return error;
4904 }
4905
4906 /* Are there more records in this btree? */
4907 bool
4908 xfs_btree_has_more_records(
4909         struct xfs_btree_cur    *cur)
4910 {
4911         struct xfs_btree_block  *block;
4912         struct xfs_buf          *bp;
4913
4914         block = xfs_btree_get_block(cur, 0, &bp);
4915
4916         /* There are still records in this block. */
4917         if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
4918                 return true;
4919
4920         /* There are more record blocks. */
4921         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4922                 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4923         else
4924                 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4925 }