Merge tag 'mmc-v4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc
[linux-2.6-microblaze.git] / fs / xfs / libxfs / xfs_btree.c
1 /*
2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_bit.h"
25 #include "xfs_mount.h"
26 #include "xfs_defer.h"
27 #include "xfs_inode.h"
28 #include "xfs_trans.h"
29 #include "xfs_inode_item.h"
30 #include "xfs_buf_item.h"
31 #include "xfs_btree.h"
32 #include "xfs_errortag.h"
33 #include "xfs_error.h"
34 #include "xfs_trace.h"
35 #include "xfs_cksum.h"
36 #include "xfs_alloc.h"
37 #include "xfs_log.h"
38
39 /*
40  * Cursor allocation zone.
41  */
42 kmem_zone_t     *xfs_btree_cur_zone;
43
44 /*
45  * Btree magic numbers.
46  */
47 static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
48         { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
49           XFS_FIBT_MAGIC, 0 },
50         { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
51           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
52           XFS_REFC_CRC_MAGIC }
53 };
54
55 uint32_t
56 xfs_btree_magic(
57         int                     crc,
58         xfs_btnum_t             btnum)
59 {
60         uint32_t                magic = xfs_magics[crc][btnum];
61
62         /* Ensure we asked for crc for crc-only magics. */
63         ASSERT(magic != 0);
64         return magic;
65 }
66
67 /*
68  * Check a long btree block header.  Return the address of the failing check,
69  * or NULL if everything is ok.
70  */
71 xfs_failaddr_t
72 __xfs_btree_check_lblock(
73         struct xfs_btree_cur    *cur,
74         struct xfs_btree_block  *block,
75         int                     level,
76         struct xfs_buf          *bp)
77 {
78         struct xfs_mount        *mp = cur->bc_mp;
79         xfs_btnum_t             btnum = cur->bc_btnum;
80         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
81
82         if (crc) {
83                 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
84                         return __this_address;
85                 if (block->bb_u.l.bb_blkno !=
86                     cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
87                         return __this_address;
88                 if (block->bb_u.l.bb_pad != cpu_to_be32(0))
89                         return __this_address;
90         }
91
92         if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
93                 return __this_address;
94         if (be16_to_cpu(block->bb_level) != level)
95                 return __this_address;
96         if (be16_to_cpu(block->bb_numrecs) >
97             cur->bc_ops->get_maxrecs(cur, level))
98                 return __this_address;
99         if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
100             !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
101                         level + 1))
102                 return __this_address;
103         if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
104             !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
105                         level + 1))
106                 return __this_address;
107
108         return NULL;
109 }
110
111 /* Check a long btree block header. */
112 static int
113 xfs_btree_check_lblock(
114         struct xfs_btree_cur    *cur,
115         struct xfs_btree_block  *block,
116         int                     level,
117         struct xfs_buf          *bp)
118 {
119         struct xfs_mount        *mp = cur->bc_mp;
120         xfs_failaddr_t          fa;
121
122         fa = __xfs_btree_check_lblock(cur, block, level, bp);
123         if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
124                         XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
125                 if (bp)
126                         trace_xfs_btree_corrupt(bp, _RET_IP_);
127                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
128                 return -EFSCORRUPTED;
129         }
130         return 0;
131 }
132
133 /*
134  * Check a short btree block header.  Return the address of the failing check,
135  * or NULL if everything is ok.
136  */
137 xfs_failaddr_t
138 __xfs_btree_check_sblock(
139         struct xfs_btree_cur    *cur,
140         struct xfs_btree_block  *block,
141         int                     level,
142         struct xfs_buf          *bp)
143 {
144         struct xfs_mount        *mp = cur->bc_mp;
145         xfs_btnum_t             btnum = cur->bc_btnum;
146         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
147
148         if (crc) {
149                 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
150                         return __this_address;
151                 if (block->bb_u.s.bb_blkno !=
152                     cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
153                         return __this_address;
154         }
155
156         if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
157                 return __this_address;
158         if (be16_to_cpu(block->bb_level) != level)
159                 return __this_address;
160         if (be16_to_cpu(block->bb_numrecs) >
161             cur->bc_ops->get_maxrecs(cur, level))
162                 return __this_address;
163         if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
164             !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
165                         level + 1))
166                 return __this_address;
167         if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
168             !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
169                         level + 1))
170                 return __this_address;
171
172         return NULL;
173 }
174
175 /* Check a short btree block header. */
176 STATIC int
177 xfs_btree_check_sblock(
178         struct xfs_btree_cur    *cur,
179         struct xfs_btree_block  *block,
180         int                     level,
181         struct xfs_buf          *bp)
182 {
183         struct xfs_mount        *mp = cur->bc_mp;
184         xfs_failaddr_t          fa;
185
186         fa = __xfs_btree_check_sblock(cur, block, level, bp);
187         if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
188                         XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
189                 if (bp)
190                         trace_xfs_btree_corrupt(bp, _RET_IP_);
191                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
192                 return -EFSCORRUPTED;
193         }
194         return 0;
195 }
196
197 /*
198  * Debug routine: check that block header is ok.
199  */
200 int
201 xfs_btree_check_block(
202         struct xfs_btree_cur    *cur,   /* btree cursor */
203         struct xfs_btree_block  *block, /* generic btree block pointer */
204         int                     level,  /* level of the btree block */
205         struct xfs_buf          *bp)    /* buffer containing block, if any */
206 {
207         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
208                 return xfs_btree_check_lblock(cur, block, level, bp);
209         else
210                 return xfs_btree_check_sblock(cur, block, level, bp);
211 }
212
213 /* Check that this long pointer is valid and points within the fs. */
214 bool
215 xfs_btree_check_lptr(
216         struct xfs_btree_cur    *cur,
217         xfs_fsblock_t           fsbno,
218         int                     level)
219 {
220         if (level <= 0)
221                 return false;
222         return xfs_verify_fsbno(cur->bc_mp, fsbno);
223 }
224
225 /* Check that this short pointer is valid and points within the AG. */
226 bool
227 xfs_btree_check_sptr(
228         struct xfs_btree_cur    *cur,
229         xfs_agblock_t           agbno,
230         int                     level)
231 {
232         if (level <= 0)
233                 return false;
234         return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
235 }
236
237 #ifdef DEBUG
238 /*
239  * Check that a given (indexed) btree pointer at a certain level of a
240  * btree is valid and doesn't point past where it should.
241  */
242 static int
243 xfs_btree_check_ptr(
244         struct xfs_btree_cur    *cur,
245         union xfs_btree_ptr     *ptr,
246         int                     index,
247         int                     level)
248 {
249         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
250                 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
251                                 xfs_btree_check_lptr(cur,
252                                         be64_to_cpu((&ptr->l)[index]), level));
253         } else {
254                 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
255                                 xfs_btree_check_sptr(cur,
256                                         be32_to_cpu((&ptr->s)[index]), level));
257         }
258
259         return 0;
260 }
261 #endif
262
263 /*
264  * Calculate CRC on the whole btree block and stuff it into the
265  * long-form btree header.
266  *
267  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
268  * it into the buffer so recovery knows what the last modification was that made
269  * it to disk.
270  */
271 void
272 xfs_btree_lblock_calc_crc(
273         struct xfs_buf          *bp)
274 {
275         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
276         struct xfs_buf_log_item *bip = bp->b_log_item;
277
278         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
279                 return;
280         if (bip)
281                 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
282         xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
283 }
284
285 bool
286 xfs_btree_lblock_verify_crc(
287         struct xfs_buf          *bp)
288 {
289         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
290         struct xfs_mount        *mp = bp->b_target->bt_mount;
291
292         if (xfs_sb_version_hascrc(&mp->m_sb)) {
293                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
294                         return false;
295                 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
296         }
297
298         return true;
299 }
300
301 /*
302  * Calculate CRC on the whole btree block and stuff it into the
303  * short-form btree header.
304  *
305  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
306  * it into the buffer so recovery knows what the last modification was that made
307  * it to disk.
308  */
309 void
310 xfs_btree_sblock_calc_crc(
311         struct xfs_buf          *bp)
312 {
313         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
314         struct xfs_buf_log_item *bip = bp->b_log_item;
315
316         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
317                 return;
318         if (bip)
319                 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
320         xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
321 }
322
323 bool
324 xfs_btree_sblock_verify_crc(
325         struct xfs_buf          *bp)
326 {
327         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
328         struct xfs_mount        *mp = bp->b_target->bt_mount;
329
330         if (xfs_sb_version_hascrc(&mp->m_sb)) {
331                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
332                         return __this_address;
333                 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
334         }
335
336         return true;
337 }
338
339 static int
340 xfs_btree_free_block(
341         struct xfs_btree_cur    *cur,
342         struct xfs_buf          *bp)
343 {
344         int                     error;
345
346         error = cur->bc_ops->free_block(cur, bp);
347         if (!error) {
348                 xfs_trans_binval(cur->bc_tp, bp);
349                 XFS_BTREE_STATS_INC(cur, free);
350         }
351         return error;
352 }
353
354 /*
355  * Delete the btree cursor.
356  */
357 void
358 xfs_btree_del_cursor(
359         xfs_btree_cur_t *cur,           /* btree cursor */
360         int             error)          /* del because of error */
361 {
362         int             i;              /* btree level */
363
364         /*
365          * Clear the buffer pointers, and release the buffers.
366          * If we're doing this in the face of an error, we
367          * need to make sure to inspect all of the entries
368          * in the bc_bufs array for buffers to be unlocked.
369          * This is because some of the btree code works from
370          * level n down to 0, and if we get an error along
371          * the way we won't have initialized all the entries
372          * down to 0.
373          */
374         for (i = 0; i < cur->bc_nlevels; i++) {
375                 if (cur->bc_bufs[i])
376                         xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
377                 else if (!error)
378                         break;
379         }
380         /*
381          * Can't free a bmap cursor without having dealt with the
382          * allocated indirect blocks' accounting.
383          */
384         ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
385                cur->bc_private.b.allocated == 0);
386         /*
387          * Free the cursor.
388          */
389         kmem_zone_free(xfs_btree_cur_zone, cur);
390 }
391
392 /*
393  * Duplicate the btree cursor.
394  * Allocate a new one, copy the record, re-get the buffers.
395  */
396 int                                     /* error */
397 xfs_btree_dup_cursor(
398         xfs_btree_cur_t *cur,           /* input cursor */
399         xfs_btree_cur_t **ncur)         /* output cursor */
400 {
401         xfs_buf_t       *bp;            /* btree block's buffer pointer */
402         int             error;          /* error return value */
403         int             i;              /* level number of btree block */
404         xfs_mount_t     *mp;            /* mount structure for filesystem */
405         xfs_btree_cur_t *new;           /* new cursor value */
406         xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
407
408         tp = cur->bc_tp;
409         mp = cur->bc_mp;
410
411         /*
412          * Allocate a new cursor like the old one.
413          */
414         new = cur->bc_ops->dup_cursor(cur);
415
416         /*
417          * Copy the record currently in the cursor.
418          */
419         new->bc_rec = cur->bc_rec;
420
421         /*
422          * For each level current, re-get the buffer and copy the ptr value.
423          */
424         for (i = 0; i < new->bc_nlevels; i++) {
425                 new->bc_ptrs[i] = cur->bc_ptrs[i];
426                 new->bc_ra[i] = cur->bc_ra[i];
427                 bp = cur->bc_bufs[i];
428                 if (bp) {
429                         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
430                                                    XFS_BUF_ADDR(bp), mp->m_bsize,
431                                                    0, &bp,
432                                                    cur->bc_ops->buf_ops);
433                         if (error) {
434                                 xfs_btree_del_cursor(new, error);
435                                 *ncur = NULL;
436                                 return error;
437                         }
438                 }
439                 new->bc_bufs[i] = bp;
440         }
441         *ncur = new;
442         return 0;
443 }
444
445 /*
446  * XFS btree block layout and addressing:
447  *
448  * There are two types of blocks in the btree: leaf and non-leaf blocks.
449  *
450  * The leaf record start with a header then followed by records containing
451  * the values.  A non-leaf block also starts with the same header, and
452  * then first contains lookup keys followed by an equal number of pointers
453  * to the btree blocks at the previous level.
454  *
455  *              +--------+-------+-------+-------+-------+-------+-------+
456  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
457  *              +--------+-------+-------+-------+-------+-------+-------+
458  *
459  *              +--------+-------+-------+-------+-------+-------+-------+
460  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
461  *              +--------+-------+-------+-------+-------+-------+-------+
462  *
463  * The header is called struct xfs_btree_block for reasons better left unknown
464  * and comes in different versions for short (32bit) and long (64bit) block
465  * pointers.  The record and key structures are defined by the btree instances
466  * and opaque to the btree core.  The block pointers are simple disk endian
467  * integers, available in a short (32bit) and long (64bit) variant.
468  *
469  * The helpers below calculate the offset of a given record, key or pointer
470  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
471  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
472  * inside the btree block is done using indices starting at one, not zero!
473  *
474  * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
475  * overlapping intervals.  In such a tree, records are still sorted lowest to
476  * highest and indexed by the smallest key value that refers to the record.
477  * However, nodes are different: each pointer has two associated keys -- one
478  * indexing the lowest key available in the block(s) below (the same behavior
479  * as the key in a regular btree) and another indexing the highest key
480  * available in the block(s) below.  Because records are /not/ sorted by the
481  * highest key, all leaf block updates require us to compute the highest key
482  * that matches any record in the leaf and to recursively update the high keys
483  * in the nodes going further up in the tree, if necessary.  Nodes look like
484  * this:
485  *
486  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
487  * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
488  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
489  *
490  * To perform an interval query on an overlapped tree, perform the usual
491  * depth-first search and use the low and high keys to decide if we can skip
492  * that particular node.  If a leaf node is reached, return the records that
493  * intersect the interval.  Note that an interval query may return numerous
494  * entries.  For a non-overlapped tree, simply search for the record associated
495  * with the lowest key and iterate forward until a non-matching record is
496  * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
497  * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
498  * more detail.
499  *
500  * Why do we care about overlapping intervals?  Let's say you have a bunch of
501  * reverse mapping records on a reflink filesystem:
502  *
503  * 1: +- file A startblock B offset C length D -----------+
504  * 2:      +- file E startblock F offset G length H --------------+
505  * 3:      +- file I startblock F offset J length K --+
506  * 4:                                                        +- file L... --+
507  *
508  * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
509  * we'd simply increment the length of record 1.  But how do we find the record
510  * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
511  * record 3 because the keys are ordered first by startblock.  An interval
512  * query would return records 1 and 2 because they both overlap (B+D-1), and
513  * from that we can pick out record 1 as the appropriate left neighbor.
514  *
515  * In the non-overlapped case you can do a LE lookup and decrement the cursor
516  * because a record's interval must end before the next record.
517  */
518
519 /*
520  * Return size of the btree block header for this btree instance.
521  */
522 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
523 {
524         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
525                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
526                         return XFS_BTREE_LBLOCK_CRC_LEN;
527                 return XFS_BTREE_LBLOCK_LEN;
528         }
529         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
530                 return XFS_BTREE_SBLOCK_CRC_LEN;
531         return XFS_BTREE_SBLOCK_LEN;
532 }
533
534 /*
535  * Return size of btree block pointers for this btree instance.
536  */
537 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
538 {
539         return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
540                 sizeof(__be64) : sizeof(__be32);
541 }
542
543 /*
544  * Calculate offset of the n-th record in a btree block.
545  */
546 STATIC size_t
547 xfs_btree_rec_offset(
548         struct xfs_btree_cur    *cur,
549         int                     n)
550 {
551         return xfs_btree_block_len(cur) +
552                 (n - 1) * cur->bc_ops->rec_len;
553 }
554
555 /*
556  * Calculate offset of the n-th key in a btree block.
557  */
558 STATIC size_t
559 xfs_btree_key_offset(
560         struct xfs_btree_cur    *cur,
561         int                     n)
562 {
563         return xfs_btree_block_len(cur) +
564                 (n - 1) * cur->bc_ops->key_len;
565 }
566
567 /*
568  * Calculate offset of the n-th high key in a btree block.
569  */
570 STATIC size_t
571 xfs_btree_high_key_offset(
572         struct xfs_btree_cur    *cur,
573         int                     n)
574 {
575         return xfs_btree_block_len(cur) +
576                 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
577 }
578
579 /*
580  * Calculate offset of the n-th block pointer in a btree block.
581  */
582 STATIC size_t
583 xfs_btree_ptr_offset(
584         struct xfs_btree_cur    *cur,
585         int                     n,
586         int                     level)
587 {
588         return xfs_btree_block_len(cur) +
589                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
590                 (n - 1) * xfs_btree_ptr_len(cur);
591 }
592
593 /*
594  * Return a pointer to the n-th record in the btree block.
595  */
596 union xfs_btree_rec *
597 xfs_btree_rec_addr(
598         struct xfs_btree_cur    *cur,
599         int                     n,
600         struct xfs_btree_block  *block)
601 {
602         return (union xfs_btree_rec *)
603                 ((char *)block + xfs_btree_rec_offset(cur, n));
604 }
605
606 /*
607  * Return a pointer to the n-th key in the btree block.
608  */
609 union xfs_btree_key *
610 xfs_btree_key_addr(
611         struct xfs_btree_cur    *cur,
612         int                     n,
613         struct xfs_btree_block  *block)
614 {
615         return (union xfs_btree_key *)
616                 ((char *)block + xfs_btree_key_offset(cur, n));
617 }
618
619 /*
620  * Return a pointer to the n-th high key in the btree block.
621  */
622 union xfs_btree_key *
623 xfs_btree_high_key_addr(
624         struct xfs_btree_cur    *cur,
625         int                     n,
626         struct xfs_btree_block  *block)
627 {
628         return (union xfs_btree_key *)
629                 ((char *)block + xfs_btree_high_key_offset(cur, n));
630 }
631
632 /*
633  * Return a pointer to the n-th block pointer in the btree block.
634  */
635 union xfs_btree_ptr *
636 xfs_btree_ptr_addr(
637         struct xfs_btree_cur    *cur,
638         int                     n,
639         struct xfs_btree_block  *block)
640 {
641         int                     level = xfs_btree_get_level(block);
642
643         ASSERT(block->bb_level != 0);
644
645         return (union xfs_btree_ptr *)
646                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
647 }
648
649 /*
650  * Get the root block which is stored in the inode.
651  *
652  * For now this btree implementation assumes the btree root is always
653  * stored in the if_broot field of an inode fork.
654  */
655 STATIC struct xfs_btree_block *
656 xfs_btree_get_iroot(
657         struct xfs_btree_cur    *cur)
658 {
659         struct xfs_ifork        *ifp;
660
661         ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
662         return (struct xfs_btree_block *)ifp->if_broot;
663 }
664
665 /*
666  * Retrieve the block pointer from the cursor at the given level.
667  * This may be an inode btree root or from a buffer.
668  */
669 struct xfs_btree_block *                /* generic btree block pointer */
670 xfs_btree_get_block(
671         struct xfs_btree_cur    *cur,   /* btree cursor */
672         int                     level,  /* level in btree */
673         struct xfs_buf          **bpp)  /* buffer containing the block */
674 {
675         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
676             (level == cur->bc_nlevels - 1)) {
677                 *bpp = NULL;
678                 return xfs_btree_get_iroot(cur);
679         }
680
681         *bpp = cur->bc_bufs[level];
682         return XFS_BUF_TO_BLOCK(*bpp);
683 }
684
685 /*
686  * Get a buffer for the block, return it with no data read.
687  * Long-form addressing.
688  */
689 xfs_buf_t *                             /* buffer for fsbno */
690 xfs_btree_get_bufl(
691         xfs_mount_t     *mp,            /* file system mount point */
692         xfs_trans_t     *tp,            /* transaction pointer */
693         xfs_fsblock_t   fsbno,          /* file system block number */
694         uint            lock)           /* lock flags for get_buf */
695 {
696         xfs_daddr_t             d;              /* real disk block address */
697
698         ASSERT(fsbno != NULLFSBLOCK);
699         d = XFS_FSB_TO_DADDR(mp, fsbno);
700         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
701 }
702
703 /*
704  * Get a buffer for the block, return it with no data read.
705  * Short-form addressing.
706  */
707 xfs_buf_t *                             /* buffer for agno/agbno */
708 xfs_btree_get_bufs(
709         xfs_mount_t     *mp,            /* file system mount point */
710         xfs_trans_t     *tp,            /* transaction pointer */
711         xfs_agnumber_t  agno,           /* allocation group number */
712         xfs_agblock_t   agbno,          /* allocation group block number */
713         uint            lock)           /* lock flags for get_buf */
714 {
715         xfs_daddr_t             d;              /* real disk block address */
716
717         ASSERT(agno != NULLAGNUMBER);
718         ASSERT(agbno != NULLAGBLOCK);
719         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
720         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
721 }
722
723 /*
724  * Check for the cursor referring to the last block at the given level.
725  */
726 int                                     /* 1=is last block, 0=not last block */
727 xfs_btree_islastblock(
728         xfs_btree_cur_t         *cur,   /* btree cursor */
729         int                     level)  /* level to check */
730 {
731         struct xfs_btree_block  *block; /* generic btree block pointer */
732         xfs_buf_t               *bp;    /* buffer containing block */
733
734         block = xfs_btree_get_block(cur, level, &bp);
735         xfs_btree_check_block(cur, block, level, bp);
736         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
737                 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
738         else
739                 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
740 }
741
742 /*
743  * Change the cursor to point to the first record at the given level.
744  * Other levels are unaffected.
745  */
746 STATIC int                              /* success=1, failure=0 */
747 xfs_btree_firstrec(
748         xfs_btree_cur_t         *cur,   /* btree cursor */
749         int                     level)  /* level to change */
750 {
751         struct xfs_btree_block  *block; /* generic btree block pointer */
752         xfs_buf_t               *bp;    /* buffer containing block */
753
754         /*
755          * Get the block pointer for this level.
756          */
757         block = xfs_btree_get_block(cur, level, &bp);
758         if (xfs_btree_check_block(cur, block, level, bp))
759                 return 0;
760         /*
761          * It's empty, there is no such record.
762          */
763         if (!block->bb_numrecs)
764                 return 0;
765         /*
766          * Set the ptr value to 1, that's the first record/key.
767          */
768         cur->bc_ptrs[level] = 1;
769         return 1;
770 }
771
772 /*
773  * Change the cursor to point to the last record in the current block
774  * at the given level.  Other levels are unaffected.
775  */
776 STATIC int                              /* success=1, failure=0 */
777 xfs_btree_lastrec(
778         xfs_btree_cur_t         *cur,   /* btree cursor */
779         int                     level)  /* level to change */
780 {
781         struct xfs_btree_block  *block; /* generic btree block pointer */
782         xfs_buf_t               *bp;    /* buffer containing block */
783
784         /*
785          * Get the block pointer for this level.
786          */
787         block = xfs_btree_get_block(cur, level, &bp);
788         if (xfs_btree_check_block(cur, block, level, bp))
789                 return 0;
790         /*
791          * It's empty, there is no such record.
792          */
793         if (!block->bb_numrecs)
794                 return 0;
795         /*
796          * Set the ptr value to numrecs, that's the last record/key.
797          */
798         cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
799         return 1;
800 }
801
802 /*
803  * Compute first and last byte offsets for the fields given.
804  * Interprets the offsets table, which contains struct field offsets.
805  */
806 void
807 xfs_btree_offsets(
808         int64_t         fields,         /* bitmask of fields */
809         const short     *offsets,       /* table of field offsets */
810         int             nbits,          /* number of bits to inspect */
811         int             *first,         /* output: first byte offset */
812         int             *last)          /* output: last byte offset */
813 {
814         int             i;              /* current bit number */
815         int64_t         imask;          /* mask for current bit number */
816
817         ASSERT(fields != 0);
818         /*
819          * Find the lowest bit, so the first byte offset.
820          */
821         for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
822                 if (imask & fields) {
823                         *first = offsets[i];
824                         break;
825                 }
826         }
827         /*
828          * Find the highest bit, so the last byte offset.
829          */
830         for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
831                 if (imask & fields) {
832                         *last = offsets[i + 1] - 1;
833                         break;
834                 }
835         }
836 }
837
838 /*
839  * Get a buffer for the block, return it read in.
840  * Long-form addressing.
841  */
842 int
843 xfs_btree_read_bufl(
844         struct xfs_mount        *mp,            /* file system mount point */
845         struct xfs_trans        *tp,            /* transaction pointer */
846         xfs_fsblock_t           fsbno,          /* file system block number */
847         uint                    lock,           /* lock flags for read_buf */
848         struct xfs_buf          **bpp,          /* buffer for fsbno */
849         int                     refval,         /* ref count value for buffer */
850         const struct xfs_buf_ops *ops)
851 {
852         struct xfs_buf          *bp;            /* return value */
853         xfs_daddr_t             d;              /* real disk block address */
854         int                     error;
855
856         if (!xfs_verify_fsbno(mp, fsbno))
857                 return -EFSCORRUPTED;
858         d = XFS_FSB_TO_DADDR(mp, fsbno);
859         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
860                                    mp->m_bsize, lock, &bp, ops);
861         if (error)
862                 return error;
863         if (bp)
864                 xfs_buf_set_ref(bp, refval);
865         *bpp = bp;
866         return 0;
867 }
868
869 /*
870  * Read-ahead the block, don't wait for it, don't return a buffer.
871  * Long-form addressing.
872  */
873 /* ARGSUSED */
874 void
875 xfs_btree_reada_bufl(
876         struct xfs_mount        *mp,            /* file system mount point */
877         xfs_fsblock_t           fsbno,          /* file system block number */
878         xfs_extlen_t            count,          /* count of filesystem blocks */
879         const struct xfs_buf_ops *ops)
880 {
881         xfs_daddr_t             d;
882
883         ASSERT(fsbno != NULLFSBLOCK);
884         d = XFS_FSB_TO_DADDR(mp, fsbno);
885         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
886 }
887
888 /*
889  * Read-ahead the block, don't wait for it, don't return a buffer.
890  * Short-form addressing.
891  */
892 /* ARGSUSED */
893 void
894 xfs_btree_reada_bufs(
895         struct xfs_mount        *mp,            /* file system mount point */
896         xfs_agnumber_t          agno,           /* allocation group number */
897         xfs_agblock_t           agbno,          /* allocation group block number */
898         xfs_extlen_t            count,          /* count of filesystem blocks */
899         const struct xfs_buf_ops *ops)
900 {
901         xfs_daddr_t             d;
902
903         ASSERT(agno != NULLAGNUMBER);
904         ASSERT(agbno != NULLAGBLOCK);
905         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
906         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
907 }
908
909 STATIC int
910 xfs_btree_readahead_lblock(
911         struct xfs_btree_cur    *cur,
912         int                     lr,
913         struct xfs_btree_block  *block)
914 {
915         int                     rval = 0;
916         xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
917         xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
918
919         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
920                 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
921                                      cur->bc_ops->buf_ops);
922                 rval++;
923         }
924
925         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
926                 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
927                                      cur->bc_ops->buf_ops);
928                 rval++;
929         }
930
931         return rval;
932 }
933
934 STATIC int
935 xfs_btree_readahead_sblock(
936         struct xfs_btree_cur    *cur,
937         int                     lr,
938         struct xfs_btree_block *block)
939 {
940         int                     rval = 0;
941         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
942         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
943
944
945         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
946                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
947                                      left, 1, cur->bc_ops->buf_ops);
948                 rval++;
949         }
950
951         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
952                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
953                                      right, 1, cur->bc_ops->buf_ops);
954                 rval++;
955         }
956
957         return rval;
958 }
959
960 /*
961  * Read-ahead btree blocks, at the given level.
962  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
963  */
964 STATIC int
965 xfs_btree_readahead(
966         struct xfs_btree_cur    *cur,           /* btree cursor */
967         int                     lev,            /* level in btree */
968         int                     lr)             /* left/right bits */
969 {
970         struct xfs_btree_block  *block;
971
972         /*
973          * No readahead needed if we are at the root level and the
974          * btree root is stored in the inode.
975          */
976         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
977             (lev == cur->bc_nlevels - 1))
978                 return 0;
979
980         if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
981                 return 0;
982
983         cur->bc_ra[lev] |= lr;
984         block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
985
986         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
987                 return xfs_btree_readahead_lblock(cur, lr, block);
988         return xfs_btree_readahead_sblock(cur, lr, block);
989 }
990
991 STATIC xfs_daddr_t
992 xfs_btree_ptr_to_daddr(
993         struct xfs_btree_cur    *cur,
994         union xfs_btree_ptr     *ptr)
995 {
996         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
997                 ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
998
999                 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
1000         } else {
1001                 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
1002                 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
1003
1004                 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
1005                                         be32_to_cpu(ptr->s));
1006         }
1007 }
1008
1009 /*
1010  * Readahead @count btree blocks at the given @ptr location.
1011  *
1012  * We don't need to care about long or short form btrees here as we have a
1013  * method of converting the ptr directly to a daddr available to us.
1014  */
1015 STATIC void
1016 xfs_btree_readahead_ptr(
1017         struct xfs_btree_cur    *cur,
1018         union xfs_btree_ptr     *ptr,
1019         xfs_extlen_t            count)
1020 {
1021         xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
1022                           xfs_btree_ptr_to_daddr(cur, ptr),
1023                           cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
1024 }
1025
1026 /*
1027  * Set the buffer for level "lev" in the cursor to bp, releasing
1028  * any previous buffer.
1029  */
1030 STATIC void
1031 xfs_btree_setbuf(
1032         xfs_btree_cur_t         *cur,   /* btree cursor */
1033         int                     lev,    /* level in btree */
1034         xfs_buf_t               *bp)    /* new buffer to set */
1035 {
1036         struct xfs_btree_block  *b;     /* btree block */
1037
1038         if (cur->bc_bufs[lev])
1039                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
1040         cur->bc_bufs[lev] = bp;
1041         cur->bc_ra[lev] = 0;
1042
1043         b = XFS_BUF_TO_BLOCK(bp);
1044         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1045                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1046                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1047                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1048                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1049         } else {
1050                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1051                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1052                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1053                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1054         }
1055 }
1056
1057 bool
1058 xfs_btree_ptr_is_null(
1059         struct xfs_btree_cur    *cur,
1060         union xfs_btree_ptr     *ptr)
1061 {
1062         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1063                 return ptr->l == cpu_to_be64(NULLFSBLOCK);
1064         else
1065                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
1066 }
1067
1068 STATIC void
1069 xfs_btree_set_ptr_null(
1070         struct xfs_btree_cur    *cur,
1071         union xfs_btree_ptr     *ptr)
1072 {
1073         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1074                 ptr->l = cpu_to_be64(NULLFSBLOCK);
1075         else
1076                 ptr->s = cpu_to_be32(NULLAGBLOCK);
1077 }
1078
1079 /*
1080  * Get/set/init sibling pointers
1081  */
1082 void
1083 xfs_btree_get_sibling(
1084         struct xfs_btree_cur    *cur,
1085         struct xfs_btree_block  *block,
1086         union xfs_btree_ptr     *ptr,
1087         int                     lr)
1088 {
1089         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1090
1091         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1092                 if (lr == XFS_BB_RIGHTSIB)
1093                         ptr->l = block->bb_u.l.bb_rightsib;
1094                 else
1095                         ptr->l = block->bb_u.l.bb_leftsib;
1096         } else {
1097                 if (lr == XFS_BB_RIGHTSIB)
1098                         ptr->s = block->bb_u.s.bb_rightsib;
1099                 else
1100                         ptr->s = block->bb_u.s.bb_leftsib;
1101         }
1102 }
1103
1104 STATIC void
1105 xfs_btree_set_sibling(
1106         struct xfs_btree_cur    *cur,
1107         struct xfs_btree_block  *block,
1108         union xfs_btree_ptr     *ptr,
1109         int                     lr)
1110 {
1111         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1112
1113         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1114                 if (lr == XFS_BB_RIGHTSIB)
1115                         block->bb_u.l.bb_rightsib = ptr->l;
1116                 else
1117                         block->bb_u.l.bb_leftsib = ptr->l;
1118         } else {
1119                 if (lr == XFS_BB_RIGHTSIB)
1120                         block->bb_u.s.bb_rightsib = ptr->s;
1121                 else
1122                         block->bb_u.s.bb_leftsib = ptr->s;
1123         }
1124 }
1125
1126 void
1127 xfs_btree_init_block_int(
1128         struct xfs_mount        *mp,
1129         struct xfs_btree_block  *buf,
1130         xfs_daddr_t             blkno,
1131         xfs_btnum_t             btnum,
1132         __u16                   level,
1133         __u16                   numrecs,
1134         __u64                   owner,
1135         unsigned int            flags)
1136 {
1137         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
1138         __u32                   magic = xfs_btree_magic(crc, btnum);
1139
1140         buf->bb_magic = cpu_to_be32(magic);
1141         buf->bb_level = cpu_to_be16(level);
1142         buf->bb_numrecs = cpu_to_be16(numrecs);
1143
1144         if (flags & XFS_BTREE_LONG_PTRS) {
1145                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1146                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1147                 if (crc) {
1148                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1149                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1150                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1151                         buf->bb_u.l.bb_pad = 0;
1152                         buf->bb_u.l.bb_lsn = 0;
1153                 }
1154         } else {
1155                 /* owner is a 32 bit value on short blocks */
1156                 __u32 __owner = (__u32)owner;
1157
1158                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1159                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1160                 if (crc) {
1161                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1162                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1163                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1164                         buf->bb_u.s.bb_lsn = 0;
1165                 }
1166         }
1167 }
1168
1169 void
1170 xfs_btree_init_block(
1171         struct xfs_mount *mp,
1172         struct xfs_buf  *bp,
1173         xfs_btnum_t     btnum,
1174         __u16           level,
1175         __u16           numrecs,
1176         __u64           owner,
1177         unsigned int    flags)
1178 {
1179         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1180                                  btnum, level, numrecs, owner, flags);
1181 }
1182
1183 STATIC void
1184 xfs_btree_init_block_cur(
1185         struct xfs_btree_cur    *cur,
1186         struct xfs_buf          *bp,
1187         int                     level,
1188         int                     numrecs)
1189 {
1190         __u64                   owner;
1191
1192         /*
1193          * we can pull the owner from the cursor right now as the different
1194          * owners align directly with the pointer size of the btree. This may
1195          * change in future, but is safe for current users of the generic btree
1196          * code.
1197          */
1198         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1199                 owner = cur->bc_private.b.ip->i_ino;
1200         else
1201                 owner = cur->bc_private.a.agno;
1202
1203         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1204                                  cur->bc_btnum, level, numrecs,
1205                                  owner, cur->bc_flags);
1206 }
1207
1208 /*
1209  * Return true if ptr is the last record in the btree and
1210  * we need to track updates to this record.  The decision
1211  * will be further refined in the update_lastrec method.
1212  */
1213 STATIC int
1214 xfs_btree_is_lastrec(
1215         struct xfs_btree_cur    *cur,
1216         struct xfs_btree_block  *block,
1217         int                     level)
1218 {
1219         union xfs_btree_ptr     ptr;
1220
1221         if (level > 0)
1222                 return 0;
1223         if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1224                 return 0;
1225
1226         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1227         if (!xfs_btree_ptr_is_null(cur, &ptr))
1228                 return 0;
1229         return 1;
1230 }
1231
1232 STATIC void
1233 xfs_btree_buf_to_ptr(
1234         struct xfs_btree_cur    *cur,
1235         struct xfs_buf          *bp,
1236         union xfs_btree_ptr     *ptr)
1237 {
1238         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1239                 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1240                                         XFS_BUF_ADDR(bp)));
1241         else {
1242                 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1243                                         XFS_BUF_ADDR(bp)));
1244         }
1245 }
1246
1247 STATIC void
1248 xfs_btree_set_refs(
1249         struct xfs_btree_cur    *cur,
1250         struct xfs_buf          *bp)
1251 {
1252         switch (cur->bc_btnum) {
1253         case XFS_BTNUM_BNO:
1254         case XFS_BTNUM_CNT:
1255                 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1256                 break;
1257         case XFS_BTNUM_INO:
1258         case XFS_BTNUM_FINO:
1259                 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1260                 break;
1261         case XFS_BTNUM_BMAP:
1262                 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1263                 break;
1264         case XFS_BTNUM_RMAP:
1265                 xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1266                 break;
1267         case XFS_BTNUM_REFC:
1268                 xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1269                 break;
1270         default:
1271                 ASSERT(0);
1272         }
1273 }
1274
1275 STATIC int
1276 xfs_btree_get_buf_block(
1277         struct xfs_btree_cur    *cur,
1278         union xfs_btree_ptr     *ptr,
1279         int                     flags,
1280         struct xfs_btree_block  **block,
1281         struct xfs_buf          **bpp)
1282 {
1283         struct xfs_mount        *mp = cur->bc_mp;
1284         xfs_daddr_t             d;
1285
1286         /* need to sort out how callers deal with failures first */
1287         ASSERT(!(flags & XBF_TRYLOCK));
1288
1289         d = xfs_btree_ptr_to_daddr(cur, ptr);
1290         *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1291                                  mp->m_bsize, flags);
1292
1293         if (!*bpp)
1294                 return -ENOMEM;
1295
1296         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1297         *block = XFS_BUF_TO_BLOCK(*bpp);
1298         return 0;
1299 }
1300
1301 /*
1302  * Read in the buffer at the given ptr and return the buffer and
1303  * the block pointer within the buffer.
1304  */
1305 STATIC int
1306 xfs_btree_read_buf_block(
1307         struct xfs_btree_cur    *cur,
1308         union xfs_btree_ptr     *ptr,
1309         int                     flags,
1310         struct xfs_btree_block  **block,
1311         struct xfs_buf          **bpp)
1312 {
1313         struct xfs_mount        *mp = cur->bc_mp;
1314         xfs_daddr_t             d;
1315         int                     error;
1316
1317         /* need to sort out how callers deal with failures first */
1318         ASSERT(!(flags & XBF_TRYLOCK));
1319
1320         d = xfs_btree_ptr_to_daddr(cur, ptr);
1321         error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1322                                    mp->m_bsize, flags, bpp,
1323                                    cur->bc_ops->buf_ops);
1324         if (error)
1325                 return error;
1326
1327         xfs_btree_set_refs(cur, *bpp);
1328         *block = XFS_BUF_TO_BLOCK(*bpp);
1329         return 0;
1330 }
1331
1332 /*
1333  * Copy keys from one btree block to another.
1334  */
1335 STATIC void
1336 xfs_btree_copy_keys(
1337         struct xfs_btree_cur    *cur,
1338         union xfs_btree_key     *dst_key,
1339         union xfs_btree_key     *src_key,
1340         int                     numkeys)
1341 {
1342         ASSERT(numkeys >= 0);
1343         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1344 }
1345
1346 /*
1347  * Copy records from one btree block to another.
1348  */
1349 STATIC void
1350 xfs_btree_copy_recs(
1351         struct xfs_btree_cur    *cur,
1352         union xfs_btree_rec     *dst_rec,
1353         union xfs_btree_rec     *src_rec,
1354         int                     numrecs)
1355 {
1356         ASSERT(numrecs >= 0);
1357         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1358 }
1359
1360 /*
1361  * Copy block pointers from one btree block to another.
1362  */
1363 STATIC void
1364 xfs_btree_copy_ptrs(
1365         struct xfs_btree_cur    *cur,
1366         union xfs_btree_ptr     *dst_ptr,
1367         union xfs_btree_ptr     *src_ptr,
1368         int                     numptrs)
1369 {
1370         ASSERT(numptrs >= 0);
1371         memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1372 }
1373
1374 /*
1375  * Shift keys one index left/right inside a single btree block.
1376  */
1377 STATIC void
1378 xfs_btree_shift_keys(
1379         struct xfs_btree_cur    *cur,
1380         union xfs_btree_key     *key,
1381         int                     dir,
1382         int                     numkeys)
1383 {
1384         char                    *dst_key;
1385
1386         ASSERT(numkeys >= 0);
1387         ASSERT(dir == 1 || dir == -1);
1388
1389         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1390         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1391 }
1392
1393 /*
1394  * Shift records one index left/right inside a single btree block.
1395  */
1396 STATIC void
1397 xfs_btree_shift_recs(
1398         struct xfs_btree_cur    *cur,
1399         union xfs_btree_rec     *rec,
1400         int                     dir,
1401         int                     numrecs)
1402 {
1403         char                    *dst_rec;
1404
1405         ASSERT(numrecs >= 0);
1406         ASSERT(dir == 1 || dir == -1);
1407
1408         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1409         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1410 }
1411
1412 /*
1413  * Shift block pointers one index left/right inside a single btree block.
1414  */
1415 STATIC void
1416 xfs_btree_shift_ptrs(
1417         struct xfs_btree_cur    *cur,
1418         union xfs_btree_ptr     *ptr,
1419         int                     dir,
1420         int                     numptrs)
1421 {
1422         char                    *dst_ptr;
1423
1424         ASSERT(numptrs >= 0);
1425         ASSERT(dir == 1 || dir == -1);
1426
1427         dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1428         memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1429 }
1430
1431 /*
1432  * Log key values from the btree block.
1433  */
1434 STATIC void
1435 xfs_btree_log_keys(
1436         struct xfs_btree_cur    *cur,
1437         struct xfs_buf          *bp,
1438         int                     first,
1439         int                     last)
1440 {
1441
1442         if (bp) {
1443                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1444                 xfs_trans_log_buf(cur->bc_tp, bp,
1445                                   xfs_btree_key_offset(cur, first),
1446                                   xfs_btree_key_offset(cur, last + 1) - 1);
1447         } else {
1448                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1449                                 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1450         }
1451 }
1452
1453 /*
1454  * Log record values from the btree block.
1455  */
1456 void
1457 xfs_btree_log_recs(
1458         struct xfs_btree_cur    *cur,
1459         struct xfs_buf          *bp,
1460         int                     first,
1461         int                     last)
1462 {
1463
1464         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1465         xfs_trans_log_buf(cur->bc_tp, bp,
1466                           xfs_btree_rec_offset(cur, first),
1467                           xfs_btree_rec_offset(cur, last + 1) - 1);
1468
1469 }
1470
1471 /*
1472  * Log block pointer fields from a btree block (nonleaf).
1473  */
1474 STATIC void
1475 xfs_btree_log_ptrs(
1476         struct xfs_btree_cur    *cur,   /* btree cursor */
1477         struct xfs_buf          *bp,    /* buffer containing btree block */
1478         int                     first,  /* index of first pointer to log */
1479         int                     last)   /* index of last pointer to log */
1480 {
1481
1482         if (bp) {
1483                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1484                 int                     level = xfs_btree_get_level(block);
1485
1486                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1487                 xfs_trans_log_buf(cur->bc_tp, bp,
1488                                 xfs_btree_ptr_offset(cur, first, level),
1489                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1490         } else {
1491                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1492                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1493         }
1494
1495 }
1496
1497 /*
1498  * Log fields from a btree block header.
1499  */
1500 void
1501 xfs_btree_log_block(
1502         struct xfs_btree_cur    *cur,   /* btree cursor */
1503         struct xfs_buf          *bp,    /* buffer containing btree block */
1504         int                     fields) /* mask of fields: XFS_BB_... */
1505 {
1506         int                     first;  /* first byte offset logged */
1507         int                     last;   /* last byte offset logged */
1508         static const short      soffsets[] = {  /* table of offsets (short) */
1509                 offsetof(struct xfs_btree_block, bb_magic),
1510                 offsetof(struct xfs_btree_block, bb_level),
1511                 offsetof(struct xfs_btree_block, bb_numrecs),
1512                 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1513                 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1514                 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1515                 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1516                 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1517                 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1518                 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1519                 XFS_BTREE_SBLOCK_CRC_LEN
1520         };
1521         static const short      loffsets[] = {  /* table of offsets (long) */
1522                 offsetof(struct xfs_btree_block, bb_magic),
1523                 offsetof(struct xfs_btree_block, bb_level),
1524                 offsetof(struct xfs_btree_block, bb_numrecs),
1525                 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1526                 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1527                 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1528                 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1529                 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1530                 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1531                 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1532                 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1533                 XFS_BTREE_LBLOCK_CRC_LEN
1534         };
1535
1536         if (bp) {
1537                 int nbits;
1538
1539                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1540                         /*
1541                          * We don't log the CRC when updating a btree
1542                          * block but instead recreate it during log
1543                          * recovery.  As the log buffers have checksums
1544                          * of their own this is safe and avoids logging a crc
1545                          * update in a lot of places.
1546                          */
1547                         if (fields == XFS_BB_ALL_BITS)
1548                                 fields = XFS_BB_ALL_BITS_CRC;
1549                         nbits = XFS_BB_NUM_BITS_CRC;
1550                 } else {
1551                         nbits = XFS_BB_NUM_BITS;
1552                 }
1553                 xfs_btree_offsets(fields,
1554                                   (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1555                                         loffsets : soffsets,
1556                                   nbits, &first, &last);
1557                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1558                 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1559         } else {
1560                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1561                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1562         }
1563 }
1564
1565 /*
1566  * Increment cursor by one record at the level.
1567  * For nonzero levels the leaf-ward information is untouched.
1568  */
1569 int                                             /* error */
1570 xfs_btree_increment(
1571         struct xfs_btree_cur    *cur,
1572         int                     level,
1573         int                     *stat)          /* success/failure */
1574 {
1575         struct xfs_btree_block  *block;
1576         union xfs_btree_ptr     ptr;
1577         struct xfs_buf          *bp;
1578         int                     error;          /* error return value */
1579         int                     lev;
1580
1581         ASSERT(level < cur->bc_nlevels);
1582
1583         /* Read-ahead to the right at this level. */
1584         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1585
1586         /* Get a pointer to the btree block. */
1587         block = xfs_btree_get_block(cur, level, &bp);
1588
1589 #ifdef DEBUG
1590         error = xfs_btree_check_block(cur, block, level, bp);
1591         if (error)
1592                 goto error0;
1593 #endif
1594
1595         /* We're done if we remain in the block after the increment. */
1596         if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1597                 goto out1;
1598
1599         /* Fail if we just went off the right edge of the tree. */
1600         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1601         if (xfs_btree_ptr_is_null(cur, &ptr))
1602                 goto out0;
1603
1604         XFS_BTREE_STATS_INC(cur, increment);
1605
1606         /*
1607          * March up the tree incrementing pointers.
1608          * Stop when we don't go off the right edge of a block.
1609          */
1610         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1611                 block = xfs_btree_get_block(cur, lev, &bp);
1612
1613 #ifdef DEBUG
1614                 error = xfs_btree_check_block(cur, block, lev, bp);
1615                 if (error)
1616                         goto error0;
1617 #endif
1618
1619                 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1620                         break;
1621
1622                 /* Read-ahead the right block for the next loop. */
1623                 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1624         }
1625
1626         /*
1627          * If we went off the root then we are either seriously
1628          * confused or have the tree root in an inode.
1629          */
1630         if (lev == cur->bc_nlevels) {
1631                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1632                         goto out0;
1633                 ASSERT(0);
1634                 error = -EFSCORRUPTED;
1635                 goto error0;
1636         }
1637         ASSERT(lev < cur->bc_nlevels);
1638
1639         /*
1640          * Now walk back down the tree, fixing up the cursor's buffer
1641          * pointers and key numbers.
1642          */
1643         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1644                 union xfs_btree_ptr     *ptrp;
1645
1646                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1647                 --lev;
1648                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1649                 if (error)
1650                         goto error0;
1651
1652                 xfs_btree_setbuf(cur, lev, bp);
1653                 cur->bc_ptrs[lev] = 1;
1654         }
1655 out1:
1656         *stat = 1;
1657         return 0;
1658
1659 out0:
1660         *stat = 0;
1661         return 0;
1662
1663 error0:
1664         return error;
1665 }
1666
1667 /*
1668  * Decrement cursor by one record at the level.
1669  * For nonzero levels the leaf-ward information is untouched.
1670  */
1671 int                                             /* error */
1672 xfs_btree_decrement(
1673         struct xfs_btree_cur    *cur,
1674         int                     level,
1675         int                     *stat)          /* success/failure */
1676 {
1677         struct xfs_btree_block  *block;
1678         xfs_buf_t               *bp;
1679         int                     error;          /* error return value */
1680         int                     lev;
1681         union xfs_btree_ptr     ptr;
1682
1683         ASSERT(level < cur->bc_nlevels);
1684
1685         /* Read-ahead to the left at this level. */
1686         xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1687
1688         /* We're done if we remain in the block after the decrement. */
1689         if (--cur->bc_ptrs[level] > 0)
1690                 goto out1;
1691
1692         /* Get a pointer to the btree block. */
1693         block = xfs_btree_get_block(cur, level, &bp);
1694
1695 #ifdef DEBUG
1696         error = xfs_btree_check_block(cur, block, level, bp);
1697         if (error)
1698                 goto error0;
1699 #endif
1700
1701         /* Fail if we just went off the left edge of the tree. */
1702         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1703         if (xfs_btree_ptr_is_null(cur, &ptr))
1704                 goto out0;
1705
1706         XFS_BTREE_STATS_INC(cur, decrement);
1707
1708         /*
1709          * March up the tree decrementing pointers.
1710          * Stop when we don't go off the left edge of a block.
1711          */
1712         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1713                 if (--cur->bc_ptrs[lev] > 0)
1714                         break;
1715                 /* Read-ahead the left block for the next loop. */
1716                 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1717         }
1718
1719         /*
1720          * If we went off the root then we are seriously confused.
1721          * or the root of the tree is in an inode.
1722          */
1723         if (lev == cur->bc_nlevels) {
1724                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1725                         goto out0;
1726                 ASSERT(0);
1727                 error = -EFSCORRUPTED;
1728                 goto error0;
1729         }
1730         ASSERT(lev < cur->bc_nlevels);
1731
1732         /*
1733          * Now walk back down the tree, fixing up the cursor's buffer
1734          * pointers and key numbers.
1735          */
1736         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1737                 union xfs_btree_ptr     *ptrp;
1738
1739                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1740                 --lev;
1741                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1742                 if (error)
1743                         goto error0;
1744                 xfs_btree_setbuf(cur, lev, bp);
1745                 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1746         }
1747 out1:
1748         *stat = 1;
1749         return 0;
1750
1751 out0:
1752         *stat = 0;
1753         return 0;
1754
1755 error0:
1756         return error;
1757 }
1758
1759 int
1760 xfs_btree_lookup_get_block(
1761         struct xfs_btree_cur    *cur,   /* btree cursor */
1762         int                     level,  /* level in the btree */
1763         union xfs_btree_ptr     *pp,    /* ptr to btree block */
1764         struct xfs_btree_block  **blkp) /* return btree block */
1765 {
1766         struct xfs_buf          *bp;    /* buffer pointer for btree block */
1767         int                     error = 0;
1768
1769         /* special case the root block if in an inode */
1770         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1771             (level == cur->bc_nlevels - 1)) {
1772                 *blkp = xfs_btree_get_iroot(cur);
1773                 return 0;
1774         }
1775
1776         /*
1777          * If the old buffer at this level for the disk address we are
1778          * looking for re-use it.
1779          *
1780          * Otherwise throw it away and get a new one.
1781          */
1782         bp = cur->bc_bufs[level];
1783         if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1784                 *blkp = XFS_BUF_TO_BLOCK(bp);
1785                 return 0;
1786         }
1787
1788         error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1789         if (error)
1790                 return error;
1791
1792         /* Check the inode owner since the verifiers don't. */
1793         if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1794             !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
1795             (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1796             be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1797                         cur->bc_private.b.ip->i_ino)
1798                 goto out_bad;
1799
1800         /* Did we get the level we were looking for? */
1801         if (be16_to_cpu((*blkp)->bb_level) != level)
1802                 goto out_bad;
1803
1804         /* Check that internal nodes have at least one record. */
1805         if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1806                 goto out_bad;
1807
1808         xfs_btree_setbuf(cur, level, bp);
1809         return 0;
1810
1811 out_bad:
1812         *blkp = NULL;
1813         xfs_trans_brelse(cur->bc_tp, bp);
1814         return -EFSCORRUPTED;
1815 }
1816
1817 /*
1818  * Get current search key.  For level 0 we don't actually have a key
1819  * structure so we make one up from the record.  For all other levels
1820  * we just return the right key.
1821  */
1822 STATIC union xfs_btree_key *
1823 xfs_lookup_get_search_key(
1824         struct xfs_btree_cur    *cur,
1825         int                     level,
1826         int                     keyno,
1827         struct xfs_btree_block  *block,
1828         union xfs_btree_key     *kp)
1829 {
1830         if (level == 0) {
1831                 cur->bc_ops->init_key_from_rec(kp,
1832                                 xfs_btree_rec_addr(cur, keyno, block));
1833                 return kp;
1834         }
1835
1836         return xfs_btree_key_addr(cur, keyno, block);
1837 }
1838
1839 /*
1840  * Lookup the record.  The cursor is made to point to it, based on dir.
1841  * stat is set to 0 if can't find any such record, 1 for success.
1842  */
1843 int                                     /* error */
1844 xfs_btree_lookup(
1845         struct xfs_btree_cur    *cur,   /* btree cursor */
1846         xfs_lookup_t            dir,    /* <=, ==, or >= */
1847         int                     *stat)  /* success/failure */
1848 {
1849         struct xfs_btree_block  *block; /* current btree block */
1850         int64_t                 diff;   /* difference for the current key */
1851         int                     error;  /* error return value */
1852         int                     keyno;  /* current key number */
1853         int                     level;  /* level in the btree */
1854         union xfs_btree_ptr     *pp;    /* ptr to btree block */
1855         union xfs_btree_ptr     ptr;    /* ptr to btree block */
1856
1857         XFS_BTREE_STATS_INC(cur, lookup);
1858
1859         /* No such thing as a zero-level tree. */
1860         if (cur->bc_nlevels == 0)
1861                 return -EFSCORRUPTED;
1862
1863         block = NULL;
1864         keyno = 0;
1865
1866         /* initialise start pointer from cursor */
1867         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1868         pp = &ptr;
1869
1870         /*
1871          * Iterate over each level in the btree, starting at the root.
1872          * For each level above the leaves, find the key we need, based
1873          * on the lookup record, then follow the corresponding block
1874          * pointer down to the next level.
1875          */
1876         for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1877                 /* Get the block we need to do the lookup on. */
1878                 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1879                 if (error)
1880                         goto error0;
1881
1882                 if (diff == 0) {
1883                         /*
1884                          * If we already had a key match at a higher level, we
1885                          * know we need to use the first entry in this block.
1886                          */
1887                         keyno = 1;
1888                 } else {
1889                         /* Otherwise search this block. Do a binary search. */
1890
1891                         int     high;   /* high entry number */
1892                         int     low;    /* low entry number */
1893
1894                         /* Set low and high entry numbers, 1-based. */
1895                         low = 1;
1896                         high = xfs_btree_get_numrecs(block);
1897                         if (!high) {
1898                                 /* Block is empty, must be an empty leaf. */
1899                                 ASSERT(level == 0 && cur->bc_nlevels == 1);
1900
1901                                 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1902                                 *stat = 0;
1903                                 return 0;
1904                         }
1905
1906                         /* Binary search the block. */
1907                         while (low <= high) {
1908                                 union xfs_btree_key     key;
1909                                 union xfs_btree_key     *kp;
1910
1911                                 XFS_BTREE_STATS_INC(cur, compare);
1912
1913                                 /* keyno is average of low and high. */
1914                                 keyno = (low + high) >> 1;
1915
1916                                 /* Get current search key */
1917                                 kp = xfs_lookup_get_search_key(cur, level,
1918                                                 keyno, block, &key);
1919
1920                                 /*
1921                                  * Compute difference to get next direction:
1922                                  *  - less than, move right
1923                                  *  - greater than, move left
1924                                  *  - equal, we're done
1925                                  */
1926                                 diff = cur->bc_ops->key_diff(cur, kp);
1927                                 if (diff < 0)
1928                                         low = keyno + 1;
1929                                 else if (diff > 0)
1930                                         high = keyno - 1;
1931                                 else
1932                                         break;
1933                         }
1934                 }
1935
1936                 /*
1937                  * If there are more levels, set up for the next level
1938                  * by getting the block number and filling in the cursor.
1939                  */
1940                 if (level > 0) {
1941                         /*
1942                          * If we moved left, need the previous key number,
1943                          * unless there isn't one.
1944                          */
1945                         if (diff > 0 && --keyno < 1)
1946                                 keyno = 1;
1947                         pp = xfs_btree_ptr_addr(cur, keyno, block);
1948
1949 #ifdef DEBUG
1950                         error = xfs_btree_check_ptr(cur, pp, 0, level);
1951                         if (error)
1952                                 goto error0;
1953 #endif
1954                         cur->bc_ptrs[level] = keyno;
1955                 }
1956         }
1957
1958         /* Done with the search. See if we need to adjust the results. */
1959         if (dir != XFS_LOOKUP_LE && diff < 0) {
1960                 keyno++;
1961                 /*
1962                  * If ge search and we went off the end of the block, but it's
1963                  * not the last block, we're in the wrong block.
1964                  */
1965                 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1966                 if (dir == XFS_LOOKUP_GE &&
1967                     keyno > xfs_btree_get_numrecs(block) &&
1968                     !xfs_btree_ptr_is_null(cur, &ptr)) {
1969                         int     i;
1970
1971                         cur->bc_ptrs[0] = keyno;
1972                         error = xfs_btree_increment(cur, 0, &i);
1973                         if (error)
1974                                 goto error0;
1975                         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1976                         *stat = 1;
1977                         return 0;
1978                 }
1979         } else if (dir == XFS_LOOKUP_LE && diff > 0)
1980                 keyno--;
1981         cur->bc_ptrs[0] = keyno;
1982
1983         /* Return if we succeeded or not. */
1984         if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1985                 *stat = 0;
1986         else if (dir != XFS_LOOKUP_EQ || diff == 0)
1987                 *stat = 1;
1988         else
1989                 *stat = 0;
1990         return 0;
1991
1992 error0:
1993         return error;
1994 }
1995
1996 /* Find the high key storage area from a regular key. */
1997 union xfs_btree_key *
1998 xfs_btree_high_key_from_key(
1999         struct xfs_btree_cur    *cur,
2000         union xfs_btree_key     *key)
2001 {
2002         ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2003         return (union xfs_btree_key *)((char *)key +
2004                         (cur->bc_ops->key_len / 2));
2005 }
2006
2007 /* Determine the low (and high if overlapped) keys of a leaf block */
2008 STATIC void
2009 xfs_btree_get_leaf_keys(
2010         struct xfs_btree_cur    *cur,
2011         struct xfs_btree_block  *block,
2012         union xfs_btree_key     *key)
2013 {
2014         union xfs_btree_key     max_hkey;
2015         union xfs_btree_key     hkey;
2016         union xfs_btree_rec     *rec;
2017         union xfs_btree_key     *high;
2018         int                     n;
2019
2020         rec = xfs_btree_rec_addr(cur, 1, block);
2021         cur->bc_ops->init_key_from_rec(key, rec);
2022
2023         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2024
2025                 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2026                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2027                         rec = xfs_btree_rec_addr(cur, n, block);
2028                         cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2029                         if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2030                                         > 0)
2031                                 max_hkey = hkey;
2032                 }
2033
2034                 high = xfs_btree_high_key_from_key(cur, key);
2035                 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2036         }
2037 }
2038
2039 /* Determine the low (and high if overlapped) keys of a node block */
2040 STATIC void
2041 xfs_btree_get_node_keys(
2042         struct xfs_btree_cur    *cur,
2043         struct xfs_btree_block  *block,
2044         union xfs_btree_key     *key)
2045 {
2046         union xfs_btree_key     *hkey;
2047         union xfs_btree_key     *max_hkey;
2048         union xfs_btree_key     *high;
2049         int                     n;
2050
2051         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2052                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2053                                 cur->bc_ops->key_len / 2);
2054
2055                 max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2056                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2057                         hkey = xfs_btree_high_key_addr(cur, n, block);
2058                         if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2059                                 max_hkey = hkey;
2060                 }
2061
2062                 high = xfs_btree_high_key_from_key(cur, key);
2063                 memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2064         } else {
2065                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2066                                 cur->bc_ops->key_len);
2067         }
2068 }
2069
2070 /* Derive the keys for any btree block. */
2071 void
2072 xfs_btree_get_keys(
2073         struct xfs_btree_cur    *cur,
2074         struct xfs_btree_block  *block,
2075         union xfs_btree_key     *key)
2076 {
2077         if (be16_to_cpu(block->bb_level) == 0)
2078                 xfs_btree_get_leaf_keys(cur, block, key);
2079         else
2080                 xfs_btree_get_node_keys(cur, block, key);
2081 }
2082
2083 /*
2084  * Decide if we need to update the parent keys of a btree block.  For
2085  * a standard btree this is only necessary if we're updating the first
2086  * record/key.  For an overlapping btree, we must always update the
2087  * keys because the highest key can be in any of the records or keys
2088  * in the block.
2089  */
2090 static inline bool
2091 xfs_btree_needs_key_update(
2092         struct xfs_btree_cur    *cur,
2093         int                     ptr)
2094 {
2095         return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2096 }
2097
2098 /*
2099  * Update the low and high parent keys of the given level, progressing
2100  * towards the root.  If force_all is false, stop if the keys for a given
2101  * level do not need updating.
2102  */
2103 STATIC int
2104 __xfs_btree_updkeys(
2105         struct xfs_btree_cur    *cur,
2106         int                     level,
2107         struct xfs_btree_block  *block,
2108         struct xfs_buf          *bp0,
2109         bool                    force_all)
2110 {
2111         union xfs_btree_key     key;    /* keys from current level */
2112         union xfs_btree_key     *lkey;  /* keys from the next level up */
2113         union xfs_btree_key     *hkey;
2114         union xfs_btree_key     *nlkey; /* keys from the next level up */
2115         union xfs_btree_key     *nhkey;
2116         struct xfs_buf          *bp;
2117         int                     ptr;
2118
2119         ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2120
2121         /* Exit if there aren't any parent levels to update. */
2122         if (level + 1 >= cur->bc_nlevels)
2123                 return 0;
2124
2125         trace_xfs_btree_updkeys(cur, level, bp0);
2126
2127         lkey = &key;
2128         hkey = xfs_btree_high_key_from_key(cur, lkey);
2129         xfs_btree_get_keys(cur, block, lkey);
2130         for (level++; level < cur->bc_nlevels; level++) {
2131 #ifdef DEBUG
2132                 int             error;
2133 #endif
2134                 block = xfs_btree_get_block(cur, level, &bp);
2135                 trace_xfs_btree_updkeys(cur, level, bp);
2136 #ifdef DEBUG
2137                 error = xfs_btree_check_block(cur, block, level, bp);
2138                 if (error)
2139                         return error;
2140 #endif
2141                 ptr = cur->bc_ptrs[level];
2142                 nlkey = xfs_btree_key_addr(cur, ptr, block);
2143                 nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2144                 if (!force_all &&
2145                     !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2146                       cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2147                         break;
2148                 xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2149                 xfs_btree_log_keys(cur, bp, ptr, ptr);
2150                 if (level + 1 >= cur->bc_nlevels)
2151                         break;
2152                 xfs_btree_get_node_keys(cur, block, lkey);
2153         }
2154
2155         return 0;
2156 }
2157
2158 /* Update all the keys from some level in cursor back to the root. */
2159 STATIC int
2160 xfs_btree_updkeys_force(
2161         struct xfs_btree_cur    *cur,
2162         int                     level)
2163 {
2164         struct xfs_buf          *bp;
2165         struct xfs_btree_block  *block;
2166
2167         block = xfs_btree_get_block(cur, level, &bp);
2168         return __xfs_btree_updkeys(cur, level, block, bp, true);
2169 }
2170
2171 /*
2172  * Update the parent keys of the given level, progressing towards the root.
2173  */
2174 STATIC int
2175 xfs_btree_update_keys(
2176         struct xfs_btree_cur    *cur,
2177         int                     level)
2178 {
2179         struct xfs_btree_block  *block;
2180         struct xfs_buf          *bp;
2181         union xfs_btree_key     *kp;
2182         union xfs_btree_key     key;
2183         int                     ptr;
2184
2185         ASSERT(level >= 0);
2186
2187         block = xfs_btree_get_block(cur, level, &bp);
2188         if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2189                 return __xfs_btree_updkeys(cur, level, block, bp, false);
2190
2191         /*
2192          * Go up the tree from this level toward the root.
2193          * At each level, update the key value to the value input.
2194          * Stop when we reach a level where the cursor isn't pointing
2195          * at the first entry in the block.
2196          */
2197         xfs_btree_get_keys(cur, block, &key);
2198         for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2199 #ifdef DEBUG
2200                 int             error;
2201 #endif
2202                 block = xfs_btree_get_block(cur, level, &bp);
2203 #ifdef DEBUG
2204                 error = xfs_btree_check_block(cur, block, level, bp);
2205                 if (error)
2206                         return error;
2207 #endif
2208                 ptr = cur->bc_ptrs[level];
2209                 kp = xfs_btree_key_addr(cur, ptr, block);
2210                 xfs_btree_copy_keys(cur, kp, &key, 1);
2211                 xfs_btree_log_keys(cur, bp, ptr, ptr);
2212         }
2213
2214         return 0;
2215 }
2216
2217 /*
2218  * Update the record referred to by cur to the value in the
2219  * given record. This either works (return 0) or gets an
2220  * EFSCORRUPTED error.
2221  */
2222 int
2223 xfs_btree_update(
2224         struct xfs_btree_cur    *cur,
2225         union xfs_btree_rec     *rec)
2226 {
2227         struct xfs_btree_block  *block;
2228         struct xfs_buf          *bp;
2229         int                     error;
2230         int                     ptr;
2231         union xfs_btree_rec     *rp;
2232
2233         /* Pick up the current block. */
2234         block = xfs_btree_get_block(cur, 0, &bp);
2235
2236 #ifdef DEBUG
2237         error = xfs_btree_check_block(cur, block, 0, bp);
2238         if (error)
2239                 goto error0;
2240 #endif
2241         /* Get the address of the rec to be updated. */
2242         ptr = cur->bc_ptrs[0];
2243         rp = xfs_btree_rec_addr(cur, ptr, block);
2244
2245         /* Fill in the new contents and log them. */
2246         xfs_btree_copy_recs(cur, rp, rec, 1);
2247         xfs_btree_log_recs(cur, bp, ptr, ptr);
2248
2249         /*
2250          * If we are tracking the last record in the tree and
2251          * we are at the far right edge of the tree, update it.
2252          */
2253         if (xfs_btree_is_lastrec(cur, block, 0)) {
2254                 cur->bc_ops->update_lastrec(cur, block, rec,
2255                                             ptr, LASTREC_UPDATE);
2256         }
2257
2258         /* Pass new key value up to our parent. */
2259         if (xfs_btree_needs_key_update(cur, ptr)) {
2260                 error = xfs_btree_update_keys(cur, 0);
2261                 if (error)
2262                         goto error0;
2263         }
2264
2265         return 0;
2266
2267 error0:
2268         return error;
2269 }
2270
2271 /*
2272  * Move 1 record left from cur/level if possible.
2273  * Update cur to reflect the new path.
2274  */
2275 STATIC int                                      /* error */
2276 xfs_btree_lshift(
2277         struct xfs_btree_cur    *cur,
2278         int                     level,
2279         int                     *stat)          /* success/failure */
2280 {
2281         struct xfs_buf          *lbp;           /* left buffer pointer */
2282         struct xfs_btree_block  *left;          /* left btree block */
2283         int                     lrecs;          /* left record count */
2284         struct xfs_buf          *rbp;           /* right buffer pointer */
2285         struct xfs_btree_block  *right;         /* right btree block */
2286         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2287         int                     rrecs;          /* right record count */
2288         union xfs_btree_ptr     lptr;           /* left btree pointer */
2289         union xfs_btree_key     *rkp = NULL;    /* right btree key */
2290         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2291         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2292         int                     error;          /* error return value */
2293         int                     i;
2294
2295         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2296             level == cur->bc_nlevels - 1)
2297                 goto out0;
2298
2299         /* Set up variables for this block as "right". */
2300         right = xfs_btree_get_block(cur, level, &rbp);
2301
2302 #ifdef DEBUG
2303         error = xfs_btree_check_block(cur, right, level, rbp);
2304         if (error)
2305                 goto error0;
2306 #endif
2307
2308         /* If we've got no left sibling then we can't shift an entry left. */
2309         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2310         if (xfs_btree_ptr_is_null(cur, &lptr))
2311                 goto out0;
2312
2313         /*
2314          * If the cursor entry is the one that would be moved, don't
2315          * do it... it's too complicated.
2316          */
2317         if (cur->bc_ptrs[level] <= 1)
2318                 goto out0;
2319
2320         /* Set up the left neighbor as "left". */
2321         error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2322         if (error)
2323                 goto error0;
2324
2325         /* If it's full, it can't take another entry. */
2326         lrecs = xfs_btree_get_numrecs(left);
2327         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2328                 goto out0;
2329
2330         rrecs = xfs_btree_get_numrecs(right);
2331
2332         /*
2333          * We add one entry to the left side and remove one for the right side.
2334          * Account for it here, the changes will be updated on disk and logged
2335          * later.
2336          */
2337         lrecs++;
2338         rrecs--;
2339
2340         XFS_BTREE_STATS_INC(cur, lshift);
2341         XFS_BTREE_STATS_ADD(cur, moves, 1);
2342
2343         /*
2344          * If non-leaf, copy a key and a ptr to the left block.
2345          * Log the changes to the left block.
2346          */
2347         if (level > 0) {
2348                 /* It's a non-leaf.  Move keys and pointers. */
2349                 union xfs_btree_key     *lkp;   /* left btree key */
2350                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2351
2352                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2353                 rkp = xfs_btree_key_addr(cur, 1, right);
2354
2355                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2356                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2357 #ifdef DEBUG
2358                 error = xfs_btree_check_ptr(cur, rpp, 0, level);
2359                 if (error)
2360                         goto error0;
2361 #endif
2362                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2363                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2364
2365                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2366                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2367
2368                 ASSERT(cur->bc_ops->keys_inorder(cur,
2369                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2370         } else {
2371                 /* It's a leaf.  Move records.  */
2372                 union xfs_btree_rec     *lrp;   /* left record pointer */
2373
2374                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2375                 rrp = xfs_btree_rec_addr(cur, 1, right);
2376
2377                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2378                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2379
2380                 ASSERT(cur->bc_ops->recs_inorder(cur,
2381                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2382         }
2383
2384         xfs_btree_set_numrecs(left, lrecs);
2385         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2386
2387         xfs_btree_set_numrecs(right, rrecs);
2388         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2389
2390         /*
2391          * Slide the contents of right down one entry.
2392          */
2393         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2394         if (level > 0) {
2395                 /* It's a nonleaf. operate on keys and ptrs */
2396 #ifdef DEBUG
2397                 int                     i;              /* loop index */
2398
2399                 for (i = 0; i < rrecs; i++) {
2400                         error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2401                         if (error)
2402                                 goto error0;
2403                 }
2404 #endif
2405                 xfs_btree_shift_keys(cur,
2406                                 xfs_btree_key_addr(cur, 2, right),
2407                                 -1, rrecs);
2408                 xfs_btree_shift_ptrs(cur,
2409                                 xfs_btree_ptr_addr(cur, 2, right),
2410                                 -1, rrecs);
2411
2412                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2413                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2414         } else {
2415                 /* It's a leaf. operate on records */
2416                 xfs_btree_shift_recs(cur,
2417                         xfs_btree_rec_addr(cur, 2, right),
2418                         -1, rrecs);
2419                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2420         }
2421
2422         /*
2423          * Using a temporary cursor, update the parent key values of the
2424          * block on the left.
2425          */
2426         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2427                 error = xfs_btree_dup_cursor(cur, &tcur);
2428                 if (error)
2429                         goto error0;
2430                 i = xfs_btree_firstrec(tcur, level);
2431                 XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2432
2433                 error = xfs_btree_decrement(tcur, level, &i);
2434                 if (error)
2435                         goto error1;
2436
2437                 /* Update the parent high keys of the left block, if needed. */
2438                 error = xfs_btree_update_keys(tcur, level);
2439                 if (error)
2440                         goto error1;
2441
2442                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2443         }
2444
2445         /* Update the parent keys of the right block. */
2446         error = xfs_btree_update_keys(cur, level);
2447         if (error)
2448                 goto error0;
2449
2450         /* Slide the cursor value left one. */
2451         cur->bc_ptrs[level]--;
2452
2453         *stat = 1;
2454         return 0;
2455
2456 out0:
2457         *stat = 0;
2458         return 0;
2459
2460 error0:
2461         return error;
2462
2463 error1:
2464         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2465         return error;
2466 }
2467
2468 /*
2469  * Move 1 record right from cur/level if possible.
2470  * Update cur to reflect the new path.
2471  */
2472 STATIC int                                      /* error */
2473 xfs_btree_rshift(
2474         struct xfs_btree_cur    *cur,
2475         int                     level,
2476         int                     *stat)          /* success/failure */
2477 {
2478         struct xfs_buf          *lbp;           /* left buffer pointer */
2479         struct xfs_btree_block  *left;          /* left btree block */
2480         struct xfs_buf          *rbp;           /* right buffer pointer */
2481         struct xfs_btree_block  *right;         /* right btree block */
2482         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2483         union xfs_btree_ptr     rptr;           /* right block pointer */
2484         union xfs_btree_key     *rkp;           /* right btree key */
2485         int                     rrecs;          /* right record count */
2486         int                     lrecs;          /* left record count */
2487         int                     error;          /* error return value */
2488         int                     i;              /* loop counter */
2489
2490         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2491             (level == cur->bc_nlevels - 1))
2492                 goto out0;
2493
2494         /* Set up variables for this block as "left". */
2495         left = xfs_btree_get_block(cur, level, &lbp);
2496
2497 #ifdef DEBUG
2498         error = xfs_btree_check_block(cur, left, level, lbp);
2499         if (error)
2500                 goto error0;
2501 #endif
2502
2503         /* If we've got no right sibling then we can't shift an entry right. */
2504         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2505         if (xfs_btree_ptr_is_null(cur, &rptr))
2506                 goto out0;
2507
2508         /*
2509          * If the cursor entry is the one that would be moved, don't
2510          * do it... it's too complicated.
2511          */
2512         lrecs = xfs_btree_get_numrecs(left);
2513         if (cur->bc_ptrs[level] >= lrecs)
2514                 goto out0;
2515
2516         /* Set up the right neighbor as "right". */
2517         error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2518         if (error)
2519                 goto error0;
2520
2521         /* If it's full, it can't take another entry. */
2522         rrecs = xfs_btree_get_numrecs(right);
2523         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2524                 goto out0;
2525
2526         XFS_BTREE_STATS_INC(cur, rshift);
2527         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2528
2529         /*
2530          * Make a hole at the start of the right neighbor block, then
2531          * copy the last left block entry to the hole.
2532          */
2533         if (level > 0) {
2534                 /* It's a nonleaf. make a hole in the keys and ptrs */
2535                 union xfs_btree_key     *lkp;
2536                 union xfs_btree_ptr     *lpp;
2537                 union xfs_btree_ptr     *rpp;
2538
2539                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2540                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2541                 rkp = xfs_btree_key_addr(cur, 1, right);
2542                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2543
2544 #ifdef DEBUG
2545                 for (i = rrecs - 1; i >= 0; i--) {
2546                         error = xfs_btree_check_ptr(cur, rpp, i, level);
2547                         if (error)
2548                                 goto error0;
2549                 }
2550 #endif
2551
2552                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2553                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2554
2555 #ifdef DEBUG
2556                 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2557                 if (error)
2558                         goto error0;
2559 #endif
2560
2561                 /* Now put the new data in, and log it. */
2562                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2563                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2564
2565                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2566                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2567
2568                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2569                         xfs_btree_key_addr(cur, 2, right)));
2570         } else {
2571                 /* It's a leaf. make a hole in the records */
2572                 union xfs_btree_rec     *lrp;
2573                 union xfs_btree_rec     *rrp;
2574
2575                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2576                 rrp = xfs_btree_rec_addr(cur, 1, right);
2577
2578                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2579
2580                 /* Now put the new data in, and log it. */
2581                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2582                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2583         }
2584
2585         /*
2586          * Decrement and log left's numrecs, bump and log right's numrecs.
2587          */
2588         xfs_btree_set_numrecs(left, --lrecs);
2589         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2590
2591         xfs_btree_set_numrecs(right, ++rrecs);
2592         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2593
2594         /*
2595          * Using a temporary cursor, update the parent key values of the
2596          * block on the right.
2597          */
2598         error = xfs_btree_dup_cursor(cur, &tcur);
2599         if (error)
2600                 goto error0;
2601         i = xfs_btree_lastrec(tcur, level);
2602         XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2603
2604         error = xfs_btree_increment(tcur, level, &i);
2605         if (error)
2606                 goto error1;
2607
2608         /* Update the parent high keys of the left block, if needed. */
2609         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2610                 error = xfs_btree_update_keys(cur, level);
2611                 if (error)
2612                         goto error1;
2613         }
2614
2615         /* Update the parent keys of the right block. */
2616         error = xfs_btree_update_keys(tcur, level);
2617         if (error)
2618                 goto error1;
2619
2620         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2621
2622         *stat = 1;
2623         return 0;
2624
2625 out0:
2626         *stat = 0;
2627         return 0;
2628
2629 error0:
2630         return error;
2631
2632 error1:
2633         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2634         return error;
2635 }
2636
2637 /*
2638  * Split cur/level block in half.
2639  * Return new block number and the key to its first
2640  * record (to be inserted into parent).
2641  */
2642 STATIC int                                      /* error */
2643 __xfs_btree_split(
2644         struct xfs_btree_cur    *cur,
2645         int                     level,
2646         union xfs_btree_ptr     *ptrp,
2647         union xfs_btree_key     *key,
2648         struct xfs_btree_cur    **curp,
2649         int                     *stat)          /* success/failure */
2650 {
2651         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2652         struct xfs_buf          *lbp;           /* left buffer pointer */
2653         struct xfs_btree_block  *left;          /* left btree block */
2654         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2655         struct xfs_buf          *rbp;           /* right buffer pointer */
2656         struct xfs_btree_block  *right;         /* right btree block */
2657         union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2658         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2659         struct xfs_btree_block  *rrblock;       /* right-right btree block */
2660         int                     lrecs;
2661         int                     rrecs;
2662         int                     src_index;
2663         int                     error;          /* error return value */
2664 #ifdef DEBUG
2665         int                     i;
2666 #endif
2667
2668         XFS_BTREE_STATS_INC(cur, split);
2669
2670         /* Set up left block (current one). */
2671         left = xfs_btree_get_block(cur, level, &lbp);
2672
2673 #ifdef DEBUG
2674         error = xfs_btree_check_block(cur, left, level, lbp);
2675         if (error)
2676                 goto error0;
2677 #endif
2678
2679         xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2680
2681         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2682         error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2683         if (error)
2684                 goto error0;
2685         if (*stat == 0)
2686                 goto out0;
2687         XFS_BTREE_STATS_INC(cur, alloc);
2688
2689         /* Set up the new block as "right". */
2690         error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2691         if (error)
2692                 goto error0;
2693
2694         /* Fill in the btree header for the new right block. */
2695         xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2696
2697         /*
2698          * Split the entries between the old and the new block evenly.
2699          * Make sure that if there's an odd number of entries now, that
2700          * each new block will have the same number of entries.
2701          */
2702         lrecs = xfs_btree_get_numrecs(left);
2703         rrecs = lrecs / 2;
2704         if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2705                 rrecs++;
2706         src_index = (lrecs - rrecs + 1);
2707
2708         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2709
2710         /* Adjust numrecs for the later get_*_keys() calls. */
2711         lrecs -= rrecs;
2712         xfs_btree_set_numrecs(left, lrecs);
2713         xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2714
2715         /*
2716          * Copy btree block entries from the left block over to the
2717          * new block, the right. Update the right block and log the
2718          * changes.
2719          */
2720         if (level > 0) {
2721                 /* It's a non-leaf.  Move keys and pointers. */
2722                 union xfs_btree_key     *lkp;   /* left btree key */
2723                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2724                 union xfs_btree_key     *rkp;   /* right btree key */
2725                 union xfs_btree_ptr     *rpp;   /* right address pointer */
2726
2727                 lkp = xfs_btree_key_addr(cur, src_index, left);
2728                 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2729                 rkp = xfs_btree_key_addr(cur, 1, right);
2730                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2731
2732 #ifdef DEBUG
2733                 for (i = src_index; i < rrecs; i++) {
2734                         error = xfs_btree_check_ptr(cur, lpp, i, level);
2735                         if (error)
2736                                 goto error0;
2737                 }
2738 #endif
2739
2740                 /* Copy the keys & pointers to the new block. */
2741                 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2742                 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2743
2744                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2745                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2746
2747                 /* Stash the keys of the new block for later insertion. */
2748                 xfs_btree_get_node_keys(cur, right, key);
2749         } else {
2750                 /* It's a leaf.  Move records.  */
2751                 union xfs_btree_rec     *lrp;   /* left record pointer */
2752                 union xfs_btree_rec     *rrp;   /* right record pointer */
2753
2754                 lrp = xfs_btree_rec_addr(cur, src_index, left);
2755                 rrp = xfs_btree_rec_addr(cur, 1, right);
2756
2757                 /* Copy records to the new block. */
2758                 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2759                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2760
2761                 /* Stash the keys of the new block for later insertion. */
2762                 xfs_btree_get_leaf_keys(cur, right, key);
2763         }
2764
2765         /*
2766          * Find the left block number by looking in the buffer.
2767          * Adjust sibling pointers.
2768          */
2769         xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2770         xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2771         xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2772         xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2773
2774         xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2775         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2776
2777         /*
2778          * If there's a block to the new block's right, make that block
2779          * point back to right instead of to left.
2780          */
2781         if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2782                 error = xfs_btree_read_buf_block(cur, &rrptr,
2783                                                         0, &rrblock, &rrbp);
2784                 if (error)
2785                         goto error0;
2786                 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2787                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2788         }
2789
2790         /* Update the parent high keys of the left block, if needed. */
2791         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2792                 error = xfs_btree_update_keys(cur, level);
2793                 if (error)
2794                         goto error0;
2795         }
2796
2797         /*
2798          * If the cursor is really in the right block, move it there.
2799          * If it's just pointing past the last entry in left, then we'll
2800          * insert there, so don't change anything in that case.
2801          */
2802         if (cur->bc_ptrs[level] > lrecs + 1) {
2803                 xfs_btree_setbuf(cur, level, rbp);
2804                 cur->bc_ptrs[level] -= lrecs;
2805         }
2806         /*
2807          * If there are more levels, we'll need another cursor which refers
2808          * the right block, no matter where this cursor was.
2809          */
2810         if (level + 1 < cur->bc_nlevels) {
2811                 error = xfs_btree_dup_cursor(cur, curp);
2812                 if (error)
2813                         goto error0;
2814                 (*curp)->bc_ptrs[level + 1]++;
2815         }
2816         *ptrp = rptr;
2817         *stat = 1;
2818         return 0;
2819 out0:
2820         *stat = 0;
2821         return 0;
2822
2823 error0:
2824         return error;
2825 }
2826
2827 struct xfs_btree_split_args {
2828         struct xfs_btree_cur    *cur;
2829         int                     level;
2830         union xfs_btree_ptr     *ptrp;
2831         union xfs_btree_key     *key;
2832         struct xfs_btree_cur    **curp;
2833         int                     *stat;          /* success/failure */
2834         int                     result;
2835         bool                    kswapd; /* allocation in kswapd context */
2836         struct completion       *done;
2837         struct work_struct      work;
2838 };
2839
2840 /*
2841  * Stack switching interfaces for allocation
2842  */
2843 static void
2844 xfs_btree_split_worker(
2845         struct work_struct      *work)
2846 {
2847         struct xfs_btree_split_args     *args = container_of(work,
2848                                                 struct xfs_btree_split_args, work);
2849         unsigned long           pflags;
2850         unsigned long           new_pflags = PF_MEMALLOC_NOFS;
2851
2852         /*
2853          * we are in a transaction context here, but may also be doing work
2854          * in kswapd context, and hence we may need to inherit that state
2855          * temporarily to ensure that we don't block waiting for memory reclaim
2856          * in any way.
2857          */
2858         if (args->kswapd)
2859                 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2860
2861         current_set_flags_nested(&pflags, new_pflags);
2862
2863         args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2864                                          args->key, args->curp, args->stat);
2865         complete(args->done);
2866
2867         current_restore_flags_nested(&pflags, new_pflags);
2868 }
2869
2870 /*
2871  * BMBT split requests often come in with little stack to work on. Push
2872  * them off to a worker thread so there is lots of stack to use. For the other
2873  * btree types, just call directly to avoid the context switch overhead here.
2874  */
2875 STATIC int                                      /* error */
2876 xfs_btree_split(
2877         struct xfs_btree_cur    *cur,
2878         int                     level,
2879         union xfs_btree_ptr     *ptrp,
2880         union xfs_btree_key     *key,
2881         struct xfs_btree_cur    **curp,
2882         int                     *stat)          /* success/failure */
2883 {
2884         struct xfs_btree_split_args     args;
2885         DECLARE_COMPLETION_ONSTACK(done);
2886
2887         if (cur->bc_btnum != XFS_BTNUM_BMAP)
2888                 return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2889
2890         args.cur = cur;
2891         args.level = level;
2892         args.ptrp = ptrp;
2893         args.key = key;
2894         args.curp = curp;
2895         args.stat = stat;
2896         args.done = &done;
2897         args.kswapd = current_is_kswapd();
2898         INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2899         queue_work(xfs_alloc_wq, &args.work);
2900         wait_for_completion(&done);
2901         destroy_work_on_stack(&args.work);
2902         return args.result;
2903 }
2904
2905
2906 /*
2907  * Copy the old inode root contents into a real block and make the
2908  * broot point to it.
2909  */
2910 int                                             /* error */
2911 xfs_btree_new_iroot(
2912         struct xfs_btree_cur    *cur,           /* btree cursor */
2913         int                     *logflags,      /* logging flags for inode */
2914         int                     *stat)          /* return status - 0 fail */
2915 {
2916         struct xfs_buf          *cbp;           /* buffer for cblock */
2917         struct xfs_btree_block  *block;         /* btree block */
2918         struct xfs_btree_block  *cblock;        /* child btree block */
2919         union xfs_btree_key     *ckp;           /* child key pointer */
2920         union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2921         union xfs_btree_key     *kp;            /* pointer to btree key */
2922         union xfs_btree_ptr     *pp;            /* pointer to block addr */
2923         union xfs_btree_ptr     nptr;           /* new block addr */
2924         int                     level;          /* btree level */
2925         int                     error;          /* error return code */
2926 #ifdef DEBUG
2927         int                     i;              /* loop counter */
2928 #endif
2929
2930         XFS_BTREE_STATS_INC(cur, newroot);
2931
2932         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2933
2934         level = cur->bc_nlevels - 1;
2935
2936         block = xfs_btree_get_iroot(cur);
2937         pp = xfs_btree_ptr_addr(cur, 1, block);
2938
2939         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2940         error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2941         if (error)
2942                 goto error0;
2943         if (*stat == 0)
2944                 return 0;
2945
2946         XFS_BTREE_STATS_INC(cur, alloc);
2947
2948         /* Copy the root into a real block. */
2949         error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2950         if (error)
2951                 goto error0;
2952
2953         /*
2954          * we can't just memcpy() the root in for CRC enabled btree blocks.
2955          * In that case have to also ensure the blkno remains correct
2956          */
2957         memcpy(cblock, block, xfs_btree_block_len(cur));
2958         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2959                 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2960                         cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2961                 else
2962                         cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2963         }
2964
2965         be16_add_cpu(&block->bb_level, 1);
2966         xfs_btree_set_numrecs(block, 1);
2967         cur->bc_nlevels++;
2968         cur->bc_ptrs[level + 1] = 1;
2969
2970         kp = xfs_btree_key_addr(cur, 1, block);
2971         ckp = xfs_btree_key_addr(cur, 1, cblock);
2972         xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2973
2974         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2975 #ifdef DEBUG
2976         for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2977                 error = xfs_btree_check_ptr(cur, pp, i, level);
2978                 if (error)
2979                         goto error0;
2980         }
2981 #endif
2982         xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2983
2984 #ifdef DEBUG
2985         error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2986         if (error)
2987                 goto error0;
2988 #endif
2989         xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2990
2991         xfs_iroot_realloc(cur->bc_private.b.ip,
2992                           1 - xfs_btree_get_numrecs(cblock),
2993                           cur->bc_private.b.whichfork);
2994
2995         xfs_btree_setbuf(cur, level, cbp);
2996
2997         /*
2998          * Do all this logging at the end so that
2999          * the root is at the right level.
3000          */
3001         xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3002         xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3003         xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3004
3005         *logflags |=
3006                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
3007         *stat = 1;
3008         return 0;
3009 error0:
3010         return error;
3011 }
3012
3013 /*
3014  * Allocate a new root block, fill it in.
3015  */
3016 STATIC int                              /* error */
3017 xfs_btree_new_root(
3018         struct xfs_btree_cur    *cur,   /* btree cursor */
3019         int                     *stat)  /* success/failure */
3020 {
3021         struct xfs_btree_block  *block; /* one half of the old root block */
3022         struct xfs_buf          *bp;    /* buffer containing block */
3023         int                     error;  /* error return value */
3024         struct xfs_buf          *lbp;   /* left buffer pointer */
3025         struct xfs_btree_block  *left;  /* left btree block */
3026         struct xfs_buf          *nbp;   /* new (root) buffer */
3027         struct xfs_btree_block  *new;   /* new (root) btree block */
3028         int                     nptr;   /* new value for key index, 1 or 2 */
3029         struct xfs_buf          *rbp;   /* right buffer pointer */
3030         struct xfs_btree_block  *right; /* right btree block */
3031         union xfs_btree_ptr     rptr;
3032         union xfs_btree_ptr     lptr;
3033
3034         XFS_BTREE_STATS_INC(cur, newroot);
3035
3036         /* initialise our start point from the cursor */
3037         cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3038
3039         /* Allocate the new block. If we can't do it, we're toast. Give up. */
3040         error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3041         if (error)
3042                 goto error0;
3043         if (*stat == 0)
3044                 goto out0;
3045         XFS_BTREE_STATS_INC(cur, alloc);
3046
3047         /* Set up the new block. */
3048         error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
3049         if (error)
3050                 goto error0;
3051
3052         /* Set the root in the holding structure  increasing the level by 1. */
3053         cur->bc_ops->set_root(cur, &lptr, 1);
3054
3055         /*
3056          * At the previous root level there are now two blocks: the old root,
3057          * and the new block generated when it was split.  We don't know which
3058          * one the cursor is pointing at, so we set up variables "left" and
3059          * "right" for each case.
3060          */
3061         block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3062
3063 #ifdef DEBUG
3064         error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3065         if (error)
3066                 goto error0;
3067 #endif
3068
3069         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3070         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3071                 /* Our block is left, pick up the right block. */
3072                 lbp = bp;
3073                 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3074                 left = block;
3075                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3076                 if (error)
3077                         goto error0;
3078                 bp = rbp;
3079                 nptr = 1;
3080         } else {
3081                 /* Our block is right, pick up the left block. */
3082                 rbp = bp;
3083                 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3084                 right = block;
3085                 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3086                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3087                 if (error)
3088                         goto error0;
3089                 bp = lbp;
3090                 nptr = 2;
3091         }
3092
3093         /* Fill in the new block's btree header and log it. */
3094         xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3095         xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3096         ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3097                         !xfs_btree_ptr_is_null(cur, &rptr));
3098
3099         /* Fill in the key data in the new root. */
3100         if (xfs_btree_get_level(left) > 0) {
3101                 /*
3102                  * Get the keys for the left block's keys and put them directly
3103                  * in the parent block.  Do the same for the right block.
3104                  */
3105                 xfs_btree_get_node_keys(cur, left,
3106                                 xfs_btree_key_addr(cur, 1, new));
3107                 xfs_btree_get_node_keys(cur, right,
3108                                 xfs_btree_key_addr(cur, 2, new));
3109         } else {
3110                 /*
3111                  * Get the keys for the left block's records and put them
3112                  * directly in the parent block.  Do the same for the right
3113                  * block.
3114                  */
3115                 xfs_btree_get_leaf_keys(cur, left,
3116                         xfs_btree_key_addr(cur, 1, new));
3117                 xfs_btree_get_leaf_keys(cur, right,
3118                         xfs_btree_key_addr(cur, 2, new));
3119         }
3120         xfs_btree_log_keys(cur, nbp, 1, 2);
3121
3122         /* Fill in the pointer data in the new root. */
3123         xfs_btree_copy_ptrs(cur,
3124                 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3125         xfs_btree_copy_ptrs(cur,
3126                 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3127         xfs_btree_log_ptrs(cur, nbp, 1, 2);
3128
3129         /* Fix up the cursor. */
3130         xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3131         cur->bc_ptrs[cur->bc_nlevels] = nptr;
3132         cur->bc_nlevels++;
3133         *stat = 1;
3134         return 0;
3135 error0:
3136         return error;
3137 out0:
3138         *stat = 0;
3139         return 0;
3140 }
3141
3142 STATIC int
3143 xfs_btree_make_block_unfull(
3144         struct xfs_btree_cur    *cur,   /* btree cursor */
3145         int                     level,  /* btree level */
3146         int                     numrecs,/* # of recs in block */
3147         int                     *oindex,/* old tree index */
3148         int                     *index, /* new tree index */
3149         union xfs_btree_ptr     *nptr,  /* new btree ptr */
3150         struct xfs_btree_cur    **ncur, /* new btree cursor */
3151         union xfs_btree_key     *key,   /* key of new block */
3152         int                     *stat)
3153 {
3154         int                     error = 0;
3155
3156         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3157             level == cur->bc_nlevels - 1) {
3158                 struct xfs_inode *ip = cur->bc_private.b.ip;
3159
3160                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3161                         /* A root block that can be made bigger. */
3162                         xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
3163                         *stat = 1;
3164                 } else {
3165                         /* A root block that needs replacing */
3166                         int     logflags = 0;
3167
3168                         error = xfs_btree_new_iroot(cur, &logflags, stat);
3169                         if (error || *stat == 0)
3170                                 return error;
3171
3172                         xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3173                 }
3174
3175                 return 0;
3176         }
3177
3178         /* First, try shifting an entry to the right neighbor. */
3179         error = xfs_btree_rshift(cur, level, stat);
3180         if (error || *stat)
3181                 return error;
3182
3183         /* Next, try shifting an entry to the left neighbor. */
3184         error = xfs_btree_lshift(cur, level, stat);
3185         if (error)
3186                 return error;
3187
3188         if (*stat) {
3189                 *oindex = *index = cur->bc_ptrs[level];
3190                 return 0;
3191         }
3192
3193         /*
3194          * Next, try splitting the current block in half.
3195          *
3196          * If this works we have to re-set our variables because we
3197          * could be in a different block now.
3198          */
3199         error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3200         if (error || *stat == 0)
3201                 return error;
3202
3203
3204         *index = cur->bc_ptrs[level];
3205         return 0;
3206 }
3207
3208 /*
3209  * Insert one record/level.  Return information to the caller
3210  * allowing the next level up to proceed if necessary.
3211  */
3212 STATIC int
3213 xfs_btree_insrec(
3214         struct xfs_btree_cur    *cur,   /* btree cursor */
3215         int                     level,  /* level to insert record at */
3216         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3217         union xfs_btree_rec     *rec,   /* record to insert */
3218         union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3219         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3220         int                     *stat)  /* success/failure */
3221 {
3222         struct xfs_btree_block  *block; /* btree block */
3223         struct xfs_buf          *bp;    /* buffer for block */
3224         union xfs_btree_ptr     nptr;   /* new block ptr */
3225         struct xfs_btree_cur    *ncur;  /* new btree cursor */
3226         union xfs_btree_key     nkey;   /* new block key */
3227         union xfs_btree_key     *lkey;
3228         int                     optr;   /* old key/record index */
3229         int                     ptr;    /* key/record index */
3230         int                     numrecs;/* number of records */
3231         int                     error;  /* error return value */
3232 #ifdef DEBUG
3233         int                     i;
3234 #endif
3235         xfs_daddr_t             old_bn;
3236
3237         ncur = NULL;
3238         lkey = &nkey;
3239
3240         /*
3241          * If we have an external root pointer, and we've made it to the
3242          * root level, allocate a new root block and we're done.
3243          */
3244         if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3245             (level >= cur->bc_nlevels)) {
3246                 error = xfs_btree_new_root(cur, stat);
3247                 xfs_btree_set_ptr_null(cur, ptrp);
3248
3249                 return error;
3250         }
3251
3252         /* If we're off the left edge, return failure. */
3253         ptr = cur->bc_ptrs[level];
3254         if (ptr == 0) {
3255                 *stat = 0;
3256                 return 0;
3257         }
3258
3259         optr = ptr;
3260
3261         XFS_BTREE_STATS_INC(cur, insrec);
3262
3263         /* Get pointers to the btree buffer and block. */
3264         block = xfs_btree_get_block(cur, level, &bp);
3265         old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3266         numrecs = xfs_btree_get_numrecs(block);
3267
3268 #ifdef DEBUG
3269         error = xfs_btree_check_block(cur, block, level, bp);
3270         if (error)
3271                 goto error0;
3272
3273         /* Check that the new entry is being inserted in the right place. */
3274         if (ptr <= numrecs) {
3275                 if (level == 0) {
3276                         ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3277                                 xfs_btree_rec_addr(cur, ptr, block)));
3278                 } else {
3279                         ASSERT(cur->bc_ops->keys_inorder(cur, key,
3280                                 xfs_btree_key_addr(cur, ptr, block)));
3281                 }
3282         }
3283 #endif
3284
3285         /*
3286          * If the block is full, we can't insert the new entry until we
3287          * make the block un-full.
3288          */
3289         xfs_btree_set_ptr_null(cur, &nptr);
3290         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3291                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
3292                                         &optr, &ptr, &nptr, &ncur, lkey, stat);
3293                 if (error || *stat == 0)
3294                         goto error0;
3295         }
3296
3297         /*
3298          * The current block may have changed if the block was
3299          * previously full and we have just made space in it.
3300          */
3301         block = xfs_btree_get_block(cur, level, &bp);
3302         numrecs = xfs_btree_get_numrecs(block);
3303
3304 #ifdef DEBUG
3305         error = xfs_btree_check_block(cur, block, level, bp);
3306         if (error)
3307                 return error;
3308 #endif
3309
3310         /*
3311          * At this point we know there's room for our new entry in the block
3312          * we're pointing at.
3313          */
3314         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3315
3316         if (level > 0) {
3317                 /* It's a nonleaf. make a hole in the keys and ptrs */
3318                 union xfs_btree_key     *kp;
3319                 union xfs_btree_ptr     *pp;
3320
3321                 kp = xfs_btree_key_addr(cur, ptr, block);
3322                 pp = xfs_btree_ptr_addr(cur, ptr, block);
3323
3324 #ifdef DEBUG
3325                 for (i = numrecs - ptr; i >= 0; i--) {
3326                         error = xfs_btree_check_ptr(cur, pp, i, level);
3327                         if (error)
3328                                 return error;
3329                 }
3330 #endif
3331
3332                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3333                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3334
3335 #ifdef DEBUG
3336                 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3337                 if (error)
3338                         goto error0;
3339 #endif
3340
3341                 /* Now put the new data in, bump numrecs and log it. */
3342                 xfs_btree_copy_keys(cur, kp, key, 1);
3343                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3344                 numrecs++;
3345                 xfs_btree_set_numrecs(block, numrecs);
3346                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3347                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
3348 #ifdef DEBUG
3349                 if (ptr < numrecs) {
3350                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3351                                 xfs_btree_key_addr(cur, ptr + 1, block)));
3352                 }
3353 #endif
3354         } else {
3355                 /* It's a leaf. make a hole in the records */
3356                 union xfs_btree_rec             *rp;
3357
3358                 rp = xfs_btree_rec_addr(cur, ptr, block);
3359
3360                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3361
3362                 /* Now put the new data in, bump numrecs and log it. */
3363                 xfs_btree_copy_recs(cur, rp, rec, 1);
3364                 xfs_btree_set_numrecs(block, ++numrecs);
3365                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
3366 #ifdef DEBUG
3367                 if (ptr < numrecs) {
3368                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3369                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
3370                 }
3371 #endif
3372         }
3373
3374         /* Log the new number of records in the btree header. */
3375         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3376
3377         /*
3378          * If we just inserted into a new tree block, we have to
3379          * recalculate nkey here because nkey is out of date.
3380          *
3381          * Otherwise we're just updating an existing block (having shoved
3382          * some records into the new tree block), so use the regular key
3383          * update mechanism.
3384          */
3385         if (bp && bp->b_bn != old_bn) {
3386                 xfs_btree_get_keys(cur, block, lkey);
3387         } else if (xfs_btree_needs_key_update(cur, optr)) {
3388                 error = xfs_btree_update_keys(cur, level);
3389                 if (error)
3390                         goto error0;
3391         }
3392
3393         /*
3394          * If we are tracking the last record in the tree and
3395          * we are at the far right edge of the tree, update it.
3396          */
3397         if (xfs_btree_is_lastrec(cur, block, level)) {
3398                 cur->bc_ops->update_lastrec(cur, block, rec,
3399                                             ptr, LASTREC_INSREC);
3400         }
3401
3402         /*
3403          * Return the new block number, if any.
3404          * If there is one, give back a record value and a cursor too.
3405          */
3406         *ptrp = nptr;
3407         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3408                 xfs_btree_copy_keys(cur, key, lkey, 1);
3409                 *curp = ncur;
3410         }
3411
3412         *stat = 1;
3413         return 0;
3414
3415 error0:
3416         return error;
3417 }
3418
3419 /*
3420  * Insert the record at the point referenced by cur.
3421  *
3422  * A multi-level split of the tree on insert will invalidate the original
3423  * cursor.  All callers of this function should assume that the cursor is
3424  * no longer valid and revalidate it.
3425  */
3426 int
3427 xfs_btree_insert(
3428         struct xfs_btree_cur    *cur,
3429         int                     *stat)
3430 {
3431         int                     error;  /* error return value */
3432         int                     i;      /* result value, 0 for failure */
3433         int                     level;  /* current level number in btree */
3434         union xfs_btree_ptr     nptr;   /* new block number (split result) */
3435         struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3436         struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3437         union xfs_btree_key     bkey;   /* key of block to insert */
3438         union xfs_btree_key     *key;
3439         union xfs_btree_rec     rec;    /* record to insert */
3440
3441         level = 0;
3442         ncur = NULL;
3443         pcur = cur;
3444         key = &bkey;
3445
3446         xfs_btree_set_ptr_null(cur, &nptr);
3447
3448         /* Make a key out of the record data to be inserted, and save it. */
3449         cur->bc_ops->init_rec_from_cur(cur, &rec);
3450         cur->bc_ops->init_key_from_rec(key, &rec);
3451
3452         /*
3453          * Loop going up the tree, starting at the leaf level.
3454          * Stop when we don't get a split block, that must mean that
3455          * the insert is finished with this level.
3456          */
3457         do {
3458                 /*
3459                  * Insert nrec/nptr into this level of the tree.
3460                  * Note if we fail, nptr will be null.
3461                  */
3462                 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3463                                 &ncur, &i);
3464                 if (error) {
3465                         if (pcur != cur)
3466                                 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3467                         goto error0;
3468                 }
3469
3470                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3471                 level++;
3472
3473                 /*
3474                  * See if the cursor we just used is trash.
3475                  * Can't trash the caller's cursor, but otherwise we should
3476                  * if ncur is a new cursor or we're about to be done.
3477                  */
3478                 if (pcur != cur &&
3479                     (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3480                         /* Save the state from the cursor before we trash it */
3481                         if (cur->bc_ops->update_cursor)
3482                                 cur->bc_ops->update_cursor(pcur, cur);
3483                         cur->bc_nlevels = pcur->bc_nlevels;
3484                         xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3485                 }
3486                 /* If we got a new cursor, switch to it. */
3487                 if (ncur) {
3488                         pcur = ncur;
3489                         ncur = NULL;
3490                 }
3491         } while (!xfs_btree_ptr_is_null(cur, &nptr));
3492
3493         *stat = i;
3494         return 0;
3495 error0:
3496         return error;
3497 }
3498
3499 /*
3500  * Try to merge a non-leaf block back into the inode root.
3501  *
3502  * Note: the killroot names comes from the fact that we're effectively
3503  * killing the old root block.  But because we can't just delete the
3504  * inode we have to copy the single block it was pointing to into the
3505  * inode.
3506  */
3507 STATIC int
3508 xfs_btree_kill_iroot(
3509         struct xfs_btree_cur    *cur)
3510 {
3511         int                     whichfork = cur->bc_private.b.whichfork;
3512         struct xfs_inode        *ip = cur->bc_private.b.ip;
3513         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3514         struct xfs_btree_block  *block;
3515         struct xfs_btree_block  *cblock;
3516         union xfs_btree_key     *kp;
3517         union xfs_btree_key     *ckp;
3518         union xfs_btree_ptr     *pp;
3519         union xfs_btree_ptr     *cpp;
3520         struct xfs_buf          *cbp;
3521         int                     level;
3522         int                     index;
3523         int                     numrecs;
3524         int                     error;
3525 #ifdef DEBUG
3526         union xfs_btree_ptr     ptr;
3527         int                     i;
3528 #endif
3529
3530         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3531         ASSERT(cur->bc_nlevels > 1);
3532
3533         /*
3534          * Don't deal with the root block needs to be a leaf case.
3535          * We're just going to turn the thing back into extents anyway.
3536          */
3537         level = cur->bc_nlevels - 1;
3538         if (level == 1)
3539                 goto out0;
3540
3541         /*
3542          * Give up if the root has multiple children.
3543          */
3544         block = xfs_btree_get_iroot(cur);
3545         if (xfs_btree_get_numrecs(block) != 1)
3546                 goto out0;
3547
3548         cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3549         numrecs = xfs_btree_get_numrecs(cblock);
3550
3551         /*
3552          * Only do this if the next level will fit.
3553          * Then the data must be copied up to the inode,
3554          * instead of freeing the root you free the next level.
3555          */
3556         if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3557                 goto out0;
3558
3559         XFS_BTREE_STATS_INC(cur, killroot);
3560
3561 #ifdef DEBUG
3562         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3563         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3564         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3565         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3566 #endif
3567
3568         index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3569         if (index) {
3570                 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3571                                   cur->bc_private.b.whichfork);
3572                 block = ifp->if_broot;
3573         }
3574
3575         be16_add_cpu(&block->bb_numrecs, index);
3576         ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3577
3578         kp = xfs_btree_key_addr(cur, 1, block);
3579         ckp = xfs_btree_key_addr(cur, 1, cblock);
3580         xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3581
3582         pp = xfs_btree_ptr_addr(cur, 1, block);
3583         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3584 #ifdef DEBUG
3585         for (i = 0; i < numrecs; i++) {
3586                 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3587                 if (error)
3588                         return error;
3589         }
3590 #endif
3591         xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3592
3593         error = xfs_btree_free_block(cur, cbp);
3594         if (error)
3595                 return error;
3596
3597         cur->bc_bufs[level - 1] = NULL;
3598         be16_add_cpu(&block->bb_level, -1);
3599         xfs_trans_log_inode(cur->bc_tp, ip,
3600                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3601         cur->bc_nlevels--;
3602 out0:
3603         return 0;
3604 }
3605
3606 /*
3607  * Kill the current root node, and replace it with it's only child node.
3608  */
3609 STATIC int
3610 xfs_btree_kill_root(
3611         struct xfs_btree_cur    *cur,
3612         struct xfs_buf          *bp,
3613         int                     level,
3614         union xfs_btree_ptr     *newroot)
3615 {
3616         int                     error;
3617
3618         XFS_BTREE_STATS_INC(cur, killroot);
3619
3620         /*
3621          * Update the root pointer, decreasing the level by 1 and then
3622          * free the old root.
3623          */
3624         cur->bc_ops->set_root(cur, newroot, -1);
3625
3626         error = xfs_btree_free_block(cur, bp);
3627         if (error)
3628                 return error;
3629
3630         cur->bc_bufs[level] = NULL;
3631         cur->bc_ra[level] = 0;
3632         cur->bc_nlevels--;
3633
3634         return 0;
3635 }
3636
3637 STATIC int
3638 xfs_btree_dec_cursor(
3639         struct xfs_btree_cur    *cur,
3640         int                     level,
3641         int                     *stat)
3642 {
3643         int                     error;
3644         int                     i;
3645
3646         if (level > 0) {
3647                 error = xfs_btree_decrement(cur, level, &i);
3648                 if (error)
3649                         return error;
3650         }
3651
3652         *stat = 1;
3653         return 0;
3654 }
3655
3656 /*
3657  * Single level of the btree record deletion routine.
3658  * Delete record pointed to by cur/level.
3659  * Remove the record from its block then rebalance the tree.
3660  * Return 0 for error, 1 for done, 2 to go on to the next level.
3661  */
3662 STATIC int                                      /* error */
3663 xfs_btree_delrec(
3664         struct xfs_btree_cur    *cur,           /* btree cursor */
3665         int                     level,          /* level removing record from */
3666         int                     *stat)          /* fail/done/go-on */
3667 {
3668         struct xfs_btree_block  *block;         /* btree block */
3669         union xfs_btree_ptr     cptr;           /* current block ptr */
3670         struct xfs_buf          *bp;            /* buffer for block */
3671         int                     error;          /* error return value */
3672         int                     i;              /* loop counter */
3673         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3674         struct xfs_buf          *lbp;           /* left buffer pointer */
3675         struct xfs_btree_block  *left;          /* left btree block */
3676         int                     lrecs = 0;      /* left record count */
3677         int                     ptr;            /* key/record index */
3678         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3679         struct xfs_buf          *rbp;           /* right buffer pointer */
3680         struct xfs_btree_block  *right;         /* right btree block */
3681         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3682         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3683         int                     rrecs = 0;      /* right record count */
3684         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3685         int                     numrecs;        /* temporary numrec count */
3686
3687         tcur = NULL;
3688
3689         /* Get the index of the entry being deleted, check for nothing there. */
3690         ptr = cur->bc_ptrs[level];
3691         if (ptr == 0) {
3692                 *stat = 0;
3693                 return 0;
3694         }
3695
3696         /* Get the buffer & block containing the record or key/ptr. */
3697         block = xfs_btree_get_block(cur, level, &bp);
3698         numrecs = xfs_btree_get_numrecs(block);
3699
3700 #ifdef DEBUG
3701         error = xfs_btree_check_block(cur, block, level, bp);
3702         if (error)
3703                 goto error0;
3704 #endif
3705
3706         /* Fail if we're off the end of the block. */
3707         if (ptr > numrecs) {
3708                 *stat = 0;
3709                 return 0;
3710         }
3711
3712         XFS_BTREE_STATS_INC(cur, delrec);
3713         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3714
3715         /* Excise the entries being deleted. */
3716         if (level > 0) {
3717                 /* It's a nonleaf. operate on keys and ptrs */
3718                 union xfs_btree_key     *lkp;
3719                 union xfs_btree_ptr     *lpp;
3720
3721                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3722                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3723
3724 #ifdef DEBUG
3725                 for (i = 0; i < numrecs - ptr; i++) {
3726                         error = xfs_btree_check_ptr(cur, lpp, i, level);
3727                         if (error)
3728                                 goto error0;
3729                 }
3730 #endif
3731
3732                 if (ptr < numrecs) {
3733                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3734                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3735                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3736                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3737                 }
3738         } else {
3739                 /* It's a leaf. operate on records */
3740                 if (ptr < numrecs) {
3741                         xfs_btree_shift_recs(cur,
3742                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3743                                 -1, numrecs - ptr);
3744                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3745                 }
3746         }
3747
3748         /*
3749          * Decrement and log the number of entries in the block.
3750          */
3751         xfs_btree_set_numrecs(block, --numrecs);
3752         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3753
3754         /*
3755          * If we are tracking the last record in the tree and
3756          * we are at the far right edge of the tree, update it.
3757          */
3758         if (xfs_btree_is_lastrec(cur, block, level)) {
3759                 cur->bc_ops->update_lastrec(cur, block, NULL,
3760                                             ptr, LASTREC_DELREC);
3761         }
3762
3763         /*
3764          * We're at the root level.  First, shrink the root block in-memory.
3765          * Try to get rid of the next level down.  If we can't then there's
3766          * nothing left to do.
3767          */
3768         if (level == cur->bc_nlevels - 1) {
3769                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3770                         xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3771                                           cur->bc_private.b.whichfork);
3772
3773                         error = xfs_btree_kill_iroot(cur);
3774                         if (error)
3775                                 goto error0;
3776
3777                         error = xfs_btree_dec_cursor(cur, level, stat);
3778                         if (error)
3779                                 goto error0;
3780                         *stat = 1;
3781                         return 0;
3782                 }
3783
3784                 /*
3785                  * If this is the root level, and there's only one entry left,
3786                  * and it's NOT the leaf level, then we can get rid of this
3787                  * level.
3788                  */
3789                 if (numrecs == 1 && level > 0) {
3790                         union xfs_btree_ptr     *pp;
3791                         /*
3792                          * pp is still set to the first pointer in the block.
3793                          * Make it the new root of the btree.
3794                          */
3795                         pp = xfs_btree_ptr_addr(cur, 1, block);
3796                         error = xfs_btree_kill_root(cur, bp, level, pp);
3797                         if (error)
3798                                 goto error0;
3799                 } else if (level > 0) {
3800                         error = xfs_btree_dec_cursor(cur, level, stat);
3801                         if (error)
3802                                 goto error0;
3803                 }
3804                 *stat = 1;
3805                 return 0;
3806         }
3807
3808         /*
3809          * If we deleted the leftmost entry in the block, update the
3810          * key values above us in the tree.
3811          */
3812         if (xfs_btree_needs_key_update(cur, ptr)) {
3813                 error = xfs_btree_update_keys(cur, level);
3814                 if (error)
3815                         goto error0;
3816         }
3817
3818         /*
3819          * If the number of records remaining in the block is at least
3820          * the minimum, we're done.
3821          */
3822         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3823                 error = xfs_btree_dec_cursor(cur, level, stat);
3824                 if (error)
3825                         goto error0;
3826                 return 0;
3827         }
3828
3829         /*
3830          * Otherwise, we have to move some records around to keep the
3831          * tree balanced.  Look at the left and right sibling blocks to
3832          * see if we can re-balance by moving only one record.
3833          */
3834         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3835         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3836
3837         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3838                 /*
3839                  * One child of root, need to get a chance to copy its contents
3840                  * into the root and delete it. Can't go up to next level,
3841                  * there's nothing to delete there.
3842                  */
3843                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3844                     xfs_btree_ptr_is_null(cur, &lptr) &&
3845                     level == cur->bc_nlevels - 2) {
3846                         error = xfs_btree_kill_iroot(cur);
3847                         if (!error)
3848                                 error = xfs_btree_dec_cursor(cur, level, stat);
3849                         if (error)
3850                                 goto error0;
3851                         return 0;
3852                 }
3853         }
3854
3855         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3856                !xfs_btree_ptr_is_null(cur, &lptr));
3857
3858         /*
3859          * Duplicate the cursor so our btree manipulations here won't
3860          * disrupt the next level up.
3861          */
3862         error = xfs_btree_dup_cursor(cur, &tcur);
3863         if (error)
3864                 goto error0;
3865
3866         /*
3867          * If there's a right sibling, see if it's ok to shift an entry
3868          * out of it.
3869          */
3870         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3871                 /*
3872                  * Move the temp cursor to the last entry in the next block.
3873                  * Actually any entry but the first would suffice.
3874                  */
3875                 i = xfs_btree_lastrec(tcur, level);
3876                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3877
3878                 error = xfs_btree_increment(tcur, level, &i);
3879                 if (error)
3880                         goto error0;
3881                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3882
3883                 i = xfs_btree_lastrec(tcur, level);
3884                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3885
3886                 /* Grab a pointer to the block. */
3887                 right = xfs_btree_get_block(tcur, level, &rbp);
3888 #ifdef DEBUG
3889                 error = xfs_btree_check_block(tcur, right, level, rbp);
3890                 if (error)
3891                         goto error0;
3892 #endif
3893                 /* Grab the current block number, for future use. */
3894                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3895
3896                 /*
3897                  * If right block is full enough so that removing one entry
3898                  * won't make it too empty, and left-shifting an entry out
3899                  * of right to us works, we're done.
3900                  */
3901                 if (xfs_btree_get_numrecs(right) - 1 >=
3902                     cur->bc_ops->get_minrecs(tcur, level)) {
3903                         error = xfs_btree_lshift(tcur, level, &i);
3904                         if (error)
3905                                 goto error0;
3906                         if (i) {
3907                                 ASSERT(xfs_btree_get_numrecs(block) >=
3908                                        cur->bc_ops->get_minrecs(tcur, level));
3909
3910                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3911                                 tcur = NULL;
3912
3913                                 error = xfs_btree_dec_cursor(cur, level, stat);
3914                                 if (error)
3915                                         goto error0;
3916                                 return 0;
3917                         }
3918                 }
3919
3920                 /*
3921                  * Otherwise, grab the number of records in right for
3922                  * future reference, and fix up the temp cursor to point
3923                  * to our block again (last record).
3924                  */
3925                 rrecs = xfs_btree_get_numrecs(right);
3926                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3927                         i = xfs_btree_firstrec(tcur, level);
3928                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3929
3930                         error = xfs_btree_decrement(tcur, level, &i);
3931                         if (error)
3932                                 goto error0;
3933                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3934                 }
3935         }
3936
3937         /*
3938          * If there's a left sibling, see if it's ok to shift an entry
3939          * out of it.
3940          */
3941         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3942                 /*
3943                  * Move the temp cursor to the first entry in the
3944                  * previous block.
3945                  */
3946                 i = xfs_btree_firstrec(tcur, level);
3947                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3948
3949                 error = xfs_btree_decrement(tcur, level, &i);
3950                 if (error)
3951                         goto error0;
3952                 i = xfs_btree_firstrec(tcur, level);
3953                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3954
3955                 /* Grab a pointer to the block. */
3956                 left = xfs_btree_get_block(tcur, level, &lbp);
3957 #ifdef DEBUG
3958                 error = xfs_btree_check_block(cur, left, level, lbp);
3959                 if (error)
3960                         goto error0;
3961 #endif
3962                 /* Grab the current block number, for future use. */
3963                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3964
3965                 /*
3966                  * If left block is full enough so that removing one entry
3967                  * won't make it too empty, and right-shifting an entry out
3968                  * of left to us works, we're done.
3969                  */
3970                 if (xfs_btree_get_numrecs(left) - 1 >=
3971                     cur->bc_ops->get_minrecs(tcur, level)) {
3972                         error = xfs_btree_rshift(tcur, level, &i);
3973                         if (error)
3974                                 goto error0;
3975                         if (i) {
3976                                 ASSERT(xfs_btree_get_numrecs(block) >=
3977                                        cur->bc_ops->get_minrecs(tcur, level));
3978                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3979                                 tcur = NULL;
3980                                 if (level == 0)
3981                                         cur->bc_ptrs[0]++;
3982
3983                                 *stat = 1;
3984                                 return 0;
3985                         }
3986                 }
3987
3988                 /*
3989                  * Otherwise, grab the number of records in right for
3990                  * future reference.
3991                  */
3992                 lrecs = xfs_btree_get_numrecs(left);
3993         }
3994
3995         /* Delete the temp cursor, we're done with it. */
3996         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3997         tcur = NULL;
3998
3999         /* If here, we need to do a join to keep the tree balanced. */
4000         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
4001
4002         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
4003             lrecs + xfs_btree_get_numrecs(block) <=
4004                         cur->bc_ops->get_maxrecs(cur, level)) {
4005                 /*
4006                  * Set "right" to be the starting block,
4007                  * "left" to be the left neighbor.
4008                  */
4009                 rptr = cptr;
4010                 right = block;
4011                 rbp = bp;
4012                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4013                 if (error)
4014                         goto error0;
4015
4016         /*
4017          * If that won't work, see if we can join with the right neighbor block.
4018          */
4019         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4020                    rrecs + xfs_btree_get_numrecs(block) <=
4021                         cur->bc_ops->get_maxrecs(cur, level)) {
4022                 /*
4023                  * Set "left" to be the starting block,
4024                  * "right" to be the right neighbor.
4025                  */
4026                 lptr = cptr;
4027                 left = block;
4028                 lbp = bp;
4029                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4030                 if (error)
4031                         goto error0;
4032
4033         /*
4034          * Otherwise, we can't fix the imbalance.
4035          * Just return.  This is probably a logic error, but it's not fatal.
4036          */
4037         } else {
4038                 error = xfs_btree_dec_cursor(cur, level, stat);
4039                 if (error)
4040                         goto error0;
4041                 return 0;
4042         }
4043
4044         rrecs = xfs_btree_get_numrecs(right);
4045         lrecs = xfs_btree_get_numrecs(left);
4046
4047         /*
4048          * We're now going to join "left" and "right" by moving all the stuff
4049          * in "right" to "left" and deleting "right".
4050          */
4051         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4052         if (level > 0) {
4053                 /* It's a non-leaf.  Move keys and pointers. */
4054                 union xfs_btree_key     *lkp;   /* left btree key */
4055                 union xfs_btree_ptr     *lpp;   /* left address pointer */
4056                 union xfs_btree_key     *rkp;   /* right btree key */
4057                 union xfs_btree_ptr     *rpp;   /* right address pointer */
4058
4059                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4060                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4061                 rkp = xfs_btree_key_addr(cur, 1, right);
4062                 rpp = xfs_btree_ptr_addr(cur, 1, right);
4063 #ifdef DEBUG
4064                 for (i = 1; i < rrecs; i++) {
4065                         error = xfs_btree_check_ptr(cur, rpp, i, level);
4066                         if (error)
4067                                 goto error0;
4068                 }
4069 #endif
4070                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4071                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4072
4073                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4074                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4075         } else {
4076                 /* It's a leaf.  Move records.  */
4077                 union xfs_btree_rec     *lrp;   /* left record pointer */
4078                 union xfs_btree_rec     *rrp;   /* right record pointer */
4079
4080                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4081                 rrp = xfs_btree_rec_addr(cur, 1, right);
4082
4083                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4084                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4085         }
4086
4087         XFS_BTREE_STATS_INC(cur, join);
4088
4089         /*
4090          * Fix up the number of records and right block pointer in the
4091          * surviving block, and log it.
4092          */
4093         xfs_btree_set_numrecs(left, lrecs + rrecs);
4094         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
4095         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4096         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4097
4098         /* If there is a right sibling, point it to the remaining block. */
4099         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4100         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4101                 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4102                 if (error)
4103                         goto error0;
4104                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4105                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4106         }
4107
4108         /* Free the deleted block. */
4109         error = xfs_btree_free_block(cur, rbp);
4110         if (error)
4111                 goto error0;
4112
4113         /*
4114          * If we joined with the left neighbor, set the buffer in the
4115          * cursor to the left block, and fix up the index.
4116          */
4117         if (bp != lbp) {
4118                 cur->bc_bufs[level] = lbp;
4119                 cur->bc_ptrs[level] += lrecs;
4120                 cur->bc_ra[level] = 0;
4121         }
4122         /*
4123          * If we joined with the right neighbor and there's a level above
4124          * us, increment the cursor at that level.
4125          */
4126         else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4127                    (level + 1 < cur->bc_nlevels)) {
4128                 error = xfs_btree_increment(cur, level + 1, &i);
4129                 if (error)
4130                         goto error0;
4131         }
4132
4133         /*
4134          * Readjust the ptr at this level if it's not a leaf, since it's
4135          * still pointing at the deletion point, which makes the cursor
4136          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4137          * We can't use decrement because it would change the next level up.
4138          */
4139         if (level > 0)
4140                 cur->bc_ptrs[level]--;
4141
4142         /*
4143          * We combined blocks, so we have to update the parent keys if the
4144          * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4145          * points to the old block so that the caller knows which record to
4146          * delete.  Therefore, the caller must be savvy enough to call updkeys
4147          * for us if we return stat == 2.  The other exit points from this
4148          * function don't require deletions further up the tree, so they can
4149          * call updkeys directly.
4150          */
4151
4152         /* Return value means the next level up has something to do. */
4153         *stat = 2;
4154         return 0;
4155
4156 error0:
4157         if (tcur)
4158                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4159         return error;
4160 }
4161
4162 /*
4163  * Delete the record pointed to by cur.
4164  * The cursor refers to the place where the record was (could be inserted)
4165  * when the operation returns.
4166  */
4167 int                                     /* error */
4168 xfs_btree_delete(
4169         struct xfs_btree_cur    *cur,
4170         int                     *stat)  /* success/failure */
4171 {
4172         int                     error;  /* error return value */
4173         int                     level;
4174         int                     i;
4175         bool                    joined = false;
4176
4177         /*
4178          * Go up the tree, starting at leaf level.
4179          *
4180          * If 2 is returned then a join was done; go to the next level.
4181          * Otherwise we are done.
4182          */
4183         for (level = 0, i = 2; i == 2; level++) {
4184                 error = xfs_btree_delrec(cur, level, &i);
4185                 if (error)
4186                         goto error0;
4187                 if (i == 2)
4188                         joined = true;
4189         }
4190
4191         /*
4192          * If we combined blocks as part of deleting the record, delrec won't
4193          * have updated the parent high keys so we have to do that here.
4194          */
4195         if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4196                 error = xfs_btree_updkeys_force(cur, 0);
4197                 if (error)
4198                         goto error0;
4199         }
4200
4201         if (i == 0) {
4202                 for (level = 1; level < cur->bc_nlevels; level++) {
4203                         if (cur->bc_ptrs[level] == 0) {
4204                                 error = xfs_btree_decrement(cur, level, &i);
4205                                 if (error)
4206                                         goto error0;
4207                                 break;
4208                         }
4209                 }
4210         }
4211
4212         *stat = i;
4213         return 0;
4214 error0:
4215         return error;
4216 }
4217
4218 /*
4219  * Get the data from the pointed-to record.
4220  */
4221 int                                     /* error */
4222 xfs_btree_get_rec(
4223         struct xfs_btree_cur    *cur,   /* btree cursor */
4224         union xfs_btree_rec     **recp, /* output: btree record */
4225         int                     *stat)  /* output: success/failure */
4226 {
4227         struct xfs_btree_block  *block; /* btree block */
4228         struct xfs_buf          *bp;    /* buffer pointer */
4229         int                     ptr;    /* record number */
4230 #ifdef DEBUG
4231         int                     error;  /* error return value */
4232 #endif
4233
4234         ptr = cur->bc_ptrs[0];
4235         block = xfs_btree_get_block(cur, 0, &bp);
4236
4237 #ifdef DEBUG
4238         error = xfs_btree_check_block(cur, block, 0, bp);
4239         if (error)
4240                 return error;
4241 #endif
4242
4243         /*
4244          * Off the right end or left end, return failure.
4245          */
4246         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4247                 *stat = 0;
4248                 return 0;
4249         }
4250
4251         /*
4252          * Point to the record and extract its data.
4253          */
4254         *recp = xfs_btree_rec_addr(cur, ptr, block);
4255         *stat = 1;
4256         return 0;
4257 }
4258
4259 /* Visit a block in a btree. */
4260 STATIC int
4261 xfs_btree_visit_block(
4262         struct xfs_btree_cur            *cur,
4263         int                             level,
4264         xfs_btree_visit_blocks_fn       fn,
4265         void                            *data)
4266 {
4267         struct xfs_btree_block          *block;
4268         struct xfs_buf                  *bp;
4269         union xfs_btree_ptr             rptr;
4270         int                             error;
4271
4272         /* do right sibling readahead */
4273         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4274         block = xfs_btree_get_block(cur, level, &bp);
4275
4276         /* process the block */
4277         error = fn(cur, level, data);
4278         if (error)
4279                 return error;
4280
4281         /* now read rh sibling block for next iteration */
4282         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4283         if (xfs_btree_ptr_is_null(cur, &rptr))
4284                 return -ENOENT;
4285
4286         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4287 }
4288
4289
4290 /* Visit every block in a btree. */
4291 int
4292 xfs_btree_visit_blocks(
4293         struct xfs_btree_cur            *cur,
4294         xfs_btree_visit_blocks_fn       fn,
4295         void                            *data)
4296 {
4297         union xfs_btree_ptr             lptr;
4298         int                             level;
4299         struct xfs_btree_block          *block = NULL;
4300         int                             error = 0;
4301
4302         cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4303
4304         /* for each level */
4305         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4306                 /* grab the left hand block */
4307                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4308                 if (error)
4309                         return error;
4310
4311                 /* readahead the left most block for the next level down */
4312                 if (level > 0) {
4313                         union xfs_btree_ptr     *ptr;
4314
4315                         ptr = xfs_btree_ptr_addr(cur, 1, block);
4316                         xfs_btree_readahead_ptr(cur, ptr, 1);
4317
4318                         /* save for the next iteration of the loop */
4319                         xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4320                 }
4321
4322                 /* for each buffer in the level */
4323                 do {
4324                         error = xfs_btree_visit_block(cur, level, fn, data);
4325                 } while (!error);
4326
4327                 if (error != -ENOENT)
4328                         return error;
4329         }
4330
4331         return 0;
4332 }
4333
4334 /*
4335  * Change the owner of a btree.
4336  *
4337  * The mechanism we use here is ordered buffer logging. Because we don't know
4338  * how many buffers were are going to need to modify, we don't really want to
4339  * have to make transaction reservations for the worst case of every buffer in a
4340  * full size btree as that may be more space that we can fit in the log....
4341  *
4342  * We do the btree walk in the most optimal manner possible - we have sibling
4343  * pointers so we can just walk all the blocks on each level from left to right
4344  * in a single pass, and then move to the next level and do the same. We can
4345  * also do readahead on the sibling pointers to get IO moving more quickly,
4346  * though for slow disks this is unlikely to make much difference to performance
4347  * as the amount of CPU work we have to do before moving to the next block is
4348  * relatively small.
4349  *
4350  * For each btree block that we load, modify the owner appropriately, set the
4351  * buffer as an ordered buffer and log it appropriately. We need to ensure that
4352  * we mark the region we change dirty so that if the buffer is relogged in
4353  * a subsequent transaction the changes we make here as an ordered buffer are
4354  * correctly relogged in that transaction.  If we are in recovery context, then
4355  * just queue the modified buffer as delayed write buffer so the transaction
4356  * recovery completion writes the changes to disk.
4357  */
4358 struct xfs_btree_block_change_owner_info {
4359         uint64_t                new_owner;
4360         struct list_head        *buffer_list;
4361 };
4362
4363 static int
4364 xfs_btree_block_change_owner(
4365         struct xfs_btree_cur    *cur,
4366         int                     level,
4367         void                    *data)
4368 {
4369         struct xfs_btree_block_change_owner_info        *bbcoi = data;
4370         struct xfs_btree_block  *block;
4371         struct xfs_buf          *bp;
4372
4373         /* modify the owner */
4374         block = xfs_btree_get_block(cur, level, &bp);
4375         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4376                 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4377                         return 0;
4378                 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4379         } else {
4380                 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4381                         return 0;
4382                 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4383         }
4384
4385         /*
4386          * If the block is a root block hosted in an inode, we might not have a
4387          * buffer pointer here and we shouldn't attempt to log the change as the
4388          * information is already held in the inode and discarded when the root
4389          * block is formatted into the on-disk inode fork. We still change it,
4390          * though, so everything is consistent in memory.
4391          */
4392         if (!bp) {
4393                 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4394                 ASSERT(level == cur->bc_nlevels - 1);
4395                 return 0;
4396         }
4397
4398         if (cur->bc_tp) {
4399                 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4400                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4401                         return -EAGAIN;
4402                 }
4403         } else {
4404                 xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4405         }
4406
4407         return 0;
4408 }
4409
4410 int
4411 xfs_btree_change_owner(
4412         struct xfs_btree_cur    *cur,
4413         uint64_t                new_owner,
4414         struct list_head        *buffer_list)
4415 {
4416         struct xfs_btree_block_change_owner_info        bbcoi;
4417
4418         bbcoi.new_owner = new_owner;
4419         bbcoi.buffer_list = buffer_list;
4420
4421         return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4422                         &bbcoi);
4423 }
4424
4425 /* Verify the v5 fields of a long-format btree block. */
4426 xfs_failaddr_t
4427 xfs_btree_lblock_v5hdr_verify(
4428         struct xfs_buf          *bp,
4429         uint64_t                owner)
4430 {
4431         struct xfs_mount        *mp = bp->b_target->bt_mount;
4432         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4433
4434         if (!xfs_sb_version_hascrc(&mp->m_sb))
4435                 return __this_address;
4436         if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4437                 return __this_address;
4438         if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
4439                 return __this_address;
4440         if (owner != XFS_RMAP_OWN_UNKNOWN &&
4441             be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4442                 return __this_address;
4443         return NULL;
4444 }
4445
4446 /* Verify a long-format btree block. */
4447 xfs_failaddr_t
4448 xfs_btree_lblock_verify(
4449         struct xfs_buf          *bp,
4450         unsigned int            max_recs)
4451 {
4452         struct xfs_mount        *mp = bp->b_target->bt_mount;
4453         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4454
4455         /* numrecs verification */
4456         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4457                 return __this_address;
4458
4459         /* sibling pointer verification */
4460         if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4461             !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4462                 return __this_address;
4463         if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4464             !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4465                 return __this_address;
4466
4467         return NULL;
4468 }
4469
4470 /**
4471  * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4472  *                                    btree block
4473  *
4474  * @bp: buffer containing the btree block
4475  * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4476  * @pag_max_level: pointer to the per-ag max level field
4477  */
4478 xfs_failaddr_t
4479 xfs_btree_sblock_v5hdr_verify(
4480         struct xfs_buf          *bp)
4481 {
4482         struct xfs_mount        *mp = bp->b_target->bt_mount;
4483         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4484         struct xfs_perag        *pag = bp->b_pag;
4485
4486         if (!xfs_sb_version_hascrc(&mp->m_sb))
4487                 return __this_address;
4488         if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4489                 return __this_address;
4490         if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4491                 return __this_address;
4492         if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4493                 return __this_address;
4494         return NULL;
4495 }
4496
4497 /**
4498  * xfs_btree_sblock_verify() -- verify a short-format btree block
4499  *
4500  * @bp: buffer containing the btree block
4501  * @max_recs: maximum records allowed in this btree node
4502  */
4503 xfs_failaddr_t
4504 xfs_btree_sblock_verify(
4505         struct xfs_buf          *bp,
4506         unsigned int            max_recs)
4507 {
4508         struct xfs_mount        *mp = bp->b_target->bt_mount;
4509         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4510         xfs_agblock_t           agno;
4511
4512         /* numrecs verification */
4513         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4514                 return __this_address;
4515
4516         /* sibling pointer verification */
4517         agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
4518         if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4519             !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4520                 return __this_address;
4521         if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4522             !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4523                 return __this_address;
4524
4525         return NULL;
4526 }
4527
4528 /*
4529  * Calculate the number of btree levels needed to store a given number of
4530  * records in a short-format btree.
4531  */
4532 uint
4533 xfs_btree_compute_maxlevels(
4534         uint                    *limits,
4535         unsigned long           len)
4536 {
4537         uint                    level;
4538         unsigned long           maxblocks;
4539
4540         maxblocks = (len + limits[0] - 1) / limits[0];
4541         for (level = 1; maxblocks > 1; level++)
4542                 maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4543         return level;
4544 }
4545
4546 /*
4547  * Query a regular btree for all records overlapping a given interval.
4548  * Start with a LE lookup of the key of low_rec and return all records
4549  * until we find a record with a key greater than the key of high_rec.
4550  */
4551 STATIC int
4552 xfs_btree_simple_query_range(
4553         struct xfs_btree_cur            *cur,
4554         union xfs_btree_key             *low_key,
4555         union xfs_btree_key             *high_key,
4556         xfs_btree_query_range_fn        fn,
4557         void                            *priv)
4558 {
4559         union xfs_btree_rec             *recp;
4560         union xfs_btree_key             rec_key;
4561         int64_t                         diff;
4562         int                             stat;
4563         bool                            firstrec = true;
4564         int                             error;
4565
4566         ASSERT(cur->bc_ops->init_high_key_from_rec);
4567         ASSERT(cur->bc_ops->diff_two_keys);
4568
4569         /*
4570          * Find the leftmost record.  The btree cursor must be set
4571          * to the low record used to generate low_key.
4572          */
4573         stat = 0;
4574         error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4575         if (error)
4576                 goto out;
4577
4578         /* Nothing?  See if there's anything to the right. */
4579         if (!stat) {
4580                 error = xfs_btree_increment(cur, 0, &stat);
4581                 if (error)
4582                         goto out;
4583         }
4584
4585         while (stat) {
4586                 /* Find the record. */
4587                 error = xfs_btree_get_rec(cur, &recp, &stat);
4588                 if (error || !stat)
4589                         break;
4590
4591                 /* Skip if high_key(rec) < low_key. */
4592                 if (firstrec) {
4593                         cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4594                         firstrec = false;
4595                         diff = cur->bc_ops->diff_two_keys(cur, low_key,
4596                                         &rec_key);
4597                         if (diff > 0)
4598                                 goto advloop;
4599                 }
4600
4601                 /* Stop if high_key < low_key(rec). */
4602                 cur->bc_ops->init_key_from_rec(&rec_key, recp);
4603                 diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4604                 if (diff > 0)
4605                         break;
4606
4607                 /* Callback */
4608                 error = fn(cur, recp, priv);
4609                 if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
4610                         break;
4611
4612 advloop:
4613                 /* Move on to the next record. */
4614                 error = xfs_btree_increment(cur, 0, &stat);
4615                 if (error)
4616                         break;
4617         }
4618
4619 out:
4620         return error;
4621 }
4622
4623 /*
4624  * Query an overlapped interval btree for all records overlapping a given
4625  * interval.  This function roughly follows the algorithm given in
4626  * "Interval Trees" of _Introduction to Algorithms_, which is section
4627  * 14.3 in the 2nd and 3rd editions.
4628  *
4629  * First, generate keys for the low and high records passed in.
4630  *
4631  * For any leaf node, generate the high and low keys for the record.
4632  * If the record keys overlap with the query low/high keys, pass the
4633  * record to the function iterator.
4634  *
4635  * For any internal node, compare the low and high keys of each
4636  * pointer against the query low/high keys.  If there's an overlap,
4637  * follow the pointer.
4638  *
4639  * As an optimization, we stop scanning a block when we find a low key
4640  * that is greater than the query's high key.
4641  */
4642 STATIC int
4643 xfs_btree_overlapped_query_range(
4644         struct xfs_btree_cur            *cur,
4645         union xfs_btree_key             *low_key,
4646         union xfs_btree_key             *high_key,
4647         xfs_btree_query_range_fn        fn,
4648         void                            *priv)
4649 {
4650         union xfs_btree_ptr             ptr;
4651         union xfs_btree_ptr             *pp;
4652         union xfs_btree_key             rec_key;
4653         union xfs_btree_key             rec_hkey;
4654         union xfs_btree_key             *lkp;
4655         union xfs_btree_key             *hkp;
4656         union xfs_btree_rec             *recp;
4657         struct xfs_btree_block          *block;
4658         int64_t                         ldiff;
4659         int64_t                         hdiff;
4660         int                             level;
4661         struct xfs_buf                  *bp;
4662         int                             i;
4663         int                             error;
4664
4665         /* Load the root of the btree. */
4666         level = cur->bc_nlevels - 1;
4667         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4668         error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4669         if (error)
4670                 return error;
4671         xfs_btree_get_block(cur, level, &bp);
4672         trace_xfs_btree_overlapped_query_range(cur, level, bp);
4673 #ifdef DEBUG
4674         error = xfs_btree_check_block(cur, block, level, bp);
4675         if (error)
4676                 goto out;
4677 #endif
4678         cur->bc_ptrs[level] = 1;
4679
4680         while (level < cur->bc_nlevels) {
4681                 block = xfs_btree_get_block(cur, level, &bp);
4682
4683                 /* End of node, pop back towards the root. */
4684                 if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4685 pop_up:
4686                         if (level < cur->bc_nlevels - 1)
4687                                 cur->bc_ptrs[level + 1]++;
4688                         level++;
4689                         continue;
4690                 }
4691
4692                 if (level == 0) {
4693                         /* Handle a leaf node. */
4694                         recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4695
4696                         cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4697                         ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4698                                         low_key);
4699
4700                         cur->bc_ops->init_key_from_rec(&rec_key, recp);
4701                         hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4702                                         &rec_key);
4703
4704                         /*
4705                          * If (record's high key >= query's low key) and
4706                          *    (query's high key >= record's low key), then
4707                          * this record overlaps the query range; callback.
4708                          */
4709                         if (ldiff >= 0 && hdiff >= 0) {
4710                                 error = fn(cur, recp, priv);
4711                                 if (error < 0 ||
4712                                     error == XFS_BTREE_QUERY_RANGE_ABORT)
4713                                         break;
4714                         } else if (hdiff < 0) {
4715                                 /* Record is larger than high key; pop. */
4716                                 goto pop_up;
4717                         }
4718                         cur->bc_ptrs[level]++;
4719                         continue;
4720                 }
4721
4722                 /* Handle an internal node. */
4723                 lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4724                 hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4725                 pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4726
4727                 ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4728                 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4729
4730                 /*
4731                  * If (pointer's high key >= query's low key) and
4732                  *    (query's high key >= pointer's low key), then
4733                  * this record overlaps the query range; follow pointer.
4734                  */
4735                 if (ldiff >= 0 && hdiff >= 0) {
4736                         level--;
4737                         error = xfs_btree_lookup_get_block(cur, level, pp,
4738                                         &block);
4739                         if (error)
4740                                 goto out;
4741                         xfs_btree_get_block(cur, level, &bp);
4742                         trace_xfs_btree_overlapped_query_range(cur, level, bp);
4743 #ifdef DEBUG
4744                         error = xfs_btree_check_block(cur, block, level, bp);
4745                         if (error)
4746                                 goto out;
4747 #endif
4748                         cur->bc_ptrs[level] = 1;
4749                         continue;
4750                 } else if (hdiff < 0) {
4751                         /* The low key is larger than the upper range; pop. */
4752                         goto pop_up;
4753                 }
4754                 cur->bc_ptrs[level]++;
4755         }
4756
4757 out:
4758         /*
4759          * If we don't end this function with the cursor pointing at a record
4760          * block, a subsequent non-error cursor deletion will not release
4761          * node-level buffers, causing a buffer leak.  This is quite possible
4762          * with a zero-results range query, so release the buffers if we
4763          * failed to return any results.
4764          */
4765         if (cur->bc_bufs[0] == NULL) {
4766                 for (i = 0; i < cur->bc_nlevels; i++) {
4767                         if (cur->bc_bufs[i]) {
4768                                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4769                                 cur->bc_bufs[i] = NULL;
4770                                 cur->bc_ptrs[i] = 0;
4771                                 cur->bc_ra[i] = 0;
4772                         }
4773                 }
4774         }
4775
4776         return error;
4777 }
4778
4779 /*
4780  * Query a btree for all records overlapping a given interval of keys.  The
4781  * supplied function will be called with each record found; return one of the
4782  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4783  * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
4784  * negative error code.
4785  */
4786 int
4787 xfs_btree_query_range(
4788         struct xfs_btree_cur            *cur,
4789         union xfs_btree_irec            *low_rec,
4790         union xfs_btree_irec            *high_rec,
4791         xfs_btree_query_range_fn        fn,
4792         void                            *priv)
4793 {
4794         union xfs_btree_rec             rec;
4795         union xfs_btree_key             low_key;
4796         union xfs_btree_key             high_key;
4797
4798         /* Find the keys of both ends of the interval. */
4799         cur->bc_rec = *high_rec;
4800         cur->bc_ops->init_rec_from_cur(cur, &rec);
4801         cur->bc_ops->init_key_from_rec(&high_key, &rec);
4802
4803         cur->bc_rec = *low_rec;
4804         cur->bc_ops->init_rec_from_cur(cur, &rec);
4805         cur->bc_ops->init_key_from_rec(&low_key, &rec);
4806
4807         /* Enforce low key < high key. */
4808         if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4809                 return -EINVAL;
4810
4811         if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4812                 return xfs_btree_simple_query_range(cur, &low_key,
4813                                 &high_key, fn, priv);
4814         return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4815                         fn, priv);
4816 }
4817
4818 /* Query a btree for all records. */
4819 int
4820 xfs_btree_query_all(
4821         struct xfs_btree_cur            *cur,
4822         xfs_btree_query_range_fn        fn,
4823         void                            *priv)
4824 {
4825         union xfs_btree_key             low_key;
4826         union xfs_btree_key             high_key;
4827
4828         memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4829         memset(&low_key, 0, sizeof(low_key));
4830         memset(&high_key, 0xFF, sizeof(high_key));
4831
4832         return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4833 }
4834
4835 /*
4836  * Calculate the number of blocks needed to store a given number of records
4837  * in a short-format (per-AG metadata) btree.
4838  */
4839 unsigned long long
4840 xfs_btree_calc_size(
4841         uint                    *limits,
4842         unsigned long long      len)
4843 {
4844         int                     level;
4845         int                     maxrecs;
4846         unsigned long long      rval;
4847
4848         maxrecs = limits[0];
4849         for (level = 0, rval = 0; len > 1; level++) {
4850                 len += maxrecs - 1;
4851                 do_div(len, maxrecs);
4852                 maxrecs = limits[1];
4853                 rval += len;
4854         }
4855         return rval;
4856 }
4857
4858 static int
4859 xfs_btree_count_blocks_helper(
4860         struct xfs_btree_cur    *cur,
4861         int                     level,
4862         void                    *data)
4863 {
4864         xfs_extlen_t            *blocks = data;
4865         (*blocks)++;
4866
4867         return 0;
4868 }
4869
4870 /* Count the blocks in a btree and return the result in *blocks. */
4871 int
4872 xfs_btree_count_blocks(
4873         struct xfs_btree_cur    *cur,
4874         xfs_extlen_t            *blocks)
4875 {
4876         *blocks = 0;
4877         return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4878                         blocks);
4879 }
4880
4881 /* Compare two btree pointers. */
4882 int64_t
4883 xfs_btree_diff_two_ptrs(
4884         struct xfs_btree_cur            *cur,
4885         const union xfs_btree_ptr       *a,
4886         const union xfs_btree_ptr       *b)
4887 {
4888         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4889                 return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4890         return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4891 }
4892
4893 /* If there's an extent, we're done. */
4894 STATIC int
4895 xfs_btree_has_record_helper(
4896         struct xfs_btree_cur            *cur,
4897         union xfs_btree_rec             *rec,
4898         void                            *priv)
4899 {
4900         return XFS_BTREE_QUERY_RANGE_ABORT;
4901 }
4902
4903 /* Is there a record covering a given range of keys? */
4904 int
4905 xfs_btree_has_record(
4906         struct xfs_btree_cur    *cur,
4907         union xfs_btree_irec    *low,
4908         union xfs_btree_irec    *high,
4909         bool                    *exists)
4910 {
4911         int                     error;
4912
4913         error = xfs_btree_query_range(cur, low, high,
4914                         &xfs_btree_has_record_helper, NULL);
4915         if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
4916                 *exists = true;
4917                 return 0;
4918         }
4919         *exists = false;
4920         return error;
4921 }
4922
4923 /* Are there more records in this btree? */
4924 bool
4925 xfs_btree_has_more_records(
4926         struct xfs_btree_cur    *cur)
4927 {
4928         struct xfs_btree_block  *block;
4929         struct xfs_buf          *bp;
4930
4931         block = xfs_btree_get_block(cur, 0, &bp);
4932
4933         /* There are still records in this block. */
4934         if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
4935                 return true;
4936
4937         /* There are more record blocks. */
4938         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4939                 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4940         else
4941                 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4942 }