Merge tag 'dlm-5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm
[linux-2.6-microblaze.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17 #include "tree-mod-log.h"
18
19 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
20                       *root, struct btrfs_path *path, int level);
21 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
22                       const struct btrfs_key *ins_key, struct btrfs_path *path,
23                       int data_size, int extend);
24 static int push_node_left(struct btrfs_trans_handle *trans,
25                           struct extent_buffer *dst,
26                           struct extent_buffer *src, int empty);
27 static int balance_node_right(struct btrfs_trans_handle *trans,
28                               struct extent_buffer *dst_buf,
29                               struct extent_buffer *src_buf);
30 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
31                     int level, int slot);
32
33 static const struct btrfs_csums {
34         u16             size;
35         const char      name[10];
36         const char      driver[12];
37 } btrfs_csums[] = {
38         [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
39         [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
40         [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
41         [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
42                                      .driver = "blake2b-256" },
43 };
44
45 int btrfs_super_csum_size(const struct btrfs_super_block *s)
46 {
47         u16 t = btrfs_super_csum_type(s);
48         /*
49          * csum type is validated at mount time
50          */
51         return btrfs_csums[t].size;
52 }
53
54 const char *btrfs_super_csum_name(u16 csum_type)
55 {
56         /* csum type is validated at mount time */
57         return btrfs_csums[csum_type].name;
58 }
59
60 /*
61  * Return driver name if defined, otherwise the name that's also a valid driver
62  * name
63  */
64 const char *btrfs_super_csum_driver(u16 csum_type)
65 {
66         /* csum type is validated at mount time */
67         return btrfs_csums[csum_type].driver[0] ?
68                 btrfs_csums[csum_type].driver :
69                 btrfs_csums[csum_type].name;
70 }
71
72 size_t __attribute_const__ btrfs_get_num_csums(void)
73 {
74         return ARRAY_SIZE(btrfs_csums);
75 }
76
77 struct btrfs_path *btrfs_alloc_path(void)
78 {
79         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
80 }
81
82 /* this also releases the path */
83 void btrfs_free_path(struct btrfs_path *p)
84 {
85         if (!p)
86                 return;
87         btrfs_release_path(p);
88         kmem_cache_free(btrfs_path_cachep, p);
89 }
90
91 /*
92  * path release drops references on the extent buffers in the path
93  * and it drops any locks held by this path
94  *
95  * It is safe to call this on paths that no locks or extent buffers held.
96  */
97 noinline void btrfs_release_path(struct btrfs_path *p)
98 {
99         int i;
100
101         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
102                 p->slots[i] = 0;
103                 if (!p->nodes[i])
104                         continue;
105                 if (p->locks[i]) {
106                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
107                         p->locks[i] = 0;
108                 }
109                 free_extent_buffer(p->nodes[i]);
110                 p->nodes[i] = NULL;
111         }
112 }
113
114 /*
115  * safely gets a reference on the root node of a tree.  A lock
116  * is not taken, so a concurrent writer may put a different node
117  * at the root of the tree.  See btrfs_lock_root_node for the
118  * looping required.
119  *
120  * The extent buffer returned by this has a reference taken, so
121  * it won't disappear.  It may stop being the root of the tree
122  * at any time because there are no locks held.
123  */
124 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
125 {
126         struct extent_buffer *eb;
127
128         while (1) {
129                 rcu_read_lock();
130                 eb = rcu_dereference(root->node);
131
132                 /*
133                  * RCU really hurts here, we could free up the root node because
134                  * it was COWed but we may not get the new root node yet so do
135                  * the inc_not_zero dance and if it doesn't work then
136                  * synchronize_rcu and try again.
137                  */
138                 if (atomic_inc_not_zero(&eb->refs)) {
139                         rcu_read_unlock();
140                         break;
141                 }
142                 rcu_read_unlock();
143                 synchronize_rcu();
144         }
145         return eb;
146 }
147
148 /*
149  * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
150  * just get put onto a simple dirty list.  Transaction walks this list to make
151  * sure they get properly updated on disk.
152  */
153 static void add_root_to_dirty_list(struct btrfs_root *root)
154 {
155         struct btrfs_fs_info *fs_info = root->fs_info;
156
157         if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
158             !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
159                 return;
160
161         spin_lock(&fs_info->trans_lock);
162         if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
163                 /* Want the extent tree to be the last on the list */
164                 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
165                         list_move_tail(&root->dirty_list,
166                                        &fs_info->dirty_cowonly_roots);
167                 else
168                         list_move(&root->dirty_list,
169                                   &fs_info->dirty_cowonly_roots);
170         }
171         spin_unlock(&fs_info->trans_lock);
172 }
173
174 /*
175  * used by snapshot creation to make a copy of a root for a tree with
176  * a given objectid.  The buffer with the new root node is returned in
177  * cow_ret, and this func returns zero on success or a negative error code.
178  */
179 int btrfs_copy_root(struct btrfs_trans_handle *trans,
180                       struct btrfs_root *root,
181                       struct extent_buffer *buf,
182                       struct extent_buffer **cow_ret, u64 new_root_objectid)
183 {
184         struct btrfs_fs_info *fs_info = root->fs_info;
185         struct extent_buffer *cow;
186         int ret = 0;
187         int level;
188         struct btrfs_disk_key disk_key;
189
190         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
191                 trans->transid != fs_info->running_transaction->transid);
192         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
193                 trans->transid != root->last_trans);
194
195         level = btrfs_header_level(buf);
196         if (level == 0)
197                 btrfs_item_key(buf, &disk_key, 0);
198         else
199                 btrfs_node_key(buf, &disk_key, 0);
200
201         cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
202                                      &disk_key, level, buf->start, 0,
203                                      BTRFS_NESTING_NEW_ROOT);
204         if (IS_ERR(cow))
205                 return PTR_ERR(cow);
206
207         copy_extent_buffer_full(cow, buf);
208         btrfs_set_header_bytenr(cow, cow->start);
209         btrfs_set_header_generation(cow, trans->transid);
210         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
211         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
212                                      BTRFS_HEADER_FLAG_RELOC);
213         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
214                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
215         else
216                 btrfs_set_header_owner(cow, new_root_objectid);
217
218         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
219
220         WARN_ON(btrfs_header_generation(buf) > trans->transid);
221         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
222                 ret = btrfs_inc_ref(trans, root, cow, 1);
223         else
224                 ret = btrfs_inc_ref(trans, root, cow, 0);
225         if (ret) {
226                 btrfs_tree_unlock(cow);
227                 free_extent_buffer(cow);
228                 btrfs_abort_transaction(trans, ret);
229                 return ret;
230         }
231
232         btrfs_mark_buffer_dirty(cow);
233         *cow_ret = cow;
234         return 0;
235 }
236
237 /*
238  * check if the tree block can be shared by multiple trees
239  */
240 int btrfs_block_can_be_shared(struct btrfs_root *root,
241                               struct extent_buffer *buf)
242 {
243         /*
244          * Tree blocks not in shareable trees and tree roots are never shared.
245          * If a block was allocated after the last snapshot and the block was
246          * not allocated by tree relocation, we know the block is not shared.
247          */
248         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
249             buf != root->node && buf != root->commit_root &&
250             (btrfs_header_generation(buf) <=
251              btrfs_root_last_snapshot(&root->root_item) ||
252              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
253                 return 1;
254
255         return 0;
256 }
257
258 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
259                                        struct btrfs_root *root,
260                                        struct extent_buffer *buf,
261                                        struct extent_buffer *cow,
262                                        int *last_ref)
263 {
264         struct btrfs_fs_info *fs_info = root->fs_info;
265         u64 refs;
266         u64 owner;
267         u64 flags;
268         u64 new_flags = 0;
269         int ret;
270
271         /*
272          * Backrefs update rules:
273          *
274          * Always use full backrefs for extent pointers in tree block
275          * allocated by tree relocation.
276          *
277          * If a shared tree block is no longer referenced by its owner
278          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
279          * use full backrefs for extent pointers in tree block.
280          *
281          * If a tree block is been relocating
282          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
283          * use full backrefs for extent pointers in tree block.
284          * The reason for this is some operations (such as drop tree)
285          * are only allowed for blocks use full backrefs.
286          */
287
288         if (btrfs_block_can_be_shared(root, buf)) {
289                 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
290                                                btrfs_header_level(buf), 1,
291                                                &refs, &flags);
292                 if (ret)
293                         return ret;
294                 if (refs == 0) {
295                         ret = -EROFS;
296                         btrfs_handle_fs_error(fs_info, ret, NULL);
297                         return ret;
298                 }
299         } else {
300                 refs = 1;
301                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
302                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
303                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
304                 else
305                         flags = 0;
306         }
307
308         owner = btrfs_header_owner(buf);
309         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
310                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
311
312         if (refs > 1) {
313                 if ((owner == root->root_key.objectid ||
314                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
315                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
316                         ret = btrfs_inc_ref(trans, root, buf, 1);
317                         if (ret)
318                                 return ret;
319
320                         if (root->root_key.objectid ==
321                             BTRFS_TREE_RELOC_OBJECTID) {
322                                 ret = btrfs_dec_ref(trans, root, buf, 0);
323                                 if (ret)
324                                         return ret;
325                                 ret = btrfs_inc_ref(trans, root, cow, 1);
326                                 if (ret)
327                                         return ret;
328                         }
329                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
330                 } else {
331
332                         if (root->root_key.objectid ==
333                             BTRFS_TREE_RELOC_OBJECTID)
334                                 ret = btrfs_inc_ref(trans, root, cow, 1);
335                         else
336                                 ret = btrfs_inc_ref(trans, root, cow, 0);
337                         if (ret)
338                                 return ret;
339                 }
340                 if (new_flags != 0) {
341                         int level = btrfs_header_level(buf);
342
343                         ret = btrfs_set_disk_extent_flags(trans, buf,
344                                                           new_flags, level, 0);
345                         if (ret)
346                                 return ret;
347                 }
348         } else {
349                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
350                         if (root->root_key.objectid ==
351                             BTRFS_TREE_RELOC_OBJECTID)
352                                 ret = btrfs_inc_ref(trans, root, cow, 1);
353                         else
354                                 ret = btrfs_inc_ref(trans, root, cow, 0);
355                         if (ret)
356                                 return ret;
357                         ret = btrfs_dec_ref(trans, root, buf, 1);
358                         if (ret)
359                                 return ret;
360                 }
361                 btrfs_clean_tree_block(buf);
362                 *last_ref = 1;
363         }
364         return 0;
365 }
366
367 /*
368  * does the dirty work in cow of a single block.  The parent block (if
369  * supplied) is updated to point to the new cow copy.  The new buffer is marked
370  * dirty and returned locked.  If you modify the block it needs to be marked
371  * dirty again.
372  *
373  * search_start -- an allocation hint for the new block
374  *
375  * empty_size -- a hint that you plan on doing more cow.  This is the size in
376  * bytes the allocator should try to find free next to the block it returns.
377  * This is just a hint and may be ignored by the allocator.
378  */
379 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
380                              struct btrfs_root *root,
381                              struct extent_buffer *buf,
382                              struct extent_buffer *parent, int parent_slot,
383                              struct extent_buffer **cow_ret,
384                              u64 search_start, u64 empty_size,
385                              enum btrfs_lock_nesting nest)
386 {
387         struct btrfs_fs_info *fs_info = root->fs_info;
388         struct btrfs_disk_key disk_key;
389         struct extent_buffer *cow;
390         int level, ret;
391         int last_ref = 0;
392         int unlock_orig = 0;
393         u64 parent_start = 0;
394
395         if (*cow_ret == buf)
396                 unlock_orig = 1;
397
398         btrfs_assert_tree_locked(buf);
399
400         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
401                 trans->transid != fs_info->running_transaction->transid);
402         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
403                 trans->transid != root->last_trans);
404
405         level = btrfs_header_level(buf);
406
407         if (level == 0)
408                 btrfs_item_key(buf, &disk_key, 0);
409         else
410                 btrfs_node_key(buf, &disk_key, 0);
411
412         if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
413                 parent_start = parent->start;
414
415         cow = btrfs_alloc_tree_block(trans, root, parent_start,
416                                      root->root_key.objectid, &disk_key, level,
417                                      search_start, empty_size, nest);
418         if (IS_ERR(cow))
419                 return PTR_ERR(cow);
420
421         /* cow is set to blocking by btrfs_init_new_buffer */
422
423         copy_extent_buffer_full(cow, buf);
424         btrfs_set_header_bytenr(cow, cow->start);
425         btrfs_set_header_generation(cow, trans->transid);
426         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
427         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
428                                      BTRFS_HEADER_FLAG_RELOC);
429         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
430                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
431         else
432                 btrfs_set_header_owner(cow, root->root_key.objectid);
433
434         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
435
436         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
437         if (ret) {
438                 btrfs_tree_unlock(cow);
439                 free_extent_buffer(cow);
440                 btrfs_abort_transaction(trans, ret);
441                 return ret;
442         }
443
444         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
445                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
446                 if (ret) {
447                         btrfs_tree_unlock(cow);
448                         free_extent_buffer(cow);
449                         btrfs_abort_transaction(trans, ret);
450                         return ret;
451                 }
452         }
453
454         if (buf == root->node) {
455                 WARN_ON(parent && parent != buf);
456                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
457                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
458                         parent_start = buf->start;
459
460                 atomic_inc(&cow->refs);
461                 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
462                 BUG_ON(ret < 0);
463                 rcu_assign_pointer(root->node, cow);
464
465                 btrfs_free_tree_block(trans, root, buf, parent_start,
466                                       last_ref);
467                 free_extent_buffer(buf);
468                 add_root_to_dirty_list(root);
469         } else {
470                 WARN_ON(trans->transid != btrfs_header_generation(parent));
471                 btrfs_tree_mod_log_insert_key(parent, parent_slot,
472                                               BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
473                 btrfs_set_node_blockptr(parent, parent_slot,
474                                         cow->start);
475                 btrfs_set_node_ptr_generation(parent, parent_slot,
476                                               trans->transid);
477                 btrfs_mark_buffer_dirty(parent);
478                 if (last_ref) {
479                         ret = btrfs_tree_mod_log_free_eb(buf);
480                         if (ret) {
481                                 btrfs_tree_unlock(cow);
482                                 free_extent_buffer(cow);
483                                 btrfs_abort_transaction(trans, ret);
484                                 return ret;
485                         }
486                 }
487                 btrfs_free_tree_block(trans, root, buf, parent_start,
488                                       last_ref);
489         }
490         if (unlock_orig)
491                 btrfs_tree_unlock(buf);
492         free_extent_buffer_stale(buf);
493         btrfs_mark_buffer_dirty(cow);
494         *cow_ret = cow;
495         return 0;
496 }
497
498 static inline int should_cow_block(struct btrfs_trans_handle *trans,
499                                    struct btrfs_root *root,
500                                    struct extent_buffer *buf)
501 {
502         if (btrfs_is_testing(root->fs_info))
503                 return 0;
504
505         /* Ensure we can see the FORCE_COW bit */
506         smp_mb__before_atomic();
507
508         /*
509          * We do not need to cow a block if
510          * 1) this block is not created or changed in this transaction;
511          * 2) this block does not belong to TREE_RELOC tree;
512          * 3) the root is not forced COW.
513          *
514          * What is forced COW:
515          *    when we create snapshot during committing the transaction,
516          *    after we've finished copying src root, we must COW the shared
517          *    block to ensure the metadata consistency.
518          */
519         if (btrfs_header_generation(buf) == trans->transid &&
520             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
521             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
522               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
523             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
524                 return 0;
525         return 1;
526 }
527
528 /*
529  * cows a single block, see __btrfs_cow_block for the real work.
530  * This version of it has extra checks so that a block isn't COWed more than
531  * once per transaction, as long as it hasn't been written yet
532  */
533 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
534                     struct btrfs_root *root, struct extent_buffer *buf,
535                     struct extent_buffer *parent, int parent_slot,
536                     struct extent_buffer **cow_ret,
537                     enum btrfs_lock_nesting nest)
538 {
539         struct btrfs_fs_info *fs_info = root->fs_info;
540         u64 search_start;
541         int ret;
542
543         if (test_bit(BTRFS_ROOT_DELETING, &root->state))
544                 btrfs_err(fs_info,
545                         "COW'ing blocks on a fs root that's being dropped");
546
547         if (trans->transaction != fs_info->running_transaction)
548                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
549                        trans->transid,
550                        fs_info->running_transaction->transid);
551
552         if (trans->transid != fs_info->generation)
553                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
554                        trans->transid, fs_info->generation);
555
556         if (!should_cow_block(trans, root, buf)) {
557                 *cow_ret = buf;
558                 return 0;
559         }
560
561         search_start = buf->start & ~((u64)SZ_1G - 1);
562
563         /*
564          * Before CoWing this block for later modification, check if it's
565          * the subtree root and do the delayed subtree trace if needed.
566          *
567          * Also We don't care about the error, as it's handled internally.
568          */
569         btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
570         ret = __btrfs_cow_block(trans, root, buf, parent,
571                                  parent_slot, cow_ret, search_start, 0, nest);
572
573         trace_btrfs_cow_block(root, buf, *cow_ret);
574
575         return ret;
576 }
577 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
578
579 /*
580  * helper function for defrag to decide if two blocks pointed to by a
581  * node are actually close by
582  */
583 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
584 {
585         if (blocknr < other && other - (blocknr + blocksize) < 32768)
586                 return 1;
587         if (blocknr > other && blocknr - (other + blocksize) < 32768)
588                 return 1;
589         return 0;
590 }
591
592 #ifdef __LITTLE_ENDIAN
593
594 /*
595  * Compare two keys, on little-endian the disk order is same as CPU order and
596  * we can avoid the conversion.
597  */
598 static int comp_keys(const struct btrfs_disk_key *disk_key,
599                      const struct btrfs_key *k2)
600 {
601         const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
602
603         return btrfs_comp_cpu_keys(k1, k2);
604 }
605
606 #else
607
608 /*
609  * compare two keys in a memcmp fashion
610  */
611 static int comp_keys(const struct btrfs_disk_key *disk,
612                      const struct btrfs_key *k2)
613 {
614         struct btrfs_key k1;
615
616         btrfs_disk_key_to_cpu(&k1, disk);
617
618         return btrfs_comp_cpu_keys(&k1, k2);
619 }
620 #endif
621
622 /*
623  * same as comp_keys only with two btrfs_key's
624  */
625 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
626 {
627         if (k1->objectid > k2->objectid)
628                 return 1;
629         if (k1->objectid < k2->objectid)
630                 return -1;
631         if (k1->type > k2->type)
632                 return 1;
633         if (k1->type < k2->type)
634                 return -1;
635         if (k1->offset > k2->offset)
636                 return 1;
637         if (k1->offset < k2->offset)
638                 return -1;
639         return 0;
640 }
641
642 /*
643  * this is used by the defrag code to go through all the
644  * leaves pointed to by a node and reallocate them so that
645  * disk order is close to key order
646  */
647 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
648                        struct btrfs_root *root, struct extent_buffer *parent,
649                        int start_slot, u64 *last_ret,
650                        struct btrfs_key *progress)
651 {
652         struct btrfs_fs_info *fs_info = root->fs_info;
653         struct extent_buffer *cur;
654         u64 blocknr;
655         u64 search_start = *last_ret;
656         u64 last_block = 0;
657         u64 other;
658         u32 parent_nritems;
659         int end_slot;
660         int i;
661         int err = 0;
662         u32 blocksize;
663         int progress_passed = 0;
664         struct btrfs_disk_key disk_key;
665
666         WARN_ON(trans->transaction != fs_info->running_transaction);
667         WARN_ON(trans->transid != fs_info->generation);
668
669         parent_nritems = btrfs_header_nritems(parent);
670         blocksize = fs_info->nodesize;
671         end_slot = parent_nritems - 1;
672
673         if (parent_nritems <= 1)
674                 return 0;
675
676         for (i = start_slot; i <= end_slot; i++) {
677                 int close = 1;
678
679                 btrfs_node_key(parent, &disk_key, i);
680                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
681                         continue;
682
683                 progress_passed = 1;
684                 blocknr = btrfs_node_blockptr(parent, i);
685                 if (last_block == 0)
686                         last_block = blocknr;
687
688                 if (i > 0) {
689                         other = btrfs_node_blockptr(parent, i - 1);
690                         close = close_blocks(blocknr, other, blocksize);
691                 }
692                 if (!close && i < end_slot) {
693                         other = btrfs_node_blockptr(parent, i + 1);
694                         close = close_blocks(blocknr, other, blocksize);
695                 }
696                 if (close) {
697                         last_block = blocknr;
698                         continue;
699                 }
700
701                 cur = btrfs_read_node_slot(parent, i);
702                 if (IS_ERR(cur))
703                         return PTR_ERR(cur);
704                 if (search_start == 0)
705                         search_start = last_block;
706
707                 btrfs_tree_lock(cur);
708                 err = __btrfs_cow_block(trans, root, cur, parent, i,
709                                         &cur, search_start,
710                                         min(16 * blocksize,
711                                             (end_slot - i) * blocksize),
712                                         BTRFS_NESTING_COW);
713                 if (err) {
714                         btrfs_tree_unlock(cur);
715                         free_extent_buffer(cur);
716                         break;
717                 }
718                 search_start = cur->start;
719                 last_block = cur->start;
720                 *last_ret = search_start;
721                 btrfs_tree_unlock(cur);
722                 free_extent_buffer(cur);
723         }
724         return err;
725 }
726
727 /*
728  * search for key in the extent_buffer.  The items start at offset p,
729  * and they are item_size apart.
730  *
731  * the slot in the array is returned via slot, and it points to
732  * the place where you would insert key if it is not found in
733  * the array.
734  *
735  * Slot may point to total number of items if the key is bigger than
736  * all of the keys
737  */
738 static noinline int generic_bin_search(struct extent_buffer *eb,
739                                        unsigned long p, int item_size,
740                                        const struct btrfs_key *key, int *slot)
741 {
742         int low = 0;
743         int high = btrfs_header_nritems(eb);
744         int ret;
745         const int key_size = sizeof(struct btrfs_disk_key);
746
747         if (low > high) {
748                 btrfs_err(eb->fs_info,
749                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
750                           __func__, low, high, eb->start,
751                           btrfs_header_owner(eb), btrfs_header_level(eb));
752                 return -EINVAL;
753         }
754
755         while (low < high) {
756                 unsigned long oip;
757                 unsigned long offset;
758                 struct btrfs_disk_key *tmp;
759                 struct btrfs_disk_key unaligned;
760                 int mid;
761
762                 mid = (low + high) / 2;
763                 offset = p + mid * item_size;
764                 oip = offset_in_page(offset);
765
766                 if (oip + key_size <= PAGE_SIZE) {
767                         const unsigned long idx = get_eb_page_index(offset);
768                         char *kaddr = page_address(eb->pages[idx]);
769
770                         oip = get_eb_offset_in_page(eb, offset);
771                         tmp = (struct btrfs_disk_key *)(kaddr + oip);
772                 } else {
773                         read_extent_buffer(eb, &unaligned, offset, key_size);
774                         tmp = &unaligned;
775                 }
776
777                 ret = comp_keys(tmp, key);
778
779                 if (ret < 0)
780                         low = mid + 1;
781                 else if (ret > 0)
782                         high = mid;
783                 else {
784                         *slot = mid;
785                         return 0;
786                 }
787         }
788         *slot = low;
789         return 1;
790 }
791
792 /*
793  * simple bin_search frontend that does the right thing for
794  * leaves vs nodes
795  */
796 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
797                      int *slot)
798 {
799         if (btrfs_header_level(eb) == 0)
800                 return generic_bin_search(eb,
801                                           offsetof(struct btrfs_leaf, items),
802                                           sizeof(struct btrfs_item), key, slot);
803         else
804                 return generic_bin_search(eb,
805                                           offsetof(struct btrfs_node, ptrs),
806                                           sizeof(struct btrfs_key_ptr), key, slot);
807 }
808
809 static void root_add_used(struct btrfs_root *root, u32 size)
810 {
811         spin_lock(&root->accounting_lock);
812         btrfs_set_root_used(&root->root_item,
813                             btrfs_root_used(&root->root_item) + size);
814         spin_unlock(&root->accounting_lock);
815 }
816
817 static void root_sub_used(struct btrfs_root *root, u32 size)
818 {
819         spin_lock(&root->accounting_lock);
820         btrfs_set_root_used(&root->root_item,
821                             btrfs_root_used(&root->root_item) - size);
822         spin_unlock(&root->accounting_lock);
823 }
824
825 /* given a node and slot number, this reads the blocks it points to.  The
826  * extent buffer is returned with a reference taken (but unlocked).
827  */
828 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
829                                            int slot)
830 {
831         int level = btrfs_header_level(parent);
832         struct extent_buffer *eb;
833         struct btrfs_key first_key;
834
835         if (slot < 0 || slot >= btrfs_header_nritems(parent))
836                 return ERR_PTR(-ENOENT);
837
838         BUG_ON(level == 0);
839
840         btrfs_node_key_to_cpu(parent, &first_key, slot);
841         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
842                              btrfs_header_owner(parent),
843                              btrfs_node_ptr_generation(parent, slot),
844                              level - 1, &first_key);
845         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
846                 free_extent_buffer(eb);
847                 eb = ERR_PTR(-EIO);
848         }
849
850         return eb;
851 }
852
853 /*
854  * node level balancing, used to make sure nodes are in proper order for
855  * item deletion.  We balance from the top down, so we have to make sure
856  * that a deletion won't leave an node completely empty later on.
857  */
858 static noinline int balance_level(struct btrfs_trans_handle *trans,
859                          struct btrfs_root *root,
860                          struct btrfs_path *path, int level)
861 {
862         struct btrfs_fs_info *fs_info = root->fs_info;
863         struct extent_buffer *right = NULL;
864         struct extent_buffer *mid;
865         struct extent_buffer *left = NULL;
866         struct extent_buffer *parent = NULL;
867         int ret = 0;
868         int wret;
869         int pslot;
870         int orig_slot = path->slots[level];
871         u64 orig_ptr;
872
873         ASSERT(level > 0);
874
875         mid = path->nodes[level];
876
877         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
878         WARN_ON(btrfs_header_generation(mid) != trans->transid);
879
880         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
881
882         if (level < BTRFS_MAX_LEVEL - 1) {
883                 parent = path->nodes[level + 1];
884                 pslot = path->slots[level + 1];
885         }
886
887         /*
888          * deal with the case where there is only one pointer in the root
889          * by promoting the node below to a root
890          */
891         if (!parent) {
892                 struct extent_buffer *child;
893
894                 if (btrfs_header_nritems(mid) != 1)
895                         return 0;
896
897                 /* promote the child to a root */
898                 child = btrfs_read_node_slot(mid, 0);
899                 if (IS_ERR(child)) {
900                         ret = PTR_ERR(child);
901                         btrfs_handle_fs_error(fs_info, ret, NULL);
902                         goto enospc;
903                 }
904
905                 btrfs_tree_lock(child);
906                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
907                                       BTRFS_NESTING_COW);
908                 if (ret) {
909                         btrfs_tree_unlock(child);
910                         free_extent_buffer(child);
911                         goto enospc;
912                 }
913
914                 ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
915                 BUG_ON(ret < 0);
916                 rcu_assign_pointer(root->node, child);
917
918                 add_root_to_dirty_list(root);
919                 btrfs_tree_unlock(child);
920
921                 path->locks[level] = 0;
922                 path->nodes[level] = NULL;
923                 btrfs_clean_tree_block(mid);
924                 btrfs_tree_unlock(mid);
925                 /* once for the path */
926                 free_extent_buffer(mid);
927
928                 root_sub_used(root, mid->len);
929                 btrfs_free_tree_block(trans, root, mid, 0, 1);
930                 /* once for the root ptr */
931                 free_extent_buffer_stale(mid);
932                 return 0;
933         }
934         if (btrfs_header_nritems(mid) >
935             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
936                 return 0;
937
938         left = btrfs_read_node_slot(parent, pslot - 1);
939         if (IS_ERR(left))
940                 left = NULL;
941
942         if (left) {
943                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
944                 wret = btrfs_cow_block(trans, root, left,
945                                        parent, pslot - 1, &left,
946                                        BTRFS_NESTING_LEFT_COW);
947                 if (wret) {
948                         ret = wret;
949                         goto enospc;
950                 }
951         }
952
953         right = btrfs_read_node_slot(parent, pslot + 1);
954         if (IS_ERR(right))
955                 right = NULL;
956
957         if (right) {
958                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
959                 wret = btrfs_cow_block(trans, root, right,
960                                        parent, pslot + 1, &right,
961                                        BTRFS_NESTING_RIGHT_COW);
962                 if (wret) {
963                         ret = wret;
964                         goto enospc;
965                 }
966         }
967
968         /* first, try to make some room in the middle buffer */
969         if (left) {
970                 orig_slot += btrfs_header_nritems(left);
971                 wret = push_node_left(trans, left, mid, 1);
972                 if (wret < 0)
973                         ret = wret;
974         }
975
976         /*
977          * then try to empty the right most buffer into the middle
978          */
979         if (right) {
980                 wret = push_node_left(trans, mid, right, 1);
981                 if (wret < 0 && wret != -ENOSPC)
982                         ret = wret;
983                 if (btrfs_header_nritems(right) == 0) {
984                         btrfs_clean_tree_block(right);
985                         btrfs_tree_unlock(right);
986                         del_ptr(root, path, level + 1, pslot + 1);
987                         root_sub_used(root, right->len);
988                         btrfs_free_tree_block(trans, root, right, 0, 1);
989                         free_extent_buffer_stale(right);
990                         right = NULL;
991                 } else {
992                         struct btrfs_disk_key right_key;
993                         btrfs_node_key(right, &right_key, 0);
994                         ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
995                                         BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
996                         BUG_ON(ret < 0);
997                         btrfs_set_node_key(parent, &right_key, pslot + 1);
998                         btrfs_mark_buffer_dirty(parent);
999                 }
1000         }
1001         if (btrfs_header_nritems(mid) == 1) {
1002                 /*
1003                  * we're not allowed to leave a node with one item in the
1004                  * tree during a delete.  A deletion from lower in the tree
1005                  * could try to delete the only pointer in this node.
1006                  * So, pull some keys from the left.
1007                  * There has to be a left pointer at this point because
1008                  * otherwise we would have pulled some pointers from the
1009                  * right
1010                  */
1011                 if (!left) {
1012                         ret = -EROFS;
1013                         btrfs_handle_fs_error(fs_info, ret, NULL);
1014                         goto enospc;
1015                 }
1016                 wret = balance_node_right(trans, mid, left);
1017                 if (wret < 0) {
1018                         ret = wret;
1019                         goto enospc;
1020                 }
1021                 if (wret == 1) {
1022                         wret = push_node_left(trans, left, mid, 1);
1023                         if (wret < 0)
1024                                 ret = wret;
1025                 }
1026                 BUG_ON(wret == 1);
1027         }
1028         if (btrfs_header_nritems(mid) == 0) {
1029                 btrfs_clean_tree_block(mid);
1030                 btrfs_tree_unlock(mid);
1031                 del_ptr(root, path, level + 1, pslot);
1032                 root_sub_used(root, mid->len);
1033                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1034                 free_extent_buffer_stale(mid);
1035                 mid = NULL;
1036         } else {
1037                 /* update the parent key to reflect our changes */
1038                 struct btrfs_disk_key mid_key;
1039                 btrfs_node_key(mid, &mid_key, 0);
1040                 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1041                                 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1042                 BUG_ON(ret < 0);
1043                 btrfs_set_node_key(parent, &mid_key, pslot);
1044                 btrfs_mark_buffer_dirty(parent);
1045         }
1046
1047         /* update the path */
1048         if (left) {
1049                 if (btrfs_header_nritems(left) > orig_slot) {
1050                         atomic_inc(&left->refs);
1051                         /* left was locked after cow */
1052                         path->nodes[level] = left;
1053                         path->slots[level + 1] -= 1;
1054                         path->slots[level] = orig_slot;
1055                         if (mid) {
1056                                 btrfs_tree_unlock(mid);
1057                                 free_extent_buffer(mid);
1058                         }
1059                 } else {
1060                         orig_slot -= btrfs_header_nritems(left);
1061                         path->slots[level] = orig_slot;
1062                 }
1063         }
1064         /* double check we haven't messed things up */
1065         if (orig_ptr !=
1066             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1067                 BUG();
1068 enospc:
1069         if (right) {
1070                 btrfs_tree_unlock(right);
1071                 free_extent_buffer(right);
1072         }
1073         if (left) {
1074                 if (path->nodes[level] != left)
1075                         btrfs_tree_unlock(left);
1076                 free_extent_buffer(left);
1077         }
1078         return ret;
1079 }
1080
1081 /* Node balancing for insertion.  Here we only split or push nodes around
1082  * when they are completely full.  This is also done top down, so we
1083  * have to be pessimistic.
1084  */
1085 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1086                                           struct btrfs_root *root,
1087                                           struct btrfs_path *path, int level)
1088 {
1089         struct btrfs_fs_info *fs_info = root->fs_info;
1090         struct extent_buffer *right = NULL;
1091         struct extent_buffer *mid;
1092         struct extent_buffer *left = NULL;
1093         struct extent_buffer *parent = NULL;
1094         int ret = 0;
1095         int wret;
1096         int pslot;
1097         int orig_slot = path->slots[level];
1098
1099         if (level == 0)
1100                 return 1;
1101
1102         mid = path->nodes[level];
1103         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1104
1105         if (level < BTRFS_MAX_LEVEL - 1) {
1106                 parent = path->nodes[level + 1];
1107                 pslot = path->slots[level + 1];
1108         }
1109
1110         if (!parent)
1111                 return 1;
1112
1113         left = btrfs_read_node_slot(parent, pslot - 1);
1114         if (IS_ERR(left))
1115                 left = NULL;
1116
1117         /* first, try to make some room in the middle buffer */
1118         if (left) {
1119                 u32 left_nr;
1120
1121                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1122
1123                 left_nr = btrfs_header_nritems(left);
1124                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1125                         wret = 1;
1126                 } else {
1127                         ret = btrfs_cow_block(trans, root, left, parent,
1128                                               pslot - 1, &left,
1129                                               BTRFS_NESTING_LEFT_COW);
1130                         if (ret)
1131                                 wret = 1;
1132                         else {
1133                                 wret = push_node_left(trans, left, mid, 0);
1134                         }
1135                 }
1136                 if (wret < 0)
1137                         ret = wret;
1138                 if (wret == 0) {
1139                         struct btrfs_disk_key disk_key;
1140                         orig_slot += left_nr;
1141                         btrfs_node_key(mid, &disk_key, 0);
1142                         ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1143                                         BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1144                         BUG_ON(ret < 0);
1145                         btrfs_set_node_key(parent, &disk_key, pslot);
1146                         btrfs_mark_buffer_dirty(parent);
1147                         if (btrfs_header_nritems(left) > orig_slot) {
1148                                 path->nodes[level] = left;
1149                                 path->slots[level + 1] -= 1;
1150                                 path->slots[level] = orig_slot;
1151                                 btrfs_tree_unlock(mid);
1152                                 free_extent_buffer(mid);
1153                         } else {
1154                                 orig_slot -=
1155                                         btrfs_header_nritems(left);
1156                                 path->slots[level] = orig_slot;
1157                                 btrfs_tree_unlock(left);
1158                                 free_extent_buffer(left);
1159                         }
1160                         return 0;
1161                 }
1162                 btrfs_tree_unlock(left);
1163                 free_extent_buffer(left);
1164         }
1165         right = btrfs_read_node_slot(parent, pslot + 1);
1166         if (IS_ERR(right))
1167                 right = NULL;
1168
1169         /*
1170          * then try to empty the right most buffer into the middle
1171          */
1172         if (right) {
1173                 u32 right_nr;
1174
1175                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1176
1177                 right_nr = btrfs_header_nritems(right);
1178                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1179                         wret = 1;
1180                 } else {
1181                         ret = btrfs_cow_block(trans, root, right,
1182                                               parent, pslot + 1,
1183                                               &right, BTRFS_NESTING_RIGHT_COW);
1184                         if (ret)
1185                                 wret = 1;
1186                         else {
1187                                 wret = balance_node_right(trans, right, mid);
1188                         }
1189                 }
1190                 if (wret < 0)
1191                         ret = wret;
1192                 if (wret == 0) {
1193                         struct btrfs_disk_key disk_key;
1194
1195                         btrfs_node_key(right, &disk_key, 0);
1196                         ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1197                                         BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1198                         BUG_ON(ret < 0);
1199                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
1200                         btrfs_mark_buffer_dirty(parent);
1201
1202                         if (btrfs_header_nritems(mid) <= orig_slot) {
1203                                 path->nodes[level] = right;
1204                                 path->slots[level + 1] += 1;
1205                                 path->slots[level] = orig_slot -
1206                                         btrfs_header_nritems(mid);
1207                                 btrfs_tree_unlock(mid);
1208                                 free_extent_buffer(mid);
1209                         } else {
1210                                 btrfs_tree_unlock(right);
1211                                 free_extent_buffer(right);
1212                         }
1213                         return 0;
1214                 }
1215                 btrfs_tree_unlock(right);
1216                 free_extent_buffer(right);
1217         }
1218         return 1;
1219 }
1220
1221 /*
1222  * readahead one full node of leaves, finding things that are close
1223  * to the block in 'slot', and triggering ra on them.
1224  */
1225 static void reada_for_search(struct btrfs_fs_info *fs_info,
1226                              struct btrfs_path *path,
1227                              int level, int slot, u64 objectid)
1228 {
1229         struct extent_buffer *node;
1230         struct btrfs_disk_key disk_key;
1231         u32 nritems;
1232         u64 search;
1233         u64 target;
1234         u64 nread = 0;
1235         u64 nread_max;
1236         u32 nr;
1237         u32 blocksize;
1238         u32 nscan = 0;
1239
1240         if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1241                 return;
1242
1243         if (!path->nodes[level])
1244                 return;
1245
1246         node = path->nodes[level];
1247
1248         /*
1249          * Since the time between visiting leaves is much shorter than the time
1250          * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1251          * much IO at once (possibly random).
1252          */
1253         if (path->reada == READA_FORWARD_ALWAYS) {
1254                 if (level > 1)
1255                         nread_max = node->fs_info->nodesize;
1256                 else
1257                         nread_max = SZ_128K;
1258         } else {
1259                 nread_max = SZ_64K;
1260         }
1261
1262         search = btrfs_node_blockptr(node, slot);
1263         blocksize = fs_info->nodesize;
1264         if (path->reada != READA_FORWARD_ALWAYS) {
1265                 struct extent_buffer *eb;
1266
1267                 eb = find_extent_buffer(fs_info, search);
1268                 if (eb) {
1269                         free_extent_buffer(eb);
1270                         return;
1271                 }
1272         }
1273
1274         target = search;
1275
1276         nritems = btrfs_header_nritems(node);
1277         nr = slot;
1278
1279         while (1) {
1280                 if (path->reada == READA_BACK) {
1281                         if (nr == 0)
1282                                 break;
1283                         nr--;
1284                 } else if (path->reada == READA_FORWARD ||
1285                            path->reada == READA_FORWARD_ALWAYS) {
1286                         nr++;
1287                         if (nr >= nritems)
1288                                 break;
1289                 }
1290                 if (path->reada == READA_BACK && objectid) {
1291                         btrfs_node_key(node, &disk_key, nr);
1292                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1293                                 break;
1294                 }
1295                 search = btrfs_node_blockptr(node, nr);
1296                 if (path->reada == READA_FORWARD_ALWAYS ||
1297                     (search <= target && target - search <= 65536) ||
1298                     (search > target && search - target <= 65536)) {
1299                         btrfs_readahead_node_child(node, nr);
1300                         nread += blocksize;
1301                 }
1302                 nscan++;
1303                 if (nread > nread_max || nscan > 32)
1304                         break;
1305         }
1306 }
1307
1308 static noinline void reada_for_balance(struct btrfs_path *path, int level)
1309 {
1310         struct extent_buffer *parent;
1311         int slot;
1312         int nritems;
1313
1314         parent = path->nodes[level + 1];
1315         if (!parent)
1316                 return;
1317
1318         nritems = btrfs_header_nritems(parent);
1319         slot = path->slots[level + 1];
1320
1321         if (slot > 0)
1322                 btrfs_readahead_node_child(parent, slot - 1);
1323         if (slot + 1 < nritems)
1324                 btrfs_readahead_node_child(parent, slot + 1);
1325 }
1326
1327
1328 /*
1329  * when we walk down the tree, it is usually safe to unlock the higher layers
1330  * in the tree.  The exceptions are when our path goes through slot 0, because
1331  * operations on the tree might require changing key pointers higher up in the
1332  * tree.
1333  *
1334  * callers might also have set path->keep_locks, which tells this code to keep
1335  * the lock if the path points to the last slot in the block.  This is part of
1336  * walking through the tree, and selecting the next slot in the higher block.
1337  *
1338  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1339  * if lowest_unlock is 1, level 0 won't be unlocked
1340  */
1341 static noinline void unlock_up(struct btrfs_path *path, int level,
1342                                int lowest_unlock, int min_write_lock_level,
1343                                int *write_lock_level)
1344 {
1345         int i;
1346         int skip_level = level;
1347         int no_skips = 0;
1348         struct extent_buffer *t;
1349
1350         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1351                 if (!path->nodes[i])
1352                         break;
1353                 if (!path->locks[i])
1354                         break;
1355                 if (!no_skips && path->slots[i] == 0) {
1356                         skip_level = i + 1;
1357                         continue;
1358                 }
1359                 if (!no_skips && path->keep_locks) {
1360                         u32 nritems;
1361                         t = path->nodes[i];
1362                         nritems = btrfs_header_nritems(t);
1363                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
1364                                 skip_level = i + 1;
1365                                 continue;
1366                         }
1367                 }
1368                 if (skip_level < i && i >= lowest_unlock)
1369                         no_skips = 1;
1370
1371                 t = path->nodes[i];
1372                 if (i >= lowest_unlock && i > skip_level) {
1373                         btrfs_tree_unlock_rw(t, path->locks[i]);
1374                         path->locks[i] = 0;
1375                         if (write_lock_level &&
1376                             i > min_write_lock_level &&
1377                             i <= *write_lock_level) {
1378                                 *write_lock_level = i - 1;
1379                         }
1380                 }
1381         }
1382 }
1383
1384 /*
1385  * helper function for btrfs_search_slot.  The goal is to find a block
1386  * in cache without setting the path to blocking.  If we find the block
1387  * we return zero and the path is unchanged.
1388  *
1389  * If we can't find the block, we set the path blocking and do some
1390  * reada.  -EAGAIN is returned and the search must be repeated.
1391  */
1392 static int
1393 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1394                       struct extent_buffer **eb_ret, int level, int slot,
1395                       const struct btrfs_key *key)
1396 {
1397         struct btrfs_fs_info *fs_info = root->fs_info;
1398         u64 blocknr;
1399         u64 gen;
1400         struct extent_buffer *tmp;
1401         struct btrfs_key first_key;
1402         int ret;
1403         int parent_level;
1404
1405         blocknr = btrfs_node_blockptr(*eb_ret, slot);
1406         gen = btrfs_node_ptr_generation(*eb_ret, slot);
1407         parent_level = btrfs_header_level(*eb_ret);
1408         btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
1409
1410         tmp = find_extent_buffer(fs_info, blocknr);
1411         if (tmp) {
1412                 if (p->reada == READA_FORWARD_ALWAYS)
1413                         reada_for_search(fs_info, p, level, slot, key->objectid);
1414
1415                 /* first we do an atomic uptodate check */
1416                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1417                         /*
1418                          * Do extra check for first_key, eb can be stale due to
1419                          * being cached, read from scrub, or have multiple
1420                          * parents (shared tree blocks).
1421                          */
1422                         if (btrfs_verify_level_key(tmp,
1423                                         parent_level - 1, &first_key, gen)) {
1424                                 free_extent_buffer(tmp);
1425                                 return -EUCLEAN;
1426                         }
1427                         *eb_ret = tmp;
1428                         return 0;
1429                 }
1430
1431                 /* now we're allowed to do a blocking uptodate check */
1432                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
1433                 if (!ret) {
1434                         *eb_ret = tmp;
1435                         return 0;
1436                 }
1437                 free_extent_buffer(tmp);
1438                 btrfs_release_path(p);
1439                 return -EIO;
1440         }
1441
1442         /*
1443          * reduce lock contention at high levels
1444          * of the btree by dropping locks before
1445          * we read.  Don't release the lock on the current
1446          * level because we need to walk this node to figure
1447          * out which blocks to read.
1448          */
1449         btrfs_unlock_up_safe(p, level + 1);
1450
1451         if (p->reada != READA_NONE)
1452                 reada_for_search(fs_info, p, level, slot, key->objectid);
1453
1454         ret = -EAGAIN;
1455         tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
1456                               gen, parent_level - 1, &first_key);
1457         if (!IS_ERR(tmp)) {
1458                 /*
1459                  * If the read above didn't mark this buffer up to date,
1460                  * it will never end up being up to date.  Set ret to EIO now
1461                  * and give up so that our caller doesn't loop forever
1462                  * on our EAGAINs.
1463                  */
1464                 if (!extent_buffer_uptodate(tmp))
1465                         ret = -EIO;
1466                 free_extent_buffer(tmp);
1467         } else {
1468                 ret = PTR_ERR(tmp);
1469         }
1470
1471         btrfs_release_path(p);
1472         return ret;
1473 }
1474
1475 /*
1476  * helper function for btrfs_search_slot.  This does all of the checks
1477  * for node-level blocks and does any balancing required based on
1478  * the ins_len.
1479  *
1480  * If no extra work was required, zero is returned.  If we had to
1481  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1482  * start over
1483  */
1484 static int
1485 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1486                        struct btrfs_root *root, struct btrfs_path *p,
1487                        struct extent_buffer *b, int level, int ins_len,
1488                        int *write_lock_level)
1489 {
1490         struct btrfs_fs_info *fs_info = root->fs_info;
1491         int ret = 0;
1492
1493         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1494             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1495
1496                 if (*write_lock_level < level + 1) {
1497                         *write_lock_level = level + 1;
1498                         btrfs_release_path(p);
1499                         return -EAGAIN;
1500                 }
1501
1502                 reada_for_balance(p, level);
1503                 ret = split_node(trans, root, p, level);
1504
1505                 b = p->nodes[level];
1506         } else if (ins_len < 0 && btrfs_header_nritems(b) <
1507                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1508
1509                 if (*write_lock_level < level + 1) {
1510                         *write_lock_level = level + 1;
1511                         btrfs_release_path(p);
1512                         return -EAGAIN;
1513                 }
1514
1515                 reada_for_balance(p, level);
1516                 ret = balance_level(trans, root, p, level);
1517                 if (ret)
1518                         return ret;
1519
1520                 b = p->nodes[level];
1521                 if (!b) {
1522                         btrfs_release_path(p);
1523                         return -EAGAIN;
1524                 }
1525                 BUG_ON(btrfs_header_nritems(b) == 1);
1526         }
1527         return ret;
1528 }
1529
1530 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1531                 u64 iobjectid, u64 ioff, u8 key_type,
1532                 struct btrfs_key *found_key)
1533 {
1534         int ret;
1535         struct btrfs_key key;
1536         struct extent_buffer *eb;
1537
1538         ASSERT(path);
1539         ASSERT(found_key);
1540
1541         key.type = key_type;
1542         key.objectid = iobjectid;
1543         key.offset = ioff;
1544
1545         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1546         if (ret < 0)
1547                 return ret;
1548
1549         eb = path->nodes[0];
1550         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1551                 ret = btrfs_next_leaf(fs_root, path);
1552                 if (ret)
1553                         return ret;
1554                 eb = path->nodes[0];
1555         }
1556
1557         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1558         if (found_key->type != key.type ||
1559                         found_key->objectid != key.objectid)
1560                 return 1;
1561
1562         return 0;
1563 }
1564
1565 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1566                                                         struct btrfs_path *p,
1567                                                         int write_lock_level)
1568 {
1569         struct btrfs_fs_info *fs_info = root->fs_info;
1570         struct extent_buffer *b;
1571         int root_lock;
1572         int level = 0;
1573
1574         /* We try very hard to do read locks on the root */
1575         root_lock = BTRFS_READ_LOCK;
1576
1577         if (p->search_commit_root) {
1578                 /*
1579                  * The commit roots are read only so we always do read locks,
1580                  * and we always must hold the commit_root_sem when doing
1581                  * searches on them, the only exception is send where we don't
1582                  * want to block transaction commits for a long time, so
1583                  * we need to clone the commit root in order to avoid races
1584                  * with transaction commits that create a snapshot of one of
1585                  * the roots used by a send operation.
1586                  */
1587                 if (p->need_commit_sem) {
1588                         down_read(&fs_info->commit_root_sem);
1589                         b = btrfs_clone_extent_buffer(root->commit_root);
1590                         up_read(&fs_info->commit_root_sem);
1591                         if (!b)
1592                                 return ERR_PTR(-ENOMEM);
1593
1594                 } else {
1595                         b = root->commit_root;
1596                         atomic_inc(&b->refs);
1597                 }
1598                 level = btrfs_header_level(b);
1599                 /*
1600                  * Ensure that all callers have set skip_locking when
1601                  * p->search_commit_root = 1.
1602                  */
1603                 ASSERT(p->skip_locking == 1);
1604
1605                 goto out;
1606         }
1607
1608         if (p->skip_locking) {
1609                 b = btrfs_root_node(root);
1610                 level = btrfs_header_level(b);
1611                 goto out;
1612         }
1613
1614         /*
1615          * If the level is set to maximum, we can skip trying to get the read
1616          * lock.
1617          */
1618         if (write_lock_level < BTRFS_MAX_LEVEL) {
1619                 /*
1620                  * We don't know the level of the root node until we actually
1621                  * have it read locked
1622                  */
1623                 b = btrfs_read_lock_root_node(root);
1624                 level = btrfs_header_level(b);
1625                 if (level > write_lock_level)
1626                         goto out;
1627
1628                 /* Whoops, must trade for write lock */
1629                 btrfs_tree_read_unlock(b);
1630                 free_extent_buffer(b);
1631         }
1632
1633         b = btrfs_lock_root_node(root);
1634         root_lock = BTRFS_WRITE_LOCK;
1635
1636         /* The level might have changed, check again */
1637         level = btrfs_header_level(b);
1638
1639 out:
1640         p->nodes[level] = b;
1641         if (!p->skip_locking)
1642                 p->locks[level] = root_lock;
1643         /*
1644          * Callers are responsible for dropping b's references.
1645          */
1646         return b;
1647 }
1648
1649
1650 /*
1651  * btrfs_search_slot - look for a key in a tree and perform necessary
1652  * modifications to preserve tree invariants.
1653  *
1654  * @trans:      Handle of transaction, used when modifying the tree
1655  * @p:          Holds all btree nodes along the search path
1656  * @root:       The root node of the tree
1657  * @key:        The key we are looking for
1658  * @ins_len:    Indicates purpose of search:
1659  *              >0  for inserts it's size of item inserted (*)
1660  *              <0  for deletions
1661  *               0  for plain searches, not modifying the tree
1662  *
1663  *              (*) If size of item inserted doesn't include
1664  *              sizeof(struct btrfs_item), then p->search_for_extension must
1665  *              be set.
1666  * @cow:        boolean should CoW operations be performed. Must always be 1
1667  *              when modifying the tree.
1668  *
1669  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1670  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1671  *
1672  * If @key is found, 0 is returned and you can find the item in the leaf level
1673  * of the path (level 0)
1674  *
1675  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1676  * points to the slot where it should be inserted
1677  *
1678  * If an error is encountered while searching the tree a negative error number
1679  * is returned
1680  */
1681 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1682                       const struct btrfs_key *key, struct btrfs_path *p,
1683                       int ins_len, int cow)
1684 {
1685         struct extent_buffer *b;
1686         int slot;
1687         int ret;
1688         int err;
1689         int level;
1690         int lowest_unlock = 1;
1691         /* everything at write_lock_level or lower must be write locked */
1692         int write_lock_level = 0;
1693         u8 lowest_level = 0;
1694         int min_write_lock_level;
1695         int prev_cmp;
1696
1697         lowest_level = p->lowest_level;
1698         WARN_ON(lowest_level && ins_len > 0);
1699         WARN_ON(p->nodes[0] != NULL);
1700         BUG_ON(!cow && ins_len);
1701
1702         if (ins_len < 0) {
1703                 lowest_unlock = 2;
1704
1705                 /* when we are removing items, we might have to go up to level
1706                  * two as we update tree pointers  Make sure we keep write
1707                  * for those levels as well
1708                  */
1709                 write_lock_level = 2;
1710         } else if (ins_len > 0) {
1711                 /*
1712                  * for inserting items, make sure we have a write lock on
1713                  * level 1 so we can update keys
1714                  */
1715                 write_lock_level = 1;
1716         }
1717
1718         if (!cow)
1719                 write_lock_level = -1;
1720
1721         if (cow && (p->keep_locks || p->lowest_level))
1722                 write_lock_level = BTRFS_MAX_LEVEL;
1723
1724         min_write_lock_level = write_lock_level;
1725
1726 again:
1727         prev_cmp = -1;
1728         b = btrfs_search_slot_get_root(root, p, write_lock_level);
1729         if (IS_ERR(b)) {
1730                 ret = PTR_ERR(b);
1731                 goto done;
1732         }
1733
1734         while (b) {
1735                 int dec = 0;
1736
1737                 level = btrfs_header_level(b);
1738
1739                 if (cow) {
1740                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
1741
1742                         /*
1743                          * if we don't really need to cow this block
1744                          * then we don't want to set the path blocking,
1745                          * so we test it here
1746                          */
1747                         if (!should_cow_block(trans, root, b))
1748                                 goto cow_done;
1749
1750                         /*
1751                          * must have write locks on this node and the
1752                          * parent
1753                          */
1754                         if (level > write_lock_level ||
1755                             (level + 1 > write_lock_level &&
1756                             level + 1 < BTRFS_MAX_LEVEL &&
1757                             p->nodes[level + 1])) {
1758                                 write_lock_level = level + 1;
1759                                 btrfs_release_path(p);
1760                                 goto again;
1761                         }
1762
1763                         if (last_level)
1764                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
1765                                                       &b,
1766                                                       BTRFS_NESTING_COW);
1767                         else
1768                                 err = btrfs_cow_block(trans, root, b,
1769                                                       p->nodes[level + 1],
1770                                                       p->slots[level + 1], &b,
1771                                                       BTRFS_NESTING_COW);
1772                         if (err) {
1773                                 ret = err;
1774                                 goto done;
1775                         }
1776                 }
1777 cow_done:
1778                 p->nodes[level] = b;
1779                 /*
1780                  * Leave path with blocking locks to avoid massive
1781                  * lock context switch, this is made on purpose.
1782                  */
1783
1784                 /*
1785                  * we have a lock on b and as long as we aren't changing
1786                  * the tree, there is no way to for the items in b to change.
1787                  * It is safe to drop the lock on our parent before we
1788                  * go through the expensive btree search on b.
1789                  *
1790                  * If we're inserting or deleting (ins_len != 0), then we might
1791                  * be changing slot zero, which may require changing the parent.
1792                  * So, we can't drop the lock until after we know which slot
1793                  * we're operating on.
1794                  */
1795                 if (!ins_len && !p->keep_locks) {
1796                         int u = level + 1;
1797
1798                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
1799                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
1800                                 p->locks[u] = 0;
1801                         }
1802                 }
1803
1804                 /*
1805                  * If btrfs_bin_search returns an exact match (prev_cmp == 0)
1806                  * we can safely assume the target key will always be in slot 0
1807                  * on lower levels due to the invariants BTRFS' btree provides,
1808                  * namely that a btrfs_key_ptr entry always points to the
1809                  * lowest key in the child node, thus we can skip searching
1810                  * lower levels
1811                  */
1812                 if (prev_cmp == 0) {
1813                         slot = 0;
1814                         ret = 0;
1815                 } else {
1816                         ret = btrfs_bin_search(b, key, &slot);
1817                         prev_cmp = ret;
1818                         if (ret < 0)
1819                                 goto done;
1820                 }
1821
1822                 if (level == 0) {
1823                         p->slots[level] = slot;
1824                         /*
1825                          * Item key already exists. In this case, if we are
1826                          * allowed to insert the item (for example, in dir_item
1827                          * case, item key collision is allowed), it will be
1828                          * merged with the original item. Only the item size
1829                          * grows, no new btrfs item will be added. If
1830                          * search_for_extension is not set, ins_len already
1831                          * accounts the size btrfs_item, deduct it here so leaf
1832                          * space check will be correct.
1833                          */
1834                         if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
1835                                 ASSERT(ins_len >= sizeof(struct btrfs_item));
1836                                 ins_len -= sizeof(struct btrfs_item);
1837                         }
1838                         if (ins_len > 0 &&
1839                             btrfs_leaf_free_space(b) < ins_len) {
1840                                 if (write_lock_level < 1) {
1841                                         write_lock_level = 1;
1842                                         btrfs_release_path(p);
1843                                         goto again;
1844                                 }
1845
1846                                 err = split_leaf(trans, root, key,
1847                                                  p, ins_len, ret == 0);
1848
1849                                 BUG_ON(err > 0);
1850                                 if (err) {
1851                                         ret = err;
1852                                         goto done;
1853                                 }
1854                         }
1855                         if (!p->search_for_split)
1856                                 unlock_up(p, level, lowest_unlock,
1857                                           min_write_lock_level, NULL);
1858                         goto done;
1859                 }
1860                 if (ret && slot > 0) {
1861                         dec = 1;
1862                         slot--;
1863                 }
1864                 p->slots[level] = slot;
1865                 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
1866                                              &write_lock_level);
1867                 if (err == -EAGAIN)
1868                         goto again;
1869                 if (err) {
1870                         ret = err;
1871                         goto done;
1872                 }
1873                 b = p->nodes[level];
1874                 slot = p->slots[level];
1875
1876                 /*
1877                  * Slot 0 is special, if we change the key we have to update
1878                  * the parent pointer which means we must have a write lock on
1879                  * the parent
1880                  */
1881                 if (slot == 0 && ins_len && write_lock_level < level + 1) {
1882                         write_lock_level = level + 1;
1883                         btrfs_release_path(p);
1884                         goto again;
1885                 }
1886
1887                 unlock_up(p, level, lowest_unlock, min_write_lock_level,
1888                           &write_lock_level);
1889
1890                 if (level == lowest_level) {
1891                         if (dec)
1892                                 p->slots[level]++;
1893                         goto done;
1894                 }
1895
1896                 err = read_block_for_search(root, p, &b, level, slot, key);
1897                 if (err == -EAGAIN)
1898                         goto again;
1899                 if (err) {
1900                         ret = err;
1901                         goto done;
1902                 }
1903
1904                 if (!p->skip_locking) {
1905                         level = btrfs_header_level(b);
1906                         if (level <= write_lock_level) {
1907                                 btrfs_tree_lock(b);
1908                                 p->locks[level] = BTRFS_WRITE_LOCK;
1909                         } else {
1910                                 btrfs_tree_read_lock(b);
1911                                 p->locks[level] = BTRFS_READ_LOCK;
1912                         }
1913                         p->nodes[level] = b;
1914                 }
1915         }
1916         ret = 1;
1917 done:
1918         if (ret < 0 && !p->skip_release_on_error)
1919                 btrfs_release_path(p);
1920         return ret;
1921 }
1922 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
1923
1924 /*
1925  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
1926  * current state of the tree together with the operations recorded in the tree
1927  * modification log to search for the key in a previous version of this tree, as
1928  * denoted by the time_seq parameter.
1929  *
1930  * Naturally, there is no support for insert, delete or cow operations.
1931  *
1932  * The resulting path and return value will be set up as if we called
1933  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
1934  */
1935 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
1936                           struct btrfs_path *p, u64 time_seq)
1937 {
1938         struct btrfs_fs_info *fs_info = root->fs_info;
1939         struct extent_buffer *b;
1940         int slot;
1941         int ret;
1942         int err;
1943         int level;
1944         int lowest_unlock = 1;
1945         u8 lowest_level = 0;
1946
1947         lowest_level = p->lowest_level;
1948         WARN_ON(p->nodes[0] != NULL);
1949
1950         if (p->search_commit_root) {
1951                 BUG_ON(time_seq);
1952                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
1953         }
1954
1955 again:
1956         b = btrfs_get_old_root(root, time_seq);
1957         if (!b) {
1958                 ret = -EIO;
1959                 goto done;
1960         }
1961         level = btrfs_header_level(b);
1962         p->locks[level] = BTRFS_READ_LOCK;
1963
1964         while (b) {
1965                 int dec = 0;
1966
1967                 level = btrfs_header_level(b);
1968                 p->nodes[level] = b;
1969
1970                 /*
1971                  * we have a lock on b and as long as we aren't changing
1972                  * the tree, there is no way to for the items in b to change.
1973                  * It is safe to drop the lock on our parent before we
1974                  * go through the expensive btree search on b.
1975                  */
1976                 btrfs_unlock_up_safe(p, level + 1);
1977
1978                 ret = btrfs_bin_search(b, key, &slot);
1979                 if (ret < 0)
1980                         goto done;
1981
1982                 if (level == 0) {
1983                         p->slots[level] = slot;
1984                         unlock_up(p, level, lowest_unlock, 0, NULL);
1985                         goto done;
1986                 }
1987
1988                 if (ret && slot > 0) {
1989                         dec = 1;
1990                         slot--;
1991                 }
1992                 p->slots[level] = slot;
1993                 unlock_up(p, level, lowest_unlock, 0, NULL);
1994
1995                 if (level == lowest_level) {
1996                         if (dec)
1997                                 p->slots[level]++;
1998                         goto done;
1999                 }
2000
2001                 err = read_block_for_search(root, p, &b, level, slot, key);
2002                 if (err == -EAGAIN)
2003                         goto again;
2004                 if (err) {
2005                         ret = err;
2006                         goto done;
2007                 }
2008
2009                 level = btrfs_header_level(b);
2010                 btrfs_tree_read_lock(b);
2011                 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2012                 if (!b) {
2013                         ret = -ENOMEM;
2014                         goto done;
2015                 }
2016                 p->locks[level] = BTRFS_READ_LOCK;
2017                 p->nodes[level] = b;
2018         }
2019         ret = 1;
2020 done:
2021         if (ret < 0)
2022                 btrfs_release_path(p);
2023
2024         return ret;
2025 }
2026
2027 /*
2028  * helper to use instead of search slot if no exact match is needed but
2029  * instead the next or previous item should be returned.
2030  * When find_higher is true, the next higher item is returned, the next lower
2031  * otherwise.
2032  * When return_any and find_higher are both true, and no higher item is found,
2033  * return the next lower instead.
2034  * When return_any is true and find_higher is false, and no lower item is found,
2035  * return the next higher instead.
2036  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2037  * < 0 on error
2038  */
2039 int btrfs_search_slot_for_read(struct btrfs_root *root,
2040                                const struct btrfs_key *key,
2041                                struct btrfs_path *p, int find_higher,
2042                                int return_any)
2043 {
2044         int ret;
2045         struct extent_buffer *leaf;
2046
2047 again:
2048         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2049         if (ret <= 0)
2050                 return ret;
2051         /*
2052          * a return value of 1 means the path is at the position where the
2053          * item should be inserted. Normally this is the next bigger item,
2054          * but in case the previous item is the last in a leaf, path points
2055          * to the first free slot in the previous leaf, i.e. at an invalid
2056          * item.
2057          */
2058         leaf = p->nodes[0];
2059
2060         if (find_higher) {
2061                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2062                         ret = btrfs_next_leaf(root, p);
2063                         if (ret <= 0)
2064                                 return ret;
2065                         if (!return_any)
2066                                 return 1;
2067                         /*
2068                          * no higher item found, return the next
2069                          * lower instead
2070                          */
2071                         return_any = 0;
2072                         find_higher = 0;
2073                         btrfs_release_path(p);
2074                         goto again;
2075                 }
2076         } else {
2077                 if (p->slots[0] == 0) {
2078                         ret = btrfs_prev_leaf(root, p);
2079                         if (ret < 0)
2080                                 return ret;
2081                         if (!ret) {
2082                                 leaf = p->nodes[0];
2083                                 if (p->slots[0] == btrfs_header_nritems(leaf))
2084                                         p->slots[0]--;
2085                                 return 0;
2086                         }
2087                         if (!return_any)
2088                                 return 1;
2089                         /*
2090                          * no lower item found, return the next
2091                          * higher instead
2092                          */
2093                         return_any = 0;
2094                         find_higher = 1;
2095                         btrfs_release_path(p);
2096                         goto again;
2097                 } else {
2098                         --p->slots[0];
2099                 }
2100         }
2101         return 0;
2102 }
2103
2104 /*
2105  * Execute search and call btrfs_previous_item to traverse backwards if the item
2106  * was not found.
2107  *
2108  * Return 0 if found, 1 if not found and < 0 if error.
2109  */
2110 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2111                            struct btrfs_path *path)
2112 {
2113         int ret;
2114
2115         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2116         if (ret > 0)
2117                 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2118
2119         if (ret == 0)
2120                 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2121
2122         return ret;
2123 }
2124
2125 /*
2126  * adjust the pointers going up the tree, starting at level
2127  * making sure the right key of each node is points to 'key'.
2128  * This is used after shifting pointers to the left, so it stops
2129  * fixing up pointers when a given leaf/node is not in slot 0 of the
2130  * higher levels
2131  *
2132  */
2133 static void fixup_low_keys(struct btrfs_path *path,
2134                            struct btrfs_disk_key *key, int level)
2135 {
2136         int i;
2137         struct extent_buffer *t;
2138         int ret;
2139
2140         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2141                 int tslot = path->slots[i];
2142
2143                 if (!path->nodes[i])
2144                         break;
2145                 t = path->nodes[i];
2146                 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2147                                 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2148                 BUG_ON(ret < 0);
2149                 btrfs_set_node_key(t, key, tslot);
2150                 btrfs_mark_buffer_dirty(path->nodes[i]);
2151                 if (tslot != 0)
2152                         break;
2153         }
2154 }
2155
2156 /*
2157  * update item key.
2158  *
2159  * This function isn't completely safe. It's the caller's responsibility
2160  * that the new key won't break the order
2161  */
2162 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2163                              struct btrfs_path *path,
2164                              const struct btrfs_key *new_key)
2165 {
2166         struct btrfs_disk_key disk_key;
2167         struct extent_buffer *eb;
2168         int slot;
2169
2170         eb = path->nodes[0];
2171         slot = path->slots[0];
2172         if (slot > 0) {
2173                 btrfs_item_key(eb, &disk_key, slot - 1);
2174                 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2175                         btrfs_crit(fs_info,
2176                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2177                                    slot, btrfs_disk_key_objectid(&disk_key),
2178                                    btrfs_disk_key_type(&disk_key),
2179                                    btrfs_disk_key_offset(&disk_key),
2180                                    new_key->objectid, new_key->type,
2181                                    new_key->offset);
2182                         btrfs_print_leaf(eb);
2183                         BUG();
2184                 }
2185         }
2186         if (slot < btrfs_header_nritems(eb) - 1) {
2187                 btrfs_item_key(eb, &disk_key, slot + 1);
2188                 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2189                         btrfs_crit(fs_info,
2190                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2191                                    slot, btrfs_disk_key_objectid(&disk_key),
2192                                    btrfs_disk_key_type(&disk_key),
2193                                    btrfs_disk_key_offset(&disk_key),
2194                                    new_key->objectid, new_key->type,
2195                                    new_key->offset);
2196                         btrfs_print_leaf(eb);
2197                         BUG();
2198                 }
2199         }
2200
2201         btrfs_cpu_key_to_disk(&disk_key, new_key);
2202         btrfs_set_item_key(eb, &disk_key, slot);
2203         btrfs_mark_buffer_dirty(eb);
2204         if (slot == 0)
2205                 fixup_low_keys(path, &disk_key, 1);
2206 }
2207
2208 /*
2209  * Check key order of two sibling extent buffers.
2210  *
2211  * Return true if something is wrong.
2212  * Return false if everything is fine.
2213  *
2214  * Tree-checker only works inside one tree block, thus the following
2215  * corruption can not be detected by tree-checker:
2216  *
2217  * Leaf @left                   | Leaf @right
2218  * --------------------------------------------------------------
2219  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2220  *
2221  * Key f6 in leaf @left itself is valid, but not valid when the next
2222  * key in leaf @right is 7.
2223  * This can only be checked at tree block merge time.
2224  * And since tree checker has ensured all key order in each tree block
2225  * is correct, we only need to bother the last key of @left and the first
2226  * key of @right.
2227  */
2228 static bool check_sibling_keys(struct extent_buffer *left,
2229                                struct extent_buffer *right)
2230 {
2231         struct btrfs_key left_last;
2232         struct btrfs_key right_first;
2233         int level = btrfs_header_level(left);
2234         int nr_left = btrfs_header_nritems(left);
2235         int nr_right = btrfs_header_nritems(right);
2236
2237         /* No key to check in one of the tree blocks */
2238         if (!nr_left || !nr_right)
2239                 return false;
2240
2241         if (level) {
2242                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2243                 btrfs_node_key_to_cpu(right, &right_first, 0);
2244         } else {
2245                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2246                 btrfs_item_key_to_cpu(right, &right_first, 0);
2247         }
2248
2249         if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2250                 btrfs_crit(left->fs_info,
2251 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2252                            left_last.objectid, left_last.type,
2253                            left_last.offset, right_first.objectid,
2254                            right_first.type, right_first.offset);
2255                 return true;
2256         }
2257         return false;
2258 }
2259
2260 /*
2261  * try to push data from one node into the next node left in the
2262  * tree.
2263  *
2264  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2265  * error, and > 0 if there was no room in the left hand block.
2266  */
2267 static int push_node_left(struct btrfs_trans_handle *trans,
2268                           struct extent_buffer *dst,
2269                           struct extent_buffer *src, int empty)
2270 {
2271         struct btrfs_fs_info *fs_info = trans->fs_info;
2272         int push_items = 0;
2273         int src_nritems;
2274         int dst_nritems;
2275         int ret = 0;
2276
2277         src_nritems = btrfs_header_nritems(src);
2278         dst_nritems = btrfs_header_nritems(dst);
2279         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2280         WARN_ON(btrfs_header_generation(src) != trans->transid);
2281         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2282
2283         if (!empty && src_nritems <= 8)
2284                 return 1;
2285
2286         if (push_items <= 0)
2287                 return 1;
2288
2289         if (empty) {
2290                 push_items = min(src_nritems, push_items);
2291                 if (push_items < src_nritems) {
2292                         /* leave at least 8 pointers in the node if
2293                          * we aren't going to empty it
2294                          */
2295                         if (src_nritems - push_items < 8) {
2296                                 if (push_items <= 8)
2297                                         return 1;
2298                                 push_items -= 8;
2299                         }
2300                 }
2301         } else
2302                 push_items = min(src_nritems - 8, push_items);
2303
2304         /* dst is the left eb, src is the middle eb */
2305         if (check_sibling_keys(dst, src)) {
2306                 ret = -EUCLEAN;
2307                 btrfs_abort_transaction(trans, ret);
2308                 return ret;
2309         }
2310         ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2311         if (ret) {
2312                 btrfs_abort_transaction(trans, ret);
2313                 return ret;
2314         }
2315         copy_extent_buffer(dst, src,
2316                            btrfs_node_key_ptr_offset(dst_nritems),
2317                            btrfs_node_key_ptr_offset(0),
2318                            push_items * sizeof(struct btrfs_key_ptr));
2319
2320         if (push_items < src_nritems) {
2321                 /*
2322                  * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2323                  * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
2324                  */
2325                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2326                                       btrfs_node_key_ptr_offset(push_items),
2327                                       (src_nritems - push_items) *
2328                                       sizeof(struct btrfs_key_ptr));
2329         }
2330         btrfs_set_header_nritems(src, src_nritems - push_items);
2331         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2332         btrfs_mark_buffer_dirty(src);
2333         btrfs_mark_buffer_dirty(dst);
2334
2335         return ret;
2336 }
2337
2338 /*
2339  * try to push data from one node into the next node right in the
2340  * tree.
2341  *
2342  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2343  * error, and > 0 if there was no room in the right hand block.
2344  *
2345  * this will  only push up to 1/2 the contents of the left node over
2346  */
2347 static int balance_node_right(struct btrfs_trans_handle *trans,
2348                               struct extent_buffer *dst,
2349                               struct extent_buffer *src)
2350 {
2351         struct btrfs_fs_info *fs_info = trans->fs_info;
2352         int push_items = 0;
2353         int max_push;
2354         int src_nritems;
2355         int dst_nritems;
2356         int ret = 0;
2357
2358         WARN_ON(btrfs_header_generation(src) != trans->transid);
2359         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2360
2361         src_nritems = btrfs_header_nritems(src);
2362         dst_nritems = btrfs_header_nritems(dst);
2363         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2364         if (push_items <= 0)
2365                 return 1;
2366
2367         if (src_nritems < 4)
2368                 return 1;
2369
2370         max_push = src_nritems / 2 + 1;
2371         /* don't try to empty the node */
2372         if (max_push >= src_nritems)
2373                 return 1;
2374
2375         if (max_push < push_items)
2376                 push_items = max_push;
2377
2378         /* dst is the right eb, src is the middle eb */
2379         if (check_sibling_keys(src, dst)) {
2380                 ret = -EUCLEAN;
2381                 btrfs_abort_transaction(trans, ret);
2382                 return ret;
2383         }
2384         ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
2385         BUG_ON(ret < 0);
2386         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2387                                       btrfs_node_key_ptr_offset(0),
2388                                       (dst_nritems) *
2389                                       sizeof(struct btrfs_key_ptr));
2390
2391         ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2392                                          push_items);
2393         if (ret) {
2394                 btrfs_abort_transaction(trans, ret);
2395                 return ret;
2396         }
2397         copy_extent_buffer(dst, src,
2398                            btrfs_node_key_ptr_offset(0),
2399                            btrfs_node_key_ptr_offset(src_nritems - push_items),
2400                            push_items * sizeof(struct btrfs_key_ptr));
2401
2402         btrfs_set_header_nritems(src, src_nritems - push_items);
2403         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2404
2405         btrfs_mark_buffer_dirty(src);
2406         btrfs_mark_buffer_dirty(dst);
2407
2408         return ret;
2409 }
2410
2411 /*
2412  * helper function to insert a new root level in the tree.
2413  * A new node is allocated, and a single item is inserted to
2414  * point to the existing root
2415  *
2416  * returns zero on success or < 0 on failure.
2417  */
2418 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2419                            struct btrfs_root *root,
2420                            struct btrfs_path *path, int level)
2421 {
2422         struct btrfs_fs_info *fs_info = root->fs_info;
2423         u64 lower_gen;
2424         struct extent_buffer *lower;
2425         struct extent_buffer *c;
2426         struct extent_buffer *old;
2427         struct btrfs_disk_key lower_key;
2428         int ret;
2429
2430         BUG_ON(path->nodes[level]);
2431         BUG_ON(path->nodes[level-1] != root->node);
2432
2433         lower = path->nodes[level-1];
2434         if (level == 1)
2435                 btrfs_item_key(lower, &lower_key, 0);
2436         else
2437                 btrfs_node_key(lower, &lower_key, 0);
2438
2439         c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2440                                    &lower_key, level, root->node->start, 0,
2441                                    BTRFS_NESTING_NEW_ROOT);
2442         if (IS_ERR(c))
2443                 return PTR_ERR(c);
2444
2445         root_add_used(root, fs_info->nodesize);
2446
2447         btrfs_set_header_nritems(c, 1);
2448         btrfs_set_node_key(c, &lower_key, 0);
2449         btrfs_set_node_blockptr(c, 0, lower->start);
2450         lower_gen = btrfs_header_generation(lower);
2451         WARN_ON(lower_gen != trans->transid);
2452
2453         btrfs_set_node_ptr_generation(c, 0, lower_gen);
2454
2455         btrfs_mark_buffer_dirty(c);
2456
2457         old = root->node;
2458         ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2459         BUG_ON(ret < 0);
2460         rcu_assign_pointer(root->node, c);
2461
2462         /* the super has an extra ref to root->node */
2463         free_extent_buffer(old);
2464
2465         add_root_to_dirty_list(root);
2466         atomic_inc(&c->refs);
2467         path->nodes[level] = c;
2468         path->locks[level] = BTRFS_WRITE_LOCK;
2469         path->slots[level] = 0;
2470         return 0;
2471 }
2472
2473 /*
2474  * worker function to insert a single pointer in a node.
2475  * the node should have enough room for the pointer already
2476  *
2477  * slot and level indicate where you want the key to go, and
2478  * blocknr is the block the key points to.
2479  */
2480 static void insert_ptr(struct btrfs_trans_handle *trans,
2481                        struct btrfs_path *path,
2482                        struct btrfs_disk_key *key, u64 bytenr,
2483                        int slot, int level)
2484 {
2485         struct extent_buffer *lower;
2486         int nritems;
2487         int ret;
2488
2489         BUG_ON(!path->nodes[level]);
2490         btrfs_assert_tree_locked(path->nodes[level]);
2491         lower = path->nodes[level];
2492         nritems = btrfs_header_nritems(lower);
2493         BUG_ON(slot > nritems);
2494         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2495         if (slot != nritems) {
2496                 if (level) {
2497                         ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2498                                         slot, nritems - slot);
2499                         BUG_ON(ret < 0);
2500                 }
2501                 memmove_extent_buffer(lower,
2502                               btrfs_node_key_ptr_offset(slot + 1),
2503                               btrfs_node_key_ptr_offset(slot),
2504                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2505         }
2506         if (level) {
2507                 ret = btrfs_tree_mod_log_insert_key(lower, slot,
2508                                             BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
2509                 BUG_ON(ret < 0);
2510         }
2511         btrfs_set_node_key(lower, key, slot);
2512         btrfs_set_node_blockptr(lower, slot, bytenr);
2513         WARN_ON(trans->transid == 0);
2514         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2515         btrfs_set_header_nritems(lower, nritems + 1);
2516         btrfs_mark_buffer_dirty(lower);
2517 }
2518
2519 /*
2520  * split the node at the specified level in path in two.
2521  * The path is corrected to point to the appropriate node after the split
2522  *
2523  * Before splitting this tries to make some room in the node by pushing
2524  * left and right, if either one works, it returns right away.
2525  *
2526  * returns 0 on success and < 0 on failure
2527  */
2528 static noinline int split_node(struct btrfs_trans_handle *trans,
2529                                struct btrfs_root *root,
2530                                struct btrfs_path *path, int level)
2531 {
2532         struct btrfs_fs_info *fs_info = root->fs_info;
2533         struct extent_buffer *c;
2534         struct extent_buffer *split;
2535         struct btrfs_disk_key disk_key;
2536         int mid;
2537         int ret;
2538         u32 c_nritems;
2539
2540         c = path->nodes[level];
2541         WARN_ON(btrfs_header_generation(c) != trans->transid);
2542         if (c == root->node) {
2543                 /*
2544                  * trying to split the root, lets make a new one
2545                  *
2546                  * tree mod log: We don't log_removal old root in
2547                  * insert_new_root, because that root buffer will be kept as a
2548                  * normal node. We are going to log removal of half of the
2549                  * elements below with btrfs_tree_mod_log_eb_copy(). We're
2550                  * holding a tree lock on the buffer, which is why we cannot
2551                  * race with other tree_mod_log users.
2552                  */
2553                 ret = insert_new_root(trans, root, path, level + 1);
2554                 if (ret)
2555                         return ret;
2556         } else {
2557                 ret = push_nodes_for_insert(trans, root, path, level);
2558                 c = path->nodes[level];
2559                 if (!ret && btrfs_header_nritems(c) <
2560                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
2561                         return 0;
2562                 if (ret < 0)
2563                         return ret;
2564         }
2565
2566         c_nritems = btrfs_header_nritems(c);
2567         mid = (c_nritems + 1) / 2;
2568         btrfs_node_key(c, &disk_key, mid);
2569
2570         split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2571                                        &disk_key, level, c->start, 0,
2572                                        BTRFS_NESTING_SPLIT);
2573         if (IS_ERR(split))
2574                 return PTR_ERR(split);
2575
2576         root_add_used(root, fs_info->nodesize);
2577         ASSERT(btrfs_header_level(c) == level);
2578
2579         ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
2580         if (ret) {
2581                 btrfs_abort_transaction(trans, ret);
2582                 return ret;
2583         }
2584         copy_extent_buffer(split, c,
2585                            btrfs_node_key_ptr_offset(0),
2586                            btrfs_node_key_ptr_offset(mid),
2587                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2588         btrfs_set_header_nritems(split, c_nritems - mid);
2589         btrfs_set_header_nritems(c, mid);
2590
2591         btrfs_mark_buffer_dirty(c);
2592         btrfs_mark_buffer_dirty(split);
2593
2594         insert_ptr(trans, path, &disk_key, split->start,
2595                    path->slots[level + 1] + 1, level + 1);
2596
2597         if (path->slots[level] >= mid) {
2598                 path->slots[level] -= mid;
2599                 btrfs_tree_unlock(c);
2600                 free_extent_buffer(c);
2601                 path->nodes[level] = split;
2602                 path->slots[level + 1] += 1;
2603         } else {
2604                 btrfs_tree_unlock(split);
2605                 free_extent_buffer(split);
2606         }
2607         return 0;
2608 }
2609
2610 /*
2611  * how many bytes are required to store the items in a leaf.  start
2612  * and nr indicate which items in the leaf to check.  This totals up the
2613  * space used both by the item structs and the item data
2614  */
2615 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2616 {
2617         struct btrfs_item *start_item;
2618         struct btrfs_item *end_item;
2619         int data_len;
2620         int nritems = btrfs_header_nritems(l);
2621         int end = min(nritems, start + nr) - 1;
2622
2623         if (!nr)
2624                 return 0;
2625         start_item = btrfs_item_nr(start);
2626         end_item = btrfs_item_nr(end);
2627         data_len = btrfs_item_offset(l, start_item) +
2628                    btrfs_item_size(l, start_item);
2629         data_len = data_len - btrfs_item_offset(l, end_item);
2630         data_len += sizeof(struct btrfs_item) * nr;
2631         WARN_ON(data_len < 0);
2632         return data_len;
2633 }
2634
2635 /*
2636  * The space between the end of the leaf items and
2637  * the start of the leaf data.  IOW, how much room
2638  * the leaf has left for both items and data
2639  */
2640 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
2641 {
2642         struct btrfs_fs_info *fs_info = leaf->fs_info;
2643         int nritems = btrfs_header_nritems(leaf);
2644         int ret;
2645
2646         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
2647         if (ret < 0) {
2648                 btrfs_crit(fs_info,
2649                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2650                            ret,
2651                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2652                            leaf_space_used(leaf, 0, nritems), nritems);
2653         }
2654         return ret;
2655 }
2656
2657 /*
2658  * min slot controls the lowest index we're willing to push to the
2659  * right.  We'll push up to and including min_slot, but no lower
2660  */
2661 static noinline int __push_leaf_right(struct btrfs_path *path,
2662                                       int data_size, int empty,
2663                                       struct extent_buffer *right,
2664                                       int free_space, u32 left_nritems,
2665                                       u32 min_slot)
2666 {
2667         struct btrfs_fs_info *fs_info = right->fs_info;
2668         struct extent_buffer *left = path->nodes[0];
2669         struct extent_buffer *upper = path->nodes[1];
2670         struct btrfs_map_token token;
2671         struct btrfs_disk_key disk_key;
2672         int slot;
2673         u32 i;
2674         int push_space = 0;
2675         int push_items = 0;
2676         struct btrfs_item *item;
2677         u32 nr;
2678         u32 right_nritems;
2679         u32 data_end;
2680         u32 this_item_size;
2681
2682         if (empty)
2683                 nr = 0;
2684         else
2685                 nr = max_t(u32, 1, min_slot);
2686
2687         if (path->slots[0] >= left_nritems)
2688                 push_space += data_size;
2689
2690         slot = path->slots[1];
2691         i = left_nritems - 1;
2692         while (i >= nr) {
2693                 item = btrfs_item_nr(i);
2694
2695                 if (!empty && push_items > 0) {
2696                         if (path->slots[0] > i)
2697                                 break;
2698                         if (path->slots[0] == i) {
2699                                 int space = btrfs_leaf_free_space(left);
2700
2701                                 if (space + push_space * 2 > free_space)
2702                                         break;
2703                         }
2704                 }
2705
2706                 if (path->slots[0] == i)
2707                         push_space += data_size;
2708
2709                 this_item_size = btrfs_item_size(left, item);
2710                 if (this_item_size + sizeof(*item) + push_space > free_space)
2711                         break;
2712
2713                 push_items++;
2714                 push_space += this_item_size + sizeof(*item);
2715                 if (i == 0)
2716                         break;
2717                 i--;
2718         }
2719
2720         if (push_items == 0)
2721                 goto out_unlock;
2722
2723         WARN_ON(!empty && push_items == left_nritems);
2724
2725         /* push left to right */
2726         right_nritems = btrfs_header_nritems(right);
2727
2728         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2729         push_space -= leaf_data_end(left);
2730
2731         /* make room in the right data area */
2732         data_end = leaf_data_end(right);
2733         memmove_extent_buffer(right,
2734                               BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
2735                               BTRFS_LEAF_DATA_OFFSET + data_end,
2736                               BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
2737
2738         /* copy from the left data area */
2739         copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
2740                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
2741                      BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
2742                      push_space);
2743
2744         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2745                               btrfs_item_nr_offset(0),
2746                               right_nritems * sizeof(struct btrfs_item));
2747
2748         /* copy the items from left to right */
2749         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2750                    btrfs_item_nr_offset(left_nritems - push_items),
2751                    push_items * sizeof(struct btrfs_item));
2752
2753         /* update the item pointers */
2754         btrfs_init_map_token(&token, right);
2755         right_nritems += push_items;
2756         btrfs_set_header_nritems(right, right_nritems);
2757         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
2758         for (i = 0; i < right_nritems; i++) {
2759                 item = btrfs_item_nr(i);
2760                 push_space -= btrfs_token_item_size(&token, item);
2761                 btrfs_set_token_item_offset(&token, item, push_space);
2762         }
2763
2764         left_nritems -= push_items;
2765         btrfs_set_header_nritems(left, left_nritems);
2766
2767         if (left_nritems)
2768                 btrfs_mark_buffer_dirty(left);
2769         else
2770                 btrfs_clean_tree_block(left);
2771
2772         btrfs_mark_buffer_dirty(right);
2773
2774         btrfs_item_key(right, &disk_key, 0);
2775         btrfs_set_node_key(upper, &disk_key, slot + 1);
2776         btrfs_mark_buffer_dirty(upper);
2777
2778         /* then fixup the leaf pointer in the path */
2779         if (path->slots[0] >= left_nritems) {
2780                 path->slots[0] -= left_nritems;
2781                 if (btrfs_header_nritems(path->nodes[0]) == 0)
2782                         btrfs_clean_tree_block(path->nodes[0]);
2783                 btrfs_tree_unlock(path->nodes[0]);
2784                 free_extent_buffer(path->nodes[0]);
2785                 path->nodes[0] = right;
2786                 path->slots[1] += 1;
2787         } else {
2788                 btrfs_tree_unlock(right);
2789                 free_extent_buffer(right);
2790         }
2791         return 0;
2792
2793 out_unlock:
2794         btrfs_tree_unlock(right);
2795         free_extent_buffer(right);
2796         return 1;
2797 }
2798
2799 /*
2800  * push some data in the path leaf to the right, trying to free up at
2801  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2802  *
2803  * returns 1 if the push failed because the other node didn't have enough
2804  * room, 0 if everything worked out and < 0 if there were major errors.
2805  *
2806  * this will push starting from min_slot to the end of the leaf.  It won't
2807  * push any slot lower than min_slot
2808  */
2809 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2810                            *root, struct btrfs_path *path,
2811                            int min_data_size, int data_size,
2812                            int empty, u32 min_slot)
2813 {
2814         struct extent_buffer *left = path->nodes[0];
2815         struct extent_buffer *right;
2816         struct extent_buffer *upper;
2817         int slot;
2818         int free_space;
2819         u32 left_nritems;
2820         int ret;
2821
2822         if (!path->nodes[1])
2823                 return 1;
2824
2825         slot = path->slots[1];
2826         upper = path->nodes[1];
2827         if (slot >= btrfs_header_nritems(upper) - 1)
2828                 return 1;
2829
2830         btrfs_assert_tree_locked(path->nodes[1]);
2831
2832         right = btrfs_read_node_slot(upper, slot + 1);
2833         /*
2834          * slot + 1 is not valid or we fail to read the right node,
2835          * no big deal, just return.
2836          */
2837         if (IS_ERR(right))
2838                 return 1;
2839
2840         __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2841
2842         free_space = btrfs_leaf_free_space(right);
2843         if (free_space < data_size)
2844                 goto out_unlock;
2845
2846         /* cow and double check */
2847         ret = btrfs_cow_block(trans, root, right, upper,
2848                               slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
2849         if (ret)
2850                 goto out_unlock;
2851
2852         free_space = btrfs_leaf_free_space(right);
2853         if (free_space < data_size)
2854                 goto out_unlock;
2855
2856         left_nritems = btrfs_header_nritems(left);
2857         if (left_nritems == 0)
2858                 goto out_unlock;
2859
2860         if (check_sibling_keys(left, right)) {
2861                 ret = -EUCLEAN;
2862                 btrfs_tree_unlock(right);
2863                 free_extent_buffer(right);
2864                 return ret;
2865         }
2866         if (path->slots[0] == left_nritems && !empty) {
2867                 /* Key greater than all keys in the leaf, right neighbor has
2868                  * enough room for it and we're not emptying our leaf to delete
2869                  * it, therefore use right neighbor to insert the new item and
2870                  * no need to touch/dirty our left leaf. */
2871                 btrfs_tree_unlock(left);
2872                 free_extent_buffer(left);
2873                 path->nodes[0] = right;
2874                 path->slots[0] = 0;
2875                 path->slots[1]++;
2876                 return 0;
2877         }
2878
2879         return __push_leaf_right(path, min_data_size, empty,
2880                                 right, free_space, left_nritems, min_slot);
2881 out_unlock:
2882         btrfs_tree_unlock(right);
2883         free_extent_buffer(right);
2884         return 1;
2885 }
2886
2887 /*
2888  * push some data in the path leaf to the left, trying to free up at
2889  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2890  *
2891  * max_slot can put a limit on how far into the leaf we'll push items.  The
2892  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
2893  * items
2894  */
2895 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
2896                                      int empty, struct extent_buffer *left,
2897                                      int free_space, u32 right_nritems,
2898                                      u32 max_slot)
2899 {
2900         struct btrfs_fs_info *fs_info = left->fs_info;
2901         struct btrfs_disk_key disk_key;
2902         struct extent_buffer *right = path->nodes[0];
2903         int i;
2904         int push_space = 0;
2905         int push_items = 0;
2906         struct btrfs_item *item;
2907         u32 old_left_nritems;
2908         u32 nr;
2909         int ret = 0;
2910         u32 this_item_size;
2911         u32 old_left_item_size;
2912         struct btrfs_map_token token;
2913
2914         if (empty)
2915                 nr = min(right_nritems, max_slot);
2916         else
2917                 nr = min(right_nritems - 1, max_slot);
2918
2919         for (i = 0; i < nr; i++) {
2920                 item = btrfs_item_nr(i);
2921
2922                 if (!empty && push_items > 0) {
2923                         if (path->slots[0] < i)
2924                                 break;
2925                         if (path->slots[0] == i) {
2926                                 int space = btrfs_leaf_free_space(right);
2927
2928                                 if (space + push_space * 2 > free_space)
2929                                         break;
2930                         }
2931                 }
2932
2933                 if (path->slots[0] == i)
2934                         push_space += data_size;
2935
2936                 this_item_size = btrfs_item_size(right, item);
2937                 if (this_item_size + sizeof(*item) + push_space > free_space)
2938                         break;
2939
2940                 push_items++;
2941                 push_space += this_item_size + sizeof(*item);
2942         }
2943
2944         if (push_items == 0) {
2945                 ret = 1;
2946                 goto out;
2947         }
2948         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
2949
2950         /* push data from right to left */
2951         copy_extent_buffer(left, right,
2952                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
2953                            btrfs_item_nr_offset(0),
2954                            push_items * sizeof(struct btrfs_item));
2955
2956         push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
2957                      btrfs_item_offset_nr(right, push_items - 1);
2958
2959         copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
2960                      leaf_data_end(left) - push_space,
2961                      BTRFS_LEAF_DATA_OFFSET +
2962                      btrfs_item_offset_nr(right, push_items - 1),
2963                      push_space);
2964         old_left_nritems = btrfs_header_nritems(left);
2965         BUG_ON(old_left_nritems <= 0);
2966
2967         btrfs_init_map_token(&token, left);
2968         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2969         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2970                 u32 ioff;
2971
2972                 item = btrfs_item_nr(i);
2973
2974                 ioff = btrfs_token_item_offset(&token, item);
2975                 btrfs_set_token_item_offset(&token, item,
2976                       ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
2977         }
2978         btrfs_set_header_nritems(left, old_left_nritems + push_items);
2979
2980         /* fixup right node */
2981         if (push_items > right_nritems)
2982                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
2983                        right_nritems);
2984
2985         if (push_items < right_nritems) {
2986                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2987                                                   leaf_data_end(right);
2988                 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
2989                                       BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
2990                                       BTRFS_LEAF_DATA_OFFSET +
2991                                       leaf_data_end(right), push_space);
2992
2993                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2994                               btrfs_item_nr_offset(push_items),
2995                              (btrfs_header_nritems(right) - push_items) *
2996                              sizeof(struct btrfs_item));
2997         }
2998
2999         btrfs_init_map_token(&token, right);
3000         right_nritems -= push_items;
3001         btrfs_set_header_nritems(right, right_nritems);
3002         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3003         for (i = 0; i < right_nritems; i++) {
3004                 item = btrfs_item_nr(i);
3005
3006                 push_space = push_space - btrfs_token_item_size(&token, item);
3007                 btrfs_set_token_item_offset(&token, item, push_space);
3008         }
3009
3010         btrfs_mark_buffer_dirty(left);
3011         if (right_nritems)
3012                 btrfs_mark_buffer_dirty(right);
3013         else
3014                 btrfs_clean_tree_block(right);
3015
3016         btrfs_item_key(right, &disk_key, 0);
3017         fixup_low_keys(path, &disk_key, 1);
3018
3019         /* then fixup the leaf pointer in the path */
3020         if (path->slots[0] < push_items) {
3021                 path->slots[0] += old_left_nritems;
3022                 btrfs_tree_unlock(path->nodes[0]);
3023                 free_extent_buffer(path->nodes[0]);
3024                 path->nodes[0] = left;
3025                 path->slots[1] -= 1;
3026         } else {
3027                 btrfs_tree_unlock(left);
3028                 free_extent_buffer(left);
3029                 path->slots[0] -= push_items;
3030         }
3031         BUG_ON(path->slots[0] < 0);
3032         return ret;
3033 out:
3034         btrfs_tree_unlock(left);
3035         free_extent_buffer(left);
3036         return ret;
3037 }
3038
3039 /*
3040  * push some data in the path leaf to the left, trying to free up at
3041  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3042  *
3043  * max_slot can put a limit on how far into the leaf we'll push items.  The
3044  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3045  * items
3046  */
3047 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3048                           *root, struct btrfs_path *path, int min_data_size,
3049                           int data_size, int empty, u32 max_slot)
3050 {
3051         struct extent_buffer *right = path->nodes[0];
3052         struct extent_buffer *left;
3053         int slot;
3054         int free_space;
3055         u32 right_nritems;
3056         int ret = 0;
3057
3058         slot = path->slots[1];
3059         if (slot == 0)
3060                 return 1;
3061         if (!path->nodes[1])
3062                 return 1;
3063
3064         right_nritems = btrfs_header_nritems(right);
3065         if (right_nritems == 0)
3066                 return 1;
3067
3068         btrfs_assert_tree_locked(path->nodes[1]);
3069
3070         left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3071         /*
3072          * slot - 1 is not valid or we fail to read the left node,
3073          * no big deal, just return.
3074          */
3075         if (IS_ERR(left))
3076                 return 1;
3077
3078         __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3079
3080         free_space = btrfs_leaf_free_space(left);
3081         if (free_space < data_size) {
3082                 ret = 1;
3083                 goto out;
3084         }
3085
3086         /* cow and double check */
3087         ret = btrfs_cow_block(trans, root, left,
3088                               path->nodes[1], slot - 1, &left,
3089                               BTRFS_NESTING_LEFT_COW);
3090         if (ret) {
3091                 /* we hit -ENOSPC, but it isn't fatal here */
3092                 if (ret == -ENOSPC)
3093                         ret = 1;
3094                 goto out;
3095         }
3096
3097         free_space = btrfs_leaf_free_space(left);
3098         if (free_space < data_size) {
3099                 ret = 1;
3100                 goto out;
3101         }
3102
3103         if (check_sibling_keys(left, right)) {
3104                 ret = -EUCLEAN;
3105                 goto out;
3106         }
3107         return __push_leaf_left(path, min_data_size,
3108                                empty, left, free_space, right_nritems,
3109                                max_slot);
3110 out:
3111         btrfs_tree_unlock(left);
3112         free_extent_buffer(left);
3113         return ret;
3114 }
3115
3116 /*
3117  * split the path's leaf in two, making sure there is at least data_size
3118  * available for the resulting leaf level of the path.
3119  */
3120 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3121                                     struct btrfs_path *path,
3122                                     struct extent_buffer *l,
3123                                     struct extent_buffer *right,
3124                                     int slot, int mid, int nritems)
3125 {
3126         struct btrfs_fs_info *fs_info = trans->fs_info;
3127         int data_copy_size;
3128         int rt_data_off;
3129         int i;
3130         struct btrfs_disk_key disk_key;
3131         struct btrfs_map_token token;
3132
3133         nritems = nritems - mid;
3134         btrfs_set_header_nritems(right, nritems);
3135         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
3136
3137         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3138                            btrfs_item_nr_offset(mid),
3139                            nritems * sizeof(struct btrfs_item));
3140
3141         copy_extent_buffer(right, l,
3142                      BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3143                      data_copy_size, BTRFS_LEAF_DATA_OFFSET +
3144                      leaf_data_end(l), data_copy_size);
3145
3146         rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
3147
3148         btrfs_init_map_token(&token, right);
3149         for (i = 0; i < nritems; i++) {
3150                 struct btrfs_item *item = btrfs_item_nr(i);
3151                 u32 ioff;
3152
3153                 ioff = btrfs_token_item_offset(&token, item);
3154                 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
3155         }
3156
3157         btrfs_set_header_nritems(l, mid);
3158         btrfs_item_key(right, &disk_key, 0);
3159         insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3160
3161         btrfs_mark_buffer_dirty(right);
3162         btrfs_mark_buffer_dirty(l);
3163         BUG_ON(path->slots[0] != slot);
3164
3165         if (mid <= slot) {
3166                 btrfs_tree_unlock(path->nodes[0]);
3167                 free_extent_buffer(path->nodes[0]);
3168                 path->nodes[0] = right;
3169                 path->slots[0] -= mid;
3170                 path->slots[1] += 1;
3171         } else {
3172                 btrfs_tree_unlock(right);
3173                 free_extent_buffer(right);
3174         }
3175
3176         BUG_ON(path->slots[0] < 0);
3177 }
3178
3179 /*
3180  * double splits happen when we need to insert a big item in the middle
3181  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3182  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3183  *          A                 B                 C
3184  *
3185  * We avoid this by trying to push the items on either side of our target
3186  * into the adjacent leaves.  If all goes well we can avoid the double split
3187  * completely.
3188  */
3189 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3190                                           struct btrfs_root *root,
3191                                           struct btrfs_path *path,
3192                                           int data_size)
3193 {
3194         int ret;
3195         int progress = 0;
3196         int slot;
3197         u32 nritems;
3198         int space_needed = data_size;
3199
3200         slot = path->slots[0];
3201         if (slot < btrfs_header_nritems(path->nodes[0]))
3202                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3203
3204         /*
3205          * try to push all the items after our slot into the
3206          * right leaf
3207          */
3208         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3209         if (ret < 0)
3210                 return ret;
3211
3212         if (ret == 0)
3213                 progress++;
3214
3215         nritems = btrfs_header_nritems(path->nodes[0]);
3216         /*
3217          * our goal is to get our slot at the start or end of a leaf.  If
3218          * we've done so we're done
3219          */
3220         if (path->slots[0] == 0 || path->slots[0] == nritems)
3221                 return 0;
3222
3223         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3224                 return 0;
3225
3226         /* try to push all the items before our slot into the next leaf */
3227         slot = path->slots[0];
3228         space_needed = data_size;
3229         if (slot > 0)
3230                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3231         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3232         if (ret < 0)
3233                 return ret;
3234
3235         if (ret == 0)
3236                 progress++;
3237
3238         if (progress)
3239                 return 0;
3240         return 1;
3241 }
3242
3243 /*
3244  * split the path's leaf in two, making sure there is at least data_size
3245  * available for the resulting leaf level of the path.
3246  *
3247  * returns 0 if all went well and < 0 on failure.
3248  */
3249 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3250                                struct btrfs_root *root,
3251                                const struct btrfs_key *ins_key,
3252                                struct btrfs_path *path, int data_size,
3253                                int extend)
3254 {
3255         struct btrfs_disk_key disk_key;
3256         struct extent_buffer *l;
3257         u32 nritems;
3258         int mid;
3259         int slot;
3260         struct extent_buffer *right;
3261         struct btrfs_fs_info *fs_info = root->fs_info;
3262         int ret = 0;
3263         int wret;
3264         int split;
3265         int num_doubles = 0;
3266         int tried_avoid_double = 0;
3267
3268         l = path->nodes[0];
3269         slot = path->slots[0];
3270         if (extend && data_size + btrfs_item_size_nr(l, slot) +
3271             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3272                 return -EOVERFLOW;
3273
3274         /* first try to make some room by pushing left and right */
3275         if (data_size && path->nodes[1]) {
3276                 int space_needed = data_size;
3277
3278                 if (slot < btrfs_header_nritems(l))
3279                         space_needed -= btrfs_leaf_free_space(l);
3280
3281                 wret = push_leaf_right(trans, root, path, space_needed,
3282                                        space_needed, 0, 0);
3283                 if (wret < 0)
3284                         return wret;
3285                 if (wret) {
3286                         space_needed = data_size;
3287                         if (slot > 0)
3288                                 space_needed -= btrfs_leaf_free_space(l);
3289                         wret = push_leaf_left(trans, root, path, space_needed,
3290                                               space_needed, 0, (u32)-1);
3291                         if (wret < 0)
3292                                 return wret;
3293                 }
3294                 l = path->nodes[0];
3295
3296                 /* did the pushes work? */
3297                 if (btrfs_leaf_free_space(l) >= data_size)
3298                         return 0;
3299         }
3300
3301         if (!path->nodes[1]) {
3302                 ret = insert_new_root(trans, root, path, 1);
3303                 if (ret)
3304                         return ret;
3305         }
3306 again:
3307         split = 1;
3308         l = path->nodes[0];
3309         slot = path->slots[0];
3310         nritems = btrfs_header_nritems(l);
3311         mid = (nritems + 1) / 2;
3312
3313         if (mid <= slot) {
3314                 if (nritems == 1 ||
3315                     leaf_space_used(l, mid, nritems - mid) + data_size >
3316                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3317                         if (slot >= nritems) {
3318                                 split = 0;
3319                         } else {
3320                                 mid = slot;
3321                                 if (mid != nritems &&
3322                                     leaf_space_used(l, mid, nritems - mid) +
3323                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3324                                         if (data_size && !tried_avoid_double)
3325                                                 goto push_for_double;
3326                                         split = 2;
3327                                 }
3328                         }
3329                 }
3330         } else {
3331                 if (leaf_space_used(l, 0, mid) + data_size >
3332                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3333                         if (!extend && data_size && slot == 0) {
3334                                 split = 0;
3335                         } else if ((extend || !data_size) && slot == 0) {
3336                                 mid = 1;
3337                         } else {
3338                                 mid = slot;
3339                                 if (mid != nritems &&
3340                                     leaf_space_used(l, mid, nritems - mid) +
3341                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3342                                         if (data_size && !tried_avoid_double)
3343                                                 goto push_for_double;
3344                                         split = 2;
3345                                 }
3346                         }
3347                 }
3348         }
3349
3350         if (split == 0)
3351                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3352         else
3353                 btrfs_item_key(l, &disk_key, mid);
3354
3355         /*
3356          * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3357          * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3358          * subclasses, which is 8 at the time of this patch, and we've maxed it
3359          * out.  In the future we could add a
3360          * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3361          * use BTRFS_NESTING_NEW_ROOT.
3362          */
3363         right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3364                                        &disk_key, 0, l->start, 0,
3365                                        num_doubles ? BTRFS_NESTING_NEW_ROOT :
3366                                        BTRFS_NESTING_SPLIT);
3367         if (IS_ERR(right))
3368                 return PTR_ERR(right);
3369
3370         root_add_used(root, fs_info->nodesize);
3371
3372         if (split == 0) {
3373                 if (mid <= slot) {
3374                         btrfs_set_header_nritems(right, 0);
3375                         insert_ptr(trans, path, &disk_key,
3376                                    right->start, path->slots[1] + 1, 1);
3377                         btrfs_tree_unlock(path->nodes[0]);
3378                         free_extent_buffer(path->nodes[0]);
3379                         path->nodes[0] = right;
3380                         path->slots[0] = 0;
3381                         path->slots[1] += 1;
3382                 } else {
3383                         btrfs_set_header_nritems(right, 0);
3384                         insert_ptr(trans, path, &disk_key,
3385                                    right->start, path->slots[1], 1);
3386                         btrfs_tree_unlock(path->nodes[0]);
3387                         free_extent_buffer(path->nodes[0]);
3388                         path->nodes[0] = right;
3389                         path->slots[0] = 0;
3390                         if (path->slots[1] == 0)
3391                                 fixup_low_keys(path, &disk_key, 1);
3392                 }
3393                 /*
3394                  * We create a new leaf 'right' for the required ins_len and
3395                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3396                  * the content of ins_len to 'right'.
3397                  */
3398                 return ret;
3399         }
3400
3401         copy_for_split(trans, path, l, right, slot, mid, nritems);
3402
3403         if (split == 2) {
3404                 BUG_ON(num_doubles != 0);
3405                 num_doubles++;
3406                 goto again;
3407         }
3408
3409         return 0;
3410
3411 push_for_double:
3412         push_for_double_split(trans, root, path, data_size);
3413         tried_avoid_double = 1;
3414         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3415                 return 0;
3416         goto again;
3417 }
3418
3419 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3420                                          struct btrfs_root *root,
3421                                          struct btrfs_path *path, int ins_len)
3422 {
3423         struct btrfs_key key;
3424         struct extent_buffer *leaf;
3425         struct btrfs_file_extent_item *fi;
3426         u64 extent_len = 0;
3427         u32 item_size;
3428         int ret;
3429
3430         leaf = path->nodes[0];
3431         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3432
3433         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3434                key.type != BTRFS_EXTENT_CSUM_KEY);
3435
3436         if (btrfs_leaf_free_space(leaf) >= ins_len)
3437                 return 0;
3438
3439         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3440         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3441                 fi = btrfs_item_ptr(leaf, path->slots[0],
3442                                     struct btrfs_file_extent_item);
3443                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3444         }
3445         btrfs_release_path(path);
3446
3447         path->keep_locks = 1;
3448         path->search_for_split = 1;
3449         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3450         path->search_for_split = 0;
3451         if (ret > 0)
3452                 ret = -EAGAIN;
3453         if (ret < 0)
3454                 goto err;
3455
3456         ret = -EAGAIN;
3457         leaf = path->nodes[0];
3458         /* if our item isn't there, return now */
3459         if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3460                 goto err;
3461
3462         /* the leaf has  changed, it now has room.  return now */
3463         if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3464                 goto err;
3465
3466         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3467                 fi = btrfs_item_ptr(leaf, path->slots[0],
3468                                     struct btrfs_file_extent_item);
3469                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3470                         goto err;
3471         }
3472
3473         ret = split_leaf(trans, root, &key, path, ins_len, 1);
3474         if (ret)
3475                 goto err;
3476
3477         path->keep_locks = 0;
3478         btrfs_unlock_up_safe(path, 1);
3479         return 0;
3480 err:
3481         path->keep_locks = 0;
3482         return ret;
3483 }
3484
3485 static noinline int split_item(struct btrfs_path *path,
3486                                const struct btrfs_key *new_key,
3487                                unsigned long split_offset)
3488 {
3489         struct extent_buffer *leaf;
3490         struct btrfs_item *item;
3491         struct btrfs_item *new_item;
3492         int slot;
3493         char *buf;
3494         u32 nritems;
3495         u32 item_size;
3496         u32 orig_offset;
3497         struct btrfs_disk_key disk_key;
3498
3499         leaf = path->nodes[0];
3500         BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
3501
3502         item = btrfs_item_nr(path->slots[0]);
3503         orig_offset = btrfs_item_offset(leaf, item);
3504         item_size = btrfs_item_size(leaf, item);
3505
3506         buf = kmalloc(item_size, GFP_NOFS);
3507         if (!buf)
3508                 return -ENOMEM;
3509
3510         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3511                             path->slots[0]), item_size);
3512
3513         slot = path->slots[0] + 1;
3514         nritems = btrfs_header_nritems(leaf);
3515         if (slot != nritems) {
3516                 /* shift the items */
3517                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3518                                 btrfs_item_nr_offset(slot),
3519                                 (nritems - slot) * sizeof(struct btrfs_item));
3520         }
3521
3522         btrfs_cpu_key_to_disk(&disk_key, new_key);
3523         btrfs_set_item_key(leaf, &disk_key, slot);
3524
3525         new_item = btrfs_item_nr(slot);
3526
3527         btrfs_set_item_offset(leaf, new_item, orig_offset);
3528         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3529
3530         btrfs_set_item_offset(leaf, item,
3531                               orig_offset + item_size - split_offset);
3532         btrfs_set_item_size(leaf, item, split_offset);
3533
3534         btrfs_set_header_nritems(leaf, nritems + 1);
3535
3536         /* write the data for the start of the original item */
3537         write_extent_buffer(leaf, buf,
3538                             btrfs_item_ptr_offset(leaf, path->slots[0]),
3539                             split_offset);
3540
3541         /* write the data for the new item */
3542         write_extent_buffer(leaf, buf + split_offset,
3543                             btrfs_item_ptr_offset(leaf, slot),
3544                             item_size - split_offset);
3545         btrfs_mark_buffer_dirty(leaf);
3546
3547         BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3548         kfree(buf);
3549         return 0;
3550 }
3551
3552 /*
3553  * This function splits a single item into two items,
3554  * giving 'new_key' to the new item and splitting the
3555  * old one at split_offset (from the start of the item).
3556  *
3557  * The path may be released by this operation.  After
3558  * the split, the path is pointing to the old item.  The
3559  * new item is going to be in the same node as the old one.
3560  *
3561  * Note, the item being split must be smaller enough to live alone on
3562  * a tree block with room for one extra struct btrfs_item
3563  *
3564  * This allows us to split the item in place, keeping a lock on the
3565  * leaf the entire time.
3566  */
3567 int btrfs_split_item(struct btrfs_trans_handle *trans,
3568                      struct btrfs_root *root,
3569                      struct btrfs_path *path,
3570                      const struct btrfs_key *new_key,
3571                      unsigned long split_offset)
3572 {
3573         int ret;
3574         ret = setup_leaf_for_split(trans, root, path,
3575                                    sizeof(struct btrfs_item));
3576         if (ret)
3577                 return ret;
3578
3579         ret = split_item(path, new_key, split_offset);
3580         return ret;
3581 }
3582
3583 /*
3584  * This function duplicate a item, giving 'new_key' to the new item.
3585  * It guarantees both items live in the same tree leaf and the new item
3586  * is contiguous with the original item.
3587  *
3588  * This allows us to split file extent in place, keeping a lock on the
3589  * leaf the entire time.
3590  */
3591 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3592                          struct btrfs_root *root,
3593                          struct btrfs_path *path,
3594                          const struct btrfs_key *new_key)
3595 {
3596         struct extent_buffer *leaf;
3597         int ret;
3598         u32 item_size;
3599
3600         leaf = path->nodes[0];
3601         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3602         ret = setup_leaf_for_split(trans, root, path,
3603                                    item_size + sizeof(struct btrfs_item));
3604         if (ret)
3605                 return ret;
3606
3607         path->slots[0]++;
3608         setup_items_for_insert(root, path, new_key, &item_size, 1);
3609         leaf = path->nodes[0];
3610         memcpy_extent_buffer(leaf,
3611                              btrfs_item_ptr_offset(leaf, path->slots[0]),
3612                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3613                              item_size);
3614         return 0;
3615 }
3616
3617 /*
3618  * make the item pointed to by the path smaller.  new_size indicates
3619  * how small to make it, and from_end tells us if we just chop bytes
3620  * off the end of the item or if we shift the item to chop bytes off
3621  * the front.
3622  */
3623 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3624 {
3625         int slot;
3626         struct extent_buffer *leaf;
3627         struct btrfs_item *item;
3628         u32 nritems;
3629         unsigned int data_end;
3630         unsigned int old_data_start;
3631         unsigned int old_size;
3632         unsigned int size_diff;
3633         int i;
3634         struct btrfs_map_token token;
3635
3636         leaf = path->nodes[0];
3637         slot = path->slots[0];
3638
3639         old_size = btrfs_item_size_nr(leaf, slot);
3640         if (old_size == new_size)
3641                 return;
3642
3643         nritems = btrfs_header_nritems(leaf);
3644         data_end = leaf_data_end(leaf);
3645
3646         old_data_start = btrfs_item_offset_nr(leaf, slot);
3647
3648         size_diff = old_size - new_size;
3649
3650         BUG_ON(slot < 0);
3651         BUG_ON(slot >= nritems);
3652
3653         /*
3654          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3655          */
3656         /* first correct the data pointers */
3657         btrfs_init_map_token(&token, leaf);
3658         for (i = slot; i < nritems; i++) {
3659                 u32 ioff;
3660                 item = btrfs_item_nr(i);
3661
3662                 ioff = btrfs_token_item_offset(&token, item);
3663                 btrfs_set_token_item_offset(&token, item, ioff + size_diff);
3664         }
3665
3666         /* shift the data */
3667         if (from_end) {
3668                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3669                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3670                               data_end, old_data_start + new_size - data_end);
3671         } else {
3672                 struct btrfs_disk_key disk_key;
3673                 u64 offset;
3674
3675                 btrfs_item_key(leaf, &disk_key, slot);
3676
3677                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3678                         unsigned long ptr;
3679                         struct btrfs_file_extent_item *fi;
3680
3681                         fi = btrfs_item_ptr(leaf, slot,
3682                                             struct btrfs_file_extent_item);
3683                         fi = (struct btrfs_file_extent_item *)(
3684                              (unsigned long)fi - size_diff);
3685
3686                         if (btrfs_file_extent_type(leaf, fi) ==
3687                             BTRFS_FILE_EXTENT_INLINE) {
3688                                 ptr = btrfs_item_ptr_offset(leaf, slot);
3689                                 memmove_extent_buffer(leaf, ptr,
3690                                       (unsigned long)fi,
3691                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
3692                         }
3693                 }
3694
3695                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3696                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3697                               data_end, old_data_start - data_end);
3698
3699                 offset = btrfs_disk_key_offset(&disk_key);
3700                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3701                 btrfs_set_item_key(leaf, &disk_key, slot);
3702                 if (slot == 0)
3703                         fixup_low_keys(path, &disk_key, 1);
3704         }
3705
3706         item = btrfs_item_nr(slot);
3707         btrfs_set_item_size(leaf, item, new_size);
3708         btrfs_mark_buffer_dirty(leaf);
3709
3710         if (btrfs_leaf_free_space(leaf) < 0) {
3711                 btrfs_print_leaf(leaf);
3712                 BUG();
3713         }
3714 }
3715
3716 /*
3717  * make the item pointed to by the path bigger, data_size is the added size.
3718  */
3719 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
3720 {
3721         int slot;
3722         struct extent_buffer *leaf;
3723         struct btrfs_item *item;
3724         u32 nritems;
3725         unsigned int data_end;
3726         unsigned int old_data;
3727         unsigned int old_size;
3728         int i;
3729         struct btrfs_map_token token;
3730
3731         leaf = path->nodes[0];
3732
3733         nritems = btrfs_header_nritems(leaf);
3734         data_end = leaf_data_end(leaf);
3735
3736         if (btrfs_leaf_free_space(leaf) < data_size) {
3737                 btrfs_print_leaf(leaf);
3738                 BUG();
3739         }
3740         slot = path->slots[0];
3741         old_data = btrfs_item_end_nr(leaf, slot);
3742
3743         BUG_ON(slot < 0);
3744         if (slot >= nritems) {
3745                 btrfs_print_leaf(leaf);
3746                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
3747                            slot, nritems);
3748                 BUG();
3749         }
3750
3751         /*
3752          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3753          */
3754         /* first correct the data pointers */
3755         btrfs_init_map_token(&token, leaf);
3756         for (i = slot; i < nritems; i++) {
3757                 u32 ioff;
3758                 item = btrfs_item_nr(i);
3759
3760                 ioff = btrfs_token_item_offset(&token, item);
3761                 btrfs_set_token_item_offset(&token, item, ioff - data_size);
3762         }
3763
3764         /* shift the data */
3765         memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3766                       data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
3767                       data_end, old_data - data_end);
3768
3769         data_end = old_data;
3770         old_size = btrfs_item_size_nr(leaf, slot);
3771         item = btrfs_item_nr(slot);
3772         btrfs_set_item_size(leaf, item, old_size + data_size);
3773         btrfs_mark_buffer_dirty(leaf);
3774
3775         if (btrfs_leaf_free_space(leaf) < 0) {
3776                 btrfs_print_leaf(leaf);
3777                 BUG();
3778         }
3779 }
3780
3781 /**
3782  * setup_items_for_insert - Helper called before inserting one or more items
3783  * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
3784  * in a function that doesn't call btrfs_search_slot
3785  *
3786  * @root:       root we are inserting items to
3787  * @path:       points to the leaf/slot where we are going to insert new items
3788  * @cpu_key:    array of keys for items to be inserted
3789  * @data_size:  size of the body of each item we are going to insert
3790  * @nr:         size of @cpu_key/@data_size arrays
3791  */
3792 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
3793                             const struct btrfs_key *cpu_key, u32 *data_size,
3794                             int nr)
3795 {
3796         struct btrfs_fs_info *fs_info = root->fs_info;
3797         struct btrfs_item *item;
3798         int i;
3799         u32 nritems;
3800         unsigned int data_end;
3801         struct btrfs_disk_key disk_key;
3802         struct extent_buffer *leaf;
3803         int slot;
3804         struct btrfs_map_token token;
3805         u32 total_size;
3806         u32 total_data = 0;
3807
3808         for (i = 0; i < nr; i++)
3809                 total_data += data_size[i];
3810         total_size = total_data + (nr * sizeof(struct btrfs_item));
3811
3812         if (path->slots[0] == 0) {
3813                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3814                 fixup_low_keys(path, &disk_key, 1);
3815         }
3816         btrfs_unlock_up_safe(path, 1);
3817
3818         leaf = path->nodes[0];
3819         slot = path->slots[0];
3820
3821         nritems = btrfs_header_nritems(leaf);
3822         data_end = leaf_data_end(leaf);
3823
3824         if (btrfs_leaf_free_space(leaf) < total_size) {
3825                 btrfs_print_leaf(leaf);
3826                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
3827                            total_size, btrfs_leaf_free_space(leaf));
3828                 BUG();
3829         }
3830
3831         btrfs_init_map_token(&token, leaf);
3832         if (slot != nritems) {
3833                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3834
3835                 if (old_data < data_end) {
3836                         btrfs_print_leaf(leaf);
3837                         btrfs_crit(fs_info,
3838                 "item at slot %d with data offset %u beyond data end of leaf %u",
3839                                    slot, old_data, data_end);
3840                         BUG();
3841                 }
3842                 /*
3843                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3844                  */
3845                 /* first correct the data pointers */
3846                 for (i = slot; i < nritems; i++) {
3847                         u32 ioff;
3848
3849                         item = btrfs_item_nr(i);
3850                         ioff = btrfs_token_item_offset(&token, item);
3851                         btrfs_set_token_item_offset(&token, item,
3852                                                     ioff - total_data);
3853                 }
3854                 /* shift the items */
3855                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3856                               btrfs_item_nr_offset(slot),
3857                               (nritems - slot) * sizeof(struct btrfs_item));
3858
3859                 /* shift the data */
3860                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3861                               data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
3862                               data_end, old_data - data_end);
3863                 data_end = old_data;
3864         }
3865
3866         /* setup the item for the new data */
3867         for (i = 0; i < nr; i++) {
3868                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3869                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3870                 item = btrfs_item_nr(slot + i);
3871                 data_end -= data_size[i];
3872                 btrfs_set_token_item_offset(&token, item, data_end);
3873                 btrfs_set_token_item_size(&token, item, data_size[i]);
3874         }
3875
3876         btrfs_set_header_nritems(leaf, nritems + nr);
3877         btrfs_mark_buffer_dirty(leaf);
3878
3879         if (btrfs_leaf_free_space(leaf) < 0) {
3880                 btrfs_print_leaf(leaf);
3881                 BUG();
3882         }
3883 }
3884
3885 /*
3886  * Given a key and some data, insert items into the tree.
3887  * This does all the path init required, making room in the tree if needed.
3888  */
3889 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3890                             struct btrfs_root *root,
3891                             struct btrfs_path *path,
3892                             const struct btrfs_key *cpu_key, u32 *data_size,
3893                             int nr)
3894 {
3895         int ret = 0;
3896         int slot;
3897         int i;
3898         u32 total_size = 0;
3899         u32 total_data = 0;
3900
3901         for (i = 0; i < nr; i++)
3902                 total_data += data_size[i];
3903
3904         total_size = total_data + (nr * sizeof(struct btrfs_item));
3905         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3906         if (ret == 0)
3907                 return -EEXIST;
3908         if (ret < 0)
3909                 return ret;
3910
3911         slot = path->slots[0];
3912         BUG_ON(slot < 0);
3913
3914         setup_items_for_insert(root, path, cpu_key, data_size, nr);
3915         return 0;
3916 }
3917
3918 /*
3919  * Given a key and some data, insert an item into the tree.
3920  * This does all the path init required, making room in the tree if needed.
3921  */
3922 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3923                       const struct btrfs_key *cpu_key, void *data,
3924                       u32 data_size)
3925 {
3926         int ret = 0;
3927         struct btrfs_path *path;
3928         struct extent_buffer *leaf;
3929         unsigned long ptr;
3930
3931         path = btrfs_alloc_path();
3932         if (!path)
3933                 return -ENOMEM;
3934         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3935         if (!ret) {
3936                 leaf = path->nodes[0];
3937                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3938                 write_extent_buffer(leaf, data, ptr, data_size);
3939                 btrfs_mark_buffer_dirty(leaf);
3940         }
3941         btrfs_free_path(path);
3942         return ret;
3943 }
3944
3945 /*
3946  * delete the pointer from a given node.
3947  *
3948  * the tree should have been previously balanced so the deletion does not
3949  * empty a node.
3950  */
3951 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
3952                     int level, int slot)
3953 {
3954         struct extent_buffer *parent = path->nodes[level];
3955         u32 nritems;
3956         int ret;
3957
3958         nritems = btrfs_header_nritems(parent);
3959         if (slot != nritems - 1) {
3960                 if (level) {
3961                         ret = btrfs_tree_mod_log_insert_move(parent, slot,
3962                                         slot + 1, nritems - slot - 1);
3963                         BUG_ON(ret < 0);
3964                 }
3965                 memmove_extent_buffer(parent,
3966                               btrfs_node_key_ptr_offset(slot),
3967                               btrfs_node_key_ptr_offset(slot + 1),
3968                               sizeof(struct btrfs_key_ptr) *
3969                               (nritems - slot - 1));
3970         } else if (level) {
3971                 ret = btrfs_tree_mod_log_insert_key(parent, slot,
3972                                 BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
3973                 BUG_ON(ret < 0);
3974         }
3975
3976         nritems--;
3977         btrfs_set_header_nritems(parent, nritems);
3978         if (nritems == 0 && parent == root->node) {
3979                 BUG_ON(btrfs_header_level(root->node) != 1);
3980                 /* just turn the root into a leaf and break */
3981                 btrfs_set_header_level(root->node, 0);
3982         } else if (slot == 0) {
3983                 struct btrfs_disk_key disk_key;
3984
3985                 btrfs_node_key(parent, &disk_key, 0);
3986                 fixup_low_keys(path, &disk_key, level + 1);
3987         }
3988         btrfs_mark_buffer_dirty(parent);
3989 }
3990
3991 /*
3992  * a helper function to delete the leaf pointed to by path->slots[1] and
3993  * path->nodes[1].
3994  *
3995  * This deletes the pointer in path->nodes[1] and frees the leaf
3996  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3997  *
3998  * The path must have already been setup for deleting the leaf, including
3999  * all the proper balancing.  path->nodes[1] must be locked.
4000  */
4001 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4002                                     struct btrfs_root *root,
4003                                     struct btrfs_path *path,
4004                                     struct extent_buffer *leaf)
4005 {
4006         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4007         del_ptr(root, path, 1, path->slots[1]);
4008
4009         /*
4010          * btrfs_free_extent is expensive, we want to make sure we
4011          * aren't holding any locks when we call it
4012          */
4013         btrfs_unlock_up_safe(path, 0);
4014
4015         root_sub_used(root, leaf->len);
4016
4017         atomic_inc(&leaf->refs);
4018         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4019         free_extent_buffer_stale(leaf);
4020 }
4021 /*
4022  * delete the item at the leaf level in path.  If that empties
4023  * the leaf, remove it from the tree
4024  */
4025 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4026                     struct btrfs_path *path, int slot, int nr)
4027 {
4028         struct btrfs_fs_info *fs_info = root->fs_info;
4029         struct extent_buffer *leaf;
4030         struct btrfs_item *item;
4031         u32 last_off;
4032         u32 dsize = 0;
4033         int ret = 0;
4034         int wret;
4035         int i;
4036         u32 nritems;
4037
4038         leaf = path->nodes[0];
4039         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4040
4041         for (i = 0; i < nr; i++)
4042                 dsize += btrfs_item_size_nr(leaf, slot + i);
4043
4044         nritems = btrfs_header_nritems(leaf);
4045
4046         if (slot + nr != nritems) {
4047                 int data_end = leaf_data_end(leaf);
4048                 struct btrfs_map_token token;
4049
4050                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4051                               data_end + dsize,
4052                               BTRFS_LEAF_DATA_OFFSET + data_end,
4053                               last_off - data_end);
4054
4055                 btrfs_init_map_token(&token, leaf);
4056                 for (i = slot + nr; i < nritems; i++) {
4057                         u32 ioff;
4058
4059                         item = btrfs_item_nr(i);
4060                         ioff = btrfs_token_item_offset(&token, item);
4061                         btrfs_set_token_item_offset(&token, item, ioff + dsize);
4062                 }
4063
4064                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4065                               btrfs_item_nr_offset(slot + nr),
4066                               sizeof(struct btrfs_item) *
4067                               (nritems - slot - nr));
4068         }
4069         btrfs_set_header_nritems(leaf, nritems - nr);
4070         nritems -= nr;
4071
4072         /* delete the leaf if we've emptied it */
4073         if (nritems == 0) {
4074                 if (leaf == root->node) {
4075                         btrfs_set_header_level(leaf, 0);
4076                 } else {
4077                         btrfs_clean_tree_block(leaf);
4078                         btrfs_del_leaf(trans, root, path, leaf);
4079                 }
4080         } else {
4081                 int used = leaf_space_used(leaf, 0, nritems);
4082                 if (slot == 0) {
4083                         struct btrfs_disk_key disk_key;
4084
4085                         btrfs_item_key(leaf, &disk_key, 0);
4086                         fixup_low_keys(path, &disk_key, 1);
4087                 }
4088
4089                 /* delete the leaf if it is mostly empty */
4090                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4091                         /* push_leaf_left fixes the path.
4092                          * make sure the path still points to our leaf
4093                          * for possible call to del_ptr below
4094                          */
4095                         slot = path->slots[1];
4096                         atomic_inc(&leaf->refs);
4097
4098                         wret = push_leaf_left(trans, root, path, 1, 1,
4099                                               1, (u32)-1);
4100                         if (wret < 0 && wret != -ENOSPC)
4101                                 ret = wret;
4102
4103                         if (path->nodes[0] == leaf &&
4104                             btrfs_header_nritems(leaf)) {
4105                                 wret = push_leaf_right(trans, root, path, 1,
4106                                                        1, 1, 0);
4107                                 if (wret < 0 && wret != -ENOSPC)
4108                                         ret = wret;
4109                         }
4110
4111                         if (btrfs_header_nritems(leaf) == 0) {
4112                                 path->slots[1] = slot;
4113                                 btrfs_del_leaf(trans, root, path, leaf);
4114                                 free_extent_buffer(leaf);
4115                                 ret = 0;
4116                         } else {
4117                                 /* if we're still in the path, make sure
4118                                  * we're dirty.  Otherwise, one of the
4119                                  * push_leaf functions must have already
4120                                  * dirtied this buffer
4121                                  */
4122                                 if (path->nodes[0] == leaf)
4123                                         btrfs_mark_buffer_dirty(leaf);
4124                                 free_extent_buffer(leaf);
4125                         }
4126                 } else {
4127                         btrfs_mark_buffer_dirty(leaf);
4128                 }
4129         }
4130         return ret;
4131 }
4132
4133 /*
4134  * search the tree again to find a leaf with lesser keys
4135  * returns 0 if it found something or 1 if there are no lesser leaves.
4136  * returns < 0 on io errors.
4137  *
4138  * This may release the path, and so you may lose any locks held at the
4139  * time you call it.
4140  */
4141 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4142 {
4143         struct btrfs_key key;
4144         struct btrfs_disk_key found_key;
4145         int ret;
4146
4147         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4148
4149         if (key.offset > 0) {
4150                 key.offset--;
4151         } else if (key.type > 0) {
4152                 key.type--;
4153                 key.offset = (u64)-1;
4154         } else if (key.objectid > 0) {
4155                 key.objectid--;
4156                 key.type = (u8)-1;
4157                 key.offset = (u64)-1;
4158         } else {
4159                 return 1;
4160         }
4161
4162         btrfs_release_path(path);
4163         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4164         if (ret < 0)
4165                 return ret;
4166         btrfs_item_key(path->nodes[0], &found_key, 0);
4167         ret = comp_keys(&found_key, &key);
4168         /*
4169          * We might have had an item with the previous key in the tree right
4170          * before we released our path. And after we released our path, that
4171          * item might have been pushed to the first slot (0) of the leaf we
4172          * were holding due to a tree balance. Alternatively, an item with the
4173          * previous key can exist as the only element of a leaf (big fat item).
4174          * Therefore account for these 2 cases, so that our callers (like
4175          * btrfs_previous_item) don't miss an existing item with a key matching
4176          * the previous key we computed above.
4177          */
4178         if (ret <= 0)
4179                 return 0;
4180         return 1;
4181 }
4182
4183 /*
4184  * A helper function to walk down the tree starting at min_key, and looking
4185  * for nodes or leaves that are have a minimum transaction id.
4186  * This is used by the btree defrag code, and tree logging
4187  *
4188  * This does not cow, but it does stuff the starting key it finds back
4189  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4190  * key and get a writable path.
4191  *
4192  * This honors path->lowest_level to prevent descent past a given level
4193  * of the tree.
4194  *
4195  * min_trans indicates the oldest transaction that you are interested
4196  * in walking through.  Any nodes or leaves older than min_trans are
4197  * skipped over (without reading them).
4198  *
4199  * returns zero if something useful was found, < 0 on error and 1 if there
4200  * was nothing in the tree that matched the search criteria.
4201  */
4202 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4203                          struct btrfs_path *path,
4204                          u64 min_trans)
4205 {
4206         struct extent_buffer *cur;
4207         struct btrfs_key found_key;
4208         int slot;
4209         int sret;
4210         u32 nritems;
4211         int level;
4212         int ret = 1;
4213         int keep_locks = path->keep_locks;
4214
4215         path->keep_locks = 1;
4216 again:
4217         cur = btrfs_read_lock_root_node(root);
4218         level = btrfs_header_level(cur);
4219         WARN_ON(path->nodes[level]);
4220         path->nodes[level] = cur;
4221         path->locks[level] = BTRFS_READ_LOCK;
4222
4223         if (btrfs_header_generation(cur) < min_trans) {
4224                 ret = 1;
4225                 goto out;
4226         }
4227         while (1) {
4228                 nritems = btrfs_header_nritems(cur);
4229                 level = btrfs_header_level(cur);
4230                 sret = btrfs_bin_search(cur, min_key, &slot);
4231                 if (sret < 0) {
4232                         ret = sret;
4233                         goto out;
4234                 }
4235
4236                 /* at the lowest level, we're done, setup the path and exit */
4237                 if (level == path->lowest_level) {
4238                         if (slot >= nritems)
4239                                 goto find_next_key;
4240                         ret = 0;
4241                         path->slots[level] = slot;
4242                         btrfs_item_key_to_cpu(cur, &found_key, slot);
4243                         goto out;
4244                 }
4245                 if (sret && slot > 0)
4246                         slot--;
4247                 /*
4248                  * check this node pointer against the min_trans parameters.
4249                  * If it is too old, skip to the next one.
4250                  */
4251                 while (slot < nritems) {
4252                         u64 gen;
4253
4254                         gen = btrfs_node_ptr_generation(cur, slot);
4255                         if (gen < min_trans) {
4256                                 slot++;
4257                                 continue;
4258                         }
4259                         break;
4260                 }
4261 find_next_key:
4262                 /*
4263                  * we didn't find a candidate key in this node, walk forward
4264                  * and find another one
4265                  */
4266                 if (slot >= nritems) {
4267                         path->slots[level] = slot;
4268                         sret = btrfs_find_next_key(root, path, min_key, level,
4269                                                   min_trans);
4270                         if (sret == 0) {
4271                                 btrfs_release_path(path);
4272                                 goto again;
4273                         } else {
4274                                 goto out;
4275                         }
4276                 }
4277                 /* save our key for returning back */
4278                 btrfs_node_key_to_cpu(cur, &found_key, slot);
4279                 path->slots[level] = slot;
4280                 if (level == path->lowest_level) {
4281                         ret = 0;
4282                         goto out;
4283                 }
4284                 cur = btrfs_read_node_slot(cur, slot);
4285                 if (IS_ERR(cur)) {
4286                         ret = PTR_ERR(cur);
4287                         goto out;
4288                 }
4289
4290                 btrfs_tree_read_lock(cur);
4291
4292                 path->locks[level - 1] = BTRFS_READ_LOCK;
4293                 path->nodes[level - 1] = cur;
4294                 unlock_up(path, level, 1, 0, NULL);
4295         }
4296 out:
4297         path->keep_locks = keep_locks;
4298         if (ret == 0) {
4299                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
4300                 memcpy(min_key, &found_key, sizeof(found_key));
4301         }
4302         return ret;
4303 }
4304
4305 /*
4306  * this is similar to btrfs_next_leaf, but does not try to preserve
4307  * and fixup the path.  It looks for and returns the next key in the
4308  * tree based on the current path and the min_trans parameters.
4309  *
4310  * 0 is returned if another key is found, < 0 if there are any errors
4311  * and 1 is returned if there are no higher keys in the tree
4312  *
4313  * path->keep_locks should be set to 1 on the search made before
4314  * calling this function.
4315  */
4316 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4317                         struct btrfs_key *key, int level, u64 min_trans)
4318 {
4319         int slot;
4320         struct extent_buffer *c;
4321
4322         WARN_ON(!path->keep_locks && !path->skip_locking);
4323         while (level < BTRFS_MAX_LEVEL) {
4324                 if (!path->nodes[level])
4325                         return 1;
4326
4327                 slot = path->slots[level] + 1;
4328                 c = path->nodes[level];
4329 next:
4330                 if (slot >= btrfs_header_nritems(c)) {
4331                         int ret;
4332                         int orig_lowest;
4333                         struct btrfs_key cur_key;
4334                         if (level + 1 >= BTRFS_MAX_LEVEL ||
4335                             !path->nodes[level + 1])
4336                                 return 1;
4337
4338                         if (path->locks[level + 1] || path->skip_locking) {
4339                                 level++;
4340                                 continue;
4341                         }
4342
4343                         slot = btrfs_header_nritems(c) - 1;
4344                         if (level == 0)
4345                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
4346                         else
4347                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
4348
4349                         orig_lowest = path->lowest_level;
4350                         btrfs_release_path(path);
4351                         path->lowest_level = level;
4352                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
4353                                                 0, 0);
4354                         path->lowest_level = orig_lowest;
4355                         if (ret < 0)
4356                                 return ret;
4357
4358                         c = path->nodes[level];
4359                         slot = path->slots[level];
4360                         if (ret == 0)
4361                                 slot++;
4362                         goto next;
4363                 }
4364
4365                 if (level == 0)
4366                         btrfs_item_key_to_cpu(c, key, slot);
4367                 else {
4368                         u64 gen = btrfs_node_ptr_generation(c, slot);
4369
4370                         if (gen < min_trans) {
4371                                 slot++;
4372                                 goto next;
4373                         }
4374                         btrfs_node_key_to_cpu(c, key, slot);
4375                 }
4376                 return 0;
4377         }
4378         return 1;
4379 }
4380
4381 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4382                         u64 time_seq)
4383 {
4384         int slot;
4385         int level;
4386         struct extent_buffer *c;
4387         struct extent_buffer *next;
4388         struct btrfs_key key;
4389         u32 nritems;
4390         int ret;
4391         int i;
4392
4393         nritems = btrfs_header_nritems(path->nodes[0]);
4394         if (nritems == 0)
4395                 return 1;
4396
4397         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4398 again:
4399         level = 1;
4400         next = NULL;
4401         btrfs_release_path(path);
4402
4403         path->keep_locks = 1;
4404
4405         if (time_seq)
4406                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4407         else
4408                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4409         path->keep_locks = 0;
4410
4411         if (ret < 0)
4412                 return ret;
4413
4414         nritems = btrfs_header_nritems(path->nodes[0]);
4415         /*
4416          * by releasing the path above we dropped all our locks.  A balance
4417          * could have added more items next to the key that used to be
4418          * at the very end of the block.  So, check again here and
4419          * advance the path if there are now more items available.
4420          */
4421         if (nritems > 0 && path->slots[0] < nritems - 1) {
4422                 if (ret == 0)
4423                         path->slots[0]++;
4424                 ret = 0;
4425                 goto done;
4426         }
4427         /*
4428          * So the above check misses one case:
4429          * - after releasing the path above, someone has removed the item that
4430          *   used to be at the very end of the block, and balance between leafs
4431          *   gets another one with bigger key.offset to replace it.
4432          *
4433          * This one should be returned as well, or we can get leaf corruption
4434          * later(esp. in __btrfs_drop_extents()).
4435          *
4436          * And a bit more explanation about this check,
4437          * with ret > 0, the key isn't found, the path points to the slot
4438          * where it should be inserted, so the path->slots[0] item must be the
4439          * bigger one.
4440          */
4441         if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4442                 ret = 0;
4443                 goto done;
4444         }
4445
4446         while (level < BTRFS_MAX_LEVEL) {
4447                 if (!path->nodes[level]) {
4448                         ret = 1;
4449                         goto done;
4450                 }
4451
4452                 slot = path->slots[level] + 1;
4453                 c = path->nodes[level];
4454                 if (slot >= btrfs_header_nritems(c)) {
4455                         level++;
4456                         if (level == BTRFS_MAX_LEVEL) {
4457                                 ret = 1;
4458                                 goto done;
4459                         }
4460                         continue;
4461                 }
4462
4463
4464                 /*
4465                  * Our current level is where we're going to start from, and to
4466                  * make sure lockdep doesn't complain we need to drop our locks
4467                  * and nodes from 0 to our current level.
4468                  */
4469                 for (i = 0; i < level; i++) {
4470                         if (path->locks[level]) {
4471                                 btrfs_tree_read_unlock(path->nodes[i]);
4472                                 path->locks[i] = 0;
4473                         }
4474                         free_extent_buffer(path->nodes[i]);
4475                         path->nodes[i] = NULL;
4476                 }
4477
4478                 next = c;
4479                 ret = read_block_for_search(root, path, &next, level,
4480                                             slot, &key);
4481                 if (ret == -EAGAIN)
4482                         goto again;
4483
4484                 if (ret < 0) {
4485                         btrfs_release_path(path);
4486                         goto done;
4487                 }
4488
4489                 if (!path->skip_locking) {
4490                         ret = btrfs_try_tree_read_lock(next);
4491                         if (!ret && time_seq) {
4492                                 /*
4493                                  * If we don't get the lock, we may be racing
4494                                  * with push_leaf_left, holding that lock while
4495                                  * itself waiting for the leaf we've currently
4496                                  * locked. To solve this situation, we give up
4497                                  * on our lock and cycle.
4498                                  */
4499                                 free_extent_buffer(next);
4500                                 btrfs_release_path(path);
4501                                 cond_resched();
4502                                 goto again;
4503                         }
4504                         if (!ret)
4505                                 btrfs_tree_read_lock(next);
4506                 }
4507                 break;
4508         }
4509         path->slots[level] = slot;
4510         while (1) {
4511                 level--;
4512                 path->nodes[level] = next;
4513                 path->slots[level] = 0;
4514                 if (!path->skip_locking)
4515                         path->locks[level] = BTRFS_READ_LOCK;
4516                 if (!level)
4517                         break;
4518
4519                 ret = read_block_for_search(root, path, &next, level,
4520                                             0, &key);
4521                 if (ret == -EAGAIN)
4522                         goto again;
4523
4524                 if (ret < 0) {
4525                         btrfs_release_path(path);
4526                         goto done;
4527                 }
4528
4529                 if (!path->skip_locking)
4530                         btrfs_tree_read_lock(next);
4531         }
4532         ret = 0;
4533 done:
4534         unlock_up(path, 0, 1, 0, NULL);
4535
4536         return ret;
4537 }
4538
4539 /*
4540  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4541  * searching until it gets past min_objectid or finds an item of 'type'
4542  *
4543  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4544  */
4545 int btrfs_previous_item(struct btrfs_root *root,
4546                         struct btrfs_path *path, u64 min_objectid,
4547                         int type)
4548 {
4549         struct btrfs_key found_key;
4550         struct extent_buffer *leaf;
4551         u32 nritems;
4552         int ret;
4553
4554         while (1) {
4555                 if (path->slots[0] == 0) {
4556                         ret = btrfs_prev_leaf(root, path);
4557                         if (ret != 0)
4558                                 return ret;
4559                 } else {
4560                         path->slots[0]--;
4561                 }
4562                 leaf = path->nodes[0];
4563                 nritems = btrfs_header_nritems(leaf);
4564                 if (nritems == 0)
4565                         return 1;
4566                 if (path->slots[0] == nritems)
4567                         path->slots[0]--;
4568
4569                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4570                 if (found_key.objectid < min_objectid)
4571                         break;
4572                 if (found_key.type == type)
4573                         return 0;
4574                 if (found_key.objectid == min_objectid &&
4575                     found_key.type < type)
4576                         break;
4577         }
4578         return 1;
4579 }
4580
4581 /*
4582  * search in extent tree to find a previous Metadata/Data extent item with
4583  * min objecitd.
4584  *
4585  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4586  */
4587 int btrfs_previous_extent_item(struct btrfs_root *root,
4588                         struct btrfs_path *path, u64 min_objectid)
4589 {
4590         struct btrfs_key found_key;
4591         struct extent_buffer *leaf;
4592         u32 nritems;
4593         int ret;
4594
4595         while (1) {
4596                 if (path->slots[0] == 0) {
4597                         ret = btrfs_prev_leaf(root, path);
4598                         if (ret != 0)
4599                                 return ret;
4600                 } else {
4601                         path->slots[0]--;
4602                 }
4603                 leaf = path->nodes[0];
4604                 nritems = btrfs_header_nritems(leaf);
4605                 if (nritems == 0)
4606                         return 1;
4607                 if (path->slots[0] == nritems)
4608                         path->slots[0]--;
4609
4610                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4611                 if (found_key.objectid < min_objectid)
4612                         break;
4613                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4614                     found_key.type == BTRFS_METADATA_ITEM_KEY)
4615                         return 0;
4616                 if (found_key.objectid == min_objectid &&
4617                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
4618                         break;
4619         }
4620         return 1;
4621 }