Merge branches 'clk-nvidia', 'clk-rockchip', 'clk-at91' and 'clk-vc5' into clk-next
[linux-2.6-microblaze.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17 #include "tree-mod-log.h"
18
/*
 * Forward declarations for the tree balancing helpers defined later in
 * this file.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
32
/*
 * Table of supported checksum algorithms, indexed by the on-disk
 * BTRFS_CSUM_TYPE_* value.  @driver is only filled in when the crypto
 * layer driver name differs from @name (see btrfs_super_csum_driver()).
 */
static const struct btrfs_csums {
	u16		size;		/* digest size in bytes */
	const char	name[10];	/* human readable name */
	const char	driver[12];	/* crypto driver name, if it differs from name */
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};
44
45 int btrfs_super_csum_size(const struct btrfs_super_block *s)
46 {
47         u16 t = btrfs_super_csum_type(s);
48         /*
49          * csum type is validated at mount time
50          */
51         return btrfs_csums[t].size;
52 }
53
54 const char *btrfs_super_csum_name(u16 csum_type)
55 {
56         /* csum type is validated at mount time */
57         return btrfs_csums[csum_type].name;
58 }
59
60 /*
61  * Return driver name if defined, otherwise the name that's also a valid driver
62  * name
63  */
64 const char *btrfs_super_csum_driver(u16 csum_type)
65 {
66         /* csum type is validated at mount time */
67         return btrfs_csums[csum_type].driver[0] ?
68                 btrfs_csums[csum_type].driver :
69                 btrfs_csums[csum_type].name;
70 }
71
/* Return the number of entries in the checksum type table. */
size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}
76
/*
 * Allocate a zeroed btrfs_path from the dedicated slab cache.  GFP_NOFS
 * avoids recursing into filesystem reclaim.  Returns NULL on failure.
 */
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
81
82 /* this also releases the path */
83 void btrfs_free_path(struct btrfs_path *p)
84 {
85         if (!p)
86                 return;
87         btrfs_release_path(p);
88         kmem_cache_free(btrfs_path_cachep, p);
89 }
90
91 /*
92  * path release drops references on the extent buffers in the path
93  * and it drops any locks held by this path
94  *
95  * It is safe to call this on paths that no locks or extent buffers held.
96  */
97 noinline void btrfs_release_path(struct btrfs_path *p)
98 {
99         int i;
100
101         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
102                 p->slots[i] = 0;
103                 if (!p->nodes[i])
104                         continue;
105                 if (p->locks[i]) {
106                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
107                         p->locks[i] = 0;
108                 }
109                 free_extent_buffer(p->nodes[i]);
110                 p->nodes[i] = NULL;
111         }
112 }
113
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/* Refcount already hit zero: wait out the grace period, retry. */
		synchronize_rcu();
	}
	return eb;
}
147
/*
 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
 * just get put onto a simple dirty list.  Transaction walks this list to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/*
	 * Unlocked fast path: nothing to do if the root is already dirty
	 * or if dirty tracking is not enabled for it at all.
	 */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	/* Re-check under the lock so the root is only queued once. */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
173
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 *
 * @trans:		the running transaction
 * @root:		root of the tree whose root node is being copied
 * @buf:		the root node to copy
 * @cow_ret:		on success, set to the newly allocated copy
 * @new_root_objectid:	objectid the copy will belong to
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* Shareable roots may only be modified from their own transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	/* The first key lives in the item area for leaves, ptr area for nodes. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* Clone the contents, then rewrite the header for the new location. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* Reloc trees take a full backref (1), others a normal ref (0). */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
236
237 /*
238  * check if the tree block can be shared by multiple trees
239  */
240 int btrfs_block_can_be_shared(struct btrfs_root *root,
241                               struct extent_buffer *buf)
242 {
243         /*
244          * Tree blocks not in shareable trees and tree roots are never shared.
245          * If a block was allocated after the last snapshot and the block was
246          * not allocated by tree relocation, we know the block is not shared.
247          */
248         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
249             buf != root->node && buf != root->commit_root &&
250             (btrfs_header_generation(buf) <=
251              btrfs_root_last_snapshot(&root->root_item) ||
252              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
253                 return 1;
254
255         return 0;
256 }
257
/*
 * Update extent backrefs when COWing @buf into @cow.
 *
 * Depending on whether @buf is still shared, this either converts the
 * references to the full backref scheme or moves them over to @cow.
 * *@last_ref is set to 1 when @buf drops its last reference, telling
 * the caller the old block may be freed.  Returns 0 on success or a
 * negative errno.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/*
			 * A block being COWed must have at least one ref;
			 * zero indicates corruption, flip to read-only.
			 */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* Not shareable: exactly one ref, flags derived from context. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* Shared block without full backrefs: convert it. */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			/* Persist the full-backref conversion on disk. */
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* Last reference: move the refs to @cow and retire @buf. */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}
366
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/*
	 * If the caller passed the source buffer as *cow_ret, it owns the
	 * lock on it and we must unlock it once the COW is done.
	 */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* First key comes from the item area in leaves, ptr area in nodes. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	/* Move or convert the backrefs; last_ref tells us if buf is now dead. */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root: swap the new block in under RCU. */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Interior block: point the parent slot at the new copy. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	/* The new buffer is returned locked and dirty. */
	*cow_ret = cow;
	return 0;
}
497
/*
 * Return 1 when @buf must be COWed before it can be modified in @trans,
 * 0 when it can be modified in place.
 */
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}
527
528 /*
529  * cows a single block, see __btrfs_cow_block for the real work.
530  * This version of it has extra checks so that a block isn't COWed more than
531  * once per transaction, as long as it hasn't been written yet
532  */
533 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
534                     struct btrfs_root *root, struct extent_buffer *buf,
535                     struct extent_buffer *parent, int parent_slot,
536                     struct extent_buffer **cow_ret,
537                     enum btrfs_lock_nesting nest)
538 {
539         struct btrfs_fs_info *fs_info = root->fs_info;
540         u64 search_start;
541         int ret;
542
543         if (test_bit(BTRFS_ROOT_DELETING, &root->state))
544                 btrfs_err(fs_info,
545                         "COW'ing blocks on a fs root that's being dropped");
546
547         if (trans->transaction != fs_info->running_transaction)
548                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
549                        trans->transid,
550                        fs_info->running_transaction->transid);
551
552         if (trans->transid != fs_info->generation)
553                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
554                        trans->transid, fs_info->generation);
555
556         if (!should_cow_block(trans, root, buf)) {
557                 *cow_ret = buf;
558                 return 0;
559         }
560
561         search_start = buf->start & ~((u64)SZ_1G - 1);
562
563         /*
564          * Before CoWing this block for later modification, check if it's
565          * the subtree root and do the delayed subtree trace if needed.
566          *
567          * Also We don't care about the error, as it's handled internally.
568          */
569         btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
570         ret = __btrfs_cow_block(trans, root, buf, parent,
571                                  parent_slot, cow_ret, search_start, 0, nest);
572
573         trace_btrfs_cow_block(root, buf, *cow_ret);
574
575         return ret;
576 }
577 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
578
579 /*
580  * helper function for defrag to decide if two blocks pointed to by a
581  * node are actually close by
582  */
583 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
584 {
585         if (blocknr < other && other - (blocknr + blocksize) < 32768)
586                 return 1;
587         if (blocknr > other && blocknr - (other + blocksize) < 32768)
588                 return 1;
589         return 0;
590 }
591
#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	/* Cast is safe: disk and CPU layouts are identical on little-endian. */
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	/* Big-endian hosts must byte-swap the on-disk key first. */
	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif
621
622 /*
623  * same as comp_keys only with two btrfs_key's
624  */
625 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
626 {
627         if (k1->objectid > k2->objectid)
628                 return 1;
629         if (k1->objectid < k2->objectid)
630                 return -1;
631         if (k1->type > k2->type)
632                 return 1;
633         if (k1->type < k2->type)
634                 return -1;
635         if (k1->offset > k2->offset)
636                 return 1;
637         if (k1->offset < k2->offset)
638                 return -1;
639         return 0;
640 }
641
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @trans:	the running transaction
 * @root:	root of the tree being defragged
 * @parent:	node whose children get reallocated
 * @start_slot:	first slot in @parent to consider
 * @last_ret:	in/out allocation hint, updated to the last new location
 * @progress:	children whose first key compares below this are skipped
 *
 * Returns 0 on success or a negative errno from reading or COWing a child.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* Nothing to rearrange with fewer than two children. */
	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		/* Skip children until we pass the progress key. */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* Leave the child alone if either neighbour is nearby on disk. */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		/* COW relocates the child near search_start. */
		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
726
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * Returns 0 when the key is found (*slot is its index), 1 when it is
 * not (*slot is the insertion point), or -EINVAL on a corrupted item
 * count.
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	/* low starts at 0, so this only trips when max is negative. */
	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			/* Key is fully inside one page: compare it in place. */
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			/* Key straddles a page boundary: copy it out first. */
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
791
792 /*
793  * simple bin_search frontend that does the right thing for
794  * leaves vs nodes
795  */
796 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
797                      int *slot)
798 {
799         if (btrfs_header_level(eb) == 0)
800                 return generic_bin_search(eb,
801                                           offsetof(struct btrfs_leaf, items),
802                                           sizeof(struct btrfs_item),
803                                           key, btrfs_header_nritems(eb),
804                                           slot);
805         else
806                 return generic_bin_search(eb,
807                                           offsetof(struct btrfs_node, ptrs),
808                                           sizeof(struct btrfs_key_ptr),
809                                           key, btrfs_header_nritems(eb),
810                                           slot);
811 }
812
813 static void root_add_used(struct btrfs_root *root, u32 size)
814 {
815         spin_lock(&root->accounting_lock);
816         btrfs_set_root_used(&root->root_item,
817                             btrfs_root_used(&root->root_item) + size);
818         spin_unlock(&root->accounting_lock);
819 }
820
821 static void root_sub_used(struct btrfs_root *root, u32 size)
822 {
823         spin_lock(&root->accounting_lock);
824         btrfs_set_root_used(&root->root_item,
825                             btrfs_root_used(&root->root_item) - size);
826         spin_unlock(&root->accounting_lock);
827 }
828
829 /* given a node and slot number, this reads the blocks it points to.  The
830  * extent buffer is returned with a reference taken (but unlocked).
831  */
832 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
833                                            int slot)
834 {
835         int level = btrfs_header_level(parent);
836         struct extent_buffer *eb;
837         struct btrfs_key first_key;
838
839         if (slot < 0 || slot >= btrfs_header_nritems(parent))
840                 return ERR_PTR(-ENOENT);
841
842         BUG_ON(level == 0);
843
844         btrfs_node_key_to_cpu(parent, &first_key, slot);
845         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
846                              btrfs_header_owner(parent),
847                              btrfs_node_ptr_generation(parent, slot),
848                              level - 1, &first_key);
849         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
850                 free_extent_buffer(eb);
851                 eb = ERR_PTR(-EIO);
852         }
853
854         return eb;
855 }
856
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * Expects path->nodes[level] to be write locked and already CoWed in the
 * current transaction.  On return the path may point at the left sibling
 * instead of the original node if items were shifted around.  Returns 0
 * on success or a negative errno (the "enospc" label is a generic error
 * exit, not only for -ENOSPC).
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *right = NULL;
        struct extent_buffer *mid;
        struct extent_buffer *left = NULL;
        struct extent_buffer *parent = NULL;
        int ret = 0;
        int wret;
        int pslot;
        int orig_slot = path->slots[level];
        u64 orig_ptr;

        ASSERT(level > 0);

        mid = path->nodes[level];

        WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
        WARN_ON(btrfs_header_generation(mid) != trans->transid);

        /* Remembered so we can sanity check the final path below. */
        orig_ptr = btrfs_node_blockptr(mid, orig_slot);

        if (level < BTRFS_MAX_LEVEL - 1) {
                parent = path->nodes[level + 1];
                pslot = path->slots[level + 1];
        }

        /*
         * deal with the case where there is only one pointer in the root
         * by promoting the node below to a root
         */
        if (!parent) {
                struct extent_buffer *child;

                if (btrfs_header_nritems(mid) != 1)
                        return 0;

                /* promote the child to a root */
                child = btrfs_read_node_slot(mid, 0);
                if (IS_ERR(child)) {
                        ret = PTR_ERR(child);
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        goto enospc;
                }

                btrfs_tree_lock(child);
                ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
                                      BTRFS_NESTING_COW);
                if (ret) {
                        btrfs_tree_unlock(child);
                        free_extent_buffer(child);
                        goto enospc;
                }

                ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
                BUG_ON(ret < 0);
                rcu_assign_pointer(root->node, child);

                add_root_to_dirty_list(root);
                btrfs_tree_unlock(child);

                path->locks[level] = 0;
                path->nodes[level] = NULL;
                btrfs_clean_tree_block(mid);
                btrfs_tree_unlock(mid);
                /* once for the path */
                free_extent_buffer(mid);

                /* root->node still holds a ref, so mid is safe to touch. */
                root_sub_used(root, mid->len);
                btrfs_free_tree_block(trans, root, mid, 0, 1);
                /* once for the root ptr */
                free_extent_buffer_stale(mid);
                return 0;
        }
        /* Node is more than 1/4 full, no rebalancing needed. */
        if (btrfs_header_nritems(mid) >
            BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
                return 0;

        /* A read failure here is treated as "no left sibling". */
        left = btrfs_read_node_slot(parent, pslot - 1);
        if (IS_ERR(left))
                left = NULL;

        if (left) {
                __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
                wret = btrfs_cow_block(trans, root, left,
                                       parent, pslot - 1, &left,
                                       BTRFS_NESTING_LEFT_COW);
                if (wret) {
                        ret = wret;
                        goto enospc;
                }
        }

        /* Likewise, a read failure means "no right sibling". */
        right = btrfs_read_node_slot(parent, pslot + 1);
        if (IS_ERR(right))
                right = NULL;

        if (right) {
                __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
                wret = btrfs_cow_block(trans, root, right,
                                       parent, pslot + 1, &right,
                                       BTRFS_NESTING_RIGHT_COW);
                if (wret) {
                        ret = wret;
                        goto enospc;
                }
        }

        /* first, try to make some room in the middle buffer */
        if (left) {
                orig_slot += btrfs_header_nritems(left);
                wret = push_node_left(trans, left, mid, 1);
                if (wret < 0)
                        ret = wret;
        }

        /*
         * then try to empty the right most buffer into the middle
         */
        if (right) {
                wret = push_node_left(trans, mid, right, 1);
                if (wret < 0 && wret != -ENOSPC)
                        ret = wret;
                if (btrfs_header_nritems(right) == 0) {
                        /* Right node drained completely, delete it. */
                        btrfs_clean_tree_block(right);
                        btrfs_tree_unlock(right);
                        del_ptr(root, path, level + 1, pslot + 1);
                        root_sub_used(root, right->len);
                        btrfs_free_tree_block(trans, root, right, 0, 1);
                        free_extent_buffer_stale(right);
                        right = NULL;
                } else {
                        /* Right's first key changed, update the parent. */
                        struct btrfs_disk_key right_key;
                        btrfs_node_key(right, &right_key, 0);
                        ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
                                        BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
                        BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &right_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
                }
        }
        if (btrfs_header_nritems(mid) == 1) {
                /*
                 * we're not allowed to leave a node with one item in the
                 * tree during a delete.  A deletion from lower in the tree
                 * could try to delete the only pointer in this node.
                 * So, pull some keys from the left.
                 * There has to be a left pointer at this point because
                 * otherwise we would have pulled some pointers from the
                 * right
                 */
                if (!left) {
                        ret = -EROFS;
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        goto enospc;
                }
                wret = balance_node_right(trans, mid, left);
                if (wret < 0) {
                        ret = wret;
                        goto enospc;
                }
                if (wret == 1) {
                        wret = push_node_left(trans, left, mid, 1);
                        if (wret < 0)
                                ret = wret;
                }
                BUG_ON(wret == 1);
        }
        if (btrfs_header_nritems(mid) == 0) {
                /* Middle node is now empty, remove it from the tree. */
                btrfs_clean_tree_block(mid);
                btrfs_tree_unlock(mid);
                del_ptr(root, path, level + 1, pslot);
                root_sub_used(root, mid->len);
                btrfs_free_tree_block(trans, root, mid, 0, 1);
                free_extent_buffer_stale(mid);
                mid = NULL;
        } else {
                /* update the parent key to reflect our changes */
                struct btrfs_disk_key mid_key;
                btrfs_node_key(mid, &mid_key, 0);
                ret = btrfs_tree_mod_log_insert_key(parent, pslot,
                                BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
                BUG_ON(ret < 0);
                btrfs_set_node_key(parent, &mid_key, pslot);
                btrfs_mark_buffer_dirty(parent);
        }

        /* update the path */
        if (left) {
                if (btrfs_header_nritems(left) > orig_slot) {
                        /* Our slot moved into the left node; repoint path. */
                        atomic_inc(&left->refs);
                        /* left was locked after cow */
                        path->nodes[level] = left;
                        path->slots[level + 1] -= 1;
                        path->slots[level] = orig_slot;
                        if (mid) {
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        }
                } else {
                        orig_slot -= btrfs_header_nritems(left);
                        path->slots[level] = orig_slot;
                }
        }
        /* double check we haven't messed things up */
        if (orig_ptr !=
            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
                BUG();
enospc:
        if (right) {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        if (left) {
                /* Only unlock left if the path did not take ownership of it. */
                if (path->nodes[level] != left)
                        btrfs_tree_unlock(left);
                free_extent_buffer(left);
        }
        return ret;
}
1084
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if a push to a sibling made room (the path is re-pointed to
 * the node that now contains the target slot), 1 if no push was possible
 * (caller must split), or a negative errno.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *right = NULL;
        struct extent_buffer *mid;
        struct extent_buffer *left = NULL;
        struct extent_buffer *parent = NULL;
        int ret = 0;
        int wret;
        int pslot;
        int orig_slot = path->slots[level];

        if (level == 0)
                return 1;

        mid = path->nodes[level];
        WARN_ON(btrfs_header_generation(mid) != trans->transid);

        if (level < BTRFS_MAX_LEVEL - 1) {
                parent = path->nodes[level + 1];
                pslot = path->slots[level + 1];
        }

        /* The root has no siblings to push into. */
        if (!parent)
                return 1;

        /* A read failure is treated as "no left sibling". */
        left = btrfs_read_node_slot(parent, pslot - 1);
        if (IS_ERR(left))
                left = NULL;

        /* first, try to make some room in the middle buffer */
        if (left) {
                u32 left_nr;

                __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

                left_nr = btrfs_header_nritems(left);
                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
                        /* Left is (nearly) full too, can't push into it. */
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, left, parent,
                                              pslot - 1, &left,
                                              BTRFS_NESTING_LEFT_COW);
                        if (ret)
                                wret = 1;
                        else {
                                wret = push_node_left(trans, left, mid, 0);
                        }
                }
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        /* Items moved left; mid's first key changed. */
                        struct btrfs_disk_key disk_key;
                        orig_slot += left_nr;
                        btrfs_node_key(mid, &disk_key, 0);
                        ret = btrfs_tree_mod_log_insert_key(parent, pslot,
                                        BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
                        BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &disk_key, pslot);
                        btrfs_mark_buffer_dirty(parent);
                        if (btrfs_header_nritems(left) > orig_slot) {
                                /* Target slot moved into the left node. */
                                path->nodes[level] = left;
                                path->slots[level + 1] -= 1;
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                orig_slot -=
                                        btrfs_header_nritems(left);
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(left);
                                free_extent_buffer(left);
                        }
                        return 0;
                }
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
        }
        /* A read failure is treated as "no right sibling". */
        right = btrfs_read_node_slot(parent, pslot + 1);
        if (IS_ERR(right))
                right = NULL;

        /*
         * then try to empty the right most buffer into the middle
         */
        if (right) {
                u32 right_nr;

                __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

                right_nr = btrfs_header_nritems(right);
                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
                        /* Right is (nearly) full too, can't push into it. */
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, right,
                                              parent, pslot + 1,
                                              &right, BTRFS_NESTING_RIGHT_COW);
                        if (ret)
                                wret = 1;
                        else {
                                wret = balance_node_right(trans, right, mid);
                        }
                }
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        /* Items moved right; right's first key changed. */
                        struct btrfs_disk_key disk_key;

                        btrfs_node_key(right, &disk_key, 0);
                        ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
                                        BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
                        BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);

                        if (btrfs_header_nritems(mid) <= orig_slot) {
                                /* Target slot moved into the right node. */
                                path->nodes[level] = right;
                                path->slots[level + 1] += 1;
                                path->slots[level] = orig_slot -
                                        btrfs_header_nritems(mid);
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                btrfs_tree_unlock(right);
                                free_extent_buffer(right);
                        }
                        return 0;
                }
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        return 1;
}
1224
1225 /*
1226  * readahead one full node of leaves, finding things that are close
1227  * to the block in 'slot', and triggering ra on them.
1228  */
1229 static void reada_for_search(struct btrfs_fs_info *fs_info,
1230                              struct btrfs_path *path,
1231                              int level, int slot, u64 objectid)
1232 {
1233         struct extent_buffer *node;
1234         struct btrfs_disk_key disk_key;
1235         u32 nritems;
1236         u64 search;
1237         u64 target;
1238         u64 nread = 0;
1239         u64 nread_max;
1240         struct extent_buffer *eb;
1241         u32 nr;
1242         u32 blocksize;
1243         u32 nscan = 0;
1244
1245         if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1246                 return;
1247
1248         if (!path->nodes[level])
1249                 return;
1250
1251         node = path->nodes[level];
1252
1253         /*
1254          * Since the time between visiting leaves is much shorter than the time
1255          * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1256          * much IO at once (possibly random).
1257          */
1258         if (path->reada == READA_FORWARD_ALWAYS) {
1259                 if (level > 1)
1260                         nread_max = node->fs_info->nodesize;
1261                 else
1262                         nread_max = SZ_128K;
1263         } else {
1264                 nread_max = SZ_64K;
1265         }
1266
1267         search = btrfs_node_blockptr(node, slot);
1268         blocksize = fs_info->nodesize;
1269         eb = find_extent_buffer(fs_info, search);
1270         if (eb) {
1271                 free_extent_buffer(eb);
1272                 return;
1273         }
1274
1275         target = search;
1276
1277         nritems = btrfs_header_nritems(node);
1278         nr = slot;
1279
1280         while (1) {
1281                 if (path->reada == READA_BACK) {
1282                         if (nr == 0)
1283                                 break;
1284                         nr--;
1285                 } else if (path->reada == READA_FORWARD ||
1286                            path->reada == READA_FORWARD_ALWAYS) {
1287                         nr++;
1288                         if (nr >= nritems)
1289                                 break;
1290                 }
1291                 if (path->reada == READA_BACK && objectid) {
1292                         btrfs_node_key(node, &disk_key, nr);
1293                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1294                                 break;
1295                 }
1296                 search = btrfs_node_blockptr(node, nr);
1297                 if (path->reada == READA_FORWARD_ALWAYS ||
1298                     (search <= target && target - search <= 65536) ||
1299                     (search > target && search - target <= 65536)) {
1300                         btrfs_readahead_node_child(node, nr);
1301                         nread += blocksize;
1302                 }
1303                 nscan++;
1304                 if (nread > nread_max || nscan > 32)
1305                         break;
1306         }
1307 }
1308
1309 static noinline void reada_for_balance(struct btrfs_path *path, int level)
1310 {
1311         struct extent_buffer *parent;
1312         int slot;
1313         int nritems;
1314
1315         parent = path->nodes[level + 1];
1316         if (!parent)
1317                 return;
1318
1319         nritems = btrfs_header_nritems(parent);
1320         slot = path->slots[level + 1];
1321
1322         if (slot > 0)
1323                 btrfs_readahead_node_child(parent, slot - 1);
1324         if (slot + 1 < nritems)
1325                 btrfs_readahead_node_child(parent, slot + 1);
1326 }
1327
1328
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
                               int lowest_unlock, int min_write_lock_level,
                               int *write_lock_level)
{
        int i;
        /* Levels at or below skip_level must stay locked. */
        int skip_level = level;
        /* Set once an unlock-eligible level has been seen. */
        int no_skips = 0;
        struct extent_buffer *t;

        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                if (!path->nodes[i])
                        break;
                if (!path->locks[i])
                        break;
                /* Slot 0: a key change here may propagate to the parent. */
                if (!no_skips && path->slots[i] == 0) {
                        skip_level = i + 1;
                        continue;
                }
                /* Last slot with keep_locks: needed for "next slot" walks. */
                if (!no_skips && path->keep_locks) {
                        u32 nritems;
                        t = path->nodes[i];
                        nritems = btrfs_header_nritems(t);
                        if (nritems < 1 || path->slots[i] >= nritems - 1) {
                                skip_level = i + 1;
                                continue;
                        }
                }
                if (skip_level < i && i >= lowest_unlock)
                        no_skips = 1;

                t = path->nodes[i];
                if (i >= lowest_unlock && i > skip_level) {
                        btrfs_tree_unlock_rw(t, path->locks[i]);
                        path->locks[i] = 0;
                        /* Track that write locks are no longer held up here. */
                        if (write_lock_level &&
                            i > min_write_lock_level &&
                            i <= *write_lock_level) {
                                *write_lock_level = i - 1;
                        }
                }
        }
}
1384
/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 *
 * On success (return 0) *eb_ret is replaced with the child block at
 * @slot, with a reference held.  On any failure the path is released.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
                      struct extent_buffer **eb_ret, int level, int slot,
                      const struct btrfs_key *key)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 blocknr;
        u64 gen;
        struct extent_buffer *tmp;
        struct btrfs_key first_key;
        int ret;
        int parent_level;

        blocknr = btrfs_node_blockptr(*eb_ret, slot);
        gen = btrfs_node_ptr_generation(*eb_ret, slot);
        parent_level = btrfs_header_level(*eb_ret);
        btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

        tmp = find_extent_buffer(fs_info, blocknr);
        if (tmp) {
                if (p->reada == READA_FORWARD_ALWAYS)
                        reada_for_search(fs_info, p, level, slot, key->objectid);

                /* first we do an atomic uptodate check */
                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
                        /*
                         * Do extra check for first_key, eb can be stale due to
                         * being cached, read from scrub, or have multiple
                         * parents (shared tree blocks).
                         */
                        if (btrfs_verify_level_key(tmp,
                                        parent_level - 1, &first_key, gen)) {
                                free_extent_buffer(tmp);
                                return -EUCLEAN;
                        }
                        *eb_ret = tmp;
                        return 0;
                }

                /* now we're allowed to do a blocking uptodate check */
                ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
                if (!ret) {
                        *eb_ret = tmp;
                        return 0;
                }
                free_extent_buffer(tmp);
                btrfs_release_path(p);
                return -EIO;
        }

        /*
         * reduce lock contention at high levels
         * of the btree by dropping locks before
         * we read.  Don't release the lock on the current
         * level because we need to walk this node to figure
         * out which blocks to read.
         */
        btrfs_unlock_up_safe(p, level + 1);

        if (p->reada != READA_NONE)
                reada_for_search(fs_info, p, level, slot, key->objectid);

        ret = -EAGAIN;
        tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
                              gen, parent_level - 1, &first_key);
        if (!IS_ERR(tmp)) {
                /*
                 * If the read above didn't mark this buffer up to date,
                 * it will never end up being up to date.  Set ret to EIO now
                 * and give up so that our caller doesn't loop forever
                 * on our EAGAINs.
                 */
                if (!extent_buffer_uptodate(tmp))
                        ret = -EIO;
                free_extent_buffer(tmp);
        } else {
                ret = PTR_ERR(tmp);
        }

        btrfs_release_path(p);
        return ret;
}
1475
/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 *
 * A node is split if an insert (ins_len > 0 or search_for_split) would
 * bring it within 3 slots of full; it is rebalanced if a delete
 * (ins_len < 0) leaves it less than half full.  Both require write
 * locks up to level + 1, hence the -EAGAIN retries below.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer *b, int level, int ins_len,
                       int *write_lock_level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;

        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
            BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

                /* Need the parent write locked too; retry with more locks. */
                if (*write_lock_level < level + 1) {
                        *write_lock_level = level + 1;
                        btrfs_release_path(p);
                        return -EAGAIN;
                }

                reada_for_balance(p, level);
                ret = split_node(trans, root, p, level);

                b = p->nodes[level];
        } else if (ins_len < 0 && btrfs_header_nritems(b) <
                   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

                /* Need the parent write locked too; retry with more locks. */
                if (*write_lock_level < level + 1) {
                        *write_lock_level = level + 1;
                        btrfs_release_path(p);
                        return -EAGAIN;
                }

                reada_for_balance(p, level);
                ret = balance_level(trans, root, p, level);
                if (ret)
                        return ret;

                b = p->nodes[level];
                /* balance_level may have collapsed this level entirely. */
                if (!b) {
                        btrfs_release_path(p);
                        return -EAGAIN;
                }
                BUG_ON(btrfs_header_nritems(b) == 1);
        }
        return ret;
}
1530
1531 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1532                 u64 iobjectid, u64 ioff, u8 key_type,
1533                 struct btrfs_key *found_key)
1534 {
1535         int ret;
1536         struct btrfs_key key;
1537         struct extent_buffer *eb;
1538
1539         ASSERT(path);
1540         ASSERT(found_key);
1541
1542         key.type = key_type;
1543         key.objectid = iobjectid;
1544         key.offset = ioff;
1545
1546         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1547         if (ret < 0)
1548                 return ret;
1549
1550         eb = path->nodes[0];
1551         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1552                 ret = btrfs_next_leaf(fs_root, path);
1553                 if (ret)
1554                         return ret;
1555                 eb = path->nodes[0];
1556         }
1557
1558         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1559         if (found_key->type != key.type ||
1560                         found_key->objectid != key.objectid)
1561                 return 1;
1562
1563         return 0;
1564 }
1565
/*
 * Look up and lock the root node that a search should start from.
 *
 * @root:             tree to search
 * @p:                path; its flags (search_commit_root, need_commit_sem,
 *                    skip_locking) select which root and locking mode is used
 * @write_lock_level: levels at or below this must be write locked
 *
 * The chosen root is stored in p->nodes[level] and, unless locking is
 * skipped, the lock type taken is recorded in p->locks[level].
 *
 * Returns the (referenced) root extent buffer, or an ERR_PTR on allocation
 * failure when cloning the commit root.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			atomic_inc(&b->refs);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
1649
1650
1651 /*
1652  * btrfs_search_slot - look for a key in a tree and perform necessary
1653  * modifications to preserve tree invariants.
1654  *
1655  * @trans:      Handle of transaction, used when modifying the tree
1656  * @p:          Holds all btree nodes along the search path
1657  * @root:       The root node of the tree
1658  * @key:        The key we are looking for
1659  * @ins_len:    Indicates purpose of search:
1660  *              >0  for inserts it's size of item inserted (*)
1661  *              <0  for deletions
1662  *               0  for plain searches, not modifying the tree
1663  *
1664  *              (*) If size of item inserted doesn't include
1665  *              sizeof(struct btrfs_item), then p->search_for_extension must
1666  *              be set.
1667  * @cow:        boolean should CoW operations be performed. Must always be 1
1668  *              when modifying the tree.
1669  *
1670  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1671  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1672  *
1673  * If @key is found, 0 is returned and you can find the item in the leaf level
1674  * of the path (level 0)
1675  *
1676  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1677  * points to the slot where it should be inserted
1678  *
1679  * If an error is encountered while searching the tree a negative error number
1680  * is returned
1681  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	/* Modifying the tree (ins_len != 0) requires COW. */
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	/* keep_locks or a fixed lowest_level means we may need to keep every
	 * level write locked, so lock everything on the way down. */
	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				/* Escalate locking and restart the search. */
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		/*
		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
		 * we can safely assume the target key will always be in slot 0
		 * on lower levels due to the invariants BTRFS' btree provides,
		 * namely that a btrfs_key_ptr entry always points to the
		 * lowest key in the child node, thus we can skip searching
		 * lower levels
		 */
		if (prev_cmp == 0) {
			slot = 0;
			ret = 0;
		} else {
			ret = btrfs_bin_search(b, key, &slot);
			prev_cmp = ret;
			if (ret < 0)
				goto done;
		}

		if (level == 0) {
			p->slots[level] = slot;
			/*
			 * Item key already exists. In this case, if we are
			 * allowed to insert the item (for example, in dir_item
			 * case, item key collision is allowed), it will be
			 * merged with the original item. Only the item size
			 * grows, no new btrfs item will be added. If
			 * search_for_extension is not set, ins_len already
			 * accounts the size btrfs_item, deduct it here so leaf
			 * space check will be correct.
			 */
			if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
				ASSERT(ins_len >= sizeof(struct btrfs_item));
				ins_len -= sizeof(struct btrfs_item);
			}
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
		/* Key would sort before slot's key: descend via slot - 1. */
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				btrfs_tree_lock(b);
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				btrfs_tree_read_lock(b);
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
1923 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
1924
1925 /*
1926  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
1927  * current state of the tree together with the operations recorded in the tree
1928  * modification log to search for the key in a previous version of this tree, as
1929  * denoted by the time_seq parameter.
1930  *
1931  * Naturally, there is no support for insert, delete or cow operations.
1932  *
1933  * The resulting path and return value will be set up as if we called
1934  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
1935  */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	if (p->search_commit_root) {
		/* The commit root is already a past state, no rewind needed. */
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = btrfs_get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		/* Key would sort before slot's key: descend via slot - 1. */
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		/* Rewind the child to its state at time_seq before descending. */
		level = btrfs_header_level(b);
		btrfs_tree_read_lock(b);
		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
2027
2028 /*
2029  * helper to use instead of search slot if no exact match is needed but
2030  * instead the next or previous item should be returned.
2031  * When find_higher is true, the next higher item is returned, the next lower
2032  * otherwise.
2033  * When return_any and find_higher are both true, and no higher item is found,
2034  * return the next lower instead.
2035  * When return_any is true and find_higher is false, and no lower item is found,
2036  * return the next higher instead.
2037  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2038  * < 0 on error
2039  */
2040 int btrfs_search_slot_for_read(struct btrfs_root *root,
2041                                const struct btrfs_key *key,
2042                                struct btrfs_path *p, int find_higher,
2043                                int return_any)
2044 {
2045         int ret;
2046         struct extent_buffer *leaf;
2047
2048 again:
2049         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2050         if (ret <= 0)
2051                 return ret;
2052         /*
2053          * a return value of 1 means the path is at the position where the
2054          * item should be inserted. Normally this is the next bigger item,
2055          * but in case the previous item is the last in a leaf, path points
2056          * to the first free slot in the previous leaf, i.e. at an invalid
2057          * item.
2058          */
2059         leaf = p->nodes[0];
2060
2061         if (find_higher) {
2062                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2063                         ret = btrfs_next_leaf(root, p);
2064                         if (ret <= 0)
2065                                 return ret;
2066                         if (!return_any)
2067                                 return 1;
2068                         /*
2069                          * no higher item found, return the next
2070                          * lower instead
2071                          */
2072                         return_any = 0;
2073                         find_higher = 0;
2074                         btrfs_release_path(p);
2075                         goto again;
2076                 }
2077         } else {
2078                 if (p->slots[0] == 0) {
2079                         ret = btrfs_prev_leaf(root, p);
2080                         if (ret < 0)
2081                                 return ret;
2082                         if (!ret) {
2083                                 leaf = p->nodes[0];
2084                                 if (p->slots[0] == btrfs_header_nritems(leaf))
2085                                         p->slots[0]--;
2086                                 return 0;
2087                         }
2088                         if (!return_any)
2089                                 return 1;
2090                         /*
2091                          * no lower item found, return the next
2092                          * higher instead
2093                          */
2094                         return_any = 0;
2095                         find_higher = 1;
2096                         btrfs_release_path(p);
2097                         goto again;
2098                 } else {
2099                         --p->slots[0];
2100                 }
2101         }
2102         return 0;
2103 }
2104
2105 /*
2106  * adjust the pointers going up the tree, starting at level
2107  * making sure the right key of each node is points to 'key'.
2108  * This is used after shifting pointers to the left, so it stops
2109  * fixing up pointers when a given leaf/node is not in slot 0 of the
2110  * higher levels
2111  *
2112  */
2113 static void fixup_low_keys(struct btrfs_path *path,
2114                            struct btrfs_disk_key *key, int level)
2115 {
2116         int i;
2117         struct extent_buffer *t;
2118         int ret;
2119
2120         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2121                 int tslot = path->slots[i];
2122
2123                 if (!path->nodes[i])
2124                         break;
2125                 t = path->nodes[i];
2126                 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2127                                 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2128                 BUG_ON(ret < 0);
2129                 btrfs_set_node_key(t, key, tslot);
2130                 btrfs_mark_buffer_dirty(path->nodes[i]);
2131                 if (tslot != 0)
2132                         break;
2133         }
2134 }
2135
2136 /*
2137  * update item key.
2138  *
2139  * This function isn't completely safe. It's the caller's responsibility
2140  * that the new key won't break the order
2141  */
2142 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2143                              struct btrfs_path *path,
2144                              const struct btrfs_key *new_key)
2145 {
2146         struct btrfs_disk_key disk_key;
2147         struct extent_buffer *eb;
2148         int slot;
2149
2150         eb = path->nodes[0];
2151         slot = path->slots[0];
2152         if (slot > 0) {
2153                 btrfs_item_key(eb, &disk_key, slot - 1);
2154                 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2155                         btrfs_crit(fs_info,
2156                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2157                                    slot, btrfs_disk_key_objectid(&disk_key),
2158                                    btrfs_disk_key_type(&disk_key),
2159                                    btrfs_disk_key_offset(&disk_key),
2160                                    new_key->objectid, new_key->type,
2161                                    new_key->offset);
2162                         btrfs_print_leaf(eb);
2163                         BUG();
2164                 }
2165         }
2166         if (slot < btrfs_header_nritems(eb) - 1) {
2167                 btrfs_item_key(eb, &disk_key, slot + 1);
2168                 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2169                         btrfs_crit(fs_info,
2170                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2171                                    slot, btrfs_disk_key_objectid(&disk_key),
2172                                    btrfs_disk_key_type(&disk_key),
2173                                    btrfs_disk_key_offset(&disk_key),
2174                                    new_key->objectid, new_key->type,
2175                                    new_key->offset);
2176                         btrfs_print_leaf(eb);
2177                         BUG();
2178                 }
2179         }
2180
2181         btrfs_cpu_key_to_disk(&disk_key, new_key);
2182         btrfs_set_item_key(eb, &disk_key, slot);
2183         btrfs_mark_buffer_dirty(eb);
2184         if (slot == 0)
2185                 fixup_low_keys(path, &disk_key, 1);
2186 }
2187
2188 /*
2189  * Check key order of two sibling extent buffers.
2190  *
2191  * Return true if something is wrong.
2192  * Return false if everything is fine.
2193  *
2194  * Tree-checker only works inside one tree block, thus the following
2195  * corruption can not be detected by tree-checker:
2196  *
2197  * Leaf @left                   | Leaf @right
2198  * --------------------------------------------------------------
2199  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2200  *
2201  * Key f6 in leaf @left itself is valid, but not valid when the next
2202  * key in leaf @right is 7.
2203  * This can only be checked at tree block merge time.
2204  * And since tree checker has ensured all key order in each tree block
2205  * is correct, we only need to bother the last key of @left and the first
2206  * key of @right.
2207  */
2208 static bool check_sibling_keys(struct extent_buffer *left,
2209                                struct extent_buffer *right)
2210 {
2211         struct btrfs_key left_last;
2212         struct btrfs_key right_first;
2213         int level = btrfs_header_level(left);
2214         int nr_left = btrfs_header_nritems(left);
2215         int nr_right = btrfs_header_nritems(right);
2216
2217         /* No key to check in one of the tree blocks */
2218         if (!nr_left || !nr_right)
2219                 return false;
2220
2221         if (level) {
2222                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2223                 btrfs_node_key_to_cpu(right, &right_first, 0);
2224         } else {
2225                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2226                 btrfs_item_key_to_cpu(right, &right_first, 0);
2227         }
2228
2229         if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2230                 btrfs_crit(left->fs_info,
2231 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2232                            left_last.objectid, left_last.type,
2233                            left_last.offset, right_first.objectid,
2234                            right_first.type, right_first.offset);
2235                 return true;
2236         }
2237         return false;
2238 }
2239
2240 /*
2241  * try to push data from one node into the next node left in the
2242  * tree.
2243  *
2244  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2245  * error, and > 0 if there was no room in the left hand block.
2246  */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Start with the free space in dst; trimmed further below. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* Both blocks must have been COWed in the current transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* Not worth pushing from a nearly-empty node we intend to keep. */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* dst is the left eb, src is the middle eb */
	if (check_sibling_keys(dst, src)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Log the copy first; failure here must abort before any data moves. */
	ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Append the first push_items key pointers of src to the end of dst. */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
		 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2317
2318 /*
2319  * try to push data from one node into the next node right in the
2320  * tree.
2321  *
2322  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2323  * error, and > 0 if there was no room in the right hand block.
2324  *
2325  * this will  only push up to 1/2 the contents of the left node over
2326  */
2327 static int balance_node_right(struct btrfs_trans_handle *trans,
2328                               struct extent_buffer *dst,
2329                               struct extent_buffer *src)
2330 {
2331         struct btrfs_fs_info *fs_info = trans->fs_info;
2332         int push_items = 0;
2333         int max_push;
2334         int src_nritems;
2335         int dst_nritems;
2336         int ret = 0;
2337
2338         WARN_ON(btrfs_header_generation(src) != trans->transid);
2339         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2340
2341         src_nritems = btrfs_header_nritems(src);
2342         dst_nritems = btrfs_header_nritems(dst);
2343         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2344         if (push_items <= 0)
2345                 return 1;
2346
2347         if (src_nritems < 4)
2348                 return 1;
2349
2350         max_push = src_nritems / 2 + 1;
2351         /* don't try to empty the node */
2352         if (max_push >= src_nritems)
2353                 return 1;
2354
2355         if (max_push < push_items)
2356                 push_items = max_push;
2357
2358         /* dst is the right eb, src is the middle eb */
2359         if (check_sibling_keys(src, dst)) {
2360                 ret = -EUCLEAN;
2361                 btrfs_abort_transaction(trans, ret);
2362                 return ret;
2363         }
2364         ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
2365         BUG_ON(ret < 0);
2366         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2367                                       btrfs_node_key_ptr_offset(0),
2368                                       (dst_nritems) *
2369                                       sizeof(struct btrfs_key_ptr));
2370
2371         ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2372                                          push_items);
2373         if (ret) {
2374                 btrfs_abort_transaction(trans, ret);
2375                 return ret;
2376         }
2377         copy_extent_buffer(dst, src,
2378                            btrfs_node_key_ptr_offset(0),
2379                            btrfs_node_key_ptr_offset(src_nritems - push_items),
2380                            push_items * sizeof(struct btrfs_key_ptr));
2381
2382         btrfs_set_header_nritems(src, src_nritems - push_items);
2383         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2384
2385         btrfs_mark_buffer_dirty(src);
2386         btrfs_mark_buffer_dirty(dst);
2387
2388         return ret;
2389 }
2390
2391 /*
2392  * helper function to insert a new root level in the tree.
2393  * A new node is allocated, and a single item is inserted to
2394  * point to the existing root
2395  *
2396  * returns zero on success or < 0 on failure.
2397  */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	/* The new top level must be empty and the old root right below it. */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/*
	 * The single key in the new root is the first key of the old root;
	 * use the item accessor if the old root is a leaf (level 1 means the
	 * node below is level 0), the node accessor otherwise.
	 */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0,
				   BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* Point the new root's only slot at the old root block. */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* Log the root replacement, then publish the new root via RCU. */
	old = root->node;
	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* The path takes its own reference on the new root. */
	atomic_inc(&c->refs);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
2452
2453 /*
2454  * worker function to insert a single pointer in a node.
2455  * the node should have enough room for the pointer already
2456  *
2457  * slot and level indicate where you want the key to go, and
2458  * blocknr is the block the key points to.
2459  */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* The caller must guarantee there is room for one more pointer. */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		/*
		 * Inserting in the middle: log the move (only for non-root
		 * levels, i.e. level > 0) and shift the tail right one slot.
		 */
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
					slot, nritems - slot);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = btrfs_tree_mod_log_insert_key(lower, slot,
					    BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
		BUG_ON(ret < 0);
	}
	/* Write the new key pointer and stamp it with this transaction. */
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
2498
2499 /*
2500  * split the node at the specified level in path in two.
2501  * The path is corrected to point to the appropriate node after the split
2502  *
2503  * Before splitting this tries to make some room in the node by pushing
2504  * left and right, if either one works, it returns right away.
2505  *
2506  * returns 0 on success and < 0 on failure
2507  */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
		 * holding a tree lock on the buffer, which is why we cannot
		 * race with other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/*
		 * Not the root: first try to make room by pushing into the
		 * siblings; if that leaves enough free slots, no split needed.
		 */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* Split point: the upper half [mid, c_nritems) moves to a new block. */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, level, c->start, 0,
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* Log the copy before moving the pointers to the new block. */
	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* Link the new block into the parent, right after c's slot. */
	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/*
	 * Fix up the path: keep whichever block now holds the target slot,
	 * unlock and drop the reference on the other one.
	 */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return 0;
}
2589
2590 /*
2591  * how many bytes are required to store the items in a leaf.  start
2592  * and nr indicate which items in the leaf to check.  This totals up the
2593  * space used both by the item structs and the item data
2594  */
2595 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2596 {
2597         struct btrfs_item *start_item;
2598         struct btrfs_item *end_item;
2599         int data_len;
2600         int nritems = btrfs_header_nritems(l);
2601         int end = min(nritems, start + nr) - 1;
2602
2603         if (!nr)
2604                 return 0;
2605         start_item = btrfs_item_nr(start);
2606         end_item = btrfs_item_nr(end);
2607         data_len = btrfs_item_offset(l, start_item) +
2608                    btrfs_item_size(l, start_item);
2609         data_len = data_len - btrfs_item_offset(l, end_item);
2610         data_len += sizeof(struct btrfs_item) * nr;
2611         WARN_ON(data_len < 0);
2612         return data_len;
2613 }
2614
2615 /*
2616  * The space between the end of the leaf items and
2617  * the start of the leaf data.  IOW, how much room
2618  * the leaf has left for both items and data
2619  */
2620 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
2621 {
2622         struct btrfs_fs_info *fs_info = leaf->fs_info;
2623         int nritems = btrfs_header_nritems(leaf);
2624         int ret;
2625
2626         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
2627         if (ret < 0) {
2628                 btrfs_crit(fs_info,
2629                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2630                            ret,
2631                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2632                            leaf_space_used(leaf, 0, nritems), nritems);
2633         }
2634         return ret;
2635 }
2636
2637 /*
2638  * min slot controls the lowest index we're willing to push to the
2639  * right.  We'll push up to and including min_slot, but no lower
2640  */
static noinline int __push_leaf_right(struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	/* When not emptying, never push below slot max(1, min_slot). */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* Reserve room for the pending insert if it lands past the last item. */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/*
	 * Walk from the last item of the left leaf toward 'nr', counting how
	 * many items (and how much data) fit into the right leaf.
	 */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* Don't push the slot the caller is working in ... */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				/*
				 * ... unless the left leaf is so tight that
				 * pushing it is still worthwhile.
				 */
				int space = btrfs_leaf_free_space(left);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* Number of data bytes the pushed items occupy in the left leaf. */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
		     push_space);

	/* Shift the right leaf's item headers to make room at slot 0. */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	/* Recompute every offset in the right leaf from the end downward. */
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(&token, item);
		btrfs_set_token_item_offset(&token, item, push_space);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		btrfs_clean_tree_block(left);

	btrfs_mark_buffer_dirty(right);

	/* The right leaf's first key changed; update it in the parent. */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		/* The target slot moved to the right leaf; follow it. */
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clean_tree_block(path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
2778
2779 /*
2780  * push some data in the path leaf to the right, trying to free up at
2781  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2782  *
2783  * returns 1 if the push failed because the other node didn't have enough
2784  * room, 0 if everything worked out and < 0 if there were major errors.
2785  *
2786  * this will push starting from min_slot to the end of the leaf.  It won't
2787  * push any slot lower than min_slot
2788  */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* No parent means we are at the root; there is no right sibling. */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* Already at the last slot of the parent: no right sibling. */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = btrfs_read_node_slot(upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
	if (ret)
		goto out_unlock;

	/* COW may have changed the block; re-check the free space. */
	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	/* Corruption check: left's last key must sort before right's first. */
	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
		return ret;
	}
	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	/* __push_leaf_right() takes ownership of the right leaf's lock/ref. */
	return __push_leaf_right(path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
2866
2867 /*
2868  * push some data in the path leaf to the left, trying to free up at
2869  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2870  *
2871  * max_slot can put a limit on how far into the leaf we'll push items.  The
2872  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
2873  * items
2874  */
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	/* When not emptying, keep at least one item and respect max_slot. */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/*
	 * Walk from the first item of the right leaf, counting how many
	 * items (and how much data) fit into the left leaf.
	 */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* Don't push the slot the caller is working in ... */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				/*
				 * ... unless the right leaf is so tight that
				 * pushing it is still worthwhile.
				 */
				int space = btrfs_leaf_free_space(right);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* Number of data bytes the pushed items occupy in the right leaf. */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* Rebase the copied items' offsets into the left leaf's data area. */
	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* Slide the remaining data and item headers to the front. */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}

	/* Recompute every offset in the right leaf from the end downward. */
	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(&token, item);
		btrfs_set_token_item_offset(&token, item, push_space);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		btrfs_clean_tree_block(right);

	/* The right leaf's first key changed; propagate it up the path. */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* The target slot moved to the left leaf; follow it. */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3018
3019 /*
3020  * push some data in the path leaf to the left, trying to free up at
3021  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3022  *
3023  * max_slot can put a limit on how far into the leaf we'll push items.  The
3024  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3025  * items
3026  */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* Slot 0 in the parent has no left sibling. */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	/* No parent means we are at the root; nothing to push into. */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left,
			      BTRFS_NESTING_LEFT_COW);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* COW may have changed the block; re-check the free space. */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* Corruption check: left's last key must sort before right's first. */
	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		goto out;
	}
	/* __push_leaf_left() takes ownership of the left leaf's lock/ref. */
	return __push_leaf_left(path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3095
/*
 * Move the items from slot @mid onward of leaf @l into the freshly
 * allocated leaf @right, fix up the moved items' data offsets, link
 * @right into the parent node, and leave the path pointing at whichever
 * of the two leaves now contains the target slot.  Helper for split_leaf().
 *
 * @trans:   the running transaction
 * @path:    path whose level 0 is being split; slots[1] is the parent slot
 * @l:       the original (left) leaf
 * @right:   new empty leaf that receives items [mid, nritems)
 * @slot:    the slot in @l the caller wants to end up at
 * @mid:     first slot that moves to @right
 * @nritems: number of items in @l before the split
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
                                    struct btrfs_path *path,
                                    struct extent_buffer *l,
                                    struct extent_buffer *right,
                                    int slot, int mid, int nritems)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int data_copy_size;
        int rt_data_off;
        int i;
        struct btrfs_disk_key disk_key;
        struct btrfs_map_token token;

        /* number of items that end up in the right leaf */
        nritems = nritems - mid;
        btrfs_set_header_nritems(right, nritems);
        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);

        /* copy the item headers for slots [mid, mid + nritems) to slot 0 */
        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
                           btrfs_item_nr_offset(mid),
                           nritems * sizeof(struct btrfs_item));

        /* copy the item data; it lives at the tail end of the leaf */
        copy_extent_buffer(right, l,
                     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
                     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
                     leaf_data_end(l), data_copy_size);

        /*
         * The copied data was pushed to the very end of the right leaf, so
         * every moved item's offset grows by the amount of data that stayed
         * behind in @l.
         */
        rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

        btrfs_init_map_token(&token, right);
        for (i = 0; i < nritems; i++) {
                struct btrfs_item *item = btrfs_item_nr(i);
                u32 ioff;

                ioff = btrfs_token_item_offset(&token, item);
                btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
        }

        btrfs_set_header_nritems(l, mid);
        btrfs_item_key(right, &disk_key, 0);
        /* link the new leaf into the parent, immediately after @l */
        insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

        btrfs_mark_buffer_dirty(right);
        btrfs_mark_buffer_dirty(l);
        BUG_ON(path->slots[0] != slot);

        /* make the path point at whichever leaf now holds the target slot */
        if (mid <= slot) {
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = right;
                path->slots[0] -= mid;
                path->slots[1] += 1;
        } else {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }

        BUG_ON(path->slots[0] < 0);
}
3158
3159 /*
3160  * double splits happen when we need to insert a big item in the middle
3161  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3162  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3163  *          A                 B                 C
3164  *
3165  * We avoid this by trying to push the items on either side of our target
3166  * into the adjacent leaves.  If all goes well we can avoid the double split
3167  * completely.
3168  */
3169 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3170                                           struct btrfs_root *root,
3171                                           struct btrfs_path *path,
3172                                           int data_size)
3173 {
3174         int ret;
3175         int progress = 0;
3176         int slot;
3177         u32 nritems;
3178         int space_needed = data_size;
3179
3180         slot = path->slots[0];
3181         if (slot < btrfs_header_nritems(path->nodes[0]))
3182                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3183
3184         /*
3185          * try to push all the items after our slot into the
3186          * right leaf
3187          */
3188         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3189         if (ret < 0)
3190                 return ret;
3191
3192         if (ret == 0)
3193                 progress++;
3194
3195         nritems = btrfs_header_nritems(path->nodes[0]);
3196         /*
3197          * our goal is to get our slot at the start or end of a leaf.  If
3198          * we've done so we're done
3199          */
3200         if (path->slots[0] == 0 || path->slots[0] == nritems)
3201                 return 0;
3202
3203         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3204                 return 0;
3205
3206         /* try to push all the items before our slot into the next leaf */
3207         slot = path->slots[0];
3208         space_needed = data_size;
3209         if (slot > 0)
3210                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3211         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3212         if (ret < 0)
3213                 return ret;
3214
3215         if (ret == 0)
3216                 progress++;
3217
3218         if (progress)
3219                 return 0;
3220         return 1;
3221 }
3222
3223 /*
3224  * split the path's leaf in two, making sure there is at least data_size
3225  * available for the resulting leaf level of the path.
3226  *
3227  * returns 0 if all went well and < 0 on failure.
3228  */
3229 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3230                                struct btrfs_root *root,
3231                                const struct btrfs_key *ins_key,
3232                                struct btrfs_path *path, int data_size,
3233                                int extend)
3234 {
3235         struct btrfs_disk_key disk_key;
3236         struct extent_buffer *l;
3237         u32 nritems;
3238         int mid;
3239         int slot;
3240         struct extent_buffer *right;
3241         struct btrfs_fs_info *fs_info = root->fs_info;
3242         int ret = 0;
3243         int wret;
3244         int split;
3245         int num_doubles = 0;
3246         int tried_avoid_double = 0;
3247
3248         l = path->nodes[0];
3249         slot = path->slots[0];
3250         if (extend && data_size + btrfs_item_size_nr(l, slot) +
3251             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3252                 return -EOVERFLOW;
3253
3254         /* first try to make some room by pushing left and right */
3255         if (data_size && path->nodes[1]) {
3256                 int space_needed = data_size;
3257
3258                 if (slot < btrfs_header_nritems(l))
3259                         space_needed -= btrfs_leaf_free_space(l);
3260
3261                 wret = push_leaf_right(trans, root, path, space_needed,
3262                                        space_needed, 0, 0);
3263                 if (wret < 0)
3264                         return wret;
3265                 if (wret) {
3266                         space_needed = data_size;
3267                         if (slot > 0)
3268                                 space_needed -= btrfs_leaf_free_space(l);
3269                         wret = push_leaf_left(trans, root, path, space_needed,
3270                                               space_needed, 0, (u32)-1);
3271                         if (wret < 0)
3272                                 return wret;
3273                 }
3274                 l = path->nodes[0];
3275
3276                 /* did the pushes work? */
3277                 if (btrfs_leaf_free_space(l) >= data_size)
3278                         return 0;
3279         }
3280
3281         if (!path->nodes[1]) {
3282                 ret = insert_new_root(trans, root, path, 1);
3283                 if (ret)
3284                         return ret;
3285         }
3286 again:
3287         split = 1;
3288         l = path->nodes[0];
3289         slot = path->slots[0];
3290         nritems = btrfs_header_nritems(l);
3291         mid = (nritems + 1) / 2;
3292
3293         if (mid <= slot) {
3294                 if (nritems == 1 ||
3295                     leaf_space_used(l, mid, nritems - mid) + data_size >
3296                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3297                         if (slot >= nritems) {
3298                                 split = 0;
3299                         } else {
3300                                 mid = slot;
3301                                 if (mid != nritems &&
3302                                     leaf_space_used(l, mid, nritems - mid) +
3303                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3304                                         if (data_size && !tried_avoid_double)
3305                                                 goto push_for_double;
3306                                         split = 2;
3307                                 }
3308                         }
3309                 }
3310         } else {
3311                 if (leaf_space_used(l, 0, mid) + data_size >
3312                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3313                         if (!extend && data_size && slot == 0) {
3314                                 split = 0;
3315                         } else if ((extend || !data_size) && slot == 0) {
3316                                 mid = 1;
3317                         } else {
3318                                 mid = slot;
3319                                 if (mid != nritems &&
3320                                     leaf_space_used(l, mid, nritems - mid) +
3321                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3322                                         if (data_size && !tried_avoid_double)
3323                                                 goto push_for_double;
3324                                         split = 2;
3325                                 }
3326                         }
3327                 }
3328         }
3329
3330         if (split == 0)
3331                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3332         else
3333                 btrfs_item_key(l, &disk_key, mid);
3334
3335         /*
3336          * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3337          * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3338          * subclasses, which is 8 at the time of this patch, and we've maxed it
3339          * out.  In the future we could add a
3340          * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3341          * use BTRFS_NESTING_NEW_ROOT.
3342          */
3343         right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3344                                        &disk_key, 0, l->start, 0,
3345                                        num_doubles ? BTRFS_NESTING_NEW_ROOT :
3346                                        BTRFS_NESTING_SPLIT);
3347         if (IS_ERR(right))
3348                 return PTR_ERR(right);
3349
3350         root_add_used(root, fs_info->nodesize);
3351
3352         if (split == 0) {
3353                 if (mid <= slot) {
3354                         btrfs_set_header_nritems(right, 0);
3355                         insert_ptr(trans, path, &disk_key,
3356                                    right->start, path->slots[1] + 1, 1);
3357                         btrfs_tree_unlock(path->nodes[0]);
3358                         free_extent_buffer(path->nodes[0]);
3359                         path->nodes[0] = right;
3360                         path->slots[0] = 0;
3361                         path->slots[1] += 1;
3362                 } else {
3363                         btrfs_set_header_nritems(right, 0);
3364                         insert_ptr(trans, path, &disk_key,
3365                                    right->start, path->slots[1], 1);
3366                         btrfs_tree_unlock(path->nodes[0]);
3367                         free_extent_buffer(path->nodes[0]);
3368                         path->nodes[0] = right;
3369                         path->slots[0] = 0;
3370                         if (path->slots[1] == 0)
3371                                 fixup_low_keys(path, &disk_key, 1);
3372                 }
3373                 /*
3374                  * We create a new leaf 'right' for the required ins_len and
3375                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3376                  * the content of ins_len to 'right'.
3377                  */
3378                 return ret;
3379         }
3380
3381         copy_for_split(trans, path, l, right, slot, mid, nritems);
3382
3383         if (split == 2) {
3384                 BUG_ON(num_doubles != 0);
3385                 num_doubles++;
3386                 goto again;
3387         }
3388
3389         return 0;
3390
3391 push_for_double:
3392         push_for_double_split(trans, root, path, data_size);
3393         tried_avoid_double = 1;
3394         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3395                 return 0;
3396         goto again;
3397 }
3398
/*
 * Make sure the leaf at path->nodes[0] has at least @ins_len bytes of free
 * space so the item at path->slots[0] can be split in place.  If it does
 * not, the path is released, the key is re-searched with the
 * search_for_split flag set, and the leaf is split.
 *
 * Only EXTENT_DATA and EXTENT_CSUM items are supported (BUG()s otherwise).
 *
 * Returns 0 on success, -EAGAIN if the item changed while the path was
 * dropped (caller must retry), or another negative errno on error.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root,
                                         struct btrfs_path *path, int ins_len)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        u64 extent_len = 0;
        u32 item_size;
        int ret;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
               key.type != BTRFS_EXTENT_CSUM_KEY);

        if (btrfs_leaf_free_space(leaf) >= ins_len)
                return 0;

        /*
         * Remember the item's size (and extent length for EXTENT_DATA) so
         * we can tell if it changed while the path was released.
         */
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
        if (key.type == BTRFS_EXTENT_DATA_KEY) {
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
        }
        btrfs_release_path(path);

        path->keep_locks = 1;
        path->search_for_split = 1;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        path->search_for_split = 0;
        /* a positive return means the exact key vanished meanwhile */
        if (ret > 0)
                ret = -EAGAIN;
        if (ret < 0)
                goto err;

        ret = -EAGAIN;
        leaf = path->nodes[0];
        /* if our item isn't there, return now */
        if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
                goto err;

        /* the leaf has changed, it now has room.  return now */
        if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
                goto err;

        if (key.type == BTRFS_EXTENT_DATA_KEY) {
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
                        goto err;
        }

        /* extend == 1: we only need room, the item itself is not growing */
        ret = split_leaf(trans, root, &key, path, ins_len, 1);
        if (ret)
                goto err;

        path->keep_locks = 0;
        btrfs_unlock_up_safe(path, 1);
        return 0;
err:
        path->keep_locks = 0;
        return ret;
}
3464
/*
 * Split the item at path->slots[0] in two at @split_offset bytes from the
 * start of its data.  The original item keeps its key and the first
 * @split_offset bytes; a new item with @new_key gets the rest.  The leaf
 * must already have room for one extra struct btrfs_item — only the item
 * headers move, the data bytes stay where they are and are re-pointed.
 *
 * Returns 0 on success or -ENOMEM if the temporary copy cannot be
 * allocated.
 */
static noinline int split_item(struct btrfs_path *path,
                               const struct btrfs_key *new_key,
                               unsigned long split_offset)
{
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        struct btrfs_item *new_item;
        int slot;
        char *buf;
        u32 nritems;
        u32 item_size;
        u32 orig_offset;
        struct btrfs_disk_key disk_key;

        leaf = path->nodes[0];
        BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

        item = btrfs_item_nr(path->slots[0]);
        orig_offset = btrfs_item_offset(leaf, item);
        item_size = btrfs_item_size(leaf, item);

        /* stash a copy of the item data; we rewrite both halves below */
        buf = kmalloc(item_size, GFP_NOFS);
        if (!buf)
                return -ENOMEM;

        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
                            path->slots[0]), item_size);

        /* make room for the new item header right after the original */
        slot = path->slots[0] + 1;
        nritems = btrfs_header_nritems(leaf);
        if (slot != nritems) {
                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
                                btrfs_item_nr_offset(slot),
                                (nritems - slot) * sizeof(struct btrfs_item));
        }

        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(leaf, &disk_key, slot);

        new_item = btrfs_item_nr(slot);

        /* the new item takes the tail of the original data */
        btrfs_set_item_offset(leaf, new_item, orig_offset);
        btrfs_set_item_size(leaf, new_item, item_size - split_offset);

        /* the original item shrinks to the head of the data */
        btrfs_set_item_offset(leaf, item,
                              orig_offset + item_size - split_offset);
        btrfs_set_item_size(leaf, item, split_offset);

        btrfs_set_header_nritems(leaf, nritems + 1);

        /* write the data for the start of the original item */
        write_extent_buffer(leaf, buf,
                            btrfs_item_ptr_offset(leaf, path->slots[0]),
                            split_offset);

        /* write the data for the new item */
        write_extent_buffer(leaf, buf + split_offset,
                            btrfs_item_ptr_offset(leaf, slot),
                            item_size - split_offset);
        btrfs_mark_buffer_dirty(leaf);

        BUG_ON(btrfs_leaf_free_space(leaf) < 0);
        kfree(buf);
        return 0;
}
3531
3532 /*
3533  * This function splits a single item into two items,
3534  * giving 'new_key' to the new item and splitting the
3535  * old one at split_offset (from the start of the item).
3536  *
3537  * The path may be released by this operation.  After
3538  * the split, the path is pointing to the old item.  The
3539  * new item is going to be in the same node as the old one.
3540  *
3541  * Note, the item being split must be smaller enough to live alone on
3542  * a tree block with room for one extra struct btrfs_item
3543  *
3544  * This allows us to split the item in place, keeping a lock on the
3545  * leaf the entire time.
3546  */
3547 int btrfs_split_item(struct btrfs_trans_handle *trans,
3548                      struct btrfs_root *root,
3549                      struct btrfs_path *path,
3550                      const struct btrfs_key *new_key,
3551                      unsigned long split_offset)
3552 {
3553         int ret;
3554         ret = setup_leaf_for_split(trans, root, path,
3555                                    sizeof(struct btrfs_item));
3556         if (ret)
3557                 return ret;
3558
3559         ret = split_item(path, new_key, split_offset);
3560         return ret;
3561 }
3562
3563 /*
3564  * This function duplicate a item, giving 'new_key' to the new item.
3565  * It guarantees both items live in the same tree leaf and the new item
3566  * is contiguous with the original item.
3567  *
3568  * This allows us to split file extent in place, keeping a lock on the
3569  * leaf the entire time.
3570  */
3571 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3572                          struct btrfs_root *root,
3573                          struct btrfs_path *path,
3574                          const struct btrfs_key *new_key)
3575 {
3576         struct extent_buffer *leaf;
3577         int ret;
3578         u32 item_size;
3579
3580         leaf = path->nodes[0];
3581         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3582         ret = setup_leaf_for_split(trans, root, path,
3583                                    item_size + sizeof(struct btrfs_item));
3584         if (ret)
3585                 return ret;
3586
3587         path->slots[0]++;
3588         setup_items_for_insert(root, path, new_key, &item_size, 1);
3589         leaf = path->nodes[0];
3590         memcpy_extent_buffer(leaf,
3591                              btrfs_item_ptr_offset(leaf, path->slots[0]),
3592                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3593                              item_size);
3594         return 0;
3595 }
3596
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 *
 * Truncating from the front also advances the item's key offset by the
 * number of bytes removed (fixing up parent keys when slot 0 changes).
 * The leaf is marked dirty before returning.
 */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data_start;
        unsigned int old_size;
        unsigned int size_diff;
        int i;
        struct btrfs_map_token token;

        leaf = path->nodes[0];
        slot = path->slots[0];

        old_size = btrfs_item_size_nr(leaf, slot);
        if (old_size == new_size)
                return;

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(leaf);

        old_data_start = btrfs_item_offset_nr(leaf, slot);

        size_diff = old_size - new_size;

        BUG_ON(slot < 0);
        BUG_ON(slot >= nritems);

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        btrfs_init_map_token(&token, leaf);
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(i);

                ioff = btrfs_token_item_offset(&token, item);
                btrfs_set_token_item_offset(&token, item, ioff + size_diff);
        }

        /* shift the data */
        if (from_end) {
                /* drop the tail: slide the lower data up by size_diff */
                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
                              data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
                              data_end, old_data_start + new_size - data_end);
        } else {
                struct btrfs_disk_key disk_key;
                u64 offset;

                btrfs_item_key(leaf, &disk_key, slot);

                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
                        unsigned long ptr;
                        struct btrfs_file_extent_item *fi;

                        fi = btrfs_item_ptr(leaf, slot,
                                            struct btrfs_file_extent_item);
                        fi = (struct btrfs_file_extent_item *)(
                             (unsigned long)fi - size_diff);

                        if (btrfs_file_extent_type(leaf, fi) ==
                            BTRFS_FILE_EXTENT_INLINE) {
                                /*
                                 * An inline extent keeps its header; move the
                                 * header so it still precedes the shifted
                                 * inline data.
                                 */
                                ptr = btrfs_item_ptr_offset(leaf, slot);
                                memmove_extent_buffer(leaf, ptr,
                                      (unsigned long)fi,
                                      BTRFS_FILE_EXTENT_INLINE_DATA_START);
                        }
                }

                /* drop the head: slide the data below the item up */
                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
                              data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
                              data_end, old_data_start - data_end);

                /* the key offset advances past the bytes we removed */
                offset = btrfs_disk_key_offset(&disk_key);
                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
                btrfs_set_item_key(leaf, &disk_key, slot);
                if (slot == 0)
                        fixup_low_keys(path, &disk_key, 1);
        }

        item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);

        if (btrfs_leaf_free_space(leaf) < 0) {
                btrfs_print_leaf(leaf);
                BUG();
        }
}
3695
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 *
 * The leaf must already have at least data_size bytes of free space
 * (BUG()s otherwise).  The new bytes appear at the end of the item and
 * are left for the caller to fill in; the leaf is marked dirty before
 * returning.
 */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data;
        unsigned int old_size;
        int i;
        struct btrfs_map_token token;

        leaf = path->nodes[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(leaf);

        if (btrfs_leaf_free_space(leaf) < data_size) {
                btrfs_print_leaf(leaf);
                BUG();
        }
        slot = path->slots[0];
        old_data = btrfs_item_end_nr(leaf, slot);

        BUG_ON(slot < 0);
        if (slot >= nritems) {
                btrfs_print_leaf(leaf);
                btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
                           slot, nritems);
                BUG();
        }

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        btrfs_init_map_token(&token, leaf);
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(i);

                ioff = btrfs_token_item_offset(&token, item);
                btrfs_set_token_item_offset(&token, item, ioff - data_size);
        }

        /* shift the data down to open up data_size bytes for this item */
        memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
                      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
                      data_end, old_data - data_end);

        data_end = old_data;
        old_size = btrfs_item_size_nr(leaf, slot);
        item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);

        if (btrfs_leaf_free_space(leaf) < 0) {
                btrfs_print_leaf(leaf);
                BUG();
        }
}
3760
/**
 * setup_items_for_insert - Helper called before inserting one or more items
 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
 * in a function that doesn't call btrfs_search_slot
 *
 * @root:       root we are inserting items to
 * @path:       points to the leaf/slot where we are going to insert new items
 * @cpu_key:    array of keys for items to be inserted
 * @data_size:  size of the body of each item we are going to insert
 * @nr:         size of @cpu_key/@data_size arrays
 *
 * The leaf must be locked and have enough free space for all items
 * (BUG()s otherwise).  Only the item headers are set up; the item bodies
 * are left for the caller to fill in.
 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
                            const struct btrfs_key *cpu_key, u32 *data_size,
                            int nr)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_item *item;
        int i;
        u32 nritems;
        unsigned int data_end;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *leaf;
        int slot;
        struct btrfs_map_token token;
        u32 total_size;
        u32 total_data = 0;

        for (i = 0; i < nr; i++)
                total_data += data_size[i];
        total_size = total_data + (nr * sizeof(struct btrfs_item));

        /* inserting at slot 0 changes the leaf's lowest key; fix parents */
        if (path->slots[0] == 0) {
                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                fixup_low_keys(path, &disk_key, 1);
        }
        btrfs_unlock_up_safe(path, 1);

        leaf = path->nodes[0];
        slot = path->slots[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(leaf);

        if (btrfs_leaf_free_space(leaf) < total_size) {
                btrfs_print_leaf(leaf);
                btrfs_crit(fs_info, "not enough freespace need %u have %d",
                           total_size, btrfs_leaf_free_space(leaf));
                BUG();
        }

        btrfs_init_map_token(&token, leaf);
        /* inserting in the middle: make room for the new headers and data */
        if (slot != nritems) {
                unsigned int old_data = btrfs_item_end_nr(leaf, slot);

                if (old_data < data_end) {
                        btrfs_print_leaf(leaf);
                        btrfs_crit(fs_info,
                "item at slot %d with data offset %u beyond data end of leaf %u",
                                   slot, old_data, data_end);
                        BUG();
                }
                /*
                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
                 */
                /* first correct the data pointers */
                for (i = slot; i < nritems; i++) {
                        u32 ioff;

                        item = btrfs_item_nr(i);
                        ioff = btrfs_token_item_offset(&token, item);
                        btrfs_set_token_item_offset(&token, item,
                                                    ioff - total_data);
                }
                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
                              btrfs_item_nr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_item));

                /* shift the data */
                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
                              data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
                              data_end, old_data - data_end);
                data_end = old_data;
        }

        /* setup the item for the new data */
        for (i = 0; i < nr; i++) {
                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
                item = btrfs_item_nr(slot + i);
                data_end -= data_size[i];
                btrfs_set_token_item_offset(&token, item, data_end);
                btrfs_set_token_item_size(&token, item, data_size[i]);
        }

        btrfs_set_header_nritems(leaf, nritems + nr);
        btrfs_mark_buffer_dirty(leaf);

        if (btrfs_leaf_free_space(leaf) < 0) {
                btrfs_print_leaf(leaf);
                BUG();
        }
}
3864
3865 /*
3866  * Given a key and some data, insert items into the tree.
3867  * This does all the path init required, making room in the tree if needed.
3868  */
3869 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3870                             struct btrfs_root *root,
3871                             struct btrfs_path *path,
3872                             const struct btrfs_key *cpu_key, u32 *data_size,
3873                             int nr)
3874 {
3875         int ret = 0;
3876         int slot;
3877         int i;
3878         u32 total_size = 0;
3879         u32 total_data = 0;
3880
3881         for (i = 0; i < nr; i++)
3882                 total_data += data_size[i];
3883
3884         total_size = total_data + (nr * sizeof(struct btrfs_item));
3885         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3886         if (ret == 0)
3887                 return -EEXIST;
3888         if (ret < 0)
3889                 return ret;
3890
3891         slot = path->slots[0];
3892         BUG_ON(slot < 0);
3893
3894         setup_items_for_insert(root, path, cpu_key, data_size, nr);
3895         return 0;
3896 }
3897
3898 /*
3899  * Given a key and some data, insert an item into the tree.
3900  * This does all the path init required, making room in the tree if needed.
3901  */
3902 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3903                       const struct btrfs_key *cpu_key, void *data,
3904                       u32 data_size)
3905 {
3906         int ret = 0;
3907         struct btrfs_path *path;
3908         struct extent_buffer *leaf;
3909         unsigned long ptr;
3910
3911         path = btrfs_alloc_path();
3912         if (!path)
3913                 return -ENOMEM;
3914         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3915         if (!ret) {
3916                 leaf = path->nodes[0];
3917                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3918                 write_extent_buffer(leaf, data, ptr, data_size);
3919                 btrfs_mark_buffer_dirty(leaf);
3920         }
3921         btrfs_free_path(path);
3922         return ret;
3923 }
3924
3925 /*
3926  * delete the pointer from a given node.
3927  *
3928  * the tree should have been previously balanced so the deletion does not
3929  * empty a node.
3930  */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/*
		 * Removing a middle slot: record the move in the tree mod
		 * log (internal nodes only, level > 0) and shift every key
		 * pointer after @slot one position to the left.
		 */
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* Last slot: nothing to shift, just log the key removal. */
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
				BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/*
		 * The smallest key in this node changed, propagate the new
		 * first key to the nodes above via the path.
		 */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
3970
3971 /*
3972  * a helper function to delete the leaf pointed to by path->slots[1] and
3973  * path->nodes[1].
3974  *
3975  * This deletes the pointer in path->nodes[1] and frees the leaf
3976  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3977  *
3978  * The path must have already been setup for deleting the leaf, including
3979  * all the proper balancing.  path->nodes[1] must be locked.
3980  */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* The leaf must have been COWed in the current transaction. */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/*
	 * Pin an extra reference so the buffer stays valid across
	 * btrfs_free_tree_block(); drop it afterwards as a stale buffer.
	 */
	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
4001 /*
4002  * delete the item at the leaf level in path.  If that empties
4003  * the leaf, remove it from the tree
4004  */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	/* Data offset of the last item being deleted (data grows downward). */
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* Total bytes of item data occupied by the deleted range. */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		/* Close the hole by shifting the trailing item data up. */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* Fix the data offsets of every item after the deleted range. */
		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(&token, item);
			btrfs_set_token_item_offset(&token, item, ioff + dsize);
		}

		/* Shift the item headers over the deleted ones. */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			/* First key changed, propagate it to the parents. */
			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);

			/* -ENOSPC here just means "could not push", not an error. */
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* Everything was pushed away, drop the leaf. */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
4112
4113 /*
4114  * search the tree again to find a leaf with lesser keys
4115  * returns 0 if it found something or 1 if there are no lesser leaves.
4116  * returns < 0 on io errors.
4117  *
4118  * This may release the path, and so you may lose any locks held at the
4119  * time you call it.
4120  */
4121 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4122 {
4123         struct btrfs_key key;
4124         struct btrfs_disk_key found_key;
4125         int ret;
4126
4127         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4128
4129         if (key.offset > 0) {
4130                 key.offset--;
4131         } else if (key.type > 0) {
4132                 key.type--;
4133                 key.offset = (u64)-1;
4134         } else if (key.objectid > 0) {
4135                 key.objectid--;
4136                 key.type = (u8)-1;
4137                 key.offset = (u64)-1;
4138         } else {
4139                 return 1;
4140         }
4141
4142         btrfs_release_path(path);
4143         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4144         if (ret < 0)
4145                 return ret;
4146         btrfs_item_key(path->nodes[0], &found_key, 0);
4147         ret = comp_keys(&found_key, &key);
4148         /*
4149          * We might have had an item with the previous key in the tree right
4150          * before we released our path. And after we released our path, that
4151          * item might have been pushed to the first slot (0) of the leaf we
4152          * were holding due to a tree balance. Alternatively, an item with the
4153          * previous key can exist as the only element of a leaf (big fat item).
4154          * Therefore account for these 2 cases, so that our callers (like
4155          * btrfs_previous_item) don't miss an existing item with a key matching
4156          * the previous key we computed above.
4157          */
4158         if (ret <= 0)
4159                 return 0;
4160         return 1;
4161 }
4162
4163 /*
4164  * A helper function to walk down the tree starting at min_key, and looking
4165  * for nodes or leaves that are have a minimum transaction id.
4166  * This is used by the btree defrag code, and tree logging
4167  *
4168  * This does not cow, but it does stuff the starting key it finds back
4169  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4170  * key and get a writable path.
4171  *
4172  * This honors path->lowest_level to prevent descent past a given level
4173  * of the tree.
4174  *
4175  * min_trans indicates the oldest transaction that you are interested
4176  * in walking through.  Any nodes or leaves older than min_trans are
4177  * skipped over (without reading them).
4178  *
4179  * returns zero if something useful was found, < 0 on error and 1 if there
4180  * was nothing in the tree that matched the search criteria.
4181  */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	/* Hold locks along the whole path; restored before returning. */
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* The whole tree is older than min_trans, nothing can match. */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/* No exact match: step back to the slot covering min_key. */
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				/* Found a next key, restart from the root. */
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		/* Descend into the child block at @slot. */
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		/* Keep only lowest_level locked and hand back the found key. */
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
4284
4285 /*
4286  * this is similar to btrfs_next_leaf, but does not try to preserve
4287  * and fixup the path.  It looks for and returns the next key in the
4288  * tree based on the current path and the min_trans parameters.
4289  *
4290  * 0 is returned if another key is found, < 0 if there are any errors
4291  * and 1 is returned if there are no higher keys in the tree
4292  *
4293  * path->keep_locks should be set to 1 on the search made before
4294  * calling this function.
4295  */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* Look at the slot right after the current position. */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/*
			 * Parent is locked (or locking is skipped): safe to
			 * continue the walk one level up.
			 */
			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			/*
			 * Parent isn't locked: re-search from the last key
			 * of this block to rebuild a valid path down to
			 * @level, then retry from the slot we land on.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* Exact match: advance past the key we searched for. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* Skip node pointers older than min_trans. */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
4360
4361 /*
4362  * search the tree again to find a leaf with greater keys
4363  * returns 0 if it found something or 1 if there are no greater leaves.
4364  * returns < 0 on io errors.
4365  */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0 selects the current tree (no tree mod log replay). */
	return btrfs_next_old_leaf(root, path, 0);
}
4370
4371 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4372                         u64 time_seq)
4373 {
4374         int slot;
4375         int level;
4376         struct extent_buffer *c;
4377         struct extent_buffer *next;
4378         struct btrfs_key key;
4379         u32 nritems;
4380         int ret;
4381         int i;
4382
4383         nritems = btrfs_header_nritems(path->nodes[0]);
4384         if (nritems == 0)
4385                 return 1;
4386
4387         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4388 again:
4389         level = 1;
4390         next = NULL;
4391         btrfs_release_path(path);
4392
4393         path->keep_locks = 1;
4394
4395         if (time_seq)
4396                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4397         else
4398                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4399         path->keep_locks = 0;
4400
4401         if (ret < 0)
4402                 return ret;
4403
4404         nritems = btrfs_header_nritems(path->nodes[0]);
4405         /*
4406          * by releasing the path above we dropped all our locks.  A balance
4407          * could have added more items next to the key that used to be
4408          * at the very end of the block.  So, check again here and
4409          * advance the path if there are now more items available.
4410          */
4411         if (nritems > 0 && path->slots[0] < nritems - 1) {
4412                 if (ret == 0)
4413                         path->slots[0]++;
4414                 ret = 0;
4415                 goto done;
4416         }
4417         /*
4418          * So the above check misses one case:
4419          * - after releasing the path above, someone has removed the item that
4420          *   used to be at the very end of the block, and balance between leafs
4421          *   gets another one with bigger key.offset to replace it.
4422          *
4423          * This one should be returned as well, or we can get leaf corruption
4424          * later(esp. in __btrfs_drop_extents()).
4425          *
4426          * And a bit more explanation about this check,
4427          * with ret > 0, the key isn't found, the path points to the slot
4428          * where it should be inserted, so the path->slots[0] item must be the
4429          * bigger one.
4430          */
4431         if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4432                 ret = 0;
4433                 goto done;
4434         }
4435
4436         while (level < BTRFS_MAX_LEVEL) {
4437                 if (!path->nodes[level]) {
4438                         ret = 1;
4439                         goto done;
4440                 }
4441
4442                 slot = path->slots[level] + 1;
4443                 c = path->nodes[level];
4444                 if (slot >= btrfs_header_nritems(c)) {
4445                         level++;
4446                         if (level == BTRFS_MAX_LEVEL) {
4447                                 ret = 1;
4448                                 goto done;
4449                         }
4450                         continue;
4451                 }
4452
4453
4454                 /*
4455                  * Our current level is where we're going to start from, and to
4456                  * make sure lockdep doesn't complain we need to drop our locks
4457                  * and nodes from 0 to our current level.
4458                  */
4459                 for (i = 0; i < level; i++) {
4460                         if (path->locks[level]) {
4461                                 btrfs_tree_read_unlock(path->nodes[i]);
4462                                 path->locks[i] = 0;
4463                         }
4464                         free_extent_buffer(path->nodes[i]);
4465                         path->nodes[i] = NULL;
4466                 }
4467
4468                 next = c;
4469                 ret = read_block_for_search(root, path, &next, level,
4470                                             slot, &key);
4471                 if (ret == -EAGAIN)
4472                         goto again;
4473
4474                 if (ret < 0) {
4475                         btrfs_release_path(path);
4476                         goto done;
4477                 }
4478
4479                 if (!path->skip_locking) {
4480                         ret = btrfs_try_tree_read_lock(next);
4481                         if (!ret && time_seq) {
4482                                 /*
4483                                  * If we don't get the lock, we may be racing
4484                                  * with push_leaf_left, holding that lock while
4485                                  * itself waiting for the leaf we've currently
4486                                  * locked. To solve this situation, we give up
4487                                  * on our lock and cycle.
4488                                  */
4489                                 free_extent_buffer(next);
4490                                 btrfs_release_path(path);
4491                                 cond_resched();
4492                                 goto again;
4493                         }
4494                         if (!ret)
4495                                 btrfs_tree_read_lock(next);
4496                 }
4497                 break;
4498         }
4499         path->slots[level] = slot;
4500         while (1) {
4501                 level--;
4502                 path->nodes[level] = next;
4503                 path->slots[level] = 0;
4504                 if (!path->skip_locking)
4505                         path->locks[level] = BTRFS_READ_LOCK;
4506                 if (!level)
4507                         break;
4508
4509                 ret = read_block_for_search(root, path, &next, level,
4510                                             0, &key);
4511                 if (ret == -EAGAIN)
4512                         goto again;
4513
4514                 if (ret < 0) {
4515                         btrfs_release_path(path);
4516                         goto done;
4517                 }
4518
4519                 if (!path->skip_locking)
4520                         btrfs_tree_read_lock(next);
4521         }
4522         ret = 0;
4523 done:
4524         unlock_up(path, 0, 1, 0, NULL);
4525
4526         return ret;
4527 }
4528
4529 /*
4530  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4531  * searching until it gets past min_objectid or finds an item of 'type'
4532  *
4533  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4534  */
4535 int btrfs_previous_item(struct btrfs_root *root,
4536                         struct btrfs_path *path, u64 min_objectid,
4537                         int type)
4538 {
4539         struct btrfs_key found_key;
4540         struct extent_buffer *leaf;
4541         u32 nritems;
4542         int ret;
4543
4544         while (1) {
4545                 if (path->slots[0] == 0) {
4546                         ret = btrfs_prev_leaf(root, path);
4547                         if (ret != 0)
4548                                 return ret;
4549                 } else {
4550                         path->slots[0]--;
4551                 }
4552                 leaf = path->nodes[0];
4553                 nritems = btrfs_header_nritems(leaf);
4554                 if (nritems == 0)
4555                         return 1;
4556                 if (path->slots[0] == nritems)
4557                         path->slots[0]--;
4558
4559                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4560                 if (found_key.objectid < min_objectid)
4561                         break;
4562                 if (found_key.type == type)
4563                         return 0;
4564                 if (found_key.objectid == min_objectid &&
4565                     found_key.type < type)
4566                         break;
4567         }
4568         return 1;
4569 }
4570
4571 /*
4572  * search in extent tree to find a previous Metadata/Data extent item with
4573  * min objecitd.
4574  *
4575  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4576  */
4577 int btrfs_previous_extent_item(struct btrfs_root *root,
4578                         struct btrfs_path *path, u64 min_objectid)
4579 {
4580         struct btrfs_key found_key;
4581         struct extent_buffer *leaf;
4582         u32 nritems;
4583         int ret;
4584
4585         while (1) {
4586                 if (path->slots[0] == 0) {
4587                         ret = btrfs_prev_leaf(root, path);
4588                         if (ret != 0)
4589                                 return ret;
4590                 } else {
4591                         path->slots[0]--;
4592                 }
4593                 leaf = path->nodes[0];
4594                 nritems = btrfs_header_nritems(leaf);
4595                 if (nritems == 0)
4596                         return 1;
4597                 if (path->slots[0] == nritems)
4598                         path->slots[0]--;
4599
4600                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4601                 if (found_key.objectid < min_objectid)
4602                         break;
4603                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4604                     found_key.type == BTRFS_METADATA_ITEM_KEY)
4605                         return 0;
4606                 if (found_key.objectid == min_objectid &&
4607                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
4608                         break;
4609         }
4610         return 1;
4611 }