btrfs: improve error message in setup_items_for_insert
[linux-2.6-microblaze.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17
18 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
19                       *root, struct btrfs_path *path, int level);
20 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
21                       const struct btrfs_key *ins_key, struct btrfs_path *path,
22                       int data_size, int extend);
23 static int push_node_left(struct btrfs_trans_handle *trans,
24                           struct extent_buffer *dst,
25                           struct extent_buffer *src, int empty);
26 static int balance_node_right(struct btrfs_trans_handle *trans,
27                               struct extent_buffer *dst_buf,
28                               struct extent_buffer *src_buf);
29 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
30                     int level, int slot);
31
32 static const struct btrfs_csums {
33         u16             size;
34         const char      name[10];
35         const char      driver[12];
36 } btrfs_csums[] = {
37         [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
38         [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
39         [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
40         [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
41                                      .driver = "blake2b-256" },
42 };
43
44 int btrfs_super_csum_size(const struct btrfs_super_block *s)
45 {
46         u16 t = btrfs_super_csum_type(s);
47         /*
48          * csum type is validated at mount time
49          */
50         return btrfs_csums[t].size;
51 }
52
53 const char *btrfs_super_csum_name(u16 csum_type)
54 {
55         /* csum type is validated at mount time */
56         return btrfs_csums[csum_type].name;
57 }
58
59 /*
60  * Return driver name if defined, otherwise the name that's also a valid driver
61  * name
62  */
63 const char *btrfs_super_csum_driver(u16 csum_type)
64 {
65         /* csum type is validated at mount time */
66         return btrfs_csums[csum_type].driver[0] ?
67                 btrfs_csums[csum_type].driver :
68                 btrfs_csums[csum_type].name;
69 }
70
71 size_t __attribute_const__ btrfs_get_num_csums(void)
72 {
73         return ARRAY_SIZE(btrfs_csums);
74 }
75
76 struct btrfs_path *btrfs_alloc_path(void)
77 {
78         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
79 }
80
81 /* this also releases the path */
82 void btrfs_free_path(struct btrfs_path *p)
83 {
84         if (!p)
85                 return;
86         btrfs_release_path(p);
87         kmem_cache_free(btrfs_path_cachep, p);
88 }
89
90 /*
91  * path release drops references on the extent buffers in the path
92  * and it drops any locks held by this path
93  *
94  * It is safe to call this on paths that no locks or extent buffers held.
95  */
96 noinline void btrfs_release_path(struct btrfs_path *p)
97 {
98         int i;
99
100         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
101                 p->slots[i] = 0;
102                 if (!p->nodes[i])
103                         continue;
104                 if (p->locks[i]) {
105                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
106                         p->locks[i] = 0;
107                 }
108                 free_extent_buffer(p->nodes[i]);
109                 p->nodes[i] = NULL;
110         }
111 }
112
113 /*
114  * safely gets a reference on the root node of a tree.  A lock
115  * is not taken, so a concurrent writer may put a different node
116  * at the root of the tree.  See btrfs_lock_root_node for the
117  * looping required.
118  *
119  * The extent buffer returned by this has a reference taken, so
120  * it won't disappear.  It may stop being the root of the tree
121  * at any time because there are no locks held.
122  */
123 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
124 {
125         struct extent_buffer *eb;
126
127         while (1) {
128                 rcu_read_lock();
129                 eb = rcu_dereference(root->node);
130
131                 /*
132                  * RCU really hurts here, we could free up the root node because
133                  * it was COWed but we may not get the new root node yet so do
134                  * the inc_not_zero dance and if it doesn't work then
135                  * synchronize_rcu and try again.
136                  */
137                 if (atomic_inc_not_zero(&eb->refs)) {
138                         rcu_read_unlock();
139                         break;
140                 }
141                 rcu_read_unlock();
142                 synchronize_rcu();
143         }
144         return eb;
145 }
146
147 /*
148  * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
149  * just get put onto a simple dirty list.  Transaction walks this list to make
150  * sure they get properly updated on disk.
151  */
152 static void add_root_to_dirty_list(struct btrfs_root *root)
153 {
154         struct btrfs_fs_info *fs_info = root->fs_info;
155
156         if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
157             !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
158                 return;
159
160         spin_lock(&fs_info->trans_lock);
161         if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
162                 /* Want the extent tree to be the last on the list */
163                 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
164                         list_move_tail(&root->dirty_list,
165                                        &fs_info->dirty_cowonly_roots);
166                 else
167                         list_move(&root->dirty_list,
168                                   &fs_info->dirty_cowonly_roots);
169         }
170         spin_unlock(&fs_info->trans_lock);
171 }
172
173 /*
174  * used by snapshot creation to make a copy of a root for a tree with
175  * a given objectid.  The buffer with the new root node is returned in
176  * cow_ret, and this func returns zero on success or a negative error code.
177  */
178 int btrfs_copy_root(struct btrfs_trans_handle *trans,
179                       struct btrfs_root *root,
180                       struct extent_buffer *buf,
181                       struct extent_buffer **cow_ret, u64 new_root_objectid)
182 {
183         struct btrfs_fs_info *fs_info = root->fs_info;
184         struct extent_buffer *cow;
185         int ret = 0;
186         int level;
187         struct btrfs_disk_key disk_key;
188
189         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
190                 trans->transid != fs_info->running_transaction->transid);
191         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
192                 trans->transid != root->last_trans);
193
194         level = btrfs_header_level(buf);
195         if (level == 0)
196                 btrfs_item_key(buf, &disk_key, 0);
197         else
198                 btrfs_node_key(buf, &disk_key, 0);
199
200         cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
201                                      &disk_key, level, buf->start, 0,
202                                      BTRFS_NESTING_NEW_ROOT);
203         if (IS_ERR(cow))
204                 return PTR_ERR(cow);
205
206         copy_extent_buffer_full(cow, buf);
207         btrfs_set_header_bytenr(cow, cow->start);
208         btrfs_set_header_generation(cow, trans->transid);
209         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
210         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
211                                      BTRFS_HEADER_FLAG_RELOC);
212         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
213                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
214         else
215                 btrfs_set_header_owner(cow, new_root_objectid);
216
217         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
218
219         WARN_ON(btrfs_header_generation(buf) > trans->transid);
220         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
221                 ret = btrfs_inc_ref(trans, root, cow, 1);
222         else
223                 ret = btrfs_inc_ref(trans, root, cow, 0);
224
225         if (ret)
226                 return ret;
227
228         btrfs_mark_buffer_dirty(cow);
229         *cow_ret = cow;
230         return 0;
231 }
232
233 enum mod_log_op {
234         MOD_LOG_KEY_REPLACE,
235         MOD_LOG_KEY_ADD,
236         MOD_LOG_KEY_REMOVE,
237         MOD_LOG_KEY_REMOVE_WHILE_FREEING,
238         MOD_LOG_KEY_REMOVE_WHILE_MOVING,
239         MOD_LOG_MOVE_KEYS,
240         MOD_LOG_ROOT_REPLACE,
241 };
242
243 struct tree_mod_root {
244         u64 logical;
245         u8 level;
246 };
247
248 struct tree_mod_elem {
249         struct rb_node node;
250         u64 logical;
251         u64 seq;
252         enum mod_log_op op;
253
254         /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
255         int slot;
256
257         /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
258         u64 generation;
259
260         /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
261         struct btrfs_disk_key key;
262         u64 blockptr;
263
264         /* this is used for op == MOD_LOG_MOVE_KEYS */
265         struct {
266                 int dst_slot;
267                 int nr_items;
268         } move;
269
270         /* this is used for op == MOD_LOG_ROOT_REPLACE */
271         struct tree_mod_root old_root;
272 };
273
274 /*
275  * Pull a new tree mod seq number for our operation.
276  */
277 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
278 {
279         return atomic64_inc_return(&fs_info->tree_mod_seq);
280 }
281
282 /*
283  * This adds a new blocker to the tree mod log's blocker list if the @elem
284  * passed does not already have a sequence number set. So when a caller expects
285  * to record tree modifications, it should ensure to set elem->seq to zero
286  * before calling btrfs_get_tree_mod_seq.
287  * Returns a fresh, unused tree log modification sequence number, even if no new
288  * blocker was added.
289  */
290 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
291                            struct seq_list *elem)
292 {
293         write_lock(&fs_info->tree_mod_log_lock);
294         if (!elem->seq) {
295                 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
296                 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
297         }
298         write_unlock(&fs_info->tree_mod_log_lock);
299
300         return elem->seq;
301 }
302
303 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
304                             struct seq_list *elem)
305 {
306         struct rb_root *tm_root;
307         struct rb_node *node;
308         struct rb_node *next;
309         struct tree_mod_elem *tm;
310         u64 min_seq = (u64)-1;
311         u64 seq_putting = elem->seq;
312
313         if (!seq_putting)
314                 return;
315
316         write_lock(&fs_info->tree_mod_log_lock);
317         list_del(&elem->list);
318         elem->seq = 0;
319
320         if (!list_empty(&fs_info->tree_mod_seq_list)) {
321                 struct seq_list *first;
322
323                 first = list_first_entry(&fs_info->tree_mod_seq_list,
324                                          struct seq_list, list);
325                 if (seq_putting > first->seq) {
326                         /*
327                          * Blocker with lower sequence number exists, we
328                          * cannot remove anything from the log.
329                          */
330                         write_unlock(&fs_info->tree_mod_log_lock);
331                         return;
332                 }
333                 min_seq = first->seq;
334         }
335
336         /*
337          * anything that's lower than the lowest existing (read: blocked)
338          * sequence number can be removed from the tree.
339          */
340         tm_root = &fs_info->tree_mod_log;
341         for (node = rb_first(tm_root); node; node = next) {
342                 next = rb_next(node);
343                 tm = rb_entry(node, struct tree_mod_elem, node);
344                 if (tm->seq >= min_seq)
345                         continue;
346                 rb_erase(node, tm_root);
347                 kfree(tm);
348         }
349         write_unlock(&fs_info->tree_mod_log_lock);
350 }
351
352 /*
353  * key order of the log:
354  *       node/leaf start address -> sequence
355  *
356  * The 'start address' is the logical address of the *new* root node
357  * for root replace operations, or the logical address of the affected
358  * block for all other operations.
359  */
360 static noinline int
361 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
362 {
363         struct rb_root *tm_root;
364         struct rb_node **new;
365         struct rb_node *parent = NULL;
366         struct tree_mod_elem *cur;
367
368         lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
369
370         tm->seq = btrfs_inc_tree_mod_seq(fs_info);
371
372         tm_root = &fs_info->tree_mod_log;
373         new = &tm_root->rb_node;
374         while (*new) {
375                 cur = rb_entry(*new, struct tree_mod_elem, node);
376                 parent = *new;
377                 if (cur->logical < tm->logical)
378                         new = &((*new)->rb_left);
379                 else if (cur->logical > tm->logical)
380                         new = &((*new)->rb_right);
381                 else if (cur->seq < tm->seq)
382                         new = &((*new)->rb_left);
383                 else if (cur->seq > tm->seq)
384                         new = &((*new)->rb_right);
385                 else
386                         return -EEXIST;
387         }
388
389         rb_link_node(&tm->node, parent, new);
390         rb_insert_color(&tm->node, tm_root);
391         return 0;
392 }
393
394 /*
395  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
396  * returns zero with the tree_mod_log_lock acquired. The caller must hold
397  * this until all tree mod log insertions are recorded in the rb tree and then
398  * write unlock fs_info::tree_mod_log_lock.
399  */
400 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
401                                     struct extent_buffer *eb) {
402         smp_mb();
403         if (list_empty(&(fs_info)->tree_mod_seq_list))
404                 return 1;
405         if (eb && btrfs_header_level(eb) == 0)
406                 return 1;
407
408         write_lock(&fs_info->tree_mod_log_lock);
409         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
410                 write_unlock(&fs_info->tree_mod_log_lock);
411                 return 1;
412         }
413
414         return 0;
415 }
416
417 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
418 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
419                                     struct extent_buffer *eb)
420 {
421         smp_mb();
422         if (list_empty(&(fs_info)->tree_mod_seq_list))
423                 return 0;
424         if (eb && btrfs_header_level(eb) == 0)
425                 return 0;
426
427         return 1;
428 }
429
430 static struct tree_mod_elem *
431 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
432                     enum mod_log_op op, gfp_t flags)
433 {
434         struct tree_mod_elem *tm;
435
436         tm = kzalloc(sizeof(*tm), flags);
437         if (!tm)
438                 return NULL;
439
440         tm->logical = eb->start;
441         if (op != MOD_LOG_KEY_ADD) {
442                 btrfs_node_key(eb, &tm->key, slot);
443                 tm->blockptr = btrfs_node_blockptr(eb, slot);
444         }
445         tm->op = op;
446         tm->slot = slot;
447         tm->generation = btrfs_node_ptr_generation(eb, slot);
448         RB_CLEAR_NODE(&tm->node);
449
450         return tm;
451 }
452
453 static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
454                 enum mod_log_op op, gfp_t flags)
455 {
456         struct tree_mod_elem *tm;
457         int ret;
458
459         if (!tree_mod_need_log(eb->fs_info, eb))
460                 return 0;
461
462         tm = alloc_tree_mod_elem(eb, slot, op, flags);
463         if (!tm)
464                 return -ENOMEM;
465
466         if (tree_mod_dont_log(eb->fs_info, eb)) {
467                 kfree(tm);
468                 return 0;
469         }
470
471         ret = __tree_mod_log_insert(eb->fs_info, tm);
472         write_unlock(&eb->fs_info->tree_mod_log_lock);
473         if (ret)
474                 kfree(tm);
475
476         return ret;
477 }
478
479 static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
480                 int dst_slot, int src_slot, int nr_items)
481 {
482         struct tree_mod_elem *tm = NULL;
483         struct tree_mod_elem **tm_list = NULL;
484         int ret = 0;
485         int i;
486         int locked = 0;
487
488         if (!tree_mod_need_log(eb->fs_info, eb))
489                 return 0;
490
491         tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
492         if (!tm_list)
493                 return -ENOMEM;
494
495         tm = kzalloc(sizeof(*tm), GFP_NOFS);
496         if (!tm) {
497                 ret = -ENOMEM;
498                 goto free_tms;
499         }
500
501         tm->logical = eb->start;
502         tm->slot = src_slot;
503         tm->move.dst_slot = dst_slot;
504         tm->move.nr_items = nr_items;
505         tm->op = MOD_LOG_MOVE_KEYS;
506
507         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
508                 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
509                     MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
510                 if (!tm_list[i]) {
511                         ret = -ENOMEM;
512                         goto free_tms;
513                 }
514         }
515
516         if (tree_mod_dont_log(eb->fs_info, eb))
517                 goto free_tms;
518         locked = 1;
519
520         /*
521          * When we override something during the move, we log these removals.
522          * This can only happen when we move towards the beginning of the
523          * buffer, i.e. dst_slot < src_slot.
524          */
525         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
526                 ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
527                 if (ret)
528                         goto free_tms;
529         }
530
531         ret = __tree_mod_log_insert(eb->fs_info, tm);
532         if (ret)
533                 goto free_tms;
534         write_unlock(&eb->fs_info->tree_mod_log_lock);
535         kfree(tm_list);
536
537         return 0;
538 free_tms:
539         for (i = 0; i < nr_items; i++) {
540                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
541                         rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
542                 kfree(tm_list[i]);
543         }
544         if (locked)
545                 write_unlock(&eb->fs_info->tree_mod_log_lock);
546         kfree(tm_list);
547         kfree(tm);
548
549         return ret;
550 }
551
552 static inline int
553 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
554                        struct tree_mod_elem **tm_list,
555                        int nritems)
556 {
557         int i, j;
558         int ret;
559
560         for (i = nritems - 1; i >= 0; i--) {
561                 ret = __tree_mod_log_insert(fs_info, tm_list[i]);
562                 if (ret) {
563                         for (j = nritems - 1; j > i; j--)
564                                 rb_erase(&tm_list[j]->node,
565                                          &fs_info->tree_mod_log);
566                         return ret;
567                 }
568         }
569
570         return 0;
571 }
572
573 static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
574                          struct extent_buffer *new_root, int log_removal)
575 {
576         struct btrfs_fs_info *fs_info = old_root->fs_info;
577         struct tree_mod_elem *tm = NULL;
578         struct tree_mod_elem **tm_list = NULL;
579         int nritems = 0;
580         int ret = 0;
581         int i;
582
583         if (!tree_mod_need_log(fs_info, NULL))
584                 return 0;
585
586         if (log_removal && btrfs_header_level(old_root) > 0) {
587                 nritems = btrfs_header_nritems(old_root);
588                 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
589                                   GFP_NOFS);
590                 if (!tm_list) {
591                         ret = -ENOMEM;
592                         goto free_tms;
593                 }
594                 for (i = 0; i < nritems; i++) {
595                         tm_list[i] = alloc_tree_mod_elem(old_root, i,
596                             MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
597                         if (!tm_list[i]) {
598                                 ret = -ENOMEM;
599                                 goto free_tms;
600                         }
601                 }
602         }
603
604         tm = kzalloc(sizeof(*tm), GFP_NOFS);
605         if (!tm) {
606                 ret = -ENOMEM;
607                 goto free_tms;
608         }
609
610         tm->logical = new_root->start;
611         tm->old_root.logical = old_root->start;
612         tm->old_root.level = btrfs_header_level(old_root);
613         tm->generation = btrfs_header_generation(old_root);
614         tm->op = MOD_LOG_ROOT_REPLACE;
615
616         if (tree_mod_dont_log(fs_info, NULL))
617                 goto free_tms;
618
619         if (tm_list)
620                 ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
621         if (!ret)
622                 ret = __tree_mod_log_insert(fs_info, tm);
623
624         write_unlock(&fs_info->tree_mod_log_lock);
625         if (ret)
626                 goto free_tms;
627         kfree(tm_list);
628
629         return ret;
630
631 free_tms:
632         if (tm_list) {
633                 for (i = 0; i < nritems; i++)
634                         kfree(tm_list[i]);
635                 kfree(tm_list);
636         }
637         kfree(tm);
638
639         return ret;
640 }
641
642 static struct tree_mod_elem *
643 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
644                       int smallest)
645 {
646         struct rb_root *tm_root;
647         struct rb_node *node;
648         struct tree_mod_elem *cur = NULL;
649         struct tree_mod_elem *found = NULL;
650
651         read_lock(&fs_info->tree_mod_log_lock);
652         tm_root = &fs_info->tree_mod_log;
653         node = tm_root->rb_node;
654         while (node) {
655                 cur = rb_entry(node, struct tree_mod_elem, node);
656                 if (cur->logical < start) {
657                         node = node->rb_left;
658                 } else if (cur->logical > start) {
659                         node = node->rb_right;
660                 } else if (cur->seq < min_seq) {
661                         node = node->rb_left;
662                 } else if (!smallest) {
663                         /* we want the node with the highest seq */
664                         if (found)
665                                 BUG_ON(found->seq > cur->seq);
666                         found = cur;
667                         node = node->rb_left;
668                 } else if (cur->seq > min_seq) {
669                         /* we want the node with the smallest seq */
670                         if (found)
671                                 BUG_ON(found->seq < cur->seq);
672                         found = cur;
673                         node = node->rb_right;
674                 } else {
675                         found = cur;
676                         break;
677                 }
678         }
679         read_unlock(&fs_info->tree_mod_log_lock);
680
681         return found;
682 }
683
684 /*
685  * this returns the element from the log with the smallest time sequence
686  * value that's in the log (the oldest log item). any element with a time
687  * sequence lower than min_seq will be ignored.
688  */
689 static struct tree_mod_elem *
690 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
691                            u64 min_seq)
692 {
693         return __tree_mod_log_search(fs_info, start, min_seq, 1);
694 }
695
696 /*
697  * this returns the element from the log with the largest time sequence
698  * value that's in the log (the most recent log item). any element with
699  * a time sequence lower than min_seq will be ignored.
700  */
701 static struct tree_mod_elem *
702 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
703 {
704         return __tree_mod_log_search(fs_info, start, min_seq, 0);
705 }
706
707 static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
708                      struct extent_buffer *src, unsigned long dst_offset,
709                      unsigned long src_offset, int nr_items)
710 {
711         struct btrfs_fs_info *fs_info = dst->fs_info;
712         int ret = 0;
713         struct tree_mod_elem **tm_list = NULL;
714         struct tree_mod_elem **tm_list_add, **tm_list_rem;
715         int i;
716         int locked = 0;
717
718         if (!tree_mod_need_log(fs_info, NULL))
719                 return 0;
720
721         if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
722                 return 0;
723
724         tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
725                           GFP_NOFS);
726         if (!tm_list)
727                 return -ENOMEM;
728
729         tm_list_add = tm_list;
730         tm_list_rem = tm_list + nr_items;
731         for (i = 0; i < nr_items; i++) {
732                 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
733                     MOD_LOG_KEY_REMOVE, GFP_NOFS);
734                 if (!tm_list_rem[i]) {
735                         ret = -ENOMEM;
736                         goto free_tms;
737                 }
738
739                 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
740                     MOD_LOG_KEY_ADD, GFP_NOFS);
741                 if (!tm_list_add[i]) {
742                         ret = -ENOMEM;
743                         goto free_tms;
744                 }
745         }
746
747         if (tree_mod_dont_log(fs_info, NULL))
748                 goto free_tms;
749         locked = 1;
750
751         for (i = 0; i < nr_items; i++) {
752                 ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
753                 if (ret)
754                         goto free_tms;
755                 ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
756                 if (ret)
757                         goto free_tms;
758         }
759
760         write_unlock(&fs_info->tree_mod_log_lock);
761         kfree(tm_list);
762
763         return 0;
764
765 free_tms:
766         for (i = 0; i < nr_items * 2; i++) {
767                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
768                         rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
769                 kfree(tm_list[i]);
770         }
771         if (locked)
772                 write_unlock(&fs_info->tree_mod_log_lock);
773         kfree(tm_list);
774
775         return ret;
776 }
777
778 static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
779 {
780         struct tree_mod_elem **tm_list = NULL;
781         int nritems = 0;
782         int i;
783         int ret = 0;
784
785         if (btrfs_header_level(eb) == 0)
786                 return 0;
787
788         if (!tree_mod_need_log(eb->fs_info, NULL))
789                 return 0;
790
791         nritems = btrfs_header_nritems(eb);
792         tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
793         if (!tm_list)
794                 return -ENOMEM;
795
796         for (i = 0; i < nritems; i++) {
797                 tm_list[i] = alloc_tree_mod_elem(eb, i,
798                     MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
799                 if (!tm_list[i]) {
800                         ret = -ENOMEM;
801                         goto free_tms;
802                 }
803         }
804
805         if (tree_mod_dont_log(eb->fs_info, eb))
806                 goto free_tms;
807
808         ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
809         write_unlock(&eb->fs_info->tree_mod_log_lock);
810         if (ret)
811                 goto free_tms;
812         kfree(tm_list);
813
814         return 0;
815
816 free_tms:
817         for (i = 0; i < nritems; i++)
818                 kfree(tm_list[i]);
819         kfree(tm_list);
820
821         return ret;
822 }
823
824 /*
825  * check if the tree block can be shared by multiple trees
826  */
827 int btrfs_block_can_be_shared(struct btrfs_root *root,
828                               struct extent_buffer *buf)
829 {
830         /*
831          * Tree blocks not in shareable trees and tree roots are never shared.
832          * If a block was allocated after the last snapshot and the block was
833          * not allocated by tree relocation, we know the block is not shared.
834          */
835         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
836             buf != root->node && buf != root->commit_root &&
837             (btrfs_header_generation(buf) <=
838              btrfs_root_last_snapshot(&root->root_item) ||
839              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
840                 return 1;
841
842         return 0;
843 }
844
845 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
846                                        struct btrfs_root *root,
847                                        struct extent_buffer *buf,
848                                        struct extent_buffer *cow,
849                                        int *last_ref)
850 {
851         struct btrfs_fs_info *fs_info = root->fs_info;
852         u64 refs;
853         u64 owner;
854         u64 flags;
855         u64 new_flags = 0;
856         int ret;
857
858         /*
859          * Backrefs update rules:
860          *
861          * Always use full backrefs for extent pointers in tree block
862          * allocated by tree relocation.
863          *
864          * If a shared tree block is no longer referenced by its owner
865          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
866          * use full backrefs for extent pointers in tree block.
867          *
868          * If a tree block is been relocating
869          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
870          * use full backrefs for extent pointers in tree block.
871          * The reason for this is some operations (such as drop tree)
872          * are only allowed for blocks use full backrefs.
873          */
874
875         if (btrfs_block_can_be_shared(root, buf)) {
876                 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
877                                                btrfs_header_level(buf), 1,
878                                                &refs, &flags);
879                 if (ret)
880                         return ret;
881                 if (refs == 0) {
882                         ret = -EROFS;
883                         btrfs_handle_fs_error(fs_info, ret, NULL);
884                         return ret;
885                 }
886         } else {
887                 refs = 1;
888                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
889                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
890                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
891                 else
892                         flags = 0;
893         }
894
895         owner = btrfs_header_owner(buf);
896         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
897                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
898
899         if (refs > 1) {
900                 if ((owner == root->root_key.objectid ||
901                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
902                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
903                         ret = btrfs_inc_ref(trans, root, buf, 1);
904                         if (ret)
905                                 return ret;
906
907                         if (root->root_key.objectid ==
908                             BTRFS_TREE_RELOC_OBJECTID) {
909                                 ret = btrfs_dec_ref(trans, root, buf, 0);
910                                 if (ret)
911                                         return ret;
912                                 ret = btrfs_inc_ref(trans, root, cow, 1);
913                                 if (ret)
914                                         return ret;
915                         }
916                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
917                 } else {
918
919                         if (root->root_key.objectid ==
920                             BTRFS_TREE_RELOC_OBJECTID)
921                                 ret = btrfs_inc_ref(trans, root, cow, 1);
922                         else
923                                 ret = btrfs_inc_ref(trans, root, cow, 0);
924                         if (ret)
925                                 return ret;
926                 }
927                 if (new_flags != 0) {
928                         int level = btrfs_header_level(buf);
929
930                         ret = btrfs_set_disk_extent_flags(trans, buf,
931                                                           new_flags, level, 0);
932                         if (ret)
933                                 return ret;
934                 }
935         } else {
936                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
937                         if (root->root_key.objectid ==
938                             BTRFS_TREE_RELOC_OBJECTID)
939                                 ret = btrfs_inc_ref(trans, root, cow, 1);
940                         else
941                                 ret = btrfs_inc_ref(trans, root, cow, 0);
942                         if (ret)
943                                 return ret;
944                         ret = btrfs_dec_ref(trans, root, buf, 1);
945                         if (ret)
946                                 return ret;
947                 }
948                 btrfs_clean_tree_block(buf);
949                 *last_ref = 1;
950         }
951         return 0;
952 }
953
954 static struct extent_buffer *alloc_tree_block_no_bg_flush(
955                                           struct btrfs_trans_handle *trans,
956                                           struct btrfs_root *root,
957                                           u64 parent_start,
958                                           const struct btrfs_disk_key *disk_key,
959                                           int level,
960                                           u64 hint,
961                                           u64 empty_size,
962                                           enum btrfs_lock_nesting nest)
963 {
964         struct btrfs_fs_info *fs_info = root->fs_info;
965         struct extent_buffer *ret;
966
967         /*
968          * If we are COWing a node/leaf from the extent, chunk, device or free
969          * space trees, make sure that we do not finish block group creation of
970          * pending block groups. We do this to avoid a deadlock.
971          * COWing can result in allocation of a new chunk, and flushing pending
972          * block groups (btrfs_create_pending_block_groups()) can be triggered
973          * when finishing allocation of a new chunk. Creation of a pending block
974          * group modifies the extent, chunk, device and free space trees,
975          * therefore we could deadlock with ourselves since we are holding a
976          * lock on an extent buffer that btrfs_create_pending_block_groups() may
977          * try to COW later.
978          * For similar reasons, we also need to delay flushing pending block
979          * groups when splitting a leaf or node, from one of those trees, since
980          * we are holding a write lock on it and its parent or when inserting a
981          * new root node for one of those trees.
982          */
983         if (root == fs_info->extent_root ||
984             root == fs_info->chunk_root ||
985             root == fs_info->dev_root ||
986             root == fs_info->free_space_root)
987                 trans->can_flush_pending_bgs = false;
988
989         ret = btrfs_alloc_tree_block(trans, root, parent_start,
990                                      root->root_key.objectid, disk_key, level,
991                                      hint, empty_size, nest);
992         trans->can_flush_pending_bgs = true;
993
994         return ret;
995 }
996
997 /*
998  * does the dirty work in cow of a single block.  The parent block (if
999  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1000  * dirty and returned locked.  If you modify the block it needs to be marked
1001  * dirty again.
1002  *
1003  * search_start -- an allocation hint for the new block
1004  *
1005  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1006  * bytes the allocator should try to find free next to the block it returns.
1007  * This is just a hint and may be ignored by the allocator.
1008  */
1009 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1010                              struct btrfs_root *root,
1011                              struct extent_buffer *buf,
1012                              struct extent_buffer *parent, int parent_slot,
1013                              struct extent_buffer **cow_ret,
1014                              u64 search_start, u64 empty_size,
1015                              enum btrfs_lock_nesting nest)
1016 {
1017         struct btrfs_fs_info *fs_info = root->fs_info;
1018         struct btrfs_disk_key disk_key;
1019         struct extent_buffer *cow;
1020         int level, ret;
1021         int last_ref = 0;
1022         int unlock_orig = 0;
1023         u64 parent_start = 0;
1024
1025         if (*cow_ret == buf)
1026                 unlock_orig = 1;
1027
1028         btrfs_assert_tree_locked(buf);
1029
1030         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1031                 trans->transid != fs_info->running_transaction->transid);
1032         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1033                 trans->transid != root->last_trans);
1034
1035         level = btrfs_header_level(buf);
1036
1037         if (level == 0)
1038                 btrfs_item_key(buf, &disk_key, 0);
1039         else
1040                 btrfs_node_key(buf, &disk_key, 0);
1041
1042         if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1043                 parent_start = parent->start;
1044
1045         cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
1046                                            level, search_start, empty_size, nest);
1047         if (IS_ERR(cow))
1048                 return PTR_ERR(cow);
1049
1050         /* cow is set to blocking by btrfs_init_new_buffer */
1051
1052         copy_extent_buffer_full(cow, buf);
1053         btrfs_set_header_bytenr(cow, cow->start);
1054         btrfs_set_header_generation(cow, trans->transid);
1055         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1056         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1057                                      BTRFS_HEADER_FLAG_RELOC);
1058         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1059                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1060         else
1061                 btrfs_set_header_owner(cow, root->root_key.objectid);
1062
1063         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
1064
1065         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1066         if (ret) {
1067                 btrfs_abort_transaction(trans, ret);
1068                 return ret;
1069         }
1070
1071         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
1072                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1073                 if (ret) {
1074                         btrfs_abort_transaction(trans, ret);
1075                         return ret;
1076                 }
1077         }
1078
1079         if (buf == root->node) {
1080                 WARN_ON(parent && parent != buf);
1081                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1082                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1083                         parent_start = buf->start;
1084
1085                 atomic_inc(&cow->refs);
1086                 ret = tree_mod_log_insert_root(root->node, cow, 1);
1087                 BUG_ON(ret < 0);
1088                 rcu_assign_pointer(root->node, cow);
1089
1090                 btrfs_free_tree_block(trans, root, buf, parent_start,
1091                                       last_ref);
1092                 free_extent_buffer(buf);
1093                 add_root_to_dirty_list(root);
1094         } else {
1095                 WARN_ON(trans->transid != btrfs_header_generation(parent));
1096                 tree_mod_log_insert_key(parent, parent_slot,
1097                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1098                 btrfs_set_node_blockptr(parent, parent_slot,
1099                                         cow->start);
1100                 btrfs_set_node_ptr_generation(parent, parent_slot,
1101                                               trans->transid);
1102                 btrfs_mark_buffer_dirty(parent);
1103                 if (last_ref) {
1104                         ret = tree_mod_log_free_eb(buf);
1105                         if (ret) {
1106                                 btrfs_abort_transaction(trans, ret);
1107                                 return ret;
1108                         }
1109                 }
1110                 btrfs_free_tree_block(trans, root, buf, parent_start,
1111                                       last_ref);
1112         }
1113         if (unlock_orig)
1114                 btrfs_tree_unlock(buf);
1115         free_extent_buffer_stale(buf);
1116         btrfs_mark_buffer_dirty(cow);
1117         *cow_ret = cow;
1118         return 0;
1119 }
1120
1121 /*
1122  * returns the logical address of the oldest predecessor of the given root.
1123  * entries older than time_seq are ignored.
1124  */
1125 static struct tree_mod_elem *__tree_mod_log_oldest_root(
1126                 struct extent_buffer *eb_root, u64 time_seq)
1127 {
1128         struct tree_mod_elem *tm;
1129         struct tree_mod_elem *found = NULL;
1130         u64 root_logical = eb_root->start;
1131         int looped = 0;
1132
1133         if (!time_seq)
1134                 return NULL;
1135
1136         /*
1137          * the very last operation that's logged for a root is the
1138          * replacement operation (if it is replaced at all). this has
1139          * the logical address of the *new* root, making it the very
1140          * first operation that's logged for this root.
1141          */
1142         while (1) {
1143                 tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
1144                                                 time_seq);
1145                 if (!looped && !tm)
1146                         return NULL;
1147                 /*
1148                  * if there are no tree operation for the oldest root, we simply
1149                  * return it. this should only happen if that (old) root is at
1150                  * level 0.
1151                  */
1152                 if (!tm)
1153                         break;
1154
1155                 /*
1156                  * if there's an operation that's not a root replacement, we
1157                  * found the oldest version of our root. normally, we'll find a
1158                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1159                  */
1160                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1161                         break;
1162
1163                 found = tm;
1164                 root_logical = tm->old_root.logical;
1165                 looped = 1;
1166         }
1167
1168         /* if there's no old root to return, return what we found instead */
1169         if (!found)
1170                 found = tm;
1171
1172         return found;
1173 }
1174
1175 /*
1176  * tm is a pointer to the first operation to rewind within eb. then, all
1177  * previous operations will be rewound (until we reach something older than
1178  * time_seq).
1179  */
1180 static void
1181 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1182                       u64 time_seq, struct tree_mod_elem *first_tm)
1183 {
1184         u32 n;
1185         struct rb_node *next;
1186         struct tree_mod_elem *tm = first_tm;
1187         unsigned long o_dst;
1188         unsigned long o_src;
1189         unsigned long p_size = sizeof(struct btrfs_key_ptr);
1190
1191         n = btrfs_header_nritems(eb);
1192         read_lock(&fs_info->tree_mod_log_lock);
1193         while (tm && tm->seq >= time_seq) {
1194                 /*
1195                  * all the operations are recorded with the operator used for
1196                  * the modification. as we're going backwards, we do the
1197                  * opposite of each operation here.
1198                  */
1199                 switch (tm->op) {
1200                 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1201                         BUG_ON(tm->slot < n);
1202                         fallthrough;
1203                 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1204                 case MOD_LOG_KEY_REMOVE:
1205                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1206                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1207                         btrfs_set_node_ptr_generation(eb, tm->slot,
1208                                                       tm->generation);
1209                         n++;
1210                         break;
1211                 case MOD_LOG_KEY_REPLACE:
1212                         BUG_ON(tm->slot >= n);
1213                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1214                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1215                         btrfs_set_node_ptr_generation(eb, tm->slot,
1216                                                       tm->generation);
1217                         break;
1218                 case MOD_LOG_KEY_ADD:
1219                         /* if a move operation is needed it's in the log */
1220                         n--;
1221                         break;
1222                 case MOD_LOG_MOVE_KEYS:
1223                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
1224                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1225                         memmove_extent_buffer(eb, o_dst, o_src,
1226                                               tm->move.nr_items * p_size);
1227                         break;
1228                 case MOD_LOG_ROOT_REPLACE:
1229                         /*
1230                          * this operation is special. for roots, this must be
1231                          * handled explicitly before rewinding.
1232                          * for non-roots, this operation may exist if the node
1233                          * was a root: root A -> child B; then A gets empty and
1234                          * B is promoted to the new root. in the mod log, we'll
1235                          * have a root-replace operation for B, a tree block
1236                          * that is no root. we simply ignore that operation.
1237                          */
1238                         break;
1239                 }
1240                 next = rb_next(&tm->node);
1241                 if (!next)
1242                         break;
1243                 tm = rb_entry(next, struct tree_mod_elem, node);
1244                 if (tm->logical != first_tm->logical)
1245                         break;
1246         }
1247         read_unlock(&fs_info->tree_mod_log_lock);
1248         btrfs_set_header_nritems(eb, n);
1249 }
1250
1251 /*
1252  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1253  * is returned. If rewind operations happen, a fresh buffer is returned. The
1254  * returned buffer is always read-locked. If the returned buffer is not the
1255  * input buffer, the lock on the input buffer is released and the input buffer
1256  * is freed (its refcount is decremented).
1257  */
1258 static struct extent_buffer *
1259 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1260                     struct extent_buffer *eb, u64 time_seq)
1261 {
1262         struct extent_buffer *eb_rewin;
1263         struct tree_mod_elem *tm;
1264
1265         if (!time_seq)
1266                 return eb;
1267
1268         if (btrfs_header_level(eb) == 0)
1269                 return eb;
1270
1271         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1272         if (!tm)
1273                 return eb;
1274
1275         btrfs_set_path_blocking(path);
1276         btrfs_set_lock_blocking_read(eb);
1277
1278         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1279                 BUG_ON(tm->slot != 0);
1280                 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1281                 if (!eb_rewin) {
1282                         btrfs_tree_read_unlock_blocking(eb);
1283                         free_extent_buffer(eb);
1284                         return NULL;
1285                 }
1286                 btrfs_set_header_bytenr(eb_rewin, eb->start);
1287                 btrfs_set_header_backref_rev(eb_rewin,
1288                                              btrfs_header_backref_rev(eb));
1289                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1290                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1291         } else {
1292                 eb_rewin = btrfs_clone_extent_buffer(eb);
1293                 if (!eb_rewin) {
1294                         btrfs_tree_read_unlock_blocking(eb);
1295                         free_extent_buffer(eb);
1296                         return NULL;
1297                 }
1298         }
1299
1300         btrfs_tree_read_unlock_blocking(eb);
1301         free_extent_buffer(eb);
1302
1303         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
1304                                        eb_rewin, btrfs_header_level(eb_rewin));
1305         btrfs_tree_read_lock(eb_rewin);
1306         __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1307         WARN_ON(btrfs_header_nritems(eb_rewin) >
1308                 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1309
1310         return eb_rewin;
1311 }
1312
1313 /*
1314  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1315  * value. If there are no changes, the current root->root_node is returned. If
1316  * anything changed in between, there's a fresh buffer allocated on which the
1317  * rewind operations are done. In any case, the returned buffer is read locked.
1318  * Returns NULL on error (with no locks held).
1319  */
1320 static inline struct extent_buffer *
1321 get_old_root(struct btrfs_root *root, u64 time_seq)
1322 {
1323         struct btrfs_fs_info *fs_info = root->fs_info;
1324         struct tree_mod_elem *tm;
1325         struct extent_buffer *eb = NULL;
1326         struct extent_buffer *eb_root;
1327         u64 eb_root_owner = 0;
1328         struct extent_buffer *old;
1329         struct tree_mod_root *old_root = NULL;
1330         u64 old_generation = 0;
1331         u64 logical;
1332         int level;
1333
1334         eb_root = btrfs_read_lock_root_node(root);
1335         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1336         if (!tm)
1337                 return eb_root;
1338
1339         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1340                 old_root = &tm->old_root;
1341                 old_generation = tm->generation;
1342                 logical = old_root->logical;
1343                 level = old_root->level;
1344         } else {
1345                 logical = eb_root->start;
1346                 level = btrfs_header_level(eb_root);
1347         }
1348
1349         tm = tree_mod_log_search(fs_info, logical, time_seq);
1350         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1351                 btrfs_tree_read_unlock(eb_root);
1352                 free_extent_buffer(eb_root);
1353                 old = read_tree_block(fs_info, logical, 0, level, NULL);
1354                 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1355                         if (!IS_ERR(old))
1356                                 free_extent_buffer(old);
1357                         btrfs_warn(fs_info,
1358                                    "failed to read tree block %llu from get_old_root",
1359                                    logical);
1360                 } else {
1361                         eb = btrfs_clone_extent_buffer(old);
1362                         free_extent_buffer(old);
1363                 }
1364         } else if (old_root) {
1365                 eb_root_owner = btrfs_header_owner(eb_root);
1366                 btrfs_tree_read_unlock(eb_root);
1367                 free_extent_buffer(eb_root);
1368                 eb = alloc_dummy_extent_buffer(fs_info, logical);
1369         } else {
1370                 btrfs_set_lock_blocking_read(eb_root);
1371                 eb = btrfs_clone_extent_buffer(eb_root);
1372                 btrfs_tree_read_unlock_blocking(eb_root);
1373                 free_extent_buffer(eb_root);
1374         }
1375
1376         if (!eb)
1377                 return NULL;
1378         if (old_root) {
1379                 btrfs_set_header_bytenr(eb, eb->start);
1380                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1381                 btrfs_set_header_owner(eb, eb_root_owner);
1382                 btrfs_set_header_level(eb, old_root->level);
1383                 btrfs_set_header_generation(eb, old_generation);
1384         }
1385         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
1386                                        btrfs_header_level(eb));
1387         btrfs_tree_read_lock(eb);
1388         if (tm)
1389                 __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1390         else
1391                 WARN_ON(btrfs_header_level(eb) != 0);
1392         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1393
1394         return eb;
1395 }
1396
1397 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1398 {
1399         struct tree_mod_elem *tm;
1400         int level;
1401         struct extent_buffer *eb_root = btrfs_root_node(root);
1402
1403         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1404         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1405                 level = tm->old_root.level;
1406         } else {
1407                 level = btrfs_header_level(eb_root);
1408         }
1409         free_extent_buffer(eb_root);
1410
1411         return level;
1412 }
1413
1414 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1415                                    struct btrfs_root *root,
1416                                    struct extent_buffer *buf)
1417 {
1418         if (btrfs_is_testing(root->fs_info))
1419                 return 0;
1420
1421         /* Ensure we can see the FORCE_COW bit */
1422         smp_mb__before_atomic();
1423
1424         /*
1425          * We do not need to cow a block if
1426          * 1) this block is not created or changed in this transaction;
1427          * 2) this block does not belong to TREE_RELOC tree;
1428          * 3) the root is not forced COW.
1429          *
1430          * What is forced COW:
1431          *    when we create snapshot during committing the transaction,
1432          *    after we've finished copying src root, we must COW the shared
1433          *    block to ensure the metadata consistency.
1434          */
1435         if (btrfs_header_generation(buf) == trans->transid &&
1436             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1437             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1438               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1439             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1440                 return 0;
1441         return 1;
1442 }
1443
1444 /*
1445  * cows a single block, see __btrfs_cow_block for the real work.
1446  * This version of it has extra checks so that a block isn't COWed more than
1447  * once per transaction, as long as it hasn't been written yet
1448  */
1449 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1450                     struct btrfs_root *root, struct extent_buffer *buf,
1451                     struct extent_buffer *parent, int parent_slot,
1452                     struct extent_buffer **cow_ret,
1453                     enum btrfs_lock_nesting nest)
1454 {
1455         struct btrfs_fs_info *fs_info = root->fs_info;
1456         u64 search_start;
1457         int ret;
1458
1459         if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1460                 btrfs_err(fs_info,
1461                         "COW'ing blocks on a fs root that's being dropped");
1462
1463         if (trans->transaction != fs_info->running_transaction)
1464                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1465                        trans->transid,
1466                        fs_info->running_transaction->transid);
1467
1468         if (trans->transid != fs_info->generation)
1469                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1470                        trans->transid, fs_info->generation);
1471
1472         if (!should_cow_block(trans, root, buf)) {
1473                 trans->dirty = true;
1474                 *cow_ret = buf;
1475                 return 0;
1476         }
1477
1478         search_start = buf->start & ~((u64)SZ_1G - 1);
1479
1480         if (parent)
1481                 btrfs_set_lock_blocking_write(parent);
1482         btrfs_set_lock_blocking_write(buf);
1483
1484         /*
1485          * Before CoWing this block for later modification, check if it's
1486          * the subtree root and do the delayed subtree trace if needed.
1487          *
1488          * Also We don't care about the error, as it's handled internally.
1489          */
1490         btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1491         ret = __btrfs_cow_block(trans, root, buf, parent,
1492                                  parent_slot, cow_ret, search_start, 0, nest);
1493
1494         trace_btrfs_cow_block(root, buf, *cow_ret);
1495
1496         return ret;
1497 }
1498
1499 /*
1500  * helper function for defrag to decide if two blocks pointed to by a
1501  * node are actually close by
1502  */
1503 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1504 {
1505         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1506                 return 1;
1507         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1508                 return 1;
1509         return 0;
1510 }
1511
1512 #ifdef __LITTLE_ENDIAN
1513
1514 /*
1515  * Compare two keys, on little-endian the disk order is same as CPU order and
1516  * we can avoid the conversion.
1517  */
1518 static int comp_keys(const struct btrfs_disk_key *disk_key,
1519                      const struct btrfs_key *k2)
1520 {
1521         const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
1522
1523         return btrfs_comp_cpu_keys(k1, k2);
1524 }
1525
1526 #else
1527
1528 /*
1529  * compare two keys in a memcmp fashion
1530  */
1531 static int comp_keys(const struct btrfs_disk_key *disk,
1532                      const struct btrfs_key *k2)
1533 {
1534         struct btrfs_key k1;
1535
1536         btrfs_disk_key_to_cpu(&k1, disk);
1537
1538         return btrfs_comp_cpu_keys(&k1, k2);
1539 }
1540 #endif
1541
1542 /*
1543  * same as comp_keys only with two btrfs_key's
1544  */
1545 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1546 {
1547         if (k1->objectid > k2->objectid)
1548                 return 1;
1549         if (k1->objectid < k2->objectid)
1550                 return -1;
1551         if (k1->type > k2->type)
1552                 return 1;
1553         if (k1->type < k2->type)
1554                 return -1;
1555         if (k1->offset > k2->offset)
1556                 return 1;
1557         if (k1->offset < k2->offset)
1558                 return -1;
1559         return 0;
1560 }
1561
1562 /*
1563  * this is used by the defrag code to go through all the
1564  * leaves pointed to by a node and reallocate them so that
1565  * disk order is close to key order
1566  */
1567 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1568                        struct btrfs_root *root, struct extent_buffer *parent,
1569                        int start_slot, u64 *last_ret,
1570                        struct btrfs_key *progress)
1571 {
1572         struct btrfs_fs_info *fs_info = root->fs_info;
1573         struct extent_buffer *cur;
1574         u64 blocknr;
1575         u64 gen;
1576         u64 search_start = *last_ret;
1577         u64 last_block = 0;
1578         u64 other;
1579         u32 parent_nritems;
1580         int end_slot;
1581         int i;
1582         int err = 0;
1583         int parent_level;
1584         int uptodate;
1585         u32 blocksize;
1586         int progress_passed = 0;
1587         struct btrfs_disk_key disk_key;
1588
1589         parent_level = btrfs_header_level(parent);
1590
1591         WARN_ON(trans->transaction != fs_info->running_transaction);
1592         WARN_ON(trans->transid != fs_info->generation);
1593
1594         parent_nritems = btrfs_header_nritems(parent);
1595         blocksize = fs_info->nodesize;
1596         end_slot = parent_nritems - 1;
1597
1598         if (parent_nritems <= 1)
1599                 return 0;
1600
1601         btrfs_set_lock_blocking_write(parent);
1602
1603         for (i = start_slot; i <= end_slot; i++) {
1604                 struct btrfs_key first_key;
1605                 int close = 1;
1606
1607                 btrfs_node_key(parent, &disk_key, i);
1608                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1609                         continue;
1610
1611                 progress_passed = 1;
1612                 blocknr = btrfs_node_blockptr(parent, i);
1613                 gen = btrfs_node_ptr_generation(parent, i);
1614                 btrfs_node_key_to_cpu(parent, &first_key, i);
1615                 if (last_block == 0)
1616                         last_block = blocknr;
1617
1618                 if (i > 0) {
1619                         other = btrfs_node_blockptr(parent, i - 1);
1620                         close = close_blocks(blocknr, other, blocksize);
1621                 }
1622                 if (!close && i < end_slot) {
1623                         other = btrfs_node_blockptr(parent, i + 1);
1624                         close = close_blocks(blocknr, other, blocksize);
1625                 }
1626                 if (close) {
1627                         last_block = blocknr;
1628                         continue;
1629                 }
1630
1631                 cur = find_extent_buffer(fs_info, blocknr);
1632                 if (cur)
1633                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1634                 else
1635                         uptodate = 0;
1636                 if (!cur || !uptodate) {
1637                         if (!cur) {
1638                                 cur = read_tree_block(fs_info, blocknr, gen,
1639                                                       parent_level - 1,
1640                                                       &first_key);
1641                                 if (IS_ERR(cur)) {
1642                                         return PTR_ERR(cur);
1643                                 } else if (!extent_buffer_uptodate(cur)) {
1644                                         free_extent_buffer(cur);
1645                                         return -EIO;
1646                                 }
1647                         } else if (!uptodate) {
1648                                 err = btrfs_read_buffer(cur, gen,
1649                                                 parent_level - 1,&first_key);
1650                                 if (err) {
1651                                         free_extent_buffer(cur);
1652                                         return err;
1653                                 }
1654                         }
1655                 }
1656                 if (search_start == 0)
1657                         search_start = last_block;
1658
1659                 btrfs_tree_lock(cur);
1660                 btrfs_set_lock_blocking_write(cur);
1661                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1662                                         &cur, search_start,
1663                                         min(16 * blocksize,
1664                                             (end_slot - i) * blocksize),
1665                                         BTRFS_NESTING_COW);
1666                 if (err) {
1667                         btrfs_tree_unlock(cur);
1668                         free_extent_buffer(cur);
1669                         break;
1670                 }
1671                 search_start = cur->start;
1672                 last_block = cur->start;
1673                 *last_ret = search_start;
1674                 btrfs_tree_unlock(cur);
1675                 free_extent_buffer(cur);
1676         }
1677         return err;
1678 }
1679
1680 /*
1681  * search for key in the extent_buffer.  The items start at offset p,
1682  * and they are item_size apart.  There are 'max' items in p.
1683  *
1684  * the slot in the array is returned via slot, and it points to
1685  * the place where you would insert key if it is not found in
1686  * the array.
1687  *
1688  * slot may point to max if the key is bigger than all of the keys
1689  */
1690 static noinline int generic_bin_search(struct extent_buffer *eb,
1691                                        unsigned long p, int item_size,
1692                                        const struct btrfs_key *key,
1693                                        int max, int *slot)
1694 {
1695         int low = 0;
1696         int high = max;
1697         int ret;
1698         const int key_size = sizeof(struct btrfs_disk_key);
1699
1700         if (low > high) {
1701                 btrfs_err(eb->fs_info,
1702                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1703                           __func__, low, high, eb->start,
1704                           btrfs_header_owner(eb), btrfs_header_level(eb));
1705                 return -EINVAL;
1706         }
1707
1708         while (low < high) {
1709                 unsigned long oip;
1710                 unsigned long offset;
1711                 struct btrfs_disk_key *tmp;
1712                 struct btrfs_disk_key unaligned;
1713                 int mid;
1714
1715                 mid = (low + high) / 2;
1716                 offset = p + mid * item_size;
1717                 oip = offset_in_page(offset);
1718
1719                 if (oip + key_size <= PAGE_SIZE) {
1720                         const unsigned long idx = offset >> PAGE_SHIFT;
1721                         char *kaddr = page_address(eb->pages[idx]);
1722
1723                         tmp = (struct btrfs_disk_key *)(kaddr + oip);
1724                 } else {
1725                         read_extent_buffer(eb, &unaligned, offset, key_size);
1726                         tmp = &unaligned;
1727                 }
1728
1729                 ret = comp_keys(tmp, key);
1730
1731                 if (ret < 0)
1732                         low = mid + 1;
1733                 else if (ret > 0)
1734                         high = mid;
1735                 else {
1736                         *slot = mid;
1737                         return 0;
1738                 }
1739         }
1740         *slot = low;
1741         return 1;
1742 }
1743
1744 /*
1745  * simple bin_search frontend that does the right thing for
1746  * leaves vs nodes
1747  */
1748 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1749                      int *slot)
1750 {
1751         if (btrfs_header_level(eb) == 0)
1752                 return generic_bin_search(eb,
1753                                           offsetof(struct btrfs_leaf, items),
1754                                           sizeof(struct btrfs_item),
1755                                           key, btrfs_header_nritems(eb),
1756                                           slot);
1757         else
1758                 return generic_bin_search(eb,
1759                                           offsetof(struct btrfs_node, ptrs),
1760                                           sizeof(struct btrfs_key_ptr),
1761                                           key, btrfs_header_nritems(eb),
1762                                           slot);
1763 }
1764
1765 static void root_add_used(struct btrfs_root *root, u32 size)
1766 {
1767         spin_lock(&root->accounting_lock);
1768         btrfs_set_root_used(&root->root_item,
1769                             btrfs_root_used(&root->root_item) + size);
1770         spin_unlock(&root->accounting_lock);
1771 }
1772
1773 static void root_sub_used(struct btrfs_root *root, u32 size)
1774 {
1775         spin_lock(&root->accounting_lock);
1776         btrfs_set_root_used(&root->root_item,
1777                             btrfs_root_used(&root->root_item) - size);
1778         spin_unlock(&root->accounting_lock);
1779 }
1780
1781 /* given a node and slot number, this reads the blocks it points to.  The
1782  * extent buffer is returned with a reference taken (but unlocked).
1783  */
1784 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1785                                            int slot)
1786 {
1787         int level = btrfs_header_level(parent);
1788         struct extent_buffer *eb;
1789         struct btrfs_key first_key;
1790
1791         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1792                 return ERR_PTR(-ENOENT);
1793
1794         BUG_ON(level == 0);
1795
1796         btrfs_node_key_to_cpu(parent, &first_key, slot);
1797         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1798                              btrfs_node_ptr_generation(parent, slot),
1799                              level - 1, &first_key);
1800         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1801                 free_extent_buffer(eb);
1802                 eb = ERR_PTR(-EIO);
1803         }
1804
1805         return eb;
1806 }
1807
1808 /*
1809  * node level balancing, used to make sure nodes are in proper order for
1810  * item deletion.  We balance from the top down, so we have to make sure
1811  * that a deletion won't leave an node completely empty later on.
1812  */
1813 static noinline int balance_level(struct btrfs_trans_handle *trans,
1814                          struct btrfs_root *root,
1815                          struct btrfs_path *path, int level)
1816 {
1817         struct btrfs_fs_info *fs_info = root->fs_info;
1818         struct extent_buffer *right = NULL;
1819         struct extent_buffer *mid;
1820         struct extent_buffer *left = NULL;
1821         struct extent_buffer *parent = NULL;
1822         int ret = 0;
1823         int wret;
1824         int pslot;
1825         int orig_slot = path->slots[level];
1826         u64 orig_ptr;
1827
1828         ASSERT(level > 0);
1829
1830         mid = path->nodes[level];
1831
1832         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1833                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1834         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1835
1836         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1837
1838         if (level < BTRFS_MAX_LEVEL - 1) {
1839                 parent = path->nodes[level + 1];
1840                 pslot = path->slots[level + 1];
1841         }
1842
1843         /*
1844          * deal with the case where there is only one pointer in the root
1845          * by promoting the node below to a root
1846          */
1847         if (!parent) {
1848                 struct extent_buffer *child;
1849
1850                 if (btrfs_header_nritems(mid) != 1)
1851                         return 0;
1852
1853                 /* promote the child to a root */
1854                 child = btrfs_read_node_slot(mid, 0);
1855                 if (IS_ERR(child)) {
1856                         ret = PTR_ERR(child);
1857                         btrfs_handle_fs_error(fs_info, ret, NULL);
1858                         goto enospc;
1859                 }
1860
1861                 btrfs_tree_lock(child);
1862                 btrfs_set_lock_blocking_write(child);
1863                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1864                                       BTRFS_NESTING_COW);
1865                 if (ret) {
1866                         btrfs_tree_unlock(child);
1867                         free_extent_buffer(child);
1868                         goto enospc;
1869                 }
1870
1871                 ret = tree_mod_log_insert_root(root->node, child, 1);
1872                 BUG_ON(ret < 0);
1873                 rcu_assign_pointer(root->node, child);
1874
1875                 add_root_to_dirty_list(root);
1876                 btrfs_tree_unlock(child);
1877
1878                 path->locks[level] = 0;
1879                 path->nodes[level] = NULL;
1880                 btrfs_clean_tree_block(mid);
1881                 btrfs_tree_unlock(mid);
1882                 /* once for the path */
1883                 free_extent_buffer(mid);
1884
1885                 root_sub_used(root, mid->len);
1886                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1887                 /* once for the root ptr */
1888                 free_extent_buffer_stale(mid);
1889                 return 0;
1890         }
1891         if (btrfs_header_nritems(mid) >
1892             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1893                 return 0;
1894
1895         left = btrfs_read_node_slot(parent, pslot - 1);
1896         if (IS_ERR(left))
1897                 left = NULL;
1898
1899         if (left) {
1900                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1901                 btrfs_set_lock_blocking_write(left);
1902                 wret = btrfs_cow_block(trans, root, left,
1903                                        parent, pslot - 1, &left,
1904                                        BTRFS_NESTING_LEFT_COW);
1905                 if (wret) {
1906                         ret = wret;
1907                         goto enospc;
1908                 }
1909         }
1910
1911         right = btrfs_read_node_slot(parent, pslot + 1);
1912         if (IS_ERR(right))
1913                 right = NULL;
1914
1915         if (right) {
1916                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1917                 btrfs_set_lock_blocking_write(right);
1918                 wret = btrfs_cow_block(trans, root, right,
1919                                        parent, pslot + 1, &right,
1920                                        BTRFS_NESTING_RIGHT_COW);
1921                 if (wret) {
1922                         ret = wret;
1923                         goto enospc;
1924                 }
1925         }
1926
1927         /* first, try to make some room in the middle buffer */
1928         if (left) {
1929                 orig_slot += btrfs_header_nritems(left);
1930                 wret = push_node_left(trans, left, mid, 1);
1931                 if (wret < 0)
1932                         ret = wret;
1933         }
1934
1935         /*
1936          * then try to empty the right most buffer into the middle
1937          */
1938         if (right) {
1939                 wret = push_node_left(trans, mid, right, 1);
1940                 if (wret < 0 && wret != -ENOSPC)
1941                         ret = wret;
1942                 if (btrfs_header_nritems(right) == 0) {
1943                         btrfs_clean_tree_block(right);
1944                         btrfs_tree_unlock(right);
1945                         del_ptr(root, path, level + 1, pslot + 1);
1946                         root_sub_used(root, right->len);
1947                         btrfs_free_tree_block(trans, root, right, 0, 1);
1948                         free_extent_buffer_stale(right);
1949                         right = NULL;
1950                 } else {
1951                         struct btrfs_disk_key right_key;
1952                         btrfs_node_key(right, &right_key, 0);
1953                         ret = tree_mod_log_insert_key(parent, pslot + 1,
1954                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1955                         BUG_ON(ret < 0);
1956                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1957                         btrfs_mark_buffer_dirty(parent);
1958                 }
1959         }
1960         if (btrfs_header_nritems(mid) == 1) {
1961                 /*
1962                  * we're not allowed to leave a node with one item in the
1963                  * tree during a delete.  A deletion from lower in the tree
1964                  * could try to delete the only pointer in this node.
1965                  * So, pull some keys from the left.
1966                  * There has to be a left pointer at this point because
1967                  * otherwise we would have pulled some pointers from the
1968                  * right
1969                  */
1970                 if (!left) {
1971                         ret = -EROFS;
1972                         btrfs_handle_fs_error(fs_info, ret, NULL);
1973                         goto enospc;
1974                 }
1975                 wret = balance_node_right(trans, mid, left);
1976                 if (wret < 0) {
1977                         ret = wret;
1978                         goto enospc;
1979                 }
1980                 if (wret == 1) {
1981                         wret = push_node_left(trans, left, mid, 1);
1982                         if (wret < 0)
1983                                 ret = wret;
1984                 }
1985                 BUG_ON(wret == 1);
1986         }
1987         if (btrfs_header_nritems(mid) == 0) {
1988                 btrfs_clean_tree_block(mid);
1989                 btrfs_tree_unlock(mid);
1990                 del_ptr(root, path, level + 1, pslot);
1991                 root_sub_used(root, mid->len);
1992                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1993                 free_extent_buffer_stale(mid);
1994                 mid = NULL;
1995         } else {
1996                 /* update the parent key to reflect our changes */
1997                 struct btrfs_disk_key mid_key;
1998                 btrfs_node_key(mid, &mid_key, 0);
1999                 ret = tree_mod_log_insert_key(parent, pslot,
2000                                 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2001                 BUG_ON(ret < 0);
2002                 btrfs_set_node_key(parent, &mid_key, pslot);
2003                 btrfs_mark_buffer_dirty(parent);
2004         }
2005
2006         /* update the path */
2007         if (left) {
2008                 if (btrfs_header_nritems(left) > orig_slot) {
2009                         atomic_inc(&left->refs);
2010                         /* left was locked after cow */
2011                         path->nodes[level] = left;
2012                         path->slots[level + 1] -= 1;
2013                         path->slots[level] = orig_slot;
2014                         if (mid) {
2015                                 btrfs_tree_unlock(mid);
2016                                 free_extent_buffer(mid);
2017                         }
2018                 } else {
2019                         orig_slot -= btrfs_header_nritems(left);
2020                         path->slots[level] = orig_slot;
2021                 }
2022         }
2023         /* double check we haven't messed things up */
2024         if (orig_ptr !=
2025             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2026                 BUG();
2027 enospc:
2028         if (right) {
2029                 btrfs_tree_unlock(right);
2030                 free_extent_buffer(right);
2031         }
2032         if (left) {
2033                 if (path->nodes[level] != left)
2034                         btrfs_tree_unlock(left);
2035                 free_extent_buffer(left);
2036         }
2037         return ret;
2038 }
2039
2040 /* Node balancing for insertion.  Here we only split or push nodes around
2041  * when they are completely full.  This is also done top down, so we
2042  * have to be pessimistic.
2043  */
2044 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2045                                           struct btrfs_root *root,
2046                                           struct btrfs_path *path, int level)
2047 {
2048         struct btrfs_fs_info *fs_info = root->fs_info;
2049         struct extent_buffer *right = NULL;
2050         struct extent_buffer *mid;
2051         struct extent_buffer *left = NULL;
2052         struct extent_buffer *parent = NULL;
2053         int ret = 0;
2054         int wret;
2055         int pslot;
2056         int orig_slot = path->slots[level];
2057
2058         if (level == 0)
2059                 return 1;
2060
2061         mid = path->nodes[level];
2062         WARN_ON(btrfs_header_generation(mid) != trans->transid);
2063
2064         if (level < BTRFS_MAX_LEVEL - 1) {
2065                 parent = path->nodes[level + 1];
2066                 pslot = path->slots[level + 1];
2067         }
2068
2069         if (!parent)
2070                 return 1;
2071
2072         left = btrfs_read_node_slot(parent, pslot - 1);
2073         if (IS_ERR(left))
2074                 left = NULL;
2075
2076         /* first, try to make some room in the middle buffer */
2077         if (left) {
2078                 u32 left_nr;
2079
2080                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
2081                 btrfs_set_lock_blocking_write(left);
2082
2083                 left_nr = btrfs_header_nritems(left);
2084                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2085                         wret = 1;
2086                 } else {
2087                         ret = btrfs_cow_block(trans, root, left, parent,
2088                                               pslot - 1, &left,
2089                                               BTRFS_NESTING_LEFT_COW);
2090                         if (ret)
2091                                 wret = 1;
2092                         else {
2093                                 wret = push_node_left(trans, left, mid, 0);
2094                         }
2095                 }
2096                 if (wret < 0)
2097                         ret = wret;
2098                 if (wret == 0) {
2099                         struct btrfs_disk_key disk_key;
2100                         orig_slot += left_nr;
2101                         btrfs_node_key(mid, &disk_key, 0);
2102                         ret = tree_mod_log_insert_key(parent, pslot,
2103                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2104                         BUG_ON(ret < 0);
2105                         btrfs_set_node_key(parent, &disk_key, pslot);
2106                         btrfs_mark_buffer_dirty(parent);
2107                         if (btrfs_header_nritems(left) > orig_slot) {
2108                                 path->nodes[level] = left;
2109                                 path->slots[level + 1] -= 1;
2110                                 path->slots[level] = orig_slot;
2111                                 btrfs_tree_unlock(mid);
2112                                 free_extent_buffer(mid);
2113                         } else {
2114                                 orig_slot -=
2115                                         btrfs_header_nritems(left);
2116                                 path->slots[level] = orig_slot;
2117                                 btrfs_tree_unlock(left);
2118                                 free_extent_buffer(left);
2119                         }
2120                         return 0;
2121                 }
2122                 btrfs_tree_unlock(left);
2123                 free_extent_buffer(left);
2124         }
2125         right = btrfs_read_node_slot(parent, pslot + 1);
2126         if (IS_ERR(right))
2127                 right = NULL;
2128
2129         /*
2130          * then try to empty the right most buffer into the middle
2131          */
2132         if (right) {
2133                 u32 right_nr;
2134
2135                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2136                 btrfs_set_lock_blocking_write(right);
2137
2138                 right_nr = btrfs_header_nritems(right);
2139                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2140                         wret = 1;
2141                 } else {
2142                         ret = btrfs_cow_block(trans, root, right,
2143                                               parent, pslot + 1,
2144                                               &right, BTRFS_NESTING_RIGHT_COW);
2145                         if (ret)
2146                                 wret = 1;
2147                         else {
2148                                 wret = balance_node_right(trans, right, mid);
2149                         }
2150                 }
2151                 if (wret < 0)
2152                         ret = wret;
2153                 if (wret == 0) {
2154                         struct btrfs_disk_key disk_key;
2155
2156                         btrfs_node_key(right, &disk_key, 0);
2157                         ret = tree_mod_log_insert_key(parent, pslot + 1,
2158                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2159                         BUG_ON(ret < 0);
2160                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2161                         btrfs_mark_buffer_dirty(parent);
2162
2163                         if (btrfs_header_nritems(mid) <= orig_slot) {
2164                                 path->nodes[level] = right;
2165                                 path->slots[level + 1] += 1;
2166                                 path->slots[level] = orig_slot -
2167                                         btrfs_header_nritems(mid);
2168                                 btrfs_tree_unlock(mid);
2169                                 free_extent_buffer(mid);
2170                         } else {
2171                                 btrfs_tree_unlock(right);
2172                                 free_extent_buffer(right);
2173                         }
2174                         return 0;
2175                 }
2176                 btrfs_tree_unlock(right);
2177                 free_extent_buffer(right);
2178         }
2179         return 1;
2180 }
2181
2182 /*
2183  * readahead one full node of leaves, finding things that are close
2184  * to the block in 'slot', and triggering ra on them.
2185  */
2186 static void reada_for_search(struct btrfs_fs_info *fs_info,
2187                              struct btrfs_path *path,
2188                              int level, int slot, u64 objectid)
2189 {
2190         struct extent_buffer *node;
2191         struct btrfs_disk_key disk_key;
2192         u32 nritems;
2193         u64 search;
2194         u64 target;
2195         u64 nread = 0;
2196         struct extent_buffer *eb;
2197         u32 nr;
2198         u32 blocksize;
2199         u32 nscan = 0;
2200
2201         if (level != 1)
2202                 return;
2203
2204         if (!path->nodes[level])
2205                 return;
2206
2207         node = path->nodes[level];
2208
2209         search = btrfs_node_blockptr(node, slot);
2210         blocksize = fs_info->nodesize;
2211         eb = find_extent_buffer(fs_info, search);
2212         if (eb) {
2213                 free_extent_buffer(eb);
2214                 return;
2215         }
2216
2217         target = search;
2218
2219         nritems = btrfs_header_nritems(node);
2220         nr = slot;
2221
2222         while (1) {
2223                 if (path->reada == READA_BACK) {
2224                         if (nr == 0)
2225                                 break;
2226                         nr--;
2227                 } else if (path->reada == READA_FORWARD) {
2228                         nr++;
2229                         if (nr >= nritems)
2230                                 break;
2231                 }
2232                 if (path->reada == READA_BACK && objectid) {
2233                         btrfs_node_key(node, &disk_key, nr);
2234                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2235                                 break;
2236                 }
2237                 search = btrfs_node_blockptr(node, nr);
2238                 if ((search <= target && target - search <= 65536) ||
2239                     (search > target && search - target <= 65536)) {
2240                         readahead_tree_block(fs_info, search);
2241                         nread += blocksize;
2242                 }
2243                 nscan++;
2244                 if ((nread > 65536 || nscan > 32))
2245                         break;
2246         }
2247 }
2248
2249 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2250                                        struct btrfs_path *path, int level)
2251 {
2252         int slot;
2253         int nritems;
2254         struct extent_buffer *parent;
2255         struct extent_buffer *eb;
2256         u64 gen;
2257         u64 block1 = 0;
2258         u64 block2 = 0;
2259
2260         parent = path->nodes[level + 1];
2261         if (!parent)
2262                 return;
2263
2264         nritems = btrfs_header_nritems(parent);
2265         slot = path->slots[level + 1];
2266
2267         if (slot > 0) {
2268                 block1 = btrfs_node_blockptr(parent, slot - 1);
2269                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2270                 eb = find_extent_buffer(fs_info, block1);
2271                 /*
2272                  * if we get -eagain from btrfs_buffer_uptodate, we
2273                  * don't want to return eagain here.  That will loop
2274                  * forever
2275                  */
2276                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2277                         block1 = 0;
2278                 free_extent_buffer(eb);
2279         }
2280         if (slot + 1 < nritems) {
2281                 block2 = btrfs_node_blockptr(parent, slot + 1);
2282                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2283                 eb = find_extent_buffer(fs_info, block2);
2284                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2285                         block2 = 0;
2286                 free_extent_buffer(eb);
2287         }
2288
2289         if (block1)
2290                 readahead_tree_block(fs_info, block1);
2291         if (block2)
2292                 readahead_tree_block(fs_info, block2);
2293 }
2294
2295
2296 /*
2297  * when we walk down the tree, it is usually safe to unlock the higher layers
2298  * in the tree.  The exceptions are when our path goes through slot 0, because
2299  * operations on the tree might require changing key pointers higher up in the
2300  * tree.
2301  *
2302  * callers might also have set path->keep_locks, which tells this code to keep
2303  * the lock if the path points to the last slot in the block.  This is part of
2304  * walking through the tree, and selecting the next slot in the higher block.
2305  *
2306  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2307  * if lowest_unlock is 1, level 0 won't be unlocked
2308  */
2309 static noinline void unlock_up(struct btrfs_path *path, int level,
2310                                int lowest_unlock, int min_write_lock_level,
2311                                int *write_lock_level)
2312 {
2313         int i;
2314         int skip_level = level;
2315         int no_skips = 0;
2316         struct extent_buffer *t;
2317
2318         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2319                 if (!path->nodes[i])
2320                         break;
2321                 if (!path->locks[i])
2322                         break;
2323                 if (!no_skips && path->slots[i] == 0) {
2324                         skip_level = i + 1;
2325                         continue;
2326                 }
2327                 if (!no_skips && path->keep_locks) {
2328                         u32 nritems;
2329                         t = path->nodes[i];
2330                         nritems = btrfs_header_nritems(t);
2331                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2332                                 skip_level = i + 1;
2333                                 continue;
2334                         }
2335                 }
2336                 if (skip_level < i && i >= lowest_unlock)
2337                         no_skips = 1;
2338
2339                 t = path->nodes[i];
2340                 if (i >= lowest_unlock && i > skip_level) {
2341                         btrfs_tree_unlock_rw(t, path->locks[i]);
2342                         path->locks[i] = 0;
2343                         if (write_lock_level &&
2344                             i > min_write_lock_level &&
2345                             i <= *write_lock_level) {
2346                                 *write_lock_level = i - 1;
2347                         }
2348                 }
2349         }
2350 }
2351
2352 /*
2353  * helper function for btrfs_search_slot.  The goal is to find a block
2354  * in cache without setting the path to blocking.  If we find the block
2355  * we return zero and the path is unchanged.
2356  *
2357  * If we can't find the block, we set the path blocking and do some
2358  * reada.  -EAGAIN is returned and the search must be repeated.
2359  */
2360 static int
2361 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2362                       struct extent_buffer **eb_ret, int level, int slot,
2363                       const struct btrfs_key *key)
2364 {
2365         struct btrfs_fs_info *fs_info = root->fs_info;
2366         u64 blocknr;
2367         u64 gen;
2368         struct extent_buffer *tmp;
2369         struct btrfs_key first_key;
2370         int ret;
2371         int parent_level;
2372
2373         blocknr = btrfs_node_blockptr(*eb_ret, slot);
2374         gen = btrfs_node_ptr_generation(*eb_ret, slot);
2375         parent_level = btrfs_header_level(*eb_ret);
2376         btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
2377
2378         tmp = find_extent_buffer(fs_info, blocknr);
2379         if (tmp) {
2380                 /* first we do an atomic uptodate check */
2381                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2382                         /*
2383                          * Do extra check for first_key, eb can be stale due to
2384                          * being cached, read from scrub, or have multiple
2385                          * parents (shared tree blocks).
2386                          */
2387                         if (btrfs_verify_level_key(tmp,
2388                                         parent_level - 1, &first_key, gen)) {
2389                                 free_extent_buffer(tmp);
2390                                 return -EUCLEAN;
2391                         }
2392                         *eb_ret = tmp;
2393                         return 0;
2394                 }
2395
2396                 /* the pages were up to date, but we failed
2397                  * the generation number check.  Do a full
2398                  * read for the generation number that is correct.
2399                  * We must do this without dropping locks so
2400                  * we can trust our generation number
2401                  */
2402                 btrfs_set_path_blocking(p);
2403
2404                 /* now we're allowed to do a blocking uptodate check */
2405                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2406                 if (!ret) {
2407                         *eb_ret = tmp;
2408                         return 0;
2409                 }
2410                 free_extent_buffer(tmp);
2411                 btrfs_release_path(p);
2412                 return -EIO;
2413         }
2414
2415         /*
2416          * reduce lock contention at high levels
2417          * of the btree by dropping locks before
2418          * we read.  Don't release the lock on the current
2419          * level because we need to walk this node to figure
2420          * out which blocks to read.
2421          */
2422         btrfs_unlock_up_safe(p, level + 1);
2423         btrfs_set_path_blocking(p);
2424
2425         if (p->reada != READA_NONE)
2426                 reada_for_search(fs_info, p, level, slot, key->objectid);
2427
2428         ret = -EAGAIN;
2429         tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2430                               &first_key);
2431         if (!IS_ERR(tmp)) {
2432                 /*
2433                  * If the read above didn't mark this buffer up to date,
2434                  * it will never end up being up to date.  Set ret to EIO now
2435                  * and give up so that our caller doesn't loop forever
2436                  * on our EAGAINs.
2437                  */
2438                 if (!extent_buffer_uptodate(tmp))
2439                         ret = -EIO;
2440                 free_extent_buffer(tmp);
2441         } else {
2442                 ret = PTR_ERR(tmp);
2443         }
2444
2445         btrfs_release_path(p);
2446         return ret;
2447 }
2448
2449 /*
2450  * helper function for btrfs_search_slot.  This does all of the checks
2451  * for node-level blocks and does any balancing required based on
2452  * the ins_len.
2453  *
2454  * If no extra work was required, zero is returned.  If we had to
2455  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2456  * start over
2457  */
2458 static int
2459 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2460                        struct btrfs_root *root, struct btrfs_path *p,
2461                        struct extent_buffer *b, int level, int ins_len,
2462                        int *write_lock_level)
2463 {
2464         struct btrfs_fs_info *fs_info = root->fs_info;
2465         int ret;
2466
2467         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2468             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2469                 int sret;
2470
2471                 if (*write_lock_level < level + 1) {
2472                         *write_lock_level = level + 1;
2473                         btrfs_release_path(p);
2474                         goto again;
2475                 }
2476
2477                 btrfs_set_path_blocking(p);
2478                 reada_for_balance(fs_info, p, level);
2479                 sret = split_node(trans, root, p, level);
2480
2481                 BUG_ON(sret > 0);
2482                 if (sret) {
2483                         ret = sret;
2484                         goto done;
2485                 }
2486                 b = p->nodes[level];
2487         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2488                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2489                 int sret;
2490
2491                 if (*write_lock_level < level + 1) {
2492                         *write_lock_level = level + 1;
2493                         btrfs_release_path(p);
2494                         goto again;
2495                 }
2496
2497                 btrfs_set_path_blocking(p);
2498                 reada_for_balance(fs_info, p, level);
2499                 sret = balance_level(trans, root, p, level);
2500
2501                 if (sret) {
2502                         ret = sret;
2503                         goto done;
2504                 }
2505                 b = p->nodes[level];
2506                 if (!b) {
2507                         btrfs_release_path(p);
2508                         goto again;
2509                 }
2510                 BUG_ON(btrfs_header_nritems(b) == 1);
2511         }
2512         return 0;
2513
2514 again:
2515         ret = -EAGAIN;
2516 done:
2517         return ret;
2518 }
2519
2520 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2521                 u64 iobjectid, u64 ioff, u8 key_type,
2522                 struct btrfs_key *found_key)
2523 {
2524         int ret;
2525         struct btrfs_key key;
2526         struct extent_buffer *eb;
2527
2528         ASSERT(path);
2529         ASSERT(found_key);
2530
2531         key.type = key_type;
2532         key.objectid = iobjectid;
2533         key.offset = ioff;
2534
2535         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2536         if (ret < 0)
2537                 return ret;
2538
2539         eb = path->nodes[0];
2540         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2541                 ret = btrfs_next_leaf(fs_root, path);
2542                 if (ret)
2543                         return ret;
2544                 eb = path->nodes[0];
2545         }
2546
2547         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2548         if (found_key->type != key.type ||
2549                         found_key->objectid != key.objectid)
2550                 return 1;
2551
2552         return 0;
2553 }
2554
2555 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2556                                                         struct btrfs_path *p,
2557                                                         int write_lock_level)
2558 {
2559         struct btrfs_fs_info *fs_info = root->fs_info;
2560         struct extent_buffer *b;
2561         int root_lock;
2562         int level = 0;
2563
2564         /* We try very hard to do read locks on the root */
2565         root_lock = BTRFS_READ_LOCK;
2566
2567         if (p->search_commit_root) {
2568                 /*
2569                  * The commit roots are read only so we always do read locks,
2570                  * and we always must hold the commit_root_sem when doing
2571                  * searches on them, the only exception is send where we don't
2572                  * want to block transaction commits for a long time, so
2573                  * we need to clone the commit root in order to avoid races
2574                  * with transaction commits that create a snapshot of one of
2575                  * the roots used by a send operation.
2576                  */
2577                 if (p->need_commit_sem) {
2578                         down_read(&fs_info->commit_root_sem);
2579                         b = btrfs_clone_extent_buffer(root->commit_root);
2580                         up_read(&fs_info->commit_root_sem);
2581                         if (!b)
2582                                 return ERR_PTR(-ENOMEM);
2583
2584                 } else {
2585                         b = root->commit_root;
2586                         atomic_inc(&b->refs);
2587                 }
2588                 level = btrfs_header_level(b);
2589                 /*
2590                  * Ensure that all callers have set skip_locking when
2591                  * p->search_commit_root = 1.
2592                  */
2593                 ASSERT(p->skip_locking == 1);
2594
2595                 goto out;
2596         }
2597
2598         if (p->skip_locking) {
2599                 b = btrfs_root_node(root);
2600                 level = btrfs_header_level(b);
2601                 goto out;
2602         }
2603
2604         /*
2605          * If the level is set to maximum, we can skip trying to get the read
2606          * lock.
2607          */
2608         if (write_lock_level < BTRFS_MAX_LEVEL) {
2609                 /*
2610                  * We don't know the level of the root node until we actually
2611                  * have it read locked
2612                  */
2613                 b = __btrfs_read_lock_root_node(root, p->recurse);
2614                 level = btrfs_header_level(b);
2615                 if (level > write_lock_level)
2616                         goto out;
2617
2618                 /* Whoops, must trade for write lock */
2619                 btrfs_tree_read_unlock(b);
2620                 free_extent_buffer(b);
2621         }
2622
2623         b = btrfs_lock_root_node(root);
2624         root_lock = BTRFS_WRITE_LOCK;
2625
2626         /* The level might have changed, check again */
2627         level = btrfs_header_level(b);
2628
2629 out:
2630         p->nodes[level] = b;
2631         if (!p->skip_locking)
2632                 p->locks[level] = root_lock;
2633         /*
2634          * Callers are responsible for dropping b's references.
2635          */
2636         return b;
2637 }
2638
2639
2640 /*
2641  * btrfs_search_slot - look for a key in a tree and perform necessary
2642  * modifications to preserve tree invariants.
2643  *
2644  * @trans:      Handle of transaction, used when modifying the tree
2645  * @p:          Holds all btree nodes along the search path
2646  * @root:       The root node of the tree
2647  * @key:        The key we are looking for
2648  * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2649  *              deletions it's -1. 0 for plain searches
2650  * @cow:        boolean should CoW operations be performed. Must always be 1
2651  *              when modifying the tree.
2652  *
2653  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2654  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2655  *
2656  * If @key is found, 0 is returned and you can find the item in the leaf level
2657  * of the path (level 0)
2658  *
2659  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2660  * points to the slot where it should be inserted
2661  *
2662  * If an error is encountered while searching the tree a negative error number
2663  * is returned
2664  */
2665 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2666                       const struct btrfs_key *key, struct btrfs_path *p,
2667                       int ins_len, int cow)
2668 {
2669         struct extent_buffer *b;
2670         int slot;
2671         int ret;
2672         int err;
2673         int level;
2674         int lowest_unlock = 1;
2675         /* everything at write_lock_level or lower must be write locked */
2676         int write_lock_level = 0;
2677         u8 lowest_level = 0;
2678         int min_write_lock_level;
2679         int prev_cmp;
2680
2681         lowest_level = p->lowest_level;
2682         WARN_ON(lowest_level && ins_len > 0);
2683         WARN_ON(p->nodes[0] != NULL);
2684         BUG_ON(!cow && ins_len);
2685
2686         if (ins_len < 0) {
2687                 lowest_unlock = 2;
2688
2689                 /* when we are removing items, we might have to go up to level
2690                  * two as we update tree pointers  Make sure we keep write
2691                  * for those levels as well
2692                  */
2693                 write_lock_level = 2;
2694         } else if (ins_len > 0) {
2695                 /*
2696                  * for inserting items, make sure we have a write lock on
2697                  * level 1 so we can update keys
2698                  */
2699                 write_lock_level = 1;
2700         }
2701
2702         if (!cow)
2703                 write_lock_level = -1;
2704
2705         if (cow && (p->keep_locks || p->lowest_level))
2706                 write_lock_level = BTRFS_MAX_LEVEL;
2707
2708         min_write_lock_level = write_lock_level;
2709
2710 again:
2711         prev_cmp = -1;
2712         b = btrfs_search_slot_get_root(root, p, write_lock_level);
2713         if (IS_ERR(b)) {
2714                 ret = PTR_ERR(b);
2715                 goto done;
2716         }
2717
2718         while (b) {
2719                 int dec = 0;
2720
2721                 level = btrfs_header_level(b);
2722
2723                 if (cow) {
2724                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2725
2726                         /*
2727                          * if we don't really need to cow this block
2728                          * then we don't want to set the path blocking,
2729                          * so we test it here
2730                          */
2731                         if (!should_cow_block(trans, root, b)) {
2732                                 trans->dirty = true;
2733                                 goto cow_done;
2734                         }
2735
2736                         /*
2737                          * must have write locks on this node and the
2738                          * parent
2739                          */
2740                         if (level > write_lock_level ||
2741                             (level + 1 > write_lock_level &&
2742                             level + 1 < BTRFS_MAX_LEVEL &&
2743                             p->nodes[level + 1])) {
2744                                 write_lock_level = level + 1;
2745                                 btrfs_release_path(p);
2746                                 goto again;
2747                         }
2748
2749                         btrfs_set_path_blocking(p);
2750                         if (last_level)
2751                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2752                                                       &b,
2753                                                       BTRFS_NESTING_COW);
2754                         else
2755                                 err = btrfs_cow_block(trans, root, b,
2756                                                       p->nodes[level + 1],
2757                                                       p->slots[level + 1], &b,
2758                                                       BTRFS_NESTING_COW);
2759                         if (err) {
2760                                 ret = err;
2761                                 goto done;
2762                         }
2763                 }
2764 cow_done:
2765                 p->nodes[level] = b;
2766                 /*
2767                  * Leave path with blocking locks to avoid massive
2768                  * lock context switch, this is made on purpose.
2769                  */
2770
2771                 /*
2772                  * we have a lock on b and as long as we aren't changing
2773                  * the tree, there is no way to for the items in b to change.
2774                  * It is safe to drop the lock on our parent before we
2775                  * go through the expensive btree search on b.
2776                  *
2777                  * If we're inserting or deleting (ins_len != 0), then we might
2778                  * be changing slot zero, which may require changing the parent.
2779                  * So, we can't drop the lock until after we know which slot
2780                  * we're operating on.
2781                  */
2782                 if (!ins_len && !p->keep_locks) {
2783                         int u = level + 1;
2784
2785                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2786                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2787                                 p->locks[u] = 0;
2788                         }
2789                 }
2790
2791                 /*
2792                  * If btrfs_bin_search returns an exact match (prev_cmp == 0)
2793                  * we can safely assume the target key will always be in slot 0
2794                  * on lower levels due to the invariants BTRFS' btree provides,
2795                  * namely that a btrfs_key_ptr entry always points to the
2796                  * lowest key in the child node, thus we can skip searching
2797                  * lower levels
2798                  */
2799                 if (prev_cmp == 0) {
2800                         slot = 0;
2801                         ret = 0;
2802                 } else {
2803                         ret = btrfs_bin_search(b, key, &slot);
2804                         prev_cmp = ret;
2805                         if (ret < 0)
2806                                 goto done;
2807                 }
2808
2809                 if (level == 0) {
2810                         p->slots[level] = slot;
2811                         if (ins_len > 0 &&
2812                             btrfs_leaf_free_space(b) < ins_len) {
2813                                 if (write_lock_level < 1) {
2814                                         write_lock_level = 1;
2815                                         btrfs_release_path(p);
2816                                         goto again;
2817                                 }
2818
2819                                 btrfs_set_path_blocking(p);
2820                                 err = split_leaf(trans, root, key,
2821                                                  p, ins_len, ret == 0);
2822
2823                                 BUG_ON(err > 0);
2824                                 if (err) {
2825                                         ret = err;
2826                                         goto done;
2827                                 }
2828                         }
2829                         if (!p->search_for_split)
2830                                 unlock_up(p, level, lowest_unlock,
2831                                           min_write_lock_level, NULL);
2832                         goto done;
2833                 }
2834                 if (ret && slot > 0) {
2835                         dec = 1;
2836                         slot--;
2837                 }
2838                 p->slots[level] = slot;
2839                 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2840                                              &write_lock_level);
2841                 if (err == -EAGAIN)
2842                         goto again;
2843                 if (err) {
2844                         ret = err;
2845                         goto done;
2846                 }
2847                 b = p->nodes[level];
2848                 slot = p->slots[level];
2849
2850                 /*
2851                  * Slot 0 is special, if we change the key we have to update
2852                  * the parent pointer which means we must have a write lock on
2853                  * the parent
2854                  */
2855                 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2856                         write_lock_level = level + 1;
2857                         btrfs_release_path(p);
2858                         goto again;
2859                 }
2860
2861                 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2862                           &write_lock_level);
2863
2864                 if (level == lowest_level) {
2865                         if (dec)
2866                                 p->slots[level]++;
2867                         goto done;
2868                 }
2869
2870                 err = read_block_for_search(root, p, &b, level, slot, key);
2871                 if (err == -EAGAIN)
2872                         goto again;
2873                 if (err) {
2874                         ret = err;
2875                         goto done;
2876                 }
2877
2878                 if (!p->skip_locking) {
2879                         level = btrfs_header_level(b);
2880                         if (level <= write_lock_level) {
2881                                 if (!btrfs_try_tree_write_lock(b)) {
2882                                         btrfs_set_path_blocking(p);
2883                                         btrfs_tree_lock(b);
2884                                 }
2885                                 p->locks[level] = BTRFS_WRITE_LOCK;
2886                         } else {
2887                                 if (!btrfs_tree_read_lock_atomic(b)) {
2888                                         btrfs_set_path_blocking(p);
2889                                         __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
2890                                                                p->recurse);
2891                                 }
2892                                 p->locks[level] = BTRFS_READ_LOCK;
2893                         }
2894                         p->nodes[level] = b;
2895                 }
2896         }
2897         ret = 1;
2898 done:
2899         /*
2900          * we don't really know what they plan on doing with the path
2901          * from here on, so for now just mark it as blocking
2902          */
2903         if (!p->leave_spinning)
2904                 btrfs_set_path_blocking(p);
2905         if (ret < 0 && !p->skip_release_on_error)
2906                 btrfs_release_path(p);
2907         return ret;
2908 }
2909
2910 /*
2911  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2912  * current state of the tree together with the operations recorded in the tree
2913  * modification log to search for the key in a previous version of this tree, as
2914  * denoted by the time_seq parameter.
2915  *
2916  * Naturally, there is no support for insert, delete or cow operations.
2917  *
2918  * The resulting path and return value will be set up as if we called
2919  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2920  */
2921 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2922                           struct btrfs_path *p, u64 time_seq)
2923 {
2924         struct btrfs_fs_info *fs_info = root->fs_info;
2925         struct extent_buffer *b;
2926         int slot;
2927         int ret;
2928         int err;
2929         int level;
2930         int lowest_unlock = 1;
2931         u8 lowest_level = 0;
2932
2933         lowest_level = p->lowest_level;
2934         WARN_ON(p->nodes[0] != NULL);
2935
2936         if (p->search_commit_root) {
2937                 BUG_ON(time_seq);
2938                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2939         }
2940
2941 again:
2942         b = get_old_root(root, time_seq);
2943         if (!b) {
2944                 ret = -EIO;
2945                 goto done;
2946         }
2947         level = btrfs_header_level(b);
2948         p->locks[level] = BTRFS_READ_LOCK;
2949
2950         while (b) {
2951                 int dec = 0;
2952
2953                 level = btrfs_header_level(b);
2954                 p->nodes[level] = b;
2955
2956                 /*
2957                  * we have a lock on b and as long as we aren't changing
2958                  * the tree, there is no way to for the items in b to change.
2959                  * It is safe to drop the lock on our parent before we
2960                  * go through the expensive btree search on b.
2961                  */
2962                 btrfs_unlock_up_safe(p, level + 1);
2963
2964                 ret = btrfs_bin_search(b, key, &slot);
2965                 if (ret < 0)
2966                         goto done;
2967
2968                 if (level == 0) {
2969                         p->slots[level] = slot;
2970                         unlock_up(p, level, lowest_unlock, 0, NULL);
2971                         goto done;
2972                 }
2973
2974                 if (ret && slot > 0) {
2975                         dec = 1;
2976                         slot--;
2977                 }
2978                 p->slots[level] = slot;
2979                 unlock_up(p, level, lowest_unlock, 0, NULL);
2980
2981                 if (level == lowest_level) {
2982                         if (dec)
2983                                 p->slots[level]++;
2984                         goto done;
2985                 }
2986
2987                 err = read_block_for_search(root, p, &b, level, slot, key);
2988                 if (err == -EAGAIN)
2989                         goto again;
2990                 if (err) {
2991                         ret = err;
2992                         goto done;
2993                 }
2994
2995                 level = btrfs_header_level(b);
2996                 if (!btrfs_tree_read_lock_atomic(b)) {
2997                         btrfs_set_path_blocking(p);
2998                         btrfs_tree_read_lock(b);
2999                 }
3000                 b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3001                 if (!b) {
3002                         ret = -ENOMEM;
3003                         goto done;
3004                 }
3005                 p->locks[level] = BTRFS_READ_LOCK;
3006                 p->nodes[level] = b;
3007         }
3008         ret = 1;
3009 done:
3010         if (!p->leave_spinning)
3011                 btrfs_set_path_blocking(p);
3012         if (ret < 0)
3013                 btrfs_release_path(p);
3014
3015         return ret;
3016 }
3017
3018 /*
3019  * helper to use instead of search slot if no exact match is needed but
3020  * instead the next or previous item should be returned.
3021  * When find_higher is true, the next higher item is returned, the next lower
3022  * otherwise.
3023  * When return_any and find_higher are both true, and no higher item is found,
3024  * return the next lower instead.
3025  * When return_any is true and find_higher is false, and no lower item is found,
3026  * return the next higher instead.
3027  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3028  * < 0 on error
3029  */
3030 int btrfs_search_slot_for_read(struct btrfs_root *root,
3031                                const struct btrfs_key *key,
3032                                struct btrfs_path *p, int find_higher,
3033                                int return_any)
3034 {
3035         int ret;
3036         struct extent_buffer *leaf;
3037
3038 again:
3039         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3040         if (ret <= 0)
3041                 return ret;
3042         /*
3043          * a return value of 1 means the path is at the position where the
3044          * item should be inserted. Normally this is the next bigger item,
3045          * but in case the previous item is the last in a leaf, path points
3046          * to the first free slot in the previous leaf, i.e. at an invalid
3047          * item.
3048          */
3049         leaf = p->nodes[0];
3050
3051         if (find_higher) {
3052                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3053                         ret = btrfs_next_leaf(root, p);
3054                         if (ret <= 0)
3055                                 return ret;
3056                         if (!return_any)
3057                                 return 1;
3058                         /*
3059                          * no higher item found, return the next
3060                          * lower instead
3061                          */
3062                         return_any = 0;
3063                         find_higher = 0;
3064                         btrfs_release_path(p);
3065                         goto again;
3066                 }
3067         } else {
3068                 if (p->slots[0] == 0) {
3069                         ret = btrfs_prev_leaf(root, p);
3070                         if (ret < 0)
3071                                 return ret;
3072                         if (!ret) {
3073                                 leaf = p->nodes[0];
3074                                 if (p->slots[0] == btrfs_header_nritems(leaf))
3075                                         p->slots[0]--;
3076                                 return 0;
3077                         }
3078                         if (!return_any)
3079                                 return 1;
3080                         /*
3081                          * no lower item found, return the next
3082                          * higher instead
3083                          */
3084                         return_any = 0;
3085                         find_higher = 1;
3086                         btrfs_release_path(p);
3087                         goto again;
3088                 } else {
3089                         --p->slots[0];
3090                 }
3091         }
3092         return 0;
3093 }
3094
3095 /*
3096  * adjust the pointers going up the tree, starting at level
3097  * making sure the right key of each node is points to 'key'.
3098  * This is used after shifting pointers to the left, so it stops
3099  * fixing up pointers when a given leaf/node is not in slot 0 of the
3100  * higher levels
3101  *
3102  */
3103 static void fixup_low_keys(struct btrfs_path *path,
3104                            struct btrfs_disk_key *key, int level)
3105 {
3106         int i;
3107         struct extent_buffer *t;
3108         int ret;
3109
3110         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3111                 int tslot = path->slots[i];
3112
3113                 if (!path->nodes[i])
3114                         break;
3115                 t = path->nodes[i];
3116                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3117                                 GFP_ATOMIC);
3118                 BUG_ON(ret < 0);
3119                 btrfs_set_node_key(t, key, tslot);
3120                 btrfs_mark_buffer_dirty(path->nodes[i]);
3121                 if (tslot != 0)
3122                         break;
3123         }
3124 }
3125
3126 /*
3127  * update item key.
3128  *
3129  * This function isn't completely safe. It's the caller's responsibility
3130  * that the new key won't break the order
3131  */
3132 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3133                              struct btrfs_path *path,
3134                              const struct btrfs_key *new_key)
3135 {
3136         struct btrfs_disk_key disk_key;
3137         struct extent_buffer *eb;
3138         int slot;
3139
3140         eb = path->nodes[0];
3141         slot = path->slots[0];
3142         if (slot > 0) {
3143                 btrfs_item_key(eb, &disk_key, slot - 1);
3144                 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3145                         btrfs_crit(fs_info,
3146                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3147                                    slot, btrfs_disk_key_objectid(&disk_key),
3148                                    btrfs_disk_key_type(&disk_key),
3149                                    btrfs_disk_key_offset(&disk_key),
3150                                    new_key->objectid, new_key->type,
3151                                    new_key->offset);
3152                         btrfs_print_leaf(eb);
3153                         BUG();
3154                 }
3155         }
3156         if (slot < btrfs_header_nritems(eb) - 1) {
3157                 btrfs_item_key(eb, &disk_key, slot + 1);
3158                 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3159                         btrfs_crit(fs_info,
3160                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3161                                    slot, btrfs_disk_key_objectid(&disk_key),
3162                                    btrfs_disk_key_type(&disk_key),
3163                                    btrfs_disk_key_offset(&disk_key),
3164                                    new_key->objectid, new_key->type,
3165                                    new_key->offset);
3166                         btrfs_print_leaf(eb);
3167                         BUG();
3168                 }
3169         }
3170
3171         btrfs_cpu_key_to_disk(&disk_key, new_key);
3172         btrfs_set_item_key(eb, &disk_key, slot);
3173         btrfs_mark_buffer_dirty(eb);
3174         if (slot == 0)
3175                 fixup_low_keys(path, &disk_key, 1);
3176 }
3177
3178 /*
3179  * Check key order of two sibling extent buffers.
3180  *
3181  * Return true if something is wrong.
3182  * Return false if everything is fine.
3183  *
3184  * Tree-checker only works inside one tree block, thus the following
3185  * corruption can not be detected by tree-checker:
3186  *
3187  * Leaf @left                   | Leaf @right
3188  * --------------------------------------------------------------
3189  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
3190  *
3191  * Key f6 in leaf @left itself is valid, but not valid when the next
3192  * key in leaf @right is 7.
3193  * This can only be checked at tree block merge time.
3194  * And since tree checker has ensured all key order in each tree block
3195  * is correct, we only need to bother the last key of @left and the first
3196  * key of @right.
3197  */
3198 static bool check_sibling_keys(struct extent_buffer *left,
3199                                struct extent_buffer *right)
3200 {
3201         struct btrfs_key left_last;
3202         struct btrfs_key right_first;
3203         int level = btrfs_header_level(left);
3204         int nr_left = btrfs_header_nritems(left);
3205         int nr_right = btrfs_header_nritems(right);
3206
3207         /* No key to check in one of the tree blocks */
3208         if (!nr_left || !nr_right)
3209                 return false;
3210
3211         if (level) {
3212                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
3213                 btrfs_node_key_to_cpu(right, &right_first, 0);
3214         } else {
3215                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
3216                 btrfs_item_key_to_cpu(right, &right_first, 0);
3217         }
3218
3219         if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
3220                 btrfs_crit(left->fs_info,
3221 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
3222                            left_last.objectid, left_last.type,
3223                            left_last.offset, right_first.objectid,
3224                            right_first.type, right_first.offset);
3225                 return true;
3226         }
3227         return false;
3228 }
3229
3230 /*
3231  * try to push data from one node into the next node left in the
3232  * tree.
3233  *
3234  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3235  * error, and > 0 if there was no room in the left hand block.
3236  */
3237 static int push_node_left(struct btrfs_trans_handle *trans,
3238                           struct extent_buffer *dst,
3239                           struct extent_buffer *src, int empty)
3240 {
3241         struct btrfs_fs_info *fs_info = trans->fs_info;
3242         int push_items = 0;
3243         int src_nritems;
3244         int dst_nritems;
3245         int ret = 0;
3246
3247         src_nritems = btrfs_header_nritems(src);
3248         dst_nritems = btrfs_header_nritems(dst);
3249         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3250         WARN_ON(btrfs_header_generation(src) != trans->transid);
3251         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3252
3253         if (!empty && src_nritems <= 8)
3254                 return 1;
3255
3256         if (push_items <= 0)
3257                 return 1;
3258
3259         if (empty) {
3260                 push_items = min(src_nritems, push_items);
3261                 if (push_items < src_nritems) {
3262                         /* leave at least 8 pointers in the node if
3263                          * we aren't going to empty it
3264                          */
3265                         if (src_nritems - push_items < 8) {
3266                                 if (push_items <= 8)
3267                                         return 1;
3268                                 push_items -= 8;
3269                         }
3270                 }
3271         } else
3272                 push_items = min(src_nritems - 8, push_items);
3273
3274         /* dst is the left eb, src is the middle eb */
3275         if (check_sibling_keys(dst, src)) {
3276                 ret = -EUCLEAN;
3277                 btrfs_abort_transaction(trans, ret);
3278                 return ret;
3279         }
3280         ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3281         if (ret) {
3282                 btrfs_abort_transaction(trans, ret);
3283                 return ret;
3284         }
3285         copy_extent_buffer(dst, src,
3286                            btrfs_node_key_ptr_offset(dst_nritems),
3287                            btrfs_node_key_ptr_offset(0),
3288                            push_items * sizeof(struct btrfs_key_ptr));
3289
3290         if (push_items < src_nritems) {
3291                 /*
3292                  * Don't call tree_mod_log_insert_move here, key removal was
3293                  * already fully logged by tree_mod_log_eb_copy above.
3294                  */
3295                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3296                                       btrfs_node_key_ptr_offset(push_items),
3297                                       (src_nritems - push_items) *
3298                                       sizeof(struct btrfs_key_ptr));
3299         }
3300         btrfs_set_header_nritems(src, src_nritems - push_items);
3301         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3302         btrfs_mark_buffer_dirty(src);
3303         btrfs_mark_buffer_dirty(dst);
3304
3305         return ret;
3306 }
3307
3308 /*
3309  * try to push data from one node into the next node right in the
3310  * tree.
3311  *
3312  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3313  * error, and > 0 if there was no room in the right hand block.
3314  *
3315  * this will  only push up to 1/2 the contents of the left node over
3316  */
3317 static int balance_node_right(struct btrfs_trans_handle *trans,
3318                               struct extent_buffer *dst,
3319                               struct extent_buffer *src)
3320 {
3321         struct btrfs_fs_info *fs_info = trans->fs_info;
3322         int push_items = 0;
3323         int max_push;
3324         int src_nritems;
3325         int dst_nritems;
3326         int ret = 0;
3327
3328         WARN_ON(btrfs_header_generation(src) != trans->transid);
3329         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3330
3331         src_nritems = btrfs_header_nritems(src);
3332         dst_nritems = btrfs_header_nritems(dst);
3333         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3334         if (push_items <= 0)
3335                 return 1;
3336
3337         if (src_nritems < 4)
3338                 return 1;
3339
3340         max_push = src_nritems / 2 + 1;
3341         /* don't try to empty the node */
3342         if (max_push >= src_nritems)
3343                 return 1;
3344
3345         if (max_push < push_items)
3346                 push_items = max_push;
3347
3348         /* dst is the right eb, src is the middle eb */
3349         if (check_sibling_keys(src, dst)) {
3350                 ret = -EUCLEAN;
3351                 btrfs_abort_transaction(trans, ret);
3352                 return ret;
3353         }
3354         ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3355         BUG_ON(ret < 0);
3356         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3357                                       btrfs_node_key_ptr_offset(0),
3358                                       (dst_nritems) *
3359                                       sizeof(struct btrfs_key_ptr));
3360
3361         ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3362                                    push_items);
3363         if (ret) {
3364                 btrfs_abort_transaction(trans, ret);
3365                 return ret;
3366         }
3367         copy_extent_buffer(dst, src,
3368                            btrfs_node_key_ptr_offset(0),
3369                            btrfs_node_key_ptr_offset(src_nritems - push_items),
3370                            push_items * sizeof(struct btrfs_key_ptr));
3371
3372         btrfs_set_header_nritems(src, src_nritems - push_items);
3373         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3374
3375         btrfs_mark_buffer_dirty(src);
3376         btrfs_mark_buffer_dirty(dst);
3377
3378         return ret;
3379 }
3380
3381 /*
3382  * helper function to insert a new root level in the tree.
3383  * A new node is allocated, and a single item is inserted to
3384  * point to the existing root
3385  *
3386  * returns zero on success or < 0 on failure.
3387  */
3388 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3389                            struct btrfs_root *root,
3390                            struct btrfs_path *path, int level)
3391 {
3392         struct btrfs_fs_info *fs_info = root->fs_info;
3393         u64 lower_gen;
3394         struct extent_buffer *lower;
3395         struct extent_buffer *c;
3396         struct extent_buffer *old;
3397         struct btrfs_disk_key lower_key;
3398         int ret;
3399
3400         BUG_ON(path->nodes[level]);
3401         BUG_ON(path->nodes[level-1] != root->node);
3402
3403         lower = path->nodes[level-1];
3404         if (level == 1)
3405                 btrfs_item_key(lower, &lower_key, 0);
3406         else
3407                 btrfs_node_key(lower, &lower_key, 0);
3408
3409         c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3410                                          root->node->start, 0,
3411                                          BTRFS_NESTING_NEW_ROOT);
3412         if (IS_ERR(c))
3413                 return PTR_ERR(c);
3414
3415         root_add_used(root, fs_info->nodesize);
3416
3417         btrfs_set_header_nritems(c, 1);
3418         btrfs_set_node_key(c, &lower_key, 0);
3419         btrfs_set_node_blockptr(c, 0, lower->start);
3420         lower_gen = btrfs_header_generation(lower);
3421         WARN_ON(lower_gen != trans->transid);
3422
3423         btrfs_set_node_ptr_generation(c, 0, lower_gen);
3424
3425         btrfs_mark_buffer_dirty(c);
3426
3427         old = root->node;
3428         ret = tree_mod_log_insert_root(root->node, c, 0);
3429         BUG_ON(ret < 0);
3430         rcu_assign_pointer(root->node, c);
3431
3432         /* the super has an extra ref to root->node */
3433         free_extent_buffer(old);
3434
3435         add_root_to_dirty_list(root);
3436         atomic_inc(&c->refs);
3437         path->nodes[level] = c;
3438         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3439         path->slots[level] = 0;
3440         return 0;
3441 }
3442
3443 /*
3444  * worker function to insert a single pointer in a node.
3445  * the node should have enough room for the pointer already
3446  *
3447  * slot and level indicate where you want the key to go, and
3448  * blocknr is the block the key points to.
3449  */
3450 static void insert_ptr(struct btrfs_trans_handle *trans,
3451                        struct btrfs_path *path,
3452                        struct btrfs_disk_key *key, u64 bytenr,
3453                        int slot, int level)
3454 {
3455         struct extent_buffer *lower;
3456         int nritems;
3457         int ret;
3458
3459         BUG_ON(!path->nodes[level]);
3460         btrfs_assert_tree_locked(path->nodes[level]);
3461         lower = path->nodes[level];
3462         nritems = btrfs_header_nritems(lower);
3463         BUG_ON(slot > nritems);
3464         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3465         if (slot != nritems) {
3466                 if (level) {
3467                         ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3468                                         nritems - slot);
3469                         BUG_ON(ret < 0);
3470                 }
3471                 memmove_extent_buffer(lower,
3472                               btrfs_node_key_ptr_offset(slot + 1),
3473                               btrfs_node_key_ptr_offset(slot),
3474                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3475         }
3476         if (level) {
3477                 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3478                                 GFP_NOFS);
3479                 BUG_ON(ret < 0);
3480         }
3481         btrfs_set_node_key(lower, key, slot);
3482         btrfs_set_node_blockptr(lower, slot, bytenr);
3483         WARN_ON(trans->transid == 0);
3484         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3485         btrfs_set_header_nritems(lower, nritems + 1);
3486         btrfs_mark_buffer_dirty(lower);
3487 }
3488
3489 /*
3490  * split the node at the specified level in path in two.
3491  * The path is corrected to point to the appropriate node after the split
3492  *
3493  * Before splitting this tries to make some room in the node by pushing
3494  * left and right, if either one works, it returns right away.
3495  *
3496  * returns 0 on success and < 0 on failure
3497  */
3498 static noinline int split_node(struct btrfs_trans_handle *trans,
3499                                struct btrfs_root *root,
3500                                struct btrfs_path *path, int level)
3501 {
3502         struct btrfs_fs_info *fs_info = root->fs_info;
3503         struct extent_buffer *c;
3504         struct extent_buffer *split;
3505         struct btrfs_disk_key disk_key;
3506         int mid;
3507         int ret;
3508         u32 c_nritems;
3509
3510         c = path->nodes[level];
3511         WARN_ON(btrfs_header_generation(c) != trans->transid);
3512         if (c == root->node) {
3513                 /*
3514                  * trying to split the root, lets make a new one
3515                  *
3516                  * tree mod log: We don't log_removal old root in
3517                  * insert_new_root, because that root buffer will be kept as a
3518                  * normal node. We are going to log removal of half of the
3519                  * elements below with tree_mod_log_eb_copy. We're holding a
3520                  * tree lock on the buffer, which is why we cannot race with
3521                  * other tree_mod_log users.
3522                  */
3523                 ret = insert_new_root(trans, root, path, level + 1);
3524                 if (ret)
3525                         return ret;
3526         } else {
3527                 ret = push_nodes_for_insert(trans, root, path, level);
3528                 c = path->nodes[level];
3529                 if (!ret && btrfs_header_nritems(c) <
3530                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3531                         return 0;
3532                 if (ret < 0)
3533                         return ret;
3534         }
3535
3536         c_nritems = btrfs_header_nritems(c);
3537         mid = (c_nritems + 1) / 2;
3538         btrfs_node_key(c, &disk_key, mid);
3539
3540         split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3541                                              c->start, 0, BTRFS_NESTING_SPLIT);
3542         if (IS_ERR(split))
3543                 return PTR_ERR(split);
3544
3545         root_add_used(root, fs_info->nodesize);
3546         ASSERT(btrfs_header_level(c) == level);
3547
3548         ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3549         if (ret) {
3550                 btrfs_abort_transaction(trans, ret);
3551                 return ret;
3552         }
3553         copy_extent_buffer(split, c,
3554                            btrfs_node_key_ptr_offset(0),
3555                            btrfs_node_key_ptr_offset(mid),
3556                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3557         btrfs_set_header_nritems(split, c_nritems - mid);
3558         btrfs_set_header_nritems(c, mid);
3559         ret = 0;
3560
3561         btrfs_mark_buffer_dirty(c);
3562         btrfs_mark_buffer_dirty(split);
3563
3564         insert_ptr(trans, path, &disk_key, split->start,
3565                    path->slots[level + 1] + 1, level + 1);
3566
3567         if (path->slots[level] >= mid) {
3568                 path->slots[level] -= mid;
3569                 btrfs_tree_unlock(c);
3570                 free_extent_buffer(c);
3571                 path->nodes[level] = split;
3572                 path->slots[level + 1] += 1;
3573         } else {
3574                 btrfs_tree_unlock(split);
3575                 free_extent_buffer(split);
3576         }
3577         return ret;
3578 }
3579
3580 /*
3581  * how many bytes are required to store the items in a leaf.  start
3582  * and nr indicate which items in the leaf to check.  This totals up the
3583  * space used both by the item structs and the item data
3584  */
3585 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3586 {
3587         struct btrfs_item *start_item;
3588         struct btrfs_item *end_item;
3589         int data_len;
3590         int nritems = btrfs_header_nritems(l);
3591         int end = min(nritems, start + nr) - 1;
3592
3593         if (!nr)
3594                 return 0;
3595         start_item = btrfs_item_nr(start);
3596         end_item = btrfs_item_nr(end);
3597         data_len = btrfs_item_offset(l, start_item) +
3598                    btrfs_item_size(l, start_item);
3599         data_len = data_len - btrfs_item_offset(l, end_item);
3600         data_len += sizeof(struct btrfs_item) * nr;
3601         WARN_ON(data_len < 0);
3602         return data_len;
3603 }
3604
3605 /*
3606  * The space between the end of the leaf items and
3607  * the start of the leaf data.  IOW, how much room
3608  * the leaf has left for both items and data
3609  */
3610 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3611 {
3612         struct btrfs_fs_info *fs_info = leaf->fs_info;
3613         int nritems = btrfs_header_nritems(leaf);
3614         int ret;
3615
3616         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3617         if (ret < 0) {
3618                 btrfs_crit(fs_info,
3619                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3620                            ret,
3621                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3622                            leaf_space_used(leaf, 0, nritems), nritems);
3623         }
3624         return ret;
3625 }
3626
3627 /*
3628  * min slot controls the lowest index we're willing to push to the
3629  * right.  We'll push up to and including min_slot, but no lower
3630  */
3631 static noinline int __push_leaf_right(struct btrfs_path *path,
3632                                       int data_size, int empty,
3633                                       struct extent_buffer *right,
3634                                       int free_space, u32 left_nritems,
3635                                       u32 min_slot)
3636 {
3637         struct btrfs_fs_info *fs_info = right->fs_info;
3638         struct extent_buffer *left = path->nodes[0];
3639         struct extent_buffer *upper = path->nodes[1];
3640         struct btrfs_map_token token;
3641         struct btrfs_disk_key disk_key;
3642         int slot;
3643         u32 i;
3644         int push_space = 0;
3645         int push_items = 0;
3646         struct btrfs_item *item;
3647         u32 nr;
3648         u32 right_nritems;
3649         u32 data_end;
3650         u32 this_item_size;
3651
3652         if (empty)
3653                 nr = 0;
3654         else
3655                 nr = max_t(u32, 1, min_slot);
3656
3657         if (path->slots[0] >= left_nritems)
3658                 push_space += data_size;
3659
3660         slot = path->slots[1];
3661         i = left_nritems - 1;
3662         while (i >= nr) {
3663                 item = btrfs_item_nr(i);
3664
3665                 if (!empty && push_items > 0) {
3666                         if (path->slots[0] > i)
3667                                 break;
3668                         if (path->slots[0] == i) {
3669                                 int space = btrfs_leaf_free_space(left);
3670
3671                                 if (space + push_space * 2 > free_space)
3672                                         break;
3673                         }
3674                 }
3675
3676                 if (path->slots[0] == i)
3677                         push_space += data_size;
3678
3679                 this_item_size = btrfs_item_size(left, item);
3680                 if (this_item_size + sizeof(*item) + push_space > free_space)
3681                         break;
3682
3683                 push_items++;
3684                 push_space += this_item_size + sizeof(*item);
3685                 if (i == 0)
3686                         break;
3687                 i--;
3688         }
3689
3690         if (push_items == 0)
3691                 goto out_unlock;
3692
3693         WARN_ON(!empty && push_items == left_nritems);
3694
3695         /* push left to right */
3696         right_nritems = btrfs_header_nritems(right);
3697
3698         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3699         push_space -= leaf_data_end(left);
3700
3701         /* make room in the right data area */
3702         data_end = leaf_data_end(right);
3703         memmove_extent_buffer(right,
3704                               BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3705                               BTRFS_LEAF_DATA_OFFSET + data_end,
3706                               BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3707
3708         /* copy from the left data area */
3709         copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3710                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3711                      BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3712                      push_space);
3713
3714         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3715                               btrfs_item_nr_offset(0),
3716                               right_nritems * sizeof(struct btrfs_item));
3717
3718         /* copy the items from left to right */
3719         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3720                    btrfs_item_nr_offset(left_nritems - push_items),
3721                    push_items * sizeof(struct btrfs_item));
3722
3723         /* update the item pointers */
3724         btrfs_init_map_token(&token, right);
3725         right_nritems += push_items;
3726         btrfs_set_header_nritems(right, right_nritems);
3727         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3728         for (i = 0; i < right_nritems; i++) {
3729                 item = btrfs_item_nr(i);
3730                 push_space -= btrfs_token_item_size(&token, item);
3731                 btrfs_set_token_item_offset(&token, item, push_space);
3732         }
3733
3734         left_nritems -= push_items;
3735         btrfs_set_header_nritems(left, left_nritems);
3736
3737         if (left_nritems)
3738                 btrfs_mark_buffer_dirty(left);
3739         else
3740                 btrfs_clean_tree_block(left);
3741
3742         btrfs_mark_buffer_dirty(right);
3743
3744         btrfs_item_key(right, &disk_key, 0);
3745         btrfs_set_node_key(upper, &disk_key, slot + 1);
3746         btrfs_mark_buffer_dirty(upper);
3747
3748         /* then fixup the leaf pointer in the path */
3749         if (path->slots[0] >= left_nritems) {
3750                 path->slots[0] -= left_nritems;
3751                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3752                         btrfs_clean_tree_block(path->nodes[0]);
3753                 btrfs_tree_unlock(path->nodes[0]);
3754                 free_extent_buffer(path->nodes[0]);
3755                 path->nodes[0] = right;
3756                 path->slots[1] += 1;
3757         } else {
3758                 btrfs_tree_unlock(right);
3759                 free_extent_buffer(right);
3760         }
3761         return 0;
3762
3763 out_unlock:
3764         btrfs_tree_unlock(right);
3765         free_extent_buffer(right);
3766         return 1;
3767 }
3768
3769 /*
3770  * push some data in the path leaf to the right, trying to free up at
3771  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3772  *
3773  * returns 1 if the push failed because the other node didn't have enough
3774  * room, 0 if everything worked out and < 0 if there were major errors.
3775  *
3776  * this will push starting from min_slot to the end of the leaf.  It won't
3777  * push any slot lower than min_slot
3778  */
3779 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3780                            *root, struct btrfs_path *path,
3781                            int min_data_size, int data_size,
3782                            int empty, u32 min_slot)
3783 {
3784         struct extent_buffer *left = path->nodes[0];
3785         struct extent_buffer *right;
3786         struct extent_buffer *upper;
3787         int slot;
3788         int free_space;
3789         u32 left_nritems;
3790         int ret;
3791
3792         if (!path->nodes[1])
3793                 return 1;
3794
3795         slot = path->slots[1];
3796         upper = path->nodes[1];
3797         if (slot >= btrfs_header_nritems(upper) - 1)
3798                 return 1;
3799
3800         btrfs_assert_tree_locked(path->nodes[1]);
3801
3802         right = btrfs_read_node_slot(upper, slot + 1);
3803         /*
3804          * slot + 1 is not valid or we fail to read the right node,
3805          * no big deal, just return.
3806          */
3807         if (IS_ERR(right))
3808                 return 1;
3809
3810         __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3811         btrfs_set_lock_blocking_write(right);
3812
3813         free_space = btrfs_leaf_free_space(right);
3814         if (free_space < data_size)
3815                 goto out_unlock;
3816
3817         /* cow and double check */
3818         ret = btrfs_cow_block(trans, root, right, upper,
3819                               slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3820         if (ret)
3821                 goto out_unlock;
3822
3823         free_space = btrfs_leaf_free_space(right);
3824         if (free_space < data_size)
3825                 goto out_unlock;
3826
3827         left_nritems = btrfs_header_nritems(left);
3828         if (left_nritems == 0)
3829                 goto out_unlock;
3830
3831         if (check_sibling_keys(left, right)) {
3832                 ret = -EUCLEAN;
3833                 btrfs_tree_unlock(right);
3834                 free_extent_buffer(right);
3835                 return ret;
3836         }
3837         if (path->slots[0] == left_nritems && !empty) {
3838                 /* Key greater than all keys in the leaf, right neighbor has
3839                  * enough room for it and we're not emptying our leaf to delete
3840                  * it, therefore use right neighbor to insert the new item and
3841                  * no need to touch/dirty our left leaf. */
3842                 btrfs_tree_unlock(left);
3843                 free_extent_buffer(left);
3844                 path->nodes[0] = right;
3845                 path->slots[0] = 0;
3846                 path->slots[1]++;
3847                 return 0;
3848         }
3849
3850         return __push_leaf_right(path, min_data_size, empty,
3851                                 right, free_space, left_nritems, min_slot);
3852 out_unlock:
3853         btrfs_tree_unlock(right);
3854         free_extent_buffer(right);
3855         return 1;
3856 }
3857
3858 /*
3859  * push some data in the path leaf to the left, trying to free up at
3860  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3861  *
3862  * max_slot can put a limit on how far into the leaf we'll push items.  The
3863  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3864  * items
3865  */
3866 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3867                                      int empty, struct extent_buffer *left,
3868                                      int free_space, u32 right_nritems,
3869                                      u32 max_slot)
3870 {
3871         struct btrfs_fs_info *fs_info = left->fs_info;
3872         struct btrfs_disk_key disk_key;
3873         struct extent_buffer *right = path->nodes[0];
3874         int i;
3875         int push_space = 0;
3876         int push_items = 0;
3877         struct btrfs_item *item;
3878         u32 old_left_nritems;
3879         u32 nr;
3880         int ret = 0;
3881         u32 this_item_size;
3882         u32 old_left_item_size;
3883         struct btrfs_map_token token;
3884
3885         if (empty)
3886                 nr = min(right_nritems, max_slot);
3887         else
3888                 nr = min(right_nritems - 1, max_slot);
3889
3890         for (i = 0; i < nr; i++) {
3891                 item = btrfs_item_nr(i);
3892
3893                 if (!empty && push_items > 0) {
3894                         if (path->slots[0] < i)
3895                                 break;
3896                         if (path->slots[0] == i) {
3897                                 int space = btrfs_leaf_free_space(right);
3898
3899                                 if (space + push_space * 2 > free_space)
3900                                         break;
3901                         }
3902                 }
3903
3904                 if (path->slots[0] == i)
3905                         push_space += data_size;
3906
3907                 this_item_size = btrfs_item_size(right, item);
3908                 if (this_item_size + sizeof(*item) + push_space > free_space)
3909                         break;
3910
3911                 push_items++;
3912                 push_space += this_item_size + sizeof(*item);
3913         }
3914
3915         if (push_items == 0) {
3916                 ret = 1;
3917                 goto out;
3918         }
3919         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3920
3921         /* push data from right to left */
3922         copy_extent_buffer(left, right,
3923                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3924                            btrfs_item_nr_offset(0),
3925                            push_items * sizeof(struct btrfs_item));
3926
3927         push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3928                      btrfs_item_offset_nr(right, push_items - 1);
3929
3930         copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3931                      leaf_data_end(left) - push_space,
3932                      BTRFS_LEAF_DATA_OFFSET +
3933                      btrfs_item_offset_nr(right, push_items - 1),
3934                      push_space);
3935         old_left_nritems = btrfs_header_nritems(left);
3936         BUG_ON(old_left_nritems <= 0);
3937
3938         btrfs_init_map_token(&token, left);
3939         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3940         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3941                 u32 ioff;
3942
3943                 item = btrfs_item_nr(i);
3944
3945                 ioff = btrfs_token_item_offset(&token, item);
3946                 btrfs_set_token_item_offset(&token, item,
3947                       ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3948         }
3949         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3950
3951         /* fixup right node */
3952         if (push_items > right_nritems)
3953                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3954                        right_nritems);
3955
3956         if (push_items < right_nritems) {
3957                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3958                                                   leaf_data_end(right);
3959                 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3960                                       BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3961                                       BTRFS_LEAF_DATA_OFFSET +
3962                                       leaf_data_end(right), push_space);
3963
3964                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3965                               btrfs_item_nr_offset(push_items),
3966                              (btrfs_header_nritems(right) - push_items) *
3967                              sizeof(struct btrfs_item));
3968         }
3969
3970         btrfs_init_map_token(&token, right);
3971         right_nritems -= push_items;
3972         btrfs_set_header_nritems(right, right_nritems);
3973         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3974         for (i = 0; i < right_nritems; i++) {
3975                 item = btrfs_item_nr(i);
3976
3977                 push_space = push_space - btrfs_token_item_size(&token, item);
3978                 btrfs_set_token_item_offset(&token, item, push_space);
3979         }
3980
3981         btrfs_mark_buffer_dirty(left);
3982         if (right_nritems)
3983                 btrfs_mark_buffer_dirty(right);
3984         else
3985                 btrfs_clean_tree_block(right);
3986
3987         btrfs_item_key(right, &disk_key, 0);
3988         fixup_low_keys(path, &disk_key, 1);
3989
3990         /* then fixup the leaf pointer in the path */
3991         if (path->slots[0] < push_items) {
3992                 path->slots[0] += old_left_nritems;
3993                 btrfs_tree_unlock(path->nodes[0]);
3994                 free_extent_buffer(path->nodes[0]);
3995                 path->nodes[0] = left;
3996                 path->slots[1] -= 1;
3997         } else {
3998                 btrfs_tree_unlock(left);
3999                 free_extent_buffer(left);
4000                 path->slots[0] -= push_items;
4001         }
4002         BUG_ON(path->slots[0] < 0);
4003         return ret;
4004 out:
4005         btrfs_tree_unlock(left);
4006         free_extent_buffer(left);
4007         return ret;
4008 }
4009
4010 /*
4011  * push some data in the path leaf to the left, trying to free up at
4012  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
4013  *
4014  * max_slot can put a limit on how far into the leaf we'll push items.  The
4015  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
4016  * items
4017  */
4018 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4019                           *root, struct btrfs_path *path, int min_data_size,
4020                           int data_size, int empty, u32 max_slot)
4021 {
4022         struct extent_buffer *right = path->nodes[0];
4023         struct extent_buffer *left;
4024         int slot;
4025         int free_space;
4026         u32 right_nritems;
4027         int ret = 0;
4028
4029         slot = path->slots[1];
4030         if (slot == 0)
4031                 return 1;
4032         if (!path->nodes[1])
4033                 return 1;
4034
4035         right_nritems = btrfs_header_nritems(right);
4036         if (right_nritems == 0)
4037                 return 1;
4038
4039         btrfs_assert_tree_locked(path->nodes[1]);
4040
4041         left = btrfs_read_node_slot(path->nodes[1], slot - 1);
4042         /*
4043          * slot - 1 is not valid or we fail to read the left node,
4044          * no big deal, just return.
4045          */
4046         if (IS_ERR(left))
4047                 return 1;
4048
4049         __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
4050         btrfs_set_lock_blocking_write(left);
4051
4052         free_space = btrfs_leaf_free_space(left);
4053         if (free_space < data_size) {
4054                 ret = 1;
4055                 goto out;
4056         }
4057
4058         /* cow and double check */
4059         ret = btrfs_cow_block(trans, root, left,
4060                               path->nodes[1], slot - 1, &left,
4061                               BTRFS_NESTING_LEFT_COW);
4062         if (ret) {
4063                 /* we hit -ENOSPC, but it isn't fatal here */
4064                 if (ret == -ENOSPC)
4065                         ret = 1;
4066                 goto out;
4067         }
4068
4069         free_space = btrfs_leaf_free_space(left);
4070         if (free_space < data_size) {
4071                 ret = 1;
4072                 goto out;
4073         }
4074
4075         if (check_sibling_keys(left, right)) {
4076                 ret = -EUCLEAN;
4077                 goto out;
4078         }
4079         return __push_leaf_left(path, min_data_size,
4080                                empty, left, free_space, right_nritems,
4081                                max_slot);
4082 out:
4083         btrfs_tree_unlock(left);
4084         free_extent_buffer(left);
4085         return ret;
4086 }
4087
4088 /*
4089  * split the path's leaf in two, making sure there is at least data_size
4090  * available for the resulting leaf level of the path.
4091  */
4092 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4093                                     struct btrfs_path *path,
4094                                     struct extent_buffer *l,
4095                                     struct extent_buffer *right,
4096                                     int slot, int mid, int nritems)
4097 {
4098         struct btrfs_fs_info *fs_info = trans->fs_info;
4099         int data_copy_size;
4100         int rt_data_off;
4101         int i;
4102         struct btrfs_disk_key disk_key;
4103         struct btrfs_map_token token;
4104
4105         nritems = nritems - mid;
4106         btrfs_set_header_nritems(right, nritems);
4107         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
4108
4109         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4110                            btrfs_item_nr_offset(mid),
4111                            nritems * sizeof(struct btrfs_item));
4112
4113         copy_extent_buffer(right, l,
4114                      BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4115                      data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4116                      leaf_data_end(l), data_copy_size);
4117
4118         rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4119
4120         btrfs_init_map_token(&token, right);
4121         for (i = 0; i < nritems; i++) {
4122                 struct btrfs_item *item = btrfs_item_nr(i);
4123                 u32 ioff;
4124
4125                 ioff = btrfs_token_item_offset(&token, item);
4126                 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
4127         }
4128
4129         btrfs_set_header_nritems(l, mid);
4130         btrfs_item_key(right, &disk_key, 0);
4131         insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
4132
4133         btrfs_mark_buffer_dirty(right);
4134         btrfs_mark_buffer_dirty(l);
4135         BUG_ON(path->slots[0] != slot);
4136
4137         if (mid <= slot) {
4138                 btrfs_tree_unlock(path->nodes[0]);
4139                 free_extent_buffer(path->nodes[0]);
4140                 path->nodes[0] = right;
4141                 path->slots[0] -= mid;
4142                 path->slots[1] += 1;
4143         } else {
4144                 btrfs_tree_unlock(right);
4145                 free_extent_buffer(right);
4146         }
4147
4148         BUG_ON(path->slots[0] < 0);
4149 }
4150
4151 /*
4152  * double splits happen when we need to insert a big item in the middle
4153  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4154  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4155  *          A                 B                 C
4156  *
4157  * We avoid this by trying to push the items on either side of our target
4158  * into the adjacent leaves.  If all goes well we can avoid the double split
4159  * completely.
4160  */
4161 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4162                                           struct btrfs_root *root,
4163                                           struct btrfs_path *path,
4164                                           int data_size)
4165 {
4166         int ret;
4167         int progress = 0;
4168         int slot;
4169         u32 nritems;
4170         int space_needed = data_size;
4171
4172         slot = path->slots[0];
4173         if (slot < btrfs_header_nritems(path->nodes[0]))
4174                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4175
4176         /*
4177          * try to push all the items after our slot into the
4178          * right leaf
4179          */
4180         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4181         if (ret < 0)
4182                 return ret;
4183
4184         if (ret == 0)
4185                 progress++;
4186
4187         nritems = btrfs_header_nritems(path->nodes[0]);
4188         /*
4189          * our goal is to get our slot at the start or end of a leaf.  If
4190          * we've done so we're done
4191          */
4192         if (path->slots[0] == 0 || path->slots[0] == nritems)
4193                 return 0;
4194
4195         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4196                 return 0;
4197
4198         /* try to push all the items before our slot into the next leaf */
4199         slot = path->slots[0];
4200         space_needed = data_size;
4201         if (slot > 0)
4202                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4203         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4204         if (ret < 0)
4205                 return ret;
4206
4207         if (ret == 0)
4208                 progress++;
4209
4210         if (progress)
4211                 return 0;
4212         return 1;
4213 }
4214
4215 /*
4216  * split the path's leaf in two, making sure there is at least data_size
4217  * available for the resulting leaf level of the path.
4218  *
4219  * returns 0 if all went well and < 0 on failure.
4220  */
4221 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4222                                struct btrfs_root *root,
4223                                const struct btrfs_key *ins_key,
4224                                struct btrfs_path *path, int data_size,
4225                                int extend)
4226 {
4227         struct btrfs_disk_key disk_key;
4228         struct extent_buffer *l;
4229         u32 nritems;
4230         int mid;
4231         int slot;
4232         struct extent_buffer *right;
4233         struct btrfs_fs_info *fs_info = root->fs_info;
4234         int ret = 0;
4235         int wret;
4236         int split;
4237         int num_doubles = 0;
4238         int tried_avoid_double = 0;
4239
4240         l = path->nodes[0];
4241         slot = path->slots[0];
4242         if (extend && data_size + btrfs_item_size_nr(l, slot) +
4243             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4244                 return -EOVERFLOW;
4245
4246         /* first try to make some room by pushing left and right */
4247         if (data_size && path->nodes[1]) {
4248                 int space_needed = data_size;
4249
4250                 if (slot < btrfs_header_nritems(l))
4251                         space_needed -= btrfs_leaf_free_space(l);
4252
4253                 wret = push_leaf_right(trans, root, path, space_needed,
4254                                        space_needed, 0, 0);
4255                 if (wret < 0)
4256                         return wret;
4257                 if (wret) {
4258                         space_needed = data_size;
4259                         if (slot > 0)
4260                                 space_needed -= btrfs_leaf_free_space(l);
4261                         wret = push_leaf_left(trans, root, path, space_needed,
4262                                               space_needed, 0, (u32)-1);
4263                         if (wret < 0)
4264                                 return wret;
4265                 }
4266                 l = path->nodes[0];
4267
4268                 /* did the pushes work? */
4269                 if (btrfs_leaf_free_space(l) >= data_size)
4270                         return 0;
4271         }
4272
4273         if (!path->nodes[1]) {
4274                 ret = insert_new_root(trans, root, path, 1);
4275                 if (ret)
4276                         return ret;
4277         }
4278 again:
4279         split = 1;
4280         l = path->nodes[0];
4281         slot = path->slots[0];
4282         nritems = btrfs_header_nritems(l);
4283         mid = (nritems + 1) / 2;
4284
4285         if (mid <= slot) {
4286                 if (nritems == 1 ||
4287                     leaf_space_used(l, mid, nritems - mid) + data_size >
4288                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4289                         if (slot >= nritems) {
4290                                 split = 0;
4291                         } else {
4292                                 mid = slot;
4293                                 if (mid != nritems &&
4294                                     leaf_space_used(l, mid, nritems - mid) +
4295                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4296                                         if (data_size && !tried_avoid_double)
4297                                                 goto push_for_double;
4298                                         split = 2;
4299                                 }
4300                         }
4301                 }
4302         } else {
4303                 if (leaf_space_used(l, 0, mid) + data_size >
4304                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4305                         if (!extend && data_size && slot == 0) {
4306                                 split = 0;
4307                         } else if ((extend || !data_size) && slot == 0) {
4308                                 mid = 1;
4309                         } else {
4310                                 mid = slot;
4311                                 if (mid != nritems &&
4312                                     leaf_space_used(l, mid, nritems - mid) +
4313                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4314                                         if (data_size && !tried_avoid_double)
4315                                                 goto push_for_double;
4316                                         split = 2;
4317                                 }
4318                         }
4319                 }
4320         }
4321
4322         if (split == 0)
4323                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4324         else
4325                 btrfs_item_key(l, &disk_key, mid);
4326
4327         /*
4328          * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
4329          * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
4330          * subclasses, which is 8 at the time of this patch, and we've maxed it
4331          * out.  In the future we could add a
4332          * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
4333          * use BTRFS_NESTING_NEW_ROOT.
4334          */
4335         right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4336                                              l->start, 0, num_doubles ?
4337                                              BTRFS_NESTING_NEW_ROOT :
4338                                              BTRFS_NESTING_SPLIT);
4339         if (IS_ERR(right))
4340                 return PTR_ERR(right);
4341
4342         root_add_used(root, fs_info->nodesize);
4343
4344         if (split == 0) {
4345                 if (mid <= slot) {
4346                         btrfs_set_header_nritems(right, 0);
4347                         insert_ptr(trans, path, &disk_key,
4348                                    right->start, path->slots[1] + 1, 1);
4349                         btrfs_tree_unlock(path->nodes[0]);
4350                         free_extent_buffer(path->nodes[0]);
4351                         path->nodes[0] = right;
4352                         path->slots[0] = 0;
4353                         path->slots[1] += 1;
4354                 } else {
4355                         btrfs_set_header_nritems(right, 0);
4356                         insert_ptr(trans, path, &disk_key,
4357                                    right->start, path->slots[1], 1);
4358                         btrfs_tree_unlock(path->nodes[0]);
4359                         free_extent_buffer(path->nodes[0]);
4360                         path->nodes[0] = right;
4361                         path->slots[0] = 0;
4362                         if (path->slots[1] == 0)
4363                                 fixup_low_keys(path, &disk_key, 1);
4364                 }
4365                 /*
4366                  * We create a new leaf 'right' for the required ins_len and
4367                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4368                  * the content of ins_len to 'right'.
4369                  */
4370                 return ret;
4371         }
4372
4373         copy_for_split(trans, path, l, right, slot, mid, nritems);
4374
4375         if (split == 2) {
4376                 BUG_ON(num_doubles != 0);
4377                 num_doubles++;
4378                 goto again;
4379         }
4380
4381         return 0;
4382
4383 push_for_double:
4384         push_for_double_split(trans, root, path, data_size);
4385         tried_avoid_double = 1;
4386         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4387                 return 0;
4388         goto again;
4389 }
4390
4391 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4392                                          struct btrfs_root *root,
4393                                          struct btrfs_path *path, int ins_len)
4394 {
4395         struct btrfs_key key;
4396         struct extent_buffer *leaf;
4397         struct btrfs_file_extent_item *fi;
4398         u64 extent_len = 0;
4399         u32 item_size;
4400         int ret;
4401
4402         leaf = path->nodes[0];
4403         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4404
4405         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4406                key.type != BTRFS_EXTENT_CSUM_KEY);
4407
4408         if (btrfs_leaf_free_space(leaf) >= ins_len)
4409                 return 0;
4410
4411         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4412         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4413                 fi = btrfs_item_ptr(leaf, path->slots[0],
4414                                     struct btrfs_file_extent_item);
4415                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4416         }
4417         btrfs_release_path(path);
4418
4419         path->keep_locks = 1;
4420         path->search_for_split = 1;
4421         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4422         path->search_for_split = 0;
4423         if (ret > 0)
4424                 ret = -EAGAIN;
4425         if (ret < 0)
4426                 goto err;
4427
4428         ret = -EAGAIN;
4429         leaf = path->nodes[0];
4430         /* if our item isn't there, return now */
4431         if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4432                 goto err;
4433
4434         /* the leaf has  changed, it now has room.  return now */
4435         if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
4436                 goto err;
4437
4438         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4439                 fi = btrfs_item_ptr(leaf, path->slots[0],
4440                                     struct btrfs_file_extent_item);
4441                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4442                         goto err;
4443         }
4444
4445         btrfs_set_path_blocking(path);
4446         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4447         if (ret)
4448                 goto err;
4449
4450         path->keep_locks = 0;
4451         btrfs_unlock_up_safe(path, 1);
4452         return 0;
4453 err:
4454         path->keep_locks = 0;
4455         return ret;
4456 }
4457
4458 static noinline int split_item(struct btrfs_path *path,
4459                                const struct btrfs_key *new_key,
4460                                unsigned long split_offset)
4461 {
4462         struct extent_buffer *leaf;
4463         struct btrfs_item *item;
4464         struct btrfs_item *new_item;
4465         int slot;
4466         char *buf;
4467         u32 nritems;
4468         u32 item_size;
4469         u32 orig_offset;
4470         struct btrfs_disk_key disk_key;
4471
4472         leaf = path->nodes[0];
4473         BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
4474
4475         btrfs_set_path_blocking(path);
4476
4477         item = btrfs_item_nr(path->slots[0]);
4478         orig_offset = btrfs_item_offset(leaf, item);
4479         item_size = btrfs_item_size(leaf, item);
4480
4481         buf = kmalloc(item_size, GFP_NOFS);
4482         if (!buf)
4483                 return -ENOMEM;
4484
4485         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4486                             path->slots[0]), item_size);
4487
4488         slot = path->slots[0] + 1;
4489         nritems = btrfs_header_nritems(leaf);
4490         if (slot != nritems) {
4491                 /* shift the items */
4492                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4493                                 btrfs_item_nr_offset(slot),
4494                                 (nritems - slot) * sizeof(struct btrfs_item));
4495         }
4496
4497         btrfs_cpu_key_to_disk(&disk_key, new_key);
4498         btrfs_set_item_key(leaf, &disk_key, slot);
4499
4500         new_item = btrfs_item_nr(slot);
4501
4502         btrfs_set_item_offset(leaf, new_item, orig_offset);
4503         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4504
4505         btrfs_set_item_offset(leaf, item,
4506                               orig_offset + item_size - split_offset);
4507         btrfs_set_item_size(leaf, item, split_offset);
4508
4509         btrfs_set_header_nritems(leaf, nritems + 1);
4510
4511         /* write the data for the start of the original item */
4512         write_extent_buffer(leaf, buf,
4513                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4514                             split_offset);
4515
4516         /* write the data for the new item */
4517         write_extent_buffer(leaf, buf + split_offset,
4518                             btrfs_item_ptr_offset(leaf, slot),
4519                             item_size - split_offset);
4520         btrfs_mark_buffer_dirty(leaf);
4521
4522         BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4523         kfree(buf);
4524         return 0;
4525 }
4526
4527 /*
4528  * This function splits a single item into two items,
4529  * giving 'new_key' to the new item and splitting the
4530  * old one at split_offset (from the start of the item).
4531  *
4532  * The path may be released by this operation.  After
4533  * the split, the path is pointing to the old item.  The
4534  * new item is going to be in the same node as the old one.
4535  *
4536  * Note, the item being split must be smaller enough to live alone on
4537  * a tree block with room for one extra struct btrfs_item
4538  *
4539  * This allows us to split the item in place, keeping a lock on the
4540  * leaf the entire time.
4541  */
4542 int btrfs_split_item(struct btrfs_trans_handle *trans,
4543                      struct btrfs_root *root,
4544                      struct btrfs_path *path,
4545                      const struct btrfs_key *new_key,
4546                      unsigned long split_offset)
4547 {
4548         int ret;
4549         ret = setup_leaf_for_split(trans, root, path,
4550                                    sizeof(struct btrfs_item));
4551         if (ret)
4552                 return ret;
4553
4554         ret = split_item(path, new_key, split_offset);
4555         return ret;
4556 }
4557
4558 /*
4559  * This function duplicate a item, giving 'new_key' to the new item.
4560  * It guarantees both items live in the same tree leaf and the new item
4561  * is contiguous with the original item.
4562  *
4563  * This allows us to split file extent in place, keeping a lock on the
4564  * leaf the entire time.
4565  */
4566 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4567                          struct btrfs_root *root,
4568                          struct btrfs_path *path,
4569                          const struct btrfs_key *new_key)
4570 {
4571         struct extent_buffer *leaf;
4572         int ret;
4573         u32 item_size;
4574
4575         leaf = path->nodes[0];
4576         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4577         ret = setup_leaf_for_split(trans, root, path,
4578                                    item_size + sizeof(struct btrfs_item));
4579         if (ret)
4580                 return ret;
4581
4582         path->slots[0]++;
4583         setup_items_for_insert(root, path, new_key, &item_size, 1);
4584         leaf = path->nodes[0];
4585         memcpy_extent_buffer(leaf,
4586                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4587                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4588                              item_size);
4589         return 0;
4590 }
4591
4592 /*
4593  * make the item pointed to by the path smaller.  new_size indicates
4594  * how small to make it, and from_end tells us if we just chop bytes
4595  * off the end of the item or if we shift the item to chop bytes off
4596  * the front.
4597  */
4598 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4599 {
4600         int slot;
4601         struct extent_buffer *leaf;
4602         struct btrfs_item *item;
4603         u32 nritems;
4604         unsigned int data_end;
4605         unsigned int old_data_start;
4606         unsigned int old_size;
4607         unsigned int size_diff;
4608         int i;
4609         struct btrfs_map_token token;
4610
4611         leaf = path->nodes[0];
4612         slot = path->slots[0];
4613
4614         old_size = btrfs_item_size_nr(leaf, slot);
4615         if (old_size == new_size)
4616                 return;
4617
4618         nritems = btrfs_header_nritems(leaf);
4619         data_end = leaf_data_end(leaf);
4620
4621         old_data_start = btrfs_item_offset_nr(leaf, slot);
4622
4623         size_diff = old_size - new_size;
4624
4625         BUG_ON(slot < 0);
4626         BUG_ON(slot >= nritems);
4627
4628         /*
4629          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4630          */
4631         /* first correct the data pointers */
4632         btrfs_init_map_token(&token, leaf);
4633         for (i = slot; i < nritems; i++) {
4634                 u32 ioff;
4635                 item = btrfs_item_nr(i);
4636
4637                 ioff = btrfs_token_item_offset(&token, item);
4638                 btrfs_set_token_item_offset(&token, item, ioff + size_diff);
4639         }
4640
4641         /* shift the data */
4642         if (from_end) {
4643                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4644                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4645                               data_end, old_data_start + new_size - data_end);
4646         } else {
4647                 struct btrfs_disk_key disk_key;
4648                 u64 offset;
4649
4650                 btrfs_item_key(leaf, &disk_key, slot);
4651
4652                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4653                         unsigned long ptr;
4654                         struct btrfs_file_extent_item *fi;
4655
4656                         fi = btrfs_item_ptr(leaf, slot,
4657                                             struct btrfs_file_extent_item);
4658                         fi = (struct btrfs_file_extent_item *)(
4659                              (unsigned long)fi - size_diff);
4660
4661                         if (btrfs_file_extent_type(leaf, fi) ==
4662                             BTRFS_FILE_EXTENT_INLINE) {
4663                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4664                                 memmove_extent_buffer(leaf, ptr,
4665                                       (unsigned long)fi,
4666                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
4667                         }
4668                 }
4669
4670                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4671                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4672                               data_end, old_data_start - data_end);
4673
4674                 offset = btrfs_disk_key_offset(&disk_key);
4675                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4676                 btrfs_set_item_key(leaf, &disk_key, slot);
4677                 if (slot == 0)
4678                         fixup_low_keys(path, &disk_key, 1);
4679         }
4680
4681         item = btrfs_item_nr(slot);
4682         btrfs_set_item_size(leaf, item, new_size);
4683         btrfs_mark_buffer_dirty(leaf);
4684
4685         if (btrfs_leaf_free_space(leaf) < 0) {
4686                 btrfs_print_leaf(leaf);
4687                 BUG();
4688         }
4689 }
4690
4691 /*
4692  * make the item pointed to by the path bigger, data_size is the added size.
4693  */
4694 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4695 {
4696         int slot;
4697         struct extent_buffer *leaf;
4698         struct btrfs_item *item;
4699         u32 nritems;
4700         unsigned int data_end;
4701         unsigned int old_data;
4702         unsigned int old_size;
4703         int i;
4704         struct btrfs_map_token token;
4705
4706         leaf = path->nodes[0];
4707
4708         nritems = btrfs_header_nritems(leaf);
4709         data_end = leaf_data_end(leaf);
4710
4711         if (btrfs_leaf_free_space(leaf) < data_size) {
4712                 btrfs_print_leaf(leaf);
4713                 BUG();
4714         }
4715         slot = path->slots[0];
4716         old_data = btrfs_item_end_nr(leaf, slot);
4717
4718         BUG_ON(slot < 0);
4719         if (slot >= nritems) {
4720                 btrfs_print_leaf(leaf);
4721                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4722                            slot, nritems);
4723                 BUG();
4724         }
4725
4726         /*
4727          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4728          */
4729         /* first correct the data pointers */
4730         btrfs_init_map_token(&token, leaf);
4731         for (i = slot; i < nritems; i++) {
4732                 u32 ioff;
4733                 item = btrfs_item_nr(i);
4734
4735                 ioff = btrfs_token_item_offset(&token, item);
4736                 btrfs_set_token_item_offset(&token, item, ioff - data_size);
4737         }
4738
4739         /* shift the data */
4740         memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4741                       data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4742                       data_end, old_data - data_end);
4743
4744         data_end = old_data;
4745         old_size = btrfs_item_size_nr(leaf, slot);
4746         item = btrfs_item_nr(slot);
4747         btrfs_set_item_size(leaf, item, old_size + data_size);
4748         btrfs_mark_buffer_dirty(leaf);
4749
4750         if (btrfs_leaf_free_space(leaf) < 0) {
4751                 btrfs_print_leaf(leaf);
4752                 BUG();
4753         }
4754 }
4755
4756 /**
4757  * setup_items_for_insert - Helper called before inserting one or more items
4758  * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
4759  * in a function that doesn't call btrfs_search_slot
4760  *
4761  * @root:       root we are inserting items to
4762  * @path:       points to the leaf/slot where we are going to insert new items
4763  * @cpu_key:    array of keys for items to be inserted
4764  * @data_size:  size of the body of each item we are going to insert
4765  * @nr:         size of @cpu_key/@data_size arrays
4766  */
4767 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4768                             const struct btrfs_key *cpu_key, u32 *data_size,
4769                             int nr)
4770 {
4771         struct btrfs_fs_info *fs_info = root->fs_info;
4772         struct btrfs_item *item;
4773         int i;
4774         u32 nritems;
4775         unsigned int data_end;
4776         struct btrfs_disk_key disk_key;
4777         struct extent_buffer *leaf;
4778         int slot;
4779         struct btrfs_map_token token;
4780         u32 total_size;
4781         u32 total_data = 0;
4782
4783         for (i = 0; i < nr; i++)
4784                 total_data += data_size[i];
4785         total_size = total_data + (nr * sizeof(struct btrfs_item));
4786
4787         if (path->slots[0] == 0) {
4788                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4789                 fixup_low_keys(path, &disk_key, 1);
4790         }
4791         btrfs_unlock_up_safe(path, 1);
4792
4793         leaf = path->nodes[0];
4794         slot = path->slots[0];
4795
4796         nritems = btrfs_header_nritems(leaf);
4797         data_end = leaf_data_end(leaf);
4798
4799         if (btrfs_leaf_free_space(leaf) < total_size) {
4800                 btrfs_print_leaf(leaf);
4801                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4802                            total_size, btrfs_leaf_free_space(leaf));
4803                 BUG();
4804         }
4805
4806         btrfs_init_map_token(&token, leaf);
4807         if (slot != nritems) {
4808                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4809
4810                 if (old_data < data_end) {
4811                         btrfs_print_leaf(leaf);
4812                         btrfs_crit(fs_info,
4813                 "item at slot %d with data offset %u beyond data end of leaf %u",
4814                                    slot, old_data, data_end);
4815                         BUG();
4816                 }
4817                 /*
4818                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4819                  */
4820                 /* first correct the data pointers */
4821                 for (i = slot; i < nritems; i++) {
4822                         u32 ioff;
4823
4824                         item = btrfs_item_nr(i);
4825                         ioff = btrfs_token_item_offset(&token, item);
4826                         btrfs_set_token_item_offset(&token, item,
4827                                                     ioff - total_data);
4828                 }
4829                 /* shift the items */
4830                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4831                               btrfs_item_nr_offset(slot),
4832                               (nritems - slot) * sizeof(struct btrfs_item));
4833
4834                 /* shift the data */
4835                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4836                               data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4837                               data_end, old_data - data_end);
4838                 data_end = old_data;
4839         }
4840
4841         /* setup the item for the new data */
4842         for (i = 0; i < nr; i++) {
4843                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4844                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4845                 item = btrfs_item_nr(slot + i);
4846                 data_end -= data_size[i];
4847                 btrfs_set_token_item_offset(&token, item, data_end);
4848                 btrfs_set_token_item_size(&token, item, data_size[i]);
4849         }
4850
4851         btrfs_set_header_nritems(leaf, nritems + nr);
4852         btrfs_mark_buffer_dirty(leaf);
4853
4854         if (btrfs_leaf_free_space(leaf) < 0) {
4855                 btrfs_print_leaf(leaf);
4856                 BUG();
4857         }
4858 }
4859
4860 /*
4861  * Given a key and some data, insert items into the tree.
4862  * This does all the path init required, making room in the tree if needed.
4863  */
4864 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4865                             struct btrfs_root *root,
4866                             struct btrfs_path *path,
4867                             const struct btrfs_key *cpu_key, u32 *data_size,
4868                             int nr)
4869 {
4870         int ret = 0;
4871         int slot;
4872         int i;
4873         u32 total_size = 0;
4874         u32 total_data = 0;
4875
4876         for (i = 0; i < nr; i++)
4877                 total_data += data_size[i];
4878
4879         total_size = total_data + (nr * sizeof(struct btrfs_item));
4880         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4881         if (ret == 0)
4882                 return -EEXIST;
4883         if (ret < 0)
4884                 return ret;
4885
4886         slot = path->slots[0];
4887         BUG_ON(slot < 0);
4888
4889         setup_items_for_insert(root, path, cpu_key, data_size, nr);
4890         return 0;
4891 }
4892
4893 /*
4894  * Given a key and some data, insert an item into the tree.
4895  * This does all the path init required, making room in the tree if needed.
4896  */
4897 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4898                       const struct btrfs_key *cpu_key, void *data,
4899                       u32 data_size)
4900 {
4901         int ret = 0;
4902         struct btrfs_path *path;
4903         struct extent_buffer *leaf;
4904         unsigned long ptr;
4905
4906         path = btrfs_alloc_path();
4907         if (!path)
4908                 return -ENOMEM;
4909         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4910         if (!ret) {
4911                 leaf = path->nodes[0];
4912                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4913                 write_extent_buffer(leaf, data, ptr, data_size);
4914                 btrfs_mark_buffer_dirty(leaf);
4915         }
4916         btrfs_free_path(path);
4917         return ret;
4918 }
4919
4920 /*
4921  * delete the pointer from a given node.
4922  *
4923  * the tree should have been previously balanced so the deletion does not
4924  * empty a node.
4925  */
4926 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4927                     int level, int slot)
4928 {
4929         struct extent_buffer *parent = path->nodes[level];
4930         u32 nritems;
4931         int ret;
4932
4933         nritems = btrfs_header_nritems(parent);
4934         if (slot != nritems - 1) {
4935                 if (level) {
4936                         ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4937                                         nritems - slot - 1);
4938                         BUG_ON(ret < 0);
4939                 }
4940                 memmove_extent_buffer(parent,
4941                               btrfs_node_key_ptr_offset(slot),
4942                               btrfs_node_key_ptr_offset(slot + 1),
4943                               sizeof(struct btrfs_key_ptr) *
4944                               (nritems - slot - 1));
4945         } else if (level) {
4946                 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4947                                 GFP_NOFS);
4948                 BUG_ON(ret < 0);
4949         }
4950
4951         nritems--;
4952         btrfs_set_header_nritems(parent, nritems);
4953         if (nritems == 0 && parent == root->node) {
4954                 BUG_ON(btrfs_header_level(root->node) != 1);
4955                 /* just turn the root into a leaf and break */
4956                 btrfs_set_header_level(root->node, 0);
4957         } else if (slot == 0) {
4958                 struct btrfs_disk_key disk_key;
4959
4960                 btrfs_node_key(parent, &disk_key, 0);
4961                 fixup_low_keys(path, &disk_key, level + 1);
4962         }
4963         btrfs_mark_buffer_dirty(parent);
4964 }
4965
4966 /*
4967  * a helper function to delete the leaf pointed to by path->slots[1] and
4968  * path->nodes[1].
4969  *
4970  * This deletes the pointer in path->nodes[1] and frees the leaf
4971  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4972  *
4973  * The path must have already been setup for deleting the leaf, including
4974  * all the proper balancing.  path->nodes[1] must be locked.
4975  */
4976 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4977                                     struct btrfs_root *root,
4978                                     struct btrfs_path *path,
4979                                     struct extent_buffer *leaf)
4980 {
4981         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4982         del_ptr(root, path, 1, path->slots[1]);
4983
4984         /*
4985          * btrfs_free_extent is expensive, we want to make sure we
4986          * aren't holding any locks when we call it
4987          */
4988         btrfs_unlock_up_safe(path, 0);
4989
4990         root_sub_used(root, leaf->len);
4991
4992         atomic_inc(&leaf->refs);
4993         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4994         free_extent_buffer_stale(leaf);
4995 }
4996 /*
4997  * delete the item at the leaf level in path.  If that empties
4998  * the leaf, remove it from the tree
4999  */
5000 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5001                     struct btrfs_path *path, int slot, int nr)
5002 {
5003         struct btrfs_fs_info *fs_info = root->fs_info;
5004         struct extent_buffer *leaf;
5005         struct btrfs_item *item;
5006         u32 last_off;
5007         u32 dsize = 0;
5008         int ret = 0;
5009         int wret;
5010         int i;
5011         u32 nritems;
5012
5013         leaf = path->nodes[0];
5014         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
5015
5016         for (i = 0; i < nr; i++)
5017                 dsize += btrfs_item_size_nr(leaf, slot + i);
5018
5019         nritems = btrfs_header_nritems(leaf);
5020
5021         if (slot + nr != nritems) {
5022                 int data_end = leaf_data_end(leaf);
5023                 struct btrfs_map_token token;
5024
5025                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
5026                               data_end + dsize,
5027                               BTRFS_LEAF_DATA_OFFSET + data_end,
5028                               last_off - data_end);
5029
5030                 btrfs_init_map_token(&token, leaf);
5031                 for (i = slot + nr; i < nritems; i++) {
5032                         u32 ioff;
5033
5034                         item = btrfs_item_nr(i);
5035                         ioff = btrfs_token_item_offset(&token, item);
5036                         btrfs_set_token_item_offset(&token, item, ioff + dsize);
5037                 }
5038
5039                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5040                               btrfs_item_nr_offset(slot + nr),
5041                               sizeof(struct btrfs_item) *
5042                               (nritems - slot - nr));
5043         }
5044         btrfs_set_header_nritems(leaf, nritems - nr);
5045         nritems -= nr;
5046
5047         /* delete the leaf if we've emptied it */
5048         if (nritems == 0) {
5049                 if (leaf == root->node) {
5050                         btrfs_set_header_level(leaf, 0);
5051                 } else {
5052                         btrfs_set_path_blocking(path);
5053                         btrfs_clean_tree_block(leaf);
5054                         btrfs_del_leaf(trans, root, path, leaf);
5055                 }
5056         } else {
5057                 int used = leaf_space_used(leaf, 0, nritems);
5058                 if (slot == 0) {
5059                         struct btrfs_disk_key disk_key;
5060
5061                         btrfs_item_key(leaf, &disk_key, 0);
5062                         fixup_low_keys(path, &disk_key, 1);
5063                 }
5064
5065                 /* delete the leaf if it is mostly empty */
5066                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5067                         /* push_leaf_left fixes the path.
5068                          * make sure the path still points to our leaf
5069                          * for possible call to del_ptr below
5070                          */
5071                         slot = path->slots[1];
5072                         atomic_inc(&leaf->refs);
5073
5074                         btrfs_set_path_blocking(path);
5075                         wret = push_leaf_left(trans, root, path, 1, 1,
5076                                               1, (u32)-1);
5077                         if (wret < 0 && wret != -ENOSPC)
5078                                 ret = wret;
5079
5080                         if (path->nodes[0] == leaf &&
5081                             btrfs_header_nritems(leaf)) {
5082                                 wret = push_leaf_right(trans, root, path, 1,
5083                                                        1, 1, 0);
5084                                 if (wret < 0 && wret != -ENOSPC)
5085                                         ret = wret;
5086                         }
5087
5088                         if (btrfs_header_nritems(leaf) == 0) {
5089                                 path->slots[1] = slot;
5090                                 btrfs_del_leaf(trans, root, path, leaf);
5091                                 free_extent_buffer(leaf);
5092                                 ret = 0;
5093                         } else {
5094                                 /* if we're still in the path, make sure
5095                                  * we're dirty.  Otherwise, one of the
5096                                  * push_leaf functions must have already
5097                                  * dirtied this buffer
5098                                  */
5099                                 if (path->nodes[0] == leaf)
5100                                         btrfs_mark_buffer_dirty(leaf);
5101                                 free_extent_buffer(leaf);
5102                         }
5103                 } else {
5104                         btrfs_mark_buffer_dirty(leaf);
5105                 }
5106         }
5107         return ret;
5108 }
5109
5110 /*
5111  * search the tree again to find a leaf with lesser keys
5112  * returns 0 if it found something or 1 if there are no lesser leaves.
5113  * returns < 0 on io errors.
5114  *
5115  * This may release the path, and so you may lose any locks held at the
5116  * time you call it.
5117  */
5118 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5119 {
5120         struct btrfs_key key;
5121         struct btrfs_disk_key found_key;
5122         int ret;
5123
5124         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5125
5126         if (key.offset > 0) {
5127                 key.offset--;
5128         } else if (key.type > 0) {
5129                 key.type--;
5130                 key.offset = (u64)-1;
5131         } else if (key.objectid > 0) {
5132                 key.objectid--;
5133                 key.type = (u8)-1;
5134                 key.offset = (u64)-1;
5135         } else {
5136                 return 1;
5137         }
5138
5139         btrfs_release_path(path);
5140         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5141         if (ret < 0)
5142                 return ret;
5143         btrfs_item_key(path->nodes[0], &found_key, 0);
5144         ret = comp_keys(&found_key, &key);
5145         /*
5146          * We might have had an item with the previous key in the tree right
5147          * before we released our path. And after we released our path, that
5148          * item might have been pushed to the first slot (0) of the leaf we
5149          * were holding due to a tree balance. Alternatively, an item with the
5150          * previous key can exist as the only element of a leaf (big fat item).
5151          * Therefore account for these 2 cases, so that our callers (like
5152          * btrfs_previous_item) don't miss an existing item with a key matching
5153          * the previous key we computed above.
5154          */
5155         if (ret <= 0)
5156                 return 0;
5157         return 1;
5158 }
5159
5160 /*
5161  * A helper function to walk down the tree starting at min_key, and looking
5162  * for nodes or leaves that are have a minimum transaction id.
5163  * This is used by the btree defrag code, and tree logging
5164  *
5165  * This does not cow, but it does stuff the starting key it finds back
5166  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5167  * key and get a writable path.
5168  *
5169  * This honors path->lowest_level to prevent descent past a given level
5170  * of the tree.
5171  *
5172  * min_trans indicates the oldest transaction that you are interested
5173  * in walking through.  Any nodes or leaves older than min_trans are
5174  * skipped over (without reading them).
5175  *
5176  * returns zero if something useful was found, < 0 on error and 1 if there
5177  * was nothing in the tree that matched the search criteria.
5178  */
5179 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5180                          struct btrfs_path *path,
5181                          u64 min_trans)
5182 {
5183         struct extent_buffer *cur;
5184         struct btrfs_key found_key;
5185         int slot;
5186         int sret;
5187         u32 nritems;
5188         int level;
5189         int ret = 1;
5190         int keep_locks = path->keep_locks;
5191
5192         path->keep_locks = 1;
5193 again:
5194         cur = btrfs_read_lock_root_node(root);
5195         level = btrfs_header_level(cur);
5196         WARN_ON(path->nodes[level]);
5197         path->nodes[level] = cur;
5198         path->locks[level] = BTRFS_READ_LOCK;
5199
5200         if (btrfs_header_generation(cur) < min_trans) {
5201                 ret = 1;
5202                 goto out;
5203         }
5204         while (1) {
5205                 nritems = btrfs_header_nritems(cur);
5206                 level = btrfs_header_level(cur);
5207                 sret = btrfs_bin_search(cur, min_key, &slot);
5208                 if (sret < 0) {
5209                         ret = sret;
5210                         goto out;
5211                 }
5212
5213                 /* at the lowest level, we're done, setup the path and exit */
5214                 if (level == path->lowest_level) {
5215                         if (slot >= nritems)
5216                                 goto find_next_key;
5217                         ret = 0;
5218                         path->slots[level] = slot;
5219                         btrfs_item_key_to_cpu(cur, &found_key, slot);
5220                         goto out;
5221                 }
5222                 if (sret && slot > 0)
5223                         slot--;
5224                 /*
5225                  * check this node pointer against the min_trans parameters.
5226                  * If it is too old, skip to the next one.
5227                  */
5228                 while (slot < nritems) {
5229                         u64 gen;
5230
5231                         gen = btrfs_node_ptr_generation(cur, slot);
5232                         if (gen < min_trans) {
5233                                 slot++;
5234                                 continue;
5235                         }
5236                         break;
5237                 }
5238 find_next_key:
5239                 /*
5240                  * we didn't find a candidate key in this node, walk forward
5241                  * and find another one
5242                  */
5243                 if (slot >= nritems) {
5244                         path->slots[level] = slot;
5245                         btrfs_set_path_blocking(path);
5246                         sret = btrfs_find_next_key(root, path, min_key, level,
5247                                                   min_trans);
5248                         if (sret == 0) {
5249                                 btrfs_release_path(path);
5250                                 goto again;
5251                         } else {
5252                                 goto out;
5253                         }
5254                 }
5255                 /* save our key for returning back */
5256                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5257                 path->slots[level] = slot;
5258                 if (level == path->lowest_level) {
5259                         ret = 0;
5260                         goto out;
5261                 }
5262                 btrfs_set_path_blocking(path);
5263                 cur = btrfs_read_node_slot(cur, slot);
5264                 if (IS_ERR(cur)) {
5265                         ret = PTR_ERR(cur);
5266                         goto out;
5267                 }
5268
5269                 btrfs_tree_read_lock(cur);
5270
5271                 path->locks[level - 1] = BTRFS_READ_LOCK;
5272                 path->nodes[level - 1] = cur;
5273                 unlock_up(path, level, 1, 0, NULL);
5274         }
5275 out:
5276         path->keep_locks = keep_locks;
5277         if (ret == 0) {
5278                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5279                 btrfs_set_path_blocking(path);
5280                 memcpy(min_key, &found_key, sizeof(found_key));
5281         }
5282         return ret;
5283 }
5284
5285 /*
5286  * this is similar to btrfs_next_leaf, but does not try to preserve
5287  * and fixup the path.  It looks for and returns the next key in the
5288  * tree based on the current path and the min_trans parameters.
5289  *
5290  * 0 is returned if another key is found, < 0 if there are any errors
5291  * and 1 is returned if there are no higher keys in the tree
5292  *
5293  * path->keep_locks should be set to 1 on the search made before
5294  * calling this function.
5295  */
5296 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5297                         struct btrfs_key *key, int level, u64 min_trans)
5298 {
5299         int slot;
5300         struct extent_buffer *c;
5301
5302         WARN_ON(!path->keep_locks && !path->skip_locking);
5303         while (level < BTRFS_MAX_LEVEL) {
5304                 if (!path->nodes[level])
5305                         return 1;
5306
5307                 slot = path->slots[level] + 1;
5308                 c = path->nodes[level];
5309 next:
5310                 if (slot >= btrfs_header_nritems(c)) {
5311                         int ret;
5312                         int orig_lowest;
5313                         struct btrfs_key cur_key;
5314                         if (level + 1 >= BTRFS_MAX_LEVEL ||
5315                             !path->nodes[level + 1])
5316                                 return 1;
5317
5318                         if (path->locks[level + 1] || path->skip_locking) {
5319                                 level++;
5320                                 continue;
5321                         }
5322
5323                         slot = btrfs_header_nritems(c) - 1;
5324                         if (level == 0)
5325                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
5326                         else
5327                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
5328
5329                         orig_lowest = path->lowest_level;
5330                         btrfs_release_path(path);
5331                         path->lowest_level = level;
5332                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
5333                                                 0, 0);
5334                         path->lowest_level = orig_lowest;
5335                         if (ret < 0)
5336                                 return ret;
5337
5338                         c = path->nodes[level];
5339                         slot = path->slots[level];
5340                         if (ret == 0)
5341                                 slot++;
5342                         goto next;
5343                 }
5344
5345                 if (level == 0)
5346                         btrfs_item_key_to_cpu(c, key, slot);
5347                 else {
5348                         u64 gen = btrfs_node_ptr_generation(c, slot);
5349
5350                         if (gen < min_trans) {
5351                                 slot++;
5352                                 goto next;
5353                         }
5354                         btrfs_node_key_to_cpu(c, key, slot);
5355                 }
5356                 return 0;
5357         }
5358         return 1;
5359 }
5360
5361 /*
5362  * search the tree again to find a leaf with greater keys
5363  * returns 0 if it found something or 1 if there are no greater leaves.
5364  * returns < 0 on io errors.
5365  */
5366 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5367 {
5368         return btrfs_next_old_leaf(root, path, 0);
5369 }
5370
5371 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5372                         u64 time_seq)
5373 {
5374         int slot;
5375         int level;
5376         struct extent_buffer *c;
5377         struct extent_buffer *next;
5378         struct btrfs_key key;
5379         u32 nritems;
5380         int ret;
5381         int old_spinning = path->leave_spinning;
5382         int next_rw_lock = 0;
5383
5384         nritems = btrfs_header_nritems(path->nodes[0]);
5385         if (nritems == 0)
5386                 return 1;
5387
5388         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5389 again:
5390         level = 1;
5391         next = NULL;
5392         next_rw_lock = 0;
5393         btrfs_release_path(path);
5394
5395         path->keep_locks = 1;
5396         path->leave_spinning = 1;
5397
5398         if (time_seq)
5399                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5400         else
5401                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5402         path->keep_locks = 0;
5403
5404         if (ret < 0)
5405                 return ret;
5406
5407         nritems = btrfs_header_nritems(path->nodes[0]);
5408         /*
5409          * by releasing the path above we dropped all our locks.  A balance
5410          * could have added more items next to the key that used to be
5411          * at the very end of the block.  So, check again here and
5412          * advance the path if there are now more items available.
5413          */
5414         if (nritems > 0 && path->slots[0] < nritems - 1) {
5415                 if (ret == 0)
5416                         path->slots[0]++;
5417                 ret = 0;
5418                 goto done;
5419         }
5420         /*
5421          * So the above check misses one case:
5422          * - after releasing the path above, someone has removed the item that
5423          *   used to be at the very end of the block, and balance between leafs
5424          *   gets another one with bigger key.offset to replace it.
5425          *
5426          * This one should be returned as well, or we can get leaf corruption
5427          * later(esp. in __btrfs_drop_extents()).
5428          *
5429          * And a bit more explanation about this check,
5430          * with ret > 0, the key isn't found, the path points to the slot
5431          * where it should be inserted, so the path->slots[0] item must be the
5432          * bigger one.
5433          */
5434         if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5435                 ret = 0;
5436                 goto done;
5437         }
5438
5439         while (level < BTRFS_MAX_LEVEL) {
5440                 if (!path->nodes[level]) {
5441                         ret = 1;
5442                         goto done;
5443                 }
5444
5445                 slot = path->slots[level] + 1;
5446                 c = path->nodes[level];
5447                 if (slot >= btrfs_header_nritems(c)) {
5448                         level++;
5449                         if (level == BTRFS_MAX_LEVEL) {
5450                                 ret = 1;
5451                                 goto done;
5452                         }
5453                         continue;
5454                 }
5455
5456                 if (next) {
5457                         btrfs_tree_unlock_rw(next, next_rw_lock);
5458                         free_extent_buffer(next);
5459                 }
5460
5461                 next = c;
5462                 next_rw_lock = path->locks[level];
5463                 ret = read_block_for_search(root, path, &next, level,
5464                                             slot, &key);
5465                 if (ret == -EAGAIN)
5466                         goto again;
5467
5468                 if (ret < 0) {
5469                         btrfs_release_path(path);
5470                         goto done;
5471                 }
5472
5473                 if (!path->skip_locking) {
5474                         ret = btrfs_try_tree_read_lock(next);
5475                         if (!ret && time_seq) {
5476                                 /*
5477                                  * If we don't get the lock, we may be racing
5478                                  * with push_leaf_left, holding that lock while
5479                                  * itself waiting for the leaf we've currently
5480                                  * locked. To solve this situation, we give up
5481                                  * on our lock and cycle.
5482                                  */
5483                                 free_extent_buffer(next);
5484                                 btrfs_release_path(path);
5485                                 cond_resched();
5486                                 goto again;
5487                         }
5488                         if (!ret) {
5489                                 btrfs_set_path_blocking(path);
5490                                 __btrfs_tree_read_lock(next,
5491                                                        BTRFS_NESTING_RIGHT,
5492                                                        path->recurse);
5493                         }
5494                         next_rw_lock = BTRFS_READ_LOCK;
5495                 }
5496                 break;
5497         }
5498         path->slots[level] = slot;
5499         while (1) {
5500                 level--;
5501                 c = path->nodes[level];
5502                 if (path->locks[level])
5503                         btrfs_tree_unlock_rw(c, path->locks[level]);
5504
5505                 free_extent_buffer(c);
5506                 path->nodes[level] = next;
5507                 path->slots[level] = 0;
5508                 if (!path->skip_locking)
5509                         path->locks[level] = next_rw_lock;
5510                 if (!level)
5511                         break;
5512
5513                 ret = read_block_for_search(root, path, &next, level,
5514                                             0, &key);
5515                 if (ret == -EAGAIN)
5516                         goto again;
5517
5518                 if (ret < 0) {
5519                         btrfs_release_path(path);
5520                         goto done;
5521                 }
5522
5523                 if (!path->skip_locking) {
5524                         ret = btrfs_try_tree_read_lock(next);
5525                         if (!ret) {
5526                                 btrfs_set_path_blocking(path);
5527                                 __btrfs_tree_read_lock(next,
5528                                                        BTRFS_NESTING_RIGHT,
5529                                                        path->recurse);
5530                         }
5531                         next_rw_lock = BTRFS_READ_LOCK;
5532                 }
5533         }
5534         ret = 0;
5535 done:
5536         unlock_up(path, 0, 1, 0, NULL);
5537         path->leave_spinning = old_spinning;
5538         if (!old_spinning)
5539                 btrfs_set_path_blocking(path);
5540
5541         return ret;
5542 }
5543
5544 /*
5545  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5546  * searching until it gets past min_objectid or finds an item of 'type'
5547  *
5548  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5549  */
5550 int btrfs_previous_item(struct btrfs_root *root,
5551                         struct btrfs_path *path, u64 min_objectid,
5552                         int type)
5553 {
5554         struct btrfs_key found_key;
5555         struct extent_buffer *leaf;
5556         u32 nritems;
5557         int ret;
5558
5559         while (1) {
5560                 if (path->slots[0] == 0) {
5561                         btrfs_set_path_blocking(path);
5562                         ret = btrfs_prev_leaf(root, path);
5563                         if (ret != 0)
5564                                 return ret;
5565                 } else {
5566                         path->slots[0]--;
5567                 }
5568                 leaf = path->nodes[0];
5569                 nritems = btrfs_header_nritems(leaf);
5570                 if (nritems == 0)
5571                         return 1;
5572                 if (path->slots[0] == nritems)
5573                         path->slots[0]--;
5574
5575                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5576                 if (found_key.objectid < min_objectid)
5577                         break;
5578                 if (found_key.type == type)
5579                         return 0;
5580                 if (found_key.objectid == min_objectid &&
5581                     found_key.type < type)
5582                         break;
5583         }
5584         return 1;
5585 }
5586
5587 /*
5588  * search in extent tree to find a previous Metadata/Data extent item with
5589  * min objecitd.
5590  *
5591  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5592  */
5593 int btrfs_previous_extent_item(struct btrfs_root *root,
5594                         struct btrfs_path *path, u64 min_objectid)
5595 {
5596         struct btrfs_key found_key;
5597         struct extent_buffer *leaf;
5598         u32 nritems;
5599         int ret;
5600
5601         while (1) {
5602                 if (path->slots[0] == 0) {
5603                         btrfs_set_path_blocking(path);
5604                         ret = btrfs_prev_leaf(root, path);
5605                         if (ret != 0)
5606                                 return ret;
5607                 } else {
5608                         path->slots[0]--;
5609                 }
5610                 leaf = path->nodes[0];
5611                 nritems = btrfs_header_nritems(leaf);
5612                 if (nritems == 0)
5613                         return 1;
5614                 if (path->slots[0] == nritems)
5615                         path->slots[0]--;
5616
5617                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5618                 if (found_key.objectid < min_objectid)
5619                         break;
5620                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5621                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5622                         return 0;
5623                 if (found_key.objectid == min_objectid &&
5624                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5625                         break;
5626         }
5627         return 1;
5628 }