Merge tag 'scsi-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb...
[linux-2.6-microblaze.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31                       *root, struct btrfs_key *ins_key,
32                       struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34                           struct btrfs_root *root, struct extent_buffer *dst,
35                           struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37                               struct btrfs_root *root,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41                     int level, int slot);
42 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43                                  struct extent_buffer *eb);
44 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
45
46 struct btrfs_path *btrfs_alloc_path(void)
47 {
48         struct btrfs_path *path;
49         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
50         return path;
51 }
52
53 /*
54  * set all locked nodes in the path to blocking locks.  This should
55  * be done before scheduling
56  */
57 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
58 {
59         int i;
60         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
61                 if (!p->nodes[i] || !p->locks[i])
62                         continue;
63                 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
64                 if (p->locks[i] == BTRFS_READ_LOCK)
65                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
66                 else if (p->locks[i] == BTRFS_WRITE_LOCK)
67                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
68         }
69 }
70
71 /*
72  * reset all the locked nodes in the patch to spinning locks.
73  *
74  * held is used to keep lockdep happy, when lockdep is enabled
75  * we set held to a blocking lock before we go around and
76  * retake all the spinlocks in the path.  You can safely use NULL
77  * for held
78  */
79 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
80                                         struct extent_buffer *held, int held_rw)
81 {
82         int i;
83
84 #ifdef CONFIG_DEBUG_LOCK_ALLOC
85         /* lockdep really cares that we take all of these spinlocks
86          * in the right order.  If any of the locks in the path are not
87          * currently blocking, it is going to complain.  So, make really
88          * really sure by forcing the path to blocking before we clear
89          * the path blocking.
90          */
91         if (held) {
92                 btrfs_set_lock_blocking_rw(held, held_rw);
93                 if (held_rw == BTRFS_WRITE_LOCK)
94                         held_rw = BTRFS_WRITE_LOCK_BLOCKING;
95                 else if (held_rw == BTRFS_READ_LOCK)
96                         held_rw = BTRFS_READ_LOCK_BLOCKING;
97         }
98         btrfs_set_path_blocking(p);
99 #endif
100
101         for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
102                 if (p->nodes[i] && p->locks[i]) {
103                         btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
104                         if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
105                                 p->locks[i] = BTRFS_WRITE_LOCK;
106                         else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
107                                 p->locks[i] = BTRFS_READ_LOCK;
108                 }
109         }
110
111 #ifdef CONFIG_DEBUG_LOCK_ALLOC
112         if (held)
113                 btrfs_clear_lock_blocking_rw(held, held_rw);
114 #endif
115 }
116
117 /* this also releases the path */
118 void btrfs_free_path(struct btrfs_path *p)
119 {
120         if (!p)
121                 return;
122         btrfs_release_path(p);
123         kmem_cache_free(btrfs_path_cachep, p);
124 }
125
126 /*
127  * path release drops references on the extent buffers in the path
128  * and it drops any locks held by this path
129  *
130  * It is safe to call this on paths that no locks or extent buffers held.
131  */
132 noinline void btrfs_release_path(struct btrfs_path *p)
133 {
134         int i;
135
136         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
137                 p->slots[i] = 0;
138                 if (!p->nodes[i])
139                         continue;
140                 if (p->locks[i]) {
141                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
142                         p->locks[i] = 0;
143                 }
144                 free_extent_buffer(p->nodes[i]);
145                 p->nodes[i] = NULL;
146         }
147 }
148
149 /*
150  * safely gets a reference on the root node of a tree.  A lock
151  * is not taken, so a concurrent writer may put a different node
152  * at the root of the tree.  See btrfs_lock_root_node for the
153  * looping required.
154  *
155  * The extent buffer returned by this has a reference taken, so
156  * it won't disappear.  It may stop being the root of the tree
157  * at any time because there are no locks held.
158  */
159 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
160 {
161         struct extent_buffer *eb;
162
163         while (1) {
164                 rcu_read_lock();
165                 eb = rcu_dereference(root->node);
166
167                 /*
168                  * RCU really hurts here, we could free up the root node because
169                  * it was cow'ed but we may not get the new root node yet so do
170                  * the inc_not_zero dance and if it doesn't work then
171                  * synchronize_rcu and try again.
172                  */
173                 if (atomic_inc_not_zero(&eb->refs)) {
174                         rcu_read_unlock();
175                         break;
176                 }
177                 rcu_read_unlock();
178                 synchronize_rcu();
179         }
180         return eb;
181 }
182
183 /* loop around taking references on and locking the root node of the
184  * tree until you end up with a lock on the root.  A locked buffer
185  * is returned, with a reference held.
186  */
187 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
188 {
189         struct extent_buffer *eb;
190
191         while (1) {
192                 eb = btrfs_root_node(root);
193                 btrfs_tree_lock(eb);
194                 if (eb == root->node)
195                         break;
196                 btrfs_tree_unlock(eb);
197                 free_extent_buffer(eb);
198         }
199         return eb;
200 }
201
202 /* loop around taking references on and locking the root node of the
203  * tree until you end up with a lock on the root.  A locked buffer
204  * is returned, with a reference held.
205  */
206 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
207 {
208         struct extent_buffer *eb;
209
210         while (1) {
211                 eb = btrfs_root_node(root);
212                 btrfs_tree_read_lock(eb);
213                 if (eb == root->node)
214                         break;
215                 btrfs_tree_read_unlock(eb);
216                 free_extent_buffer(eb);
217         }
218         return eb;
219 }
220
221 /* cowonly root (everything not a reference counted cow subvolume), just get
222  * put onto a simple dirty list.  transaction.c walks this to make sure they
223  * get properly updated on disk.
224  */
225 static void add_root_to_dirty_list(struct btrfs_root *root)
226 {
227         spin_lock(&root->fs_info->trans_lock);
228         if (root->track_dirty && list_empty(&root->dirty_list)) {
229                 list_add(&root->dirty_list,
230                          &root->fs_info->dirty_cowonly_roots);
231         }
232         spin_unlock(&root->fs_info->trans_lock);
233 }
234
235 /*
236  * used by snapshot creation to make a copy of a root for a tree with
237  * a given objectid.  The buffer with the new root node is returned in
238  * cow_ret, and this func returns zero on success or a negative error code.
239  */
240 int btrfs_copy_root(struct btrfs_trans_handle *trans,
241                       struct btrfs_root *root,
242                       struct extent_buffer *buf,
243                       struct extent_buffer **cow_ret, u64 new_root_objectid)
244 {
245         struct extent_buffer *cow;
246         int ret = 0;
247         int level;
248         struct btrfs_disk_key disk_key;
249
250         WARN_ON(root->ref_cows && trans->transid !=
251                 root->fs_info->running_transaction->transid);
252         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
253
254         level = btrfs_header_level(buf);
255         if (level == 0)
256                 btrfs_item_key(buf, &disk_key, 0);
257         else
258                 btrfs_node_key(buf, &disk_key, 0);
259
260         cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
261                                      new_root_objectid, &disk_key, level,
262                                      buf->start, 0);
263         if (IS_ERR(cow))
264                 return PTR_ERR(cow);
265
266         copy_extent_buffer(cow, buf, 0, 0, cow->len);
267         btrfs_set_header_bytenr(cow, cow->start);
268         btrfs_set_header_generation(cow, trans->transid);
269         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
270         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
271                                      BTRFS_HEADER_FLAG_RELOC);
272         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
273                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
274         else
275                 btrfs_set_header_owner(cow, new_root_objectid);
276
277         write_extent_buffer(cow, root->fs_info->fsid,
278                             (unsigned long)btrfs_header_fsid(cow),
279                             BTRFS_FSID_SIZE);
280
281         WARN_ON(btrfs_header_generation(buf) > trans->transid);
282         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
283                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
284         else
285                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
286
287         if (ret)
288                 return ret;
289
290         btrfs_mark_buffer_dirty(cow);
291         *cow_ret = cow;
292         return 0;
293 }
294
295 enum mod_log_op {
296         MOD_LOG_KEY_REPLACE,
297         MOD_LOG_KEY_ADD,
298         MOD_LOG_KEY_REMOVE,
299         MOD_LOG_KEY_REMOVE_WHILE_FREEING,
300         MOD_LOG_KEY_REMOVE_WHILE_MOVING,
301         MOD_LOG_MOVE_KEYS,
302         MOD_LOG_ROOT_REPLACE,
303 };
304
305 struct tree_mod_move {
306         int dst_slot;
307         int nr_items;
308 };
309
310 struct tree_mod_root {
311         u64 logical;
312         u8 level;
313 };
314
315 struct tree_mod_elem {
316         struct rb_node node;
317         u64 index;              /* shifted logical */
318         u64 seq;
319         enum mod_log_op op;
320
321         /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
322         int slot;
323
324         /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
325         u64 generation;
326
327         /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
328         struct btrfs_disk_key key;
329         u64 blockptr;
330
331         /* this is used for op == MOD_LOG_MOVE_KEYS */
332         struct tree_mod_move move;
333
334         /* this is used for op == MOD_LOG_ROOT_REPLACE */
335         struct tree_mod_root old_root;
336 };
337
338 static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
339 {
340         read_lock(&fs_info->tree_mod_log_lock);
341 }
342
343 static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
344 {
345         read_unlock(&fs_info->tree_mod_log_lock);
346 }
347
348 static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
349 {
350         write_lock(&fs_info->tree_mod_log_lock);
351 }
352
353 static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
354 {
355         write_unlock(&fs_info->tree_mod_log_lock);
356 }
357
358 /*
359  * Increment the upper half of tree_mod_seq, set lower half zero.
360  *
361  * Must be called with fs_info->tree_mod_seq_lock held.
362  */
363 static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
364 {
365         u64 seq = atomic64_read(&fs_info->tree_mod_seq);
366         seq &= 0xffffffff00000000ull;
367         seq += 1ull << 32;
368         atomic64_set(&fs_info->tree_mod_seq, seq);
369         return seq;
370 }
371
372 /*
373  * Increment the lower half of tree_mod_seq.
374  *
375  * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
376  * are generated should not technically require a spin lock here. (Rationale:
377  * incrementing the minor while incrementing the major seq number is between its
378  * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
379  * just returns a unique sequence number as usual.) We have decided to leave
380  * that requirement in here and rethink it once we notice it really imposes a
381  * problem on some workload.
382  */
383 static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
384 {
385         return atomic64_inc_return(&fs_info->tree_mod_seq);
386 }
387
388 /*
389  * return the last minor in the previous major tree_mod_seq number
390  */
391 u64 btrfs_tree_mod_seq_prev(u64 seq)
392 {
393         return (seq & 0xffffffff00000000ull) - 1ull;
394 }
395
396 /*
397  * This adds a new blocker to the tree mod log's blocker list if the @elem
398  * passed does not already have a sequence number set. So when a caller expects
399  * to record tree modifications, it should ensure to set elem->seq to zero
400  * before calling btrfs_get_tree_mod_seq.
401  * Returns a fresh, unused tree log modification sequence number, even if no new
402  * blocker was added.
403  */
404 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
405                            struct seq_list *elem)
406 {
407         u64 seq;
408
409         tree_mod_log_write_lock(fs_info);
410         spin_lock(&fs_info->tree_mod_seq_lock);
411         if (!elem->seq) {
412                 elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
413                 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
414         }
415         seq = btrfs_inc_tree_mod_seq_minor(fs_info);
416         spin_unlock(&fs_info->tree_mod_seq_lock);
417         tree_mod_log_write_unlock(fs_info);
418
419         return seq;
420 }
421
422 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
423                             struct seq_list *elem)
424 {
425         struct rb_root *tm_root;
426         struct rb_node *node;
427         struct rb_node *next;
428         struct seq_list *cur_elem;
429         struct tree_mod_elem *tm;
430         u64 min_seq = (u64)-1;
431         u64 seq_putting = elem->seq;
432
433         if (!seq_putting)
434                 return;
435
436         spin_lock(&fs_info->tree_mod_seq_lock);
437         list_del(&elem->list);
438         elem->seq = 0;
439
440         list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
441                 if (cur_elem->seq < min_seq) {
442                         if (seq_putting > cur_elem->seq) {
443                                 /*
444                                  * blocker with lower sequence number exists, we
445                                  * cannot remove anything from the log
446                                  */
447                                 spin_unlock(&fs_info->tree_mod_seq_lock);
448                                 return;
449                         }
450                         min_seq = cur_elem->seq;
451                 }
452         }
453         spin_unlock(&fs_info->tree_mod_seq_lock);
454
455         /*
456          * anything that's lower than the lowest existing (read: blocked)
457          * sequence number can be removed from the tree.
458          */
459         tree_mod_log_write_lock(fs_info);
460         tm_root = &fs_info->tree_mod_log;
461         for (node = rb_first(tm_root); node; node = next) {
462                 next = rb_next(node);
463                 tm = container_of(node, struct tree_mod_elem, node);
464                 if (tm->seq > min_seq)
465                         continue;
466                 rb_erase(node, tm_root);
467                 kfree(tm);
468         }
469         tree_mod_log_write_unlock(fs_info);
470 }
471
472 /*
473  * key order of the log:
474  *       index -> sequence
475  *
476  * the index is the shifted logical of the *new* root node for root replace
477  * operations, or the shifted logical of the affected block for all other
478  * operations.
479  */
480 static noinline int
481 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
482 {
483         struct rb_root *tm_root;
484         struct rb_node **new;
485         struct rb_node *parent = NULL;
486         struct tree_mod_elem *cur;
487
488         BUG_ON(!tm || !tm->seq);
489
490         tm_root = &fs_info->tree_mod_log;
491         new = &tm_root->rb_node;
492         while (*new) {
493                 cur = container_of(*new, struct tree_mod_elem, node);
494                 parent = *new;
495                 if (cur->index < tm->index)
496                         new = &((*new)->rb_left);
497                 else if (cur->index > tm->index)
498                         new = &((*new)->rb_right);
499                 else if (cur->seq < tm->seq)
500                         new = &((*new)->rb_left);
501                 else if (cur->seq > tm->seq)
502                         new = &((*new)->rb_right);
503                 else {
504                         kfree(tm);
505                         return -EEXIST;
506                 }
507         }
508
509         rb_link_node(&tm->node, parent, new);
510         rb_insert_color(&tm->node, tm_root);
511         return 0;
512 }
513
514 /*
515  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
516  * returns zero with the tree_mod_log_lock acquired. The caller must hold
517  * this until all tree mod log insertions are recorded in the rb tree and then
518  * call tree_mod_log_write_unlock() to release.
519  */
520 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
521                                     struct extent_buffer *eb) {
522         smp_mb();
523         if (list_empty(&(fs_info)->tree_mod_seq_list))
524                 return 1;
525         if (eb && btrfs_header_level(eb) == 0)
526                 return 1;
527
528         tree_mod_log_write_lock(fs_info);
529         if (list_empty(&fs_info->tree_mod_seq_list)) {
530                 /*
531                  * someone emptied the list while we were waiting for the lock.
532                  * we must not add to the list when no blocker exists.
533                  */
534                 tree_mod_log_write_unlock(fs_info);
535                 return 1;
536         }
537
538         return 0;
539 }
540
541 /*
542  * This allocates memory and gets a tree modification sequence number.
543  *
544  * Returns <0 on error.
545  * Returns >0 (the added sequence number) on success.
546  */
547 static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
548                                  struct tree_mod_elem **tm_ret)
549 {
550         struct tree_mod_elem *tm;
551
552         /*
553          * once we switch from spin locks to something different, we should
554          * honor the flags parameter here.
555          */
556         tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
557         if (!tm)
558                 return -ENOMEM;
559
560         spin_lock(&fs_info->tree_mod_seq_lock);
561         tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
562         spin_unlock(&fs_info->tree_mod_seq_lock);
563
564         return tm->seq;
565 }
566
567 static inline int
568 __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
569                           struct extent_buffer *eb, int slot,
570                           enum mod_log_op op, gfp_t flags)
571 {
572         int ret;
573         struct tree_mod_elem *tm;
574
575         ret = tree_mod_alloc(fs_info, flags, &tm);
576         if (ret < 0)
577                 return ret;
578
579         tm->index = eb->start >> PAGE_CACHE_SHIFT;
580         if (op != MOD_LOG_KEY_ADD) {
581                 btrfs_node_key(eb, &tm->key, slot);
582                 tm->blockptr = btrfs_node_blockptr(eb, slot);
583         }
584         tm->op = op;
585         tm->slot = slot;
586         tm->generation = btrfs_node_ptr_generation(eb, slot);
587
588         return __tree_mod_log_insert(fs_info, tm);
589 }
590
591 static noinline int
592 tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
593                              struct extent_buffer *eb, int slot,
594                              enum mod_log_op op, gfp_t flags)
595 {
596         int ret;
597
598         if (tree_mod_dont_log(fs_info, eb))
599                 return 0;
600
601         ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
602
603         tree_mod_log_write_unlock(fs_info);
604         return ret;
605 }
606
607 static noinline int
608 tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
609                         int slot, enum mod_log_op op)
610 {
611         return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
612 }
613
614 static noinline int
615 tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
616                              struct extent_buffer *eb, int slot,
617                              enum mod_log_op op)
618 {
619         return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
620 }
621
622 static noinline int
623 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
624                          struct extent_buffer *eb, int dst_slot, int src_slot,
625                          int nr_items, gfp_t flags)
626 {
627         struct tree_mod_elem *tm;
628         int ret;
629         int i;
630
631         if (tree_mod_dont_log(fs_info, eb))
632                 return 0;
633
634         /*
635          * When we override something during the move, we log these removals.
636          * This can only happen when we move towards the beginning of the
637          * buffer, i.e. dst_slot < src_slot.
638          */
639         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
640                 ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
641                                               MOD_LOG_KEY_REMOVE_WHILE_MOVING);
642                 BUG_ON(ret < 0);
643         }
644
645         ret = tree_mod_alloc(fs_info, flags, &tm);
646         if (ret < 0)
647                 goto out;
648
649         tm->index = eb->start >> PAGE_CACHE_SHIFT;
650         tm->slot = src_slot;
651         tm->move.dst_slot = dst_slot;
652         tm->move.nr_items = nr_items;
653         tm->op = MOD_LOG_MOVE_KEYS;
654
655         ret = __tree_mod_log_insert(fs_info, tm);
656 out:
657         tree_mod_log_write_unlock(fs_info);
658         return ret;
659 }
660
661 static inline void
662 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
663 {
664         int i;
665         u32 nritems;
666         int ret;
667
668         if (btrfs_header_level(eb) == 0)
669                 return;
670
671         nritems = btrfs_header_nritems(eb);
672         for (i = nritems - 1; i >= 0; i--) {
673                 ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
674                                               MOD_LOG_KEY_REMOVE_WHILE_FREEING);
675                 BUG_ON(ret < 0);
676         }
677 }
678
679 static noinline int
680 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
681                          struct extent_buffer *old_root,
682                          struct extent_buffer *new_root, gfp_t flags,
683                          int log_removal)
684 {
685         struct tree_mod_elem *tm;
686         int ret;
687
688         if (tree_mod_dont_log(fs_info, NULL))
689                 return 0;
690
691         if (log_removal)
692                 __tree_mod_log_free_eb(fs_info, old_root);
693
694         ret = tree_mod_alloc(fs_info, flags, &tm);
695         if (ret < 0)
696                 goto out;
697
698         tm->index = new_root->start >> PAGE_CACHE_SHIFT;
699         tm->old_root.logical = old_root->start;
700         tm->old_root.level = btrfs_header_level(old_root);
701         tm->generation = btrfs_header_generation(old_root);
702         tm->op = MOD_LOG_ROOT_REPLACE;
703
704         ret = __tree_mod_log_insert(fs_info, tm);
705 out:
706         tree_mod_log_write_unlock(fs_info);
707         return ret;
708 }
709
710 static struct tree_mod_elem *
711 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
712                       int smallest)
713 {
714         struct rb_root *tm_root;
715         struct rb_node *node;
716         struct tree_mod_elem *cur = NULL;
717         struct tree_mod_elem *found = NULL;
718         u64 index = start >> PAGE_CACHE_SHIFT;
719
720         tree_mod_log_read_lock(fs_info);
721         tm_root = &fs_info->tree_mod_log;
722         node = tm_root->rb_node;
723         while (node) {
724                 cur = container_of(node, struct tree_mod_elem, node);
725                 if (cur->index < index) {
726                         node = node->rb_left;
727                 } else if (cur->index > index) {
728                         node = node->rb_right;
729                 } else if (cur->seq < min_seq) {
730                         node = node->rb_left;
731                 } else if (!smallest) {
732                         /* we want the node with the highest seq */
733                         if (found)
734                                 BUG_ON(found->seq > cur->seq);
735                         found = cur;
736                         node = node->rb_left;
737                 } else if (cur->seq > min_seq) {
738                         /* we want the node with the smallest seq */
739                         if (found)
740                                 BUG_ON(found->seq < cur->seq);
741                         found = cur;
742                         node = node->rb_right;
743                 } else {
744                         found = cur;
745                         break;
746                 }
747         }
748         tree_mod_log_read_unlock(fs_info);
749
750         return found;
751 }
752
753 /*
754  * this returns the element from the log with the smallest time sequence
755  * value that's in the log (the oldest log item). any element with a time
756  * sequence lower than min_seq will be ignored.
757  */
758 static struct tree_mod_elem *
759 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
760                            u64 min_seq)
761 {
762         return __tree_mod_log_search(fs_info, start, min_seq, 1);
763 }
764
765 /*
766  * this returns the element from the log with the largest time sequence
767  * value that's in the log (the most recent log item). any element with
768  * a time sequence lower than min_seq will be ignored.
769  */
770 static struct tree_mod_elem *
771 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
772 {
773         return __tree_mod_log_search(fs_info, start, min_seq, 0);
774 }
775
776 static noinline void
777 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
778                      struct extent_buffer *src, unsigned long dst_offset,
779                      unsigned long src_offset, int nr_items)
780 {
781         int ret;
782         int i;
783
784         if (tree_mod_dont_log(fs_info, NULL))
785                 return;
786
787         if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
788                 tree_mod_log_write_unlock(fs_info);
789                 return;
790         }
791
792         for (i = 0; i < nr_items; i++) {
793                 ret = tree_mod_log_insert_key_locked(fs_info, src,
794                                                 i + src_offset,
795                                                 MOD_LOG_KEY_REMOVE);
796                 BUG_ON(ret < 0);
797                 ret = tree_mod_log_insert_key_locked(fs_info, dst,
798                                                      i + dst_offset,
799                                                      MOD_LOG_KEY_ADD);
800                 BUG_ON(ret < 0);
801         }
802
803         tree_mod_log_write_unlock(fs_info);
804 }
805
806 static inline void
807 tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
808                      int dst_offset, int src_offset, int nr_items)
809 {
810         int ret;
811         ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
812                                        nr_items, GFP_NOFS);
813         BUG_ON(ret < 0);
814 }
815
816 static noinline void
817 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
818                           struct extent_buffer *eb, int slot, int atomic)
819 {
820         int ret;
821
822         ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
823                                            MOD_LOG_KEY_REPLACE,
824                                            atomic ? GFP_ATOMIC : GFP_NOFS);
825         BUG_ON(ret < 0);
826 }
827
828 static noinline void
829 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
830 {
831         if (tree_mod_dont_log(fs_info, eb))
832                 return;
833
834         __tree_mod_log_free_eb(fs_info, eb);
835
836         tree_mod_log_write_unlock(fs_info);
837 }
838
839 static noinline void
840 tree_mod_log_set_root_pointer(struct btrfs_root *root,
841                               struct extent_buffer *new_root_node,
842                               int log_removal)
843 {
844         int ret;
845         ret = tree_mod_log_insert_root(root->fs_info, root->node,
846                                        new_root_node, GFP_NOFS, log_removal);
847         BUG_ON(ret < 0);
848 }
849
850 /*
851  * check if the tree block can be shared by multiple trees
852  */
853 int btrfs_block_can_be_shared(struct btrfs_root *root,
854                               struct extent_buffer *buf)
855 {
856         /*
857          * Tree blocks not in refernece counted trees and tree roots
858          * are never shared. If a block was allocated after the last
859          * snapshot and the block was not allocated by tree relocation,
860          * we know the block is not shared.
861          */
862         if (root->ref_cows &&
863             buf != root->node && buf != root->commit_root &&
864             (btrfs_header_generation(buf) <=
865              btrfs_root_last_snapshot(&root->root_item) ||
866              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
867                 return 1;
868 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
869         if (root->ref_cows &&
870             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
871                 return 1;
872 #endif
873         return 0;
874 }
875
876 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
877                                        struct btrfs_root *root,
878                                        struct extent_buffer *buf,
879                                        struct extent_buffer *cow,
880                                        int *last_ref)
881 {
882         u64 refs;
883         u64 owner;
884         u64 flags;
885         u64 new_flags = 0;
886         int ret;
887
888         /*
889          * Backrefs update rules:
890          *
891          * Always use full backrefs for extent pointers in tree block
892          * allocated by tree relocation.
893          *
894          * If a shared tree block is no longer referenced by its owner
895          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
896          * use full backrefs for extent pointers in tree block.
897          *
898          * If a tree block is been relocating
899          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
900          * use full backrefs for extent pointers in tree block.
901          * The reason for this is some operations (such as drop tree)
902          * are only allowed for blocks use full backrefs.
903          */
904
905         if (btrfs_block_can_be_shared(root, buf)) {
906                 ret = btrfs_lookup_extent_info(trans, root, buf->start,
907                                                btrfs_header_level(buf), 1,
908                                                &refs, &flags);
909                 if (ret)
910                         return ret;
911                 if (refs == 0) {
912                         ret = -EROFS;
913                         btrfs_std_error(root->fs_info, ret);
914                         return ret;
915                 }
916         } else {
917                 refs = 1;
918                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
919                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
920                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
921                 else
922                         flags = 0;
923         }
924
925         owner = btrfs_header_owner(buf);
926         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
927                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
928
929         if (refs > 1) {
930                 if ((owner == root->root_key.objectid ||
931                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
932                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
933                         ret = btrfs_inc_ref(trans, root, buf, 1, 1);
934                         BUG_ON(ret); /* -ENOMEM */
935
936                         if (root->root_key.objectid ==
937                             BTRFS_TREE_RELOC_OBJECTID) {
938                                 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
939                                 BUG_ON(ret); /* -ENOMEM */
940                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
941                                 BUG_ON(ret); /* -ENOMEM */
942                         }
943                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
944                 } else {
945
946                         if (root->root_key.objectid ==
947                             BTRFS_TREE_RELOC_OBJECTID)
948                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
949                         else
950                                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
951                         BUG_ON(ret); /* -ENOMEM */
952                 }
953                 if (new_flags != 0) {
954                         ret = btrfs_set_disk_extent_flags(trans, root,
955                                                           buf->start,
956                                                           buf->len,
957                                                           new_flags, 0);
958                         if (ret)
959                                 return ret;
960                 }
961         } else {
962                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
963                         if (root->root_key.objectid ==
964                             BTRFS_TREE_RELOC_OBJECTID)
965                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
966                         else
967                                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
968                         BUG_ON(ret); /* -ENOMEM */
969                         ret = btrfs_dec_ref(trans, root, buf, 1, 1);
970                         BUG_ON(ret); /* -ENOMEM */
971                 }
972                 clean_tree_block(trans, root, buf);
973                 *last_ref = 1;
974         }
975         return 0;
976 }
977
978 /*
979  * does the dirty work in cow of a single block.  The parent block (if
980  * supplied) is updated to point to the new cow copy.  The new buffer is marked
981  * dirty and returned locked.  If you modify the block it needs to be marked
982  * dirty again.
983  *
984  * search_start -- an allocation hint for the new block
985  *
986  * empty_size -- a hint that you plan on doing more cow.  This is the size in
987  * bytes the allocator should try to find free next to the block it returns.
988  * This is just a hint and may be ignored by the allocator.
989  */
990 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
991                              struct btrfs_root *root,
992                              struct extent_buffer *buf,
993                              struct extent_buffer *parent, int parent_slot,
994                              struct extent_buffer **cow_ret,
995                              u64 search_start, u64 empty_size)
996 {
997         struct btrfs_disk_key disk_key;
998         struct extent_buffer *cow;
999         int level, ret;
1000         int last_ref = 0;
1001         int unlock_orig = 0;
1002         u64 parent_start;
1003
1004         if (*cow_ret == buf)
1005                 unlock_orig = 1;
1006
1007         btrfs_assert_tree_locked(buf);
1008
1009         WARN_ON(root->ref_cows && trans->transid !=
1010                 root->fs_info->running_transaction->transid);
1011         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
1012
1013         level = btrfs_header_level(buf);
1014
1015         if (level == 0)
1016                 btrfs_item_key(buf, &disk_key, 0);
1017         else
1018                 btrfs_node_key(buf, &disk_key, 0);
1019
1020         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1021                 if (parent)
1022                         parent_start = parent->start;
1023                 else
1024                         parent_start = 0;
1025         } else
1026                 parent_start = 0;
1027
1028         cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
1029                                      root->root_key.objectid, &disk_key,
1030                                      level, search_start, empty_size);
1031         if (IS_ERR(cow))
1032                 return PTR_ERR(cow);
1033
1034         /* cow is set to blocking by btrfs_init_new_buffer */
1035
1036         copy_extent_buffer(cow, buf, 0, 0, cow->len);
1037         btrfs_set_header_bytenr(cow, cow->start);
1038         btrfs_set_header_generation(cow, trans->transid);
1039         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1040         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1041                                      BTRFS_HEADER_FLAG_RELOC);
1042         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1043                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1044         else
1045                 btrfs_set_header_owner(cow, root->root_key.objectid);
1046
1047         write_extent_buffer(cow, root->fs_info->fsid,
1048                             (unsigned long)btrfs_header_fsid(cow),
1049                             BTRFS_FSID_SIZE);
1050
1051         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1052         if (ret) {
1053                 btrfs_abort_transaction(trans, root, ret);
1054                 return ret;
1055         }
1056
1057         if (root->ref_cows)
1058                 btrfs_reloc_cow_block(trans, root, buf, cow);
1059
1060         if (buf == root->node) {
1061                 WARN_ON(parent && parent != buf);
1062                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1063                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1064                         parent_start = buf->start;
1065                 else
1066                         parent_start = 0;
1067
1068                 extent_buffer_get(cow);
1069                 tree_mod_log_set_root_pointer(root, cow, 1);
1070                 rcu_assign_pointer(root->node, cow);
1071
1072                 btrfs_free_tree_block(trans, root, buf, parent_start,
1073                                       last_ref);
1074                 free_extent_buffer(buf);
1075                 add_root_to_dirty_list(root);
1076         } else {
1077                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1078                         parent_start = parent->start;
1079                 else
1080                         parent_start = 0;
1081
1082                 WARN_ON(trans->transid != btrfs_header_generation(parent));
1083                 tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1084                                         MOD_LOG_KEY_REPLACE);
1085                 btrfs_set_node_blockptr(parent, parent_slot,
1086                                         cow->start);
1087                 btrfs_set_node_ptr_generation(parent, parent_slot,
1088                                               trans->transid);
1089                 btrfs_mark_buffer_dirty(parent);
1090                 tree_mod_log_free_eb(root->fs_info, buf);
1091                 btrfs_free_tree_block(trans, root, buf, parent_start,
1092                                       last_ref);
1093         }
1094         if (unlock_orig)
1095                 btrfs_tree_unlock(buf);
1096         free_extent_buffer_stale(buf);
1097         btrfs_mark_buffer_dirty(cow);
1098         *cow_ret = cow;
1099         return 0;
1100 }
1101
1102 /*
1103  * returns the logical address of the oldest predecessor of the given root.
1104  * entries older than time_seq are ignored.
1105  */
1106 static struct tree_mod_elem *
1107 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1108                            struct extent_buffer *eb_root, u64 time_seq)
1109 {
1110         struct tree_mod_elem *tm;
1111         struct tree_mod_elem *found = NULL;
1112         u64 root_logical = eb_root->start;
1113         int looped = 0;
1114
1115         if (!time_seq)
1116                 return 0;
1117
1118         /*
1119          * the very last operation that's logged for a root is the replacement
1120          * operation (if it is replaced at all). this has the index of the *new*
1121          * root, making it the very first operation that's logged for this root.
1122          */
1123         while (1) {
1124                 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1125                                                 time_seq);
1126                 if (!looped && !tm)
1127                         return 0;
1128                 /*
1129                  * if there are no tree operation for the oldest root, we simply
1130                  * return it. this should only happen if that (old) root is at
1131                  * level 0.
1132                  */
1133                 if (!tm)
1134                         break;
1135
1136                 /*
1137                  * if there's an operation that's not a root replacement, we
1138                  * found the oldest version of our root. normally, we'll find a
1139                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1140                  */
1141                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1142                         break;
1143
1144                 found = tm;
1145                 root_logical = tm->old_root.logical;
1146                 looped = 1;
1147         }
1148
1149         /* if there's no old root to return, return what we found instead */
1150         if (!found)
1151                 found = tm;
1152
1153         return found;
1154 }
1155
1156 /*
1157  * tm is a pointer to the first operation to rewind within eb. then, all
1158  * previous operations will be rewinded (until we reach something older than
1159  * time_seq).
1160  */
1161 static void
1162 __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1163                       struct tree_mod_elem *first_tm)
1164 {
1165         u32 n;
1166         struct rb_node *next;
1167         struct tree_mod_elem *tm = first_tm;
1168         unsigned long o_dst;
1169         unsigned long o_src;
1170         unsigned long p_size = sizeof(struct btrfs_key_ptr);
1171
1172         n = btrfs_header_nritems(eb);
1173         while (tm && tm->seq >= time_seq) {
1174                 /*
1175                  * all the operations are recorded with the operator used for
1176                  * the modification. as we're going backwards, we do the
1177                  * opposite of each operation here.
1178                  */
1179                 switch (tm->op) {
1180                 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1181                         BUG_ON(tm->slot < n);
1182                         /* Fallthrough */
1183                 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1184                 case MOD_LOG_KEY_REMOVE:
1185                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1186                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1187                         btrfs_set_node_ptr_generation(eb, tm->slot,
1188                                                       tm->generation);
1189                         n++;
1190                         break;
1191                 case MOD_LOG_KEY_REPLACE:
1192                         BUG_ON(tm->slot >= n);
1193                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1194                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1195                         btrfs_set_node_ptr_generation(eb, tm->slot,
1196                                                       tm->generation);
1197                         break;
1198                 case MOD_LOG_KEY_ADD:
1199                         /* if a move operation is needed it's in the log */
1200                         n--;
1201                         break;
1202                 case MOD_LOG_MOVE_KEYS:
1203                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
1204                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1205                         memmove_extent_buffer(eb, o_dst, o_src,
1206                                               tm->move.nr_items * p_size);
1207                         break;
1208                 case MOD_LOG_ROOT_REPLACE:
1209                         /*
1210                          * this operation is special. for roots, this must be
1211                          * handled explicitly before rewinding.
1212                          * for non-roots, this operation may exist if the node
1213                          * was a root: root A -> child B; then A gets empty and
1214                          * B is promoted to the new root. in the mod log, we'll
1215                          * have a root-replace operation for B, a tree block
1216                          * that is no root. we simply ignore that operation.
1217                          */
1218                         break;
1219                 }
1220                 next = rb_next(&tm->node);
1221                 if (!next)
1222                         break;
1223                 tm = container_of(next, struct tree_mod_elem, node);
1224                 if (tm->index != first_tm->index)
1225                         break;
1226         }
1227         btrfs_set_header_nritems(eb, n);
1228 }
1229
1230 /*
1231  * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
1232  * is returned. If rewind operations happen, a fresh buffer is returned. The
1233  * returned buffer is always read-locked. If the returned buffer is not the
1234  * input buffer, the lock on the input buffer is released and the input buffer
1235  * is freed (its refcount is decremented).
1236  */
1237 static struct extent_buffer *
1238 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1239                     u64 time_seq)
1240 {
1241         struct extent_buffer *eb_rewin;
1242         struct tree_mod_elem *tm;
1243
1244         if (!time_seq)
1245                 return eb;
1246
1247         if (btrfs_header_level(eb) == 0)
1248                 return eb;
1249
1250         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1251         if (!tm)
1252                 return eb;
1253
1254         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1255                 BUG_ON(tm->slot != 0);
1256                 eb_rewin = alloc_dummy_extent_buffer(eb->start,
1257                                                 fs_info->tree_root->nodesize);
1258                 BUG_ON(!eb_rewin);
1259                 btrfs_set_header_bytenr(eb_rewin, eb->start);
1260                 btrfs_set_header_backref_rev(eb_rewin,
1261                                              btrfs_header_backref_rev(eb));
1262                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1263                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1264         } else {
1265                 eb_rewin = btrfs_clone_extent_buffer(eb);
1266                 BUG_ON(!eb_rewin);
1267         }
1268
1269         extent_buffer_get(eb_rewin);
1270         btrfs_tree_read_unlock(eb);
1271         free_extent_buffer(eb);
1272
1273         extent_buffer_get(eb_rewin);
1274         btrfs_tree_read_lock(eb_rewin);
1275         __tree_mod_log_rewind(eb_rewin, time_seq, tm);
1276         WARN_ON(btrfs_header_nritems(eb_rewin) >
1277                 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1278
1279         return eb_rewin;
1280 }
1281
1282 /*
1283  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1284  * value. If there are no changes, the current root->root_node is returned. If
1285  * anything changed in between, there's a fresh buffer allocated on which the
1286  * rewind operations are done. In any case, the returned buffer is read locked.
1287  * Returns NULL on error (with no locks held).
1288  */
1289 static inline struct extent_buffer *
1290 get_old_root(struct btrfs_root *root, u64 time_seq)
1291 {
1292         struct tree_mod_elem *tm;
1293         struct extent_buffer *eb = NULL;
1294         struct extent_buffer *eb_root;
1295         struct extent_buffer *old;
1296         struct tree_mod_root *old_root = NULL;
1297         u64 old_generation = 0;
1298         u64 logical;
1299         u32 blocksize;
1300
1301         eb_root = btrfs_read_lock_root_node(root);
1302         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1303         if (!tm)
1304                 return eb_root;
1305
1306         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1307                 old_root = &tm->old_root;
1308                 old_generation = tm->generation;
1309                 logical = old_root->logical;
1310         } else {
1311                 logical = eb_root->start;
1312         }
1313
1314         tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1315         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1316                 btrfs_tree_read_unlock(eb_root);
1317                 free_extent_buffer(eb_root);
1318                 blocksize = btrfs_level_size(root, old_root->level);
1319                 old = read_tree_block(root, logical, blocksize, 0);
1320                 if (!old || !extent_buffer_uptodate(old)) {
1321                         free_extent_buffer(old);
1322                         pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
1323                                 logical);
1324                         WARN_ON(1);
1325                 } else {
1326                         eb = btrfs_clone_extent_buffer(old);
1327                         free_extent_buffer(old);
1328                 }
1329         } else if (old_root) {
1330                 btrfs_tree_read_unlock(eb_root);
1331                 free_extent_buffer(eb_root);
1332                 eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1333         } else {
1334                 eb = btrfs_clone_extent_buffer(eb_root);
1335                 btrfs_tree_read_unlock(eb_root);
1336                 free_extent_buffer(eb_root);
1337         }
1338
1339         if (!eb)
1340                 return NULL;
1341         extent_buffer_get(eb);
1342         btrfs_tree_read_lock(eb);
1343         if (old_root) {
1344                 btrfs_set_header_bytenr(eb, eb->start);
1345                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1346                 btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1347                 btrfs_set_header_level(eb, old_root->level);
1348                 btrfs_set_header_generation(eb, old_generation);
1349         }
1350         if (tm)
1351                 __tree_mod_log_rewind(eb, time_seq, tm);
1352         else
1353                 WARN_ON(btrfs_header_level(eb) != 0);
1354         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1355
1356         return eb;
1357 }
1358
1359 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1360 {
1361         struct tree_mod_elem *tm;
1362         int level;
1363         struct extent_buffer *eb_root = btrfs_root_node(root);
1364
1365         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1366         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1367                 level = tm->old_root.level;
1368         } else {
1369                 level = btrfs_header_level(eb_root);
1370         }
1371         free_extent_buffer(eb_root);
1372
1373         return level;
1374 }
1375
1376 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1377                                    struct btrfs_root *root,
1378                                    struct extent_buffer *buf)
1379 {
1380         /* ensure we can see the force_cow */
1381         smp_rmb();
1382
1383         /*
1384          * We do not need to cow a block if
1385          * 1) this block is not created or changed in this transaction;
1386          * 2) this block does not belong to TREE_RELOC tree;
1387          * 3) the root is not forced COW.
1388          *
1389          * What is forced COW:
1390          *    when we create snapshot during commiting the transaction,
1391          *    after we've finished coping src root, we must COW the shared
1392          *    block to ensure the metadata consistency.
1393          */
1394         if (btrfs_header_generation(buf) == trans->transid &&
1395             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1396             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1397               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1398             !root->force_cow)
1399                 return 0;
1400         return 1;
1401 }
1402
1403 /*
1404  * cows a single block, see __btrfs_cow_block for the real work.
1405  * This version of it has extra checks so that a block isn't cow'd more than
1406  * once per transaction, as long as it hasn't been written yet
1407  */
1408 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1409                     struct btrfs_root *root, struct extent_buffer *buf,
1410                     struct extent_buffer *parent, int parent_slot,
1411                     struct extent_buffer **cow_ret)
1412 {
1413         u64 search_start;
1414         int ret;
1415
1416         if (trans->transaction != root->fs_info->running_transaction)
1417                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1418                        (unsigned long long)trans->transid,
1419                        (unsigned long long)
1420                        root->fs_info->running_transaction->transid);
1421
1422         if (trans->transid != root->fs_info->generation)
1423                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1424                        (unsigned long long)trans->transid,
1425                        (unsigned long long)root->fs_info->generation);
1426
1427         if (!should_cow_block(trans, root, buf)) {
1428                 *cow_ret = buf;
1429                 return 0;
1430         }
1431
1432         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1433
1434         if (parent)
1435                 btrfs_set_lock_blocking(parent);
1436         btrfs_set_lock_blocking(buf);
1437
1438         ret = __btrfs_cow_block(trans, root, buf, parent,
1439                                  parent_slot, cow_ret, search_start, 0);
1440
1441         trace_btrfs_cow_block(root, buf, *cow_ret);
1442
1443         return ret;
1444 }
1445
1446 /*
1447  * helper function for defrag to decide if two blocks pointed to by a
1448  * node are actually close by
1449  */
1450 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1451 {
1452         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1453                 return 1;
1454         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1455                 return 1;
1456         return 0;
1457 }
1458
1459 /*
1460  * compare two keys in a memcmp fashion
1461  */
1462 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1463 {
1464         struct btrfs_key k1;
1465
1466         btrfs_disk_key_to_cpu(&k1, disk);
1467
1468         return btrfs_comp_cpu_keys(&k1, k2);
1469 }
1470
1471 /*
1472  * same as comp_keys only with two btrfs_key's
1473  */
1474 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1475 {
1476         if (k1->objectid > k2->objectid)
1477                 return 1;
1478         if (k1->objectid < k2->objectid)
1479                 return -1;
1480         if (k1->type > k2->type)
1481                 return 1;
1482         if (k1->type < k2->type)
1483                 return -1;
1484         if (k1->offset > k2->offset)
1485                 return 1;
1486         if (k1->offset < k2->offset)
1487                 return -1;
1488         return 0;
1489 }
1490
1491 /*
1492  * this is used by the defrag code to go through all the
1493  * leaves pointed to by a node and reallocate them so that
1494  * disk order is close to key order
1495  */
1496 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1497                        struct btrfs_root *root, struct extent_buffer *parent,
1498                        int start_slot, u64 *last_ret,
1499                        struct btrfs_key *progress)
1500 {
1501         struct extent_buffer *cur;
1502         u64 blocknr;
1503         u64 gen;
1504         u64 search_start = *last_ret;
1505         u64 last_block = 0;
1506         u64 other;
1507         u32 parent_nritems;
1508         int end_slot;
1509         int i;
1510         int err = 0;
1511         int parent_level;
1512         int uptodate;
1513         u32 blocksize;
1514         int progress_passed = 0;
1515         struct btrfs_disk_key disk_key;
1516
1517         parent_level = btrfs_header_level(parent);
1518
1519         WARN_ON(trans->transaction != root->fs_info->running_transaction);
1520         WARN_ON(trans->transid != root->fs_info->generation);
1521
1522         parent_nritems = btrfs_header_nritems(parent);
1523         blocksize = btrfs_level_size(root, parent_level - 1);
1524         end_slot = parent_nritems;
1525
1526         if (parent_nritems == 1)
1527                 return 0;
1528
1529         btrfs_set_lock_blocking(parent);
1530
1531         for (i = start_slot; i < end_slot; i++) {
1532                 int close = 1;
1533
1534                 btrfs_node_key(parent, &disk_key, i);
1535                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1536                         continue;
1537
1538                 progress_passed = 1;
1539                 blocknr = btrfs_node_blockptr(parent, i);
1540                 gen = btrfs_node_ptr_generation(parent, i);
1541                 if (last_block == 0)
1542                         last_block = blocknr;
1543
1544                 if (i > 0) {
1545                         other = btrfs_node_blockptr(parent, i - 1);
1546                         close = close_blocks(blocknr, other, blocksize);
1547                 }
1548                 if (!close && i < end_slot - 2) {
1549                         other = btrfs_node_blockptr(parent, i + 1);
1550                         close = close_blocks(blocknr, other, blocksize);
1551                 }
1552                 if (close) {
1553                         last_block = blocknr;
1554                         continue;
1555                 }
1556
1557                 cur = btrfs_find_tree_block(root, blocknr, blocksize);
1558                 if (cur)
1559                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1560                 else
1561                         uptodate = 0;
1562                 if (!cur || !uptodate) {
1563                         if (!cur) {
1564                                 cur = read_tree_block(root, blocknr,
1565                                                          blocksize, gen);
1566                                 if (!cur || !extent_buffer_uptodate(cur)) {
1567                                         free_extent_buffer(cur);
1568                                         return -EIO;
1569                                 }
1570                         } else if (!uptodate) {
1571                                 err = btrfs_read_buffer(cur, gen);
1572                                 if (err) {
1573                                         free_extent_buffer(cur);
1574                                         return err;
1575                                 }
1576                         }
1577                 }
1578                 if (search_start == 0)
1579                         search_start = last_block;
1580
1581                 btrfs_tree_lock(cur);
1582                 btrfs_set_lock_blocking(cur);
1583                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1584                                         &cur, search_start,
1585                                         min(16 * blocksize,
1586                                             (end_slot - i) * blocksize));
1587                 if (err) {
1588                         btrfs_tree_unlock(cur);
1589                         free_extent_buffer(cur);
1590                         break;
1591                 }
1592                 search_start = cur->start;
1593                 last_block = cur->start;
1594                 *last_ret = search_start;
1595                 btrfs_tree_unlock(cur);
1596                 free_extent_buffer(cur);
1597         }
1598         return err;
1599 }
1600
1601 /*
1602  * The leaf data grows from end-to-front in the node.
1603  * this returns the address of the start of the last item,
1604  * which is the stop of the leaf data stack
1605  */
1606 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1607                                          struct extent_buffer *leaf)
1608 {
1609         u32 nr = btrfs_header_nritems(leaf);
1610         if (nr == 0)
1611                 return BTRFS_LEAF_DATA_SIZE(root);
1612         return btrfs_item_offset_nr(leaf, nr - 1);
1613 }
1614
1615
1616 /*
1617  * search for key in the extent_buffer.  The items start at offset p,
1618  * and they are item_size apart.  There are 'max' items in p.
1619  *
1620  * the slot in the array is returned via slot, and it points to
1621  * the place where you would insert key if it is not found in
1622  * the array.
1623  *
1624  * slot may point to max if the key is bigger than all of the keys
1625  */
1626 static noinline int generic_bin_search(struct extent_buffer *eb,
1627                                        unsigned long p,
1628                                        int item_size, struct btrfs_key *key,
1629                                        int max, int *slot)
1630 {
1631         int low = 0;
1632         int high = max;
1633         int mid;
1634         int ret;
1635         struct btrfs_disk_key *tmp = NULL;
1636         struct btrfs_disk_key unaligned;
1637         unsigned long offset;
1638         char *kaddr = NULL;
1639         unsigned long map_start = 0;
1640         unsigned long map_len = 0;
1641         int err;
1642
1643         while (low < high) {
1644                 mid = (low + high) / 2;
1645                 offset = p + mid * item_size;
1646
1647                 if (!kaddr || offset < map_start ||
1648                     (offset + sizeof(struct btrfs_disk_key)) >
1649                     map_start + map_len) {
1650
1651                         err = map_private_extent_buffer(eb, offset,
1652                                                 sizeof(struct btrfs_disk_key),
1653                                                 &kaddr, &map_start, &map_len);
1654
1655                         if (!err) {
1656                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1657                                                         map_start);
1658                         } else {
1659                                 read_extent_buffer(eb, &unaligned,
1660                                                    offset, sizeof(unaligned));
1661                                 tmp = &unaligned;
1662                         }
1663
1664                 } else {
1665                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1666                                                         map_start);
1667                 }
1668                 ret = comp_keys(tmp, key);
1669
1670                 if (ret < 0)
1671                         low = mid + 1;
1672                 else if (ret > 0)
1673                         high = mid;
1674                 else {
1675                         *slot = mid;
1676                         return 0;
1677                 }
1678         }
1679         *slot = low;
1680         return 1;
1681 }
1682
1683 /*
1684  * simple bin_search frontend that does the right thing for
1685  * leaves vs nodes
1686  */
1687 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1688                       int level, int *slot)
1689 {
1690         if (level == 0)
1691                 return generic_bin_search(eb,
1692                                           offsetof(struct btrfs_leaf, items),
1693                                           sizeof(struct btrfs_item),
1694                                           key, btrfs_header_nritems(eb),
1695                                           slot);
1696         else
1697                 return generic_bin_search(eb,
1698                                           offsetof(struct btrfs_node, ptrs),
1699                                           sizeof(struct btrfs_key_ptr),
1700                                           key, btrfs_header_nritems(eb),
1701                                           slot);
1702 }
1703
1704 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1705                      int level, int *slot)
1706 {
1707         return bin_search(eb, key, level, slot);
1708 }
1709
1710 static void root_add_used(struct btrfs_root *root, u32 size)
1711 {
1712         spin_lock(&root->accounting_lock);
1713         btrfs_set_root_used(&root->root_item,
1714                             btrfs_root_used(&root->root_item) + size);
1715         spin_unlock(&root->accounting_lock);
1716 }
1717
1718 static void root_sub_used(struct btrfs_root *root, u32 size)
1719 {
1720         spin_lock(&root->accounting_lock);
1721         btrfs_set_root_used(&root->root_item,
1722                             btrfs_root_used(&root->root_item) - size);
1723         spin_unlock(&root->accounting_lock);
1724 }
1725
1726 /* given a node and slot number, this reads the blocks it points to.  The
1727  * extent buffer is returned with a reference taken (but unlocked).
1728  * NULL is returned on error.
1729  */
1730 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1731                                    struct extent_buffer *parent, int slot)
1732 {
1733         int level = btrfs_header_level(parent);
1734         struct extent_buffer *eb;
1735
1736         if (slot < 0)
1737                 return NULL;
1738         if (slot >= btrfs_header_nritems(parent))
1739                 return NULL;
1740
1741         BUG_ON(level == 0);
1742
1743         eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1744                              btrfs_level_size(root, level - 1),
1745                              btrfs_node_ptr_generation(parent, slot));
1746         if (eb && !extent_buffer_uptodate(eb)) {
1747                 free_extent_buffer(eb);
1748                 eb = NULL;
1749         }
1750
1751         return eb;
1752 }
1753
1754 /*
1755  * node level balancing, used to make sure nodes are in proper order for
1756  * item deletion.  We balance from the top down, so we have to make sure
1757  * that a deletion won't leave an node completely empty later on.
1758  */
1759 static noinline int balance_level(struct btrfs_trans_handle *trans,
1760                          struct btrfs_root *root,
1761                          struct btrfs_path *path, int level)
1762 {
1763         struct extent_buffer *right = NULL;
1764         struct extent_buffer *mid;
1765         struct extent_buffer *left = NULL;
1766         struct extent_buffer *parent = NULL;
1767         int ret = 0;
1768         int wret;
1769         int pslot;
1770         int orig_slot = path->slots[level];
1771         u64 orig_ptr;
1772
1773         if (level == 0)
1774                 return 0;
1775
1776         mid = path->nodes[level];
1777
1778         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1779                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1780         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1781
1782         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1783
1784         if (level < BTRFS_MAX_LEVEL - 1) {
1785                 parent = path->nodes[level + 1];
1786                 pslot = path->slots[level + 1];
1787         }
1788
1789         /*
1790          * deal with the case where there is only one pointer in the root
1791          * by promoting the node below to a root
1792          */
1793         if (!parent) {
1794                 struct extent_buffer *child;
1795
1796                 if (btrfs_header_nritems(mid) != 1)
1797                         return 0;
1798
1799                 /* promote the child to a root */
1800                 child = read_node_slot(root, mid, 0);
1801                 if (!child) {
1802                         ret = -EROFS;
1803                         btrfs_std_error(root->fs_info, ret);
1804                         goto enospc;
1805                 }
1806
1807                 btrfs_tree_lock(child);
1808                 btrfs_set_lock_blocking(child);
1809                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1810                 if (ret) {
1811                         btrfs_tree_unlock(child);
1812                         free_extent_buffer(child);
1813                         goto enospc;
1814                 }
1815
1816                 tree_mod_log_set_root_pointer(root, child, 1);
1817                 rcu_assign_pointer(root->node, child);
1818
1819                 add_root_to_dirty_list(root);
1820                 btrfs_tree_unlock(child);
1821
1822                 path->locks[level] = 0;
1823                 path->nodes[level] = NULL;
1824                 clean_tree_block(trans, root, mid);
1825                 btrfs_tree_unlock(mid);
1826                 /* once for the path */
1827                 free_extent_buffer(mid);
1828
1829                 root_sub_used(root, mid->len);
1830                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1831                 /* once for the root ptr */
1832                 free_extent_buffer_stale(mid);
1833                 return 0;
1834         }
1835         if (btrfs_header_nritems(mid) >
1836             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1837                 return 0;
1838
1839         left = read_node_slot(root, parent, pslot - 1);
1840         if (left) {
1841                 btrfs_tree_lock(left);
1842                 btrfs_set_lock_blocking(left);
1843                 wret = btrfs_cow_block(trans, root, left,
1844                                        parent, pslot - 1, &left);
1845                 if (wret) {
1846                         ret = wret;
1847                         goto enospc;
1848                 }
1849         }
1850         right = read_node_slot(root, parent, pslot + 1);
1851         if (right) {
1852                 btrfs_tree_lock(right);
1853                 btrfs_set_lock_blocking(right);
1854                 wret = btrfs_cow_block(trans, root, right,
1855                                        parent, pslot + 1, &right);
1856                 if (wret) {
1857                         ret = wret;
1858                         goto enospc;
1859                 }
1860         }
1861
1862         /* first, try to make some room in the middle buffer */
1863         if (left) {
1864                 orig_slot += btrfs_header_nritems(left);
1865                 wret = push_node_left(trans, root, left, mid, 1);
1866                 if (wret < 0)
1867                         ret = wret;
1868         }
1869
1870         /*
1871          * then try to empty the right most buffer into the middle
1872          */
1873         if (right) {
1874                 wret = push_node_left(trans, root, mid, right, 1);
1875                 if (wret < 0 && wret != -ENOSPC)
1876                         ret = wret;
1877                 if (btrfs_header_nritems(right) == 0) {
1878                         clean_tree_block(trans, root, right);
1879                         btrfs_tree_unlock(right);
1880                         del_ptr(root, path, level + 1, pslot + 1);
1881                         root_sub_used(root, right->len);
1882                         btrfs_free_tree_block(trans, root, right, 0, 1);
1883                         free_extent_buffer_stale(right);
1884                         right = NULL;
1885                 } else {
1886                         struct btrfs_disk_key right_key;
1887                         btrfs_node_key(right, &right_key, 0);
1888                         tree_mod_log_set_node_key(root->fs_info, parent,
1889                                                   pslot + 1, 0);
1890                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1891                         btrfs_mark_buffer_dirty(parent);
1892                 }
1893         }
1894         if (btrfs_header_nritems(mid) == 1) {
1895                 /*
1896                  * we're not allowed to leave a node with one item in the
1897                  * tree during a delete.  A deletion from lower in the tree
1898                  * could try to delete the only pointer in this node.
1899                  * So, pull some keys from the left.
1900                  * There has to be a left pointer at this point because
1901                  * otherwise we would have pulled some pointers from the
1902                  * right
1903                  */
1904                 if (!left) {
1905                         ret = -EROFS;
1906                         btrfs_std_error(root->fs_info, ret);
1907                         goto enospc;
1908                 }
1909                 wret = balance_node_right(trans, root, mid, left);
1910                 if (wret < 0) {
1911                         ret = wret;
1912                         goto enospc;
1913                 }
1914                 if (wret == 1) {
1915                         wret = push_node_left(trans, root, left, mid, 1);
1916                         if (wret < 0)
1917                                 ret = wret;
1918                 }
1919                 BUG_ON(wret == 1);
1920         }
1921         if (btrfs_header_nritems(mid) == 0) {
1922                 clean_tree_block(trans, root, mid);
1923                 btrfs_tree_unlock(mid);
1924                 del_ptr(root, path, level + 1, pslot);
1925                 root_sub_used(root, mid->len);
1926                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1927                 free_extent_buffer_stale(mid);
1928                 mid = NULL;
1929         } else {
1930                 /* update the parent key to reflect our changes */
1931                 struct btrfs_disk_key mid_key;
1932                 btrfs_node_key(mid, &mid_key, 0);
1933                 tree_mod_log_set_node_key(root->fs_info, parent,
1934                                           pslot, 0);
1935                 btrfs_set_node_key(parent, &mid_key, pslot);
1936                 btrfs_mark_buffer_dirty(parent);
1937         }
1938
1939         /* update the path */
1940         if (left) {
1941                 if (btrfs_header_nritems(left) > orig_slot) {
1942                         extent_buffer_get(left);
1943                         /* left was locked after cow */
1944                         path->nodes[level] = left;
1945                         path->slots[level + 1] -= 1;
1946                         path->slots[level] = orig_slot;
1947                         if (mid) {
1948                                 btrfs_tree_unlock(mid);
1949                                 free_extent_buffer(mid);
1950                         }
1951                 } else {
1952                         orig_slot -= btrfs_header_nritems(left);
1953                         path->slots[level] = orig_slot;
1954                 }
1955         }
1956         /* double check we haven't messed things up */
1957         if (orig_ptr !=
1958             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1959                 BUG();
1960 enospc:
1961         if (right) {
1962                 btrfs_tree_unlock(right);
1963                 free_extent_buffer(right);
1964         }
1965         if (left) {
1966                 if (path->nodes[level] != left)
1967                         btrfs_tree_unlock(left);
1968                 free_extent_buffer(left);
1969         }
1970         return ret;
1971 }
1972
1973 /* Node balancing for insertion.  Here we only split or push nodes around
1974  * when they are completely full.  This is also done top down, so we
1975  * have to be pessimistic.
1976  */
1977 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1978                                           struct btrfs_root *root,
1979                                           struct btrfs_path *path, int level)
1980 {
1981         struct extent_buffer *right = NULL;
1982         struct extent_buffer *mid;
1983         struct extent_buffer *left = NULL;
1984         struct extent_buffer *parent = NULL;
1985         int ret = 0;
1986         int wret;
1987         int pslot;
1988         int orig_slot = path->slots[level];
1989
1990         if (level == 0)
1991                 return 1;
1992
1993         mid = path->nodes[level];
1994         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1995
1996         if (level < BTRFS_MAX_LEVEL - 1) {
1997                 parent = path->nodes[level + 1];
1998                 pslot = path->slots[level + 1];
1999         }
2000
2001         if (!parent)
2002                 return 1;
2003
2004         left = read_node_slot(root, parent, pslot - 1);
2005
2006         /* first, try to make some room in the middle buffer */
2007         if (left) {
2008                 u32 left_nr;
2009
2010                 btrfs_tree_lock(left);
2011                 btrfs_set_lock_blocking(left);
2012
2013                 left_nr = btrfs_header_nritems(left);
2014                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2015                         wret = 1;
2016                 } else {
2017                         ret = btrfs_cow_block(trans, root, left, parent,
2018                                               pslot - 1, &left);
2019                         if (ret)
2020                                 wret = 1;
2021                         else {
2022                                 wret = push_node_left(trans, root,
2023                                                       left, mid, 0);
2024                         }
2025                 }
2026                 if (wret < 0)
2027                         ret = wret;
2028                 if (wret == 0) {
2029                         struct btrfs_disk_key disk_key;
2030                         orig_slot += left_nr;
2031                         btrfs_node_key(mid, &disk_key, 0);
2032                         tree_mod_log_set_node_key(root->fs_info, parent,
2033                                                   pslot, 0);
2034                         btrfs_set_node_key(parent, &disk_key, pslot);
2035                         btrfs_mark_buffer_dirty(parent);
2036                         if (btrfs_header_nritems(left) > orig_slot) {
2037                                 path->nodes[level] = left;
2038                                 path->slots[level + 1] -= 1;
2039                                 path->slots[level] = orig_slot;
2040                                 btrfs_tree_unlock(mid);
2041                                 free_extent_buffer(mid);
2042                         } else {
2043                                 orig_slot -=
2044                                         btrfs_header_nritems(left);
2045                                 path->slots[level] = orig_slot;
2046                                 btrfs_tree_unlock(left);
2047                                 free_extent_buffer(left);
2048                         }
2049                         return 0;
2050                 }
2051                 btrfs_tree_unlock(left);
2052                 free_extent_buffer(left);
2053         }
2054         right = read_node_slot(root, parent, pslot + 1);
2055
2056         /*
2057          * then try to empty the right most buffer into the middle
2058          */
2059         if (right) {
2060                 u32 right_nr;
2061
2062                 btrfs_tree_lock(right);
2063                 btrfs_set_lock_blocking(right);
2064
2065                 right_nr = btrfs_header_nritems(right);
2066                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2067                         wret = 1;
2068                 } else {
2069                         ret = btrfs_cow_block(trans, root, right,
2070                                               parent, pslot + 1,
2071                                               &right);
2072                         if (ret)
2073                                 wret = 1;
2074                         else {
2075                                 wret = balance_node_right(trans, root,
2076                                                           right, mid);
2077                         }
2078                 }
2079                 if (wret < 0)
2080                         ret = wret;
2081                 if (wret == 0) {
2082                         struct btrfs_disk_key disk_key;
2083
2084                         btrfs_node_key(right, &disk_key, 0);
2085                         tree_mod_log_set_node_key(root->fs_info, parent,
2086                                                   pslot + 1, 0);
2087                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2088                         btrfs_mark_buffer_dirty(parent);
2089
2090                         if (btrfs_header_nritems(mid) <= orig_slot) {
2091                                 path->nodes[level] = right;
2092                                 path->slots[level + 1] += 1;
2093                                 path->slots[level] = orig_slot -
2094                                         btrfs_header_nritems(mid);
2095                                 btrfs_tree_unlock(mid);
2096                                 free_extent_buffer(mid);
2097                         } else {
2098                                 btrfs_tree_unlock(right);
2099                                 free_extent_buffer(right);
2100                         }
2101                         return 0;
2102                 }
2103                 btrfs_tree_unlock(right);
2104                 free_extent_buffer(right);
2105         }
2106         return 1;
2107 }
2108
2109 /*
2110  * readahead one full node of leaves, finding things that are close
2111  * to the block in 'slot', and triggering ra on them.
2112  */
2113 static void reada_for_search(struct btrfs_root *root,
2114                              struct btrfs_path *path,
2115                              int level, int slot, u64 objectid)
2116 {
2117         struct extent_buffer *node;
2118         struct btrfs_disk_key disk_key;
2119         u32 nritems;
2120         u64 search;
2121         u64 target;
2122         u64 nread = 0;
2123         u64 gen;
2124         int direction = path->reada;
2125         struct extent_buffer *eb;
2126         u32 nr;
2127         u32 blocksize;
2128         u32 nscan = 0;
2129
2130         if (level != 1)
2131                 return;
2132
2133         if (!path->nodes[level])
2134                 return;
2135
2136         node = path->nodes[level];
2137
2138         search = btrfs_node_blockptr(node, slot);
2139         blocksize = btrfs_level_size(root, level - 1);
2140         eb = btrfs_find_tree_block(root, search, blocksize);
2141         if (eb) {
2142                 free_extent_buffer(eb);
2143                 return;
2144         }
2145
2146         target = search;
2147
2148         nritems = btrfs_header_nritems(node);
2149         nr = slot;
2150
2151         while (1) {
2152                 if (direction < 0) {
2153                         if (nr == 0)
2154                                 break;
2155                         nr--;
2156                 } else if (direction > 0) {
2157                         nr++;
2158                         if (nr >= nritems)
2159                                 break;
2160                 }
2161                 if (path->reada < 0 && objectid) {
2162                         btrfs_node_key(node, &disk_key, nr);
2163                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2164                                 break;
2165                 }
2166                 search = btrfs_node_blockptr(node, nr);
2167                 if ((search <= target && target - search <= 65536) ||
2168                     (search > target && search - target <= 65536)) {
2169                         gen = btrfs_node_ptr_generation(node, nr);
2170                         readahead_tree_block(root, search, blocksize, gen);
2171                         nread += blocksize;
2172                 }
2173                 nscan++;
2174                 if ((nread > 65536 || nscan > 32))
2175                         break;
2176         }
2177 }
2178
2179 /*
2180  * returns -EAGAIN if it had to drop the path, or zero if everything was in
2181  * cache
2182  */
2183 static noinline int reada_for_balance(struct btrfs_root *root,
2184                                       struct btrfs_path *path, int level)
2185 {
2186         int slot;
2187         int nritems;
2188         struct extent_buffer *parent;
2189         struct extent_buffer *eb;
2190         u64 gen;
2191         u64 block1 = 0;
2192         u64 block2 = 0;
2193         int ret = 0;
2194         int blocksize;
2195
2196         parent = path->nodes[level + 1];
2197         if (!parent)
2198                 return 0;
2199
2200         nritems = btrfs_header_nritems(parent);
2201         slot = path->slots[level + 1];
2202         blocksize = btrfs_level_size(root, level);
2203
2204         if (slot > 0) {
2205                 block1 = btrfs_node_blockptr(parent, slot - 1);
2206                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2207                 eb = btrfs_find_tree_block(root, block1, blocksize);
2208                 /*
2209                  * if we get -eagain from btrfs_buffer_uptodate, we
2210                  * don't want to return eagain here.  That will loop
2211                  * forever
2212                  */
2213                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2214                         block1 = 0;
2215                 free_extent_buffer(eb);
2216         }
2217         if (slot + 1 < nritems) {
2218                 block2 = btrfs_node_blockptr(parent, slot + 1);
2219                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2220                 eb = btrfs_find_tree_block(root, block2, blocksize);
2221                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2222                         block2 = 0;
2223                 free_extent_buffer(eb);
2224         }
2225         if (block1 || block2) {
2226                 ret = -EAGAIN;
2227
2228                 /* release the whole path */
2229                 btrfs_release_path(path);
2230
2231                 /* read the blocks */
2232                 if (block1)
2233                         readahead_tree_block(root, block1, blocksize, 0);
2234                 if (block2)
2235                         readahead_tree_block(root, block2, blocksize, 0);
2236
2237                 if (block1) {
2238                         eb = read_tree_block(root, block1, blocksize, 0);
2239                         free_extent_buffer(eb);
2240                 }
2241                 if (block2) {
2242                         eb = read_tree_block(root, block2, blocksize, 0);
2243                         free_extent_buffer(eb);
2244                 }
2245         }
2246         return ret;
2247 }
2248
2249
2250 /*
2251  * when we walk down the tree, it is usually safe to unlock the higher layers
2252  * in the tree.  The exceptions are when our path goes through slot 0, because
2253  * operations on the tree might require changing key pointers higher up in the
2254  * tree.
2255  *
2256  * callers might also have set path->keep_locks, which tells this code to keep
2257  * the lock if the path points to the last slot in the block.  This is part of
2258  * walking through the tree, and selecting the next slot in the higher block.
2259  *
2260  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2261  * if lowest_unlock is 1, level 0 won't be unlocked
2262  */
2263 static noinline void unlock_up(struct btrfs_path *path, int level,
2264                                int lowest_unlock, int min_write_lock_level,
2265                                int *write_lock_level)
2266 {
2267         int i;
2268         int skip_level = level;
2269         int no_skips = 0;
2270         struct extent_buffer *t;
2271
2272         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2273                 if (!path->nodes[i])
2274                         break;
2275                 if (!path->locks[i])
2276                         break;
2277                 if (!no_skips && path->slots[i] == 0) {
2278                         skip_level = i + 1;
2279                         continue;
2280                 }
2281                 if (!no_skips && path->keep_locks) {
2282                         u32 nritems;
2283                         t = path->nodes[i];
2284                         nritems = btrfs_header_nritems(t);
2285                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2286                                 skip_level = i + 1;
2287                                 continue;
2288                         }
2289                 }
2290                 if (skip_level < i && i >= lowest_unlock)
2291                         no_skips = 1;
2292
2293                 t = path->nodes[i];
2294                 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2295                         btrfs_tree_unlock_rw(t, path->locks[i]);
2296                         path->locks[i] = 0;
2297                         if (write_lock_level &&
2298                             i > min_write_lock_level &&
2299                             i <= *write_lock_level) {
2300                                 *write_lock_level = i - 1;
2301                         }
2302                 }
2303         }
2304 }
2305
2306 /*
2307  * This releases any locks held in the path starting at level and
2308  * going all the way up to the root.
2309  *
2310  * btrfs_search_slot will keep the lock held on higher nodes in a few
2311  * corner cases, such as COW of the block at slot zero in the node.  This
2312  * ignores those rules, and it should only be called when there are no
2313  * more updates to be done higher up in the tree.
2314  */
2315 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2316 {
2317         int i;
2318
2319         if (path->keep_locks)
2320                 return;
2321
2322         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2323                 if (!path->nodes[i])
2324                         continue;
2325                 if (!path->locks[i])
2326                         continue;
2327                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2328                 path->locks[i] = 0;
2329         }
2330 }
2331
2332 /*
2333  * helper function for btrfs_search_slot.  The goal is to find a block
2334  * in cache without setting the path to blocking.  If we find the block
2335  * we return zero and the path is unchanged.
2336  *
2337  * If we can't find the block, we set the path blocking and do some
2338  * reada.  -EAGAIN is returned and the search must be repeated.
2339  */
2340 static int
2341 read_block_for_search(struct btrfs_trans_handle *trans,
2342                        struct btrfs_root *root, struct btrfs_path *p,
2343                        struct extent_buffer **eb_ret, int level, int slot,
2344                        struct btrfs_key *key, u64 time_seq)
2345 {
2346         u64 blocknr;
2347         u64 gen;
2348         u32 blocksize;
2349         struct extent_buffer *b = *eb_ret;
2350         struct extent_buffer *tmp;
2351         int ret;
2352
2353         blocknr = btrfs_node_blockptr(b, slot);
2354         gen = btrfs_node_ptr_generation(b, slot);
2355         blocksize = btrfs_level_size(root, level - 1);
2356
2357         tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2358         if (tmp) {
2359                 /* first we do an atomic uptodate check */
2360                 if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
2361                         if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2362                                 /*
2363                                  * we found an up to date block without
2364                                  * sleeping, return
2365                                  * right away
2366                                  */
2367                                 *eb_ret = tmp;
2368                                 return 0;
2369                         }
2370                         /* the pages were up to date, but we failed
2371                          * the generation number check.  Do a full
2372                          * read for the generation number that is correct.
2373                          * We must do this without dropping locks so
2374                          * we can trust our generation number
2375                          */
2376                         free_extent_buffer(tmp);
2377                         btrfs_set_path_blocking(p);
2378
2379                         /* now we're allowed to do a blocking uptodate check */
2380                         tmp = read_tree_block(root, blocknr, blocksize, gen);
2381                         if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
2382                                 *eb_ret = tmp;
2383                                 return 0;
2384                         }
2385                         free_extent_buffer(tmp);
2386                         btrfs_release_path(p);
2387                         return -EIO;
2388                 }
2389         }
2390
2391         /*
2392          * reduce lock contention at high levels
2393          * of the btree by dropping locks before
2394          * we read.  Don't release the lock on the current
2395          * level because we need to walk this node to figure
2396          * out which blocks to read.
2397          */
2398         btrfs_unlock_up_safe(p, level + 1);
2399         btrfs_set_path_blocking(p);
2400
2401         free_extent_buffer(tmp);
2402         if (p->reada)
2403                 reada_for_search(root, p, level, slot, key->objectid);
2404
2405         btrfs_release_path(p);
2406
2407         ret = -EAGAIN;
2408         tmp = read_tree_block(root, blocknr, blocksize, 0);
2409         if (tmp) {
2410                 /*
2411                  * If the read above didn't mark this buffer up to date,
2412                  * it will never end up being up to date.  Set ret to EIO now
2413                  * and give up so that our caller doesn't loop forever
2414                  * on our EAGAINs.
2415                  */
2416                 if (!btrfs_buffer_uptodate(tmp, 0, 0))
2417                         ret = -EIO;
2418                 free_extent_buffer(tmp);
2419         }
2420         return ret;
2421 }
2422
2423 /*
2424  * helper function for btrfs_search_slot.  This does all of the checks
2425  * for node-level blocks and does any balancing required based on
2426  * the ins_len.
2427  *
2428  * If no extra work was required, zero is returned.  If we had to
2429  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2430  * start over
2431  */
2432 static int
2433 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2434                        struct btrfs_root *root, struct btrfs_path *p,
2435                        struct extent_buffer *b, int level, int ins_len,
2436                        int *write_lock_level)
2437 {
2438         int ret;
2439         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2440             BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2441                 int sret;
2442
2443                 if (*write_lock_level < level + 1) {
2444                         *write_lock_level = level + 1;
2445                         btrfs_release_path(p);
2446                         goto again;
2447                 }
2448
2449                 sret = reada_for_balance(root, p, level);
2450                 if (sret)
2451                         goto again;
2452
2453                 btrfs_set_path_blocking(p);
2454                 sret = split_node(trans, root, p, level);
2455                 btrfs_clear_path_blocking(p, NULL, 0);
2456
2457                 BUG_ON(sret > 0);
2458                 if (sret) {
2459                         ret = sret;
2460                         goto done;
2461                 }
2462                 b = p->nodes[level];
2463         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2464                    BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2465                 int sret;
2466
2467                 if (*write_lock_level < level + 1) {
2468                         *write_lock_level = level + 1;
2469                         btrfs_release_path(p);
2470                         goto again;
2471                 }
2472
2473                 sret = reada_for_balance(root, p, level);
2474                 if (sret)
2475                         goto again;
2476
2477                 btrfs_set_path_blocking(p);
2478                 sret = balance_level(trans, root, p, level);
2479                 btrfs_clear_path_blocking(p, NULL, 0);
2480
2481                 if (sret) {
2482                         ret = sret;
2483                         goto done;
2484                 }
2485                 b = p->nodes[level];
2486                 if (!b) {
2487                         btrfs_release_path(p);
2488                         goto again;
2489                 }
2490                 BUG_ON(btrfs_header_nritems(b) == 1);
2491         }
2492         return 0;
2493
2494 again:
2495         ret = -EAGAIN;
2496 done:
2497         return ret;
2498 }
2499
2500 /*
2501  * look for key in the tree.  path is filled in with nodes along the way
2502  * if key is found, we return zero and you can find the item in the leaf
2503  * level of the path (level 0)
2504  *
2505  * If the key isn't found, the path points to the slot where it should
2506  * be inserted, and 1 is returned.  If there are other errors during the
2507  * search a negative error number is returned.
2508  *
2509  * if ins_len > 0, nodes and leaves will be split as we walk down the
2510  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2511  * possible)
2512  */
2513 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2514                       *root, struct btrfs_key *key, struct btrfs_path *p, int
2515                       ins_len, int cow)
2516 {
2517         struct extent_buffer *b;
2518         int slot;
2519         int ret;
2520         int err;
2521         int level;
2522         int lowest_unlock = 1;
2523         int root_lock;
2524         /* everything at write_lock_level or lower must be write locked */
2525         int write_lock_level = 0;
2526         u8 lowest_level = 0;
2527         int min_write_lock_level;
2528
2529         lowest_level = p->lowest_level;
2530         WARN_ON(lowest_level && ins_len > 0);
2531         WARN_ON(p->nodes[0] != NULL);
2532
2533         if (ins_len < 0) {
2534                 lowest_unlock = 2;
2535
2536                 /* when we are removing items, we might have to go up to level
2537                  * two as we update tree pointers  Make sure we keep write
2538                  * for those levels as well
2539                  */
2540                 write_lock_level = 2;
2541         } else if (ins_len > 0) {
2542                 /*
2543                  * for inserting items, make sure we have a write lock on
2544                  * level 1 so we can update keys
2545                  */
2546                 write_lock_level = 1;
2547         }
2548
2549         if (!cow)
2550                 write_lock_level = -1;
2551
2552         if (cow && (p->keep_locks || p->lowest_level))
2553                 write_lock_level = BTRFS_MAX_LEVEL;
2554
2555         min_write_lock_level = write_lock_level;
2556
2557 again:
2558         /*
2559          * we try very hard to do read locks on the root
2560          */
2561         root_lock = BTRFS_READ_LOCK;
2562         level = 0;
2563         if (p->search_commit_root) {
2564                 /*
2565                  * the commit roots are read only
2566                  * so we always do read locks
2567                  */
2568                 b = root->commit_root;
2569                 extent_buffer_get(b);
2570                 level = btrfs_header_level(b);
2571                 if (!p->skip_locking)
2572                         btrfs_tree_read_lock(b);
2573         } else {
2574                 if (p->skip_locking) {
2575                         b = btrfs_root_node(root);
2576                         level = btrfs_header_level(b);
2577                 } else {
2578                         /* we don't know the level of the root node
2579                          * until we actually have it read locked
2580                          */
2581                         b = btrfs_read_lock_root_node(root);
2582                         level = btrfs_header_level(b);
2583                         if (level <= write_lock_level) {
2584                                 /* whoops, must trade for write lock */
2585                                 btrfs_tree_read_unlock(b);
2586                                 free_extent_buffer(b);
2587                                 b = btrfs_lock_root_node(root);
2588                                 root_lock = BTRFS_WRITE_LOCK;
2589
2590                                 /* the level might have changed, check again */
2591                                 level = btrfs_header_level(b);
2592                         }
2593                 }
2594         }
2595         p->nodes[level] = b;
2596         if (!p->skip_locking)
2597                 p->locks[level] = root_lock;
2598
2599         while (b) {
2600                 level = btrfs_header_level(b);
2601
2602                 /*
2603                  * setup the path here so we can release it under lock
2604                  * contention with the cow code
2605                  */
2606                 if (cow) {
2607                         /*
2608                          * if we don't really need to cow this block
2609                          * then we don't want to set the path blocking,
2610                          * so we test it here
2611                          */
2612                         if (!should_cow_block(trans, root, b))
2613                                 goto cow_done;
2614
2615                         btrfs_set_path_blocking(p);
2616
2617                         /*
2618                          * must have write locks on this node and the
2619                          * parent
2620                          */
2621                         if (level > write_lock_level ||
2622                             (level + 1 > write_lock_level &&
2623                             level + 1 < BTRFS_MAX_LEVEL &&
2624                             p->nodes[level + 1])) {
2625                                 write_lock_level = level + 1;
2626                                 btrfs_release_path(p);
2627                                 goto again;
2628                         }
2629
2630                         err = btrfs_cow_block(trans, root, b,
2631                                               p->nodes[level + 1],
2632                                               p->slots[level + 1], &b);
2633                         if (err) {
2634                                 ret = err;
2635                                 goto done;
2636                         }
2637                 }
2638 cow_done:
2639                 BUG_ON(!cow && ins_len);
2640
2641                 p->nodes[level] = b;
2642                 btrfs_clear_path_blocking(p, NULL, 0);
2643
2644                 /*
2645                  * we have a lock on b and as long as we aren't changing
2646                  * the tree, there is no way to for the items in b to change.
2647                  * It is safe to drop the lock on our parent before we
2648                  * go through the expensive btree search on b.
2649                  *
2650                  * If cow is true, then we might be changing slot zero,
2651                  * which may require changing the parent.  So, we can't
2652                  * drop the lock until after we know which slot we're
2653                  * operating on.
2654                  */
2655                 if (!cow)
2656                         btrfs_unlock_up_safe(p, level + 1);
2657
2658                 ret = bin_search(b, key, level, &slot);
2659
2660                 if (level != 0) {
2661                         int dec = 0;
2662                         if (ret && slot > 0) {
2663                                 dec = 1;
2664                                 slot -= 1;
2665                         }
2666                         p->slots[level] = slot;
2667                         err = setup_nodes_for_search(trans, root, p, b, level,
2668                                              ins_len, &write_lock_level);
2669                         if (err == -EAGAIN)
2670                                 goto again;
2671                         if (err) {
2672                                 ret = err;
2673                                 goto done;
2674                         }
2675                         b = p->nodes[level];
2676                         slot = p->slots[level];
2677
2678                         /*
2679                          * slot 0 is special, if we change the key
2680                          * we have to update the parent pointer
2681                          * which means we must have a write lock
2682                          * on the parent
2683                          */
2684                         if (slot == 0 && cow &&
2685                             write_lock_level < level + 1) {
2686                                 write_lock_level = level + 1;
2687                                 btrfs_release_path(p);
2688                                 goto again;
2689                         }
2690
2691                         unlock_up(p, level, lowest_unlock,
2692                                   min_write_lock_level, &write_lock_level);
2693
2694                         if (level == lowest_level) {
2695                                 if (dec)
2696                                         p->slots[level]++;
2697                                 goto done;
2698                         }
2699
2700                         err = read_block_for_search(trans, root, p,
2701                                                     &b, level, slot, key, 0);
2702                         if (err == -EAGAIN)
2703                                 goto again;
2704                         if (err) {
2705                                 ret = err;
2706                                 goto done;
2707                         }
2708
2709                         if (!p->skip_locking) {
2710                                 level = btrfs_header_level(b);
2711                                 if (level <= write_lock_level) {
2712                                         err = btrfs_try_tree_write_lock(b);
2713                                         if (!err) {
2714                                                 btrfs_set_path_blocking(p);
2715                                                 btrfs_tree_lock(b);
2716                                                 btrfs_clear_path_blocking(p, b,
2717                                                                   BTRFS_WRITE_LOCK);
2718                                         }
2719                                         p->locks[level] = BTRFS_WRITE_LOCK;
2720                                 } else {
2721                                         err = btrfs_try_tree_read_lock(b);
2722                                         if (!err) {
2723                                                 btrfs_set_path_blocking(p);
2724                                                 btrfs_tree_read_lock(b);
2725                                                 btrfs_clear_path_blocking(p, b,
2726                                                                   BTRFS_READ_LOCK);
2727                                         }
2728                                         p->locks[level] = BTRFS_READ_LOCK;
2729                                 }
2730                                 p->nodes[level] = b;
2731                         }
2732                 } else {
2733                         p->slots[level] = slot;
2734                         if (ins_len > 0 &&
2735                             btrfs_leaf_free_space(root, b) < ins_len) {
2736                                 if (write_lock_level < 1) {
2737                                         write_lock_level = 1;
2738                                         btrfs_release_path(p);
2739                                         goto again;
2740                                 }
2741
2742                                 btrfs_set_path_blocking(p);
2743                                 err = split_leaf(trans, root, key,
2744                                                  p, ins_len, ret == 0);
2745                                 btrfs_clear_path_blocking(p, NULL, 0);
2746
2747                                 BUG_ON(err > 0);
2748                                 if (err) {
2749                                         ret = err;
2750                                         goto done;
2751                                 }
2752                         }
2753                         if (!p->search_for_split)
2754                                 unlock_up(p, level, lowest_unlock,
2755                                           min_write_lock_level, &write_lock_level);
2756                         goto done;
2757                 }
2758         }
2759         ret = 1;
2760 done:
2761         /*
2762          * we don't really know what they plan on doing with the path
2763          * from here on, so for now just mark it as blocking
2764          */
2765         if (!p->leave_spinning)
2766                 btrfs_set_path_blocking(p);
2767         if (ret < 0)
2768                 btrfs_release_path(p);
2769         return ret;
2770 }
2771
2772 /*
2773  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2774  * current state of the tree together with the operations recorded in the tree
2775  * modification log to search for the key in a previous version of this tree, as
2776  * denoted by the time_seq parameter.
2777  *
2778  * Naturally, there is no support for insert, delete or cow operations.
2779  *
2780  * The resulting path and return value will be set up as if we called
2781  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2782  */
2783 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2784                           struct btrfs_path *p, u64 time_seq)
2785 {
2786         struct extent_buffer *b;
2787         int slot;
2788         int ret;
2789         int err;
2790         int level;
2791         int lowest_unlock = 1;
2792         u8 lowest_level = 0;
2793
2794         lowest_level = p->lowest_level;
2795         WARN_ON(p->nodes[0] != NULL);
2796
2797         if (p->search_commit_root) {
2798                 BUG_ON(time_seq);
2799                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2800         }
2801
2802 again:
2803         b = get_old_root(root, time_seq);
2804         level = btrfs_header_level(b);
2805         p->locks[level] = BTRFS_READ_LOCK;
2806
2807         while (b) {
2808                 level = btrfs_header_level(b);
2809                 p->nodes[level] = b;
2810                 btrfs_clear_path_blocking(p, NULL, 0);
2811
2812                 /*
2813                  * we have a lock on b and as long as we aren't changing
2814                  * the tree, there is no way to for the items in b to change.
2815                  * It is safe to drop the lock on our parent before we
2816                  * go through the expensive btree search on b.
2817                  */
2818                 btrfs_unlock_up_safe(p, level + 1);
2819
2820                 ret = bin_search(b, key, level, &slot);
2821
2822                 if (level != 0) {
2823                         int dec = 0;
2824                         if (ret && slot > 0) {
2825                                 dec = 1;
2826                                 slot -= 1;
2827                         }
2828                         p->slots[level] = slot;
2829                         unlock_up(p, level, lowest_unlock, 0, NULL);
2830
2831                         if (level == lowest_level) {
2832                                 if (dec)
2833                                         p->slots[level]++;
2834                                 goto done;
2835                         }
2836
2837                         err = read_block_for_search(NULL, root, p, &b, level,
2838                                                     slot, key, time_seq);
2839                         if (err == -EAGAIN)
2840                                 goto again;
2841                         if (err) {
2842                                 ret = err;
2843                                 goto done;
2844                         }
2845
2846                         level = btrfs_header_level(b);
2847                         err = btrfs_try_tree_read_lock(b);
2848                         if (!err) {
2849                                 btrfs_set_path_blocking(p);
2850                                 btrfs_tree_read_lock(b);
2851                                 btrfs_clear_path_blocking(p, b,
2852                                                           BTRFS_READ_LOCK);
2853                         }
2854                         b = tree_mod_log_rewind(root->fs_info, b, time_seq);
2855                         p->locks[level] = BTRFS_READ_LOCK;
2856                         p->nodes[level] = b;
2857                 } else {
2858                         p->slots[level] = slot;
2859                         unlock_up(p, level, lowest_unlock, 0, NULL);
2860                         goto done;
2861                 }
2862         }
2863         ret = 1;
2864 done:
2865         if (!p->leave_spinning)
2866                 btrfs_set_path_blocking(p);
2867         if (ret < 0)
2868                 btrfs_release_path(p);
2869
2870         return ret;
2871 }
2872
2873 /*
2874  * helper to use instead of search slot if no exact match is needed but
2875  * instead the next or previous item should be returned.
2876  * When find_higher is true, the next higher item is returned, the next lower
2877  * otherwise.
2878  * When return_any and find_higher are both true, and no higher item is found,
2879  * return the next lower instead.
2880  * When return_any is true and find_higher is false, and no lower item is found,
2881  * return the next higher instead.
2882  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2883  * < 0 on error
2884  */
2885 int btrfs_search_slot_for_read(struct btrfs_root *root,
2886                                struct btrfs_key *key, struct btrfs_path *p,
2887                                int find_higher, int return_any)
2888 {
2889         int ret;
2890         struct extent_buffer *leaf;
2891
2892 again:
2893         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2894         if (ret <= 0)
2895                 return ret;
2896         /*
2897          * a return value of 1 means the path is at the position where the
2898          * item should be inserted. Normally this is the next bigger item,
2899          * but in case the previous item is the last in a leaf, path points
2900          * to the first free slot in the previous leaf, i.e. at an invalid
2901          * item.
2902          */
2903         leaf = p->nodes[0];
2904
2905         if (find_higher) {
2906                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2907                         ret = btrfs_next_leaf(root, p);
2908                         if (ret <= 0)
2909                                 return ret;
2910                         if (!return_any)
2911                                 return 1;
2912                         /*
2913                          * no higher item found, return the next
2914                          * lower instead
2915                          */
2916                         return_any = 0;
2917                         find_higher = 0;
2918                         btrfs_release_path(p);
2919                         goto again;
2920                 }
2921         } else {
2922                 if (p->slots[0] == 0) {
2923                         ret = btrfs_prev_leaf(root, p);
2924                         if (ret < 0)
2925                                 return ret;
2926                         if (!ret) {
2927                                 p->slots[0] = btrfs_header_nritems(leaf) - 1;
2928                                 return 0;
2929                         }
2930                         if (!return_any)
2931                                 return 1;
2932                         /*
2933                          * no lower item found, return the next
2934                          * higher instead
2935                          */
2936                         return_any = 0;
2937                         find_higher = 1;
2938                         btrfs_release_path(p);
2939                         goto again;
2940                 } else {
2941                         --p->slots[0];
2942                 }
2943         }
2944         return 0;
2945 }
2946
2947 /*
2948  * adjust the pointers going up the tree, starting at level
2949  * making sure the right key of each node is points to 'key'.
2950  * This is used after shifting pointers to the left, so it stops
2951  * fixing up pointers when a given leaf/node is not in slot 0 of the
2952  * higher levels
2953  *
2954  */
2955 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
2956                            struct btrfs_disk_key *key, int level)
2957 {
2958         int i;
2959         struct extent_buffer *t;
2960
2961         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2962                 int tslot = path->slots[i];
2963                 if (!path->nodes[i])
2964                         break;
2965                 t = path->nodes[i];
2966                 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2967                 btrfs_set_node_key(t, key, tslot);
2968                 btrfs_mark_buffer_dirty(path->nodes[i]);
2969                 if (tslot != 0)
2970                         break;
2971         }
2972 }
2973
2974 /*
2975  * update item key.
2976  *
2977  * This function isn't completely safe. It's the caller's responsibility
2978  * that the new key won't break the order
2979  */
2980 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
2981                              struct btrfs_key *new_key)
2982 {
2983         struct btrfs_disk_key disk_key;
2984         struct extent_buffer *eb;
2985         int slot;
2986
2987         eb = path->nodes[0];
2988         slot = path->slots[0];
2989         if (slot > 0) {
2990                 btrfs_item_key(eb, &disk_key, slot - 1);
2991                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
2992         }
2993         if (slot < btrfs_header_nritems(eb) - 1) {
2994                 btrfs_item_key(eb, &disk_key, slot + 1);
2995                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
2996         }
2997
2998         btrfs_cpu_key_to_disk(&disk_key, new_key);
2999         btrfs_set_item_key(eb, &disk_key, slot);
3000         btrfs_mark_buffer_dirty(eb);
3001         if (slot == 0)
3002                 fixup_low_keys(root, path, &disk_key, 1);
3003 }
3004
3005 /*
3006  * try to push data from one node into the next node left in the
3007  * tree.
3008  *
3009  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3010  * error, and > 0 if there was no room in the left hand block.
3011  */
3012 static int push_node_left(struct btrfs_trans_handle *trans,
3013                           struct btrfs_root *root, struct extent_buffer *dst,
3014                           struct extent_buffer *src, int empty)
3015 {
3016         int push_items = 0;
3017         int src_nritems;
3018         int dst_nritems;
3019         int ret = 0;
3020
3021         src_nritems = btrfs_header_nritems(src);
3022         dst_nritems = btrfs_header_nritems(dst);
3023         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3024         WARN_ON(btrfs_header_generation(src) != trans->transid);
3025         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3026
3027         if (!empty && src_nritems <= 8)
3028                 return 1;
3029
3030         if (push_items <= 0)
3031                 return 1;
3032
3033         if (empty) {
3034                 push_items = min(src_nritems, push_items);
3035                 if (push_items < src_nritems) {
3036                         /* leave at least 8 pointers in the node if
3037                          * we aren't going to empty it
3038                          */
3039                         if (src_nritems - push_items < 8) {
3040                                 if (push_items <= 8)
3041                                         return 1;
3042                                 push_items -= 8;
3043                         }
3044                 }
3045         } else
3046                 push_items = min(src_nritems - 8, push_items);
3047
3048         tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3049                              push_items);
3050         copy_extent_buffer(dst, src,
3051                            btrfs_node_key_ptr_offset(dst_nritems),
3052                            btrfs_node_key_ptr_offset(0),
3053                            push_items * sizeof(struct btrfs_key_ptr));
3054
3055         if (push_items < src_nritems) {
3056                 /*
3057                  * don't call tree_mod_log_eb_move here, key removal was already
3058                  * fully logged by tree_mod_log_eb_copy above.
3059                  */
3060                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3061                                       btrfs_node_key_ptr_offset(push_items),
3062                                       (src_nritems - push_items) *
3063                                       sizeof(struct btrfs_key_ptr));
3064         }
3065         btrfs_set_header_nritems(src, src_nritems - push_items);
3066         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3067         btrfs_mark_buffer_dirty(src);
3068         btrfs_mark_buffer_dirty(dst);
3069
3070         return ret;
3071 }
3072
3073 /*
3074  * try to push data from one node into the next node right in the
3075  * tree.
3076  *
3077  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3078  * error, and > 0 if there was no room in the right hand block.
3079  *
3080  * this will  only push up to 1/2 the contents of the left node over
3081  */
3082 static int balance_node_right(struct btrfs_trans_handle *trans,
3083                               struct btrfs_root *root,
3084                               struct extent_buffer *dst,
3085                               struct extent_buffer *src)
3086 {
3087         int push_items = 0;
3088         int max_push;
3089         int src_nritems;
3090         int dst_nritems;
3091         int ret = 0;
3092
3093         WARN_ON(btrfs_header_generation(src) != trans->transid);
3094         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3095
3096         src_nritems = btrfs_header_nritems(src);
3097         dst_nritems = btrfs_header_nritems(dst);
3098         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3099         if (push_items <= 0)
3100                 return 1;
3101
3102         if (src_nritems < 4)
3103                 return 1;
3104
3105         max_push = src_nritems / 2 + 1;
3106         /* don't try to empty the node */
3107         if (max_push >= src_nritems)
3108                 return 1;
3109
3110         if (max_push < push_items)
3111                 push_items = max_push;
3112
3113         tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3114         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3115                                       btrfs_node_key_ptr_offset(0),
3116                                       (dst_nritems) *
3117                                       sizeof(struct btrfs_key_ptr));
3118
3119         tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3120                              src_nritems - push_items, push_items);
3121         copy_extent_buffer(dst, src,
3122                            btrfs_node_key_ptr_offset(0),
3123                            btrfs_node_key_ptr_offset(src_nritems - push_items),
3124                            push_items * sizeof(struct btrfs_key_ptr));
3125
3126         btrfs_set_header_nritems(src, src_nritems - push_items);
3127         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3128
3129         btrfs_mark_buffer_dirty(src);
3130         btrfs_mark_buffer_dirty(dst);
3131
3132         return ret;
3133 }
3134
3135 /*
3136  * helper function to insert a new root level in the tree.
3137  * A new node is allocated, and a single item is inserted to
3138  * point to the existing root
3139  *
3140  * returns zero on success or < 0 on failure.
3141  */
3142 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3143                            struct btrfs_root *root,
3144                            struct btrfs_path *path, int level, int log_removal)
3145 {
3146         u64 lower_gen;
3147         struct extent_buffer *lower;
3148         struct extent_buffer *c;
3149         struct extent_buffer *old;
3150         struct btrfs_disk_key lower_key;
3151
3152         BUG_ON(path->nodes[level]);
3153         BUG_ON(path->nodes[level-1] != root->node);
3154
3155         lower = path->nodes[level-1];
3156         if (level == 1)
3157                 btrfs_item_key(lower, &lower_key, 0);
3158         else
3159                 btrfs_node_key(lower, &lower_key, 0);
3160
3161         c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3162                                    root->root_key.objectid, &lower_key,
3163                                    level, root->node->start, 0);
3164         if (IS_ERR(c))
3165                 return PTR_ERR(c);
3166
3167         root_add_used(root, root->nodesize);
3168
3169         memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3170         btrfs_set_header_nritems(c, 1);
3171         btrfs_set_header_level(c, level);
3172         btrfs_set_header_bytenr(c, c->start);
3173         btrfs_set_header_generation(c, trans->transid);
3174         btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3175         btrfs_set_header_owner(c, root->root_key.objectid);
3176
3177         write_extent_buffer(c, root->fs_info->fsid,
3178                             (unsigned long)btrfs_header_fsid(c),
3179                             BTRFS_FSID_SIZE);
3180
3181         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3182                             (unsigned long)btrfs_header_chunk_tree_uuid(c),
3183                             BTRFS_UUID_SIZE);
3184
3185         btrfs_set_node_key(c, &lower_key, 0);
3186         btrfs_set_node_blockptr(c, 0, lower->start);
3187         lower_gen = btrfs_header_generation(lower);
3188         WARN_ON(lower_gen != trans->transid);
3189
3190         btrfs_set_node_ptr_generation(c, 0, lower_gen);
3191
3192         btrfs_mark_buffer_dirty(c);
3193
3194         old = root->node;
3195         tree_mod_log_set_root_pointer(root, c, log_removal);
3196         rcu_assign_pointer(root->node, c);
3197
3198         /* the super has an extra ref to root->node */
3199         free_extent_buffer(old);
3200
3201         add_root_to_dirty_list(root);
3202         extent_buffer_get(c);
3203         path->nodes[level] = c;
3204         path->locks[level] = BTRFS_WRITE_LOCK;
3205         path->slots[level] = 0;
3206         return 0;
3207 }
3208
3209 /*
3210  * worker function to insert a single pointer in a node.
3211  * the node should have enough room for the pointer already
3212  *
3213  * slot and level indicate where you want the key to go, and
3214  * blocknr is the block the key points to.
3215  */
3216 static void insert_ptr(struct btrfs_trans_handle *trans,
3217                        struct btrfs_root *root, struct btrfs_path *path,
3218                        struct btrfs_disk_key *key, u64 bytenr,
3219                        int slot, int level)
3220 {
3221         struct extent_buffer *lower;
3222         int nritems;
3223         int ret;
3224
3225         BUG_ON(!path->nodes[level]);
3226         btrfs_assert_tree_locked(path->nodes[level]);
3227         lower = path->nodes[level];
3228         nritems = btrfs_header_nritems(lower);
3229         BUG_ON(slot > nritems);
3230         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3231         if (slot != nritems) {
3232                 if (level)
3233                         tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3234                                              slot, nritems - slot);
3235                 memmove_extent_buffer(lower,
3236                               btrfs_node_key_ptr_offset(slot + 1),
3237                               btrfs_node_key_ptr_offset(slot),
3238                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3239         }
3240         if (level) {
3241                 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3242                                               MOD_LOG_KEY_ADD);
3243                 BUG_ON(ret < 0);
3244         }
3245         btrfs_set_node_key(lower, key, slot);
3246         btrfs_set_node_blockptr(lower, slot, bytenr);
3247         WARN_ON(trans->transid == 0);
3248         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3249         btrfs_set_header_nritems(lower, nritems + 1);
3250         btrfs_mark_buffer_dirty(lower);
3251 }
3252
3253 /*
3254  * split the node at the specified level in path in two.
3255  * The path is corrected to point to the appropriate node after the split
3256  *
3257  * Before splitting this tries to make some room in the node by pushing
3258  * left and right, if either one works, it returns right away.
3259  *
3260  * returns 0 on success and < 0 on failure
3261  */
3262 static noinline int split_node(struct btrfs_trans_handle *trans,
3263                                struct btrfs_root *root,
3264                                struct btrfs_path *path, int level)
3265 {
3266         struct extent_buffer *c;
3267         struct extent_buffer *split;
3268         struct btrfs_disk_key disk_key;
3269         int mid;
3270         int ret;
3271         u32 c_nritems;
3272
3273         c = path->nodes[level];
3274         WARN_ON(btrfs_header_generation(c) != trans->transid);
3275         if (c == root->node) {
3276                 /*
3277                  * trying to split the root, lets make a new one
3278                  *
3279                  * tree mod log: We pass 0 as log_removal parameter to
3280                  * insert_new_root, because that root buffer will be kept as a
3281                  * normal node. We are going to log removal of half of the
3282                  * elements below with tree_mod_log_eb_copy. We're holding a
3283                  * tree lock on the buffer, which is why we cannot race with
3284                  * other tree_mod_log users.
3285                  */
3286                 ret = insert_new_root(trans, root, path, level + 1, 0);
3287                 if (ret)
3288                         return ret;
3289         } else {
3290                 ret = push_nodes_for_insert(trans, root, path, level);
3291                 c = path->nodes[level];
3292                 if (!ret && btrfs_header_nritems(c) <
3293                     BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3294                         return 0;
3295                 if (ret < 0)
3296                         return ret;
3297         }
3298
3299         c_nritems = btrfs_header_nritems(c);
3300         mid = (c_nritems + 1) / 2;
3301         btrfs_node_key(c, &disk_key, mid);
3302
3303         split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3304                                         root->root_key.objectid,
3305                                         &disk_key, level, c->start, 0);
3306         if (IS_ERR(split))
3307                 return PTR_ERR(split);
3308
3309         root_add_used(root, root->nodesize);
3310
3311         memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3312         btrfs_set_header_level(split, btrfs_header_level(c));
3313         btrfs_set_header_bytenr(split, split->start);
3314         btrfs_set_header_generation(split, trans->transid);
3315         btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3316         btrfs_set_header_owner(split, root->root_key.objectid);
3317         write_extent_buffer(split, root->fs_info->fsid,
3318                             (unsigned long)btrfs_header_fsid(split),
3319                             BTRFS_FSID_SIZE);
3320         write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3321                             (unsigned long)btrfs_header_chunk_tree_uuid(split),
3322                             BTRFS_UUID_SIZE);
3323
3324         tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
3325         copy_extent_buffer(split, c,
3326                            btrfs_node_key_ptr_offset(0),
3327                            btrfs_node_key_ptr_offset(mid),
3328                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3329         btrfs_set_header_nritems(split, c_nritems - mid);
3330         btrfs_set_header_nritems(c, mid);
3331         ret = 0;
3332
3333         btrfs_mark_buffer_dirty(c);
3334         btrfs_mark_buffer_dirty(split);
3335
3336         insert_ptr(trans, root, path, &disk_key, split->start,
3337                    path->slots[level + 1] + 1, level + 1);
3338
3339         if (path->slots[level] >= mid) {
3340                 path->slots[level] -= mid;
3341                 btrfs_tree_unlock(c);
3342                 free_extent_buffer(c);
3343                 path->nodes[level] = split;
3344                 path->slots[level + 1] += 1;
3345         } else {
3346                 btrfs_tree_unlock(split);
3347                 free_extent_buffer(split);
3348         }
3349         return ret;
3350 }
3351
3352 /*
3353  * how many bytes are required to store the items in a leaf.  start
3354  * and nr indicate which items in the leaf to check.  This totals up the
3355  * space used both by the item structs and the item data
3356  */
3357 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3358 {
3359         struct btrfs_item *start_item;
3360         struct btrfs_item *end_item;
3361         struct btrfs_map_token token;
3362         int data_len;
3363         int nritems = btrfs_header_nritems(l);
3364         int end = min(nritems, start + nr) - 1;
3365
3366         if (!nr)
3367                 return 0;
3368         btrfs_init_map_token(&token);
3369         start_item = btrfs_item_nr(l, start);
3370         end_item = btrfs_item_nr(l, end);
3371         data_len = btrfs_token_item_offset(l, start_item, &token) +
3372                 btrfs_token_item_size(l, start_item, &token);
3373         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3374         data_len += sizeof(struct btrfs_item) * nr;
3375         WARN_ON(data_len < 0);
3376         return data_len;
3377 }
3378
3379 /*
3380  * The space between the end of the leaf items and
3381  * the start of the leaf data.  IOW, how much room
3382  * the leaf has left for both items and data
3383  */
3384 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3385                                    struct extent_buffer *leaf)
3386 {
3387         int nritems = btrfs_header_nritems(leaf);
3388         int ret;
3389         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3390         if (ret < 0) {
3391                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
3392                        "used %d nritems %d\n",
3393                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3394                        leaf_space_used(leaf, 0, nritems), nritems);
3395         }
3396         return ret;
3397 }
3398
3399 /*
3400  * min slot controls the lowest index we're willing to push to the
3401  * right.  We'll push up to and including min_slot, but no lower
3402  */
3403 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3404                                       struct btrfs_root *root,
3405                                       struct btrfs_path *path,
3406                                       int data_size, int empty,
3407                                       struct extent_buffer *right,
3408                                       int free_space, u32 left_nritems,
3409                                       u32 min_slot)
3410 {
3411         struct extent_buffer *left = path->nodes[0];
3412         struct extent_buffer *upper = path->nodes[1];
3413         struct btrfs_map_token token;
3414         struct btrfs_disk_key disk_key;
3415         int slot;
3416         u32 i;
3417         int push_space = 0;
3418         int push_items = 0;
3419         struct btrfs_item *item;
3420         u32 nr;
3421         u32 right_nritems;
3422         u32 data_end;
3423         u32 this_item_size;
3424
3425         btrfs_init_map_token(&token);
3426
3427         if (empty)
3428                 nr = 0;
3429         else
3430                 nr = max_t(u32, 1, min_slot);
3431
3432         if (path->slots[0] >= left_nritems)
3433                 push_space += data_size;
3434
3435         slot = path->slots[1];
3436         i = left_nritems - 1;
3437         while (i >= nr) {
3438                 item = btrfs_item_nr(left, i);
3439
3440                 if (!empty && push_items > 0) {
3441                         if (path->slots[0] > i)
3442                                 break;
3443                         if (path->slots[0] == i) {
3444                                 int space = btrfs_leaf_free_space(root, left);
3445                                 if (space + push_space * 2 > free_space)
3446                                         break;
3447                         }
3448                 }
3449
3450                 if (path->slots[0] == i)
3451                         push_space += data_size;
3452
3453                 this_item_size = btrfs_item_size(left, item);
3454                 if (this_item_size + sizeof(*item) + push_space > free_space)
3455                         break;
3456
3457                 push_items++;
3458                 push_space += this_item_size + sizeof(*item);
3459                 if (i == 0)
3460                         break;
3461                 i--;
3462         }
3463
3464         if (push_items == 0)
3465                 goto out_unlock;
3466
3467         WARN_ON(!empty && push_items == left_nritems);
3468
3469         /* push left to right */
3470         right_nritems = btrfs_header_nritems(right);
3471
3472         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3473         push_space -= leaf_data_end(root, left);
3474
3475         /* make room in the right data area */
3476         data_end = leaf_data_end(root, right);
3477         memmove_extent_buffer(right,
3478                               btrfs_leaf_data(right) + data_end - push_space,
3479                               btrfs_leaf_data(right) + data_end,
3480                               BTRFS_LEAF_DATA_SIZE(root) - data_end);
3481
3482         /* copy from the left data area */
3483         copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3484                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3485                      btrfs_leaf_data(left) + leaf_data_end(root, left),
3486                      push_space);
3487
3488         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3489                               btrfs_item_nr_offset(0),
3490                               right_nritems * sizeof(struct btrfs_item));
3491
3492         /* copy the items from left to right */
3493         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3494                    btrfs_item_nr_offset(left_nritems - push_items),
3495                    push_items * sizeof(struct btrfs_item));
3496
3497         /* update the item pointers */
3498         right_nritems += push_items;
3499         btrfs_set_header_nritems(right, right_nritems);
3500         push_space = BTRFS_LEAF_DATA_SIZE(root);
3501         for (i = 0; i < right_nritems; i++) {
3502                 item = btrfs_item_nr(right, i);
3503                 push_space -= btrfs_token_item_size(right, item, &token);
3504                 btrfs_set_token_item_offset(right, item, push_space, &token);
3505         }
3506
3507         left_nritems -= push_items;
3508         btrfs_set_header_nritems(left, left_nritems);
3509
3510         if (left_nritems)
3511                 btrfs_mark_buffer_dirty(left);
3512         else
3513                 clean_tree_block(trans, root, left);
3514
3515         btrfs_mark_buffer_dirty(right);
3516
3517         btrfs_item_key(right, &disk_key, 0);
3518         btrfs_set_node_key(upper, &disk_key, slot + 1);
3519         btrfs_mark_buffer_dirty(upper);
3520
3521         /* then fixup the leaf pointer in the path */
3522         if (path->slots[0] >= left_nritems) {
3523                 path->slots[0] -= left_nritems;
3524                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3525                         clean_tree_block(trans, root, path->nodes[0]);
3526                 btrfs_tree_unlock(path->nodes[0]);
3527                 free_extent_buffer(path->nodes[0]);
3528                 path->nodes[0] = right;
3529                 path->slots[1] += 1;
3530         } else {
3531                 btrfs_tree_unlock(right);
3532                 free_extent_buffer(right);
3533         }
3534         return 0;
3535
3536 out_unlock:
3537         btrfs_tree_unlock(right);
3538         free_extent_buffer(right);
3539         return 1;
3540 }
3541
3542 /*
3543  * push some data in the path leaf to the right, trying to free up at
3544  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3545  *
3546  * returns 1 if the push failed because the other node didn't have enough
3547  * room, 0 if everything worked out and < 0 if there were major errors.
3548  *
3549  * this will push starting from min_slot to the end of the leaf.  It won't
3550  * push any slot lower than min_slot
3551  */
3552 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3553                            *root, struct btrfs_path *path,
3554                            int min_data_size, int data_size,
3555                            int empty, u32 min_slot)
3556 {
3557         struct extent_buffer *left = path->nodes[0];
3558         struct extent_buffer *right;
3559         struct extent_buffer *upper;
3560         int slot;
3561         int free_space;
3562         u32 left_nritems;
3563         int ret;
3564
3565         if (!path->nodes[1])
3566                 return 1;
3567
3568         slot = path->slots[1];
3569         upper = path->nodes[1];
3570         if (slot >= btrfs_header_nritems(upper) - 1)
3571                 return 1;
3572
3573         btrfs_assert_tree_locked(path->nodes[1]);
3574
3575         right = read_node_slot(root, upper, slot + 1);
3576         if (right == NULL)
3577                 return 1;
3578
3579         btrfs_tree_lock(right);
3580         btrfs_set_lock_blocking(right);
3581
3582         free_space = btrfs_leaf_free_space(root, right);
3583         if (free_space < data_size)
3584                 goto out_unlock;
3585
3586         /* cow and double check */
3587         ret = btrfs_cow_block(trans, root, right, upper,
3588                               slot + 1, &right);
3589         if (ret)
3590                 goto out_unlock;
3591
3592         free_space = btrfs_leaf_free_space(root, right);
3593         if (free_space < data_size)
3594                 goto out_unlock;
3595
3596         left_nritems = btrfs_header_nritems(left);
3597         if (left_nritems == 0)
3598                 goto out_unlock;
3599
3600         return __push_leaf_right(trans, root, path, min_data_size, empty,
3601                                 right, free_space, left_nritems, min_slot);
3602 out_unlock:
3603         btrfs_tree_unlock(right);
3604         free_extent_buffer(right);
3605         return 1;
3606 }
3607
3608 /*
3609  * push some data in the path leaf to the left, trying to free up at
3610  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3611  *
3612  * max_slot can put a limit on how far into the leaf we'll push items.  The
3613  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3614  * items
3615  */
3616 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3617                                      struct btrfs_root *root,
3618                                      struct btrfs_path *path, int data_size,
3619                                      int empty, struct extent_buffer *left,
3620                                      int free_space, u32 right_nritems,
3621                                      u32 max_slot)
3622 {
3623         struct btrfs_disk_key disk_key;
3624         struct extent_buffer *right = path->nodes[0];
3625         int i;
3626         int push_space = 0;
3627         int push_items = 0;
3628         struct btrfs_item *item;
3629         u32 old_left_nritems;
3630         u32 nr;
3631         int ret = 0;
3632         u32 this_item_size;
3633         u32 old_left_item_size;
3634         struct btrfs_map_token token;
3635
3636         btrfs_init_map_token(&token);
3637
3638         if (empty)
3639                 nr = min(right_nritems, max_slot);
3640         else
3641                 nr = min(right_nritems - 1, max_slot);
3642
3643         for (i = 0; i < nr; i++) {
3644                 item = btrfs_item_nr(right, i);
3645
3646                 if (!empty && push_items > 0) {
3647                         if (path->slots[0] < i)
3648                                 break;
3649                         if (path->slots[0] == i) {
3650                                 int space = btrfs_leaf_free_space(root, right);
3651                                 if (space + push_space * 2 > free_space)
3652                                         break;
3653                         }
3654                 }
3655
3656                 if (path->slots[0] == i)
3657                         push_space += data_size;
3658
3659                 this_item_size = btrfs_item_size(right, item);
3660                 if (this_item_size + sizeof(*item) + push_space > free_space)
3661                         break;
3662
3663                 push_items++;
3664                 push_space += this_item_size + sizeof(*item);
3665         }
3666
3667         if (push_items == 0) {
3668                 ret = 1;
3669                 goto out;
3670         }
3671         if (!empty && push_items == btrfs_header_nritems(right))
3672                 WARN_ON(1);
3673
3674         /* push data from right to left */
3675         copy_extent_buffer(left, right,
3676                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3677                            btrfs_item_nr_offset(0),
3678                            push_items * sizeof(struct btrfs_item));
3679
3680         push_space = BTRFS_LEAF_DATA_SIZE(root) -
3681                      btrfs_item_offset_nr(right, push_items - 1);
3682
3683         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3684                      leaf_data_end(root, left) - push_space,
3685                      btrfs_leaf_data(right) +
3686                      btrfs_item_offset_nr(right, push_items - 1),
3687                      push_space);
3688         old_left_nritems = btrfs_header_nritems(left);
3689         BUG_ON(old_left_nritems <= 0);
3690
3691         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3692         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3693                 u32 ioff;
3694
3695                 item = btrfs_item_nr(left, i);
3696
3697                 ioff = btrfs_token_item_offset(left, item, &token);
3698                 btrfs_set_token_item_offset(left, item,
3699                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3700                       &token);
3701         }
3702         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3703
3704         /* fixup right node */
3705         if (push_items > right_nritems)
3706                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3707                        right_nritems);
3708
3709         if (push_items < right_nritems) {
3710                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3711                                                   leaf_data_end(root, right);
3712                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
3713                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
3714                                       btrfs_leaf_data(right) +
3715                                       leaf_data_end(root, right), push_space);
3716
3717                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3718                               btrfs_item_nr_offset(push_items),
3719                              (btrfs_header_nritems(right) - push_items) *
3720                              sizeof(struct btrfs_item));
3721         }
3722         right_nritems -= push_items;
3723         btrfs_set_header_nritems(right, right_nritems);
3724         push_space = BTRFS_LEAF_DATA_SIZE(root);
3725         for (i = 0; i < right_nritems; i++) {
3726                 item = btrfs_item_nr(right, i);
3727
3728                 push_space = push_space - btrfs_token_item_size(right,
3729                                                                 item, &token);
3730                 btrfs_set_token_item_offset(right, item, push_space, &token);
3731         }
3732
3733         btrfs_mark_buffer_dirty(left);
3734         if (right_nritems)
3735                 btrfs_mark_buffer_dirty(right);
3736         else
3737                 clean_tree_block(trans, root, right);
3738
3739         btrfs_item_key(right, &disk_key, 0);
3740         fixup_low_keys(root, path, &disk_key, 1);
3741
3742         /* then fixup the leaf pointer in the path */
3743         if (path->slots[0] < push_items) {
3744                 path->slots[0] += old_left_nritems;
3745                 btrfs_tree_unlock(path->nodes[0]);
3746                 free_extent_buffer(path->nodes[0]);
3747                 path->nodes[0] = left;
3748                 path->slots[1] -= 1;
3749         } else {
3750                 btrfs_tree_unlock(left);
3751                 free_extent_buffer(left);
3752                 path->slots[0] -= push_items;
3753         }
3754         BUG_ON(path->slots[0] < 0);
3755         return ret;
3756 out:
3757         btrfs_tree_unlock(left);
3758         free_extent_buffer(left);
3759         return ret;
3760 }
3761
3762 /*
3763  * push some data in the path leaf to the left, trying to free up at
3764  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3765  *
3766  * max_slot can put a limit on how far into the leaf we'll push items.  The
3767  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3768  * items
3769  */
3770 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3771                           *root, struct btrfs_path *path, int min_data_size,
3772                           int data_size, int empty, u32 max_slot)
3773 {
3774         struct extent_buffer *right = path->nodes[0];
3775         struct extent_buffer *left;
3776         int slot;
3777         int free_space;
3778         u32 right_nritems;
3779         int ret = 0;
3780
3781         slot = path->slots[1];
3782         if (slot == 0)
3783                 return 1;
3784         if (!path->nodes[1])
3785                 return 1;
3786
3787         right_nritems = btrfs_header_nritems(right);
3788         if (right_nritems == 0)
3789                 return 1;
3790
3791         btrfs_assert_tree_locked(path->nodes[1]);
3792
3793         left = read_node_slot(root, path->nodes[1], slot - 1);
3794         if (left == NULL)
3795                 return 1;
3796
3797         btrfs_tree_lock(left);
3798         btrfs_set_lock_blocking(left);
3799
3800         free_space = btrfs_leaf_free_space(root, left);
3801         if (free_space < data_size) {
3802                 ret = 1;
3803                 goto out;
3804         }
3805
3806         /* cow and double check */
3807         ret = btrfs_cow_block(trans, root, left,
3808                               path->nodes[1], slot - 1, &left);
3809         if (ret) {
3810                 /* we hit -ENOSPC, but it isn't fatal here */
3811                 if (ret == -ENOSPC)
3812                         ret = 1;
3813                 goto out;
3814         }
3815
3816         free_space = btrfs_leaf_free_space(root, left);
3817         if (free_space < data_size) {
3818                 ret = 1;
3819                 goto out;
3820         }
3821
3822         return __push_leaf_left(trans, root, path, min_data_size,
3823                                empty, left, free_space, right_nritems,
3824                                max_slot);
3825 out:
3826         btrfs_tree_unlock(left);
3827         free_extent_buffer(left);
3828         return ret;
3829 }
3830
3831 /*
3832  * split the path's leaf in two, making sure there is at least data_size
3833  * available for the resulting leaf level of the path.
3834  */
3835 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3836                                     struct btrfs_root *root,
3837                                     struct btrfs_path *path,
3838                                     struct extent_buffer *l,
3839                                     struct extent_buffer *right,
3840                                     int slot, int mid, int nritems)
3841 {
3842         int data_copy_size;
3843         int rt_data_off;
3844         int i;
3845         struct btrfs_disk_key disk_key;
3846         struct btrfs_map_token token;
3847
3848         btrfs_init_map_token(&token);
3849
3850         nritems = nritems - mid;
3851         btrfs_set_header_nritems(right, nritems);
3852         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
3853
3854         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3855                            btrfs_item_nr_offset(mid),
3856                            nritems * sizeof(struct btrfs_item));
3857
3858         copy_extent_buffer(right, l,
3859                      btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
3860                      data_copy_size, btrfs_leaf_data(l) +
3861                      leaf_data_end(root, l), data_copy_size);
3862
3863         rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
3864                       btrfs_item_end_nr(l, mid);
3865
3866         for (i = 0; i < nritems; i++) {
3867                 struct btrfs_item *item = btrfs_item_nr(right, i);
3868                 u32 ioff;
3869
3870                 ioff = btrfs_token_item_offset(right, item, &token);
3871                 btrfs_set_token_item_offset(right, item,
3872                                             ioff + rt_data_off, &token);
3873         }
3874
3875         btrfs_set_header_nritems(l, mid);
3876         btrfs_item_key(right, &disk_key, 0);
3877         insert_ptr(trans, root, path, &disk_key, right->start,
3878                    path->slots[1] + 1, 1);
3879
3880         btrfs_mark_buffer_dirty(right);
3881         btrfs_mark_buffer_dirty(l);
3882         BUG_ON(path->slots[0] != slot);
3883
3884         if (mid <= slot) {
3885                 btrfs_tree_unlock(path->nodes[0]);
3886                 free_extent_buffer(path->nodes[0]);
3887                 path->nodes[0] = right;
3888                 path->slots[0] -= mid;
3889                 path->slots[1] += 1;
3890         } else {
3891                 btrfs_tree_unlock(right);
3892                 free_extent_buffer(right);
3893         }
3894
3895         BUG_ON(path->slots[0] < 0);
3896 }
3897
3898 /*
3899  * double splits happen when we need to insert a big item in the middle
3900  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3901  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3902  *          A                 B                 C
3903  *
3904  * We avoid this by trying to push the items on either side of our target
3905  * into the adjacent leaves.  If all goes well we can avoid the double split
3906  * completely.
3907  */
3908 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3909                                           struct btrfs_root *root,
3910                                           struct btrfs_path *path,
3911                                           int data_size)
3912 {
3913         int ret;
3914         int progress = 0;
3915         int slot;
3916         u32 nritems;
3917
3918         slot = path->slots[0];
3919
3920         /*
3921          * try to push all the items after our slot into the
3922          * right leaf
3923          */
3924         ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
3925         if (ret < 0)
3926                 return ret;
3927
3928         if (ret == 0)
3929                 progress++;
3930
3931         nritems = btrfs_header_nritems(path->nodes[0]);
3932         /*
3933          * our goal is to get our slot at the start or end of a leaf.  If
3934          * we've done so we're done
3935          */
3936         if (path->slots[0] == 0 || path->slots[0] == nritems)
3937                 return 0;
3938
3939         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3940                 return 0;
3941
3942         /* try to push all the items before our slot into the next leaf */
3943         slot = path->slots[0];
3944         ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
3945         if (ret < 0)
3946                 return ret;
3947
3948         if (ret == 0)
3949                 progress++;
3950
3951         if (progress)
3952                 return 0;
3953         return 1;
3954 }
3955
3956 /*
3957  * split the path's leaf in two, making sure there is at least data_size
3958  * available for the resulting leaf level of the path.
3959  *
3960  * returns 0 if all went well and < 0 on failure.
3961  */
3962 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3963                                struct btrfs_root *root,
3964                                struct btrfs_key *ins_key,
3965                                struct btrfs_path *path, int data_size,
3966                                int extend)
3967 {
3968         struct btrfs_disk_key disk_key;
3969         struct extent_buffer *l;
3970         u32 nritems;
3971         int mid;
3972         int slot;
3973         struct extent_buffer *right;
3974         int ret = 0;
3975         int wret;
3976         int split;
3977         int num_doubles = 0;
3978         int tried_avoid_double = 0;
3979
3980         l = path->nodes[0];
3981         slot = path->slots[0];
3982         if (extend && data_size + btrfs_item_size_nr(l, slot) +
3983             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
3984                 return -EOVERFLOW;
3985
3986         /* first try to make some room by pushing left and right */
3987         if (data_size) {
3988                 wret = push_leaf_right(trans, root, path, data_size,
3989                                        data_size, 0, 0);
3990                 if (wret < 0)
3991                         return wret;
3992                 if (wret) {
3993                         wret = push_leaf_left(trans, root, path, data_size,
3994                                               data_size, 0, (u32)-1);
3995                         if (wret < 0)
3996                                 return wret;
3997                 }
3998                 l = path->nodes[0];
3999
4000                 /* did the pushes work? */
4001                 if (btrfs_leaf_free_space(root, l) >= data_size)
4002                         return 0;
4003         }
4004
4005         if (!path->nodes[1]) {
4006                 ret = insert_new_root(trans, root, path, 1, 1);
4007                 if (ret)
4008                         return ret;
4009         }
4010 again:
4011         split = 1;
4012         l = path->nodes[0];
4013         slot = path->slots[0];
4014         nritems = btrfs_header_nritems(l);
4015         mid = (nritems + 1) / 2;
4016
4017         if (mid <= slot) {
4018                 if (nritems == 1 ||
4019                     leaf_space_used(l, mid, nritems - mid) + data_size >
4020                         BTRFS_LEAF_DATA_SIZE(root)) {
4021                         if (slot >= nritems) {
4022                                 split = 0;
4023                         } else {
4024                                 mid = slot;
4025                                 if (mid != nritems &&
4026                                     leaf_space_used(l, mid, nritems - mid) +
4027                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4028                                         if (data_size && !tried_avoid_double)
4029                                                 goto push_for_double;
4030                                         split = 2;
4031                                 }
4032                         }
4033                 }
4034         } else {
4035                 if (leaf_space_used(l, 0, mid) + data_size >
4036                         BTRFS_LEAF_DATA_SIZE(root)) {
4037                         if (!extend && data_size && slot == 0) {
4038                                 split = 0;
4039                         } else if ((extend || !data_size) && slot == 0) {
4040                                 mid = 1;
4041                         } else {
4042                                 mid = slot;
4043                                 if (mid != nritems &&
4044                                     leaf_space_used(l, mid, nritems - mid) +
4045                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4046                                         if (data_size && !tried_avoid_double)
4047                                                 goto push_for_double;
4048                                         split = 2 ;
4049                                 }
4050                         }
4051                 }
4052         }
4053
4054         if (split == 0)
4055                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4056         else
4057                 btrfs_item_key(l, &disk_key, mid);
4058
4059         right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
4060                                         root->root_key.objectid,
4061                                         &disk_key, 0, l->start, 0);
4062         if (IS_ERR(right))
4063                 return PTR_ERR(right);
4064
4065         root_add_used(root, root->leafsize);
4066
4067         memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4068         btrfs_set_header_bytenr(right, right->start);
4069         btrfs_set_header_generation(right, trans->transid);
4070         btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4071         btrfs_set_header_owner(right, root->root_key.objectid);
4072         btrfs_set_header_level(right, 0);
4073         write_extent_buffer(right, root->fs_info->fsid,
4074                             (unsigned long)btrfs_header_fsid(right),
4075                             BTRFS_FSID_SIZE);
4076
4077         write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
4078                             (unsigned long)btrfs_header_chunk_tree_uuid(right),
4079                             BTRFS_UUID_SIZE);
4080
4081         if (split == 0) {
4082                 if (mid <= slot) {
4083                         btrfs_set_header_nritems(right, 0);
4084                         insert_ptr(trans, root, path, &disk_key, right->start,
4085                                    path->slots[1] + 1, 1);
4086                         btrfs_tree_unlock(path->nodes[0]);
4087                         free_extent_buffer(path->nodes[0]);
4088                         path->nodes[0] = right;
4089                         path->slots[0] = 0;
4090                         path->slots[1] += 1;
4091                 } else {
4092                         btrfs_set_header_nritems(right, 0);
4093                         insert_ptr(trans, root, path, &disk_key, right->start,
4094                                           path->slots[1], 1);
4095                         btrfs_tree_unlock(path->nodes[0]);
4096                         free_extent_buffer(path->nodes[0]);
4097                         path->nodes[0] = right;
4098                         path->slots[0] = 0;
4099                         if (path->slots[1] == 0)
4100                                 fixup_low_keys(root, path, &disk_key, 1);
4101                 }
4102                 btrfs_mark_buffer_dirty(right);
4103                 return ret;
4104         }
4105
4106         copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4107
4108         if (split == 2) {
4109                 BUG_ON(num_doubles != 0);
4110                 num_doubles++;
4111                 goto again;
4112         }
4113
4114         return 0;
4115
4116 push_for_double:
4117         push_for_double_split(trans, root, path, data_size);
4118         tried_avoid_double = 1;
4119         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4120                 return 0;
4121         goto again;
4122 }
4123
4124 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4125                                          struct btrfs_root *root,
4126                                          struct btrfs_path *path, int ins_len)
4127 {
4128         struct btrfs_key key;
4129         struct extent_buffer *leaf;
4130         struct btrfs_file_extent_item *fi;
4131         u64 extent_len = 0;
4132         u32 item_size;
4133         int ret;
4134
4135         leaf = path->nodes[0];
4136         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4137
4138         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4139                key.type != BTRFS_EXTENT_CSUM_KEY);
4140
4141         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4142                 return 0;
4143
4144         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4145         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4146                 fi = btrfs_item_ptr(leaf, path->slots[0],
4147                                     struct btrfs_file_extent_item);
4148                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4149         }
4150         btrfs_release_path(path);
4151
4152         path->keep_locks = 1;
4153         path->search_for_split = 1;
4154         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4155         path->search_for_split = 0;
4156         if (ret < 0)
4157                 goto err;
4158
4159         ret = -EAGAIN;
4160         leaf = path->nodes[0];
4161         /* if our item isn't there or got smaller, return now */
4162         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4163                 goto err;
4164
4165         /* the leaf has  changed, it now has room.  return now */
4166         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4167                 goto err;
4168
4169         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4170                 fi = btrfs_item_ptr(leaf, path->slots[0],
4171                                     struct btrfs_file_extent_item);
4172                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4173                         goto err;
4174         }
4175
4176         btrfs_set_path_blocking(path);
4177         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4178         if (ret)
4179                 goto err;
4180
4181         path->keep_locks = 0;
4182         btrfs_unlock_up_safe(path, 1);
4183         return 0;
4184 err:
4185         path->keep_locks = 0;
4186         return ret;
4187 }
4188
4189 static noinline int split_item(struct btrfs_trans_handle *trans,
4190                                struct btrfs_root *root,
4191                                struct btrfs_path *path,
4192                                struct btrfs_key *new_key,
4193                                unsigned long split_offset)
4194 {
4195         struct extent_buffer *leaf;
4196         struct btrfs_item *item;
4197         struct btrfs_item *new_item;
4198         int slot;
4199         char *buf;
4200         u32 nritems;
4201         u32 item_size;
4202         u32 orig_offset;
4203         struct btrfs_disk_key disk_key;
4204
4205         leaf = path->nodes[0];
4206         BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4207
4208         btrfs_set_path_blocking(path);
4209
4210         item = btrfs_item_nr(leaf, path->slots[0]);
4211         orig_offset = btrfs_item_offset(leaf, item);
4212         item_size = btrfs_item_size(leaf, item);
4213
4214         buf = kmalloc(item_size, GFP_NOFS);
4215         if (!buf)
4216                 return -ENOMEM;
4217
4218         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4219                             path->slots[0]), item_size);
4220
4221         slot = path->slots[0] + 1;
4222         nritems = btrfs_header_nritems(leaf);
4223         if (slot != nritems) {
4224                 /* shift the items */
4225                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4226                                 btrfs_item_nr_offset(slot),
4227                                 (nritems - slot) * sizeof(struct btrfs_item));
4228         }
4229
4230         btrfs_cpu_key_to_disk(&disk_key, new_key);
4231         btrfs_set_item_key(leaf, &disk_key, slot);
4232
4233         new_item = btrfs_item_nr(leaf, slot);
4234
4235         btrfs_set_item_offset(leaf, new_item, orig_offset);
4236         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4237
4238         btrfs_set_item_offset(leaf, item,
4239                               orig_offset + item_size - split_offset);
4240         btrfs_set_item_size(leaf, item, split_offset);
4241
4242         btrfs_set_header_nritems(leaf, nritems + 1);
4243
4244         /* write the data for the start of the original item */
4245         write_extent_buffer(leaf, buf,
4246                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4247                             split_offset);
4248
4249         /* write the data for the new item */
4250         write_extent_buffer(leaf, buf + split_offset,
4251                             btrfs_item_ptr_offset(leaf, slot),
4252                             item_size - split_offset);
4253         btrfs_mark_buffer_dirty(leaf);
4254
4255         BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4256         kfree(buf);
4257         return 0;
4258 }
4259
4260 /*
4261  * This function splits a single item into two items,
4262  * giving 'new_key' to the new item and splitting the
4263  * old one at split_offset (from the start of the item).
4264  *
4265  * The path may be released by this operation.  After
4266  * the split, the path is pointing to the old item.  The
4267  * new item is going to be in the same node as the old one.
4268  *
4269  * Note, the item being split must be smaller enough to live alone on
4270  * a tree block with room for one extra struct btrfs_item
4271  *
4272  * This allows us to split the item in place, keeping a lock on the
4273  * leaf the entire time.
4274  */
4275 int btrfs_split_item(struct btrfs_trans_handle *trans,
4276                      struct btrfs_root *root,
4277                      struct btrfs_path *path,
4278                      struct btrfs_key *new_key,
4279                      unsigned long split_offset)
4280 {
4281         int ret;
4282         ret = setup_leaf_for_split(trans, root, path,
4283                                    sizeof(struct btrfs_item));
4284         if (ret)
4285                 return ret;
4286
4287         ret = split_item(trans, root, path, new_key, split_offset);
4288         return ret;
4289 }
4290
4291 /*
4292  * This function duplicate a item, giving 'new_key' to the new item.
4293  * It guarantees both items live in the same tree leaf and the new item
4294  * is contiguous with the original item.
4295  *
4296  * This allows us to split file extent in place, keeping a lock on the
4297  * leaf the entire time.
4298  */
4299 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4300                          struct btrfs_root *root,
4301                          struct btrfs_path *path,
4302                          struct btrfs_key *new_key)
4303 {
4304         struct extent_buffer *leaf;
4305         int ret;
4306         u32 item_size;
4307
4308         leaf = path->nodes[0];
4309         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4310         ret = setup_leaf_for_split(trans, root, path,
4311                                    item_size + sizeof(struct btrfs_item));
4312         if (ret)
4313                 return ret;
4314
4315         path->slots[0]++;
4316         setup_items_for_insert(root, path, new_key, &item_size,
4317                                item_size, item_size +
4318                                sizeof(struct btrfs_item), 1);
4319         leaf = path->nodes[0];
4320         memcpy_extent_buffer(leaf,
4321                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4322                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4323                              item_size);
4324         return 0;
4325 }
4326
4327 /*
4328  * make the item pointed to by the path smaller.  new_size indicates
4329  * how small to make it, and from_end tells us if we just chop bytes
4330  * off the end of the item or if we shift the item to chop bytes off
4331  * the front.
4332  */
4333 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4334                          u32 new_size, int from_end)
4335 {
4336         int slot;
4337         struct extent_buffer *leaf;
4338         struct btrfs_item *item;
4339         u32 nritems;
4340         unsigned int data_end;
4341         unsigned int old_data_start;
4342         unsigned int old_size;
4343         unsigned int size_diff;
4344         int i;
4345         struct btrfs_map_token token;
4346
4347         btrfs_init_map_token(&token);
4348
4349         leaf = path->nodes[0];
4350         slot = path->slots[0];
4351
4352         old_size = btrfs_item_size_nr(leaf, slot);
4353         if (old_size == new_size)
4354                 return;
4355
4356         nritems = btrfs_header_nritems(leaf);
4357         data_end = leaf_data_end(root, leaf);
4358
4359         old_data_start = btrfs_item_offset_nr(leaf, slot);
4360
4361         size_diff = old_size - new_size;
4362
4363         BUG_ON(slot < 0);
4364         BUG_ON(slot >= nritems);
4365
4366         /*
4367          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4368          */
4369         /* first correct the data pointers */
4370         for (i = slot; i < nritems; i++) {
4371                 u32 ioff;
4372                 item = btrfs_item_nr(leaf, i);
4373
4374                 ioff = btrfs_token_item_offset(leaf, item, &token);
4375                 btrfs_set_token_item_offset(leaf, item,
4376                                             ioff + size_diff, &token);
4377         }
4378
4379         /* shift the data */
4380         if (from_end) {
4381                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4382                               data_end + size_diff, btrfs_leaf_data(leaf) +
4383                               data_end, old_data_start + new_size - data_end);
4384         } else {
4385                 struct btrfs_disk_key disk_key;
4386                 u64 offset;
4387
4388                 btrfs_item_key(leaf, &disk_key, slot);
4389
4390                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4391                         unsigned long ptr;
4392                         struct btrfs_file_extent_item *fi;
4393
4394                         fi = btrfs_item_ptr(leaf, slot,
4395                                             struct btrfs_file_extent_item);
4396                         fi = (struct btrfs_file_extent_item *)(
4397                              (unsigned long)fi - size_diff);
4398
4399                         if (btrfs_file_extent_type(leaf, fi) ==
4400                             BTRFS_FILE_EXTENT_INLINE) {
4401                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4402                                 memmove_extent_buffer(leaf, ptr,
4403                                       (unsigned long)fi,
4404                                       offsetof(struct btrfs_file_extent_item,
4405                                                  disk_bytenr));
4406                         }
4407                 }
4408
4409                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4410                               data_end + size_diff, btrfs_leaf_data(leaf) +
4411                               data_end, old_data_start - data_end);
4412
4413                 offset = btrfs_disk_key_offset(&disk_key);
4414                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4415                 btrfs_set_item_key(leaf, &disk_key, slot);
4416                 if (slot == 0)
4417                         fixup_low_keys(root, path, &disk_key, 1);
4418         }
4419
4420         item = btrfs_item_nr(leaf, slot);
4421         btrfs_set_item_size(leaf, item, new_size);
4422         btrfs_mark_buffer_dirty(leaf);
4423
4424         if (btrfs_leaf_free_space(root, leaf) < 0) {
4425                 btrfs_print_leaf(root, leaf);
4426                 BUG();
4427         }
4428 }
4429
4430 /*
4431  * make the item pointed to by the path bigger, data_size is the new size.
4432  */
4433 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4434                        u32 data_size)
4435 {
4436         int slot;
4437         struct extent_buffer *leaf;
4438         struct btrfs_item *item;
4439         u32 nritems;
4440         unsigned int data_end;
4441         unsigned int old_data;
4442         unsigned int old_size;
4443         int i;
4444         struct btrfs_map_token token;
4445
4446         btrfs_init_map_token(&token);
4447
4448         leaf = path->nodes[0];
4449
4450         nritems = btrfs_header_nritems(leaf);
4451         data_end = leaf_data_end(root, leaf);
4452
4453         if (btrfs_leaf_free_space(root, leaf) < data_size) {
4454                 btrfs_print_leaf(root, leaf);
4455                 BUG();
4456         }
4457         slot = path->slots[0];
4458         old_data = btrfs_item_end_nr(leaf, slot);
4459
4460         BUG_ON(slot < 0);
4461         if (slot >= nritems) {
4462                 btrfs_print_leaf(root, leaf);
4463                 printk(KERN_CRIT "slot %d too large, nritems %d\n",
4464                        slot, nritems);
4465                 BUG_ON(1);
4466         }
4467
4468         /*
4469          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4470          */
4471         /* first correct the data pointers */
4472         for (i = slot; i < nritems; i++) {
4473                 u32 ioff;
4474                 item = btrfs_item_nr(leaf, i);
4475
4476                 ioff = btrfs_token_item_offset(leaf, item, &token);
4477                 btrfs_set_token_item_offset(leaf, item,
4478                                             ioff - data_size, &token);
4479         }
4480
4481         /* shift the data */
4482         memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4483                       data_end - data_size, btrfs_leaf_data(leaf) +
4484                       data_end, old_data - data_end);
4485
4486         data_end = old_data;
4487         old_size = btrfs_item_size_nr(leaf, slot);
4488         item = btrfs_item_nr(leaf, slot);
4489         btrfs_set_item_size(leaf, item, old_size + data_size);
4490         btrfs_mark_buffer_dirty(leaf);
4491
4492         if (btrfs_leaf_free_space(root, leaf) < 0) {
4493                 btrfs_print_leaf(root, leaf);
4494                 BUG();
4495         }
4496 }
4497
4498 /*
4499  * this is a helper for btrfs_insert_empty_items, the main goal here is
4500  * to save stack depth by doing the bulk of the work in a function
4501  * that doesn't call btrfs_search_slot
4502  */
4503 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4504                             struct btrfs_key *cpu_key, u32 *data_size,
4505                             u32 total_data, u32 total_size, int nr)
4506 {
4507         struct btrfs_item *item;
4508         int i;
4509         u32 nritems;
4510         unsigned int data_end;
4511         struct btrfs_disk_key disk_key;
4512         struct extent_buffer *leaf;
4513         int slot;
4514         struct btrfs_map_token token;
4515
4516         btrfs_init_map_token(&token);
4517
4518         leaf = path->nodes[0];
4519         slot = path->slots[0];
4520
4521         nritems = btrfs_header_nritems(leaf);
4522         data_end = leaf_data_end(root, leaf);
4523
4524         if (btrfs_leaf_free_space(root, leaf) < total_size) {
4525                 btrfs_print_leaf(root, leaf);
4526                 printk(KERN_CRIT "not enough freespace need %u have %d\n",
4527                        total_size, btrfs_leaf_free_space(root, leaf));
4528                 BUG();
4529         }
4530
4531         if (slot != nritems) {
4532                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4533
4534                 if (old_data < data_end) {
4535                         btrfs_print_leaf(root, leaf);
4536                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4537                                slot, old_data, data_end);
4538                         BUG_ON(1);
4539                 }
4540                 /*
4541                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4542                  */
4543                 /* first correct the data pointers */
4544                 for (i = slot; i < nritems; i++) {
4545                         u32 ioff;
4546
4547                         item = btrfs_item_nr(leaf, i);
4548                         ioff = btrfs_token_item_offset(leaf, item, &token);
4549                         btrfs_set_token_item_offset(leaf, item,
4550                                                     ioff - total_data, &token);
4551                 }
4552                 /* shift the items */
4553                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4554                               btrfs_item_nr_offset(slot),
4555                               (nritems - slot) * sizeof(struct btrfs_item));
4556
4557                 /* shift the data */
4558                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4559                               data_end - total_data, btrfs_leaf_data(leaf) +
4560                               data_end, old_data - data_end);
4561                 data_end = old_data;
4562         }
4563
4564         /* setup the item for the new data */
4565         for (i = 0; i < nr; i++) {
4566                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4567                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4568                 item = btrfs_item_nr(leaf, slot + i);
4569                 btrfs_set_token_item_offset(leaf, item,
4570                                             data_end - data_size[i], &token);
4571                 data_end -= data_size[i];
4572                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4573         }
4574
4575         btrfs_set_header_nritems(leaf, nritems + nr);
4576
4577         if (slot == 0) {
4578                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4579                 fixup_low_keys(root, path, &disk_key, 1);
4580         }
4581         btrfs_unlock_up_safe(path, 1);
4582         btrfs_mark_buffer_dirty(leaf);
4583
4584         if (btrfs_leaf_free_space(root, leaf) < 0) {
4585                 btrfs_print_leaf(root, leaf);
4586                 BUG();
4587         }
4588 }
4589
4590 /*
4591  * Given a key and some data, insert items into the tree.
4592  * This does all the path init required, making room in the tree if needed.
4593  */
4594 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4595                             struct btrfs_root *root,
4596                             struct btrfs_path *path,
4597                             struct btrfs_key *cpu_key, u32 *data_size,
4598                             int nr)
4599 {
4600         int ret = 0;
4601         int slot;
4602         int i;
4603         u32 total_size = 0;
4604         u32 total_data = 0;
4605
4606         for (i = 0; i < nr; i++)
4607                 total_data += data_size[i];
4608
4609         total_size = total_data + (nr * sizeof(struct btrfs_item));
4610         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4611         if (ret == 0)
4612                 return -EEXIST;
4613         if (ret < 0)
4614                 return ret;
4615
4616         slot = path->slots[0];
4617         BUG_ON(slot < 0);
4618
4619         setup_items_for_insert(root, path, cpu_key, data_size,
4620                                total_data, total_size, nr);
4621         return 0;
4622 }
4623
4624 /*
4625  * Given a key and some data, insert an item into the tree.
4626  * This does all the path init required, making room in the tree if needed.
4627  */
4628 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4629                       *root, struct btrfs_key *cpu_key, void *data, u32
4630                       data_size)
4631 {
4632         int ret = 0;
4633         struct btrfs_path *path;
4634         struct extent_buffer *leaf;
4635         unsigned long ptr;
4636
4637         path = btrfs_alloc_path();
4638         if (!path)
4639                 return -ENOMEM;
4640         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4641         if (!ret) {
4642                 leaf = path->nodes[0];
4643                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4644                 write_extent_buffer(leaf, data, ptr, data_size);
4645                 btrfs_mark_buffer_dirty(leaf);
4646         }
4647         btrfs_free_path(path);
4648         return ret;
4649 }
4650
4651 /*
4652  * delete the pointer from a given node.
4653  *
4654  * the tree should have been previously balanced so the deletion does not
4655  * empty a node.
4656  */
4657 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4658                     int level, int slot)
4659 {
4660         struct extent_buffer *parent = path->nodes[level];
4661         u32 nritems;
4662         int ret;
4663
4664         nritems = btrfs_header_nritems(parent);
4665         if (slot != nritems - 1) {
4666                 if (level)
4667                         tree_mod_log_eb_move(root->fs_info, parent, slot,
4668                                              slot + 1, nritems - slot - 1);
4669                 memmove_extent_buffer(parent,
4670                               btrfs_node_key_ptr_offset(slot),
4671                               btrfs_node_key_ptr_offset(slot + 1),
4672                               sizeof(struct btrfs_key_ptr) *
4673                               (nritems - slot - 1));
4674         } else if (level) {
4675                 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4676                                               MOD_LOG_KEY_REMOVE);
4677                 BUG_ON(ret < 0);
4678         }
4679
4680         nritems--;
4681         btrfs_set_header_nritems(parent, nritems);
4682         if (nritems == 0 && parent == root->node) {
4683                 BUG_ON(btrfs_header_level(root->node) != 1);
4684                 /* just turn the root into a leaf and break */
4685                 btrfs_set_header_level(root->node, 0);
4686         } else if (slot == 0) {
4687                 struct btrfs_disk_key disk_key;
4688
4689                 btrfs_node_key(parent, &disk_key, 0);
4690                 fixup_low_keys(root, path, &disk_key, level + 1);
4691         }
4692         btrfs_mark_buffer_dirty(parent);
4693 }
4694
4695 /*
4696  * a helper function to delete the leaf pointed to by path->slots[1] and
4697  * path->nodes[1].
4698  *
4699  * This deletes the pointer in path->nodes[1] and frees the leaf
4700  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4701  *
4702  * The path must have already been setup for deleting the leaf, including
4703  * all the proper balancing.  path->nodes[1] must be locked.
4704  */
4705 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4706                                     struct btrfs_root *root,
4707                                     struct btrfs_path *path,
4708                                     struct extent_buffer *leaf)
4709 {
4710         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4711         del_ptr(root, path, 1, path->slots[1]);
4712
4713         /*
4714          * btrfs_free_extent is expensive, we want to make sure we
4715          * aren't holding any locks when we call it
4716          */
4717         btrfs_unlock_up_safe(path, 0);
4718
4719         root_sub_used(root, leaf->len);
4720
4721         extent_buffer_get(leaf);
4722         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4723         free_extent_buffer_stale(leaf);
4724 }
4725 /*
4726  * delete the item at the leaf level in path.  If that empties
4727  * the leaf, remove it from the tree
4728  */
4729 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4730                     struct btrfs_path *path, int slot, int nr)
4731 {
4732         struct extent_buffer *leaf;
4733         struct btrfs_item *item;
4734         int last_off;
4735         int dsize = 0;
4736         int ret = 0;
4737         int wret;
4738         int i;
4739         u32 nritems;
4740         struct btrfs_map_token token;
4741
4742         btrfs_init_map_token(&token);
4743
4744         leaf = path->nodes[0];
4745         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4746
4747         for (i = 0; i < nr; i++)
4748                 dsize += btrfs_item_size_nr(leaf, slot + i);
4749
4750         nritems = btrfs_header_nritems(leaf);
4751
4752         if (slot + nr != nritems) {
4753                 int data_end = leaf_data_end(root, leaf);
4754
4755                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4756                               data_end + dsize,
4757                               btrfs_leaf_data(leaf) + data_end,
4758                               last_off - data_end);
4759
4760                 for (i = slot + nr; i < nritems; i++) {
4761                         u32 ioff;
4762
4763                         item = btrfs_item_nr(leaf, i);
4764                         ioff = btrfs_token_item_offset(leaf, item, &token);
4765                         btrfs_set_token_item_offset(leaf, item,
4766                                                     ioff + dsize, &token);
4767                 }
4768
4769                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4770                               btrfs_item_nr_offset(slot + nr),
4771                               sizeof(struct btrfs_item) *
4772                               (nritems - slot - nr));
4773         }
4774         btrfs_set_header_nritems(leaf, nritems - nr);
4775         nritems -= nr;
4776
4777         /* delete the leaf if we've emptied it */
4778         if (nritems == 0) {
4779                 if (leaf == root->node) {
4780                         btrfs_set_header_level(leaf, 0);
4781                 } else {
4782                         btrfs_set_path_blocking(path);
4783                         clean_tree_block(trans, root, leaf);
4784                         btrfs_del_leaf(trans, root, path, leaf);
4785                 }
4786         } else {
4787                 int used = leaf_space_used(leaf, 0, nritems);
4788                 if (slot == 0) {
4789                         struct btrfs_disk_key disk_key;
4790
4791                         btrfs_item_key(leaf, &disk_key, 0);
4792                         fixup_low_keys(root, path, &disk_key, 1);
4793                 }
4794
4795                 /* delete the leaf if it is mostly empty */
4796                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4797                         /* push_leaf_left fixes the path.
4798                          * make sure the path still points to our leaf
4799                          * for possible call to del_ptr below
4800                          */
4801                         slot = path->slots[1];
4802                         extent_buffer_get(leaf);
4803
4804                         btrfs_set_path_blocking(path);
4805                         wret = push_leaf_left(trans, root, path, 1, 1,
4806                                               1, (u32)-1);
4807                         if (wret < 0 && wret != -ENOSPC)
4808                                 ret = wret;
4809
4810                         if (path->nodes[0] == leaf &&
4811                             btrfs_header_nritems(leaf)) {
4812                                 wret = push_leaf_right(trans, root, path, 1,
4813                                                        1, 1, 0);
4814                                 if (wret < 0 && wret != -ENOSPC)
4815                                         ret = wret;
4816                         }
4817
4818                         if (btrfs_header_nritems(leaf) == 0) {
4819                                 path->slots[1] = slot;
4820                                 btrfs_del_leaf(trans, root, path, leaf);
4821                                 free_extent_buffer(leaf);
4822                                 ret = 0;
4823                         } else {
4824                                 /* if we're still in the path, make sure
4825                                  * we're dirty.  Otherwise, one of the
4826                                  * push_leaf functions must have already
4827                                  * dirtied this buffer
4828                                  */
4829                                 if (path->nodes[0] == leaf)
4830                                         btrfs_mark_buffer_dirty(leaf);
4831                                 free_extent_buffer(leaf);
4832                         }
4833                 } else {
4834                         btrfs_mark_buffer_dirty(leaf);
4835                 }
4836         }
4837         return ret;
4838 }
4839
4840 /*
4841  * search the tree again to find a leaf with lesser keys
4842  * returns 0 if it found something or 1 if there are no lesser leaves.
4843  * returns < 0 on io errors.
4844  *
4845  * This may release the path, and so you may lose any locks held at the
4846  * time you call it.
4847  */
4848 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4849 {
4850         struct btrfs_key key;
4851         struct btrfs_disk_key found_key;
4852         int ret;
4853
4854         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4855
4856         if (key.offset > 0)
4857                 key.offset--;
4858         else if (key.type > 0)
4859                 key.type--;
4860         else if (key.objectid > 0)
4861                 key.objectid--;
4862         else
4863                 return 1;
4864
4865         btrfs_release_path(path);
4866         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4867         if (ret < 0)
4868                 return ret;
4869         btrfs_item_key(path->nodes[0], &found_key, 0);
4870         ret = comp_keys(&found_key, &key);
4871         if (ret < 0)
4872                 return 0;
4873         return 1;
4874 }
4875
4876 /*
4877  * A helper function to walk down the tree starting at min_key, and looking
4878  * for nodes or leaves that are have a minimum transaction id.
4879  * This is used by the btree defrag code, and tree logging
4880  *
4881  * This does not cow, but it does stuff the starting key it finds back
4882  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4883  * key and get a writable path.
4884  *
4885  * This does lock as it descends, and path->keep_locks should be set
4886  * to 1 by the caller.
4887  *
4888  * This honors path->lowest_level to prevent descent past a given level
4889  * of the tree.
4890  *
4891  * min_trans indicates the oldest transaction that you are interested
4892  * in walking through.  Any nodes or leaves older than min_trans are
4893  * skipped over (without reading them).
4894  *
4895  * returns zero if something useful was found, < 0 on error and 1 if there
4896  * was nothing in the tree that matched the search criteria.
4897  */
4898 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4899                          struct btrfs_key *max_key,
4900                          struct btrfs_path *path,
4901                          u64 min_trans)
4902 {
4903         struct extent_buffer *cur;
4904         struct btrfs_key found_key;
4905         int slot;
4906         int sret;
4907         u32 nritems;
4908         int level;
4909         int ret = 1;
4910
4911         WARN_ON(!path->keep_locks);
4912 again:
4913         cur = btrfs_read_lock_root_node(root);
4914         level = btrfs_header_level(cur);
4915         WARN_ON(path->nodes[level]);
4916         path->nodes[level] = cur;
4917         path->locks[level] = BTRFS_READ_LOCK;
4918
4919         if (btrfs_header_generation(cur) < min_trans) {
4920                 ret = 1;
4921                 goto out;
4922         }
4923         while (1) {
4924                 nritems = btrfs_header_nritems(cur);
4925                 level = btrfs_header_level(cur);
4926                 sret = bin_search(cur, min_key, level, &slot);
4927
4928                 /* at the lowest level, we're done, setup the path and exit */
4929                 if (level == path->lowest_level) {
4930                         if (slot >= nritems)
4931                                 goto find_next_key;
4932                         ret = 0;
4933                         path->slots[level] = slot;
4934                         btrfs_item_key_to_cpu(cur, &found_key, slot);
4935                         goto out;
4936                 }
4937                 if (sret && slot > 0)
4938                         slot--;
4939                 /*
4940                  * check this node pointer against the min_trans parameters.
4941                  * If it is too old, old, skip to the next one.
4942                  */
4943                 while (slot < nritems) {
4944                         u64 blockptr;
4945                         u64 gen;
4946
4947                         blockptr = btrfs_node_blockptr(cur, slot);
4948                         gen = btrfs_node_ptr_generation(cur, slot);
4949                         if (gen < min_trans) {
4950                                 slot++;
4951                                 continue;
4952                         }
4953                         break;
4954                 }
4955 find_next_key:
4956                 /*
4957                  * we didn't find a candidate key in this node, walk forward
4958                  * and find another one
4959                  */
4960                 if (slot >= nritems) {
4961                         path->slots[level] = slot;
4962                         btrfs_set_path_blocking(path);
4963                         sret = btrfs_find_next_key(root, path, min_key, level,
4964                                                   min_trans);
4965                         if (sret == 0) {
4966                                 btrfs_release_path(path);
4967                                 goto again;
4968                         } else {
4969                                 goto out;
4970                         }
4971                 }
4972                 /* save our key for returning back */
4973                 btrfs_node_key_to_cpu(cur, &found_key, slot);
4974                 path->slots[level] = slot;
4975                 if (level == path->lowest_level) {
4976                         ret = 0;
4977                         unlock_up(path, level, 1, 0, NULL);
4978                         goto out;
4979                 }
4980                 btrfs_set_path_blocking(path);
4981                 cur = read_node_slot(root, cur, slot);
4982                 BUG_ON(!cur); /* -ENOMEM */
4983
4984                 btrfs_tree_read_lock(cur);
4985
4986                 path->locks[level - 1] = BTRFS_READ_LOCK;
4987                 path->nodes[level - 1] = cur;
4988                 unlock_up(path, level, 1, 0, NULL);
4989                 btrfs_clear_path_blocking(path, NULL, 0);
4990         }
4991 out:
4992         if (ret == 0)
4993                 memcpy(min_key, &found_key, sizeof(found_key));
4994         btrfs_set_path_blocking(path);
4995         return ret;
4996 }
4997
4998 static void tree_move_down(struct btrfs_root *root,
4999                            struct btrfs_path *path,
5000                            int *level, int root_level)
5001 {
5002         BUG_ON(*level == 0);
5003         path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5004                                         path->slots[*level]);
5005         path->slots[*level - 1] = 0;
5006         (*level)--;
5007 }
5008
5009 static int tree_move_next_or_upnext(struct btrfs_root *root,
5010                                     struct btrfs_path *path,
5011                                     int *level, int root_level)
5012 {
5013         int ret = 0;
5014         int nritems;
5015         nritems = btrfs_header_nritems(path->nodes[*level]);
5016
5017         path->slots[*level]++;
5018
5019         while (path->slots[*level] >= nritems) {
5020                 if (*level == root_level)
5021                         return -1;
5022
5023                 /* move upnext */
5024                 path->slots[*level] = 0;
5025                 free_extent_buffer(path->nodes[*level]);
5026                 path->nodes[*level] = NULL;
5027                 (*level)++;
5028                 path->slots[*level]++;
5029
5030                 nritems = btrfs_header_nritems(path->nodes[*level]);
5031                 ret = 1;
5032         }
5033         return ret;
5034 }
5035
5036 /*
5037  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5038  * or down.
5039  */
5040 static int tree_advance(struct btrfs_root *root,
5041                         struct btrfs_path *path,
5042                         int *level, int root_level,
5043                         int allow_down,
5044                         struct btrfs_key *key)
5045 {
5046         int ret;
5047
5048         if (*level == 0 || !allow_down) {
5049                 ret = tree_move_next_or_upnext(root, path, level, root_level);
5050         } else {
5051                 tree_move_down(root, path, level, root_level);
5052                 ret = 0;
5053         }
5054         if (ret >= 0) {
5055                 if (*level == 0)
5056                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5057                                         path->slots[*level]);
5058                 else
5059                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5060                                         path->slots[*level]);
5061         }
5062         return ret;
5063 }
5064
5065 static int tree_compare_item(struct btrfs_root *left_root,
5066                              struct btrfs_path *left_path,
5067                              struct btrfs_path *right_path,
5068                              char *tmp_buf)
5069 {
5070         int cmp;
5071         int len1, len2;
5072         unsigned long off1, off2;
5073
5074         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5075         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5076         if (len1 != len2)
5077                 return 1;
5078
5079         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5080         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5081                                 right_path->slots[0]);
5082
5083         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5084
5085         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5086         if (cmp)
5087                 return 1;
5088         return 0;
5089 }
5090
5091 #define ADVANCE 1
5092 #define ADVANCE_ONLY_NEXT -1
5093
5094 /*
5095  * This function compares two trees and calls the provided callback for
5096  * every changed/new/deleted item it finds.
5097  * If shared tree blocks are encountered, whole subtrees are skipped, making
5098  * the compare pretty fast on snapshotted subvolumes.
5099  *
5100  * This currently works on commit roots only. As commit roots are read only,
5101  * we don't do any locking. The commit roots are protected with transactions.
5102  * Transactions are ended and rejoined when a commit is tried in between.
5103  *
5104  * This function checks for modifications done to the trees while comparing.
5105  * If it detects a change, it aborts immediately.
5106  */
5107 int btrfs_compare_trees(struct btrfs_root *left_root,
5108                         struct btrfs_root *right_root,
5109                         btrfs_changed_cb_t changed_cb, void *ctx)
5110 {
5111         int ret;
5112         int cmp;
5113         struct btrfs_trans_handle *trans = NULL;
5114         struct btrfs_path *left_path = NULL;
5115         struct btrfs_path *right_path = NULL;
5116         struct btrfs_key left_key;
5117         struct btrfs_key right_key;
5118         char *tmp_buf = NULL;
5119         int left_root_level;
5120         int right_root_level;
5121         int left_level;
5122         int right_level;
5123         int left_end_reached;
5124         int right_end_reached;
5125         int advance_left;
5126         int advance_right;
5127         u64 left_blockptr;
5128         u64 right_blockptr;
5129         u64 left_start_ctransid;
5130         u64 right_start_ctransid;
5131         u64 ctransid;
5132
5133         left_path = btrfs_alloc_path();
5134         if (!left_path) {
5135                 ret = -ENOMEM;
5136                 goto out;
5137         }
5138         right_path = btrfs_alloc_path();
5139         if (!right_path) {
5140                 ret = -ENOMEM;
5141                 goto out;
5142         }
5143
5144         tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5145         if (!tmp_buf) {
5146                 ret = -ENOMEM;
5147                 goto out;
5148         }
5149
5150         left_path->search_commit_root = 1;
5151         left_path->skip_locking = 1;
5152         right_path->search_commit_root = 1;
5153         right_path->skip_locking = 1;
5154
5155         spin_lock(&left_root->root_item_lock);
5156         left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5157         spin_unlock(&left_root->root_item_lock);
5158
5159         spin_lock(&right_root->root_item_lock);
5160         right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5161         spin_unlock(&right_root->root_item_lock);
5162
5163         trans = btrfs_join_transaction(left_root);
5164         if (IS_ERR(trans)) {
5165                 ret = PTR_ERR(trans);
5166                 trans = NULL;
5167                 goto out;
5168         }
5169
5170         /*
5171          * Strategy: Go to the first items of both trees. Then do
5172          *
5173          * If both trees are at level 0
5174          *   Compare keys of current items
5175          *     If left < right treat left item as new, advance left tree
5176          *       and repeat
5177          *     If left > right treat right item as deleted, advance right tree
5178          *       and repeat
5179          *     If left == right do deep compare of items, treat as changed if
5180          *       needed, advance both trees and repeat
5181          * If both trees are at the same level but not at level 0
5182          *   Compare keys of current nodes/leafs
5183          *     If left < right advance left tree and repeat
5184          *     If left > right advance right tree and repeat
5185          *     If left == right compare blockptrs of the next nodes/leafs
5186          *       If they match advance both trees but stay at the same level
5187          *         and repeat
5188          *       If they don't match advance both trees while allowing to go
5189          *         deeper and repeat
5190          * If tree levels are different
5191          *   Advance the tree that needs it and repeat
5192          *
5193          * Advancing a tree means:
5194          *   If we are at level 0, try to go to the next slot. If that's not
5195          *   possible, go one level up and repeat. Stop when we found a level
5196          *   where we could go to the next slot. We may at this point be on a
5197          *   node or a leaf.
5198          *
5199          *   If we are not at level 0 and not on shared tree blocks, go one
5200          *   level deeper.
5201          *
5202          *   If we are not at level 0 and on shared tree blocks, go one slot to
5203          *   the right if possible or go up and right.
5204          */
5205
5206         left_level = btrfs_header_level(left_root->commit_root);
5207         left_root_level = left_level;
5208         left_path->nodes[left_level] = left_root->commit_root;
5209         extent_buffer_get(left_path->nodes[left_level]);
5210
5211         right_level = btrfs_header_level(right_root->commit_root);
5212         right_root_level = right_level;
5213         right_path->nodes[right_level] = right_root->commit_root;
5214         extent_buffer_get(right_path->nodes[right_level]);
5215
5216         if (left_level == 0)
5217                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5218                                 &left_key, left_path->slots[left_level]);
5219         else
5220                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5221                                 &left_key, left_path->slots[left_level]);
5222         if (right_level == 0)
5223                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5224                                 &right_key, right_path->slots[right_level]);
5225         else
5226                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5227                                 &right_key, right_path->slots[right_level]);
5228
5229         left_end_reached = right_end_reached = 0;
5230         advance_left = advance_right = 0;
5231
5232         while (1) {
5233                 /*
5234                  * We need to make sure the transaction does not get committed
5235                  * while we do anything on commit roots. This means, we need to
5236                  * join and leave transactions for every item that we process.
5237                  */
5238                 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5239                         btrfs_release_path(left_path);
5240                         btrfs_release_path(right_path);
5241
5242                         ret = btrfs_end_transaction(trans, left_root);
5243                         trans = NULL;
5244                         if (ret < 0)
5245                                 goto out;
5246                 }
5247                 /* now rejoin the transaction */
5248                 if (!trans) {
5249                         trans = btrfs_join_transaction(left_root);
5250                         if (IS_ERR(trans)) {
5251                                 ret = PTR_ERR(trans);
5252                                 trans = NULL;
5253                                 goto out;
5254                         }
5255
5256                         spin_lock(&left_root->root_item_lock);
5257                         ctransid = btrfs_root_ctransid(&left_root->root_item);
5258                         spin_unlock(&left_root->root_item_lock);
5259                         if (ctransid != left_start_ctransid)
5260                                 left_start_ctransid = 0;
5261
5262                         spin_lock(&right_root->root_item_lock);
5263                         ctransid = btrfs_root_ctransid(&right_root->root_item);
5264                         spin_unlock(&right_root->root_item_lock);
5265                         if (ctransid != right_start_ctransid)
5266                                 right_start_ctransid = 0;
5267
5268                         if (!left_start_ctransid || !right_start_ctransid) {
5269                                 WARN(1, KERN_WARNING
5270                                         "btrfs: btrfs_compare_tree detected "
5271                                         "a change in one of the trees while "
5272                                         "iterating. This is probably a "
5273                                         "bug.\n");
5274                                 ret = -EIO;
5275                                 goto out;
5276                         }
5277
5278                         /*
5279                          * the commit root may have changed, so start again
5280                          * where we stopped
5281                          */
5282                         left_path->lowest_level = left_level;
5283                         right_path->lowest_level = right_level;
5284                         ret = btrfs_search_slot(NULL, left_root,
5285                                         &left_key, left_path, 0, 0);
5286                         if (ret < 0)
5287                                 goto out;
5288                         ret = btrfs_search_slot(NULL, right_root,
5289                                         &right_key, right_path, 0, 0);
5290                         if (ret < 0)
5291                                 goto out;
5292                 }
5293
5294                 if (advance_left && !left_end_reached) {
5295                         ret = tree_advance(left_root, left_path, &left_level,
5296                                         left_root_level,
5297                                         advance_left != ADVANCE_ONLY_NEXT,
5298                                         &left_key);
5299                         if (ret < 0)
5300                                 left_end_reached = ADVANCE;
5301                         advance_left = 0;
5302                 }
5303                 if (advance_right && !right_end_reached) {
5304                         ret = tree_advance(right_root, right_path, &right_level,
5305                                         right_root_level,
5306                                         advance_right != ADVANCE_ONLY_NEXT,
5307                                         &right_key);
5308                         if (ret < 0)
5309                                 right_end_reached = ADVANCE;
5310                         advance_right = 0;
5311                 }
5312
5313                 if (left_end_reached && right_end_reached) {
5314                         ret = 0;
5315                         goto out;
5316                 } else if (left_end_reached) {
5317                         if (right_level == 0) {
5318                                 ret = changed_cb(left_root, right_root,
5319                                                 left_path, right_path,
5320                                                 &right_key,
5321                                                 BTRFS_COMPARE_TREE_DELETED,
5322                                                 ctx);
5323                                 if (ret < 0)
5324                                         goto out;
5325                         }
5326                         advance_right = ADVANCE;
5327                         continue;
5328                 } else if (right_end_reached) {
5329                         if (left_level == 0) {
5330                                 ret = changed_cb(left_root, right_root,
5331                                                 left_path, right_path,
5332                                                 &left_key,
5333                                                 BTRFS_COMPARE_TREE_NEW,
5334                                                 ctx);
5335                                 if (ret < 0)
5336                                         goto out;
5337                         }
5338                         advance_left = ADVANCE;
5339                         continue;
5340                 }
5341
5342                 if (left_level == 0 && right_level == 0) {
5343                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5344                         if (cmp < 0) {
5345                                 ret = changed_cb(left_root, right_root,
5346                                                 left_path, right_path,
5347                                                 &left_key,
5348                                                 BTRFS_COMPARE_TREE_NEW,
5349                                                 ctx);
5350                                 if (ret < 0)
5351                                         goto out;
5352                                 advance_left = ADVANCE;
5353                         } else if (cmp > 0) {
5354                                 ret = changed_cb(left_root, right_root,
5355                                                 left_path, right_path,
5356                                                 &right_key,
5357                                                 BTRFS_COMPARE_TREE_DELETED,
5358                                                 ctx);
5359                                 if (ret < 0)
5360                                         goto out;
5361                                 advance_right = ADVANCE;
5362                         } else {
5363                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5364                                 ret = tree_compare_item(left_root, left_path,
5365                                                 right_path, tmp_buf);
5366                                 if (ret) {
5367                                         WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5368                                         ret = changed_cb(left_root, right_root,
5369                                                 left_path, right_path,
5370                                                 &left_key,
5371                                                 BTRFS_COMPARE_TREE_CHANGED,
5372                                                 ctx);
5373                                         if (ret < 0)
5374                                                 goto out;
5375                                 }
5376                                 advance_left = ADVANCE;
5377                                 advance_right = ADVANCE;
5378                         }
5379                 } else if (left_level == right_level) {
5380                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5381                         if (cmp < 0) {
5382                                 advance_left = ADVANCE;
5383                         } else if (cmp > 0) {
5384                                 advance_right = ADVANCE;
5385                         } else {
5386                                 left_blockptr = btrfs_node_blockptr(
5387                                                 left_path->nodes[left_level],
5388                                                 left_path->slots[left_level]);
5389                                 right_blockptr = btrfs_node_blockptr(
5390                                                 right_path->nodes[right_level],
5391                                                 right_path->slots[right_level]);
5392                                 if (left_blockptr == right_blockptr) {
5393                                         /*
5394                                          * As we're on a shared block, don't
5395                                          * allow to go deeper.
5396                                          */
5397                                         advance_left = ADVANCE_ONLY_NEXT;
5398                                         advance_right = ADVANCE_ONLY_NEXT;
5399                                 } else {
5400                                         advance_left = ADVANCE;
5401                                         advance_right = ADVANCE;
5402                                 }
5403                         }
5404                 } else if (left_level < right_level) {
5405                         advance_right = ADVANCE;
5406                 } else {
5407                         advance_left = ADVANCE;
5408                 }
5409         }
5410
5411 out:
5412         btrfs_free_path(left_path);
5413         btrfs_free_path(right_path);
5414         kfree(tmp_buf);
5415
5416         if (trans) {
5417                 if (!ret)
5418                         ret = btrfs_end_transaction(trans, left_root);
5419                 else
5420                         btrfs_end_transaction(trans, left_root);
5421         }
5422
5423         return ret;
5424 }
5425
5426 /*
5427  * this is similar to btrfs_next_leaf, but does not try to preserve
5428  * and fixup the path.  It looks for and returns the next key in the
5429  * tree based on the current path and the min_trans parameters.
5430  *
5431  * 0 is returned if another key is found, < 0 if there are any errors
5432  * and 1 is returned if there are no higher keys in the tree
5433  *
5434  * path->keep_locks should be set to 1 on the search made before
5435  * calling this function.
5436  */
5437 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5438                         struct btrfs_key *key, int level, u64 min_trans)
5439 {
5440         int slot;
5441         struct extent_buffer *c;
5442
5443         WARN_ON(!path->keep_locks);
5444         while (level < BTRFS_MAX_LEVEL) {
5445                 if (!path->nodes[level])
5446                         return 1;
5447
5448                 slot = path->slots[level] + 1;
5449                 c = path->nodes[level];
5450 next:
5451                 if (slot >= btrfs_header_nritems(c)) {
5452                         int ret;
5453                         int orig_lowest;
5454                         struct btrfs_key cur_key;
5455                         if (level + 1 >= BTRFS_MAX_LEVEL ||
5456                             !path->nodes[level + 1])
5457                                 return 1;
5458
5459                         if (path->locks[level + 1]) {
5460                                 level++;
5461                                 continue;
5462                         }
5463
5464                         slot = btrfs_header_nritems(c) - 1;
5465                         if (level == 0)
5466                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
5467                         else
5468                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
5469
5470                         orig_lowest = path->lowest_level;
5471                         btrfs_release_path(path);
5472                         path->lowest_level = level;
5473                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
5474                                                 0, 0);
5475                         path->lowest_level = orig_lowest;
5476                         if (ret < 0)
5477                                 return ret;
5478
5479                         c = path->nodes[level];
5480                         slot = path->slots[level];
5481                         if (ret == 0)
5482                                 slot++;
5483                         goto next;
5484                 }
5485
5486                 if (level == 0)
5487                         btrfs_item_key_to_cpu(c, key, slot);
5488                 else {
5489                         u64 gen = btrfs_node_ptr_generation(c, slot);
5490
5491                         if (gen < min_trans) {
5492                                 slot++;
5493                                 goto next;
5494                         }
5495                         btrfs_node_key_to_cpu(c, key, slot);
5496                 }
5497                 return 0;
5498         }
5499         return 1;
5500 }
5501
5502 /*
5503  * search the tree again to find a leaf with greater keys
5504  * returns 0 if it found something or 1 if there are no greater leaves.
5505  * returns < 0 on io errors.
5506  */
5507 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5508 {
5509         return btrfs_next_old_leaf(root, path, 0);
5510 }
5511
5512 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5513                         u64 time_seq)
5514 {
5515         int slot;
5516         int level;
5517         struct extent_buffer *c;
5518         struct extent_buffer *next;
5519         struct btrfs_key key;
5520         u32 nritems;
5521         int ret;
5522         int old_spinning = path->leave_spinning;
5523         int next_rw_lock = 0;
5524
5525         nritems = btrfs_header_nritems(path->nodes[0]);
5526         if (nritems == 0)
5527                 return 1;
5528
5529         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5530 again:
5531         level = 1;
5532         next = NULL;
5533         next_rw_lock = 0;
5534         btrfs_release_path(path);
5535
5536         path->keep_locks = 1;
5537         path->leave_spinning = 1;
5538
5539         if (time_seq)
5540                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5541         else
5542                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5543         path->keep_locks = 0;
5544
5545         if (ret < 0)
5546                 return ret;
5547
5548         nritems = btrfs_header_nritems(path->nodes[0]);
5549         /*
5550          * by releasing the path above we dropped all our locks.  A balance
5551          * could have added more items next to the key that used to be
5552          * at the very end of the block.  So, check again here and
5553          * advance the path if there are now more items available.
5554          */
5555         if (nritems > 0 && path->slots[0] < nritems - 1) {
5556                 if (ret == 0)
5557                         path->slots[0]++;
5558                 ret = 0;
5559                 goto done;
5560         }
5561
5562         while (level < BTRFS_MAX_LEVEL) {
5563                 if (!path->nodes[level]) {
5564                         ret = 1;
5565                         goto done;
5566                 }
5567
5568                 slot = path->slots[level] + 1;
5569                 c = path->nodes[level];
5570                 if (slot >= btrfs_header_nritems(c)) {
5571                         level++;
5572                         if (level == BTRFS_MAX_LEVEL) {
5573                                 ret = 1;
5574                                 goto done;
5575                         }
5576                         continue;
5577                 }
5578
5579                 if (next) {
5580                         btrfs_tree_unlock_rw(next, next_rw_lock);
5581                         free_extent_buffer(next);
5582                 }
5583
5584                 next = c;
5585                 next_rw_lock = path->locks[level];
5586                 ret = read_block_for_search(NULL, root, path, &next, level,
5587                                             slot, &key, 0);
5588                 if (ret == -EAGAIN)
5589                         goto again;
5590
5591                 if (ret < 0) {
5592                         btrfs_release_path(path);
5593                         goto done;
5594                 }
5595
5596                 if (!path->skip_locking) {
5597                         ret = btrfs_try_tree_read_lock(next);
5598                         if (!ret && time_seq) {
5599                                 /*
5600                                  * If we don't get the lock, we may be racing
5601                                  * with push_leaf_left, holding that lock while
5602                                  * itself waiting for the leaf we've currently
5603                                  * locked. To solve this situation, we give up
5604                                  * on our lock and cycle.
5605                                  */
5606                                 free_extent_buffer(next);
5607                                 btrfs_release_path(path);
5608                                 cond_resched();
5609                                 goto again;
5610                         }
5611                         if (!ret) {
5612                                 btrfs_set_path_blocking(path);
5613                                 btrfs_tree_read_lock(next);
5614                                 btrfs_clear_path_blocking(path, next,
5615                                                           BTRFS_READ_LOCK);
5616                         }
5617                         next_rw_lock = BTRFS_READ_LOCK;
5618                 }
5619                 break;
5620         }
5621         path->slots[level] = slot;
5622         while (1) {
5623                 level--;
5624                 c = path->nodes[level];
5625                 if (path->locks[level])
5626                         btrfs_tree_unlock_rw(c, path->locks[level]);
5627
5628                 free_extent_buffer(c);
5629                 path->nodes[level] = next;
5630                 path->slots[level] = 0;
5631                 if (!path->skip_locking)
5632                         path->locks[level] = next_rw_lock;
5633                 if (!level)
5634                         break;
5635
5636                 ret = read_block_for_search(NULL, root, path, &next, level,
5637                                             0, &key, 0);
5638                 if (ret == -EAGAIN)
5639                         goto again;
5640
5641                 if (ret < 0) {
5642                         btrfs_release_path(path);
5643                         goto done;
5644                 }
5645
5646                 if (!path->skip_locking) {
5647                         ret = btrfs_try_tree_read_lock(next);
5648                         if (!ret) {
5649                                 btrfs_set_path_blocking(path);
5650                                 btrfs_tree_read_lock(next);
5651                                 btrfs_clear_path_blocking(path, next,
5652                                                           BTRFS_READ_LOCK);
5653                         }
5654                         next_rw_lock = BTRFS_READ_LOCK;
5655                 }
5656         }
5657         ret = 0;
5658 done:
5659         unlock_up(path, 0, 1, 0, NULL);
5660         path->leave_spinning = old_spinning;
5661         if (!old_spinning)
5662                 btrfs_set_path_blocking(path);
5663
5664         return ret;
5665 }
5666
5667 /*
5668  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5669  * searching until it gets past min_objectid or finds an item of 'type'
5670  *
5671  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5672  */
5673 int btrfs_previous_item(struct btrfs_root *root,
5674                         struct btrfs_path *path, u64 min_objectid,
5675                         int type)
5676 {
5677         struct btrfs_key found_key;
5678         struct extent_buffer *leaf;
5679         u32 nritems;
5680         int ret;
5681
5682         while (1) {
5683                 if (path->slots[0] == 0) {
5684                         btrfs_set_path_blocking(path);
5685                         ret = btrfs_prev_leaf(root, path);
5686                         if (ret != 0)
5687                                 return ret;
5688                 } else {
5689                         path->slots[0]--;
5690                 }
5691                 leaf = path->nodes[0];
5692                 nritems = btrfs_header_nritems(leaf);
5693                 if (nritems == 0)
5694                         return 1;
5695                 if (path->slots[0] == nritems)
5696                         path->slots[0]--;
5697
5698                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5699                 if (found_key.objectid < min_objectid)
5700                         break;
5701                 if (found_key.type == type)
5702                         return 0;
5703                 if (found_key.objectid == min_objectid &&
5704                     found_key.type < type)
5705                         break;
5706         }
5707         return 1;
5708 }