Merge branch 'topic/ppc-kvm' of https://git.kernel.org/pub/scm/linux/kernel/git/power...
[linux-2.6-microblaze.git] / fs / btrfs / tree-mod-log.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "tree-mod-log.h"
4 #include "disk-io.h"
5
/* Snapshot of the replaced root recorded by a BTRFS_MOD_LOG_ROOT_REPLACE entry. */
struct tree_mod_root {
	/* Logical address of the old root node. */
	u64 logical;
	/* Level of the old root node. */
	u8 level;
};
10
/*
 * One logged modification of a tree block. Entries live in
 * fs_info->tree_mod_log, an rb-tree ordered by (logical, seq).
 */
struct tree_mod_elem {
	struct rb_node node;
	/* Logical address of the tree block the operation applies to. */
	u64 logical;
	/* Sequence number assigned when the element is inserted in the log. */
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};
39
/*
 * Pull a new tree mod seq number for our operation.
 *
 * The counter is a shared atomic that only ever increments, so every caller
 * gets a unique, monotonically increasing sequence number.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
47
48 /*
49  * This adds a new blocker to the tree mod log's blocker list if the @elem
50  * passed does not already have a sequence number set. So when a caller expects
51  * to record tree modifications, it should ensure to set elem->seq to zero
52  * before calling btrfs_get_tree_mod_seq.
53  * Returns a fresh, unused tree log modification sequence number, even if no new
54  * blocker was added.
55  */
56 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
57                            struct btrfs_seq_list *elem)
58 {
59         write_lock(&fs_info->tree_mod_log_lock);
60         if (!elem->seq) {
61                 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
62                 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
63                 set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
64         }
65         write_unlock(&fs_info->tree_mod_log_lock);
66
67         return elem->seq;
68 }
69
/*
 * Drop a blocker previously registered with btrfs_get_tree_mod_seq() and
 * garbage collect all log entries that no remaining blocker can still see.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	/* A zero seq means @elem was never registered, nothing to drop. */
	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		/* Last user gone, stop logging tree modifications. */
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* Grab the successor before erasing the current node. */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
120
121 /*
122  * Key order of the log:
123  *       node/leaf start address -> sequence
124  *
125  * The 'start address' is the logical address of the *new* root node for root
126  * replace operations, or the logical address of the affected block for all
127  * other operations.
128  */
129 static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
130                                         struct tree_mod_elem *tm)
131 {
132         struct rb_root *tm_root;
133         struct rb_node **new;
134         struct rb_node *parent = NULL;
135         struct tree_mod_elem *cur;
136
137         lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
138
139         tm->seq = btrfs_inc_tree_mod_seq(fs_info);
140
141         tm_root = &fs_info->tree_mod_log;
142         new = &tm_root->rb_node;
143         while (*new) {
144                 cur = rb_entry(*new, struct tree_mod_elem, node);
145                 parent = *new;
146                 if (cur->logical < tm->logical)
147                         new = &((*new)->rb_left);
148                 else if (cur->logical > tm->logical)
149                         new = &((*new)->rb_right);
150                 else if (cur->seq < tm->seq)
151                         new = &((*new)->rb_left);
152                 else if (cur->seq > tm->seq)
153                         new = &((*new)->rb_right);
154                 else
155                         return -EEXIST;
156         }
157
158         rb_link_node(&tm->node, parent, new);
159         rb_insert_color(&tm->node, tm_root);
160         return 0;
161 }
162
163 /*
164  * Determines if logging can be omitted. Returns true if it can. Otherwise, it
165  * returns false with the tree_mod_log_lock acquired. The caller must hold
166  * this until all tree mod log insertions are recorded in the rb tree and then
167  * write unlock fs_info::tree_mod_log_lock.
168  */
169 static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
170                                     struct extent_buffer *eb)
171 {
172         if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
173                 return true;
174         if (eb && btrfs_header_level(eb) == 0)
175                 return true;
176
177         write_lock(&fs_info->tree_mod_log_lock);
178         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
179                 write_unlock(&fs_info->tree_mod_log_lock);
180                 return true;
181         }
182
183         return false;
184 }
185
186 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
187 static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
188                                     struct extent_buffer *eb)
189 {
190         if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
191                 return false;
192         if (eb && btrfs_header_level(eb) == 0)
193                 return false;
194
195         return true;
196 }
197
198 static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
199                                                  int slot,
200                                                  enum btrfs_mod_log_op op,
201                                                  gfp_t flags)
202 {
203         struct tree_mod_elem *tm;
204
205         tm = kzalloc(sizeof(*tm), flags);
206         if (!tm)
207                 return NULL;
208
209         tm->logical = eb->start;
210         if (op != BTRFS_MOD_LOG_KEY_ADD) {
211                 btrfs_node_key(eb, &tm->key, slot);
212                 tm->blockptr = btrfs_node_blockptr(eb, slot);
213         }
214         tm->op = op;
215         tm->slot = slot;
216         tm->generation = btrfs_node_ptr_generation(eb, slot);
217         RB_CLEAR_NODE(&tm->node);
218
219         return tm;
220 }
221
222 int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
223                                   enum btrfs_mod_log_op op, gfp_t flags)
224 {
225         struct tree_mod_elem *tm;
226         int ret;
227
228         if (!tree_mod_need_log(eb->fs_info, eb))
229                 return 0;
230
231         tm = alloc_tree_mod_elem(eb, slot, op, flags);
232         if (!tm)
233                 return -ENOMEM;
234
235         if (tree_mod_dont_log(eb->fs_info, eb)) {
236                 kfree(tm);
237                 return 0;
238         }
239
240         ret = tree_mod_log_insert(eb->fs_info, tm);
241         write_unlock(&eb->fs_info->tree_mod_log_lock);
242         if (ret)
243                 kfree(tm);
244
245         return ret;
246 }
247
/*
 * Log the move of @nr_items key pointers within @eb from @src_slot to
 * @dst_slot. When the move goes towards the start of the buffer (dst_slot <
 * src_slot), the slots that get overwritten are additionally logged as
 * removals so a rewind can restore them.
 *
 * Returns 0 on success (or when logging is not needed) and a negative errno
 * on failure.
 */
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;

	/* Allocate all elements up front, before taking the log lock. */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = true;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Undo any insertions already made, then free all allocations. */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
322
/*
 * Insert the per-slot removal entries of a node being freed, highest slot
 * first. On failure, entries already inserted by this call are erased from
 * the log again before the error is returned; the caller remains responsible
 * for freeing the elements themselves.
 */
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* Roll back the insertions done so far (slots > i). */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
342
/*
 * Log the replacement of root node @old_root by @new_root. When @log_removal
 * is true and the old root is not a leaf, removal entries for all of its key
 * pointers are logged as well, like for a freed node.
 *
 * Returns 0 on success (or when logging is not needed) and a negative errno
 * on failure.
 */
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* The root replace entry is keyed on the *new* root's address. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	/*
	 * NOTE(review): if tree_mod_log_free_eb() succeeded but the final
	 * insert of @tm failed, tm_list entries would be freed here without
	 * being erased from the rb-tree. That insert looks unfailable in
	 * practice (each entry gets a fresh sequence number, so -EEXIST
	 * should be impossible) — confirm before relying on this path.
	 */
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
412
/*
 * Search the log for an entry of block @start, considering only entries with
 * a sequence number >= @min_seq. When @smallest is true the match with the
 * lowest such sequence is returned, otherwise the one with the highest.
 * Returns NULL if there is no matching entry.
 */
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			/*
			 * Too old; insertion linked higher seqs to the left,
			 * so look there for newer entries.
			 */
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact match for @min_seq, can't do better. */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
454
/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 *
 * Thin wrapper around __tree_mod_log_search() with smallest == true.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}
465
/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 *
 * Thin wrapper around __tree_mod_log_search() with smallest == false.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}
476
/*
 * Record the copy of @nr_items key pointers from @src (starting at
 * @src_offset) into @dst (starting at @dst_offset): one KEY_REMOVE entry
 * per source slot and one KEY_ADD entry per destination slot.
 *
 * Returns 0 on success (or when logging is not needed) and a negative errno
 * on failure.
 */
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Copies between two leaves are not tracked. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One array: first half holds the adds, second half the removes. */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = true;

	/* Log each slot as a remove from @src followed by an add to @dst. */
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Erase anything already linked into the log, then free it all. */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
549
/*
 * Record the freeing of node @eb by logging a removal entry for every key
 * pointer it holds.
 *
 * Returns 0 on success (or when logging is not needed) and a negative errno
 * on failure.
 */
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* Allocate one removal element per slot before taking the lock. */
	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/*
	 * tree_mod_log_free_eb() already erased any partially inserted
	 * entries from the rb-tree, so only the elements need freeing here.
	 */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
592
/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 *
 * Walks the chain of ROOT_REPLACE entries backwards: each such entry is
 * keyed on the *new* root's address and records its predecessor in
 * old_root, so following old_root.logical steps back one replacement at a
 * time. Returns NULL when there is nothing to rewind.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	/* A zero time_seq means no rewind is requested. */
	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		/* No log entry at all for the current root: nothing to do. */
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operation for the oldest root, we simply
		 * return it. This should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		/* Step back to the previous root and keep looking. */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
646
647
/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 *
 * The walk applies the *inverse* of each logged operation to @eb, following
 * rb_next() while the entries still belong to the same block and are not
 * older than @time_seq, then fixes up the buffer's item count.
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			/* Undo a removal: restore the pointer, grow the count. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			/* Undo a replace: restore the previous pointer in place. */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			/* Undo a move: copy the items back from dst to src. */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* Stop once the entries belong to a different block. */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
724
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 *
 * Returns NULL on allocation failure (with the input buffer released).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* No rewind requested. */
	if (!time_seq)
		return eb;

	/* Leaves are never logged, nothing to rewind. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	/* No logged operation recent enough means the buffer is current. */
	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was fully freed; rebuild it from scratch on a
		 * dummy buffer instead of cloning the (emptied) original.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Rewind on a private copy, never on the live buffer. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
784
/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	/* No logged changes since @time_seq: the current root is the answer. */
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		/*
		 * The root node itself was replaced since @time_seq; rewind
		 * starts from the logged old root, not the current one.
		 */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The old root block still exists on disk and has further mod
		 * log entries to replay: read it and work on a clone. The
		 * current root's lock is dropped first since we no longer
		 * need it.
		 */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, root->root_key.objectid,
				      0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
			/* eb stays NULL; we return NULL below. */
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			/* Defensive runtime check mirroring the asserts above. */
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		/*
		 * Root was replaced but there are no further mod log entries
		 * to replay for the old block: rebuild its header from the
		 * logged old_root data on a dummy buffer.
		 */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* Root unchanged; rewind operates on a clone of it. */
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		/*
		 * The dummy buffer starts blank; reconstruct the header fields
		 * of the replaced root from the mod log record.
		 */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	/* Returned buffer is handed back read locked, per the contract above. */
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
889
890 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
891 {
892         struct tree_mod_elem *tm;
893         int level;
894         struct extent_buffer *eb_root = btrfs_root_node(root);
895
896         tm = tree_mod_log_oldest_root(eb_root, time_seq);
897         if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
898                 level = tm->old_root.level;
899         else
900                 level = btrfs_header_level(eb_root);
901
902         free_extent_buffer(eb_root);
903
904         return level;
905 }
906
907 /*
908  * Return the lowest sequence number in the tree modification log.
909  *
910  * Return the sequence number of the oldest tree modification log user, which
911  * corresponds to the lowest sequence number of all existing users. If there are
912  * no users it returns 0.
913  */
914 u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
915 {
916         u64 ret = 0;
917
918         read_lock(&fs_info->tree_mod_log_lock);
919         if (!list_empty(&fs_info->tree_mod_seq_list)) {
920                 struct btrfs_seq_list *elem;
921
922                 elem = list_first_entry(&fs_info->tree_mod_seq_list,
923                                         struct btrfs_seq_list, list);
924                 ret = elem->seq;
925         }
926         read_unlock(&fs_info->tree_mod_log_lock);
927
928         return ret;
929 }