// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include <trace/events/btrfs.h>
#include "messages.h"
#include "ctree.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"
#include "misc.h"

static struct kmem_cache *extent_state_cache;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);

static inline void btrfs_leak_debug_add_state(struct extent_state *state)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline void btrfs_leak_debug_del_state(struct extent_state *state)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(&state->leak_list);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
						       struct extent_io_tree *tree,
						       u64 start, u64 end)
{
	struct btrfs_inode *inode = tree->inode;
	u64 isize;

	if (!inode)
		return;

	isize = i_size_read(&inode->vfs_inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(inode->root->fs_info,
			       "%s: ino %llu isize %llu odd range [%llu,%llu]",
			       caller, btrfs_ino(inode), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add_state(state)		do {} while (0)
#define btrfs_leak_debug_del_state(state)		do {} while (0)
#define btrfs_extent_state_leak_debug_check()		do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

/*
 * For the file_extent_tree, we want to hold the inode lock when we look up and
 * update the disk_i_size, but lockdep will complain because with our other
 * io_trees we hold the tree lock and then take the inode lock when setting
 * delalloc. These two things are unrelated, so make a class for the
 * file_extent_tree so we don't get the two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	spin_lock_init(&tree->lock);
	tree->inode = NULL;
	tree->owner = owner;
	if (owner == IO_TREE_INODE_FILE_EXTENT)
		lockdep_set_class(&tree->lock, &file_extent_tree_class);
}

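/*
 * Illustrative sketch (editor's addition, not upstream code): a typical
 * caller wires up an inode's IO tree right after allocating the inode. The
 * io_tree and vfs_inode field names follow btrfs_inode as used elsewhere in
 * this file; treat this as an assumption-laden example.
 *
 *	extent_io_tree_init(fs_info, &inode->io_tree, IO_TREE_INODE_IO);
 *	inode->io_tree.inode = inode;
 */
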
/*
 * Empty an io tree, removing and freeing every extent state record from the
 * tree. This should be called once we are sure no other task can access the
 * tree anymore, so no tree updates happen after we empty the tree and there
 * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
 * set on any extent state when calling this function).
 */
void extent_io_tree_release(struct extent_io_tree *tree)
{
	struct rb_root root;
	struct extent_state *state;
	struct extent_state *tmp;

	spin_lock(&tree->lock);
	root = tree->state;
	tree->state = RB_ROOT;
	rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
		/* Clear node to keep free_extent_state() happy. */
		RB_CLEAR_NODE(&state->rb_node);
		ASSERT(!(state->state & EXTENT_LOCKED));
		/*
		 * No need for a memory barrier here, as we are holding the tree
		 * lock and we only change the waitqueue while holding that lock
		 * (see wait_extent_bit()).
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);
		cond_resched_lock(&tree->lock);
	}
	/*
	 * Should still be empty even after a reschedule, no other task should
	 * be accessing the tree anymore.
	 */
	ASSERT(RB_EMPTY_ROOT(&tree->state));
	spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might not be appropriate for the slab allocator,
	 * drop the unsupported bits.
	 */
	mask &= ~(__GFP_DMA32 | __GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add_state(state);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del_state(state);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

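/*
 * Editor's note, with a small sketch of the refcount protocol the helpers
 * above implement: a state is born with one reference from
 * alloc_extent_state(), and every cache or waiter takes an extra one.
 *
 *	struct extent_state *state = alloc_extent_state(GFP_NOFS);
 *
 *	if (state) {
 *		refcount_inc(&state->refs);	// e.g. a cached_state holds it
 *		free_extent_state(state);	// drops to 1, nothing freed
 *		free_extent_state(state);	// drops to 0, slab object freed
 *	}
 */
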
static int add_extent_changeset(struct extent_state *state, u32 bits,
				struct extent_changeset *changeset,
				int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

static inline struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);

	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

static inline struct extent_state *prev_state(struct extent_state *state)
{
	struct rb_node *prev = rb_prev(&state->rb_node);

	if (prev)
		return rb_entry(prev, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * Search @tree for an entry that contains @offset. Such entry would have
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @node_ret:   pointer where the new node should be anchored (used when
 *              inserting an entry in the tree)
 * @parent_ret: points to the entry which would have been the parent of the
 *              entry, containing @offset
 *
 * Return a pointer to the entry that contains @offset byte address and don't
 * change @node_ret and @parent_ret.
 *
 * If no such entry exists, return a pointer to the entry that ends before
 * @offset and fill parameters @node_ret and @parent_ret, ie. does not return
 * NULL.
 */
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
							  u64 offset,
							  struct rb_node ***node_ret,
							  struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **node = &root->rb_node;
	struct rb_node *prev = NULL;
	struct extent_state *entry = NULL;

	while (*node) {
		prev = *node;
		entry = rb_entry(prev, struct extent_state, rb_node);

		if (offset < entry->start)
			node = &(*node)->rb_left;
		else if (offset > entry->end)
			node = &(*node)->rb_right;
		else
			return entry;
	}

	if (node_ret)
		*node_ret = node;
	if (parent_ret)
		*parent_ret = prev;

	/* Search neighbors until we find the first one past the end. */
	while (entry && offset > entry->end)
		entry = next_state(entry);

	return entry;
}

/*
 * Search offset in the tree or fill neighbor rbtree node pointers.
 *
 * @tree:     the tree to search
 * @offset:   offset that should fall within an entry in @tree
 * @next_ret: pointer to the first entry whose range ends after @offset
 * @prev_ret: pointer to the first entry whose range begins before @offset
 *
 * Return a pointer to the entry that contains @offset byte address. If no
 * such entry exists, then return NULL and fill @prev_ret and @next_ret.
 * Otherwise return the found entry and other pointers are left untouched.
 */
static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
						  u64 offset,
						  struct extent_state **prev_ret,
						  struct extent_state **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **node = &root->rb_node;
	struct extent_state *orig_prev;
	struct extent_state *entry = NULL;

	ASSERT(prev_ret);
	ASSERT(next_ret);

	while (*node) {
		entry = rb_entry(*node, struct extent_state, rb_node);

		if (offset < entry->start)
			node = &(*node)->rb_left;
		else if (offset > entry->end)
			node = &(*node)->rb_right;
		else
			return entry;
	}

	orig_prev = entry;
	while (entry && offset > entry->end)
		entry = next_state(entry);
	*next_ret = entry;
	entry = orig_prev;

	while (entry && offset < entry->start)
		entry = prev_state(entry);
	*prev_ret = entry;

	return NULL;
}

/*
 * Inexact rb-tree search, return the next entry if @offset is not found.
 */
static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

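/*
 * Example of the search semantics above (editor's sketch, hypothetical
 * offsets): with states [0,4095] and [8192,12287] in the tree,
 * tree_search(tree, 4096) skips the hole and returns [8192,12287],
 * tree_search(tree, 9000) returns [8192,12287] directly, and
 * tree_search(tree, 20000) returns NULL.
 */
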
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree->fs_info, err,
		    "locking error: extent tree was modified by another thread while locked");
}

static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
{
	struct extent_state *prev;

	prev = prev_state(state);
	if (prev && prev->end == state->start - 1 && prev->state == state->state) {
		if (tree->inode)
			btrfs_merge_delalloc_extent(tree->inode, state, prev);
		state->start = prev->start;
		rb_erase(&prev->rb_node, &tree->state);
		RB_CLEAR_NODE(&prev->rb_node);
		free_extent_state(prev);
	}
}

static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
{
	struct extent_state *next;

	next = next_state(state);
	if (next && next->start == state->end + 1 && next->state == state->state) {
		if (tree->inode)
			btrfs_merge_delalloc_extent(tree->inode, state, next);
		state->end = next->end;
		rb_erase(&next->rb_node, &tree->state);
		RB_CLEAR_NODE(&next->rb_node);
		free_extent_state(next);
	}
}

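/*
 * Worked example for the two merge helpers above (editor's addition,
 * hypothetical ranges): given [0,4095] and [4096,8191] with identical state
 * bits, merge_prev_state() on the second record erases the first and leaves
 * a single [0,8191] record; merge_next_state() is the mirror case. Records
 * whose bits differ, or that are not byte-adjacent, are left untouched.
 */
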
/*
 * Utility function to look for merge candidates inside a given range. Any
 * extents with matching state are merged together into a single extent in the
 * tree. Extents with EXTENT_LOCKED or EXTENT_BOUNDARY set are not merged
 * because the end_io handlers need to be able to do operations on them
 * without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	merge_prev_state(tree, state);
	merge_next_state(tree, state);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   u32 bits, struct extent_changeset *changeset)
{
	u32 bits_to_set = bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->inode)
		btrfs_set_delalloc_extent(tree->inode, state, bits);

	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

/*
 * Insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * Returns a pointer to the struct extent_state record containing the range
 * requested for insertion, which may be the same as the given struct or it
 * may be an existing record in the tree that was expanded to accommodate the
 * requested range. In case of an extent_state different from the one that was
 * given, the latter can be freed or reused by the caller.
 *
 * On error it returns an error pointer.
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static struct extent_state *insert_state(struct extent_io_tree *tree,
					 struct extent_state *state,
					 u32 bits,
					 struct extent_changeset *changeset)
{
	struct rb_node **node;
	struct rb_node *parent = NULL;
	const u64 start = state->start - 1;
	const u64 end = state->end + 1;
	const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));

	set_state_bits(tree, state, bits, changeset);

	node = &tree->state.rb_node;
	while (*node) {
		struct extent_state *entry;

		parent = *node;
		entry = rb_entry(parent, struct extent_state, rb_node);

		if (state->end < entry->start) {
			if (try_merge && end == entry->start &&
			    state->state == entry->state) {
				if (tree->inode)
					btrfs_merge_delalloc_extent(tree->inode,
								    state, entry);
				entry->start = state->start;
				merge_prev_state(tree, entry);
				state->state = 0;
				return entry;
			}
			node = &(*node)->rb_left;
		} else if (state->end > entry->end) {
			if (try_merge && entry->end == start &&
			    state->state == entry->state) {
				if (tree->inode)
					btrfs_merge_delalloc_extent(tree->inode,
								    state, entry);
				entry->end = state->end;
				merge_next_state(tree, entry);
				state->state = 0;
				return entry;
			}
			node = &(*node)->rb_right;
		} else {
			btrfs_err(tree->fs_info,
				  "found node %llu %llu on insert of %llu %llu",
				  entry->start, entry->end, state->start, state->end);
			return ERR_PTR(-EEXIST);
		}
	}

	rb_link_node(&state->rb_node, parent, node);
	rb_insert_color(&state->rb_node, &tree->state);

	return state;
}

/*
 * Insert state to @tree to the location given by @node and @parent.
 */
static void insert_state_fast(struct extent_io_tree *tree,
			      struct extent_state *state, struct rb_node **node,
			      struct rb_node *parent, unsigned bits,
			      struct extent_changeset *changeset)
{
	set_state_bits(tree, state, bits, changeset);
	rb_link_node(&state->rb_node, parent, node);
	rb_insert_color(&state->rb_node, &tree->state);
	merge_state(tree, state);
}

/*
 * Split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half. 'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end]. After
 * calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig:     [split, orig->end]
 *
 * The tree locks are not taken by this function. They need to be held by the
 * caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *parent = NULL;
	struct rb_node **node;

	if (tree->inode)
		btrfs_split_delalloc_extent(tree->inode, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	parent = &orig->rb_node;
	node = &parent;
	while (*node) {
		struct extent_state *entry;

		parent = *node;
		entry = rb_entry(parent, struct extent_state, rb_node);

		if (prealloc->end < entry->start) {
			node = &(*node)->rb_left;
		} else if (prealloc->end > entry->end) {
			node = &(*node)->rb_right;
		} else {
			free_extent_state(prealloc);
			return -EEXIST;
		}
	}

	rb_link_node(&prealloc->rb_node, parent, node);
	rb_insert_color(&prealloc->rb_node, &tree->state);

	return 0;
}

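/*
 * Worked example (editor's addition, hypothetical offsets): splitting
 * orig = [0,8191] at split = 4096 leaves the tree with
 *	prealloc: [0,4095]	(inherits orig's bits)
 *	orig:     [4096,8191]
 * The descent starts at orig's node rather than the rbtree root, which is
 * why the helper seeds 'parent' with &orig->rb_node before walking down.
 */
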
/*
 * Utility function to clear some bits in an extent state struct. It will
 * optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree.
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    u32 bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->inode)
		btrfs_clear_delalloc_extent(tree->inode, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

/*
 * Detect if extent bits request NOWAIT semantics, set the gfp mask
 * accordingly and unset the EXTENT_NOWAIT bit.
 */
static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
{
	*mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS);
	*bits &= EXTENT_NOWAIT - 1;
}

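/*
 * Illustrative use (editor's addition): callers fold the allocation policy
 * into the bits argument and this helper peels it off again, e.g.:
 *
 *	u32 bits = EXTENT_DELALLOC | EXTENT_NOWAIT;
 *	gfp_t mask;
 *
 *	set_gfp_mask_from_bits(&bits, &mask);
 *	// now bits == EXTENT_DELALLOC and mask == GFP_NOWAIT
 *
 * Masking with EXTENT_NOWAIT - 1 clears that flag (and anything above it),
 * which works because EXTENT_NOWAIT is the highest extent bit.
 */
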
/*
 * Clear some bits on a range in the tree. This may require splitting or
 * inserting elements in the tree, so the gfp mask is used to indicate which
 * allocations or sleeping are allowed.
 *
 * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
 * range from the tree regardless of state (ie for truncate).
 *
 * The range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, struct extent_state **cached_state,
		       struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	u64 last_end;
	int err;
	int clear = 0;
	int wake;
	int delete = (bits & EXTENT_CLEAR_ALL_BITS);
	gfp_t mask;

	set_gfp_mask_from_bits(&bits, &mask);
	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	wake = (bits & EXTENT_LOCKED) ? 1 : 0;
	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}

	/* This search will find the extents that end after our range starts. */
	state = tree_search(tree, start);
	if (!state)
		goto out;
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* The state doesn't have the wanted bits, go ahead. */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state | or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on second
	 * half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again. It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we clear the desired bit
	 * on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, bits, wake, changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and clear the bit on the first half.
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;
}

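/*
 * Typical call pattern (editor's sketch, assuming the clear_extent_bit()
 * wrapper from extent-io-tree.h that passes a NULL changeset):
 *
 *	// Drop delalloc accounting for a page-sized range, with no
 *	// cached state.
 *	clear_extent_bit(&inode->io_tree, start, start + PAGE_SIZE - 1,
 *			 EXTENT_DELALLOC, NULL);
 */
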
/*
 * Wait for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function.
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    u32 bits, struct extent_state **cached_state)
{
	struct extent_state *state;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	/*
	 * Maintain cached_state, as we may not remove it from the tree if there
	 * are more bits than the bits we're waiting on set on this state.
	 */
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (extent_state_in_tree(state) &&
		    state->start <= start && start < state->end)
			goto process_node;
	}
	while (1) {
		/*
		 * This search will find all the extents that end after our
		 * range starts.
		 */
		state = tree_search(tree, start);
process_node:
		if (!state)
			break;
		if (state->start > end)
			goto out;

		if (state->state & bits) {
			DEFINE_WAIT(wait);

			start = state->start;
			refcount_inc(&state->refs);
			prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
			spin_unlock(&tree->lock);
			schedule();
			spin_lock(&tree->lock);
			finish_wait(&state->wq, &wait);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			state = next_state(state);
			goto process_node;
		}
	}
out:
	/* This state is no longer useful, clear it and free it up. */
	if (cached_state && *cached_state) {
		state = *cached_state;
		*cached_state = NULL;
		free_extent_state(state);
	}
	spin_unlock(&tree->lock);
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

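/*
 * Editor's note with a small sketch: cache_state() only caches states that
 * carry EXTENT_LOCKED or EXTENT_BOUNDARY, which are never merged away while
 * cached (see merge_state() above), so the cached pointer stays valid.
 * Callers that want to cache any state pass 0 as flags:
 *
 *	cache_state_if_flags(state, &cached, 0);	// cache unconditionally
 *	cache_state(state, &cached);			// only locked/boundary
 */
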
/*
 * Find the first state struct with 'bits' set after 'start', and return it.
 * tree->lock must be held. NULL will be returned if nothing was found after
 * 'start'.
 */
static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
							u64 start, u32 bits)
{
	struct extent_state *state;

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search(tree, start);
	while (state) {
		if (state->end >= start && (state->state & bits))
			return state;
		state = next_state(state);
	}
	return NULL;
}

/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * Return true if we find something, and update @start_ret and @end_ret.
 * Return false if we found nothing.
 */
bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			   u64 *start_ret, u64 *end_ret, u32 bits,
			   struct extent_state **cached_state)
{
	struct extent_state *state;
	bool ret = false;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					break;
			}
			/*
			 * If we found the next extent state, clear cached_state
			 * so that we can cache the next extent state below and
			 * avoid future calls going over the same extent state
			 * again. If we haven't found any, clear as well since
			 * it's now useless.
			 */
			free_extent_state(*cached_state);
			*cached_state = NULL;
			if (state)
				goto got_it;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = true;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

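/*
 * Illustrative iteration (editor's addition): walking every range with a
 * given bit set, threading the cached state through so each call resumes
 * where the previous one ended instead of re-searching from the root:
 *
 *	u64 cur = 0, found_start, found_end;
 *	struct extent_state *cached = NULL;
 *
 *	while (find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				     EXTENT_DIRTY, &cached)) {
 *		// process [found_start, found_end]
 *		cur = found_end + 1;
 *	}
 *	free_extent_state(cached);	// safe even if cached is NULL
 */
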
/*
 * Find a contiguous area of bits
 *
 * @tree:      io tree to check
 * @start:     offset to start the search from
 * @start_ret: the first offset we found with the bits set
 * @end_ret:   the final contiguous range of the bits that were set
 * @bits:      bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again. During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits. We will search to the first bit we find, and
 * then walk down the tree until we find a non-contiguous area. The area
 * returned will be the full contiguous area with the bits set.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
			       u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	state = find_first_extent_bit_state(tree, start, bits);
	if (state) {
		*start_ret = state->start;
		*end_ret = state->end;
		while ((state = next_state(state)) != NULL) {
			if (state->start > (*end_ret + 1))
				break;
			*end_ret = state->end;
		}
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * Find a contiguous range of bytes in the file marked as delalloc, not more
 * than 'max_bytes'. start and end are used to return the range.
 *
 * True is returned if we find something, false if nothing was in the tree.
 */
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
			       u64 *end, u64 max_bytes,
			       struct extent_state **cached_state)
{
	struct extent_state *state;
	u64 cur_start = *start;
	bool found = false;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search(tree, cur_start);
	if (!state) {
		*end = (u64)-1;
		goto out;
	}

	while (state) {
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			refcount_inc(&state->refs);
		}
		found = true;
		*end = state->end;
		cur_start = state->end + 1;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
		state = next_state(state);
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

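/*
 * Sketch of a caller (editor's addition, simplified and hypothetical): find
 * one delalloc run starting at @delalloc_start, capped at
 * BTRFS_MAX_EXTENT_SIZE worth of bytes, roughly what the delalloc writeback
 * path does.
 *
 *	u64 dstart = delalloc_start, dend = 0;
 *	struct extent_state *cached = NULL;
 *
 *	if (btrfs_find_delalloc_range(&inode->io_tree, &dstart, &dend,
 *				      BTRFS_MAX_EXTENT_SIZE, &cached)) {
 *		// [dstart, dend] is a contiguous delalloc run
 *	}
 *	free_extent_state(cached);
 */
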
/*
 * Set some bits on a range in the tree. This may require allocations or
 * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for
 * GFP_NOWAIT.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set. The extent_state of the
 * existing range is returned in failed_state in this case, and the start of the
 * existing range is returned in failed_start. failed_state is used as an
 * optimization for wait_extent_bit, failed_start must be used as the source of
 * truth as failed_state may have changed since we returned.
 *
 * The range [start, end] is inclusive. This takes the tree lock.
 */
static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    u32 bits, u64 *failed_start,
			    struct extent_state **failed_state,
			    struct extent_state **cached_state,
			    struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node **p = NULL;
	struct rb_node *parent = NULL;
	int err = 0;
	u64 last_start;
	u64 last_end;
	u32 exclusive_bits = (bits & EXTENT_LOCKED);
	gfp_t mask;

	set_gfp_mask_from_bits(&bits, &mask);
	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

	if (exclusive_bits)
		ASSERT(failed_start);
	else
		ASSERT(failed_start == NULL && failed_state == NULL);
again:
	if (!prealloc) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state))
			goto hit_next;
	}
	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search_for_insert(tree, start, &p, &parent);
	if (!state) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		prealloc->start = start;
		prealloc->end = end;
		insert_state_fast(tree, prealloc, p, parent, bits, changeset);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			cache_state(state, failed_state);
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on second
	 * half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again. It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we set the desired bit
	 * on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			cache_state(state, failed_state);
			err = -EEXIST;
			goto out;
		}

		/*
		 * If this extent already has all the bits we want set, then
		 * skip it, not necessary to split it or do anything with it.
		 */
		if ((state->state & bits) == bits) {
			start = state->end + 1;
			cache_state(state, cached_state);
			goto search_again;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and ignore the
	 * extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		struct extent_state *inserted_state;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;

		/*
		 * Avoid freeing 'prealloc' if it can be merged with the later
		 * extent.
		 */
		prealloc->start = start;
		prealloc->end = this_end;
		inserted_state = insert_state(tree, prealloc, bits, changeset);
		if (IS_ERR(inserted_state)) {
			err = PTR_ERR(inserted_state);
			extent_io_tree_panic(tree, err);
		}

		cache_state(inserted_state, cached_state);
		if (inserted_state == prealloc)
			prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and set the bit on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			cache_state(state, failed_state);
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   u32 bits, struct extent_state **cached_state)
{
	return __set_extent_bit(tree, start, end, bits, NULL, NULL,
				cached_state, NULL);
}

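/*
 * Example (editor's sketch): mark a range delalloc without blocking on
 * allocations by folding EXTENT_NOWAIT into the bits (see
 * set_gfp_mask_from_bits() above):
 *
 *	ret = set_extent_bit(&inode->io_tree, start, end,
 *			     EXTENT_DELALLOC | EXTENT_NOWAIT, &cached);
 */
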
/*
 * Convert all bits in a given range from one bit to another
 *
 * @tree:         the io tree to search
 * @start:        the start offset in bytes
 * @end:          the end offset in bytes (inclusive)
 * @bits:         the bits to set in this range
 * @clear_bits:   the bits to clear in this range
 * @cached_state: state that we're going to cache
 *
 * This will go through and set bits for the given range. If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits. This is only meant to be used by things that are mergeable, ie.
 * converting from say DELALLOC to DIRTY. This is not meant to be used with
 * boundary bits like EXTENT_LOCKED.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, u32 clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node **p = NULL;
	struct rb_node *parent = NULL;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state))
			goto hit_next;
	}

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search_for_insert(tree, start, &p, &parent);
	if (!state) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		prealloc->start = start;
		prealloc->end = end;
		insert_state_fast(tree, prealloc, p, parent, bits, NULL);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going.
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on second
	 * half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again. It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we set the desired bit
	 * on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, clear_bits, 0, NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and ignore the
	 * extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		struct extent_state *inserted_state;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid freeing 'prealloc' if it can be merged with the later
		 * extent.
		 */
		prealloc->start = start;
		prealloc->end = this_end;
		inserted_state = insert_state(tree, prealloc, bits, NULL);
		if (IS_ERR(inserted_state)) {
			err = PTR_ERR(inserted_state);
			extent_io_tree_panic(tree, err);
		}
		cache_state(inserted_state, cached_state);
		if (inserted_state == prealloc)
			prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and set the bit on the first half.
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

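/*
 * Example (editor's sketch, hypothetical bit choice for a mergeable
 * transition): atomically set one bit and clear another over the same
 * range, as done when moving a range between accounting states:
 *
 *	ret = convert_extent_bit(tree, start, end,
 *				 EXTENT_DIRTY,		// bits to set
 *				 EXTENT_NEW,		// bits to clear
 *				 &cached_state);
 */
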
/*
 * Find the first range that has @bits not set. This range could start before
 * @start.
 *
 * @tree:      the tree to search
 * @start:     offset at/after which the found extent should start
 * @start_ret: records the beginning of the range
 * @end_ret:   records the end of the range (inclusive)
 * @bits:      the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set it's possible that @end_ret contains -1, this happens in case the range
 * spans (last_range_end, end of device]. In this case it's up to the caller to
 * trim @end_ret to the appropriate size.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	struct extent_state *prev = NULL, *next = NULL;

	spin_lock(&tree->lock);

	/* Find first extent with bits cleared */
	while (1) {
		state = tree_search_prev_next(tree, start, &prev, &next);
		if (!state && !next && !prev) {
			/*
			 * Tree is completely empty, send full range and let
			 * caller deal with it
			 */
			*start_ret = 0;
			*end_ret = -1;
			goto out;
		} else if (!state && !next) {
			/*
			 * We are past the last allocated chunk, set start at
			 * the end of the last extent.
			 */
			*start_ret = prev->end + 1;
			*end_ret = -1;
			goto out;
		} else if (!state) {
			state = next;
		}

		/*
		 * At this point 'state' either contains 'start' or start is
		 * before 'state'
		 */
		if (in_range(start, state->start, state->end - state->start + 1)) {
			if (state->state & bits) {
				/*
				 * |--range with bits sets--|
				 *    |
				 *    start
				 */
				start = state->end + 1;
			} else {
				/*
				 * 'start' falls within a range that doesn't
				 * have the bits set, so take its start as the
				 * beginning of the desired range
				 *
				 * |--range with bits cleared----|
				 *      |
				 *      start
				 */
				*start_ret = state->start;
				break;
			}
		} else {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                        start
			 *
			 * or
			 *
			 * |---hole/unset--||--first node--|
			 *        0   |
			 *          start
			 */
			if (prev)
				*start_ret = prev->end + 1;
			else
				*start_ret = 0;
			break;
		}
	}

	/*
	 * Find the longest stretch from start until an entry which has the
	 * bits set
	 */
	while (state) {
		if (state->end >= start && !(state->state & bits)) {
			*end_ret = state->end;
		} else {
			*end_ret = state->start - 1;
			break;
		}
		state = next_state(state);
	}
out:
	spin_unlock(&tree->lock);
}

/*
 * Count the number of bytes in the tree that have a given bit(s) set for a
 * given range.
 *
 * @tree:         The io tree to search.
 * @start:        The start offset of the range. This value is updated to the
 *                offset of the first byte found with the given bit(s), so it
 *                can end up being bigger than the initial value.
 * @search_end:   The end offset (inclusive value) of the search range.
 * @max_bytes:    The maximum byte count we are interested in. The search stops
 *                once it reaches this count.
 * @bits:         The bits the range must have in order to be accounted for.
 *                If multiple bits are set, then only subranges that have all
 *                the bits set are accounted for.
 * @contig:       Indicate if we should ignore holes in the range or not. If
 *                this is true, then stop once we find a hole.
 * @cached_state: A cached state to be used across multiple calls to this
 *                function in order to speedup searches. Use NULL if this is
 *                called only once or if each call does not start where the
 *                previous one ended.
 *
 * Returns the total number of bytes found within the given range that have
 * all given bits set. If the returned number of bytes is greater than zero
 * then @start is updated with the offset of the first byte with the bits set.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     u32 bits, int contig,
		     struct extent_state **cached_state)
{
	struct extent_state *state = NULL;
	struct extent_state *cached;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (WARN_ON(search_end < cur_start))
		return 0;

	spin_lock(&tree->lock);

	if (!cached_state || !*cached_state)
		goto search;

	cached = *cached_state;

	if (!extent_state_in_tree(cached))
		goto search;

	if (cached->start <= cur_start && cur_start <= cached->end) {
		state = cached;
	} else if (cached->start > cur_start) {
		struct extent_state *prev;

		/*
		 * The cached state starts after our search range's start. Check
		 * if the previous state record starts at or before the range we
		 * are looking for, and if so, use it - this is a common case
		 * when there are holes between records in the tree. If there is
		 * no previous state record, we can start from our cached state.
		 */
		prev = prev_state(cached);
		if (!prev)
			state = cached;
		else if (prev->start <= cur_start && cur_start <= prev->end)
			state = prev;
	}

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
search:
	if (!state)
		state = tree_search(tree, cur_start);

	while (state) {
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		state = next_state(state);
	}

	if (cached_state) {
		free_extent_state(*cached_state);
		*cached_state = state;
		if (state)
			refcount_inc(&state->refs);
	}

	spin_unlock(&tree->lock);

	return total_bytes;
}

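/*
 * Example (editor's addition): count how many bytes of a file range are
 * still marked delalloc, stopping early at the first hole (contig == 1):
 *
 *	u64 range_start = start;
 *	u64 bytes = count_range_bits(&inode->io_tree, &range_start, end,
 *				     (u64)-1, EXTENT_DELALLOC, 1, NULL);
 *
 * On return, range_start points at the first delalloc byte (it may have
 * moved forward from @start) and bytes is the contiguous count.
 */
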
/*
 * Check if the single @bit exists in the given range.
 */
bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
{
	struct extent_state *state = NULL;
	bool bitset = false;

	ASSERT(is_power_of_2(bit));

	spin_lock(&tree->lock);
	state = tree_search(tree, start);
	while (state && start <= end) {
		if (state->start > end)
			break;

		if (state->state & bit) {
			bitset = true;
			break;
		}

		/* If state->end is (u64)-1, start will overflow to 0 */
		start = state->end + 1;
		if (start > end || start == 0)
			break;
		state = next_state(state);
	}
	spin_unlock(&tree->lock);
	return bitset;
}

/*
 * Check if the whole range [@start, @end] (inclusive) contains the single
 * @bit set.
 */
bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
		    struct extent_state *cached)
{
	struct extent_state *state = NULL;
	bool bitset = true;

	ASSERT(is_power_of_2(bit));

	spin_lock(&tree->lock);
	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
	    cached->end > start)
		state = cached;
	else
		state = tree_search(tree, start);
	while (state && start <= end) {
		if (state->start > start) {
			bitset = false;
			break;
		}

		if (state->start > end)
			break;

		if ((state->state & bit) == 0) {
			bitset = false;
			break;
		}

		if (state->end == (u64)-1)
			break;

		/*
		 * Last entry (if state->end is (u64)-1 and overflow happens),
		 * or next entry starts after the range.
		 */
		start = state->end + 1;
		if (start > end || start == 0)
			break;
		state = next_state(state);
	}

	/* We ran out of states and were still inside of our range. */
	if (!state)
		bitset = false;
	spin_unlock(&tree->lock);
	return bitset;
}

/* Wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    struct extent_state **cached)
{
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       NULL, cached, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached);
		return 0;
	}
	return 1;
}

/*
 * Either insert or lock the extent state struct between start and end,
 * waiting until any conflicting EXTENT_LOCKED range is released.
 */
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		struct extent_state **cached_state)
{
	struct extent_state *failed_state = NULL;
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       &failed_state, cached_state, NULL);
	while (err == -EEXIST) {
		if (failed_start != start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached_state);

		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
				&failed_state);
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				       &failed_start, &failed_state,
				       cached_state, NULL);
	}
	return err;
}

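/*
 * Canonical lock/unlock pairing (editor's sketch; unlock_extent() is assumed
 * to be the inline wrapper in extent-io-tree.h that clears EXTENT_LOCKED):
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent(&inode->io_tree, start, end, &cached);
 *	// ... the range is exclusively locked here ...
 *	unlock_extent(&inode->io_tree, start, end, &cached);
 */
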
void __cold extent_state_free_cachep(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

int __init extent_state_init_cachep(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
					       sizeof(struct extent_state), 0,
					       SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	return 0;
}