// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->num_bytes;
}

/*
 * Returns NULL if the insertion worked, or the node that was already in the
 * tree at that offset.
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * Look for a given offset in the tree, and if it can't be found return the
 * closest slot in the tree (through @prev_ret).
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

/*
 * Helper to check if a given offset is inside a given entry.
 */
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}

static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}
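
/*
 * Illustrative sketch only (hypothetical helper, never called): how the two
 * predicates above behave for a fabricated entry covering the file range
 * [16384, 24576).  Both treat the entry as a half-open interval.
 */
static void __maybe_unused ordered_interval_predicates_example(void)
{
	struct btrfs_ordered_extent entry = {
		.file_offset = 16384,	/* 16K */
		.num_bytes = 8192,	/* entry covers [16K, 24K) */
	};

	/* An offset is inside the entry iff it falls in [16K, 24K). */
	ASSERT(offset_in_entry(&entry, 16384));
	ASSERT(offset_in_entry(&entry, 24575));
	ASSERT(!offset_in_entry(&entry, 24576));

	/* A range overlaps iff it shares at least one byte with [16K, 24K). */
	ASSERT(range_overlaps(&entry, 8192, 8193));	/* reaches byte 16K */
	ASSERT(!range_overlaps(&entry, 8192, 8192));	/* ends at 16K */
}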

/*
 * Find the first ordered struct that contains this offset, otherwise
 * the first one less than this offset.
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
{
	struct rb_root *root = &tree->tree;
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		tree->last = ret;
	return ret;
}

/*
 * Allocate and add a new ordered_extent into the per-inode tree.
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int type, int dio,
				      int compress_type)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
	int ret;

	if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
		/* For nocow write, we can release the qgroup rsv right now */
		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
		if (ret < 0)
			return ret;
		ret = 0;
	} else {
		/*
		 * The ordered extent has reserved qgroup space, release now
		 * and pass the reserved number for qgroup_record to free.
		 */
		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
		if (ret < 0)
			return ret;
	}
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry)
		return -ENOMEM;

	entry->file_offset = file_offset;
	entry->disk_bytenr = disk_bytenr;
	entry->num_bytes = num_bytes;
	entry->disk_num_bytes = disk_num_bytes;
	entry->bytes_left = num_bytes;
	entry->inode = igrab(&inode->vfs_inode);
	entry->compress_type = compress_type;
	entry->truncated_len = (u64)-1;
	entry->qgroup_rsv = ret;

	ASSERT(type == BTRFS_ORDERED_REGULAR ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_PREALLOC ||
	       type == BTRFS_ORDERED_COMPRESSED);
	set_bit(type, &entry->flags);

	if (dio) {
		percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
					 fs_info->delalloc_batch);
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
	}

	/* one ref for the tree */
	refcount_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->log_list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);

	trace_btrfs_ordered_extent_add(inode, entry);

	spin_lock_irq(&tree->lock);
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
				"inconsistency in ordered tree at offset %llu",
				file_offset);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);

	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);

	return 0;
}

int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
			     u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
			     int type)
{
	ASSERT(type == BTRFS_ORDERED_REGULAR ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_PREALLOC);
	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, 0,
					  BTRFS_COMPRESS_NONE);
}
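
/*
 * Illustrative sketch only (hypothetical helper, not part of btrfs): a
 * caller that has just allocated a data extent for a plain COW write would
 * record it roughly like this.  All offsets and lengths are made up for the
 * example.
 */
static int __maybe_unused example_record_cow_write(struct btrfs_inode *inode)
{
	const u64 file_offset = 0;		/* hypothetical values */
	const u64 disk_bytenr = 1024 * 1024;
	const u64 num_bytes = 65536;

	/*
	 * For an uncompressed COW write the logical and on-disk sizes match
	 * and the type is BTRFS_ORDERED_REGULAR.  On success the per-inode
	 * tree holds one reference on the new ordered extent.
	 */
	return btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					num_bytes, num_bytes,
					BTRFS_ORDERED_REGULAR);
}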

int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
				 u64 disk_bytenr, u64 num_bytes,
				 u64 disk_num_bytes, int type)
{
	ASSERT(type == BTRFS_ORDERED_REGULAR ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_PREALLOC);
	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int compress_type)
{
	ASSERT(compress_type != BTRFS_COMPRESS_NONE);
	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes,
					  BTRFS_ORDERED_COMPRESSED, 0,
					  compress_type);
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiple ordered extents.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_ordered_inode_tree *tree;

	tree = &BTRFS_I(entry->inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	list_add_tail(&sum->list, &entry->list);
	spin_unlock_irq(&tree->lock);
}

/*
 * Finish IO for one ordered extent across a given range.  The range can
 * contain several ordered extents.
 *
 * @finished_ret: Return the finished ordered extent
 * @file_offset:  File offset for the finished IO
 *                Will also be updated to one byte past the range that is
 *                recorded as finished. This allows the caller to walk forward.
 * @io_size:      Length of the finished IO range
 * @uptodate:     If the IO finished without problem
 *
 * Return true if any ordered extent is finished in the range, and update
 * @finished_ret and @file_offset.
 * Return false otherwise.
 *
 * NOTE: Although the range can cross multiple ordered extents, only one
 * ordered extent will be updated during one call. The caller is responsible
 * for iterating all ordered extents in the range.
 */
bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
				   struct btrfs_ordered_extent **finished_ret,
				   u64 *file_offset, u64 io_size, int uptodate)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	bool finished = false;
	unsigned long flags;
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	spin_lock_irqsave(&tree->lock, flags);
	node = tree_search(tree, *file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset))
		goto out;

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size,
		      entry->file_offset + entry->num_bytes);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
			   dec_start, dec_end);
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
		btrfs_crit(fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, to_dec);
	}
	entry->bytes_left -= to_dec;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		/*
		 * Ensure only one caller can set the flag and finished_ret
		 * accordingly
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	}
out:
	if (finished && finished_ret && entry) {
		*finished_ret = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return finished;
}
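
/*
 * Illustrative sketch only: per the NOTE above, one call updates at most one
 * ordered extent, so a completion handler covering a multi-extent range has
 * to walk forward.  This hypothetical helper is loosely modeled on how the
 * write-endio callers drive the function; note the @last guard, which stops
 * the loop when no ordered extent covers the current offset.
 */
static void __maybe_unused example_finish_range(struct btrfs_inode *inode,
						u64 offset, u64 bytes,
						int uptodate)
{
	u64 cur = offset;

	while (cur < offset + bytes) {
		struct btrfs_ordered_extent *ordered = NULL;
		u64 last = cur;

		/* On success @cur is advanced past the finished part. */
		if (btrfs_dec_test_first_ordered_pending(inode, &ordered, &cur,
							 offset + bytes - cur,
							 uptodate)) {
			/* A fully finished extent comes back with a ref. */
			btrfs_put_ordered_extent(ordered);
		}
		/* No ordered extent covered @cur: nothing left to finish. */
		if (cur == last)
			break;
	}
}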

/*
 * Finish IO for one ordered extent across a given range.  The range can only
 * contain one ordered extent.
 *
 * @cached:      The cached ordered extent. If not NULL, we can skip the tree
 *               search and use the ordered extent directly.
 *               Will also be used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:     Length of the finished IO range
 * @uptodate:    If the IO finished without problem
 *
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus the caller should ensure the range doesn't cross ordered extents.
 */
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
				    u64 file_offset, u64 io_size, int uptodate)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	bool finished = false;

	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!offset_in_entry(entry, file_offset))
		goto out;

	if (io_size > entry->bytes_left)
		btrfs_crit(inode->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, io_size);

	entry->bytes_left -= io_size;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		/*
		 * Ensure only one caller can set the flag and finished_ret
		 * accordingly
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	}
out:
	if (finished && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return finished;
}
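
/*
 * Illustrative sketch only: finishing IO for a range known to lie inside a
 * single ordered extent.  Passing a non-NULL @cached lets repeated calls for
 * the same extent skip the tree search.  Hypothetical helper.
 */
static void __maybe_unused example_finish_single(struct btrfs_inode *inode,
						 u64 file_offset, u64 io_size)
{
	struct btrfs_ordered_extent *cached = NULL;

	if (btrfs_dec_test_ordered_pending(inode, &cached, file_offset,
					   io_size, 1)) {
		/* The finished extent was stored in @cached with a ref. */
		btrfs_put_ordered_extent(cached);
	}
}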

/*
 * Drop a reference on an ordered extent.  This will free the extent if the
 * last reference is dropped.
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);

	if (refcount_dec_and_test(&entry->refs)) {
		ASSERT(list_empty(&entry->root_extent_list));
		ASSERT(list_empty(&entry->log_list));
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
		while (!list_empty(&entry->list)) {
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kvfree(sum);
		}
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}
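
/*
 * Illustrative sketch only: the usual lookup/put pairing.  Every lookup
 * helper in this file that returns an entry takes a reference, which the
 * caller must drop with btrfs_put_ordered_extent().  Hypothetical helper.
 */
static bool __maybe_unused example_offset_has_ordered(struct btrfs_inode *inode,
						      u64 file_offset)
{
	struct btrfs_ordered_extent *ordered;

	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
	if (!ordered)
		return false;

	btrfs_put_ordered_extent(ordered);	/* drop the lookup's ref */
	return true;
}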

/*
 * Remove an ordered extent from the tree.  No references are dropped, but
 * any waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
				 struct btrfs_ordered_extent *entry)
{
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_root *root = btrfs_inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;
	bool pending;

	/* This is paired with btrfs_add_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root)
		btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
						false);

	if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
					 fs_info->delalloc_batch);

	tree = &btrfs_inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = &entry->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
	spin_unlock_irq(&tree->lock);

	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (pending) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans);
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

	spin_lock(&root->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	root->nr_ordered_extents--;

	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);

	if (!root->nr_ordered_extents) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
	wake_up(&entry->wait);
}

static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered, 1);
	complete(&ordered->completion);
}

/*
 * Wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const u64 range_start, const u64 range_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
	struct btrfs_ordered_extent *ordered, *next;
	u64 count = 0;
	const u64 range_end = range_start + range_len;

	mutex_lock(&root->ordered_extent_mutex);
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
	while (!list_empty(&splice) && nr) {
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);

		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		refcount_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);

		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL, NULL);
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

		cond_resched();
		spin_lock(&root->ordered_extent_lock);
		if (nr != U64_MAX)
			nr--;
		count++;
	}
	list_splice_tail(&skipped, &root->ordered_extents);
	list_splice_tail(&splice, &root->ordered_extents);
	spin_unlock(&root->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
	mutex_unlock(&root->ordered_extent_mutex);

	return count;
}

void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			      const u64 range_start, const u64 range_len)
{
	struct btrfs_root *root;
	struct list_head splice;
	u64 done;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr,
						  range_start, range_len);
		btrfs_put_root(root);

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX)
			nr -= done;
	}
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}
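
/*
 * Illustrative sketch only: waiting for every ordered extent on the
 * filesystem, the way callers do before operations that must not race with
 * in-flight ordered IO.  U64_MAX for @nr means "no limit on the number of
 * extents", and a range of [0, (u64)-1) covers everything.
 */
static void __maybe_unused example_flush_all_ordered(struct btrfs_fs_info *fs_info)
{
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}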

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If @wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the IO completion code to insert metadata
 * into the btree corresponding to the extent.
 */
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->num_bytes - 1;
	struct btrfs_inode *inode = BTRFS_I(entry->inode);

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * Pages in the range can be dirty, clean or writeback.  We start IO
	 * on any dirty ones so the wait doesn't stall waiting for the
	 * flusher thread to find them.
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
	if (wait) {
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
	}
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
	int ret = 0;
	int ret_wb = 0;
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = INT_LIMIT(loff_t);
	} else {
		orig_end = start + len - 1;
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
	}

	/*
	 * Start IO across the range first to instantiate any delalloc
	 * extents.
	 */
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;

	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->num_bytes <= start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(ordered, 1);
		end = ordered->file_offset;
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
		btrfs_put_ordered_extent(ordered);
		if (end == 0 || end == start)
			break;
		end--;
	}
	return ret_wb ? ret_wb : ret;
}
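
/*
 * Illustrative sketch only: an fsync-style "flush and wait for the whole
 * file" call.  A length that overflows the start offset is clamped
 * internally, so (u64)-1 effectively means "to the end of the file".
 * Hypothetical helper.
 */
static int __maybe_unused example_wait_whole_file(struct inode *inode)
{
	return btrfs_wait_ordered_range(inode, 0, (u64)-1);
}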

/*
 * Find an ordered extent corresponding to @file_offset.  NULL is returned if
 * nothing is found, otherwise take a reference on the extent and return it.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * Since the DIO code tries to lock a wide area we need to look for any
 * ordered extents that exist in the range, rather than just the start of
 * the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		refcount_inc(&entry->refs);
	spin_unlock_irq(&tree->lock);
	return entry;
}
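
/*
 * Illustrative sketch only: before a direct IO locks [start, start + len),
 * it must check that no ordered extent overlaps any part of the range, not
 * just its first byte.  Hypothetical helper.
 */
static bool __maybe_unused example_range_is_clean(struct btrfs_inode *inode,
						  u64 start, u64 len)
{
	struct btrfs_ordered_extent *ordered;

	ordered = btrfs_lookup_ordered_range(inode, start, len);
	if (!ordered)
		return true;

	btrfs_put_ordered_extent(ordered);	/* drop the lookup's ref */
	return false;
}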

/*
 * Adds all ordered extents to the given list. The list ends up sorted by the
 * file_offset of the ordered extents.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *n;

	ASSERT(inode_is_locked(&inode->vfs_inode));

	spin_lock_irq(&tree->lock);
	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
		struct btrfs_ordered_extent *ordered;

		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);

		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
			continue;

		ASSERT(list_empty(&ordered->log_list));
		list_add_tail(&ordered->log_list, list);
		refcount_inc(&ordered->refs);
	}
	spin_unlock_irq(&tree->lock);
}

/*
 * Lookup and return any extent before @file_offset.  NULL is returned if
 * none is found.
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all
 * pending ordered extents in it are run to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 *                locked range. It's the caller's responsibility to free the
 *                cached state.
 *
 * This function always returns with the given range locked, ensuring that
 * after it's called no ordered extent can be pending.
 */
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;

	if (cached_state)
		cachedp = cached_state;

	while (1) {
		lock_extent_bits(&inode->io_tree, start, end, cachedp);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
				refcount_dec(&cache->refs);
			break;
		}
		unlock_extent_cached(&inode->io_tree, start, end, cachedp);
		btrfs_start_ordered_extent(ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}
}
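
/*
 * Illustrative sketch only: a read path might lock and flush a page-sized
 * range like this; the range comes back locked with no ordered extent
 * pending inside it.  Hypothetical helper and range.
 */
static void __maybe_unused example_lock_flush_page(struct btrfs_inode *inode,
						   u64 start)
{
	const u64 end = start + PAGE_SIZE - 1;
	struct extent_state *cached = NULL;

	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached);

	/* ... the caller would do its work on the locked range, then: */
	unlock_extent_cached(&inode->io_tree, start, end, &cached);
}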

int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
				     SLAB_MEM_SPREAD,
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}