fs/btrfs/extent_io.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include <linux/bitops.h>
   4 #include <linux/slab.h>
   5 #include <linux/bio.h>
   6 #include <linux/mm.h>
   7 #include <linux/pagemap.h>
   8 #include <linux/page-flags.h>
   9 #include <linux/spinlock.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/swap.h>
  12 #include <linux/writeback.h>
  13 #include <linux/pagevec.h>
  14 #include <linux/prefetch.h>
  15 #include <linux/cleancache.h>
  16 #include "extent_io.h"
  17 #include "extent-io-tree.h"
  18 #include "extent_map.h"
  19 #include "ctree.h"
  20 #include "btrfs_inode.h"
  21 #include "volumes.h"
  22 #include "check-integrity.h"
  23 #include "locking.h"
  24 #include "rcu-string.h"
  25 #include "backref.h"
  26 #include "disk-io.h"
  27 #include "subpage.h"
  28
     /* Slab cache that alloc_extent_state()/free_extent_state() draw from. */
  29 static struct kmem_cache *extent_state_cache;
     /* Slab cache for struct extent_buffer; created in extent_io_init(). */
  30 static struct kmem_cache *extent_buffer_cache;
     /*
      * bio_set set up in extent_io_init(); its front padding is
      * offsetof(struct btrfs_io_bio, bio).
      */
  31 static struct bio_set btrfs_bioset;
  32
  33 static inline bool extent_state_in_tree(const struct extent_state *state)
  34 {
  35         return !RB_EMPTY_NODE(&state->rb_node);
  36 }
  37
  38 #ifdef CONFIG_BTRFS_DEBUG
     /*
      * Every extent_state handed out by alloc_extent_state() is linked here so
      * that btrfs_extent_state_leak_debug_check() can report the ones never freed.
      */
  39 static LIST_HEAD(states);
     /* Taken around insertions into and removals from the list above. */
  40 static DEFINE_SPINLOCK(leak_lock);
  41
  42 static inline void btrfs_leak_debug_add(spinlock_t *lock,
  43                                         struct list_head *new,
  44                                         struct list_head *head)
  45 {
  46         unsigned long flags;
  47
  48         spin_lock_irqsave(lock, flags);
  49         list_add(new, head);
  50         spin_unlock_irqrestore(lock, flags);
  51 }
  52
  53 static inline void btrfs_leak_debug_del(spinlock_t *lock,
  54                                         struct list_head *entry)
  55 {
  56         unsigned long flags;
  57
  58         spin_lock_irqsave(lock, flags);
  59         list_del(entry);
  60         spin_unlock_irqrestore(lock, flags);
  61 }
  62
  63 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
  64 {
  65         struct extent_buffer *eb;
  66         unsigned long flags;
  67
  68         /*
  69          * If we didn't get into open_ctree our allocated_ebs will not be
  70          * initialized, so just skip this.
  71          */
  72         if (!fs_info->allocated_ebs.next)
  73                 return;
  74
  75         spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
  76         while (!list_empty(&fs_info->allocated_ebs)) {
  77                 eb = list_first_entry(&fs_info->allocated_ebs,
  78                                       struct extent_buffer, leak_list);
  79                 pr_err(
  80         "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
  81                        eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
  82                        btrfs_header_owner(eb));
  83                 list_del(&eb->leak_list);
  84                 kmem_cache_free(extent_buffer_cache, eb);
  85         }
  86         spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
  87 }
  88
  89 static inline void btrfs_extent_state_leak_debug_check(void)
  90 {
  91         struct extent_state *state;
  92
  93         while (!list_empty(&states)) {
  94                 state = list_entry(states.next, struct extent_state, leak_list);
  95                 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
  96                        state->start, state->end, state->state,
  97                        extent_state_in_tree(state),
  98                        refcount_read(&state->refs));
  99                 list_del(&state->leak_list);
 100                 kmem_cache_free(extent_state_cache, state);
 101         }
 102 }
 103
 104 #define btrfs_debug_check_extent_io_range(tree, start, end)             \
 105         __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
 106 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 107                 struct extent_io_tree *tree, u64 start, u64 end)
 108 {
 109         struct inode *inode = tree->private_data;
 110         u64 isize;
 111
 112         if (!inode || !is_data_inode(inode))
 113                 return;
 114
 115         isize = i_size_read(inode);
 116         if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
 117                 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
 118                     "%s: ino %llu isize %llu odd range [%llu,%llu]",
 119                         caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
 120         }
 121 }
 122 #else
     /*
      * Without CONFIG_BTRFS_DEBUG the leak-tracking and range-check hooks expand
      * to empty statements, so callers need no #ifdef of their own.
      */
 123 #define btrfs_leak_debug_add(lock, new, head)   do {} while (0)
 124 #define btrfs_leak_debug_del(lock, entry)       do {} while (0)
 125 #define btrfs_extent_state_leak_debug_check()   do {} while (0)
 126 #define btrfs_debug_check_extent_io_range(c, s, e)      do {} while (0)
 127 #endif
 128
     /*
      * Minimal view of an rbtree element keyed by an inclusive [start, end] range.
      * tree_insert() and __etree_search() recover this type from an rb_node and
      * compare offsets against 'start' and 'end'.
      *
      * NOTE(review): the nodes linked into the tree are extent_state rb_nodes, so
      * this presumably has to match the field layout of struct extent_state -
      * confirm against extent-io-tree.h before touching the member order.
      */
 129 struct tree_entry {
 130         u64 start;
 131         u64 end;
 132         struct rb_node rb_node;
 133 };
 134
     /*
      * State shared by the write helpers in this file (see end_write_bio() and
      * flush_write_bio()): the bio currently being built plus two one-bit flags.
      */
 135 struct extent_page_data {
             /*
              * Bio not yet submitted, or NULL.  flush_write_bio() submits it and
              * end_write_bio() completes it with an error; both reset it to NULL.
              */
 136         struct bio *bio;
 137         /* tells writepage not to lock the state bits for this range
 138          * it still does the unlocking
 139          */
 140         unsigned int extent_locked:1;
 141
 142         /* tells the submit_bio code to use REQ_SYNC */
 143         unsigned int sync_io:1;
 144 };
 145
 146 static int add_extent_changeset(struct extent_state *state, u32 bits,
 147                                  struct extent_changeset *changeset,
 148                                  int set)
 149 {
 150         int ret;
 151
 152         if (!changeset)
 153                 return 0;
 154         if (set && (state->state & bits) == bits)
 155                 return 0;
 156         if (!set && (state->state & bits) == 0)
 157                 return 0;
 158         changeset->bytes_changed += state->end - state->start + 1;
 159         ret = ulist_add(&changeset->range_changed, state->start, state->end,
 160                         GFP_ATOMIC);
 161         return ret;
 162 }
 163
 164 int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 165                                 unsigned long bio_flags)
 166 {
 167         blk_status_t ret = 0;
 168         struct extent_io_tree *tree = bio->bi_private;
 169
 170         bio->bi_private = NULL;
 171
 172         if (is_data_inode(tree->private_data))
 173                 ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
 174                                             bio_flags);
 175         else
 176                 ret = btrfs_submit_metadata_bio(tree->private_data, bio,
 177                                                 mirror_num, bio_flags);
 178
 179         return blk_status_to_errno(ret);
 180 }
 181
 182 /* Cleanup unsubmitted bios */
 183 static void end_write_bio(struct extent_page_data *epd, int ret)
 184 {
 185         if (epd->bio) {
 186                 epd->bio->bi_status = errno_to_blk_status(ret);
 187                 bio_endio(epd->bio);
 188                 epd->bio = NULL;
 189         }
 190 }
 191
 192 /*
 193  * Submit bio from extent page data via submit_one_bio
 194  *
 195  * Return 0 if everything is OK.
 196  * Return <0 for error.
 197  */
 198 static int __must_check flush_write_bio(struct extent_page_data *epd)
 199 {
 200         int ret = 0;
 201
 202         if (epd->bio) {
 203                 ret = submit_one_bio(epd->bio, 0, 0);
 204                 /*
 205                  * Clean up of epd->bio is handled by its endio function.
 206                  * And endio is either triggered by successful bio execution
 207                  * or the error handler of submit bio hook.
 208                  * So at this point, no matter what happened, we don't need
 209                  * to clean up epd->bio.
 210                  */
 211                 epd->bio = NULL;
 212         }
 213         return ret;
 214 }
 215
 216 int __init extent_state_cache_init(void)
 217 {
 218         extent_state_cache = kmem_cache_create("btrfs_extent_state",
 219                         sizeof(struct extent_state), 0,
 220                         SLAB_MEM_SPREAD, NULL);
 221         if (!extent_state_cache)
 222                 return -ENOMEM;
 223         return 0;
 224 }
 225
 226 int __init extent_io_init(void)
 227 {
 228         extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
 229                         sizeof(struct extent_buffer), 0,
 230                         SLAB_MEM_SPREAD, NULL);
 231         if (!extent_buffer_cache)
 232                 return -ENOMEM;
 233
 234         if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
 235                         offsetof(struct btrfs_io_bio, bio),
 236                         BIOSET_NEED_BVECS))
 237                 goto free_buffer_cache;
 238
 239         if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
 240                 goto free_bioset;
 241
 242         return 0;
 243
 244 free_bioset:
 245         bioset_exit(&btrfs_bioset);
 246
 247 free_buffer_cache:
 248         kmem_cache_destroy(extent_buffer_cache);
 249         extent_buffer_cache = NULL;
 250         return -ENOMEM;
 251 }
 252
 253 void __cold extent_state_cache_exit(void)
 254 {
 255         btrfs_extent_state_leak_debug_check();
 256         kmem_cache_destroy(extent_state_cache);
 257 }
 258
 259 void __cold extent_io_exit(void)
 260 {
 261         /*
 262          * Make sure all delayed rcu free are flushed before we
 263          * destroy caches.
 264          */
 265         rcu_barrier();
 266         kmem_cache_destroy(extent_buffer_cache);
 267         bioset_exit(&btrfs_bioset);
 268 }
 269
 270 /*
 271  * For the file_extent_tree, we want to hold the inode lock when we look up and
 272  * update the disk_i_size, but lockdep will complain because with our io_tree
 273  * we hold the tree lock and then take the inode lock when setting delalloc.
 274  * These two things are unrelated, so make a separate lock class for the
 275  * file_extent_tree so the two locking patterns don't get mixed up.
      *
      * extent_io_tree_init() assigns this class to the lock of every tree whose
      * owner is IO_TREE_INODE_FILE_EXTENT.
 276  */
 277 static struct lock_class_key file_extent_tree_class;
 278
 279 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
 280                          struct extent_io_tree *tree, unsigned int owner,
 281                          void *private_data)
 282 {
 283         tree->fs_info = fs_info;
 284         tree->state = RB_ROOT;
 285         tree->dirty_bytes = 0;
 286         spin_lock_init(&tree->lock);
 287         tree->private_data = private_data;
 288         tree->owner = owner;
 289         if (owner == IO_TREE_INODE_FILE_EXTENT)
 290                 lockdep_set_class(&tree->lock, &file_extent_tree_class);
 291 }
 292
 293 void extent_io_tree_release(struct extent_io_tree *tree)
 294 {
 295         spin_lock(&tree->lock);
 296         /*
 297          * Do a single barrier for the waitqueue_active check here, the state
 298          * of the waitqueue should not change once extent_io_tree_release is
 299          * called.
 300          */
 301         smp_mb();
 302         while (!RB_EMPTY_ROOT(&tree->state)) {
 303                 struct rb_node *node;
 304                 struct extent_state *state;
 305
 306                 node = rb_first(&tree->state);
 307                 state = rb_entry(node, struct extent_state, rb_node);
 308                 rb_erase(&state->rb_node, &tree->state);
 309                 RB_CLEAR_NODE(&state->rb_node);
 310                 /*
 311                  * btree io trees aren't supposed to have tasks waiting for
 312                  * changes in the flags of extent states ever.
 313                  */
 314                 ASSERT(!waitqueue_active(&state->wq));
 315                 free_extent_state(state);
 316
 317                 cond_resched_lock(&tree->lock);
 318         }
 319         spin_unlock(&tree->lock);
 320 }
 321
 322 static struct extent_state *alloc_extent_state(gfp_t mask)
 323 {
 324         struct extent_state *state;
 325
 326         /*
 327          * The given mask might be not appropriate for the slab allocator,
 328          * drop the unsupported bits
 329          */
 330         mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
 331         state = kmem_cache_alloc(extent_state_cache, mask);
 332         if (!state)
 333                 return state;
 334         state->state = 0;
 335         state->failrec = NULL;
 336         RB_CLEAR_NODE(&state->rb_node);
 337         btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
 338         refcount_set(&state->refs, 1);
 339         init_waitqueue_head(&state->wq);
 340         trace_alloc_extent_state(state, mask, _RET_IP_);
 341         return state;
 342 }
 343
 344 void free_extent_state(struct extent_state *state)
 345 {
 346         if (!state)
 347                 return;
 348         if (refcount_dec_and_test(&state->refs)) {
 349                 WARN_ON(extent_state_in_tree(state));
 350                 btrfs_leak_debug_del(&leak_lock, &state->leak_list);
 351                 trace_free_extent_state(state, _RET_IP_);
 352                 kmem_cache_free(extent_state_cache, state);
 353         }
 354 }
 355
 356 static struct rb_node *tree_insert(struct rb_root *root,
 357                                    struct rb_node *search_start,
 358                                    u64 offset,
 359                                    struct rb_node *node,
 360                                    struct rb_node ***p_in,
 361                                    struct rb_node **parent_in)
 362 {
 363         struct rb_node **p;
 364         struct rb_node *parent = NULL;
 365         struct tree_entry *entry;
 366
 367         if (p_in && parent_in) {
 368                 p = *p_in;
 369                 parent = *parent_in;
 370                 goto do_insert;
 371         }
 372
 373         p = search_start ? &search_start : &root->rb_node;
 374         while (*p) {
 375                 parent = *p;
 376                 entry = rb_entry(parent, struct tree_entry, rb_node);
 377
 378                 if (offset < entry->start)
 379                         p = &(*p)->rb_left;
 380                 else if (offset > entry->end)
 381                         p = &(*p)->rb_right;
 382                 else
 383                         return parent;
 384         }
 385
 386 do_insert:
 387         rb_link_node(node, parent, p);
 388         rb_insert_color(node, root);
 389         return NULL;
 390 }
 391
 392 /**
 393  * Search @tree for an entry that contains @offset. Such entry would have
 394  * entry->start <= offset && entry->end >= offset.
 395  *
 396  * @tree:       the tree to search
 397  * @offset:     offset that should fall within an entry in @tree
 398  * @next_ret:   pointer to the first entry whose range ends after @offset
 399  * @prev_ret:   pointer to the first entry whose range begins before @offset
 400  * @p_ret:      pointer where new node should be anchored (used when inserting an
 401  *              entry in the tree)
 402  * @parent_ret: points to entry which would have been the parent of the entry,
 403  *               containing @offset
 404  *
 405  * This function returns a pointer to the entry that contains @offset byte
 406  * address. If no such entry exists, then NULL is returned and the other
 407  * pointer arguments to the function are filled, otherwise the found entry is
 408  * returned and other pointers are left untouched.
 409  */
 410 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 411                                       struct rb_node **next_ret,
 412                                       struct rb_node **prev_ret,
 413                                       struct rb_node ***p_ret,
 414                                       struct rb_node **parent_ret)
 415 {
 416         struct rb_root *root = &tree->state;
 417         struct rb_node **n = &root->rb_node;
 418         struct rb_node *prev = NULL;
 419         struct rb_node *orig_prev = NULL;
 420         struct tree_entry *entry;
 421         struct tree_entry *prev_entry = NULL;
 422
 423         while (*n) {
 424                 prev = *n;
 425                 entry = rb_entry(prev, struct tree_entry, rb_node);
 426                 prev_entry = entry;
 427
 428                 if (offset < entry->start)
 429                         n = &(*n)->rb_left;
 430                 else if (offset > entry->end)
 431                         n = &(*n)->rb_right;
 432                 else
 433                         return *n;
 434         }
 435
 436         if (p_ret)
 437                 *p_ret = n;
 438         if (parent_ret)
 439                 *parent_ret = prev;
 440
 441         if (next_ret) {
 442                 orig_prev = prev;
 443                 while (prev && offset > prev_entry->end) {
 444                         prev = rb_next(prev);
 445                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 446                 }
 447                 *next_ret = prev;
 448                 prev = orig_prev;
 449         }
 450
 451         if (prev_ret) {
 452                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 453                 while (prev && offset < prev_entry->start) {
 454                         prev = rb_prev(prev);
 455                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 456                 }
 457                 *prev_ret = prev;
 458         }
 459         return NULL;
 460 }
 461
 462 static inline struct rb_node *
 463 tree_search_for_insert(struct extent_io_tree *tree,
 464                        u64 offset,
 465                        struct rb_node ***p_ret,
 466                        struct rb_node **parent_ret)
 467 {
 468         struct rb_node *next= NULL;
 469         struct rb_node *ret;
 470
 471         ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
 472         if (!ret)
 473                 return next;
 474         return ret;
 475 }
 476
 477 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 478                                           u64 offset)
 479 {
 480         return tree_search_for_insert(tree, offset, NULL, NULL);
 481 }
 482
 483 /*
 484  * utility function to look for merge candidates inside a given range.
 485  * Any extents with matching state are merged together into a single
 486  * extent in the tree.  Extents with EXTENT_IO in their state field
 487  * are not merged because the end_io handlers need to be able to do
 488  * operations on them without sleeping (or doing allocations/splits).
 489  *
 490  * This should be called with the tree lock held.
 491  */
 492 static void merge_state(struct extent_io_tree *tree,
 493                         struct extent_state *state)
 494 {
 495         struct extent_state *other;
 496         struct rb_node *other_node;
 497
 498         if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 499                 return;
 500
 501         other_node = rb_prev(&state->rb_node);
 502         if (other_node) {
 503                 other = rb_entry(other_node, struct extent_state, rb_node);
 504                 if (other->end == state->start - 1 &&
 505                     other->state == state->state) {
 506                         if (tree->private_data &&
 507                             is_data_inode(tree->private_data))
 508                                 btrfs_merge_delalloc_extent(tree->private_data,
 509                                                             state, other);
 510                         state->start = other->start;
 511                         rb_erase(&other->rb_node, &tree->state);
 512                         RB_CLEAR_NODE(&other->rb_node);
 513                         free_extent_state(other);
 514                 }
 515         }
 516         other_node = rb_next(&state->rb_node);
 517         if (other_node) {
 518                 other = rb_entry(other_node, struct extent_state, rb_node);
 519                 if (other->start == state->end + 1 &&
 520                     other->state == state->state) {
 521                         if (tree->private_data &&
 522                             is_data_inode(tree->private_data))
 523                                 btrfs_merge_delalloc_extent(tree->private_data,
 524                                                             state, other);
 525                         state->end = other->end;
 526                         rb_erase(&other->rb_node, &tree->state);
 527                         RB_CLEAR_NODE(&other->rb_node);
 528                         free_extent_state(other);
 529                 }
 530         }
 531 }
 532
     /*
      * Forward declaration: insert_state() below calls set_state_bits() before
      * its definition appears.
      */
 533 static void set_state_bits(struct extent_io_tree *tree,
 534                            struct extent_state *state, u32 *bits,
 535                            struct extent_changeset *changeset);
 536
 537 /*
 538  * insert an extent_state struct into the tree.  'bits' are set on the
 539  * struct before it is inserted.
 540  *
 541  * This may return -EEXIST if the extent is already there, in which case the
 542  * state struct is freed.
 543  *
 544  * The tree lock is not taken internally.  This is a utility function and
 545  * probably isn't what you want to call (see set/clear_extent_bit).
 546  */
 547 static int insert_state(struct extent_io_tree *tree,
 548                         struct extent_state *state, u64 start, u64 end,
 549                         struct rb_node ***p,
 550                         struct rb_node **parent,
 551                         u32 *bits, struct extent_changeset *changeset)
 552 {
 553         struct rb_node *node;
 554
 555         if (end < start) {
 556                 btrfs_err(tree->fs_info,
 557                         "insert state: end < start %llu %llu", end, start);
 558                 WARN_ON(1);
 559         }
 560         state->start = start;
 561         state->end = end;
 562
 563         set_state_bits(tree, state, bits, changeset);
 564
 565         node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
 566         if (node) {
 567                 struct extent_state *found;
 568                 found = rb_entry(node, struct extent_state, rb_node);
 569                 btrfs_err(tree->fs_info,
 570                        "found node %llu %llu on insert of %llu %llu",
 571                        found->start, found->end, start, end);
 572                 return -EEXIST;
 573         }
 574         merge_state(tree, state);
 575         return 0;
 576 }
 577
 578 /*
 579  * split a given extent state struct in two, inserting the preallocated
 580  * struct 'prealloc' as the newly created second half.  'split' indicates an
 581  * offset inside 'orig' where it should be split.
 582  *
 583  * Before calling,
 584  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 585  * are two extent state structs in the tree:
 586  * prealloc: [orig->start, split - 1]
 587  * orig: [ split, orig->end ]
 588  *
 589  * The tree locks are not taken by this function. They need to be held
 590  * by the caller.
 591  */
 592 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 593                        struct extent_state *prealloc, u64 split)
 594 {
 595         struct rb_node *node;
 596
 597         if (tree->private_data && is_data_inode(tree->private_data))
 598                 btrfs_split_delalloc_extent(tree->private_data, orig, split);
 599
 600         prealloc->start = orig->start;
 601         prealloc->end = split - 1;
 602         prealloc->state = orig->state;
 603         orig->start = split;
 604
 605         node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
 606                            &prealloc->rb_node, NULL, NULL);
 607         if (node) {
 608                 free_extent_state(prealloc);
 609                 return -EEXIST;
 610         }
 611         return 0;
 612 }
 613
 614 static struct extent_state *next_state(struct extent_state *state)
 615 {
 616         struct rb_node *next = rb_next(&state->rb_node);
 617         if (next)
 618                 return rb_entry(next, struct extent_state, rb_node);
 619         else
 620                 return NULL;
 621 }
 622
 623 /*
 624  * utility function to clear some bits in an extent state struct.
 625  * it will optionally wake up anyone waiting on this state (wake == 1).
 626  *
 627  * If no bits are set on the state struct after clearing things, the
 628  * struct is freed and removed from the tree
 629  */
 630 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 631                                             struct extent_state *state,
 632                                             u32 *bits, int wake,
 633                                             struct extent_changeset *changeset)
 634 {
 635         struct extent_state *next;
 636         u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
 637         int ret;
 638
 639         if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 640                 u64 range = state->end - state->start + 1;
 641                 WARN_ON(range > tree->dirty_bytes);
 642                 tree->dirty_bytes -= range;
 643         }
 644
 645         if (tree->private_data && is_data_inode(tree->private_data))
 646                 btrfs_clear_delalloc_extent(tree->private_data, state, bits);
 647
 648         ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
 649         BUG_ON(ret < 0);
 650         state->state &= ~bits_to_clear;
 651         if (wake)
 652                 wake_up(&state->wq);
 653         if (state->state == 0) {
 654                 next = next_state(state);
 655                 if (extent_state_in_tree(state)) {
 656                         rb_erase(&state->rb_node, &tree->state);
 657                         RB_CLEAR_NODE(&state->rb_node);
 658                         free_extent_state(state);
 659                 } else {
 660                         WARN_ON(1);
 661                 }
 662         } else {
 663                 merge_state(tree, state);
 664                 next = next_state(state);
 665         }
 666         return next;
 667 }
 668
 669 static struct extent_state *
 670 alloc_extent_state_atomic(struct extent_state *prealloc)
 671 {
 672         if (!prealloc)
 673                 prealloc = alloc_extent_state(GFP_ATOMIC);
 674
 675         return prealloc;
 676 }
 677
 678 static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 679 {
 680         btrfs_panic(tree->fs_info, err,
 681         "locking error: extent tree was modified by another thread while locked");
 682 }
 683
 684 /*
 685  * clear some bits on a range in the tree.  This may require splitting
 686  * or inserting elements in the tree, so the gfp mask is used to
 687  * indicate which allocations or sleeping are allowed.
 688  *
 689  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 690  * the given range from the tree regardless of state (ie for truncate).
 691  *
 692  * the range [start, end] is inclusive.
 693  *
      * 'cached_state', when non-NULL, may point at a state to start from instead
      * of searching the tree.  If EXTENT_LOCKED or EXTENT_BOUNDARY is among the
      * bits being cleared, *cached_state is reset to NULL and the reference it
      * held is dropped.
      *
      * 'changeset', when non-NULL, collects the ranges whose bits really changed.
      *
 694  * This takes the tree lock.  Every exit below returns 0: a failed atomic
      * allocation trips BUG_ON() and a failed split is routed to
      * extent_io_tree_panic().
 695  */
 696 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 697                        u32 bits, int wake, int delete,
 698                        struct extent_state **cached_state,
 699                        gfp_t mask, struct extent_changeset *changeset)
 700 {
 701         struct extent_state *state;
 702         struct extent_state *cached;
 703         struct extent_state *prealloc = NULL;
 704         struct rb_node *node;
 705         u64 last_end;
 706         int err;
 707         int clear = 0;
 708
 709         btrfs_debug_check_extent_io_range(tree, start, end);
 710         trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
 711
             /* Clearing delalloc always clears the NORESERVE bit along with it. */
 712         if (bits & EXTENT_DELALLOC)
 713                 bits |= EXTENT_NORESERVE;
 714
             /*
              * Deleting means clearing every non-control bit, which leaves the
              * affected states empty so that clear_state_bit() removes them.
              */
 715         if (delete)
 716                 bits |= ~EXTENT_CTLBITS;
 717
             /* 'clear' decides below whether the cached state reference is kept. */
 718         if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 719                 clear = 1;
 720 again:
 721         if (!prealloc && gfpflags_allow_blocking(mask)) {
 722                 /*
 723                  * Don't care for allocation failure here because we might end
 724                  * up not needing the pre-allocated extent state at all, which
 725                  * is the case if we only have in the tree extent states that
 726                  * cover our input range and don't cover any other range.
 727                  * If we end up needing a new extent state we allocate it later.
 728                  */
 729                 prealloc = alloc_extent_state(mask);
 730         }
 731
 732         spin_lock(&tree->lock);
 733         if (cached_state) {
 734                 cached = *cached_state;
 735
                     /*
                      * Detach the cache from the caller right away; clearing the
                      * local pointer too keeps this from running again after a
                      * jump back to 'again'.
                      */
 736                 if (clear) {
 737                         *cached_state = NULL;
 738                         cached_state = NULL;
 739                 }
 740
                     /*
                      * The cached state is usable as a starting point if it is
                      * still in the tree, starts at or before 'start' and ends
                      * after it.
                      */
 741                 if (cached && extent_state_in_tree(cached) &&
 742                     cached->start <= start && cached->end > start) {
                             /*
                              * Drop the reference the cache held.  The state is
                              * still in the tree, so this is presumably never the
                              * last reference - hence a plain refcount_dec().
                              */
 743                         if (clear)
 744                                 refcount_dec(&cached->refs);
 745                         state = cached;
 746                         goto hit_next;
 747                 }
 748                 if (clear)
 749                         free_extent_state(cached);
 750         }
 751         /*
 752          * this search will find the extents that end after
 753          * our range starts
 754          */
 755         node = tree_search(tree, start);
 756         if (!node)
 757                 goto out;
 758         state = rb_entry(node, struct extent_state, rb_node);
 759 hit_next:
 760         if (state->start > end)
 761                 goto out;
 762         WARN_ON(state->end < start);
             /* Remember the end now; 'state' may be freed or merged below. */
 763         last_end = state->end;
 764
 765         /* the state doesn't have the wanted bits, go ahead */
 766         if (!(state->state & bits)) {
 767                 state = next_state(state);
 768                 goto next;
 769         }
 770
 771         /*
 772          *     | ---- desired range ---- |
 773          *  | state | or
 774          *  | ------------- state -------------- |
 775          *
 776          * We need to split the extent we found, and may flip
 777          * bits on second half.
 778          *
 779          * If the extent we found extends past our range, we
 780          * just split and search again.  It'll get split again
 781          * the next time through.
 782          *
 783          * If the extent we found is inside our range, we clear
 784          * the desired bit on it.
 785          */
 786
 787         if (state->start < start) {
 788                 prealloc = alloc_extent_state_atomic(prealloc);
 789                 BUG_ON(!prealloc);
 790                 err = split_state(tree, state, prealloc, start);
 791                 if (err)
 792                         extent_io_tree_panic(tree, err);
 793
                     /*
                      * prealloc is now either linked into the tree or, on error,
                      * already released by split_state(); forget it either way.
                      */
 794                 prealloc = NULL;
 795                 if (err)
 796                         goto out;
 797                 if (state->end <= end) {
 798                         state = clear_state_bit(tree, state, &bits, wake,
 799                                                 changeset);
 800                         goto next;
 801                 }
 802                 goto search_again;
 803         }
 804         /*
 805          * | ---- desired range ---- |
 806          *                        | state |
 807          * We need to split the extent, and clear the bit
 808          * on the first half
 809          */
 810         if (state->start <= end && state->end > end) {
 811                 prealloc = alloc_extent_state_atomic(prealloc);
 812                 BUG_ON(!prealloc);
 813                 err = split_state(tree, state, prealloc, end + 1);
 814                 if (err)
 815                         extent_io_tree_panic(tree, err);
 816
 817                 if (wake)
 818                         wake_up(&state->wq);
 819
                     /* After the split, prealloc is the part inside our range. */
 820                 clear_state_bit(tree, prealloc, &bits, wake, changeset);
 821
 822                 prealloc = NULL;
 823                 goto out;
 824         }
 825
             /* The state lies entirely inside [start, end]. */
 826         state = clear_state_bit(tree, state, &bits, wake, changeset);
 827 next:
             /* last_end + 1 would wrap around; the whole range has been handled. */
 828         if (last_end == (u64)-1)
 829                 goto out;
 830         start = last_end + 1;
             /*
              * Keep walking the neighbouring states without a new search as long
              * as there is work left and no reschedule is due.
              */
 831         if (start <= end && state && !need_resched())
 832                 goto hit_next;
 833
 834 search_again:
 835         if (start > end)
 836                 goto out;
 837         spin_unlock(&tree->lock);
 838         if (gfpflags_allow_blocking(mask))
 839                 cond_resched();
 840         goto again;
 841
 842 out:
 843         spin_unlock(&tree->lock);
             /* An unused preallocation is handed back here. */
 844         if (prealloc)
 845                 free_extent_state(prealloc);
 846
 847         return 0;
 848
 849 }
 850
/*
 * Sleep once on @state's wait queue.
 *
 * Must be called with tree->lock held.  The lock is released while the task
 * sleeps and is held again when the function returns (see the
 * __releases/__acquires annotations).  The caller is responsible for keeping
 * @state alive across the sleep.
 */
static void wait_on_state(struct extent_io_tree *tree,
                          struct extent_state *state)
                __releases(tree->lock)
                __acquires(tree->lock)
{
        DEFINE_WAIT(wait);
        /*
         * Queue ourselves on the wait queue before dropping the lock, so a
         * wake_up() issued once the lock is released is not lost.
         */
        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
        spin_unlock(&tree->lock);
        schedule();
        spin_lock(&tree->lock);
        finish_wait(&state->wq, &wait);
}
 863
 864 /*
 865  * waits for one or more bits to clear on a range in the state tree.
 866  * The range [start, end] is inclusive.
 867  * The tree lock is taken by this function
 868  */
 869 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 870                             u32 bits)
 871 {
 872         struct extent_state *state;
 873         struct rb_node *node;
 874
 875         btrfs_debug_check_extent_io_range(tree, start, end);
 876
 877         spin_lock(&tree->lock);
 878 again:
 879         while (1) {
 880                 /*
 881                  * this search will find all the extents that end after
 882                  * our range starts
 883                  */
 884                 node = tree_search(tree, start);
 885 process_node:
 886                 if (!node)
 887                         break;
 888
 889                 state = rb_entry(node, struct extent_state, rb_node);
 890
 891                 if (state->start > end)
 892                         goto out;
 893
 894                 if (state->state & bits) {
 895                         start = state->start;
 896                         refcount_inc(&state->refs);
 897                         wait_on_state(tree, state);
 898                         free_extent_state(state);
 899                         goto again;
 900                 }
 901                 start = state->end + 1;
 902
 903                 if (start > end)
 904                         break;
 905
 906                 if (!cond_resched_lock(&tree->lock)) {
 907                         node = rb_next(node);
 908                         goto process_node;
 909                 }
 910         }
 911 out:
 912         spin_unlock(&tree->lock);
 913 }
 914
 915 static void set_state_bits(struct extent_io_tree *tree,
 916                            struct extent_state *state,
 917                            u32 *bits, struct extent_changeset *changeset)
 918 {
 919         u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
 920         int ret;
 921
 922         if (tree->private_data && is_data_inode(tree->private_data))
 923                 btrfs_set_delalloc_extent(tree->private_data, state, bits);
 924
 925         if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 926                 u64 range = state->end - state->start + 1;
 927                 tree->dirty_bytes += range;
 928         }
 929         ret = add_extent_changeset(state, bits_to_set, changeset, 1);
 930         BUG_ON(ret < 0);
 931         state->state |= bits_to_set;
 932 }
 933
 934 static void cache_state_if_flags(struct extent_state *state,
 935                                  struct extent_state **cached_ptr,
 936                                  unsigned flags)
 937 {
 938         if (cached_ptr && !(*cached_ptr)) {
 939                 if (!flags || (state->state & flags)) {
 940                         *cached_ptr = state;
 941                         refcount_inc(&state->refs);
 942                 }
 943         }
 944 }
 945
 946 static void cache_state(struct extent_state *state,
 947                         struct extent_state **cached_ptr)
 948 {
 949         return cache_state_if_flags(state, cached_ptr,
 950                                     EXTENT_LOCKED | EXTENT_BOUNDARY);
 951 }
 952
/*
 * Set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * @tree:           the io tree to modify
 * @start:          first byte of the range
 * @end:            last byte of the range; [start, end] is inclusive
 * @bits:           the bits to set
 * @exclusive_bits: if any of these bits is already set on a state that
 *                  overlaps the range, the call fails with -EEXIST and the
 *                  offset at which the conflict was hit is stored in
 *                  @failed_start
 * @failed_start:   must be non-NULL if @exclusive_bits is non-zero and NULL
 *                  otherwise (asserted below)
 * @cached_state:   optional cache slot; a cached state that is still in the
 *                  tree and covers @start is used instead of searching the
 *                  tree, and states touched here are offered to cache_state()
 * @mask:           gfp mask for the extent state preallocation
 * @changeset:      passed on to set_state_bits() and insert_state()
 *
 * This takes the tree lock.
 *
 * Returns 0 on success and -EEXIST on an exclusive bit conflict.  An error
 * from split_state() is also passed back if extent_io_tree_panic() returns.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
                   u32 exclusive_bits, u64 *failed_start,
                   struct extent_state **cached_state, gfp_t mask,
                   struct extent_changeset *changeset)
{
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
        struct rb_node *node;
        struct rb_node **p;
        struct rb_node *parent;
        int err = 0;
        u64 last_start;
        u64 last_end;

        btrfs_debug_check_extent_io_range(tree, start, end);
        trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

        if (exclusive_bits)
                ASSERT(failed_start);
        else
                ASSERT(failed_start == NULL);
again:
        if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Don't care about allocation failure here because we might
                 * end up not needing the pre-allocated extent state at all,
                 * which is the case if the tree only has extent states that
                 * cover our input range and don't cover any other range.
                 * If we end up needing a new extent state we allocate it later.
                 */
                prealloc = alloc_extent_state(mask);
        }

        spin_lock(&tree->lock);
        /* Fast path: start from the cached state if it is still usable. */
        if (cached_state && *cached_state) {
                state = *cached_state;
                if (state->start <= start && state->end > start &&
                    extent_state_in_tree(state)) {
                        node = &state->rb_node;
                        goto hit_next;
                }
        }
        /*
         * This search will find all the extents that end after
         * our range starts.
         */
        node = tree_search_for_insert(tree, start, &p, &parent);
        if (!node) {
                /* Nothing at or after @start: insert the whole range. */
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = insert_state(tree, prealloc, start, end,
                                   &p, &parent, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);

                cache_state(prealloc, cached_state);
                prealloc = NULL;
                goto out;
        }
        state = rb_entry(node, struct extent_state, rb_node);
hit_next:
        /* Remember the bounds now, merging below may change the state. */
        last_start = state->start;
        last_end = state->end;

        /*
         * | ---- desired range ---- |
         * | state |
         *
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
                if (state->state & exclusive_bits) {
                        *failed_start = state->start;
                        err = -EEXIST;
                        goto out;
                }

                set_state_bits(tree, state, &bits, changeset);
                cache_state(state, cached_state);
                merge_state(tree, state);
                /* The state reached the end of the u64 space, we are done. */
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
                state = next_state(state);
                /* Continue with the adjacent state without a new search. */
                if (start < end && state && state->start == start &&
                    !need_resched())
                        goto hit_next;
                goto search_again;
        }

        /*
         *     | ---- desired range ---- |
         * | state |
         *   or
         * | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip bits on
         * second half.
         *
         * If the extent we found extends past our
         * range, we just split and search again.  It'll get split
         * again the next time though.
         *
         * If the extent we found is inside our range, we set the
         * desired bit on it.
         */
        if (state->start < start) {
                if (state->state & exclusive_bits) {
                        *failed_start = start;
                        err = -EEXIST;
                        goto out;
                }

                /*
                 * If this extent already has all the bits we want set, then
                 * skip it, not necessary to split it or do anything with it.
                 */
                if ((state->state & bits) == bits) {
                        start = state->end + 1;
                        cache_state(state, cached_state);
                        goto search_again;
                }

                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
                if (err)
                        extent_io_tree_panic(tree, err);

                /* The preallocated state was handed to split_state(). */
                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, &bits, changeset);
                        cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
                        state = next_state(state);
                        if (start < end && state && state->start == start &&
                            !need_resched())
                                goto hit_next;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *     | state | or               | state |
         *
         * There's a hole, we need to insert something in it and
         * ignore the extent we found.
         */
        if (state->start > start) {
                u64 this_end;
                /* The new state ends at @end or right before the found one. */
                if (end < last_start)
                        this_end = end;
                else
                        this_end = last_start - 1;

                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);

                /*
                 * Avoid freeing 'prealloc' if it can be merged with
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
                                   NULL, NULL, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);

                cache_state(prealloc, cached_state);
                prealloc = NULL;
                start = this_end + 1;
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         * We need to split the extent, and set the bit
         * on the first half
         */
        if (state->start <= end && state->end > end) {
                if (state->state & exclusive_bits) {
                        *failed_start = start;
                        err = -EEXIST;
                        goto out;
                }

                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
                if (err)
                        extent_io_tree_panic(tree, err);

                /* The bits are set on 'prealloc', not on 'state'. */
                set_state_bits(tree, prealloc, &bits, changeset);
                cache_state(prealloc, cached_state);
                merge_state(tree, prealloc);
                prealloc = NULL;
                goto out;
        }

search_again:
        if (start > end)
                goto out;
        /* Drop the lock and restart from the updated @start. */
        spin_unlock(&tree->lock);
        if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;

out:
        spin_unlock(&tree->lock);
        /* Release a preallocated state that ended up unused. */
        if (prealloc)
                free_extent_state(prealloc);

        return err;

}
1182
/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 *                      another
 * @tree:       the io tree to search
 * @start:      the start offset in bytes
 * @end:        the end offset in bytes (inclusive)
 * @bits:       the bits to set in this range
 * @clear_bits: the bits to clear in this range
 * @cached_state:       state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 *
 * Takes the tree lock.
 *
 * Return: 0 on success, -ENOMEM if an extent state could not be allocated.
 * An error from split_state() is also passed back if extent_io_tree_panic()
 * returns.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                       u32 bits, u32 clear_bits,
                       struct extent_state **cached_state)
{
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
        struct rb_node *node;
        struct rb_node **p;
        struct rb_node *parent;
        int err = 0;
        u64 last_start;
        u64 last_end;
        bool first_iteration = true;

        btrfs_debug_check_extent_io_range(tree, start, end);
        trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
                                       clear_bits);

again:
        if (!prealloc) {
                /*
                 * Best effort, don't worry if extent state allocation fails
                 * here for the first iteration. We might have a cached state
                 * that matches exactly the target range, in which case no
                 * extent state allocations are needed. We'll only know this
                 * after locking the tree.
                 */
                prealloc = alloc_extent_state(GFP_NOFS);
                if (!prealloc && !first_iteration)
                        return -ENOMEM;
        }

        spin_lock(&tree->lock);
        /* Fast path: start from the cached state if it is still usable. */
        if (cached_state && *cached_state) {
                state = *cached_state;
                if (state->start <= start && state->end > start &&
                    extent_state_in_tree(state)) {
                        node = &state->rb_node;
                        goto hit_next;
                }
        }

        /*
         * This search will find all the extents that end after
         * our range starts.
         */
        node = tree_search_for_insert(tree, start, &p, &parent);
        if (!node) {
                /* Nothing at or after @start: insert the whole range. */
                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc) {
                        err = -ENOMEM;
                        goto out;
                }
                err = insert_state(tree, prealloc, start, end,
                                   &p, &parent, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
                prealloc = NULL;
                goto out;
        }
        state = rb_entry(node, struct extent_state, rb_node);
hit_next:
        /* Remember the bounds before the state is modified below. */
        last_start = state->start;
        last_end = state->end;

        /*
         * | ---- desired range ---- |
         * | state |
         *
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
                set_state_bits(tree, state, &bits, NULL);
                cache_state(state, cached_state);
                /* clear_state_bit() hands back the state to continue with. */
                state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
                /* The state reached the end of the u64 space, we are done. */
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
                /* Continue with the adjacent state without a new search. */
                if (start < end && state && state->start == start &&
                    !need_resched())
                        goto hit_next;
                goto search_again;
        }

        /*
         *     | ---- desired range ---- |
         * | state |
         *   or
         * | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip bits on
         * second half.
         *
         * If the extent we found extends past our
         * range, we just split and search again.  It'll get split
         * again the next time though.
         *
         * If the extent we found is inside our range, we set the
         * desired bit on it.
         */
        if (state->start < start) {
                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc) {
                        err = -ENOMEM;
                        goto out;
                }
                err = split_state(tree, state, prealloc, start);
                if (err)
                        extent_io_tree_panic(tree, err);
                /* The preallocated state was handed to split_state(). */
                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, &bits, NULL);
                        cache_state(state, cached_state);
                        state = clear_state_bit(tree, state, &clear_bits, 0,
                                                NULL);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
                        if (start < end && state && state->start == start &&
                            !need_resched())
                                goto hit_next;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *     | state | or               | state |
         *
         * There's a hole, we need to insert something in it and
         * ignore the extent we found.
         */
        if (state->start > start) {
                u64 this_end;
                /* The new state ends at @end or right before the found one. */
                if (end < last_start)
                        this_end = end;
                else
                        this_end = last_start - 1;

                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc) {
                        err = -ENOMEM;
                        goto out;
                }

                /*
                 * Avoid freeing 'prealloc' if it can be merged with
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
                                   NULL, NULL, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
                prealloc = NULL;
                start = this_end + 1;
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         * We need to split the extent, and set the bit
         * on the first half
         */
        if (state->start <= end && state->end > end) {
                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc) {
                        err = -ENOMEM;
                        goto out;
                }

                err = split_state(tree, state, prealloc, end + 1);
                if (err)
                        extent_io_tree_panic(tree, err);

                /* The bits are converted on 'prealloc', not on 'state'. */
                set_state_bits(tree, prealloc, &bits, NULL);
                cache_state(prealloc, cached_state);
                clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
                prealloc = NULL;
                goto out;
        }

search_again:
        if (start > end)
                goto out;
        /* Drop the lock and restart from the updated @start. */
        spin_unlock(&tree->lock);
        cond_resched();
        /* From now on a failed preallocation at 'again' is fatal. */
        first_iteration = false;
        goto again;

out:
        spin_unlock(&tree->lock);
        /* Release a preallocated state that ended up unused. */
        if (prealloc)
                free_extent_state(prealloc);

        return err;
}
1400
1401 /* wrappers around set/clear extent bit */
1402 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1403                            u32 bits, struct extent_changeset *changeset)
1404 {
1405         /*
1406          * We don't support EXTENT_LOCKED yet, as current changeset will
1407          * record any bits changed, so for EXTENT_LOCKED case, it will
1408          * either fail with -EEXIST or changeset will record the whole
1409          * range.
1410          */
1411         BUG_ON(bits & EXTENT_LOCKED);
1412
1413         return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1414                               changeset);
1415 }
1416
1417 int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
1418                            u32 bits)
1419 {
1420         return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1421                               GFP_NOWAIT, NULL);
1422 }
1423
1424 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1425                      u32 bits, int wake, int delete,
1426                      struct extent_state **cached)
1427 {
1428         return __clear_extent_bit(tree, start, end, bits, wake, delete,
1429                                   cached, GFP_NOFS, NULL);
1430 }
1431
1432 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1433                 u32 bits, struct extent_changeset *changeset)
1434 {
1435         /*
1436          * Don't support EXTENT_LOCKED case, same reason as
1437          * set_record_extent_bits().
1438          */
1439         BUG_ON(bits & EXTENT_LOCKED);
1440
1441         return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1442                                   changeset);
1443 }
1444
1445 /*
1446  * either insert or lock state struct between start and end use mask to tell
1447  * us if waiting is desired.
1448  */
1449 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1450                      struct extent_state **cached_state)
1451 {
1452         int err;
1453         u64 failed_start;
1454
1455         while (1) {
1456                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
1457                                      EXTENT_LOCKED, &failed_start,
1458                                      cached_state, GFP_NOFS, NULL);
1459                 if (err == -EEXIST) {
1460                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1461                         start = failed_start;
1462                 } else
1463                         break;
1464                 WARN_ON(start > end);
1465         }
1466         return err;
1467 }
1468
1469 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1470 {
1471         int err;
1472         u64 failed_start;
1473
1474         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1475                              &failed_start, NULL, GFP_NOFS, NULL);
1476         if (err == -EEXIST) {
1477                 if (failed_start > start)
1478                         clear_extent_bit(tree, start, failed_start - 1,
1479                                          EXTENT_LOCKED, 1, 0, NULL);
1480                 return 0;
1481         }
1482         return 1;
1483 }
1484
1485 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1486 {
1487         unsigned long index = start >> PAGE_SHIFT;
1488         unsigned long end_index = end >> PAGE_SHIFT;
1489         struct page *page;
1490
1491         while (index <= end_index) {
1492                 page = find_get_page(inode->i_mapping, index);
1493                 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1494                 clear_page_dirty_for_io(page);
1495                 put_page(page);
1496                 index++;
1497         }
1498 }
1499
1500 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1501 {
1502         unsigned long index = start >> PAGE_SHIFT;
1503         unsigned long end_index = end >> PAGE_SHIFT;
1504         struct page *page;
1505
1506         while (index <= end_index) {
1507                 page = find_get_page(inode->i_mapping, index);
1508                 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1509                 __set_page_dirty_nobuffers(page);
1510                 account_page_redirty(page);
1511                 put_page(page);
1512                 index++;
1513         }
1514 }
1515
1516 /* find the first state struct with 'bits' set after 'start', and
1517  * return it.  tree->lock must be held.  NULL will returned if
1518  * nothing was found after 'start'
1519  */
1520 static struct extent_state *
1521 find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
1522 {
1523         struct rb_node *node;
1524         struct extent_state *state;
1525
1526         /*
1527          * this search will find all the extents that end after
1528          * our range starts.
1529          */
1530         node = tree_search(tree, start);
1531         if (!node)
1532                 goto out;
1533
1534         while (1) {
1535                 state = rb_entry(node, struct extent_state, rb_node);
1536                 if (state->end >= start && (state->state & bits))
1537                         return state;
1538
1539                 node = rb_next(node);
1540                 if (!node)
1541                         break;
1542         }
1543 out:
1544         return NULL;
1545 }
1546
1547 /*
1548  * Find the first offset in the io tree with one or more @bits set.
1549  *
1550  * Note: If there are multiple bits set in @bits, any of them will match.
1551  *
1552  * Return 0 if we find something, and update @start_ret and @end_ret.
1553  * Return 1 if we found nothing.
1554  */
1555 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1556                           u64 *start_ret, u64 *end_ret, u32 bits,
1557                           struct extent_state **cached_state)
1558 {
1559         struct extent_state *state;
1560         int ret = 1;
1561
1562         spin_lock(&tree->lock);
1563         if (cached_state && *cached_state) {
1564                 state = *cached_state;
1565                 if (state->end == start - 1 && extent_state_in_tree(state)) {
1566                         while ((state = next_state(state)) != NULL) {
1567                                 if (state->state & bits)
1568                                         goto got_it;
1569                         }
1570                         free_extent_state(*cached_state);
1571                         *cached_state = NULL;
1572                         goto out;
1573                 }
1574                 free_extent_state(*cached_state);
1575                 *cached_state = NULL;
1576         }
1577
1578         state = find_first_extent_bit_state(tree, start, bits);
1579 got_it:
1580         if (state) {
1581                 cache_state_if_flags(state, cached_state, 0);
1582                 *start_ret = state->start;
1583                 *end_ret = state->end;
1584                 ret = 0;
1585         }
1586 out:
1587         spin_unlock(&tree->lock);
1588         return ret;
1589 }
1590
1591 /**
1592  * Find a contiguous area of bits
1593  *
1594  * @tree:      io tree to check
1595  * @start:     offset to start the search from
1596  * @start_ret: the first offset we found with the bits set
1597  * @end_ret:   the final contiguous range of the bits that were set
1598  * @bits:      bits to look for
1599  *
1600  * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1601  * to set bits appropriately, and then merge them again.  During this time it
1602  * will drop the tree->lock, so use this helper if you want to find the actual
1603  * contiguous area for given bits.  We will search to the first bit we find, and
1604  * then walk down the tree until we find a non-contiguous area.  The area
1605  * returned will be the full contiguous area with the bits set.
1606  */
1607 int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
1608                                u64 *start_ret, u64 *end_ret, u32 bits)
1609 {
1610         struct extent_state *state;
1611         int ret = 1;
1612
1613         spin_lock(&tree->lock);
1614         state = find_first_extent_bit_state(tree, start, bits);
1615         if (state) {
1616                 *start_ret = state->start;
1617                 *end_ret = state->end;
1618                 while ((state = next_state(state)) != NULL) {
1619                         if (state->start > (*end_ret + 1))
1620                                 break;
1621                         *end_ret = state->end;
1622                 }
1623                 ret = 0;
1624         }
1625         spin_unlock(&tree->lock);
1626         return ret;
1627 }
1628
1629 /**
1630  * Find the first range that has @bits not set. This range could start before
1631  * @start.
1632  *
1633  * @tree:      the tree to search
1634  * @start:     offset at/after which the found extent should start
1635  * @start_ret: records the beginning of the range
1636  * @end_ret:   records the end of the range (inclusive)
1637  * @bits:      the set of bits which must be unset
1638  *
1639  * Since unallocated range is also considered one which doesn't have the bits
1640  * set it's possible that @end_ret contains -1, this happens in case the range
1641  * spans (last_range_end, end of device]. In this case it's up to the caller to
1642  * trim @end_ret to the appropriate size.
1643  */
1644 void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
1645                                  u64 *start_ret, u64 *end_ret, u32 bits)
1646 {
1647         struct extent_state *state;
1648         struct rb_node *node, *prev = NULL, *next;
1649
1650         spin_lock(&tree->lock);
1651
1652         /* Find first extent with bits cleared */
1653         while (1) {
1654                 node = __etree_search(tree, start, &next, &prev, NULL, NULL);
1655                 if (!node && !next && !prev) {
1656                         /*
1657                          * Tree is completely empty, send full range and let
1658                          * caller deal with it
1659                          */
1660                         *start_ret = 0;
1661                         *end_ret = -1;
1662                         goto out;
1663                 } else if (!node && !next) {
1664                         /*
1665                          * We are past the last allocated chunk, set start at
1666                          * the end of the last extent.
1667                          */
1668                         state = rb_entry(prev, struct extent_state, rb_node);
1669                         *start_ret = state->end + 1;
1670                         *end_ret = -1;
1671                         goto out;
1672                 } else if (!node) {
1673                         node = next;
1674                 }
1675                 /*
1676                  * At this point 'node' either contains 'start' or start is
1677                  * before 'node'
1678                  */
1679                 state = rb_entry(node, struct extent_state, rb_node);
1680
1681                 if (in_range(start, state->start, state->end - state->start + 1)) {
1682                         if (state->state & bits) {
1683                                 /*
1684                                  * |--range with bits sets--|
1685                                  *    |
1686                                  *    start
1687                                  */
1688                                 start = state->end + 1;
1689                         } else {
1690                                 /*
1691                                  * 'start' falls within a range that doesn't
1692                                  * have the bits set, so take its start as
1693                                  * the beginning of the desired range
1694                                  *
1695                                  * |--range with bits cleared----|
1696                                  *      |
1697                                  *      start
1698                                  */
1699                                 *start_ret = state->start;
1700                                 break;
1701                         }
1702                 } else {
1703                         /*
1704                          * |---prev range---|---hole/unset---|---node range---|
1705                          *                          |
1706                          *                        start
1707                          *
1708                          *                        or
1709                          *
1710                          * |---hole/unset--||--first node--|
1711                          * 0   |
1712                          *    start
1713                          */
1714                         if (prev) {
1715                                 state = rb_entry(prev, struct extent_state,
1716                                                  rb_node);
1717                                 *start_ret = state->end + 1;
1718                         } else {
1719                                 *start_ret = 0;
1720                         }
1721                         break;
1722                 }
1723         }
1724
1725         /*
1726          * Find the longest stretch from start until an entry which has the
1727          * bits set
1728          */
1729         while (1) {
1730                 state = rb_entry(node, struct extent_state, rb_node);
1731                 if (state->end >= start && !(state->state & bits)) {
1732                         *end_ret = state->end;
1733                 } else {
1734                         *end_ret = state->start - 1;
1735                         break;
1736                 }
1737
1738                 node = rb_next(node);
1739                 if (!node)
1740                         break;
1741         }
1742 out:
1743         spin_unlock(&tree->lock);
1744 }
1745
1746 /*
1747  * find a contiguous range of bytes in the file marked as delalloc, not
1748  * more than 'max_bytes'.  start and end are used to return the range,
1749  *
1750  * true is returned if we find something, false if nothing was in the tree
1751  */
1752 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1753                                u64 *end, u64 max_bytes,
1754                                struct extent_state **cached_state)
1755 {
1756         struct rb_node *node;
1757         struct extent_state *state;
1758         u64 cur_start = *start;
1759         bool found = false;
1760         u64 total_bytes = 0;
1761
1762         spin_lock(&tree->lock);
1763
1764         /*
1765          * this search will find all the extents that end after
1766          * our range starts.
1767          */
1768         node = tree_search(tree, cur_start);
1769         if (!node) {
1770                 *end = (u64)-1;
1771                 goto out;
1772         }
1773
1774         while (1) {
1775                 state = rb_entry(node, struct extent_state, rb_node);
1776                 if (found && (state->start != cur_start ||
1777                               (state->state & EXTENT_BOUNDARY))) {
1778                         goto out;
1779                 }
1780                 if (!(state->state & EXTENT_DELALLOC)) {
1781                         if (!found)
1782                                 *end = state->end;
1783                         goto out;
1784                 }
1785                 if (!found) {
1786                         *start = state->start;
1787                         *cached_state = state;
1788                         refcount_inc(&state->refs);
1789                 }
1790                 found = true;
1791                 *end = state->end;
1792                 cur_start = state->end + 1;
1793                 node = rb_next(node);
1794                 total_bytes += state->end - state->start + 1;
1795                 if (total_bytes >= max_bytes)
1796                         break;
1797                 if (!node)
1798                         break;
1799         }
1800 out:
1801         spin_unlock(&tree->lock);
1802         return found;
1803 }
1804
1805 static int __process_pages_contig(struct address_space *mapping,
1806                                   struct page *locked_page,
1807                                   pgoff_t start_index, pgoff_t end_index,
1808                                   unsigned long page_ops, pgoff_t *index_ret);
1809
1810 static noinline void __unlock_for_delalloc(struct inode *inode,
1811                                            struct page *locked_page,
1812                                            u64 start, u64 end)
1813 {
1814         unsigned long index = start >> PAGE_SHIFT;
1815         unsigned long end_index = end >> PAGE_SHIFT;
1816
1817         ASSERT(locked_page);
1818         if (index == locked_page->index && end_index == index)
1819                 return;
1820
1821         __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1822                                PAGE_UNLOCK, NULL);
1823 }
1824
1825 static noinline int lock_delalloc_pages(struct inode *inode,
1826                                         struct page *locked_page,
1827                                         u64 delalloc_start,
1828                                         u64 delalloc_end)
1829 {
1830         unsigned long index = delalloc_start >> PAGE_SHIFT;
1831         unsigned long index_ret = index;
1832         unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1833         int ret;
1834
1835         ASSERT(locked_page);
1836         if (index == locked_page->index && index == end_index)
1837                 return 0;
1838
1839         ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1840                                      end_index, PAGE_LOCK, &index_ret);
1841         if (ret == -EAGAIN)
1842                 __unlock_for_delalloc(inode, locked_page, delalloc_start,
1843                                       (u64)index_ret << PAGE_SHIFT);
1844         return ret;
1845 }
1846
1847 /*
1848  * Find and lock a contiguous range of bytes in the file marked as delalloc, no
1849  * more than @max_bytes.  @Start and @end are used to return the range,
1850  *
1851  * Return: true if we find something
1852  *         false if nothing was in the tree
1853  */
1854 EXPORT_FOR_TESTS
1855 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
1856                                     struct page *locked_page, u64 *start,
1857                                     u64 *end)
1858 {
1859         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1860         u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
1861         u64 delalloc_start;
1862         u64 delalloc_end;
1863         bool found;
1864         struct extent_state *cached_state = NULL;
1865         int ret;
1866         int loops = 0;
1867
1868 again:
1869         /* step one, find a bunch of delalloc bytes starting at start */
1870         delalloc_start = *start;
1871         delalloc_end = 0;
1872         found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1873                                           max_bytes, &cached_state);
1874         if (!found || delalloc_end <= *start) {
1875                 *start = delalloc_start;
1876                 *end = delalloc_end;
1877                 free_extent_state(cached_state);
1878                 return false;
1879         }
1880
1881         /*
1882          * start comes from the offset of locked_page.  We have to lock
1883          * pages in order, so we can't process delalloc bytes before
1884          * locked_page
1885          */
1886         if (delalloc_start < *start)
1887                 delalloc_start = *start;
1888
1889         /*
1890          * make sure to limit the number of pages we try to lock down
1891          */
1892         if (delalloc_end + 1 - delalloc_start > max_bytes)
1893                 delalloc_end = delalloc_start + max_bytes - 1;
1894
1895         /* step two, lock all the pages after the page that has start */
1896         ret = lock_delalloc_pages(inode, locked_page,
1897                                   delalloc_start, delalloc_end);
1898         ASSERT(!ret || ret == -EAGAIN);
1899         if (ret == -EAGAIN) {
1900                 /* some of the pages are gone, lets avoid looping by
1901                  * shortening the size of the delalloc range we're searching
1902                  */
1903                 free_extent_state(cached_state);
1904                 cached_state = NULL;
1905                 if (!loops) {
1906                         max_bytes = PAGE_SIZE;
1907                         loops = 1;
1908                         goto again;
1909                 } else {
1910                         found = false;
1911                         goto out_failed;
1912                 }
1913         }
1914
1915         /* step three, lock the state bits for the whole range */
1916         lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
1917
1918         /* then test to make sure it is all still delalloc */
1919         ret = test_range_bit(tree, delalloc_start, delalloc_end,
1920                              EXTENT_DELALLOC, 1, cached_state);
1921         if (!ret) {
1922                 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1923                                      &cached_state);
1924                 __unlock_for_delalloc(inode, locked_page,
1925                               delalloc_start, delalloc_end);
1926                 cond_resched();
1927                 goto again;
1928         }
1929         free_extent_state(cached_state);
1930         *start = delalloc_start;
1931         *end = delalloc_end;
1932 out_failed:
1933         return found;
1934 }
1935
1936 static int __process_pages_contig(struct address_space *mapping,
1937                                   struct page *locked_page,
1938                                   pgoff_t start_index, pgoff_t end_index,
1939                                   unsigned long page_ops, pgoff_t *index_ret)
1940 {
1941         unsigned long nr_pages = end_index - start_index + 1;
1942         unsigned long pages_processed = 0;
1943         pgoff_t index = start_index;
1944         struct page *pages[16];
1945         unsigned ret;
1946         int err = 0;
1947         int i;
1948
1949         if (page_ops & PAGE_LOCK) {
1950                 ASSERT(page_ops == PAGE_LOCK);
1951                 ASSERT(index_ret && *index_ret == start_index);
1952         }
1953
1954         if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1955                 mapping_set_error(mapping, -EIO);
1956
1957         while (nr_pages > 0) {
1958                 ret = find_get_pages_contig(mapping, index,
1959                                      min_t(unsigned long,
1960                                      nr_pages, ARRAY_SIZE(pages)), pages);
1961                 if (ret == 0) {
1962                         /*
1963                          * Only if we're going to lock these pages,
1964                          * can we find nothing at @index.
1965                          */
1966                         ASSERT(page_ops & PAGE_LOCK);
1967                         err = -EAGAIN;
1968                         goto out;
1969                 }
1970
1971                 for (i = 0; i < ret; i++) {
1972                         if (page_ops & PAGE_SET_PRIVATE2)
1973                                 SetPagePrivate2(pages[i]);
1974
1975                         if (locked_page && pages[i] == locked_page) {
1976                                 put_page(pages[i]);
1977                                 pages_processed++;
1978                                 continue;
1979                         }
1980                         if (page_ops & PAGE_START_WRITEBACK) {
1981                                 clear_page_dirty_for_io(pages[i]);
1982                                 set_page_writeback(pages[i]);
1983                         }
1984                         if (page_ops & PAGE_SET_ERROR)
1985                                 SetPageError(pages[i]);
1986                         if (page_ops & PAGE_END_WRITEBACK)
1987                                 end_page_writeback(pages[i]);
1988                         if (page_ops & PAGE_UNLOCK)
1989                                 unlock_page(pages[i]);
1990                         if (page_ops & PAGE_LOCK) {
1991                                 lock_page(pages[i]);
1992                                 if (!PageDirty(pages[i]) ||
1993                                     pages[i]->mapping != mapping) {
1994                                         unlock_page(pages[i]);
1995                                         for (; i < ret; i++)
1996                                                 put_page(pages[i]);
1997                                         err = -EAGAIN;
1998                                         goto out;
1999                                 }
2000                         }
2001                         put_page(pages[i]);
2002                         pages_processed++;
2003                 }
2004                 nr_pages -= ret;
2005                 index += ret;
2006                 cond_resched();
2007         }
2008 out:
2009         if (err && index_ret)
2010                 *index_ret = start_index + pages_processed - 1;
2011         return err;
2012 }
2013
2014 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2015                                   struct page *locked_page,
2016                                   u32 clear_bits, unsigned long page_ops)
2017 {
2018         clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
2019
2020         __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
2021                                start >> PAGE_SHIFT, end >> PAGE_SHIFT,
2022                                page_ops, NULL);
2023 }
2024
2025 /*
2026  * count the number of bytes in the tree that have a given bit(s)
2027  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
2028  * cached.  The total number found is returned.
2029  */
2030 u64 count_range_bits(struct extent_io_tree *tree,
2031                      u64 *start, u64 search_end, u64 max_bytes,
2032                      u32 bits, int contig)
2033 {
2034         struct rb_node *node;
2035         struct extent_state *state;
2036         u64 cur_start = *start;
2037         u64 total_bytes = 0;
2038         u64 last = 0;
2039         int found = 0;
2040
2041         if (WARN_ON(search_end <= cur_start))
2042                 return 0;
2043
2044         spin_lock(&tree->lock);
2045         if (cur_start == 0 && bits == EXTENT_DIRTY) {
2046                 total_bytes = tree->dirty_bytes;
2047                 goto out;
2048         }
2049         /*
2050          * this search will find all the extents that end after
2051          * our range starts.
2052          */
2053         node = tree_search(tree, cur_start);
2054         if (!node)
2055                 goto out;
2056
2057         while (1) {
2058                 state = rb_entry(node, struct extent_state, rb_node);
2059                 if (state->start > search_end)
2060                         break;
2061                 if (contig && found && state->start > last + 1)
2062                         break;
2063                 if (state->end >= cur_start && (state->state & bits) == bits) {
2064                         total_bytes += min(search_end, state->end) + 1 -
2065                                        max(cur_start, state->start);
2066                         if (total_bytes >= max_bytes)
2067                                 break;
2068                         if (!found) {
2069                                 *start = max(cur_start, state->start);
2070                                 found = 1;
2071                         }
2072                         last = state->end;
2073                 } else if (contig && found) {
2074                         break;
2075                 }
2076                 node = rb_next(node);
2077                 if (!node)
2078                         break;
2079         }
2080 out:
2081         spin_unlock(&tree->lock);
2082         return total_bytes;
2083 }
2084
2085 /*
2086  * set the private field for a given byte offset in the tree.  If there isn't
2087  * an extent_state there already, this does nothing.
2088  */
2089 int set_state_failrec(struct extent_io_tree *tree, u64 start,
2090                       struct io_failure_record *failrec)
2091 {
2092         struct rb_node *node;
2093         struct extent_state *state;
2094         int ret = 0;
2095
2096         spin_lock(&tree->lock);
2097         /*
2098          * this search will find all the extents that end after
2099          * our range starts.
2100          */
2101         node = tree_search(tree, start);
2102         if (!node) {
2103                 ret = -ENOENT;
2104                 goto out;
2105         }
2106         state = rb_entry(node, struct extent_state, rb_node);
2107         if (state->start != start) {
2108                 ret = -ENOENT;
2109                 goto out;
2110         }
2111         state->failrec = failrec;
2112 out:
2113         spin_unlock(&tree->lock);
2114         return ret;
2115 }
2116
2117 struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
2118 {
2119         struct rb_node *node;
2120         struct extent_state *state;
2121         struct io_failure_record *failrec;
2122
2123         spin_lock(&tree->lock);
2124         /*
2125          * this search will find all the extents that end after
2126          * our range starts.
2127          */
2128         node = tree_search(tree, start);
2129         if (!node) {
2130                 failrec = ERR_PTR(-ENOENT);
2131                 goto out;
2132         }
2133         state = rb_entry(node, struct extent_state, rb_node);
2134         if (state->start != start) {
2135                 failrec = ERR_PTR(-ENOENT);
2136                 goto out;
2137         }
2138
2139         failrec = state->failrec;
2140 out:
2141         spin_unlock(&tree->lock);
2142         return failrec;
2143 }
2144
2145 /*
2146  * searches a range in the state tree for a given mask.
2147  * If 'filled' == 1, this returns 1 only if every extent in the tree
2148  * has the bits set.  Otherwise, 1 is returned if any bit in the
2149  * range is found set.
2150  */
2151 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
2152                    u32 bits, int filled, struct extent_state *cached)
2153 {
2154         struct extent_state *state = NULL;
2155         struct rb_node *node;
2156         int bitset = 0;
2157
2158         spin_lock(&tree->lock);
2159         if (cached && extent_state_in_tree(cached) && cached->start <= start &&
2160             cached->end > start)
2161                 node = &cached->rb_node;
2162         else
2163                 node = tree_search(tree, start);
2164         while (node && start <= end) {
2165                 state = rb_entry(node, struct extent_state, rb_node);
2166
2167                 if (filled && state->start > start) {
2168                         bitset = 0;
2169                         break;
2170                 }
2171
2172                 if (state->start > end)
2173                         break;
2174
2175                 if (state->state & bits) {
2176                         bitset = 1;
2177                         if (!filled)
2178                                 break;
2179                 } else if (filled) {
2180                         bitset = 0;
2181                         break;
2182                 }
2183
2184                 if (state->end == (u64)-1)
2185                         break;
2186
2187                 start = state->end + 1;
2188                 if (start > end)
2189                         break;
2190                 node = rb_next(node);
2191                 if (!node) {
2192                         if (filled)
2193                                 bitset = 0;
2194                         break;
2195                 }
2196         }
2197         spin_unlock(&tree->lock);
2198         return bitset;
2199 }
2200
2201 /*
2202  * helper function to set a given page up to date if all the
2203  * extents in the tree for that page are up to date
2204  */
2205 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
2206 {
2207         u64 start = page_offset(page);
2208         u64 end = start + PAGE_SIZE - 1;
2209         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
2210                 SetPageUptodate(page);
2211 }
2212
2213 int free_io_failure(struct extent_io_tree *failure_tree,
2214                     struct extent_io_tree *io_tree,
2215                     struct io_failure_record *rec)
2216 {
2217         int ret;
2218         int err = 0;
2219
2220         set_state_failrec(failure_tree, rec->start, NULL);
2221         ret = clear_extent_bits(failure_tree, rec->start,
2222                                 rec->start + rec->len - 1,
2223                                 EXTENT_LOCKED | EXTENT_DIRTY);
2224         if (ret)
2225                 err = ret;
2226
2227         ret = clear_extent_bits(io_tree, rec->start,
2228                                 rec->start + rec->len - 1,
2229                                 EXTENT_DAMAGED);
2230         if (ret && !err)
2231                 err = ret;
2232
2233         kfree(rec);
2234         return err;
2235 }
2236
2237 /*
2238  * this bypasses the standard btrfs submit functions deliberately, as
2239  * the standard behavior is to write all copies in a raid setup. here we only
2240  * want to write the one bad copy. so we do the mapping for ourselves and issue
2241  * submit_bio directly.
2242  * to avoid any synchronization issues, wait for the data after writing, which
2243  * actually prevents the read that triggered the error from finishing.
2244  * currently, there can be no more than two copies of every data bit. thus,
2245  * exactly one rewrite is required.
2246  */
2247 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2248                       u64 length, u64 logical, struct page *page,
2249                       unsigned int pg_offset, int mirror_num)
2250 {
2251         struct bio *bio;
2252         struct btrfs_device *dev;
2253         u64 map_length = 0;
2254         u64 sector;
2255         struct btrfs_bio *bbio = NULL;
2256         int ret;
2257
2258         ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
2259         BUG_ON(!mirror_num);
2260
2261         bio = btrfs_io_bio_alloc(1);
2262         bio->bi_iter.bi_size = 0;
2263         map_length = length;
2264
2265         /*
2266          * Avoid races with device replace and make sure our bbio has devices
2267          * associated to its stripes that don't go away while we are doing the
2268          * read repair operation.
2269          */
2270         btrfs_bio_counter_inc_blocked(fs_info);
2271         if (btrfs_is_parity_mirror(fs_info, logical, length)) {
2272                 /*
2273                  * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2274                  * to update all raid stripes, but here we just want to correct
2275                  * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2276                  * stripe's dev and sector.
2277                  */
2278                 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2279                                       &map_length, &bbio, 0);
2280                 if (ret) {
2281                         btrfs_bio_counter_dec(fs_info);
2282                         bio_put(bio);
2283                         return -EIO;
2284                 }
2285                 ASSERT(bbio->mirror_num == 1);
2286         } else {
2287                 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2288                                       &map_length, &bbio, mirror_num);
2289                 if (ret) {
2290                         btrfs_bio_counter_dec(fs_info);
2291                         bio_put(bio);
2292                         return -EIO;
2293                 }
2294                 BUG_ON(mirror_num != bbio->mirror_num);
2295         }
2296
2297         sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
2298         bio->bi_iter.bi_sector = sector;
2299         dev = bbio->stripes[bbio->mirror_num - 1].dev;
2300         btrfs_put_bbio(bbio);
2301         if (!dev || !dev->bdev ||
2302             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2303                 btrfs_bio_counter_dec(fs_info);
2304                 bio_put(bio);
2305                 return -EIO;
2306         }
2307         bio_set_dev(bio, dev->bdev);
2308         bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
2309         bio_add_page(bio, page, length, pg_offset);
2310
2311         if (btrfsic_submit_bio_wait(bio)) {
2312                 /* try to remap that extent elsewhere? */
2313                 btrfs_bio_counter_dec(fs_info);
2314                 bio_put(bio);
2315                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2316                 return -EIO;
2317         }
2318
2319         btrfs_info_rl_in_rcu(fs_info,
2320                 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
2321                                   ino, start,
2322                                   rcu_str_deref(dev->name), sector);
2323         btrfs_bio_counter_dec(fs_info);
2324         bio_put(bio);
2325         return 0;
2326 }
2327
2328 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
2329 {
2330         struct btrfs_fs_info *fs_info = eb->fs_info;
2331         u64 start = eb->start;
2332         int i, num_pages = num_extent_pages(eb);
2333         int ret = 0;
2334
2335         if (sb_rdonly(fs_info->sb))
2336                 return -EROFS;
2337
2338         for (i = 0; i < num_pages; i++) {
2339                 struct page *p = eb->pages[i];
2340
2341                 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2342                                         start - page_offset(p), mirror_num);
2343                 if (ret)
2344                         break;
2345                 start += PAGE_SIZE;
2346         }
2347
2348         return ret;
2349 }
2350
2351 /*
2352  * each time an IO finishes, we do a fast check in the IO failure tree
2353  * to see if we need to process or clean up an io_failure_record
2354  */
2355 int clean_io_failure(struct btrfs_fs_info *fs_info,
2356                      struct extent_io_tree *failure_tree,
2357                      struct extent_io_tree *io_tree, u64 start,
2358                      struct page *page, u64 ino, unsigned int pg_offset)
2359 {
2360         u64 private;
2361         struct io_failure_record *failrec;
2362         struct extent_state *state;
2363         int num_copies;
2364         int ret;
2365
2366         private = 0;
2367         ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2368                                EXTENT_DIRTY, 0);
2369         if (!ret)
2370                 return 0;
2371
2372         failrec = get_state_failrec(failure_tree, start);
2373         if (IS_ERR(failrec))
2374                 return 0;
2375
2376         BUG_ON(!failrec->this_mirror);
2377
2378         if (failrec->in_validation) {
2379                 /* there was no real error, just free the record */
2380                 btrfs_debug(fs_info,
2381                         "clean_io_failure: freeing dummy error at %llu",
2382                         failrec->start);
2383                 goto out;
2384         }
2385         if (sb_rdonly(fs_info->sb))
2386                 goto out;
2387
2388         spin_lock(&io_tree->lock);
2389         state = find_first_extent_bit_state(io_tree,
2390                                             failrec->start,
2391                                             EXTENT_LOCKED);
2392         spin_unlock(&io_tree->lock);
2393
2394         if (state && state->start <= failrec->start &&
2395             state->end >= failrec->start + failrec->len - 1) {
2396                 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2397                                               failrec->len);
2398                 if (num_copies > 1)  {
2399                         repair_io_failure(fs_info, ino, start, failrec->len,
2400                                           failrec->logical, page, pg_offset,
2401                                           failrec->failed_mirror);
2402                 }
2403         }
2404
2405 out:
2406         free_io_failure(failure_tree, io_tree, failrec);
2407
2408         return 0;
2409 }
2410
2411 /*
2412  * Can be called when
2413  * - hold extent lock
2414  * - under ordered extent
2415  * - the inode is freeing
2416  */
2417 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2418 {
2419         struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2420         struct io_failure_record *failrec;
2421         struct extent_state *state, *next;
2422
2423         if (RB_EMPTY_ROOT(&failure_tree->state))
2424                 return;
2425
2426         spin_lock(&failure_tree->lock);
2427         state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2428         while (state) {
2429                 if (state->start > end)
2430                         break;
2431
2432                 ASSERT(state->end <= end);
2433
2434                 next = next_state(state);
2435
2436                 failrec = state->failrec;
2437                 free_extent_state(state);
2438                 kfree(failrec);
2439
2440                 state = next;
2441         }
2442         spin_unlock(&failure_tree->lock);
2443 }
2444
2445 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
2446                                                              u64 start, u64 end)
2447 {
2448         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2449         struct io_failure_record *failrec;
2450         struct extent_map *em;
2451         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2452         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2453         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2454         int ret;
2455         u64 logical;
2456
2457         failrec = get_state_failrec(failure_tree, start);
2458         if (!IS_ERR(failrec)) {
2459                 btrfs_debug(fs_info,
2460                         "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
2461                         failrec->logical, failrec->start, failrec->len,
2462                         failrec->in_validation);
2463                 /*
2464                  * when data can be on disk more than twice, add to failrec here
2465                  * (e.g. with a list for failed_mirror) to make
2466                  * clean_io_failure() clean all those errors at once.
2467                  */
2468
2469                 return failrec;
2470         }
2471
2472         failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2473         if (!failrec)
2474                 return ERR_PTR(-ENOMEM);
2475
2476         failrec->start = start;
2477         failrec->len = end - start + 1;
2478         failrec->this_mirror = 0;
2479         failrec->bio_flags = 0;
2480         failrec->in_validation = 0;
2481
2482         read_lock(&em_tree->lock);
2483         em = lookup_extent_mapping(em_tree, start, failrec->len);
2484         if (!em) {
2485                 read_unlock(&em_tree->lock);
2486                 kfree(failrec);
2487                 return ERR_PTR(-EIO);
2488         }
2489
2490         if (em->start > start || em->start + em->len <= start) {
2491                 free_extent_map(em);
2492                 em = NULL;
2493         }
2494         read_unlock(&em_tree->lock);
2495         if (!em) {
2496                 kfree(failrec);
2497                 return ERR_PTR(-EIO);
2498         }
2499
2500         logical = start - em->start;
2501         logical = em->block_start + logical;
2502         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2503                 logical = em->block_start;
2504                 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2505                 extent_set_compress_type(&failrec->bio_flags, em->compress_type);
2506         }
2507
2508         btrfs_debug(fs_info,
2509                     "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2510                     logical, start, failrec->len);
2511
2512         failrec->logical = logical;
2513         free_extent_map(em);
2514
2515         /* Set the bits in the private failure tree */
2516         ret = set_extent_bits(failure_tree, start, end,
2517                               EXTENT_LOCKED | EXTENT_DIRTY);
2518         if (ret >= 0) {
2519                 ret = set_state_failrec(failure_tree, start, failrec);
2520                 /* Set the bits in the inode's tree */
2521                 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
2522         } else if (ret < 0) {
2523                 kfree(failrec);
2524                 return ERR_PTR(ret);
2525         }
2526
2527         return failrec;
2528 }
2529
2530 static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
2531                                    struct io_failure_record *failrec,
2532                                    int failed_mirror)
2533 {
2534         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2535         int num_copies;
2536
2537         num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
2538         if (num_copies == 1) {
2539                 /*
2540                  * we only have a single copy of the data, so don't bother with
2541                  * all the retry and error correction code that follows. no
2542                  * matter what the error is, it is very likely to persist.
2543                  */
2544                 btrfs_debug(fs_info,
2545                         "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2546                         num_copies, failrec->this_mirror, failed_mirror);
2547                 return false;
2548         }
2549
2550         /*
2551          * there are two premises:
2552          *      a) deliver good data to the caller
2553          *      b) correct the bad sectors on disk
2554          */
2555         if (needs_validation) {
2556                 /*
2557                  * to fulfill b), we need to know the exact failing sectors, as
2558                  * we don't want to rewrite any more than the failed ones. thus,
2559                  * we need separate read requests for the failed bio
2560                  *
2561                  * if the following BUG_ON triggers, our validation request got
2562                  * merged. we need separate requests for our algorithm to work.
2563                  */
2564                 BUG_ON(failrec->in_validation);
2565                 failrec->in_validation = 1;
2566                 failrec->this_mirror = failed_mirror;
2567         } else {
2568                 /*
2569                  * we're ready to fulfill a) and b) alongside. get a good copy
2570                  * of the failed sector and if we succeed, we have setup
2571                  * everything for repair_io_failure to do the rest for us.
2572                  */
2573                 if (failrec->in_validation) {
2574                         BUG_ON(failrec->this_mirror != failed_mirror);
2575                         failrec->in_validation = 0;
2576                         failrec->this_mirror = 0;
2577                 }
2578                 failrec->failed_mirror = failed_mirror;
2579                 failrec->this_mirror++;
2580                 if (failrec->this_mirror == failed_mirror)
2581                         failrec->this_mirror++;
2582         }
2583
2584         if (failrec->this_mirror > num_copies) {
2585                 btrfs_debug(fs_info,
2586                         "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2587                         num_copies, failrec->this_mirror, failed_mirror);
2588                 return false;
2589         }
2590
2591         return true;
2592 }
2593
2594 static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
2595 {
2596         u64 len = 0;
2597         const u32 blocksize = inode->i_sb->s_blocksize;
2598
2599         /*
2600          * If bi_status is BLK_STS_OK, then this was a checksum error, not an
2601          * I/O error. In this case, we already know exactly which sector was
2602          * bad, so we don't need to validate.
2603          */
2604         if (bio->bi_status == BLK_STS_OK)
2605                 return false;
2606
2607         /*
2608          * We need to validate each sector individually if the failed I/O was
2609          * for multiple sectors.
2610          *
2611          * There are a few possible bios that can end up here:
2612          * 1. A buffered read bio, which is not cloned.
2613          * 2. A direct I/O read bio, which is cloned.
2614          * 3. A (buffered or direct) repair bio, which is not cloned.
2615          *
2616          * For cloned bios (case 2), we can get the size from
2617          * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
2618          * it from the bvecs.
2619          */
2620         if (bio_flagged(bio, BIO_CLONED)) {
2621                 if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
2622                         return true;
2623         } else {
2624                 struct bio_vec *bvec;
2625                 int i;
2626
2627                 bio_for_each_bvec_all(bvec, bio, i) {
2628                         len += bvec->bv_len;
2629                         if (len > blocksize)
2630                                 return true;
2631                 }
2632         }
2633         return false;
2634 }
2635
2636 blk_status_t btrfs_submit_read_repair(struct inode *inode,
2637                                       struct bio *failed_bio, u32 bio_offset,
2638                                       struct page *page, unsigned int pgoff,
2639                                       u64 start, u64 end, int failed_mirror,
2640                                       submit_bio_hook_t *submit_bio_hook)
2641 {
2642         struct io_failure_record *failrec;
2643         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2644         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2645         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2646         struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
2647         const int icsum = bio_offset >> fs_info->sectorsize_bits;
2648         bool need_validation;
2649         struct bio *repair_bio;
2650         struct btrfs_io_bio *repair_io_bio;
2651         blk_status_t status;
2652
2653         btrfs_debug(fs_info,
2654                    "repair read error: read error at %llu", start);
2655
2656         BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2657
2658         failrec = btrfs_get_io_failure_record(inode, start, end);
2659         if (IS_ERR(failrec))
2660                 return errno_to_blk_status(PTR_ERR(failrec));
2661
2662         need_validation = btrfs_io_needs_validation(inode, failed_bio);
2663
2664         if (!btrfs_check_repairable(inode, need_validation, failrec,
2665                                     failed_mirror)) {
2666                 free_io_failure(failure_tree, tree, failrec);
2667                 return BLK_STS_IOERR;
2668         }
2669
2670         repair_bio = btrfs_io_bio_alloc(1);
2671         repair_io_bio = btrfs_io_bio(repair_bio);
2672         repair_bio->bi_opf = REQ_OP_READ;
2673         if (need_validation)
2674                 repair_bio->bi_opf |= REQ_FAILFAST_DEV;
2675         repair_bio->bi_end_io = failed_bio->bi_end_io;
2676         repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2677         repair_bio->bi_private = failed_bio->bi_private;
2678
2679         if (failed_io_bio->csum) {
2680                 const u32 csum_size = fs_info->csum_size;
2681
2682                 repair_io_bio->csum = repair_io_bio->csum_inline;
2683                 memcpy(repair_io_bio->csum,
2684                        failed_io_bio->csum + csum_size * icsum, csum_size);
2685         }
2686
2687         bio_add_page(repair_bio, page, failrec->len, pgoff);
2688         repair_io_bio->logical = failrec->start;
2689         repair_io_bio->iter = repair_bio->bi_iter;
2690
2691         btrfs_debug(btrfs_sb(inode->i_sb),
2692 "repair read error: submitting new read to mirror %d, in_validation=%d",
2693                     failrec->this_mirror, failrec->in_validation);
2694
2695         status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
2696                                  failrec->bio_flags);
2697         if (status) {
2698                 free_io_failure(failure_tree, tree, failrec);
2699                 bio_put(repair_bio);
2700         }
2701         return status;
2702 }
2703
2704 /* lots and lots of room for performance fixes in the end_bio funcs */
2705
2706 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2707 {
2708         int uptodate = (err == 0);
2709         int ret = 0;
2710
2711         btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
2712
2713         if (!uptodate) {
2714                 ClearPageUptodate(page);
2715                 SetPageError(page);
2716                 ret = err < 0 ? err : -EIO;
2717                 mapping_set_error(page->mapping, ret);
2718         }
2719 }
2720
2721 /*
2722  * after a writepage IO is done, we need to:
2723  * clear the uptodate bits on error
2724  * clear the writeback bits in the extent tree for this IO
2725  * end_page_writeback if the page has no more pending IO
2726  *
2727  * Scheduling is not allowed, so the extent state tree is expected
2728  * to have one and only one object corresponding to this IO.
2729  */
2730 static void end_bio_extent_writepage(struct bio *bio)
2731 {
2732         int error = blk_status_to_errno(bio->bi_status);
2733         struct bio_vec *bvec;
2734         u64 start;
2735         u64 end;
2736         struct bvec_iter_all iter_all;
2737
2738         ASSERT(!bio_flagged(bio, BIO_CLONED));
2739         bio_for_each_segment_all(bvec, bio, iter_all) {
2740                 struct page *page = bvec->bv_page;
2741                 struct inode *inode = page->mapping->host;
2742                 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2743
2744                 /* We always issue full-page reads, but if some block
2745                  * in a page fails to read, blk_update_request() will
2746                  * advance bv_offset and adjust bv_len to compensate.
2747                  * Print a warning for nonzero offsets, and an error
2748                  * if they don't add up to a full page.  */
2749                 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2750                         if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
2751                                 btrfs_err(fs_info,
2752                                    "partial page write in btrfs with offset %u and length %u",
2753                                         bvec->bv_offset, bvec->bv_len);
2754                         else
2755                                 btrfs_info(fs_info,
2756                                    "incomplete page write in btrfs with offset %u and length %u",
2757                                         bvec->bv_offset, bvec->bv_len);
2758                 }
2759
2760                 start = page_offset(page);
2761                 end = start + bvec->bv_offset + bvec->bv_len - 1;
2762
2763                 end_extent_writepage(page, error, start, end);
2764                 end_page_writeback(page);
2765         }
2766
2767         bio_put(bio);
2768 }
2769
/*
 * Record of the previously processed extent range.
 *
 * It lets endio_readpage_release_extent() accumulate contiguous ranges and
 * release them as one full extent range, reducing the number of extent io
 * tree operations.
 */
struct processed_extent {
        /* Inode the recorded range belongs to; NULL while nothing is recorded */
        struct btrfs_inode *inode;
        /* Start of the range in @inode */
        u64 start;
        /* End of the range in @inode (appears to be inclusive) */
        u64 end;
        /* Uptodate status shared by the whole recorded range */
        bool uptodate;
};
2784
2785 /*
2786  * Try to release processed extent range
2787  *
2788  * May not release the extent range right now if the current range is
2789  * contiguous to processed extent.
2790  *
2791  * Will release processed extent when any of @inode, @uptodate, the range is
2792  * no longer contiguous to the processed range.
2793  *
2794  * Passing @inode == NULL will force processed extent to be released.
2795  */
2796 static void endio_readpage_release_extent(struct processed_extent *processed,
2797                               struct btrfs_inode *inode, u64 start, u64 end,
2798                               bool uptodate)
2799 {
2800         struct extent_state *cached = NULL;
2801         struct extent_io_tree *tree;
2802
2803         /* The first extent, initialize @processed */
2804         if (!processed->inode)
2805                 goto update;
2806
2807         /*
2808          * Contiguous to processed extent, just uptodate the end.
2809          *
2810          * Several things to notice:
2811          *
2812          * - bio can be merged as long as on-disk bytenr is contiguous
2813          *   This means we can have page belonging to other inodes, thus need to
2814          *   check if the inode still matches.
2815          * - bvec can contain range beyond current page for multi-page bvec
2816          *   Thus we need to do processed->end + 1 >= start check
2817          */
2818         if (processed->inode == inode && processed->uptodate == uptodate &&
2819             processed->end + 1 >= start && end >= processed->end) {
2820                 processed->end = end;
2821                 return;
2822         }
2823
2824         tree = &processed->inode->io_tree;
2825         /*
2826          * Now we don't have range contiguous to the processed range, release
2827          * the processed range now.
2828          */
2829         if (processed->uptodate && tree->track_uptodate)
2830                 set_extent_uptodate(tree, processed->start, processed->end,
2831                                     &cached, GFP_ATOMIC);
2832         unlock_extent_cached_atomic(tree, processed->start, processed->end,
2833                                     &cached);
2834
2835 update:
2836         /* Update processed to current range */
2837         processed->inode = inode;
2838         processed->start = start;
2839         processed->end = end;
2840         processed->uptodate = uptodate;
2841 }
2842
2843 static void endio_readpage_update_page_status(struct page *page, bool uptodate)
2844 {
2845         if (uptodate) {
2846                 SetPageUptodate(page);
2847         } else {
2848                 ClearPageUptodate(page);
2849                 SetPageError(page);
2850         }
2851         unlock_page(page);
2852 }
2853
2854 /*
2855  * after a readpage IO is done, we need to:
2856  * clear the uptodate bits on error
2857  * set the uptodate bits if things worked
2858  * set the page up to date if all extents in the tree are uptodate
2859  * clear the lock bit in the extent tree
2860  * unlock the page if there are no other extents locked for it
2861  *
2862  * Scheduling is not allowed, so the extent state tree is expected
2863  * to have one and only one object corresponding to this IO.
2864  */
2865 static void end_bio_extent_readpage(struct bio *bio)
2866 {
2867         struct bio_vec *bvec;
2868         int uptodate = !bio->bi_status;
2869         struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2870         struct extent_io_tree *tree, *failure_tree;
2871         struct processed_extent processed = { 0 };
2872         /*
2873          * The offset to the beginning of a bio, since one bio can never be
2874          * larger than UINT_MAX, u32 here is enough.
2875          */
2876         u32 bio_offset = 0;
2877         int mirror;
2878         int ret;
2879         struct bvec_iter_all iter_all;
2880
2881         ASSERT(!bio_flagged(bio, BIO_CLONED));
2882         bio_for_each_segment_all(bvec, bio, iter_all) {
2883                 struct page *page = bvec->bv_page;
2884                 struct inode *inode = page->mapping->host;
2885                 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2886                 const u32 sectorsize = fs_info->sectorsize;
2887                 u64 start;
2888                 u64 end;
2889                 u32 len;
2890
2891                 btrfs_debug(fs_info,
2892                         "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
2893                         bio->bi_iter.bi_sector, bio->bi_status,
2894                         io_bio->mirror_num);
2895                 tree = &BTRFS_I(inode)->io_tree;
2896                 failure_tree = &BTRFS_I(inode)->io_failure_tree;
2897
2898                 /*
2899                  * We always issue full-sector reads, but if some block in a
2900                  * page fails to read, blk_update_request() will advance
2901                  * bv_offset and adjust bv_len to compensate.  Print a warning
2902                  * for unaligned offsets, and an error if they don't add up to
2903                  * a full sector.
2904                  */
2905                 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2906                         btrfs_err(fs_info,
2907                 "partial page read in btrfs with offset %u and length %u",
2908                                   bvec->bv_offset, bvec->bv_len);
2909                 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
2910                                      sectorsize))
2911                         btrfs_info(fs_info,
2912                 "incomplete page read with offset %u and length %u",
2913                                    bvec->bv_offset, bvec->bv_len);
2914
2915                 start = page_offset(page) + bvec->bv_offset;
2916                 end = start + bvec->bv_len - 1;
2917                 len = bvec->bv_len;
2918
2919                 mirror = io_bio->mirror_num;
2920                 if (likely(uptodate)) {
2921                         if (is_data_inode(inode))
2922                                 ret = btrfs_verify_data_csum(io_bio,
2923                                                 bio_offset, page, start, end,
2924                                                 mirror);
2925                         else
2926                                 ret = btrfs_validate_metadata_buffer(io_bio,
2927                                         page, start, end, mirror);
2928                         if (ret)
2929                                 uptodate = 0;
2930                         else
2931                                 clean_io_failure(BTRFS_I(inode)->root->fs_info,
2932                                                  failure_tree, tree, start,
2933                                                  page,
2934                                                  btrfs_ino(BTRFS_I(inode)), 0);
2935                 }
2936
2937                 if (likely(uptodate))
2938                         goto readpage_ok;
2939
2940                 if (is_data_inode(inode)) {
2941
2942                         /*
2943                          * The generic bio_readpage_error handles errors the
2944                          * following way: If possible, new read requests are
2945                          * created and submitted and will end up in
2946                          * end_bio_extent_readpage as well (if we're lucky,
2947                          * not in the !uptodate case). In that case it returns
2948                          * 0 and we just go on with the next page in our bio.
2949                          * If it can't handle the error it will return -EIO and
2950                          * we remain responsible for that page.
2951                          */
2952                         if (!btrfs_submit_read_repair(inode, bio, bio_offset,
2953                                                 page,
2954                                                 start - page_offset(page),
2955                                                 start, end, mirror,
2956                                                 btrfs_submit_data_bio)) {
2957                                 uptodate = !bio->bi_status;
2958                                 ASSERT(bio_offset + len > bio_offset);
2959                                 bio_offset += len;
2960                                 continue;
2961                         }
2962                 } else {
2963                         struct extent_buffer *eb;
2964
2965                         eb = (struct extent_buffer *)page->private;
2966                         set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
2967                         eb->read_mirror = mirror;
2968                         atomic_dec(&eb->io_pages);
2969                         if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
2970                                                &eb->bflags))
2971                                 btree_readahead_hook(eb, -EIO);
2972                 }
2973 readpage_ok:
2974                 if (likely(uptodate)) {
2975                         loff_t i_size = i_size_read(inode);
2976                         pgoff_t end_index = i_size >> PAGE_SHIFT;
2977                         unsigned off;
2978
2979                         /* Zero out the end if this page straddles i_size */
2980                         off = offset_in_page(i_size);
2981                         if (page->index == end_index && off)
2982                                 zero_user_segment(page, off, PAGE_SIZE);
2983                 }
2984                 ASSERT(bio_offset + len > bio_offset);
2985                 bio_offset += len;
2986
2987                 /* Update page status and unlock */
2988                 endio_readpage_update_page_status(page, uptodate);
2989                 endio_readpage_release_extent(&processed, BTRFS_I(inode),
2990                                               start, end, uptodate);
2991         }
2992         /* Release the last extent */
2993         endio_readpage_release_extent(&processed, NULL, 0, 0, false);
2994         btrfs_io_bio_free_csum(io_bio);
2995         bio_put(bio);
2996 }
2997
2998 /*
2999  * Initialize the members up to but not including 'bio'. Use after allocating a
3000  * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
3001  * 'bio' because use of __GFP_ZERO is not supported.
3002  */
3003 static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
3004 {
3005         memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
3006 }
3007
3008 /*
3009  * The following helpers allocate a bio. As it's backed by a bioset, it'll
3010  * never fail.  We're returning a bio right now but you can call btrfs_io_bio
3011  * for the appropriate container_of magic
3012  */
3013 struct bio *btrfs_bio_alloc(u64 first_byte)
3014 {
3015         struct bio *bio;
3016
3017         bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
3018         bio->bi_iter.bi_sector = first_byte >> 9;
3019         btrfs_io_bio_init(btrfs_io_bio(bio));
3020         return bio;
3021 }
3022
3023 struct bio *btrfs_bio_clone(struct bio *bio)
3024 {
3025         struct btrfs_io_bio *btrfs_bio;
3026         struct bio *new;
3027
3028         /* Bio allocation backed by a bioset does not fail */
3029         new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
3030         btrfs_bio = btrfs_io_bio(new);
3031         btrfs_io_bio_init(btrfs_bio);
3032         btrfs_bio->iter = bio->bi_iter;
3033         return new;
3034 }
3035
3036 struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
3037 {
3038         struct bio *bio;
3039
3040         /* Bio allocation backed by a bioset does not fail */
3041         bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
3042         btrfs_io_bio_init(btrfs_io_bio(bio));
3043         return bio;
3044 }
3045
3046 struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
3047 {
3048         struct bio *bio;
3049         struct btrfs_io_bio *btrfs_bio;
3050
3051         /* this will never fail when it's backed by a bioset */
3052         bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
3053         ASSERT(bio);
3054
3055         btrfs_bio = btrfs_io_bio(bio);
3056         btrfs_io_bio_init(btrfs_bio);
3057
3058         bio_trim(bio, offset >> 9, size >> 9);
3059         btrfs_bio->iter = bio->bi_iter;
3060         return bio;
3061 }
3062
3063 /*
3064  * @opf:        bio REQ_OP_* and REQ_* flags as one value
3065  * @wbc:        optional writeback control for io accounting
3066  * @page:       page to add to the bio
3067  * @disk_bytenr: logical bytenr where the write will be
3068  * @size:       portion of page that we want to write to
3069  * @pg_offset:  offset of the new bio or to check whether we are adding
3070  *              a contiguous page to the previous one
3071  * @bio_ret:    must be valid pointer, newly allocated bio will be stored there
3072  * @end_io_func:     end_io callback for new bio
3073  * @mirror_num:      desired mirror to read/write
3074  * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
3075  * @bio_flags:  flags of the current bio to see if we can merge them
3076  */
3077 static int submit_extent_page(unsigned int opf,
3078                               struct writeback_control *wbc,
3079                               struct page *page, u64 disk_bytenr,
3080                               size_t size, unsigned long pg_offset,
3081                               struct bio **bio_ret,
3082                               bio_end_io_t end_io_func,
3083                               int mirror_num,
3084                               unsigned long prev_bio_flags,
3085                               unsigned long bio_flags,
3086                               bool force_bio_submit)
3087 {
3088         int ret = 0;
3089         struct bio *bio;
3090         size_t io_size = min_t(size_t, size, PAGE_SIZE);
3091         sector_t sector = disk_bytenr >> 9;
3092         struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
3093
3094         ASSERT(bio_ret);
3095
3096         if (*bio_ret) {
3097                 bool contig;
3098                 bool can_merge = true;
3099
3100                 bio = *bio_ret;
3101                 if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
3102                         contig = bio->bi_iter.bi_sector == sector;
3103                 else
3104                         contig = bio_end_sector(bio) == sector;
3105
3106                 if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags))
3107                         can_merge = false;
3108
3109                 if (prev_bio_flags != bio_flags || !contig || !can_merge ||
3110                     force_bio_submit ||
3111                     bio_add_page(bio, page, io_size, pg_offset) < io_size) {
3112                         ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
3113                         if (ret < 0) {
3114                                 *bio_ret = NULL;
3115                                 return ret;
3116                         }
3117                         bio = NULL;
3118                 } else {
3119                         if (wbc)
3120                                 wbc_account_cgroup_owner(wbc, page, io_size);
3121                         return 0;
3122                 }
3123         }
3124
3125         bio = btrfs_bio_alloc(disk_bytenr);
3126         bio_add_page(bio, page, io_size, pg_offset);
3127         bio->bi_end_io = end_io_func;
3128         bio->bi_private = tree;
3129         bio->bi_write_hint = page->mapping->host->i_write_hint;
3130         bio->bi_opf = opf;
3131         if (wbc) {
3132                 struct block_device *bdev;
3133
3134                 bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
3135                 bio_set_dev(bio, bdev);
3136                 wbc_init_bio(wbc, bio);
3137                 wbc_account_cgroup_owner(wbc, page, io_size);
3138         }
3139
3140         *bio_ret = bio;
3141
3142         return ret;
3143 }
3144
     /*
      * Attach private data to @page on behalf of extent buffer @eb.
      *
      * When fs_info->sectorsize equals PAGE_SIZE the extent buffer itself is
      * attached as the page private.  If the page already has private data it
      * is expected to be @eb; anything else triggers a WARN_ON().
      *
      * For any other sector size the page private is a struct btrfs_subpage:
      * @prealloc is attached when the caller supplied one, otherwise
      * btrfs_attach_subpage() is asked to attach one.  If the page already has
      * private data, @prealloc is released with btrfs_free_subpage() and the
      * page is left as is.
      *
      * Return 0, or the return value of btrfs_attach_subpage() when that helper
      * had to be called.
      */
3145 static int attach_extent_buffer_page(struct extent_buffer *eb,
3146                                      struct page *page,
3147                                      struct btrfs_subpage *prealloc)
3148 {
3149         struct btrfs_fs_info *fs_info = eb->fs_info;
3150         int ret = 0;
3151
3152         /*
3153          * If the page is mapped to btree inode, we should hold the private
3154          * lock to prevent race.
3155          * For cloned or dummy extent buffers, their pages are not mapped and
3156          * will not race with any other ebs.
3157          */
3158         if (page->mapping)
3159                 lockdep_assert_held(&page->mapping->private_lock);
3160
             /* Sector size == page size: the page private is the eb itself */
3161         if (fs_info->sectorsize == PAGE_SIZE) {
3162                 if (!PagePrivate(page))
3163                         attach_page_private(page, eb);
3164                 else
3165                         WARN_ON(page->private != (unsigned long)eb);
3166                 return 0;
3167         }
3168
3169         /* Already mapped, just free prealloc */
3170         if (PagePrivate(page)) {
3171                 btrfs_free_subpage(prealloc);
3172                 return 0;
3173         }
3174
3175         if (prealloc)
3176                 /* Has preallocated memory for subpage */
3177                 attach_page_private(page, prealloc);
3178         else
3179                 /* Do new allocation to attach subpage */
3180                 ret = btrfs_attach_subpage(fs_info, page,
3181                                            BTRFS_SUBPAGE_METADATA);
3182         return ret;
3183 }
3184
     /*
      * Attach EXTENT_PAGE_PRIVATE as the private data of @page, unless the page
      * already has private data (PagePrivate() is set), in which case nothing
      * is done.
      */
3185 void set_page_extent_mapped(struct page *page)
3186 {
3187         if (!PagePrivate(page))
3188                 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3189 }
3190
     /*
      * Get the extent map covering @start, using *@em_cached as a one-entry
      * cache when @em_cached is not NULL.
      *
      * If the cached map is still in the tree and contains @start, an extra
      * reference is taken on it and it is returned without a lookup.  Otherwise
      * the cached reference is dropped, *@em_cached is cleared and the map is
      * fetched with btrfs_get_extent(); a usable result is stored back into
      * *@em_cached with a reference of its own.
      *
      * Returns the cached map or whatever btrfs_get_extent() returned.  The
      * IS_ERR_OR_NULL() test below shows that this may be an error pointer or
      * NULL, so callers have to check before dereferencing.
      */
3191 static struct extent_map *
3192 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3193                  u64 start, u64 len, struct extent_map **em_cached)
3194 {
3195         struct extent_map *em;
3196
3197         if (em_cached && *em_cached) {
3198                 em = *em_cached;
                     /* Cache hit: hand out an additional reference */
3199                 if (extent_map_in_tree(em) && start >= em->start &&
3200                     start < extent_map_end(em)) {
3201                         refcount_inc(&em->refs);
3202                         return em;
3203                 }
3204
                     /* Stale or non-matching entry: drop the cache's reference */
3205                 free_extent_map(em);
3206                 *em_cached = NULL;
3207         }
3208
3209         em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
3210         if (em_cached && !IS_ERR_OR_NULL(em)) {
                     /* The cache slot was emptied above (or was empty on entry) */
3211                 BUG_ON(*em_cached);
3212                 refcount_inc(&em->refs);
3213                 *em_cached = em;
3214         }
3215         return em;
3216 }
3217 /*
3218  * basic readpage implementation.  Locked extent state structs are inserted
3219  * into the tree that are removed when the IO is done (by the end_io
3220  * handlers)
3221  * XXX JDM: This needs looking at to ensure proper page locking
      *
      * @page:          the page to read
      * @em_cached:     optional one-entry extent map cache, handed to
      *                 __get_extent_map()
      * @bio:           bio under construction, handed to submit_extent_page()
      * @bio_flags:     flags of the bio in *@bio; updated to the flags of each
      *                 range that is queued successfully
      * @read_flags:    extra flags ORed into REQ_OP_READ
      * @prev_em_start: optional; remembers the start of the extent map used for
      *                 the previous range so that a switch to a different
      *                 compressed extent map forces a new bio ((u64)-1 is
      *                 treated as "no previous map")
      *
      * The page is walked one range at a time, each range ending at the end of
      * the current extent map or of the page.  Ranges at or beyond i_size and
      * holes (which includes preallocated extents) are zeroed, marked uptodate
      * in the io tree and unlocked here.  Ranges already marked EXTENT_UPTODATE
      * are unlocked after check_page_uptodate().  Everything else is queued
      * with submit_extent_page().
      *
      * Every range finished here is unlocked in the io tree, so the page's
      * range is expected to be locked there on entry.
      *
      * If no range was queued, the page is unlocked before returning (and set
      * uptodate unless PageError is set).
      *
3222  * return 0 on success, otherwise return error
      * Only a submit_extent_page() failure is returned as an error.  A failed
      * extent map lookup, or an inline extent that was not marked uptodate, is
      * reported through SetPageError() while the function still returns 0.
3223  */
3224 int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
3225                       struct bio **bio, unsigned long *bio_flags,
3226                       unsigned int read_flags, u64 *prev_em_start)
3227 {
3228         struct inode *inode = page->mapping->host;
3229         u64 start = page_offset(page);
3230         const u64 end = start + PAGE_SIZE - 1;
3231         u64 cur = start;
3232         u64 extent_offset;
3233         u64 last_byte = i_size_read(inode);
3234         u64 block_start;
3235         u64 cur_end;
3236         struct extent_map *em;
3237         int ret = 0;
3238         int nr = 0;
3239         size_t pg_offset = 0;
3240         size_t iosize;
3241         size_t blocksize = inode->i_sb->s_blocksize;
3242         unsigned long this_bio_flag = 0;
3243         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
3244
3245         set_page_extent_mapped(page);
3246
             /*
              * cleancache_get_page() returning 0 is taken to mean the page
              * content is already available: unlock the whole range and go
              * straight to the uptodate/unlock handling at "out".
              */
3247         if (!PageUptodate(page)) {
3248                 if (cleancache_get_page(page) == 0) {
3249                         BUG_ON(blocksize != PAGE_SIZE);
3250                         unlock_extent(tree, start, end);
3251                         goto out;
3252                 }
3253         }
3254
             /* The page containing i_size: zero everything after the last byte */
3255         if (page->index == last_byte >> PAGE_SHIFT) {
3256                 char *userpage;
3257                 size_t zero_offset = offset_in_page(last_byte);
3258
3259                 if (zero_offset) {
3260                         iosize = PAGE_SIZE - zero_offset;
3261                         userpage = kmap_atomic(page);
3262                         memset(userpage + zero_offset, 0, iosize);
3263                         flush_dcache_page(page);
3264                         kunmap_atomic(userpage);
3265                 }
3266         }
3267         while (cur <= end) {
3268                 bool force_bio_submit = false;
3269                 u64 disk_bytenr;
3270
                     /* At or past i_size: zero the rest of the page and stop */
3271                 if (cur >= last_byte) {
3272                         char *userpage;
3273                         struct extent_state *cached = NULL;
3274
3275                         iosize = PAGE_SIZE - pg_offset;
3276                         userpage = kmap_atomic(page);
3277                         memset(userpage + pg_offset, 0, iosize);
3278                         flush_dcache_page(page);
3279                         kunmap_atomic(userpage);
3280                         set_extent_uptodate(tree, cur, cur + iosize - 1,
3281                                             &cached, GFP_NOFS);
3282                         unlock_extent_cached(tree, cur,
3283                                              cur + iosize - 1, &cached);
3284                         break;
3285                 }
3286                 em = __get_extent_map(inode, page, pg_offset, cur,
3287                                       end - cur + 1, em_cached);
                     /* Lookup failed: flag the page, unlock the remainder, give up */
3288                 if (IS_ERR_OR_NULL(em)) {
3289                         SetPageError(page);
3290                         unlock_extent(tree, cur, end);
3291                         break;
3292                 }
3293                 extent_offset = cur - em->start;
3294                 BUG_ON(extent_map_end(em) <= cur);
3295                 BUG_ON(end < cur);
3296
3297                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3298                         this_bio_flag |= EXTENT_BIO_COMPRESSED;
3299                         extent_set_compress_type(&this_bio_flag,
3300                                                  em->compress_type);
3301                 }
3302
                     /* The range ends at the extent map end or the page end */
3303                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3304                 cur_end = min(extent_map_end(em) - 1, end);
3305                 iosize = ALIGN(iosize, blocksize);
                     /* Compressed extents are read from the start of the extent */
3306                 if (this_bio_flag & EXTENT_BIO_COMPRESSED)
3307                         disk_bytenr = em->block_start;
3308                 else
3309                         disk_bytenr = em->block_start + extent_offset;
3310                 block_start = em->block_start;
                     /* Preallocated extents are handled like holes below */
3311                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3312                         block_start = EXTENT_MAP_HOLE;
3313
3314                 /*
3315                  * If we have a file range that points to a compressed extent
3316                  * and it's followed by a consecutive file range that points
3317                  * to the same compressed extent (possibly with a different
3318                  * offset and/or length, so it either points to the whole extent
3319                  * or only part of it), we must make sure we do not submit a
3320                  * single bio to populate the pages for the 2 ranges because
3321                  * this makes the compressed extent read zero out the pages
3322                  * belonging to the 2nd range. Imagine the following scenario:
3323                  *
3324                  *  File layout
3325                  *  [0 - 8K]                     [8K - 24K]
3326                  *    |                               |
3327                  *    |                               |
3328                  * points to extent X,         points to extent X,
3329                  * offset 4K, length of 8K     offset 0, length 16K
3330                  *
3331                  * [extent X, compressed length = 4K uncompressed length = 16K]
3332                  *
3333                  * If the bio to read the compressed extent covers both ranges,
3334                  * it will decompress extent X into the pages belonging to the
3335                  * first range and then it will stop, zeroing out the remaining
3336                  * pages that belong to the other range that points to extent X.
3337                  * So here we make sure we submit 2 bios, one for the first
3338                  * range and another one for the second range. Both will target
3339                  * the same physical extent from disk, but we can't currently
3340                  * make the compressed bio endio callback populate the pages
3341                  * for both ranges because each compressed bio is tightly
3342                  * coupled with a single extent map, and each range can have
3343                  * an extent map with a different offset value relative to the
3344                  * uncompressed data of our extent and different lengths. This
3345                  * is a corner case so we prioritize correctness over
3346                  * non-optimal behavior (submitting 2 bios for the same extent).
3347                  */
3348                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3349                     prev_em_start && *prev_em_start != (u64)-1 &&
3350                     *prev_em_start != em->start)
3351                         force_bio_submit = true;
3352
3353                 if (prev_em_start)
3354                         *prev_em_start = em->start;
3355
                     /* Everything needed from em was copied out above */
3356                 free_extent_map(em);
3357                 em = NULL;
3358
3359                 /* we've found a hole, just zero and go on */
3360                 if (block_start == EXTENT_MAP_HOLE) {
3361                         char *userpage;
3362                         struct extent_state *cached = NULL;
3363
3364                         userpage = kmap_atomic(page);
3365                         memset(userpage + pg_offset, 0, iosize);
3366                         flush_dcache_page(page);
3367                         kunmap_atomic(userpage);
3368
3369                         set_extent_uptodate(tree, cur, cur + iosize - 1,
3370                                             &cached, GFP_NOFS);
3371                         unlock_extent_cached(tree, cur,
3372                                              cur + iosize - 1, &cached);
3373                         cur = cur + iosize;
3374                         pg_offset += iosize;
3375                         continue;
3376                 }
3377                 /* the get_extent function already copied into the page */
3378                 if (test_range_bit(tree, cur, cur_end,
3379                                    EXTENT_UPTODATE, 1, NULL)) {
3380                         check_page_uptodate(tree, page);
3381                         unlock_extent(tree, cur, cur + iosize - 1);
3382                         cur = cur + iosize;
3383                         pg_offset += iosize;
3384                         continue;
3385                 }
3386                 /* we have an inline extent but it didn't get marked up
3387                  * to date.  Error out
3388                  */
3389                 if (block_start == EXTENT_MAP_INLINE) {
3390                         SetPageError(page);
3391                         unlock_extent(tree, cur, cur + iosize - 1);
3392                         cur = cur + iosize;
3393                         pg_offset += iosize;
3394                         continue;
3395                 }
3396
3397                 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
3398                                          page, disk_bytenr, iosize,
3399                                          pg_offset, bio,
3400                                          end_bio_extent_readpage, 0,
3401                                          *bio_flags,
3402                                          this_bio_flag,
3403                                          force_bio_submit);
3404                 if (!ret) {
                             /* Queued: count it and remember the flags used */
3405                         nr++;
3406                         *bio_flags = this_bio_flag;
3407                 } else {
3408                         SetPageError(page);
3409                         unlock_extent(tree, cur, cur + iosize - 1);
3410                         goto out;
3411                 }
3412                 cur = cur + iosize;
3413                 pg_offset += iosize;
3414         }
3415 out:
             /* No range was queued for I/O, so the page is finished here */
3416         if (!nr) {
3417                 if (!PageError(page))
3418                         SetPageUptodate(page);
3419                 unlock_page(page);
3420         }
3421         return ret;
3422 }
3423
     /*
      * Read the @nr_pages pages in @pages, which are expected to cover the file
      * range [@start, @end] of one inode.  The inode is taken from pages[0], so
      * @nr_pages must be at least 1.
      *
      * btrfs_lock_and_flush_ordered_range() is called once for the whole range,
      * then every page goes through btrfs_do_readpage() with REQ_RAHEAD and has
      * one page reference dropped with put_page().  The return value of
      * btrfs_do_readpage() is not checked.
      *
      * @em_cached, @bio, @bio_flags and @prev_em_start are passed through to
      * btrfs_do_readpage() unchanged.
      */
3424 static inline void contiguous_readpages(struct page *pages[], int nr_pages,
3425                                              u64 start, u64 end,
3426                                              struct extent_map **em_cached,
3427                                              struct bio **bio,
3428                                              unsigned long *bio_flags,
3429                                              u64 *prev_em_start)
3430 {
3431         struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3432         int index;
3433
3434         btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3435
3436         for (index = 0; index < nr_pages; index++) {
3437                 btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
3438                                   REQ_RAHEAD, prev_em_start);
3439                 put_page(pages[index]);
3440         }
3441 }
3442
     /* Subtract @nr_written from wbc->nr_to_write. */
3443 static void update_nr_written(struct writeback_control *wbc,
3444                               unsigned long nr_written)
3445 {
3446         wbc->nr_to_write -= nr_written;
3447 }
3448
3449 /*
3450  * helper for __extent_writepage, doing all of the delayed allocation setup.
3451  *
      * @inode:          inode that @page belongs to
      * @page:           the locked page being written
      * @wbc:            writeback control; nr_to_write may be adjusted here
      * @delalloc_start: file offset where the search starts; the caller passes
      *                  the offset of @page
      * @nr_written:     passed to btrfs_run_delalloc_range(); its value is
      *                  subtracted from wbc->nr_to_write when 1 is returned
      *
      * Delalloc ranges are looked up and locked with find_lock_delalloc_range()
      * and handed to btrfs_run_delalloc_range() until the end of the page has
      * been covered.
      *
3452  * This returns 1 if btrfs_run_delalloc_range function did all the work required
3453  * to write the page (copy into inline extent).  In this case the IO has
3454  * been started and the page is already unlocked.
3455  *
3456  * This returns 0 if all went well (page still locked)
3457  * This returns < 0 if there were errors (page still locked)
3458  */
3459 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
3460                 struct page *page, struct writeback_control *wbc,
3461                 u64 delalloc_start, unsigned long *nr_written)
3462 {
3463         u64 page_end = delalloc_start + PAGE_SIZE - 1;
3464         bool found;
3465         u64 delalloc_to_write = 0;
3466         u64 delalloc_end = 0;
3467         int ret;
3468         int page_started = 0;
3469
3470
3471         while (delalloc_end < page_end) {
3472                 found = find_lock_delalloc_range(&inode->vfs_inode, page,
3473                                                &delalloc_start,
3474                                                &delalloc_end);
                     /* Nothing here: resume the search right after delalloc_end */
3475                 if (!found) {
3476                         delalloc_start = delalloc_end + 1;
3477                         continue;
3478                 }
3479                 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3480                                 delalloc_end, &page_started, nr_written, wbc);
3481                 if (ret) {
3482                         SetPageError(page);
3483                         /*
3484                          * btrfs_run_delalloc_range should return < 0 for error
3485                          * but just in case, we use > 0 here meaning the IO is
3486                          * started, so we don't want to return > 0 unless
3487                          * things are going well.
3488                          */
3489                         return ret < 0 ? ret : -EIO;
3490                 }
3491                 /*
3492                  * delalloc_end is already one less than the total length, so
3493                  * we don't subtract one from PAGE_SIZE
3494                  */
3495                 delalloc_to_write += (delalloc_end - delalloc_start +
3496                                       PAGE_SIZE) >> PAGE_SHIFT;
3497                 delalloc_start = delalloc_end + 1;
3498         }
             /*
              * The delalloc ranges span more pages than the caller asked to
              * write: raise nr_to_write to delalloc_to_write while that is
              * below 2 * 8192 pages, and to 8192 pages otherwise.
              */
3499         if (wbc->nr_to_write < delalloc_to_write) {
3500                 int thresh = 8192;
3501
3502                 if (delalloc_to_write < thresh * 2)
3503                         thresh = delalloc_to_write;
3504                 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3505                                          thresh);
3506         }
3507
3508         /* did the fill delalloc function already unlock and start
3509          * the IO?
3510          */
3511         if (page_started) {
3512                 /*
3513                  * we've unlocked the page, so we can't update
3514                  * the mapping's writeback index, just update
3515                  * nr_to_write.
3516                  */
3517                 wbc->nr_to_write -= *nr_written;
3518                 return 1;
3519         }
3520
3521         return 0;
3522 }
3523
3524 /*
3525  * helper for __extent_writepage.  This calls the writepage start hooks,
3526  * and does the loop to map the page into extents and bios.
3527  *
      * @inode:      inode that @page belongs to
      * @page:       the locked page being written
      * @wbc:        writeback control
      * @epd:        write context; ranges are added to epd->bio
      * @i_size:     inode size sampled by the caller; ranges at or beyond it
      *              are finished without submitting I/O
      * @nr_written: count passed on to update_nr_written(); one more is added
      *              for this page unless the COW fixup path is taken
      * @nr_ret:     set to the number of ranges that were either compressed or
      *              handed to submit_extent_page() (failed submits included);
      *              not written when 1 is returned
      *
3528  * We return 1 if the IO is started and the page is unlocked,
3529  * 0 if all went well (page still locked)
3530  * < 0 if there were errors (page still locked)
      *
      * In this function the return value 1 comes from a non-zero
      * btrfs_writepage_cow_fixup(): the page is redirtied and unlocked and
      * nothing is submitted from here.
3531  */
3532 static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
3533                                  struct page *page,
3534                                  struct writeback_control *wbc,
3535                                  struct extent_page_data *epd,
3536                                  loff_t i_size,
3537                                  unsigned long nr_written,
3538                                  int *nr_ret)
3539 {
3540         struct btrfs_fs_info *fs_info = inode->root->fs_info;
3541         struct extent_io_tree *tree = &inode->io_tree;
3542         u64 start = page_offset(page);
3543         u64 end = start + PAGE_SIZE - 1;
3544         u64 cur = start;
3545         u64 extent_offset;
3546         u64 block_start;
3547         struct extent_map *em;
3548         int ret = 0;
3549         int nr = 0;
3550         const unsigned int write_flags = wbc_to_write_flags(wbc);
3551         bool compressed;
3552
3553         ret = btrfs_writepage_cow_fixup(page, start, end);
3554         if (ret) {
3555                 /* Fixup worker will requeue */
3556                 redirty_page_for_writepage(wbc, page);
3557                 update_nr_written(wbc, nr_written);
3558                 unlock_page(page);
3559                 return 1;
3560         }
3561
3562         /*
3563          * we don't want to touch the inode after unlocking the page,
3564          * so we update the mapping writeback index now
3565          */
3566         update_nr_written(wbc, nr_written + 1);
3567
3568         while (cur <= end) {
3569                 u64 disk_bytenr;
3570                 u64 em_end;
3571                 u32 iosize;
3572
                     /* At or past i_size: finish the rest of the page and stop */
3573                 if (cur >= i_size) {
3574                         btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
3575                         break;
3576                 }
3577                 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
3578                 if (IS_ERR_OR_NULL(em)) {
3579                         SetPageError(page);
                             /* A NULL em yields ret == 0; PageError carries the failure */
3580                         ret = PTR_ERR_OR_ZERO(em);
3581                         break;
3582                 }
3583
3584                 extent_offset = cur - em->start;
3585                 em_end = extent_map_end(em);
3586                 ASSERT(cur <= em_end);
3587                 ASSERT(cur < end);
3588                 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
3589                 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
3590                 block_start = em->block_start;
3591                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3592                 disk_bytenr = em->block_start + extent_offset;
3593
3594                 /* Note that em_end from extent_map_end() is exclusive */
3595                 iosize = min(em_end, end + 1) - cur;
3596                 free_extent_map(em);
3597                 em = NULL;
3598
3599                 /*
3600                  * compressed and inline extents are written through other
3601                  * paths in the FS
3602                  */
3603                 if (compressed || block_start == EXTENT_MAP_HOLE ||
3604                     block_start == EXTENT_MAP_INLINE) {
3605                         if (compressed)
3606                                 nr++;
3607                         else
3608                                 btrfs_writepage_endio_finish_ordered(page, cur,
3609                                                         cur + iosize - 1, 1);
3610                         cur += iosize;
3611                         continue;
3612                 }
3613
3614                 btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
                     /* The page is expected to be under writeback now; log if not */
3615                 if (!PageWriteback(page)) {
3616                         btrfs_err(inode->root->fs_info,
3617                                    "page %lu not writeback, cur %llu end %llu",
3618                                page->index, cur, end);
3619                 }
3620
3621                 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
3622                                          page, disk_bytenr, iosize,
3623                                          cur - page_offset(page), &epd->bio,
3624                                          end_bio_extent_writepage,
3625                                          0, 0, 0, false);
                     /*
                      * A failed submit does not stop the loop.  ret may be
                      * overwritten by a later range, but the failure stays
                      * recorded in PageError.
                      */
3626                 if (ret) {
3627                         SetPageError(page);
3628                         if (PageWriteback(page))
3629                                 end_page_writeback(page);
3630                 }
3631
3632                 cur += iosize;
3633                 nr++;
3634         }
3635         *nr_ret = nr;
3636         return ret;
3637 }
3638
3639 /*
3640  * the writepage semantics are similar to regular writepage.  extent
3641  * records are inserted to lock ranges in the tree, and as dirty areas
3642  * are found, they are marked writeback.  Then the lock bits are removed
3643  * and the end_io handler clears the writeback ranges
3644  *
      * @page: locked page to write (a WARN_ON() fires if it is not locked)
      * @wbc:  writeback control
      * @epd:  write context; when epd->extent_locked is set the delalloc step
      *        (writepage_delalloc()) is skipped
      *
      * A page that lies entirely beyond i_size is invalidated instead of being
      * written.  The page is unlocked by this function on every path except
      * when a helper returns 1; in that case the helper is documented to have
      * unlocked it already and 0 is returned at once.
      *
3645  * Return 0 if everything goes well.
3646  * Return <0 for error.
3647  */
3648 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3649                               struct extent_page_data *epd)
3650 {
3651         struct inode *inode = page->mapping->host;
3652         u64 start = page_offset(page);
3653         u64 page_end = start + PAGE_SIZE - 1;
3654         int ret;
3655         int nr = 0;
3656         size_t pg_offset;
3657         loff_t i_size = i_size_read(inode);
3658         unsigned long end_index = i_size >> PAGE_SHIFT;
3659         unsigned long nr_written = 0;
3660
3661         trace___extent_writepage(page, inode, wbc);
3662
3663         WARN_ON(!PageLocked(page));
3664
3665         ClearPageError(page);
3666
3667         pg_offset = offset_in_page(i_size);
             /* No byte of this page is inside i_size: invalidate, do not write */
3668         if (page->index > end_index ||
3669            (page->index == end_index && !pg_offset)) {
3670                 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
3671                 unlock_page(page);
3672                 return 0;
3673         }
3674
             /* Page straddles i_size: zero the part after the last file byte */
3675         if (page->index == end_index) {
3676                 char *userpage;
3677
3678                 userpage = kmap_atomic(page);
3679                 memset(userpage + pg_offset, 0,
3680                        PAGE_SIZE - pg_offset);
3681                 kunmap_atomic(userpage);
3682                 flush_dcache_page(page);
3683         }
3684
3685         set_page_extent_mapped(page);
3686
3687         if (!epd->extent_locked) {
3688                 ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
3689                                          &nr_written);
                     /* 1: the delalloc code took over the page completely */
3690                 if (ret == 1)
3691                         return 0;
3692                 if (ret)
3693                         goto done;
3694         }
3695
3696         ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
3697                                     nr_written, &nr);
             /* 1: the page was redirtied and unlocked by the helper */
3698         if (ret == 1)
3699                 return 0;
3700
3701 done:
3702         if (nr == 0) {
3703                 /* make sure the mapping tag for page dirty gets cleared */
3704                 set_page_writeback(page);
3705                 end_page_writeback(page);
3706         }
             /* PageError set along the way means failure; default to -EIO */
3707         if (PageError(page)) {
3708                 ret = ret < 0 ? ret : -EIO;
3709                 end_extent_writepage(page, ret, start, page_end);
3710         }
3711         unlock_page(page);
3712         ASSERT(ret <= 0);
3713         return ret;
3714 }
3715
     /*
      * Wait, in TASK_UNINTERRUPTIBLE state, on the EXTENT_BUFFER_WRITEBACK bit
      * of eb->bflags.  The matching wake-up is issued by
      * end_extent_buffer_writeback() after it has cleared the bit.
      */
3716 void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3717 {
3718         wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3719                        TASK_UNINTERRUPTIBLE);
3720 }
3721
     /*
      * Clear EXTENT_BUFFER_WRITEBACK in eb->bflags and wake up anyone waiting on
      * that bit, e.g. in wait_on_extent_buffer_writeback().
      */
3722 static void end_extent_buffer_writeback(struct extent_buffer *eb)
3723 {
3724         clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
             /* Order the bit clear before wake_up_bit() looks for waiters */
3725         smp_mb__after_atomic();
3726         wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3727 }
3728
3729 /*
3730  * Lock extent buffer status and pages for writeback.
3731  *
3732  * May try to flush write bio if we can't get the lock.
3733  *
      * @eb:  extent buffer to prepare for writeback
      * @epd: write context; its pending bio is flushed with flush_write_bio()
      *       (at most once per call) before this function blocks on a lock, and
      *       epd->sync_io decides whether to wait for writeback already in
      *       progress
      *
3734  * Return  0 if the extent buffer doesn't need to be submitted.
3735  *           (E.g. the extent buffer is not dirty)
      *           Also returned when the buffer is already under writeback and
      *           epd->sync_io is not set.
3736  * Return >0 if the extent buffer needs to be submitted: it was dirty, is now
      *           flagged EXTENT_BUFFER_WRITEBACK and all of its pages are
      *           locked.
3737  * Return <0 if something went wrong, no page is locked.
3738  */
3739 static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
3740                           struct extent_page_data *epd)
3741 {
3742         struct btrfs_fs_info *fs_info = eb->fs_info;
3743         int i, num_pages, failed_page_nr;
3744         int flush = 0;
3745         int ret = 0;
3746
             /* Flush our pending bio before blocking on the tree lock */
3747         if (!btrfs_try_tree_write_lock(eb)) {
3748                 ret = flush_write_bio(epd);
3749                 if (ret < 0)
3750                         return ret;
3751                 flush = 1;
3752                 btrfs_tree_lock(eb);
3753         }
3754
             /* Already under writeback: skip it, or wait when doing sync I/O */
3755         if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3756                 btrfs_tree_unlock(eb);
3757                 if (!epd->sync_io)
3758                         return 0;
3759                 if (!flush) {
3760                         ret = flush_write_bio(epd);
3761                         if (ret < 0)
3762                                 return ret;
3763                         flush = 1;
3764                 }
                     /*
                      * Wait without the tree lock held, then re-check under
                      * the lock: the bit may have been set again in between.
                      */
3765                 while (1) {
3766                         wait_on_extent_buffer_writeback(eb);
3767                         btrfs_tree_lock(eb);
3768                         if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3769                                 break;
3770                         btrfs_tree_unlock(eb);
3771                 }
3772         }
3773
3774         /*
3775          * We need to do this to prevent races in people who check if the eb is
3776          * under IO since we can end up having no IO bits set for a short period
3777          * of time.
3778          */
3779         spin_lock(&eb->refs_lock);
3780         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3781                 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3782                 spin_unlock(&eb->refs_lock);
3783                 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
                     /* The buffer no longer counts as dirty metadata */
3784                 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3785                                          -eb->len,
3786                                          fs_info->dirty_metadata_batch);
3787                 ret = 1;
3788         } else {
3789                 spin_unlock(&eb->refs_lock);
3790         }
3791
3792         btrfs_tree_unlock(eb);
3793
             /* The buffer was not dirty, there is nothing to write */
3794         if (!ret)
3795                 return ret;
3796
             /*
              * Lock every page of the buffer.  Before sleeping on a page lock
              * for the first time, flush our own pending bio (unless that was
              * already done above).
              */
3797         num_pages = num_extent_pages(eb);
3798         for (i = 0; i < num_pages; i++) {
3799                 struct page *p = eb->pages[i];
3800
3801                 if (!trylock_page(p)) {
3802                         if (!flush) {
3803                                 int err;
3804
3805                                 err = flush_write_bio(epd);
3806                                 if (err < 0) {
3807                                         ret = err;
                                             /* Pages [0, i) are the ones locked so far */
3808                                         failed_page_nr = i;
3809                                         goto err_unlock;
3810                                 }
3811                                 flush = 1;
3812                         }
3813                         lock_page(p);
3814                 }
3815         }
3816
3817         return ret;
3818 err_unlock:
3819         /* Unlock already locked pages */
3820         for (i = 0; i < failed_page_nr; i++)
3821                 unlock_page(eb->pages[i]);
3822         /*
3823          * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
3824          * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
3825          * be made and undo everything done before.
3826          */
3827         btrfs_tree_lock(eb);
3828         spin_lock(&eb->refs_lock);
3829         set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3830         end_extent_buffer_writeback(eb);
3831         spin_unlock(&eb->refs_lock);
3832         percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
3833                                  fs_info->dirty_metadata_batch);
3834         btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3835         btrfs_tree_unlock(eb);
3836         return ret;
3837 }
3838
3839 static void set_btree_ioerr(struct page *page)
3840 {
3841         struct extent_buffer *eb = (struct extent_buffer *)page->private;
3842         struct btrfs_fs_info *fs_info;
3843
3844         SetPageError(page);
3845         if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3846                 return;
3847
3848         /*
3849          * If we error out, we should add back the dirty_metadata_bytes
3850          * to make it consistent.
3851          */
3852         fs_info = eb->fs_info;
3853         percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3854                                  eb->len, fs_info->dirty_metadata_batch);
3855
3856         /*
3857          * If writeback for a btree extent that doesn't belong to a log tree
3858          * failed, increment the counter transaction->eb_write_errors.
3859          * We do this because while the transaction is running and before it's
3860          * committing (when we call filemap_fdata[write|wait]_range against
3861          * the btree inode), we might have
3862          * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3863          * returns an error or an error happens during writeback, when we're
3864          * committing the transaction we wouldn't know about it, since the pages
3865          * can be no longer dirty nor marked anymore for writeback (if a
3866          * subsequent modification to the extent buffer didn't happen before the
3867          * transaction commit), which makes filemap_fdata[write|wait]_range not
3868          * able to find the pages tagged with SetPageError at transaction
3869          * commit time. So if this happens we must abort the transaction,
3870          * otherwise we commit a super block with btree roots that point to
3871          * btree nodes/leafs whose content on disk is invalid - either garbage
3872          * or the content of some node/leaf from a past generation that got
3873          * cowed or deleted and is no longer valid.
3874          *
3875          * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3876          * not be enough - we need to distinguish between log tree extents vs
3877          * non-log tree extents, and the next filemap_fdatawait_range() call
3878          * will catch and clear such errors in the mapping - and that call might
3879          * be from a log sync and not from a transaction commit. Also, checking
3880          * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3881          * not done and would not be reliable - the eb might have been released
3882          * from memory and reading it back again means that flag would not be
3883          * set (since it's a runtime flag, not persisted on disk).
3884          *
3885          * Using the flags below in the btree inode also makes us achieve the
3886          * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
3887          * writeback for all dirty pages and before filemap_fdatawait_range()
3888          * is called, the writeback for all dirty pages had already finished
3889          * with errors - because we were not using AS_EIO/AS_ENOSPC,
3890          * filemap_fdatawait_range() would return success, as it could not know
3891          * that writeback errors happened (the pages were no longer tagged for
3892          * writeback).
3893          */
3894         switch (eb->log_index) {
3895         case -1:
3896                 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
3897                 break;
3898         case 0:
3899                 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
3900                 break;
3901         case 1:
3902                 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
3903                 break;
3904         default:
3905                 BUG(); /* unexpected, logic error */
3906         }
3907 }
3908
3909 static void end_bio_extent_buffer_writepage(struct bio *bio)
3910 {
3911         struct bio_vec *bvec;
3912         struct extent_buffer *eb;
3913         int done;
3914         struct bvec_iter_all iter_all;
3915
3916         ASSERT(!bio_flagged(bio, BIO_CLONED));
3917         bio_for_each_segment_all(bvec, bio, iter_all) {
3918                 struct page *page = bvec->bv_page;
3919
3920                 eb = (struct extent_buffer *)page->private;
3921                 BUG_ON(!eb);
3922                 done = atomic_dec_and_test(&eb->io_pages);
3923
3924                 if (bio->bi_status ||
3925                     test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
3926                         ClearPageUptodate(page);
3927                         set_btree_ioerr(page);
3928                 }
3929
3930                 end_page_writeback(page);
3931
3932                 if (!done)
3933                         continue;
3934
3935                 end_extent_buffer_writeback(eb);
3936         }
3937
3938         bio_put(bio);
3939 }
3940
3941 static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3942                         struct writeback_control *wbc,
3943                         struct extent_page_data *epd)
3944 {
3945         u64 disk_bytenr = eb->start;
3946         u32 nritems;
3947         int i, num_pages;
3948         unsigned long start, end;
3949         unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
3950         int ret = 0;
3951
3952         clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3953         num_pages = num_extent_pages(eb);
3954         atomic_set(&eb->io_pages, num_pages);
3955
3956         /* set btree blocks beyond nritems with 0 to avoid stale content. */
3957         nritems = btrfs_header_nritems(eb);
3958         if (btrfs_header_level(eb) > 0) {
3959                 end = btrfs_node_key_ptr_offset(nritems);
3960
3961                 memzero_extent_buffer(eb, end, eb->len - end);
3962         } else {
3963                 /*
3964                  * leaf:
3965                  * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
3966                  */
3967                 start = btrfs_item_nr_offset(nritems);
3968                 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
3969                 memzero_extent_buffer(eb, start, end - start);
3970         }
3971
3972         for (i = 0; i < num_pages; i++) {
3973                 struct page *p = eb->pages[i];
3974
3975                 clear_page_dirty_for_io(p);
3976                 set_page_writeback(p);
3977                 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
3978                                          p, disk_bytenr, PAGE_SIZE, 0,
3979                                          &epd->bio,
3980                                          end_bio_extent_buffer_writepage,
3981                                          0, 0, 0, false);
3982                 if (ret) {
3983                         set_btree_ioerr(p);
3984                         if (PageWriteback(p))
3985                                 end_page_writeback(p);
3986                         if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3987                                 end_extent_buffer_writeback(eb);
3988                         ret = -EIO;
3989                         break;
3990                 }
3991                 disk_bytenr += PAGE_SIZE;
3992                 update_nr_written(wbc, 1);
3993                 unlock_page(p);
3994         }
3995
3996         if (unlikely(ret)) {
3997                 for (; i < num_pages; i++) {
3998                         struct page *p = eb->pages[i];
3999                         clear_page_dirty_for_io(p);
4000                         unlock_page(p);
4001                 }
4002         }
4003
4004         return ret;
4005 }
4006
4007 /*
4008  * Submit all page(s) of one extent buffer.
4009  *
4010  * @page:       the page of one extent buffer
4011  * @eb_context: to determine if we need to submit this page, if current page
4012  *              belongs to this eb, we don't need to submit
4013  *
4014  * The caller should pass each page in their bytenr order, and here we use
4015  * @eb_context to determine if we have submitted pages of one extent buffer.
4016  *
4017  * If we have, we just skip until we hit a new page that doesn't belong to
4018  * current @eb_context.
4019  *
4020  * If not, we submit all the page(s) of the extent buffer.
4021  *
4022  * Return >0 if we have submitted the extent buffer successfully.
4023  * Return 0 if we don't need to submit the page, as it's already submitted by
4024  * previous call.
4025  * Return <0 for fatal error.
4026  */
4027 static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4028                           struct extent_page_data *epd,
4029                           struct extent_buffer **eb_context)
4030 {
4031         struct address_space *mapping = page->mapping;
4032         struct extent_buffer *eb;
4033         int ret;
4034
4035         if (!PagePrivate(page))
4036                 return 0;
4037
4038         spin_lock(&mapping->private_lock);
4039         if (!PagePrivate(page)) {
4040                 spin_unlock(&mapping->private_lock);
4041                 return 0;
4042         }
4043
4044         eb = (struct extent_buffer *)page->private;
4045
4046         /*
4047          * Shouldn't happen and normally this would be a BUG_ON but no point
4048          * crashing the machine for something we can survive anyway.
4049          */
4050         if (WARN_ON(!eb)) {
4051                 spin_unlock(&mapping->private_lock);
4052                 return 0;
4053         }
4054
4055         if (eb == *eb_context) {
4056                 spin_unlock(&mapping->private_lock);
4057                 return 0;
4058         }
4059         ret = atomic_inc_not_zero(&eb->refs);
4060         spin_unlock(&mapping->private_lock);
4061         if (!ret)
4062                 return 0;
4063
4064         *eb_context = eb;
4065
4066         ret = lock_extent_buffer_for_io(eb, epd);
4067         if (ret <= 0) {
4068                 free_extent_buffer(eb);
4069                 return ret;
4070         }
4071         ret = write_one_eb(eb, wbc, epd);
4072         free_extent_buffer(eb);
4073         if (ret < 0)
4074                 return ret;
4075         return 1;
4076 }
4077
4078 int btree_write_cache_pages(struct address_space *mapping,
4079                                    struct writeback_control *wbc)
4080 {
4081         struct extent_buffer *eb_context = NULL;
4082         struct extent_page_data epd = {
4083                 .bio = NULL,
4084                 .extent_locked = 0,
4085                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4086         };
4087         struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
4088         int ret = 0;
4089         int done = 0;
4090         int nr_to_write_done = 0;
4091         struct pagevec pvec;
4092         int nr_pages;
4093         pgoff_t index;
4094         pgoff_t end;            /* Inclusive */
4095         int scanned = 0;
4096         xa_mark_t tag;
4097
4098         pagevec_init(&pvec);
4099         if (wbc->range_cyclic) {
4100                 index = mapping->writeback_index; /* Start from prev offset */
4101                 end = -1;
4102                 /*
4103                  * Start from the beginning does not need to cycle over the
4104                  * range, mark it as scanned.
4105                  */
4106                 scanned = (index == 0);
4107         } else {
4108                 index = wbc->range_start >> PAGE_SHIFT;
4109                 end = wbc->range_end >> PAGE_SHIFT;
4110                 scanned = 1;
4111         }
4112         if (wbc->sync_mode == WB_SYNC_ALL)
4113                 tag = PAGECACHE_TAG_TOWRITE;
4114         else
4115                 tag = PAGECACHE_TAG_DIRTY;
4116 retry:
4117         if (wbc->sync_mode == WB_SYNC_ALL)
4118                 tag_pages_for_writeback(mapping, index, end);
4119         while (!done && !nr_to_write_done && (index <= end) &&
4120                (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
4121                         tag))) {
4122                 unsigned i;
4123
4124                 for (i = 0; i < nr_pages; i++) {
4125                         struct page *page = pvec.pages[i];
4126
4127                         ret = submit_eb_page(page, wbc, &epd, &eb_context);
4128                         if (ret == 0)
4129                                 continue;
4130                         if (ret < 0) {
4131                                 done = 1;
4132                                 break;
4133                         }
4134
4135                         /*
4136                          * the filesystem may choose to bump up nr_to_write.
4137                          * We have to make sure to honor the new nr_to_write
4138                          * at any time
4139                          */
4140                         nr_to_write_done = wbc->nr_to_write <= 0;
4141                 }
4142                 pagevec_release(&pvec);
4143                 cond_resched();
4144         }
4145         if (!scanned && !done) {
4146                 /*
4147                  * We hit the last page and there is more work to be done: wrap
4148                  * back to the start of the file
4149                  */
4150                 scanned = 1;
4151                 index = 0;
4152                 goto retry;
4153         }
4154         if (ret < 0) {
4155                 end_write_bio(&epd, ret);
4156                 return ret;
4157         }
4158         /*
4159          * If something went wrong, don't allow any metadata write bio to be
4160          * submitted.
4161          *
4162          * This would prevent use-after-free if we had dirty pages not
4163          * cleaned up, which can still happen by fuzzed images.
4164          *
4165          * - Bad extent tree
4166          *   Allowing existing tree block to be allocated for other trees.
4167          *
4168          * - Log tree operations
4169          *   Exiting tree blocks get allocated to log tree, bumps its
4170          *   generation, then get cleaned in tree re-balance.
4171          *   Such tree block will not be written back, since it's clean,
4172          *   thus no WRITTEN flag set.
4173          *   And after log writes back, this tree block is not traced by
4174          *   any dirty extent_io_tree.
4175          *
4176          * - Offending tree block gets re-dirtied from its original owner
4177          *   Since it has bumped generation, no WRITTEN flag, it can be
4178          *   reused without COWing. This tree block will not be traced
4179          *   by btrfs_transaction::dirty_pages.
4180          *
4181          *   Now such dirty tree block will not be cleaned by any dirty
4182          *   extent io tree. Thus we don't want to submit such wild eb
4183          *   if the fs already has error.
4184          */
4185         if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
4186                 ret = flush_write_bio(&epd);
4187         } else {
4188                 ret = -EROFS;
4189                 end_write_bio(&epd, ret);
4190         }
4191         return ret;
4192 }
4193
4194 /**
4195  * Walk the list of dirty pages of the given address space and write all of them.
4196  *
4197  * @mapping: address space structure to write
4198  * @wbc:     subtract the number of written pages from *@wbc->nr_to_write
4199  * @epd:     holds context for the write, namely the bio
4200  *
4201  * If a page is already under I/O, write_cache_pages() skips it, even
4202  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
4203  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
4204  * and msync() need to guarantee that all the data which was dirty at the time
4205  * the call was made get new I/O started against them.  If wbc->sync_mode is
4206  * WB_SYNC_ALL then we were called for data integrity and we must wait for
4207  * existing IO to complete.
4208  */
4209 static int extent_write_cache_pages(struct address_space *mapping,
4210                              struct writeback_control *wbc,
4211                              struct extent_page_data *epd)
4212 {
4213         struct inode *inode = mapping->host;
4214         int ret = 0;
4215         int done = 0;
4216         int nr_to_write_done = 0;
4217         struct pagevec pvec;
4218         int nr_pages;
4219         pgoff_t index;
4220         pgoff_t end;            /* Inclusive */
4221         pgoff_t done_index;
4222         int range_whole = 0;
4223         int scanned = 0;
4224         xa_mark_t tag;
4225
4226         /*
4227          * We have to hold onto the inode so that ordered extents can do their
4228          * work when the IO finishes.  The alternative to this is failing to add
4229          * an ordered extent if the igrab() fails there and that is a huge pain
4230          * to deal with, so instead just hold onto the inode throughout the
4231          * writepages operation.  If it fails here we are freeing up the inode
4232          * anyway and we'd rather not waste our time writing out stuff that is
4233          * going to be truncated anyway.
4234          */
4235         if (!igrab(inode))
4236                 return 0;
4237
4238         pagevec_init(&pvec);
4239         if (wbc->range_cyclic) {
4240                 index = mapping->writeback_index; /* Start from prev offset */
4241                 end = -1;
4242                 /*
4243                  * Start from the beginning does not need to cycle over the
4244                  * range, mark it as scanned.
4245                  */
4246                 scanned = (index == 0);
4247         } else {
4248                 index = wbc->range_start >> PAGE_SHIFT;
4249                 end = wbc->range_end >> PAGE_SHIFT;
4250                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4251                         range_whole = 1;
4252                 scanned = 1;
4253         }
4254
4255         /*
4256          * We do the tagged writepage as long as the snapshot flush bit is set
4257          * and we are the first one who do the filemap_flush() on this inode.
4258          *
4259          * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4260          * not race in and drop the bit.
4261          */
4262         if (range_whole && wbc->nr_to_write == LONG_MAX &&
4263             test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4264                                &BTRFS_I(inode)->runtime_flags))
4265                 wbc->tagged_writepages = 1;
4266
4267         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
4268                 tag = PAGECACHE_TAG_TOWRITE;
4269         else
4270                 tag = PAGECACHE_TAG_DIRTY;
4271 retry:
4272         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
4273                 tag_pages_for_writeback(mapping, index, end);
4274         done_index = index;
4275         while (!done && !nr_to_write_done && (index <= end) &&
4276                         (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4277                                                 &index, end, tag))) {
4278                 unsigned i;
4279
4280                 for (i = 0; i < nr_pages; i++) {
4281                         struct page *page = pvec.pages[i];
4282
4283                         done_index = page->index + 1;
4284                         /*
4285                          * At this point we hold neither the i_pages lock nor
4286                          * the page lock: the page may be truncated or
4287                          * invalidated (changing page->mapping to NULL),
4288                          * or even swizzled back from swapper_space to
4289                          * tmpfs file mapping
4290                          */
4291                         if (!trylock_page(page)) {
4292                                 ret = flush_write_bio(epd);
4293                                 BUG_ON(ret < 0);
4294                                 lock_page(page);
4295                         }
4296
4297                         if (unlikely(page->mapping != mapping)) {
4298                                 unlock_page(page);
4299                                 continue;
4300                         }
4301
4302                         if (wbc->sync_mode != WB_SYNC_NONE) {
4303                                 if (PageWriteback(page)) {
4304                                         ret = flush_write_bio(epd);
4305                                         BUG_ON(ret < 0);
4306                                 }
4307                                 wait_on_page_writeback(page);
4308                         }
4309
4310                         if (PageWriteback(page) ||
4311                             !clear_page_dirty_for_io(page)) {
4312                                 unlock_page(page);
4313                                 continue;
4314                         }
4315
4316                         ret = __extent_writepage(page, wbc, epd);
4317                         if (ret < 0) {
4318                                 done = 1;
4319                                 break;
4320                         }
4321
4322                         /*
4323                          * the filesystem may choose to bump up nr_to_write.
4324                          * We have to make sure to honor the new nr_to_write
4325                          * at any time
4326                          */
4327                         nr_to_write_done = wbc->nr_to_write <= 0;
4328                 }
4329                 pagevec_release(&pvec);
4330                 cond_resched();
4331         }
4332         if (!scanned && !done) {
4333                 /*
4334                  * We hit the last page and there is more work to be done: wrap
4335                  * back to the start of the file
4336                  */
4337                 scanned = 1;
4338                 index = 0;
4339
4340                 /*
4341                  * If we're looping we could run into a page that is locked by a
4342                  * writer and that writer could be waiting on writeback for a
4343                  * page in our current bio, and thus deadlock, so flush the
4344                  * write bio here.
4345                  */
4346                 ret = flush_write_bio(epd);
4347                 if (!ret)
4348                         goto retry;
4349         }
4350
4351         if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4352                 mapping->writeback_index = done_index;
4353
4354         btrfs_add_delayed_iput(inode);
4355         return ret;
4356 }
4357
4358 int extent_write_full_page(struct page *page, struct writeback_control *wbc)
4359 {
4360         int ret;
4361         struct extent_page_data epd = {
4362                 .bio = NULL,
4363                 .extent_locked = 0,
4364                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4365         };
4366
4367         ret = __extent_writepage(page, wbc, &epd);
4368         ASSERT(ret <= 0);
4369         if (ret < 0) {
4370                 end_write_bio(&epd, ret);
4371                 return ret;
4372         }
4373
4374         ret = flush_write_bio(&epd);
4375         ASSERT(ret <= 0);
4376         return ret;
4377 }
4378
4379 int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
4380                               int mode)
4381 {
4382         int ret = 0;
4383         struct address_space *mapping = inode->i_mapping;
4384         struct page *page;
4385         unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4386                 PAGE_SHIFT;
4387
4388         struct extent_page_data epd = {
4389                 .bio = NULL,
4390                 .extent_locked = 1,
4391                 .sync_io = mode == WB_SYNC_ALL,
4392         };
4393         struct writeback_control wbc_writepages = {
4394                 .sync_mode      = mode,
4395                 .nr_to_write    = nr_pages * 2,
4396                 .range_start    = start,
4397                 .range_end      = end + 1,
4398                 /* We're called from an async helper function */
4399                 .punt_to_cgroup = 1,
4400                 .no_cgroup_owner = 1,
4401         };
4402
4403         wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
4404         while (start <= end) {
4405                 page = find_get_page(mapping, start >> PAGE_SHIFT);
4406                 if (clear_page_dirty_for_io(page))
4407                         ret = __extent_writepage(page, &wbc_writepages, &epd);
4408                 else {
4409                         btrfs_writepage_endio_finish_ordered(page, start,
4410                                                     start + PAGE_SIZE - 1, 1);
4411                         unlock_page(page);
4412                 }
4413                 put_page(page);
4414                 start += PAGE_SIZE;
4415         }
4416
4417         ASSERT(ret <= 0);
4418         if (ret == 0)
4419                 ret = flush_write_bio(&epd);
4420         else
4421                 end_write_bio(&epd, ret);
4422
4423         wbc_detach_inode(&wbc_writepages);
4424         return ret;
4425 }
4426
4427 int extent_writepages(struct address_space *mapping,
4428                       struct writeback_control *wbc)
4429 {
4430         int ret = 0;
4431         struct extent_page_data epd = {
4432                 .bio = NULL,
4433                 .extent_locked = 0,
4434                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4435         };
4436
4437         ret = extent_write_cache_pages(mapping, wbc, &epd);
4438         ASSERT(ret <= 0);
4439         if (ret < 0) {
4440                 end_write_bio(&epd, ret);
4441                 return ret;
4442         }
4443         ret = flush_write_bio(&epd);
4444         return ret;
4445 }
4446
4447 void extent_readahead(struct readahead_control *rac)
4448 {
4449         struct bio *bio = NULL;
4450         unsigned long bio_flags = 0;
4451         struct page *pagepool[16];
4452         struct extent_map *em_cached = NULL;
4453         u64 prev_em_start = (u64)-1;
4454         int nr;
4455
4456         while ((nr = readahead_page_batch(rac, pagepool))) {
4457                 u64 contig_start = page_offset(pagepool[0]);
4458                 u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
4459
4460                 ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
4461
4462                 contiguous_readpages(pagepool, nr, contig_start, contig_end,
4463                                 &em_cached, &bio, &bio_flags, &prev_em_start);
4464         }
4465
4466         if (em_cached)
4467                 free_extent_map(em_cached);
4468
4469         if (bio) {
4470                 if (submit_one_bio(bio, 0, bio_flags))
4471                         return;
4472         }
4473 }
4474
4475 /*
4476  * basic invalidatepage code, this waits on any locked or writeback
4477  * ranges corresponding to the page, and then deletes any extent state
4478  * records from the tree
4479  */
4480 int extent_invalidatepage(struct extent_io_tree *tree,
4481                           struct page *page, unsigned long offset)
4482 {
4483         struct extent_state *cached_state = NULL;
4484         u64 start = page_offset(page);
4485         u64 end = start + PAGE_SIZE - 1;
4486         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4487
4488         /* This function is only called for the btree inode */
4489         ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
4490
4491         start += ALIGN(offset, blocksize);
4492         if (start > end)
4493                 return 0;
4494
4495         lock_extent_bits(tree, start, end, &cached_state);
4496         wait_on_page_writeback(page);
4497
4498         /*
4499          * Currently for btree io tree, only EXTENT_LOCKED is utilized,
4500          * so here we only need to unlock the extent range to free any
4501          * existing extent state.
4502          */
4503         unlock_extent_cached(tree, start, end, &cached_state);
4504         return 0;
4505 }
4506
4507 /*
4508  * a helper for releasepage, this tests for areas of the page that
4509  * are locked or under IO and drops the related state bits if it is safe
4510  * to drop the page.
4511  */
4512 static int try_release_extent_state(struct extent_io_tree *tree,
4513                                     struct page *page, gfp_t mask)
4514 {
4515         u64 start = page_offset(page);
4516         u64 end = start + PAGE_SIZE - 1;
4517         int ret = 1;
4518
4519         if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
4520                 ret = 0;
4521         } else {
4522                 /*
4523                  * At this point we can safely clear everything except the
4524                  * locked bit, the nodatasum bit and the delalloc new bit.
4525                  * The delalloc new bit will be cleared by ordered extent
4526                  * completion.
4527                  */
4528                 ret = __clear_extent_bit(tree, start, end,
4529                          ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
4530                          0, 0, NULL, mask, NULL);
4531
4532                 /* if clear_extent_bit failed for enomem reasons,
4533                  * we can't allow the release to continue.
4534                  */
4535                 if (ret < 0)
4536                         ret = 0;
4537                 else
4538                         ret = 1;
4539         }
4540         return ret;
4541 }
4542
4543 /*
4544  * a helper for releasepage.  As long as there are no locked extents
4545  * in the range corresponding to the page, both state records and extent
4546  * map records are removed
4547  */
4548 int try_release_extent_mapping(struct page *page, gfp_t mask)
4549 {
4550         struct extent_map *em;
4551         u64 start = page_offset(page);
4552         u64 end = start + PAGE_SIZE - 1;
4553         struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
4554         struct extent_io_tree *tree = &btrfs_inode->io_tree;
4555         struct extent_map_tree *map = &btrfs_inode->extent_tree;
4556
4557         if (gfpflags_allow_blocking(mask) &&
4558             page->mapping->host->i_size > SZ_16M) {
4559                 u64 len;
4560                 while (start <= end) {
4561                         struct btrfs_fs_info *fs_info;
4562                         u64 cur_gen;
4563
4564                         len = end - start + 1;
4565                         write_lock(&map->lock);
4566                         em = lookup_extent_mapping(map, start, len);
4567                         if (!em) {
4568                                 write_unlock(&map->lock);
4569                                 break;
4570                         }
4571                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4572                             em->start != start) {
4573                                 write_unlock(&map->lock);
4574                                 free_extent_map(em);
4575                                 break;
4576                         }
4577                         if (test_range_bit(tree, em->start,
4578                                            extent_map_end(em) - 1,
4579                                            EXTENT_LOCKED, 0, NULL))
4580                                 goto next;
4581                         /*
4582                          * If it's not in the list of modified extents, used
4583                          * by a fast fsync, we can remove it. If it's being
4584                          * logged we can safely remove it since fsync took an
4585                          * extra reference on the em.
4586                          */
4587                         if (list_empty(&em->list) ||
4588                             test_bit(EXTENT_FLAG_LOGGING, &em->flags))
4589                                 goto remove_em;
4590                         /*
4591                          * If it's in the list of modified extents, remove it
4592                          * only if its generation is older then the current one,
4593                          * in which case we don't need it for a fast fsync.
4594                          * Otherwise don't remove it, we could be racing with an
4595                          * ongoing fast fsync that could miss the new extent.
4596                          */
4597                         fs_info = btrfs_inode->root->fs_info;
4598                         spin_lock(&fs_info->trans_lock);
4599                         cur_gen = fs_info->generation;
4600                         spin_unlock(&fs_info->trans_lock);
4601                         if (em->generation >= cur_gen)
4602                                 goto next;
4603 remove_em:
4604                         /*
4605                          * We only remove extent maps that are not in the list of
4606                          * modified extents or that are in the list but with a
4607                          * generation lower then the current generation, so there
4608                          * is no need to set the full fsync flag on the inode (it
4609                          * hurts the fsync performance for workloads with a data
4610                          * size that exceeds or is close to the system's memory).
4611                          */
4612                         remove_extent_mapping(map, em);
4613                         /* once for the rb tree */
4614                         free_extent_map(em);
4615 next:
4616                         start = extent_map_end(em);
4617                         write_unlock(&map->lock);
4618
4619                         /* once for us */
4620                         free_extent_map(em);
4621
4622                         cond_resched(); /* Allow large-extent preemption. */
4623                 }
4624         }
4625         return try_release_extent_state(tree, page, mask);
4626 }
4627
4628 /*
4629  * helper function for fiemap, which doesn't want to see any holes.
4630  * This maps until we find something past 'last'
4631  */
4632 static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
4633                                                 u64 offset, u64 last)
4634 {
4635         u64 sectorsize = btrfs_inode_sectorsize(inode);
4636         struct extent_map *em;
4637         u64 len;
4638
4639         if (offset >= last)
4640                 return NULL;
4641
4642         while (1) {
4643                 len = last - offset;
4644                 if (len == 0)
4645                         break;
4646                 len = ALIGN(len, sectorsize);
4647                 em = btrfs_get_extent_fiemap(inode, offset, len);
4648                 if (IS_ERR_OR_NULL(em))
4649                         return em;
4650
4651                 /* if this isn't a hole return it */
4652                 if (em->block_start != EXTENT_MAP_HOLE)
4653                         return em;
4654
4655                 /* this is a hole, advance to the next extent */
4656                 offset = extent_map_end(em);
4657                 free_extent_map(em);
4658                 if (offset >= last)
4659                         break;
4660         }
4661         return NULL;
4662 }
4663
/*
 * Cache of the previous fiemap extent.
 *
 * emit_fiemap_extent() holds the most recent extent back in this structure so
 * that a following extent can be merged into it before anything is handed to
 * fiemap_fill_next_extent().
 */
struct fiemap_cache {
        /* Logical start of the cached extent. */
        u64 offset;
        /* Physical start of the cached extent. */
        u64 phys;
        /* Length of the cached extent; grows when extents are merged. */
        u64 len;
        /* FIEMAP_EXTENT_* flags accumulated for the cached extent. */
        u32 flags;
        /* True while the fields above describe an extent not yet emitted. */
        bool cached;
};
4676
4677 /*
4678  * Helper to submit fiemap extent.
4679  *
4680  * Will try to merge current fiemap extent specified by @offset, @phys,
4681  * @len and @flags with cached one.
4682  * And only when we fails to merge, cached one will be submitted as
4683  * fiemap extent.
4684  *
4685  * Return value is the same as fiemap_fill_next_extent().
4686  */
4687 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
4688                                 struct fiemap_cache *cache,
4689                                 u64 offset, u64 phys, u64 len, u32 flags)
4690 {
4691         int ret = 0;
4692
4693         if (!cache->cached)
4694                 goto assign;
4695
4696         /*
4697          * Sanity check, extent_fiemap() should have ensured that new
4698          * fiemap extent won't overlap with cached one.
4699          * Not recoverable.
4700          *
4701          * NOTE: Physical address can overlap, due to compression
4702          */
4703         if (cache->offset + cache->len > offset) {
4704                 WARN_ON(1);
4705                 return -EINVAL;
4706         }
4707
4708         /*
4709          * Only merges fiemap extents if
4710          * 1) Their logical addresses are continuous
4711          *
4712          * 2) Their physical addresses are continuous
4713          *    So truly compressed (physical size smaller than logical size)
4714          *    extents won't get merged with each other
4715          *
4716          * 3) Share same flags except FIEMAP_EXTENT_LAST
4717          *    So regular extent won't get merged with prealloc extent
4718          */
4719         if (cache->offset + cache->len  == offset &&
4720             cache->phys + cache->len == phys  &&
4721             (cache->flags & ~FIEMAP_EXTENT_LAST) ==
4722                         (flags & ~FIEMAP_EXTENT_LAST)) {
4723                 cache->len += len;
4724                 cache->flags |= flags;
4725                 goto try_submit_last;
4726         }
4727
4728         /* Not mergeable, need to submit cached one */
4729         ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4730                                       cache->len, cache->flags);
4731         cache->cached = false;
4732         if (ret)
4733                 return ret;
4734 assign:
4735         cache->cached = true;
4736         cache->offset = offset;
4737         cache->phys = phys;
4738         cache->len = len;
4739         cache->flags = flags;
4740 try_submit_last:
4741         if (cache->flags & FIEMAP_EXTENT_LAST) {
4742                 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
4743                                 cache->phys, cache->len, cache->flags);
4744                 cache->cached = false;
4745         }
4746         return ret;
4747 }
4748
4749 /*
4750  * Emit last fiemap cache
4751  *
4752  * The last fiemap cache may still be cached in the following case:
4753  * 0                  4k                    8k
4754  * |<- Fiemap range ->|
4755  * |<------------  First extent ----------->|
4756  *
4757  * In this case, the first extent range will be cached but not emitted.
4758  * So we must emit it before ending extent_fiemap().
4759  */
4760 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
4761                                   struct fiemap_cache *cache)
4762 {
4763         int ret;
4764
4765         if (!cache->cached)
4766                 return 0;
4767
4768         ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4769                                       cache->len, cache->flags);
4770         cache->cached = false;
4771         if (ret > 0)
4772                 ret = 0;
4773         return ret;
4774 }
4775
/*
 * Report the extents of @inode that intersect [@start, @start + @len) to
 * @fieinfo.
 *
 * The range, rounded out to the inode's sector size, is locked in the inode's
 * io_tree for the duration of the walk.  Extent maps are fetched with
 * get_extent_skip_holes(), so holes are never reported, and every extent goes
 * through emit_fiemap_extent() so that adjacent extents can be merged before
 * they reach fiemap_fill_next_extent().
 *
 * Returns 0 on success, -EINVAL if @len is 0, -ENOMEM if the path or one of
 * the ulists cannot be allocated, or a negative errno propagated from the
 * helpers called below.
 */
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
                  u64 start, u64 len)
{
        int ret = 0;
        u64 off = start;
        u64 max = start + len;
        u32 flags = 0;
        u32 found_type;
        u64 last;
        u64 last_for_get_extent = 0;
        u64 disko = 0;
        u64 isize = i_size_read(&inode->vfs_inode);
        struct btrfs_key found_key;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        struct btrfs_path *path;
        struct btrfs_root *root = inode->root;
        struct fiemap_cache cache = { 0 };
        struct ulist *roots;
        struct ulist *tmp_ulist;
        int end = 0;
        u64 em_start = 0;
        u64 em_len = 0;
        u64 em_end = 0;

        if (len == 0)
                return -EINVAL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Scratch lists handed to btrfs_check_shared() in the loop below. */
        roots = ulist_alloc(GFP_KERNEL);
        tmp_ulist = ulist_alloc(GFP_KERNEL);
        if (!roots || !tmp_ulist) {
                ret = -ENOMEM;
                goto out_free_ulist;
        }

        /*
         * Round the range out to sector boundaries.  'off' and 'max' keep the
         * caller's unrounded values and are what the loop below compares
         * against.
         */
        start = round_down(start, btrfs_inode_sectorsize(inode));
        len = round_up(max, btrfs_inode_sectorsize(inode)) - start;

        /*
         * lookup the last file extent.  We're not using i_size here
         * because there might be preallocation past i_size
         */
        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
                                       0);
        if (ret < 0) {
                goto out_free_ulist;
        } else {
                WARN_ON(!ret);
                if (ret == 1)
                        ret = 0;
        }

        /*
         * Step back one slot and read the key found there; the checks below
         * decide whether it is a file extent item of this inode.
         */
        path->slots[0]--;
        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
        found_type = found_key.type;

        /* No extents, but there might be delalloc bits */
        if (found_key.objectid != btrfs_ino(inode) ||
            found_type != BTRFS_EXTENT_DATA_KEY) {
                /* have to trust i_size as the end */
                last = (u64)-1;
                last_for_get_extent = isize;
        } else {
                /*
                 * remember the start of the last extent.  There are a
                 * bunch of different factors that go into the length of the
                 * extent, so its much less complex to remember where it started
                 */
                last = found_key.offset;
                last_for_get_extent = last + 1;
        }
        btrfs_release_path(path);

        /*
         * we might have some extents allocated but more delalloc past those
         * extents.  so, we trust isize unless the start of the last extent is
         * beyond isize
         */
        if (last < isize) {
                last = (u64)-1;
                last_for_get_extent = isize;
        }

        lock_extent_bits(&inode->io_tree, start, start + len - 1,
                         &cached_state);

        em = get_extent_skip_holes(inode, start, last_for_get_extent);
        if (!em)
                goto out;
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out;
        }

        while (!end) {
                u64 offset_in_extent = 0;

                /* break if the extent we found is outside the range */
                if (em->start >= max || extent_map_end(em) < off)
                        break;

                /*
                 * get_extent may return an extent that starts before our
                 * requested range.  We have to make sure the ranges
                 * we return to fiemap always move forward and don't
                 * overlap, so adjust the offsets here
                 */
                em_start = max(em->start, off);

                /*
                 * record the offset from the start of the extent
                 * for adjusting the disk offset below.  Only do this if the
                 * extent isn't compressed since our in ram offset may be past
                 * what we have actually allocated on disk.
                 */
                if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        offset_in_extent = em_start - em->start;
                em_end = extent_map_end(em);
                em_len = em_end - em_start;
                flags = 0;
                if (em->block_start < EXTENT_MAP_LAST_BYTE)
                        disko = em->block_start + offset_in_extent;
                else
                        disko = 0;

                /*
                 * bump off for our next call to get_extent
                 */
                off = extent_map_end(em);
                if (off >= max)
                        end = 1;

                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
                        end = 1;
                        flags |= FIEMAP_EXTENT_LAST;
                } else if (em->block_start == EXTENT_MAP_INLINE) {
                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
                                  FIEMAP_EXTENT_NOT_ALIGNED);
                } else if (em->block_start == EXTENT_MAP_DELALLOC) {
                        flags |= (FIEMAP_EXTENT_DELALLOC |
                                  FIEMAP_EXTENT_UNKNOWN);
                } else if (fieinfo->fi_extents_max) {
                        u64 bytenr = em->block_start -
                                (em->start - em->orig_start);

                        /*
                         * As btrfs supports shared space, this information
                         * can be exported to userspace tools via
                         * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
                         * then we're just getting a count and we can skip the
                         * lookup stuff.
                         */
                        ret = btrfs_check_shared(root, btrfs_ino(inode),
                                                 bytenr, roots, tmp_ulist);
                        if (ret < 0)
                                goto out_free;
                        if (ret)
                                flags |= FIEMAP_EXTENT_SHARED;
                        /* A positive answer is not an error for our caller. */
                        ret = 0;
                }
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        flags |= FIEMAP_EXTENT_ENCODED;
                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        flags |= FIEMAP_EXTENT_UNWRITTEN;

                /*
                 * Everything needed from this extent map has been copied into
                 * locals, drop our reference before looking up the next one.
                 */
                free_extent_map(em);
                em = NULL;
                if ((em_start >= last) || em_len == (u64)-1 ||
                   (last == (u64)-1 && isize <= em_end)) {
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }

                /* now scan forward to see if this is really the last extent. */
                em = get_extent_skip_holes(inode, off, last_for_get_extent);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        /*
                         * em is an error pointer here, so jump past the
                         * free_extent_map() under out_free.
                         */
                        goto out;
                }
                if (!em) {
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }
                ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
                                           em_len, flags);
                if (ret) {
                        /* 1 ends the walk but is not reported as an error. */
                        if (ret == 1)
                                ret = 0;
                        goto out_free;
                }
        }
out_free:
        /* Flush an extent still held in the cache, unless we are failing. */
        if (!ret)
                ret = emit_last_fiemap_cache(fieinfo, &cache);
        free_extent_map(em);
out:
        unlock_extent_cached(&inode->io_tree, start, start + len - 1,
                             &cached_state);

out_free_ulist:
        btrfs_free_path(path);
        ulist_free(roots);
        ulist_free(tmp_ulist);
        return ret;
}
4985
4986 static void __free_extent_buffer(struct extent_buffer *eb)
4987 {
4988         kmem_cache_free(extent_buffer_cache, eb);
4989 }
4990
4991 int extent_buffer_under_io(const struct extent_buffer *eb)
4992 {
4993         return (atomic_read(&eb->io_pages) ||
4994                 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4995                 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4996 }
4997
4998 static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
4999 {
5000         struct btrfs_subpage *subpage;
5001
5002         lockdep_assert_held(&page->mapping->private_lock);
5003
5004         if (PagePrivate(page)) {
5005                 subpage = (struct btrfs_subpage *)page->private;
5006                 if (atomic_read(&subpage->eb_refs))
5007                         return true;
5008         }
5009         return false;
5010 }
5011
5012 static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5013 {
5014         struct btrfs_fs_info *fs_info = eb->fs_info;
5015         const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5016
5017         /*
5018          * For mapped eb, we're going to change the page private, which should
5019          * be done under the private_lock.
5020          */
5021         if (mapped)
5022                 spin_lock(&page->mapping->private_lock);
5023
5024         if (!PagePrivate(page)) {
5025                 if (mapped)
5026                         spin_unlock(&page->mapping->private_lock);
5027                 return;
5028         }
5029
5030         if (fs_info->sectorsize == PAGE_SIZE) {
5031                 /*
5032                  * We do this since we'll remove the pages after we've
5033                  * removed the eb from the radix tree, so we could race
5034                  * and have this page now attached to the new eb.  So
5035                  * only clear page_private if it's still connected to
5036                  * this eb.
5037                  */
5038                 if (PagePrivate(page) &&
5039                     page->private == (unsigned long)eb) {
5040                         BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5041                         BUG_ON(PageDirty(page));
5042                         BUG_ON(PageWriteback(page));
5043                         /*
5044                          * We need to make sure we haven't be attached
5045                          * to a new eb.
5046                          */
5047                         detach_page_private(page);
5048                 }
5049                 if (mapped)
5050                         spin_unlock(&page->mapping->private_lock);
5051                 return;
5052         }
5053
5054         /*
5055          * For subpage, we can have dummy eb with page private.  In this case,
5056          * we can directly detach the private as such page is only attached to
5057          * one dummy eb, no sharing.
5058          */
5059         if (!mapped) {
5060                 btrfs_detach_subpage(fs_info, page);
5061                 return;
5062         }
5063
5064         btrfs_page_dec_eb_refs(fs_info, page);
5065
5066         /*
5067          * We can only detach the page private if there are no other ebs in the
5068          * page range.
5069          */
5070         if (!page_range_has_eb(fs_info, page))
5071                 btrfs_detach_subpage(fs_info, page);
5072
5073         spin_unlock(&page->mapping->private_lock);
5074 }
5075
5076 /* Release all pages attached to the extent buffer */
5077 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5078 {
5079         int i;
5080         int num_pages;
5081
5082         ASSERT(!extent_buffer_under_io(eb));
5083
5084         num_pages = num_extent_pages(eb);
5085         for (i = 0; i < num_pages; i++) {
5086                 struct page *page = eb->pages[i];
5087
5088                 if (!page)
5089                         continue;
5090
5091                 detach_extent_buffer_page(eb, page);
5092
5093                 /* One for when we allocated the page */
5094                 put_page(page);
5095         }
5096 }
5097
5098 /*
5099  * Helper for releasing the extent buffer.
5100  */
5101 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5102 {
5103         btrfs_release_extent_buffer_pages(eb);
5104         btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
5105         __free_extent_buffer(eb);
5106 }
5107
5108 static struct extent_buffer *
5109 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
5110                       unsigned long len)
5111 {
5112         struct extent_buffer *eb = NULL;
5113
5114         eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
5115         eb->start = start;
5116         eb->len = len;
5117         eb->fs_info = fs_info;
5118         eb->bflags = 0;
5119         init_rwsem(&eb->lock);
5120
5121         btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5122                              &fs_info->allocated_ebs);
5123
5124         spin_lock_init(&eb->refs_lock);
5125         atomic_set(&eb->refs, 1);
5126         atomic_set(&eb->io_pages, 0);
5127
5128         ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
5129
5130         return eb;
5131 }
5132
5133 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
5134 {
5135         int i;
5136         struct page *p;
5137         struct extent_buffer *new;
5138         int num_pages = num_extent_pages(src);
5139
5140         new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
5141         if (new == NULL)
5142                 return NULL;
5143
5144         /*
5145          * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5146          * btrfs_release_extent_buffer() have different behavior for
5147          * UNMAPPED subpage extent buffer.
5148          */
5149         set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5150
5151         for (i = 0; i < num_pages; i++) {
5152                 int ret;
5153
5154                 p = alloc_page(GFP_NOFS);
5155                 if (!p) {
5156                         btrfs_release_extent_buffer(new);
5157                         return NULL;
5158                 }
5159                 ret = attach_extent_buffer_page(new, p, NULL);
5160                 if (ret < 0) {
5161                         put_page(p);
5162                         btrfs_release_extent_buffer(new);
5163                         return NULL;
5164                 }
5165                 WARN_ON(PageDirty(p));
5166                 SetPageUptodate(p);
5167                 new->pages[i] = p;
5168                 copy_page(page_address(p), page_address(src->pages[i]));
5169         }
5170         set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
5171
5172         return new;
5173 }
5174
5175 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5176                                                   u64 start, unsigned long len)
5177 {
5178         struct extent_buffer *eb;
5179         int num_pages;
5180         int i;
5181
5182         eb = __alloc_extent_buffer(fs_info, start, len);
5183         if (!eb)
5184                 return NULL;
5185
5186         num_pages = num_extent_pages(eb);
5187         for (i = 0; i < num_pages; i++) {
5188                 int ret;
5189
5190                 eb->pages[i] = alloc_page(GFP_NOFS);
5191                 if (!eb->pages[i])
5192                         goto err;
5193                 ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
5194                 if (ret < 0)
5195                         goto err;
5196         }
5197         set_extent_buffer_uptodate(eb);
5198         btrfs_set_header_nritems(eb, 0);
5199         set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5200
5201         return eb;
5202 err:
5203         for (; i > 0; i--) {
5204                 detach_extent_buffer_page(eb, eb->pages[i - 1]);
5205                 __free_page(eb->pages[i - 1]);
5206         }
5207         __free_extent_buffer(eb);
5208         return NULL;
5209 }
5210
5211 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5212                                                 u64 start)
5213 {
5214         return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
5215 }
5216
5217 static void check_buffer_tree_ref(struct extent_buffer *eb)
5218 {
5219         int refs;
5220         /*
5221          * The TREE_REF bit is first set when the extent_buffer is added
5222          * to the radix tree. It is also reset, if unset, when a new reference
5223          * is created by find_extent_buffer.
5224          *
5225          * It is only cleared in two cases: freeing the last non-tree
5226          * reference to the extent_buffer when its STALE bit is set or
5227          * calling releasepage when the tree reference is the only reference.
5228          *
5229          * In both cases, care is taken to ensure that the extent_buffer's
5230          * pages are not under io. However, releasepage can be concurrently
5231          * called with creating new references, which is prone to race
5232          * conditions between the calls to check_buffer_tree_ref in those
5233          * codepaths and clearing TREE_REF in try_release_extent_buffer.
5234          *
5235          * The actual lifetime of the extent_buffer in the radix tree is
5236          * adequately protected by the refcount, but the TREE_REF bit and
5237          * its corresponding reference are not. To protect against this
5238          * class of races, we call check_buffer_tree_ref from the codepaths
5239          * which trigger io after they set eb->io_pages. Note that once io is
5240          * initiated, TREE_REF can no longer be cleared, so that is the
5241          * moment at which any such race is best fixed.
5242          */
5243         refs = atomic_read(&eb->refs);
5244         if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5245                 return;
5246
5247         spin_lock(&eb->refs_lock);
5248         if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5249                 atomic_inc(&eb->refs);
5250         spin_unlock(&eb->refs_lock);
5251 }
5252
5253 static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5254                 struct page *accessed)
5255 {
5256         int num_pages, i;
5257
5258         check_buffer_tree_ref(eb);
5259
5260         num_pages = num_extent_pages(eb);
5261         for (i = 0; i < num_pages; i++) {
5262                 struct page *p = eb->pages[i];
5263
5264                 if (p != accessed)
5265                         mark_page_accessed(p);
5266         }
5267 }
5268
5269 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
5270                                          u64 start)
5271 {
5272         struct extent_buffer *eb;
5273
5274         rcu_read_lock();
5275         eb = radix_tree_lookup(&fs_info->buffer_radix,
5276                                start >> fs_info->sectorsize_bits);
5277         if (eb && atomic_inc_not_zero(&eb->refs)) {
5278                 rcu_read_unlock();
5279                 /*
5280                  * Lock our eb's refs_lock to avoid races with
5281                  * free_extent_buffer. When we get our eb it might be flagged
5282                  * with EXTENT_BUFFER_STALE and another task running
5283                  * free_extent_buffer might have seen that flag set,
5284                  * eb->refs == 2, that the buffer isn't under IO (dirty and
5285                  * writeback flags not set) and it's still in the tree (flag
5286                  * EXTENT_BUFFER_TREE_REF set), therefore being in the process
5287                  * of decrementing the extent buffer's reference count twice.
5288                  * So here we could race and increment the eb's reference count,
5289                  * clear its stale flag, mark it as dirty and drop our reference
5290                  * before the other task finishes executing free_extent_buffer,
5291                  * which would later result in an attempt to free an extent
5292                  * buffer that is dirty.
5293                  */
5294                 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
5295                         spin_lock(&eb->refs_lock);
5296                         spin_unlock(&eb->refs_lock);
5297                 }
5298                 mark_extent_buffer_accessed(eb, NULL);
5299                 return eb;
5300         }
5301         rcu_read_unlock();
5302
5303         return NULL;
5304 }
5305
5306 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5307 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
5308                                         u64 start)
5309 {
5310         struct extent_buffer *eb, *exists = NULL;
5311         int ret;
5312
5313         eb = find_extent_buffer(fs_info, start);
5314         if (eb)
5315                 return eb;
5316         eb = alloc_dummy_extent_buffer(fs_info, start);
5317         if (!eb)
5318                 return ERR_PTR(-ENOMEM);
5319         eb->fs_info = fs_info;
5320 again:
5321         ret = radix_tree_preload(GFP_NOFS);
5322         if (ret) {
5323                 exists = ERR_PTR(ret);
5324                 goto free_eb;
5325         }
5326         spin_lock(&fs_info->buffer_lock);
5327         ret = radix_tree_insert(&fs_info->buffer_radix,
5328                                 start >> fs_info->sectorsize_bits, eb);
5329         spin_unlock(&fs_info->buffer_lock);
5330         radix_tree_preload_end();
5331         if (ret == -EEXIST) {
5332                 exists = find_extent_buffer(fs_info, start);
5333                 if (exists)
5334                         goto free_eb;
5335                 else
5336                         goto again;
5337         }
5338         check_buffer_tree_ref(eb);
5339         set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5340
5341         return eb;
5342 free_eb:
5343         btrfs_release_extent_buffer(eb);
5344         return exists;
5345 }
5346 #endif
5347
5348 static struct extent_buffer *grab_extent_buffer(
5349                 struct btrfs_fs_info *fs_info, struct page *page)
5350 {
5351         struct extent_buffer *exists;
5352
5353         /*
5354          * For subpage case, we completely rely on radix tree to ensure we
5355          * don't try to insert two ebs for the same bytenr.  So here we always
5356          * return NULL and just continue.
5357          */
5358         if (fs_info->sectorsize < PAGE_SIZE)
5359                 return NULL;
5360
5361         /* Page not yet attached to an extent buffer */
5362         if (!PagePrivate(page))
5363                 return NULL;
5364
5365         /*
5366          * We could have already allocated an eb for this page and attached one
5367          * so lets see if we can get a ref on the existing eb, and if we can we
5368          * know it's good and we can just return that one, else we know we can
5369          * just overwrite page->private.
5370          */
5371         exists = (struct extent_buffer *)page->private;
5372         if (atomic_inc_not_zero(&exists->refs))
5373                 return exists;
5374
5375         WARN_ON(PageDirty(page));
5376         detach_page_private(page);
5377         return NULL;
5378 }
5379
/*
 * Find or create the extent buffer for the tree block at logical address
 * @start.
 *
 * @fs_info:    filesystem the tree block belongs to
 * @start:      logical address of the tree block, must be sectorsize aligned
 * @owner_root: handed to btrfs_set_buffer_lockdep_class() for a new eb
 * @level:      handed to btrfs_set_buffer_lockdep_class() for a new eb
 *
 * Return an already existing eb when one is found (through
 * find_extent_buffer() or grab_extent_buffer()), the newly allocated eb once
 * it has been inserted into fs_info->buffer_radix, or an ERR_PTR() on
 * failure.
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start, u64 owner_root, int level)
{
	unsigned long len = fs_info->nodesize;
	int num_pages;
	int i;
	unsigned long index = start >> PAGE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	int uptodate = 1;
	int ret;

	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
		btrfs_err(fs_info, "bad tree block start %llu", start);
		return ERR_PTR(-EINVAL);
	}

	/* For the subpage case a tree block must fit into a single page */
	if (fs_info->sectorsize < PAGE_SIZE &&
	    offset_in_page(start) + len > PAGE_SIZE) {
		btrfs_err(fs_info,
		"tree block crosses page boundary, start %llu nodesize %lu",
			  start, len);
		return ERR_PTR(-EINVAL);
	}

	/* Nothing to allocate if the eb can already be looked up */
	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;

	eb = __alloc_extent_buffer(fs_info, start, len);
	if (!eb)
		return ERR_PTR(-ENOMEM);
	btrfs_set_buffer_lockdep_class(owner_root, eb, level);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++, index++) {
		struct btrfs_subpage *prealloc = NULL;

		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
		if (!p) {
			exists = ERR_PTR(-ENOMEM);
			goto free_eb;
		}

		/*
		 * Preallocate page->private for the subpage case, so that we
		 * won't allocate memory with private_lock held.  The memory
		 * will be freed by attach_extent_buffer_page() or freed
		 * manually if we exit earlier.
		 *
		 * Although we have ensured that one subpage eb can only have
		 * one page, this may change in the future for 16K page size
		 * support, so we still preallocate the memory in the loop.
		 */
		ret = btrfs_alloc_subpage(fs_info, &prealloc,
					  BTRFS_SUBPAGE_METADATA);
		if (ret < 0) {
			/* The page is not recorded in eb->pages[] yet */
			unlock_page(p);
			put_page(p);
			exists = ERR_PTR(ret);
			goto free_eb;
		}

		spin_lock(&mapping->private_lock);
		exists = grab_extent_buffer(fs_info, p);
		if (exists) {
			/*
			 * The page already belongs to a live eb, hand that one
			 * back and throw away everything we set up so far.
			 */
			spin_unlock(&mapping->private_lock);
			unlock_page(p);
			put_page(p);
			mark_extent_buffer_accessed(exists, p);
			btrfs_free_subpage(prealloc);
			goto free_eb;
		}
		/* Should not fail, as we have preallocated the memory */
		ret = attach_extent_buffer_page(eb, p, prealloc);
		ASSERT(!ret);
		/*
		 * To inform that we have an extra eb under allocation, so that
		 * detach_extent_buffer_page() won't release the page private
		 * when the eb hasn't yet been inserted into the radix tree.
		 *
		 * The ref will be decreased when the eb releases the page, in
		 * detach_extent_buffer_page().
		 * Thus it needs no special handling in the error path.
		 */
		btrfs_page_inc_eb_refs(fs_info, p);
		spin_unlock(&mapping->private_lock);

		WARN_ON(PageDirty(p));
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * We can't unlock the pages just yet since the extent buffer
		 * hasn't been properly inserted in the radix tree, this
		 * opens a race with btree_releasepage which can free a page
		 * while we are still filling in all pages for the buffer and
		 * we could crash.
		 */
	}
	/* Only flag the eb uptodate if every single page already was */
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	}

	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> fs_info->sectorsize_bits, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		/*
		 * Somebody inserted an eb for the same slot first.  Use it if
		 * it can still be looked up, otherwise retry the insertion.
		 */
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	/*
	 * Now it's safe to unlock the pages because any calls to
	 * btree_releasepage will correctly detect that a page belongs to a
	 * live buffer and won't free them prematurely.
	 */
	for (i = 0; i < num_pages; i++)
		unlock_page(eb->pages[i]);
	return eb;

free_eb:
	/* The unpublished eb is expected to carry nothing but our reference */
	WARN_ON(!atomic_dec_and_test(&eb->refs));
	/* Only the pages that made it into eb->pages[] are still locked */
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	btrfs_release_extent_buffer(eb);
	return exists;
}
5527
5528 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5529 {
5530         struct extent_buffer *eb =
5531                         container_of(head, struct extent_buffer, rcu_head);
5532
5533         __free_extent_buffer(eb);
5534 }
5535
5536 static int release_extent_buffer(struct extent_buffer *eb)
5537         __releases(&eb->refs_lock)
5538 {
5539         lockdep_assert_held(&eb->refs_lock);
5540
5541         WARN_ON(atomic_read(&eb->refs) == 0);
5542         if (atomic_dec_and_test(&eb->refs)) {
5543                 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
5544                         struct btrfs_fs_info *fs_info = eb->fs_info;
5545
5546                         spin_unlock(&eb->refs_lock);
5547
5548                         spin_lock(&fs_info->buffer_lock);
5549                         radix_tree_delete(&fs_info->buffer_radix,
5550                                           eb->start >> fs_info->sectorsize_bits);
5551                         spin_unlock(&fs_info->buffer_lock);
5552                 } else {
5553                         spin_unlock(&eb->refs_lock);
5554                 }
5555
5556                 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
5557                 /* Should be safe to release our pages at this point */
5558                 btrfs_release_extent_buffer_pages(eb);
5559 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5560                 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
5561                         __free_extent_buffer(eb);
5562                         return 1;
5563                 }
5564 #endif
5565                 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
5566                 return 1;
5567         }
5568         spin_unlock(&eb->refs_lock);
5569
5570         return 0;
5571 }
5572
5573 void free_extent_buffer(struct extent_buffer *eb)
5574 {
5575         int refs;
5576         int old;
5577         if (!eb)
5578                 return;
5579
5580         while (1) {
5581                 refs = atomic_read(&eb->refs);
5582                 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
5583                     || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
5584                         refs == 1))
5585                         break;
5586                 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5587                 if (old == refs)
5588                         return;
5589         }
5590
5591         spin_lock(&eb->refs_lock);
5592         if (atomic_read(&eb->refs) == 2 &&
5593             test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
5594             !extent_buffer_under_io(eb) &&
5595             test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5596                 atomic_dec(&eb->refs);
5597
5598         /*
5599          * I know this is terrible, but it's temporary until we stop tracking
5600          * the uptodate bits and such for the extent buffers.
5601          */
5602         release_extent_buffer(eb);
5603 }
5604
5605 void free_extent_buffer_stale(struct extent_buffer *eb)
5606 {
5607         if (!eb)
5608                 return;
5609
5610         spin_lock(&eb->refs_lock);
5611         set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5612
5613         if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
5614             test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5615                 atomic_dec(&eb->refs);
5616         release_extent_buffer(eb);
5617 }
5618
5619 void clear_extent_buffer_dirty(const struct extent_buffer *eb)
5620 {
5621         int i;
5622         int num_pages;
5623         struct page *page;
5624
5625         num_pages = num_extent_pages(eb);
5626
5627         for (i = 0; i < num_pages; i++) {
5628                 page = eb->pages[i];
5629                 if (!PageDirty(page))
5630                         continue;
5631
5632                 lock_page(page);
5633                 WARN_ON(!PagePrivate(page));
5634
5635                 clear_page_dirty_for_io(page);
5636                 xa_lock_irq(&page->mapping->i_pages);
5637                 if (!PageDirty(page))
5638                         __xa_clear_mark(&page->mapping->i_pages,
5639                                         page_index(page), PAGECACHE_TAG_DIRTY);
5640                 xa_unlock_irq(&page->mapping->i_pages);
5641                 ClearPageError(page);
5642                 unlock_page(page);
5643         }
5644         WARN_ON(atomic_read(&eb->refs) == 0);
5645 }
5646
5647 bool set_extent_buffer_dirty(struct extent_buffer *eb)
5648 {
5649         int i;
5650         int num_pages;
5651         bool was_dirty;
5652
5653         check_buffer_tree_ref(eb);
5654
5655         was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
5656
5657         num_pages = num_extent_pages(eb);
5658         WARN_ON(atomic_read(&eb->refs) == 0);
5659         WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5660
5661         if (!was_dirty)
5662                 for (i = 0; i < num_pages; i++)
5663                         set_page_dirty(eb->pages[i]);
5664
5665 #ifdef CONFIG_BTRFS_DEBUG
5666         for (i = 0; i < num_pages; i++)
5667                 ASSERT(PageDirty(eb->pages[i]));
5668 #endif
5669
5670         return was_dirty;
5671 }
5672
5673 void clear_extent_buffer_uptodate(struct extent_buffer *eb)
5674 {
5675         struct btrfs_fs_info *fs_info = eb->fs_info;
5676         struct page *page;
5677         int num_pages;
5678         int i;
5679
5680         clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5681         num_pages = num_extent_pages(eb);
5682         for (i = 0; i < num_pages; i++) {
5683                 page = eb->pages[i];
5684                 if (page)
5685                         btrfs_page_clear_uptodate(fs_info, page,
5686                                                   eb->start, eb->len);
5687         }
5688 }
5689
5690 void set_extent_buffer_uptodate(struct extent_buffer *eb)
5691 {
5692         struct btrfs_fs_info *fs_info = eb->fs_info;
5693         struct page *page;
5694         int num_pages;
5695         int i;
5696
5697         set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5698         num_pages = num_extent_pages(eb);
5699         for (i = 0; i < num_pages; i++) {
5700                 page = eb->pages[i];
5701                 btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
5702         }
5703 }
5704
/*
 * Read in the pages of @eb that are not uptodate yet.
 *
 * @eb:         extent buffer to read
 * @wait:       with WAIT_NONE the pages are only trylocked and the function
 *              gives up (returning 0) when a page is already locked; with
 *              WAIT_COMPLETE the submitted reads are waited for and the
 *              result is checked; any other value submits without waiting
 * @mirror_num: mirror number handed down to the bio submission
 *
 * Return 0 if the eb is uptodate already or the reads were submitted (and,
 * for WAIT_COMPLETE, all pages ended up uptodate), the submission error if
 * submitting failed, or -EIO if a page is not uptodate after waiting.
 */
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
	int i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	int num_pages;
	unsigned long num_reads = 0;
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (wait == WAIT_NONE) {
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		/* Counted so that unlock_exit only unlocks what we locked */
		locked_pages++;
	}
	/*
	 * We need to firstly lock all pages to make sure that
	 * the uptodate bit of our pages won't be affected by
	 * clear_extent_buffer_uptodate().
	 */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}

	if (all_uptodate) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	/* One io_pages count for every page that is going to be read */
	atomic_set(&eb->io_pages, num_reads);
	/*
	 * It is possible for releasepage to clear the TREE_REF bit before we
	 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
	 */
	check_buffer_tree_ref(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];

		if (!PageUptodate(page)) {
			if (ret) {
				/*
				 * An earlier submission failed: do not read
				 * this page, just undo its io_pages count and
				 * unlock it.
				 */
				atomic_dec(&eb->io_pages);
				unlock_page(page);
				continue;
			}

			ClearPageError(page);
			err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
					 page, page_offset(page), PAGE_SIZE, 0,
					 &bio, end_bio_extent_readpage,
					 mirror_num, 0, 0, false);
			if (err) {
				/*
				 * We failed to submit the bio so it's the
				 * caller's responsibility to perform cleanup
				 * i.e unlock page/set error bit.
				 */
				ret = err;
				SetPageError(page);
				unlock_page(page);
				atomic_dec(&eb->io_pages);
			}
		} else {
			/* Nothing to read for this page */
			unlock_page(page);
		}
	}

	/* Send out whatever submit_extent_page() left pending in @bio */
	if (bio) {
		err = submit_one_bio(bio, mirror_num, bio_flags);
		if (err)
			return err;
	}

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	/* WAIT_COMPLETE: wait for each page to get unlocked, then check it */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	/* Unlock, in reverse order, exactly the pages locked above */
	while (locked_pages > 0) {
		locked_pages--;
		page = eb->pages[locked_pages];
		unlock_page(page);
	}
	return ret;
}
5815
5816 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
5817                             unsigned long len)
5818 {
5819         btrfs_warn(eb->fs_info,
5820                 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
5821                 eb->start, eb->len, start, len);
5822         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5823
5824         return true;
5825 }
5826
5827 /*
5828  * Check if the [start, start + len) range is valid before reading/writing
5829  * the eb.
5830  * NOTE: @start and @len are offset inside the eb, not logical address.
5831  *
5832  * Caller should not touch the dst/src memory if this function returns error.
5833  */
5834 static inline int check_eb_range(const struct extent_buffer *eb,
5835                                  unsigned long start, unsigned long len)
5836 {
5837         unsigned long offset;
5838
5839         /* start, start + len should not go beyond eb->len nor overflow */
5840         if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
5841                 return report_eb_range(eb, start, len);
5842
5843         return false;
5844 }
5845
5846 void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
5847                         unsigned long start, unsigned long len)
5848 {
5849         size_t cur;
5850         size_t offset;
5851         struct page *page;
5852         char *kaddr;
5853         char *dst = (char *)dstv;
5854         unsigned long i = get_eb_page_index(start);
5855
5856         if (check_eb_range(eb, start, len))
5857                 return;
5858
5859         offset = get_eb_offset_in_page(eb, start);
5860
5861         while (len > 0) {
5862                 page = eb->pages[i];
5863
5864                 cur = min(len, (PAGE_SIZE - offset));
5865                 kaddr = page_address(page);
5866                 memcpy(dst, kaddr + offset, cur);
5867
5868                 dst += cur;
5869                 len -= cur;
5870                 offset = 0;
5871                 i++;
5872         }
5873 }
5874
5875 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
5876                                        void __user *dstv,
5877                                        unsigned long start, unsigned long len)
5878 {
5879         size_t cur;
5880         size_t offset;
5881         struct page *page;
5882         char *kaddr;
5883         char __user *dst = (char __user *)dstv;
5884         unsigned long i = get_eb_page_index(start);
5885         int ret = 0;
5886
5887         WARN_ON(start > eb->len);
5888         WARN_ON(start + len > eb->start + eb->len);
5889
5890         offset = get_eb_offset_in_page(eb, start);
5891
5892         while (len > 0) {
5893                 page = eb->pages[i];
5894
5895                 cur = min(len, (PAGE_SIZE - offset));
5896                 kaddr = page_address(page);
5897                 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
5898                         ret = -EFAULT;
5899                         break;
5900                 }
5901
5902                 dst += cur;
5903                 len -= cur;
5904                 offset = 0;
5905                 i++;
5906         }
5907
5908         return ret;
5909 }
5910
5911 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
5912                          unsigned long start, unsigned long len)
5913 {
5914         size_t cur;
5915         size_t offset;
5916         struct page *page;
5917         char *kaddr;
5918         char *ptr = (char *)ptrv;
5919         unsigned long i = get_eb_page_index(start);
5920         int ret = 0;
5921
5922         if (check_eb_range(eb, start, len))
5923                 return -EINVAL;
5924
5925         offset = get_eb_offset_in_page(eb, start);
5926
5927         while (len > 0) {
5928                 page = eb->pages[i];
5929
5930                 cur = min(len, (PAGE_SIZE - offset));
5931
5932                 kaddr = page_address(page);
5933                 ret = memcmp(ptr, kaddr + offset, cur);
5934                 if (ret)
5935                         break;
5936
5937                 ptr += cur;
5938                 len -= cur;
5939                 offset = 0;
5940                 i++;
5941         }
5942         return ret;
5943 }
5944
5945 void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
5946                 const void *srcv)
5947 {
5948         char *kaddr;
5949
5950         WARN_ON(!PageUptodate(eb->pages[0]));
5951         kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
5952         memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
5953                         BTRFS_FSID_SIZE);
5954 }
5955
5956 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
5957 {
5958         char *kaddr;
5959
5960         WARN_ON(!PageUptodate(eb->pages[0]));
5961         kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
5962         memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
5963                         BTRFS_FSID_SIZE);
5964 }
5965
5966 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
5967                          unsigned long start, unsigned long len)
5968 {
5969         size_t cur;
5970         size_t offset;
5971         struct page *page;
5972         char *kaddr;
5973         char *src = (char *)srcv;
5974         unsigned long i = get_eb_page_index(start);
5975
5976         if (check_eb_range(eb, start, len))
5977                 return;
5978
5979         offset = get_eb_offset_in_page(eb, start);
5980
5981         while (len > 0) {
5982                 page = eb->pages[i];
5983                 WARN_ON(!PageUptodate(page));
5984
5985                 cur = min(len, PAGE_SIZE - offset);
5986                 kaddr = page_address(page);
5987                 memcpy(kaddr + offset, src, cur);
5988
5989                 src += cur;
5990                 len -= cur;
5991                 offset = 0;
5992                 i++;
5993         }
5994 }
5995
5996 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
5997                 unsigned long len)
5998 {
5999         size_t cur;
6000         size_t offset;
6001         struct page *page;
6002         char *kaddr;
6003         unsigned long i = get_eb_page_index(start);
6004
6005         if (check_eb_range(eb, start, len))
6006                 return;
6007
6008         offset = get_eb_offset_in_page(eb, start);
6009
6010         while (len > 0) {
6011                 page = eb->pages[i];
6012                 WARN_ON(!PageUptodate(page));
6013
6014                 cur = min(len, PAGE_SIZE - offset);
6015                 kaddr = page_address(page);
6016                 memset(kaddr + offset, 0, cur);
6017
6018                 len -= cur;
6019                 offset = 0;
6020                 i++;
6021         }
6022 }
6023
6024 void copy_extent_buffer_full(const struct extent_buffer *dst,
6025                              const struct extent_buffer *src)
6026 {
6027         int i;
6028         int num_pages;
6029
6030         ASSERT(dst->len == src->len);
6031
6032         if (dst->fs_info->sectorsize == PAGE_SIZE) {
6033                 num_pages = num_extent_pages(dst);
6034                 for (i = 0; i < num_pages; i++)
6035                         copy_page(page_address(dst->pages[i]),
6036                                   page_address(src->pages[i]));
6037         } else {
6038                 size_t src_offset = get_eb_offset_in_page(src, 0);
6039                 size_t dst_offset = get_eb_offset_in_page(dst, 0);
6040
6041                 ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
6042                 memcpy(page_address(dst->pages[0]) + dst_offset,
6043                        page_address(src->pages[0]) + src_offset,
6044                        src->len);
6045         }
6046 }
6047
6048 void copy_extent_buffer(const struct extent_buffer *dst,
6049                         const struct extent_buffer *src,
6050                         unsigned long dst_offset, unsigned long src_offset,
6051                         unsigned long len)
6052 {
6053         u64 dst_len = dst->len;
6054         size_t cur;
6055         size_t offset;
6056         struct page *page;
6057         char *kaddr;
6058         unsigned long i = get_eb_page_index(dst_offset);
6059
6060         if (check_eb_range(dst, dst_offset, len) ||
6061             check_eb_range(src, src_offset, len))
6062                 return;
6063
6064         WARN_ON(src->len != dst_len);
6065
6066         offset = get_eb_offset_in_page(dst, dst_offset);
6067
6068         while (len > 0) {
6069                 page = dst->pages[i];
6070                 WARN_ON(!PageUptodate(page));
6071
6072                 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
6073
6074                 kaddr = page_address(page);
6075                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
6076
6077                 src_offset += cur;
6078                 len -= cur;
6079                 offset = 0;
6080                 i++;
6081         }
6082 }
6083
6084 /*
6085  * eb_bitmap_offset() - calculate the page and offset of the byte containing the
6086  * given bit number
6087  * @eb: the extent buffer
6088  * @start: offset of the bitmap item in the extent buffer
6089  * @nr: bit number
6090  * @page_index: return index of the page in the extent buffer that contains the
6091  * given bit number
6092  * @page_offset: return offset into the page given by page_index
6093  *
6094  * This helper hides the ugliness of finding the byte in an extent buffer which
6095  * contains a given bit.
6096  */
6097 static inline void eb_bitmap_offset(const struct extent_buffer *eb,
6098                                     unsigned long start, unsigned long nr,
6099                                     unsigned long *page_index,
6100                                     size_t *page_offset)
6101 {
6102         size_t byte_offset = BIT_BYTE(nr);
6103         size_t offset;
6104
6105         /*
6106          * The byte we want is the offset of the extent buffer + the offset of
6107          * the bitmap item in the extent buffer + the offset of the byte in the
6108          * bitmap item.
6109          */
6110         offset = start + offset_in_page(eb->start) + byte_offset;
6111
6112         *page_index = offset >> PAGE_SHIFT;
6113         *page_offset = offset_in_page(offset);
6114 }
6115
6116 /**
6117  * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
6118  * @eb: the extent buffer
6119  * @start: offset of the bitmap item in the extent buffer
6120  * @nr: bit number to test
6121  */
6122 int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
6123                            unsigned long nr)
6124 {
6125         u8 *kaddr;
6126         struct page *page;
6127         unsigned long i;
6128         size_t offset;
6129
6130         eb_bitmap_offset(eb, start, nr, &i, &offset);
6131         page = eb->pages[i];
6132         WARN_ON(!PageUptodate(page));
6133         kaddr = page_address(page);
6134         return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
6135 }
6136
6137 /**
6138  * extent_buffer_bitmap_set - set an area of a bitmap
6139  * @eb: the extent buffer
6140  * @start: offset of the bitmap item in the extent buffer
6141  * @pos: bit number of the first bit
6142  * @len: number of bits to set
6143  */
6144 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
6145                               unsigned long pos, unsigned long len)
6146 {
6147         u8 *kaddr;
6148         struct page *page;
6149         unsigned long i;
6150         size_t offset;
6151         const unsigned int size = pos + len;
6152         int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
6153         u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
6154
6155         eb_bitmap_offset(eb, start, pos, &i, &offset);
6156         page = eb->pages[i];
6157         WARN_ON(!PageUptodate(page));
6158         kaddr = page_address(page);
6159
6160         while (len >= bits_to_set) {
6161                 kaddr[offset] |= mask_to_set;
6162                 len -= bits_to_set;
6163                 bits_to_set = BITS_PER_BYTE;
6164                 mask_to_set = ~0;
6165                 if (++offset >= PAGE_SIZE && len > 0) {
6166                         offset = 0;
6167                         page = eb->pages[++i];
6168                         WARN_ON(!PageUptodate(page));
6169                         kaddr = page_address(page);
6170                 }
6171         }
6172         if (len) {
6173                 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
6174                 kaddr[offset] |= mask_to_set;
6175         }
6176 }
6177
6178
6179 /**
6180  * extent_buffer_bitmap_clear - clear an area of a bitmap
6181  * @eb: the extent buffer
6182  * @start: offset of the bitmap item in the extent buffer
6183  * @pos: bit number of the first bit
6184  * @len: number of bits to clear
6185  */
6186 void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
6187                                 unsigned long start, unsigned long pos,
6188                                 unsigned long len)
6189 {
6190         u8 *kaddr;
6191         struct page *page;
6192         unsigned long i;
6193         size_t offset;
6194         const unsigned int size = pos + len;
6195         int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
6196         u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
6197
6198         eb_bitmap_offset(eb, start, pos, &i, &offset);
6199         page = eb->pages[i];
6200         WARN_ON(!PageUptodate(page));
6201         kaddr = page_address(page);
6202
6203         while (len >= bits_to_clear) {
6204                 kaddr[offset] &= ~mask_to_clear;
6205                 len -= bits_to_clear;
6206                 bits_to_clear = BITS_PER_BYTE;
6207                 mask_to_clear = ~0;
6208                 if (++offset >= PAGE_SIZE && len > 0) {
6209                         offset = 0;
6210                         page = eb->pages[++i];
6211                         WARN_ON(!PageUptodate(page));
6212                         kaddr = page_address(page);
6213                 }
6214         }
6215         if (len) {
6216                 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
6217                 kaddr[offset] &= ~mask_to_clear;
6218         }
6219 }
6220
/*
 * Tell whether two areas of @len bytes each, starting at @src and @dst,
 * overlap, i.e. whether their start offsets are less than @len apart.
 */
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	if (src > dst)
		return src - dst < len;

	return dst - src < len;
}
6226
/*
 * Copy @len bytes from offset @src_off of @src_page to offset @dst_off of
 * @dst_page.
 *
 * memmove() is used only when both pages are the same page and the two areas
 * overlap, memcpy() in every other case.
 */
static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *to = page_address(dst_page);
	char *from;

	if (dst_page == src_page) {
		from = to;
		if (areas_overlap(src_off, dst_off, len)) {
			memmove(to + dst_off, from + src_off, len);
			return;
		}
	} else {
		from = page_address(src_page);
	}

	memcpy(to + dst_off, from + src_off, len);
}
6248
6249 void memcpy_extent_buffer(const struct extent_buffer *dst,
6250                           unsigned long dst_offset, unsigned long src_offset,
6251                           unsigned long len)
6252 {
6253         size_t cur;
6254         size_t dst_off_in_page;
6255         size_t src_off_in_page;
6256         unsigned long dst_i;
6257         unsigned long src_i;
6258
6259         if (check_eb_range(dst, dst_offset, len) ||
6260             check_eb_range(dst, src_offset, len))
6261                 return;
6262
6263         while (len > 0) {
6264                 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
6265                 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
6266
6267                 dst_i = get_eb_page_index(dst_offset);
6268                 src_i = get_eb_page_index(src_offset);
6269
6270                 cur = min(len, (unsigned long)(PAGE_SIZE -
6271                                                src_off_in_page));
6272                 cur = min_t(unsigned long, cur,
6273                         (unsigned long)(PAGE_SIZE - dst_off_in_page));
6274
6275                 copy_pages(dst->pages[dst_i], dst->pages[src_i],
6276                            dst_off_in_page, src_off_in_page, cur);
6277
6278                 src_offset += cur;
6279                 dst_offset += cur;
6280                 len -= cur;
6281         }
6282 }
6283
6284 void memmove_extent_buffer(const struct extent_buffer *dst,
6285                            unsigned long dst_offset, unsigned long src_offset,
6286                            unsigned long len)
6287 {
6288         size_t cur;
6289         size_t dst_off_in_page;
6290         size_t src_off_in_page;
6291         unsigned long dst_end = dst_offset + len - 1;
6292         unsigned long src_end = src_offset + len - 1;
6293         unsigned long dst_i;
6294         unsigned long src_i;
6295
6296         if (check_eb_range(dst, dst_offset, len) ||
6297             check_eb_range(dst, src_offset, len))
6298                 return;
6299         if (dst_offset < src_offset) {
6300                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
6301                 return;
6302         }
6303         while (len > 0) {
6304                 dst_i = get_eb_page_index(dst_end);
6305                 src_i = get_eb_page_index(src_end);
6306
6307                 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
6308                 src_off_in_page = get_eb_offset_in_page(dst, src_end);
6309
6310                 cur = min_t(unsigned long, len, src_off_in_page + 1);
6311                 cur = min(cur, dst_off_in_page + 1);
6312                 copy_pages(dst->pages[dst_i], dst->pages[src_i],
6313                            dst_off_in_page - cur + 1,
6314                            src_off_in_page - cur + 1, cur);
6315
6316                 dst_end -= cur;
6317                 src_end -= cur;
6318                 len -= cur;
6319         }
6320 }
6321
6322 int try_release_extent_buffer(struct page *page)
6323 {
6324         struct extent_buffer *eb;
6325
6326         /*
6327          * We need to make sure nobody is attaching this page to an eb right
6328          * now.
6329          */
6330         spin_lock(&page->mapping->private_lock);
6331         if (!PagePrivate(page)) {
6332                 spin_unlock(&page->mapping->private_lock);
6333                 return 1;
6334         }
6335
6336         eb = (struct extent_buffer *)page->private;
6337         BUG_ON(!eb);
6338
6339         /*
6340          * This is a little awful but should be ok, we need to make sure that
6341          * the eb doesn't disappear out from under us while we're looking at
6342          * this page.
6343          */
6344         spin_lock(&eb->refs_lock);
6345         if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
6346                 spin_unlock(&eb->refs_lock);
6347                 spin_unlock(&page->mapping->private_lock);
6348                 return 0;
6349         }
6350         spin_unlock(&page->mapping->private_lock);
6351
6352         /*
6353          * If tree ref isn't set then we know the ref on this eb is a real ref,
6354          * so just return, this page will likely be freed soon anyway.
6355          */
6356         if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
6357                 spin_unlock(&eb->refs_lock);
6358                 return 0;
6359         }
6360
6361         return release_extent_buffer(eb);
6362 }
6363
6364 /*
6365  * btrfs_readahead_tree_block - attempt to readahead a child block
6366  * @fs_info:    the fs_info
6367  * @bytenr:     bytenr to read
6368  * @owner_root: objectid of the root that owns this eb
6369  * @gen:        generation for the uptodate check, can be 0
6370  * @level:      level for the eb
6371  *
6372  * Attempt to readahead a tree block at @bytenr.  If @gen is 0 then we do a
6373  * normal uptodate check of the eb, without checking the generation.  If we have
6374  * to read the block we will not block on anything.
6375  */
6376 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
6377                                 u64 bytenr, u64 owner_root, u64 gen, int level)
6378 {
6379         struct extent_buffer *eb;
6380         int ret;
6381
6382         eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
6383         if (IS_ERR(eb))
6384                 return;
6385
6386         if (btrfs_buffer_uptodate(eb, gen, 1)) {
6387                 free_extent_buffer(eb);
6388                 return;
6389         }
6390
6391         ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
6392         if (ret < 0)
6393                 free_extent_buffer_stale(eb);
6394         else
6395                 free_extent_buffer(eb);
6396 }
6397
6398 /*
6399  * btrfs_readahead_node_child - readahead a node's child block
6400  * @node:       parent node we're reading from
6401  * @slot:       slot in the parent node for the child we want to read
6402  *
6403  * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
6404  * the slot in the node provided.
6405  */
6406 void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
6407 {
6408         btrfs_readahead_tree_block(node->fs_info,
6409                                    btrfs_node_blockptr(node, slot),
6410                                    btrfs_header_owner(node),
6411                                    btrfs_node_ptr_generation(node, slot),
6412                                    btrfs_header_level(node) - 1);
6413 }