fs/btrfs/extent_io.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include <linux/bitops.h>
   4 #include <linux/slab.h>
   5 #include <linux/bio.h>
   6 #include <linux/mm.h>
   7 #include <linux/pagemap.h>
   8 #include <linux/page-flags.h>
   9 #include <linux/spinlock.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/swap.h>
  12 #include <linux/writeback.h>
  13 #include <linux/pagevec.h>
  14 #include <linux/prefetch.h>
  15 #include <linux/cleancache.h>
  16 #include "misc.h"
  17 #include "extent_io.h"
  18 #include "extent-io-tree.h"
  19 #include "extent_map.h"
  20 #include "ctree.h"
  21 #include "btrfs_inode.h"
  22 #include "volumes.h"
  23 #include "check-integrity.h"
  24 #include "locking.h"
  25 #include "rcu-string.h"
  26 #include "backref.h"
  27 #include "disk-io.h"
  28 #include "subpage.h"
  29 #include "zoned.h"
  30 #include "block-group.h"
  31
/* Slab cache for struct extent_state; created in extent_state_cache_init(). */
static struct kmem_cache *extent_state_cache;
/* Slab cache for struct extent_buffer; created in extent_io_init(). */
static struct kmem_cache *extent_buffer_cache;
/* Bio set set up in extent_io_init() and torn down in extent_io_exit(). */
static struct bio_set btrfs_bioset;
  35
/*
 * Return true when @state is linked into an rb-tree, i.e. its rb_node is
 * not in the "cleared" state that RB_EMPTY_NODE() tests for.
 */
static inline bool extent_state_in_tree(const struct extent_state *state)
{
        return !RB_EMPTY_NODE(&state->rb_node);
}
  40
  41 #ifdef CONFIG_BTRFS_DEBUG
/* Debug-only list of allocated extent_state structs, walked by the leak check. */
static LIST_HEAD(states);
/* Lock handed to btrfs_leak_debug_add()/del() when updating 'states'. */
static DEFINE_SPINLOCK(leak_lock);
  44
  45 static inline void btrfs_leak_debug_add(spinlock_t *lock,
  46                                         struct list_head *new,
  47                                         struct list_head *head)
  48 {
  49         unsigned long flags;
  50
  51         spin_lock_irqsave(lock, flags);
  52         list_add(new, head);
  53         spin_unlock_irqrestore(lock, flags);
  54 }
  55
  56 static inline void btrfs_leak_debug_del(spinlock_t *lock,
  57                                         struct list_head *entry)
  58 {
  59         unsigned long flags;
  60
  61         spin_lock_irqsave(lock, flags);
  62         list_del(entry);
  63         spin_unlock_irqrestore(lock, flags);
  64 }
  65
  66 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
  67 {
  68         struct extent_buffer *eb;
  69         unsigned long flags;
  70
  71         /*
  72          * If we didn't get into open_ctree our allocated_ebs will not be
  73          * initialized, so just skip this.
  74          */
  75         if (!fs_info->allocated_ebs.next)
  76                 return;
  77
  78         spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
  79         while (!list_empty(&fs_info->allocated_ebs)) {
  80                 eb = list_first_entry(&fs_info->allocated_ebs,
  81                                       struct extent_buffer, leak_list);
  82                 pr_err(
  83         "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
  84                        eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
  85                        btrfs_header_owner(eb));
  86                 list_del(&eb->leak_list);
  87                 kmem_cache_free(extent_buffer_cache, eb);
  88         }
  89         spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
  90 }
  91
  92 static inline void btrfs_extent_state_leak_debug_check(void)
  93 {
  94         struct extent_state *state;
  95
  96         while (!list_empty(&states)) {
  97                 state = list_entry(states.next, struct extent_state, leak_list);
  98                 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
  99                        state->start, state->end, state->state,
 100                        extent_state_in_tree(state),
 101                        refcount_read(&state->refs));
 102                 list_del(&state->leak_list);
 103                 kmem_cache_free(extent_state_cache, state);
 104         }
 105 }
 106
 107 #define btrfs_debug_check_extent_io_range(tree, start, end)             \
 108         __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
 109 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 110                 struct extent_io_tree *tree, u64 start, u64 end)
 111 {
 112         struct inode *inode = tree->private_data;
 113         u64 isize;
 114
 115         if (!inode || !is_data_inode(inode))
 116                 return;
 117
 118         isize = i_size_read(inode);
 119         if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
 120                 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
 121                     "%s: ino %llu isize %llu odd range [%llu,%llu]",
 122                         caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
 123         }
 124 }
 125 #else
/* Without CONFIG_BTRFS_DEBUG the leak and range debug hooks expand to no-ops. */
#define btrfs_leak_debug_add(lock, new, head)   do {} while (0)
#define btrfs_leak_debug_del(lock, entry)       do {} while (0)
#define btrfs_extent_state_leak_debug_check()   do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)      do {} while (0)
 130 #endif
 131
/*
 * Generic view of a tree node used by tree_insert() and __etree_search():
 * an inclusive [start, end] byte range plus the rb-tree linkage.
 *
 * NOTE(review): the helpers apply rb_entry(..., struct tree_entry, rb_node)
 * to rb_nodes embedded in struct extent_state, so this layout presumably
 * has to match the leading fields of that struct - confirm against its
 * definition.
 */
struct tree_entry {
        u64 start;
        u64 end;
        struct rb_node rb_node;
};
 137
/*
 * State passed to the write helpers in this file: the bio control structure
 * holding the bio that has not been submitted yet, plus two one-bit flags.
 */
struct extent_page_data {
        /* bio_ctrl.bio is the pending bio used by end/flush_write_bio() */
        struct btrfs_bio_ctrl bio_ctrl;
        /* tells writepage not to lock the state bits for this range
         * it still does the unlocking
         */
        unsigned int extent_locked:1;

        /* tells the submit_bio code to use REQ_SYNC */
        unsigned int sync_io:1;
};
 148
 149 static int add_extent_changeset(struct extent_state *state, u32 bits,
 150                                  struct extent_changeset *changeset,
 151                                  int set)
 152 {
 153         int ret;
 154
 155         if (!changeset)
 156                 return 0;
 157         if (set && (state->state & bits) == bits)
 158                 return 0;
 159         if (!set && (state->state & bits) == 0)
 160                 return 0;
 161         changeset->bytes_changed += state->end - state->start + 1;
 162         ret = ulist_add(&changeset->range_changed, state->start, state->end,
 163                         GFP_ATOMIC);
 164         return ret;
 165 }
 166
 167 int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 168                                 unsigned long bio_flags)
 169 {
 170         blk_status_t ret = 0;
 171         struct extent_io_tree *tree = bio->bi_private;
 172
 173         bio->bi_private = NULL;
 174
 175         if (is_data_inode(tree->private_data))
 176                 ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
 177                                             bio_flags);
 178         else
 179                 ret = btrfs_submit_metadata_bio(tree->private_data, bio,
 180                                                 mirror_num, bio_flags);
 181
 182         return blk_status_to_errno(ret);
 183 }
 184
 185 /* Cleanup unsubmitted bios */
 186 static void end_write_bio(struct extent_page_data *epd, int ret)
 187 {
 188         struct bio *bio = epd->bio_ctrl.bio;
 189
 190         if (bio) {
 191                 bio->bi_status = errno_to_blk_status(ret);
 192                 bio_endio(bio);
 193                 epd->bio_ctrl.bio = NULL;
 194         }
 195 }
 196
 197 /*
 198  * Submit bio from extent page data via submit_one_bio
 199  *
 200  * Return 0 if everything is OK.
 201  * Return <0 for error.
 202  */
 203 static int __must_check flush_write_bio(struct extent_page_data *epd)
 204 {
 205         int ret = 0;
 206         struct bio *bio = epd->bio_ctrl.bio;
 207
 208         if (bio) {
 209                 ret = submit_one_bio(bio, 0, 0);
 210                 /*
 211                  * Clean up of epd->bio is handled by its endio function.
 212                  * And endio is either triggered by successful bio execution
 213                  * or the error handler of submit bio hook.
 214                  * So at this point, no matter what happened, we don't need
 215                  * to clean up epd->bio.
 216                  */
 217                 epd->bio_ctrl.bio = NULL;
 218         }
 219         return ret;
 220 }
 221
 222 int __init extent_state_cache_init(void)
 223 {
 224         extent_state_cache = kmem_cache_create("btrfs_extent_state",
 225                         sizeof(struct extent_state), 0,
 226                         SLAB_MEM_SPREAD, NULL);
 227         if (!extent_state_cache)
 228                 return -ENOMEM;
 229         return 0;
 230 }
 231
 232 int __init extent_io_init(void)
 233 {
 234         extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
 235                         sizeof(struct extent_buffer), 0,
 236                         SLAB_MEM_SPREAD, NULL);
 237         if (!extent_buffer_cache)
 238                 return -ENOMEM;
 239
 240         if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
 241                         offsetof(struct btrfs_io_bio, bio),
 242                         BIOSET_NEED_BVECS))
 243                 goto free_buffer_cache;
 244
 245         if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
 246                 goto free_bioset;
 247
 248         return 0;
 249
 250 free_bioset:
 251         bioset_exit(&btrfs_bioset);
 252
 253 free_buffer_cache:
 254         kmem_cache_destroy(extent_buffer_cache);
 255         extent_buffer_cache = NULL;
 256         return -ENOMEM;
 257 }
 258
/*
 * Tear down the extent_state slab cache.  With CONFIG_BTRFS_DEBUG any
 * extent states still on the leak list are reported and freed first;
 * otherwise the leak check expands to nothing.
 */
void __cold extent_state_cache_exit(void)
{
        btrfs_extent_state_leak_debug_check();
        kmem_cache_destroy(extent_state_cache);
}
 264
/*
 * Undo extent_io_init(): destroy the extent buffer slab cache and release
 * the btrfs bio set.
 */
void __cold extent_io_exit(void)
{
        /*
         * Make sure all delayed rcu free are flushed before we
         * destroy caches.
         */
        rcu_barrier();
        kmem_cache_destroy(extent_buffer_cache);
        bioset_exit(&btrfs_bioset);
}
 275
/*
 * For the file_extent_tree we want to hold the inode lock while we look up
 * and update the disk_i_size.  Lockdep would complain about that, because
 * for the regular io_tree we hold the tree lock and then take the inode
 * lock when setting delalloc.  The two cases are unrelated, so give the
 * file_extent_tree its own lock class to keep the two locking patterns
 * from being mixed up.
 */
static struct lock_class_key file_extent_tree_class;
 284
 285 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
 286                          struct extent_io_tree *tree, unsigned int owner,
 287                          void *private_data)
 288 {
 289         tree->fs_info = fs_info;
 290         tree->state = RB_ROOT;
 291         tree->dirty_bytes = 0;
 292         spin_lock_init(&tree->lock);
 293         tree->private_data = private_data;
 294         tree->owner = owner;
 295         if (owner == IO_TREE_INODE_FILE_EXTENT)
 296                 lockdep_set_class(&tree->lock, &file_extent_tree_class);
 297 }
 298
 299 void extent_io_tree_release(struct extent_io_tree *tree)
 300 {
 301         spin_lock(&tree->lock);
 302         /*
 303          * Do a single barrier for the waitqueue_active check here, the state
 304          * of the waitqueue should not change once extent_io_tree_release is
 305          * called.
 306          */
 307         smp_mb();
 308         while (!RB_EMPTY_ROOT(&tree->state)) {
 309                 struct rb_node *node;
 310                 struct extent_state *state;
 311
 312                 node = rb_first(&tree->state);
 313                 state = rb_entry(node, struct extent_state, rb_node);
 314                 rb_erase(&state->rb_node, &tree->state);
 315                 RB_CLEAR_NODE(&state->rb_node);
 316                 /*
 317                  * btree io trees aren't supposed to have tasks waiting for
 318                  * changes in the flags of extent states ever.
 319                  */
 320                 ASSERT(!waitqueue_active(&state->wq));
 321                 free_extent_state(state);
 322
 323                 cond_resched_lock(&tree->lock);
 324         }
 325         spin_unlock(&tree->lock);
 326 }
 327
 328 static struct extent_state *alloc_extent_state(gfp_t mask)
 329 {
 330         struct extent_state *state;
 331
 332         /*
 333          * The given mask might be not appropriate for the slab allocator,
 334          * drop the unsupported bits
 335          */
 336         mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
 337         state = kmem_cache_alloc(extent_state_cache, mask);
 338         if (!state)
 339                 return state;
 340         state->state = 0;
 341         state->failrec = NULL;
 342         RB_CLEAR_NODE(&state->rb_node);
 343         btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
 344         refcount_set(&state->refs, 1);
 345         init_waitqueue_head(&state->wq);
 346         trace_alloc_extent_state(state, mask, _RET_IP_);
 347         return state;
 348 }
 349
 350 void free_extent_state(struct extent_state *state)
 351 {
 352         if (!state)
 353                 return;
 354         if (refcount_dec_and_test(&state->refs)) {
 355                 WARN_ON(extent_state_in_tree(state));
 356                 btrfs_leak_debug_del(&leak_lock, &state->leak_list);
 357                 trace_free_extent_state(state, _RET_IP_);
 358                 kmem_cache_free(extent_state_cache, state);
 359         }
 360 }
 361
 362 static struct rb_node *tree_insert(struct rb_root *root,
 363                                    struct rb_node *search_start,
 364                                    u64 offset,
 365                                    struct rb_node *node,
 366                                    struct rb_node ***p_in,
 367                                    struct rb_node **parent_in)
 368 {
 369         struct rb_node **p;
 370         struct rb_node *parent = NULL;
 371         struct tree_entry *entry;
 372
 373         if (p_in && parent_in) {
 374                 p = *p_in;
 375                 parent = *parent_in;
 376                 goto do_insert;
 377         }
 378
 379         p = search_start ? &search_start : &root->rb_node;
 380         while (*p) {
 381                 parent = *p;
 382                 entry = rb_entry(parent, struct tree_entry, rb_node);
 383
 384                 if (offset < entry->start)
 385                         p = &(*p)->rb_left;
 386                 else if (offset > entry->end)
 387                         p = &(*p)->rb_right;
 388                 else
 389                         return parent;
 390         }
 391
 392 do_insert:
 393         rb_link_node(node, parent, p);
 394         rb_insert_color(node, root);
 395         return NULL;
 396 }
 397
 398 /**
 399  * Search @tree for an entry that contains @offset. Such entry would have
 400  * entry->start <= offset && entry->end >= offset.
 401  *
 402  * @tree:       the tree to search
 403  * @offset:     offset that should fall within an entry in @tree
 404  * @next_ret:   pointer to the first entry whose range ends after @offset
 405  * @prev_ret:   pointer to the first entry whose range begins before @offset
 406  * @p_ret:      pointer where new node should be anchored (used when inserting an
 407  *              entry in the tree)
 408  * @parent_ret: points to entry which would have been the parent of the entry,
 409  *               containing @offset
 410  *
 411  * This function returns a pointer to the entry that contains @offset byte
 412  * address. If no such entry exists, then NULL is returned and the other
 413  * pointer arguments to the function are filled, otherwise the found entry is
 414  * returned and other pointers are left untouched.
 415  */
 416 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 417                                       struct rb_node **next_ret,
 418                                       struct rb_node **prev_ret,
 419                                       struct rb_node ***p_ret,
 420                                       struct rb_node **parent_ret)
 421 {
 422         struct rb_root *root = &tree->state;
 423         struct rb_node **n = &root->rb_node;
 424         struct rb_node *prev = NULL;
 425         struct rb_node *orig_prev = NULL;
 426         struct tree_entry *entry;
 427         struct tree_entry *prev_entry = NULL;
 428
 429         while (*n) {
 430                 prev = *n;
 431                 entry = rb_entry(prev, struct tree_entry, rb_node);
 432                 prev_entry = entry;
 433
 434                 if (offset < entry->start)
 435                         n = &(*n)->rb_left;
 436                 else if (offset > entry->end)
 437                         n = &(*n)->rb_right;
 438                 else
 439                         return *n;
 440         }
 441
 442         if (p_ret)
 443                 *p_ret = n;
 444         if (parent_ret)
 445                 *parent_ret = prev;
 446
 447         if (next_ret) {
 448                 orig_prev = prev;
 449                 while (prev && offset > prev_entry->end) {
 450                         prev = rb_next(prev);
 451                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 452                 }
 453                 *next_ret = prev;
 454                 prev = orig_prev;
 455         }
 456
 457         if (prev_ret) {
 458                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 459                 while (prev && offset < prev_entry->start) {
 460                         prev = rb_prev(prev);
 461                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 462                 }
 463                 *prev_ret = prev;
 464         }
 465         return NULL;
 466 }
 467
 468 static inline struct rb_node *
 469 tree_search_for_insert(struct extent_io_tree *tree,
 470                        u64 offset,
 471                        struct rb_node ***p_ret,
 472                        struct rb_node **parent_ret)
 473 {
 474         struct rb_node *next= NULL;
 475         struct rb_node *ret;
 476
 477         ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
 478         if (!ret)
 479                 return next;
 480         return ret;
 481 }
 482
/*
 * Lookup-only form of tree_search_for_insert(): returns the node containing
 * @offset or, if there is none, the following node (possibly NULL).  No
 * insertion anchor is requested.
 */
static inline struct rb_node *tree_search(struct extent_io_tree *tree,
                                          u64 offset)
{
        return tree_search_for_insert(tree, offset, NULL, NULL);
}
 488
 489 /*
 490  * utility function to look for merge candidates inside a given range.
 491  * Any extents with matching state are merged together into a single
 492  * extent in the tree.  Extents with EXTENT_IO in their state field
 493  * are not merged because the end_io handlers need to be able to do
 494  * operations on them without sleeping (or doing allocations/splits).
 495  *
 496  * This should be called with the tree lock held.
 497  */
 498 static void merge_state(struct extent_io_tree *tree,
 499                         struct extent_state *state)
 500 {
 501         struct extent_state *other;
 502         struct rb_node *other_node;
 503
 504         if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 505                 return;
 506
 507         other_node = rb_prev(&state->rb_node);
 508         if (other_node) {
 509                 other = rb_entry(other_node, struct extent_state, rb_node);
 510                 if (other->end == state->start - 1 &&
 511                     other->state == state->state) {
 512                         if (tree->private_data &&
 513                             is_data_inode(tree->private_data))
 514                                 btrfs_merge_delalloc_extent(tree->private_data,
 515                                                             state, other);
 516                         state->start = other->start;
 517                         rb_erase(&other->rb_node, &tree->state);
 518                         RB_CLEAR_NODE(&other->rb_node);
 519                         free_extent_state(other);
 520                 }
 521         }
 522         other_node = rb_next(&state->rb_node);
 523         if (other_node) {
 524                 other = rb_entry(other_node, struct extent_state, rb_node);
 525                 if (other->start == state->end + 1 &&
 526                     other->state == state->state) {
 527                         if (tree->private_data &&
 528                             is_data_inode(tree->private_data))
 529                                 btrfs_merge_delalloc_extent(tree->private_data,
 530                                                             state, other);
 531                         state->end = other->end;
 532                         rb_erase(&other->rb_node, &tree->state);
 533                         RB_CLEAR_NODE(&other->rb_node);
 534                         free_extent_state(other);
 535                 }
 536         }
 537 }
 538
/*
 * Forward declaration: insert_state() below calls this before its
 * definition appears in the file.
 */
static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state, u32 *bits,
                           struct extent_changeset *changeset);
 542
 543 /*
 544  * insert an extent_state struct into the tree.  'bits' are set on the
 545  * struct before it is inserted.
 546  *
 547  * This may return -EEXIST if the extent is already there, in which case the
 548  * state struct is freed.
 549  *
 550  * The tree lock is not taken internally.  This is a utility function and
 551  * probably isn't what you want to call (see set/clear_extent_bit).
 552  */
 553 static int insert_state(struct extent_io_tree *tree,
 554                         struct extent_state *state, u64 start, u64 end,
 555                         struct rb_node ***p,
 556                         struct rb_node **parent,
 557                         u32 *bits, struct extent_changeset *changeset)
 558 {
 559         struct rb_node *node;
 560
 561         if (end < start) {
 562                 btrfs_err(tree->fs_info,
 563                         "insert state: end < start %llu %llu", end, start);
 564                 WARN_ON(1);
 565         }
 566         state->start = start;
 567         state->end = end;
 568
 569         set_state_bits(tree, state, bits, changeset);
 570
 571         node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
 572         if (node) {
 573                 struct extent_state *found;
 574                 found = rb_entry(node, struct extent_state, rb_node);
 575                 btrfs_err(tree->fs_info,
 576                        "found node %llu %llu on insert of %llu %llu",
 577                        found->start, found->end, start, end);
 578                 return -EEXIST;
 579         }
 580         merge_state(tree, state);
 581         return 0;
 582 }
 583
 584 /*
 585  * split a given extent state struct in two, inserting the preallocated
 586  * struct 'prealloc' as the newly created second half.  'split' indicates an
 587  * offset inside 'orig' where it should be split.
 588  *
 589  * Before calling,
 590  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 591  * are two extent state structs in the tree:
 592  * prealloc: [orig->start, split - 1]
 593  * orig: [ split, orig->end ]
 594  *
 595  * The tree locks are not taken by this function. They need to be held
 596  * by the caller.
 597  */
 598 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 599                        struct extent_state *prealloc, u64 split)
 600 {
 601         struct rb_node *node;
 602
 603         if (tree->private_data && is_data_inode(tree->private_data))
 604                 btrfs_split_delalloc_extent(tree->private_data, orig, split);
 605
 606         prealloc->start = orig->start;
 607         prealloc->end = split - 1;
 608         prealloc->state = orig->state;
 609         orig->start = split;
 610
 611         node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
 612                            &prealloc->rb_node, NULL, NULL);
 613         if (node) {
 614                 free_extent_state(prealloc);
 615                 return -EEXIST;
 616         }
 617         return 0;
 618 }
 619
 620 static struct extent_state *next_state(struct extent_state *state)
 621 {
 622         struct rb_node *next = rb_next(&state->rb_node);
 623         if (next)
 624                 return rb_entry(next, struct extent_state, rb_node);
 625         else
 626                 return NULL;
 627 }
 628
 629 /*
 630  * utility function to clear some bits in an extent state struct.
 631  * it will optionally wake up anyone waiting on this state (wake == 1).
 632  *
 633  * If no bits are set on the state struct after clearing things, the
 634  * struct is freed and removed from the tree
 635  */
 636 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 637                                             struct extent_state *state,
 638                                             u32 *bits, int wake,
 639                                             struct extent_changeset *changeset)
 640 {
 641         struct extent_state *next;
 642         u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
 643         int ret;
 644
 645         if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 646                 u64 range = state->end - state->start + 1;
 647                 WARN_ON(range > tree->dirty_bytes);
 648                 tree->dirty_bytes -= range;
 649         }
 650
 651         if (tree->private_data && is_data_inode(tree->private_data))
 652                 btrfs_clear_delalloc_extent(tree->private_data, state, bits);
 653
 654         ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
 655         BUG_ON(ret < 0);
 656         state->state &= ~bits_to_clear;
 657         if (wake)
 658                 wake_up(&state->wq);
 659         if (state->state == 0) {
 660                 next = next_state(state);
 661                 if (extent_state_in_tree(state)) {
 662                         rb_erase(&state->rb_node, &tree->state);
 663                         RB_CLEAR_NODE(&state->rb_node);
 664                         free_extent_state(state);
 665                 } else {
 666                         WARN_ON(1);
 667                 }
 668         } else {
 669                 merge_state(tree, state);
 670                 next = next_state(state);
 671         }
 672         return next;
 673 }
 674
 675 static struct extent_state *
 676 alloc_extent_state_atomic(struct extent_state *prealloc)
 677 {
 678         if (!prealloc)
 679                 prealloc = alloc_extent_state(GFP_ATOMIC);
 680
 681         return prealloc;
 682 }
 683
/*
 * Report, through btrfs_panic(), that an insert/split on @tree failed with
 * @err even though the tree was supposed to be protected by its lock.
 */
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
        btrfs_panic(tree->fs_info, err,
        "locking error: extent tree was modified by another thread while locked");
}
 689
/*
 * Clear @bits on the inclusive range [@start, @end] of @tree.  Extent
 * states that straddle either end of the range are split first, so @mask
 * says which allocations (and whether sleeping) are allowed.
 *
 * @wake:         non-zero to wake up anyone sleeping on the affected states
 * @delete:       non-zero to clear every non-control bit, i.e. remove the
 *                range from the tree regardless of state (e.g. truncate)
 * @cached_state: optional hint; if it points at an in-tree state covering
 *                @start the tree search is skipped.  When EXTENT_LOCKED or
 *                EXTENT_BOUNDARY is among the bits being cleared, the
 *                caller's reference is dropped and *@cached_state is set to
 *                NULL.
 * @changeset:    optional record of the ranges whose bits really changed
 *
 * Clearing EXTENT_DELALLOC implicitly clears EXTENT_NORESERVE as well.
 *
 * This takes the tree lock.  As written every return path yields 0; a
 * failing split is reported through extent_io_tree_panic().
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                       u32 bits, int wake, int delete,
                       struct extent_state **cached_state,
                       gfp_t mask, struct extent_changeset *changeset)
{
        struct extent_state *state;
        struct extent_state *cached;
        struct extent_state *prealloc = NULL;
        struct rb_node *node;
        u64 last_end;
        int err;
        int clear = 0;

        btrfs_debug_check_extent_io_range(tree, start, end);
        trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

        if (bits & EXTENT_DELALLOC)
                bits |= EXTENT_NORESERVE;

        if (delete)
                bits |= ~EXTENT_CTLBITS;

        /* 'clear' means the caller's cached state reference gets consumed */
        if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
                clear = 1;
again:
        if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * An allocation failure here is not a problem: we may not
                 * need the preallocated extent state at all, which is the
                 * case when the extent states in the tree cover exactly our
                 * input range and nothing beyond it.  If a new extent state
                 * turns out to be needed it is allocated later.
                 */
                prealloc = alloc_extent_state(mask);
        }

        spin_lock(&tree->lock);
        if (cached_state) {
                cached = *cached_state;

                if (clear) {
                        *cached_state = NULL;
                        cached_state = NULL;
                }

                /* usable only if still in the tree and covering 'start' */
                if (cached && extent_state_in_tree(cached) &&
                    cached->start <= start && cached->end > start) {
                        if (clear)
                                refcount_dec(&cached->refs);
                        state = cached;
                        goto hit_next;
                }
                if (clear)
                        free_extent_state(cached);
        }
        /*
         * this search will find the extents that end after
         * our range starts
         */
        node = tree_search(tree, start);
        if (!node)
                goto out;
        state = rb_entry(node, struct extent_state, rb_node);
hit_next:
        if (state->start > end)
                goto out;
        WARN_ON(state->end < start);
        last_end = state->end;

        /* the state doesn't have the wanted bits, go ahead */
        if (!(state->state & bits)) {
                state = next_state(state);
                goto next;
        }

        /*
         *     | ---- desired range ---- |
         *  | state | or
         *  | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip
         * bits on second half.
         *
         * If the extent we found extends past our range, we
         * just split and search again.  It'll get split again
         * the next time though.
         *
         * If the extent we found is inside our range, we clear
         * the desired bit on it.
         */

        if (state->start < start) {
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
                if (err)
                        extent_io_tree_panic(tree, err);

                /* split_state() either linked prealloc into the tree or freed it */
                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
                        state = clear_state_bit(tree, state, &bits, wake,
                                                changeset);
                        goto next;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         * We need to split the extent, and clear the bit
         * on the first half
         */
        if (state->start <= end && state->end > end) {
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
                if (err)
                        extent_io_tree_panic(tree, err);

                if (wake)
                        wake_up(&state->wq);

                /* after the split prealloc is the half that ends at 'end' */
                clear_state_bit(tree, prealloc, &bits, wake, changeset);

                prealloc = NULL;
                goto out;
        }

        state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
        /* last_end + 1 would wrap around, so the whole range is done */
        if (last_end == (u64)-1)
                goto out;
        start = last_end + 1;
        /* keep walking under the lock unless a reschedule is pending */
        if (start <= end && state && !need_resched())
                goto hit_next;

search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
        if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;

out:
        spin_unlock(&tree->lock);
        if (prealloc)
                free_extent_state(prealloc);

        return 0;

}
 856
 857 static void wait_on_state(struct extent_io_tree *tree,
 858                           struct extent_state *state)
 859                 __releases(tree->lock)
 860                 __acquires(tree->lock)
 861 {
 862         DEFINE_WAIT(wait);
 863         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 864         spin_unlock(&tree->lock);
 865         schedule();
 866         spin_lock(&tree->lock);
 867         finish_wait(&state->wq, &wait);
 868 }
 869
 870 /*
 871  * waits for one or more bits to clear on a range in the state tree.
 872  * The range [start, end] is inclusive.
 873  * The tree lock is taken by this function
 874  */
 875 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 876                             u32 bits)
 877 {
 878         struct extent_state *state;
 879         struct rb_node *node;
 880
 881         btrfs_debug_check_extent_io_range(tree, start, end);
 882
 883         spin_lock(&tree->lock);
 884 again:
 885         while (1) {
 886                 /*
 887                  * this search will find all the extents that end after
 888                  * our range starts
 889                  */
 890                 node = tree_search(tree, start);
 891 process_node:
 892                 if (!node)
 893                         break;
 894
 895                 state = rb_entry(node, struct extent_state, rb_node);
 896
 897                 if (state->start > end)
 898                         goto out;
 899
 900                 if (state->state & bits) {
 901                         start = state->start;
 902                         refcount_inc(&state->refs);
 903                         wait_on_state(tree, state);
 904                         free_extent_state(state);
 905                         goto again;
 906                 }
 907                 start = state->end + 1;
 908
 909                 if (start > end)
 910                         break;
 911
 912                 if (!cond_resched_lock(&tree->lock)) {
 913                         node = rb_next(node);
 914                         goto process_node;
 915                 }
 916         }
 917 out:
 918         spin_unlock(&tree->lock);
 919 }
 920
 921 static void set_state_bits(struct extent_io_tree *tree,
 922                            struct extent_state *state,
 923                            u32 *bits, struct extent_changeset *changeset)
 924 {
 925         u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
 926         int ret;
 927
 928         if (tree->private_data && is_data_inode(tree->private_data))
 929                 btrfs_set_delalloc_extent(tree->private_data, state, bits);
 930
 931         if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 932                 u64 range = state->end - state->start + 1;
 933                 tree->dirty_bytes += range;
 934         }
 935         ret = add_extent_changeset(state, bits_to_set, changeset, 1);
 936         BUG_ON(ret < 0);
 937         state->state |= bits_to_set;
 938 }
 939
 940 static void cache_state_if_flags(struct extent_state *state,
 941                                  struct extent_state **cached_ptr,
 942                                  unsigned flags)
 943 {
 944         if (cached_ptr && !(*cached_ptr)) {
 945                 if (!flags || (state->state & flags)) {
 946                         *cached_ptr = state;
 947                         refcount_inc(&state->refs);
 948                 }
 949         }
 950 }
 951
 952 static void cache_state(struct extent_state *state,
 953                         struct extent_state **cached_ptr)
 954 {
 955         return cache_state_if_flags(state, cached_ptr,
 956                                     EXTENT_LOCKED | EXTENT_BOUNDARY);
 957 }
 958
 959 /*
 960  * set some bits on a range in the tree.  This may require allocations or
 961  * sleeping, so the gfp mask is used to indicate what is allowed.
 962  *
 963  * If any of the exclusive bits are set, this will fail with -EEXIST if some
 964  * part of the range already has the desired bits set.  The start of the
 965  * existing range is returned in failed_start in this case.
 966  *
 967  * [start, end] is inclusive This takes the tree lock.
 968  */
 969 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
 970                    u32 exclusive_bits, u64 *failed_start,
 971                    struct extent_state **cached_state, gfp_t mask,
 972                    struct extent_changeset *changeset)
 973 {
 974         struct extent_state *state;
 975         struct extent_state *prealloc = NULL;
 976         struct rb_node *node;
 977         struct rb_node **p;
 978         struct rb_node *parent;
 979         int err = 0;
 980         u64 last_start;
 981         u64 last_end;
 982
 983         btrfs_debug_check_extent_io_range(tree, start, end);
 984         trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
 985
 986         if (exclusive_bits)
 987                 ASSERT(failed_start);
 988         else
 989                 ASSERT(failed_start == NULL);
 990 again:
 991         if (!prealloc && gfpflags_allow_blocking(mask)) {
 992                 /*
 993                  * Don't care for allocation failure here because we might end
 994                  * up not needing the pre-allocated extent state at all, which
 995                  * is the case if we only have in the tree extent states that
 996                  * cover our input range and don't cover too any other range.
 997                  * If we end up needing a new extent state we allocate it later.
 998                  */
 999                 prealloc = alloc_extent_state(mask);
1000         }
1001
1002         spin_lock(&tree->lock);
1003         if (cached_state && *cached_state) {
1004                 state = *cached_state;
1005                 if (state->start <= start && state->end > start &&
1006                     extent_state_in_tree(state)) {
1007                         node = &state->rb_node;
1008                         goto hit_next;
1009                 }
1010         }
1011         /*
1012          * this search will find all the extents that end after
1013          * our range starts.
1014          */
1015         node = tree_search_for_insert(tree, start, &p, &parent);
1016         if (!node) {
1017                 prealloc = alloc_extent_state_atomic(prealloc);
1018                 BUG_ON(!prealloc);
1019                 err = insert_state(tree, prealloc, start, end,
1020                                    &p, &parent, &bits, changeset);
1021                 if (err)
1022                         extent_io_tree_panic(tree, err);
1023
1024                 cache_state(prealloc, cached_state);
1025                 prealloc = NULL;
1026                 goto out;
1027         }
1028         state = rb_entry(node, struct extent_state, rb_node);
1029 hit_next:
1030         last_start = state->start;
1031         last_end = state->end;
1032
1033         /*
1034          * | ---- desired range ---- |
1035          * | state |
1036          *
1037          * Just lock what we found and keep going
1038          */
1039         if (state->start == start && state->end <= end) {
1040                 if (state->state & exclusive_bits) {
1041                         *failed_start = state->start;
1042                         err = -EEXIST;
1043                         goto out;
1044                 }
1045
1046                 set_state_bits(tree, state, &bits, changeset);
1047                 cache_state(state, cached_state);
1048                 merge_state(tree, state);
1049                 if (last_end == (u64)-1)
1050                         goto out;
1051                 start = last_end + 1;
1052                 state = next_state(state);
1053                 if (start < end && state && state->start == start &&
1054                     !need_resched())
1055                         goto hit_next;
1056                 goto search_again;
1057         }
1058
1059         /*
1060          *     | ---- desired range ---- |
1061          * | state |
1062          *   or
1063          * | ------------- state -------------- |
1064          *
1065          * We need to split the extent we found, and may flip bits on
1066          * second half.
1067          *
1068          * If the extent we found extends past our
1069          * range, we just split and search again.  It'll get split
1070          * again the next time though.
1071          *
1072          * If the extent we found is inside our range, we set the
1073          * desired bit on it.
1074          */
1075         if (state->start < start) {
1076                 if (state->state & exclusive_bits) {
1077                         *failed_start = start;
1078                         err = -EEXIST;
1079                         goto out;
1080                 }
1081
1082                 /*
1083                  * If this extent already has all the bits we want set, then
1084                  * skip it, not necessary to split it or do anything with it.
1085                  */
1086                 if ((state->state & bits) == bits) {
1087                         start = state->end + 1;
1088                         cache_state(state, cached_state);
1089                         goto search_again;
1090                 }
1091
1092                 prealloc = alloc_extent_state_atomic(prealloc);
1093                 BUG_ON(!prealloc);
1094                 err = split_state(tree, state, prealloc, start);
1095                 if (err)
1096                         extent_io_tree_panic(tree, err);
1097
1098                 prealloc = NULL;
1099                 if (err)
1100                         goto out;
1101                 if (state->end <= end) {
1102                         set_state_bits(tree, state, &bits, changeset);
1103                         cache_state(state, cached_state);
1104                         merge_state(tree, state);
1105                         if (last_end == (u64)-1)
1106                                 goto out;
1107                         start = last_end + 1;
1108                         state = next_state(state);
1109                         if (start < end && state && state->start == start &&
1110                             !need_resched())
1111                                 goto hit_next;
1112                 }
1113                 goto search_again;
1114         }
1115         /*
1116          * | ---- desired range ---- |
1117          *     | state | or               | state |
1118          *
1119          * There's a hole, we need to insert something in it and
1120          * ignore the extent we found.
1121          */
1122         if (state->start > start) {
1123                 u64 this_end;
1124                 if (end < last_start)
1125                         this_end = end;
1126                 else
1127                         this_end = last_start - 1;
1128
1129                 prealloc = alloc_extent_state_atomic(prealloc);
1130                 BUG_ON(!prealloc);
1131
1132                 /*
1133                  * Avoid to free 'prealloc' if it can be merged with
1134                  * the later extent.
1135                  */
1136                 err = insert_state(tree, prealloc, start, this_end,
1137                                    NULL, NULL, &bits, changeset);
1138                 if (err)
1139                         extent_io_tree_panic(tree, err);
1140
1141                 cache_state(prealloc, cached_state);
1142                 prealloc = NULL;
1143                 start = this_end + 1;
1144                 goto search_again;
1145         }
1146         /*
1147          * | ---- desired range ---- |
1148          *                        | state |
1149          * We need to split the extent, and set the bit
1150          * on the first half
1151          */
1152         if (state->start <= end && state->end > end) {
1153                 if (state->state & exclusive_bits) {
1154                         *failed_start = start;
1155                         err = -EEXIST;
1156                         goto out;
1157                 }
1158
1159                 prealloc = alloc_extent_state_atomic(prealloc);
1160                 BUG_ON(!prealloc);
1161                 err = split_state(tree, state, prealloc, end + 1);
1162                 if (err)
1163                         extent_io_tree_panic(tree, err);
1164
1165                 set_state_bits(tree, prealloc, &bits, changeset);
1166                 cache_state(prealloc, cached_state);
1167                 merge_state(tree, prealloc);
1168                 prealloc = NULL;
1169                 goto out;
1170         }
1171
1172 search_again:
1173         if (start > end)
1174                 goto out;
1175         spin_unlock(&tree->lock);
1176         if (gfpflags_allow_blocking(mask))
1177                 cond_resched();
1178         goto again;
1179
1180 out:
1181         spin_unlock(&tree->lock);
1182         if (prealloc)
1183                 free_extent_state(prealloc);
1184
1185         return err;
1186
1187 }
1188
1189 /**
1190  * convert_extent_bit - convert all bits in a given range from one bit to
1191  *                      another
1192  * @tree:       the io tree to search
1193  * @start:      the start offset in bytes
1194  * @end:        the end offset in bytes (inclusive)
1195  * @bits:       the bits to set in this range
1196  * @clear_bits: the bits to clear in this range
1197  * @cached_state:       state that we're going to cache
1198  *
1199  * This will go through and set bits for the given range.  If any states exist
1200  * already in this range they are set with the given bit and cleared of the
1201  * clear_bits.  This is only meant to be used by things that are mergeable, ie
1202  * converting from say DELALLOC to DIRTY.  This is not meant to be used with
1203  * boundary bits like LOCK.
1204  *
1205  * All allocations are done with GFP_NOFS.
1206  */
1207 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1208                        u32 bits, u32 clear_bits,
1209                        struct extent_state **cached_state)
1210 {
1211         struct extent_state *state;
1212         struct extent_state *prealloc = NULL;
1213         struct rb_node *node;
1214         struct rb_node **p;
1215         struct rb_node *parent;
1216         int err = 0;
1217         u64 last_start;
1218         u64 last_end;
1219         bool first_iteration = true;
1220
1221         btrfs_debug_check_extent_io_range(tree, start, end);
1222         trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
1223                                        clear_bits);
1224
1225 again:
1226         if (!prealloc) {
1227                 /*
1228                  * Best effort, don't worry if extent state allocation fails
1229                  * here for the first iteration. We might have a cached state
1230                  * that matches exactly the target range, in which case no
1231                  * extent state allocations are needed. We'll only know this
1232                  * after locking the tree.
1233                  */
1234                 prealloc = alloc_extent_state(GFP_NOFS);
1235                 if (!prealloc && !first_iteration)
1236                         return -ENOMEM;
1237         }
1238
1239         spin_lock(&tree->lock);
1240         if (cached_state && *cached_state) {
1241                 state = *cached_state;
1242                 if (state->start <= start && state->end > start &&
1243                     extent_state_in_tree(state)) {
1244                         node = &state->rb_node;
1245                         goto hit_next;
1246                 }
1247         }
1248
1249         /*
1250          * this search will find all the extents that end after
1251          * our range starts.
1252          */
1253         node = tree_search_for_insert(tree, start, &p, &parent);
1254         if (!node) {
1255                 prealloc = alloc_extent_state_atomic(prealloc);
1256                 if (!prealloc) {
1257                         err = -ENOMEM;
1258                         goto out;
1259                 }
1260                 err = insert_state(tree, prealloc, start, end,
1261                                    &p, &parent, &bits, NULL);
1262                 if (err)
1263                         extent_io_tree_panic(tree, err);
1264                 cache_state(prealloc, cached_state);
1265                 prealloc = NULL;
1266                 goto out;
1267         }
1268         state = rb_entry(node, struct extent_state, rb_node);
1269 hit_next:
1270         last_start = state->start;
1271         last_end = state->end;
1272
1273         /*
1274          * | ---- desired range ---- |
1275          * | state |
1276          *
1277          * Just lock what we found and keep going
1278          */
1279         if (state->start == start && state->end <= end) {
1280                 set_state_bits(tree, state, &bits, NULL);
1281                 cache_state(state, cached_state);
1282                 state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
1283                 if (last_end == (u64)-1)
1284                         goto out;
1285                 start = last_end + 1;
1286                 if (start < end && state && state->start == start &&
1287                     !need_resched())
1288                         goto hit_next;
1289                 goto search_again;
1290         }
1291
1292         /*
1293          *     | ---- desired range ---- |
1294          * | state |
1295          *   or
1296          * | ------------- state -------------- |
1297          *
1298          * We need to split the extent we found, and may flip bits on
1299          * second half.
1300          *
1301          * If the extent we found extends past our
1302          * range, we just split and search again.  It'll get split
1303          * again the next time though.
1304          *
1305          * If the extent we found is inside our range, we set the
1306          * desired bit on it.
1307          */
1308         if (state->start < start) {
1309                 prealloc = alloc_extent_state_atomic(prealloc);
1310                 if (!prealloc) {
1311                         err = -ENOMEM;
1312                         goto out;
1313                 }
1314                 err = split_state(tree, state, prealloc, start);
1315                 if (err)
1316                         extent_io_tree_panic(tree, err);
1317                 prealloc = NULL;
1318                 if (err)
1319                         goto out;
1320                 if (state->end <= end) {
1321                         set_state_bits(tree, state, &bits, NULL);
1322                         cache_state(state, cached_state);
1323                         state = clear_state_bit(tree, state, &clear_bits, 0,
1324                                                 NULL);
1325                         if (last_end == (u64)-1)
1326                                 goto out;
1327                         start = last_end + 1;
1328                         if (start < end && state && state->start == start &&
1329                             !need_resched())
1330                                 goto hit_next;
1331                 }
1332                 goto search_again;
1333         }
1334         /*
1335          * | ---- desired range ---- |
1336          *     | state | or               | state |
1337          *
1338          * There's a hole, we need to insert something in it and
1339          * ignore the extent we found.
1340          */
1341         if (state->start > start) {
1342                 u64 this_end;
1343                 if (end < last_start)
1344                         this_end = end;
1345                 else
1346                         this_end = last_start - 1;
1347
1348                 prealloc = alloc_extent_state_atomic(prealloc);
1349                 if (!prealloc) {
1350                         err = -ENOMEM;
1351                         goto out;
1352                 }
1353
1354                 /*
1355                  * Avoid to free 'prealloc' if it can be merged with
1356                  * the later extent.
1357                  */
1358                 err = insert_state(tree, prealloc, start, this_end,
1359                                    NULL, NULL, &bits, NULL);
1360                 if (err)
1361                         extent_io_tree_panic(tree, err);
1362                 cache_state(prealloc, cached_state);
1363                 prealloc = NULL;
1364                 start = this_end + 1;
1365                 goto search_again;
1366         }
1367         /*
1368          * | ---- desired range ---- |
1369          *                        | state |
1370          * We need to split the extent, and set the bit
1371          * on the first half
1372          */
1373         if (state->start <= end && state->end > end) {
1374                 prealloc = alloc_extent_state_atomic(prealloc);
1375                 if (!prealloc) {
1376                         err = -ENOMEM;
1377                         goto out;
1378                 }
1379
1380                 err = split_state(tree, state, prealloc, end + 1);
1381                 if (err)
1382                         extent_io_tree_panic(tree, err);
1383
1384                 set_state_bits(tree, prealloc, &bits, NULL);
1385                 cache_state(prealloc, cached_state);
1386                 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
1387                 prealloc = NULL;
1388                 goto out;
1389         }
1390
1391 search_again:
1392         if (start > end)
1393                 goto out;
1394         spin_unlock(&tree->lock);
1395         cond_resched();
1396         first_iteration = false;
1397         goto again;
1398
1399 out:
1400         spin_unlock(&tree->lock);
1401         if (prealloc)
1402                 free_extent_state(prealloc);
1403
1404         return err;
1405 }
1406
1407 /* wrappers around set/clear extent bit */
1408 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1409                            u32 bits, struct extent_changeset *changeset)
1410 {
1411         /*
1412          * We don't support EXTENT_LOCKED yet, as current changeset will
1413          * record any bits changed, so for EXTENT_LOCKED case, it will
1414          * either fail with -EEXIST or changeset will record the whole
1415          * range.
1416          */
1417         BUG_ON(bits & EXTENT_LOCKED);
1418
1419         return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1420                               changeset);
1421 }
1422
1423 int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
1424                            u32 bits)
1425 {
1426         return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1427                               GFP_NOWAIT, NULL);
1428 }
1429
1430 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1431                      u32 bits, int wake, int delete,
1432                      struct extent_state **cached)
1433 {
1434         return __clear_extent_bit(tree, start, end, bits, wake, delete,
1435                                   cached, GFP_NOFS, NULL);
1436 }
1437
1438 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1439                 u32 bits, struct extent_changeset *changeset)
1440 {
1441         /*
1442          * Don't support EXTENT_LOCKED case, same reason as
1443          * set_record_extent_bits().
1444          */
1445         BUG_ON(bits & EXTENT_LOCKED);
1446
1447         return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1448                                   changeset);
1449 }
1450
1451 /*
1452  * either insert or lock state struct between start and end use mask to tell
1453  * us if waiting is desired.
1454  */
1455 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1456                      struct extent_state **cached_state)
1457 {
1458         int err;
1459         u64 failed_start;
1460
1461         while (1) {
1462                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
1463                                      EXTENT_LOCKED, &failed_start,
1464                                      cached_state, GFP_NOFS, NULL);
1465                 if (err == -EEXIST) {
1466                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1467                         start = failed_start;
1468                 } else
1469                         break;
1470                 WARN_ON(start > end);
1471         }
1472         return err;
1473 }
1474
1475 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1476 {
1477         int err;
1478         u64 failed_start;
1479
1480         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1481                              &failed_start, NULL, GFP_NOFS, NULL);
1482         if (err == -EEXIST) {
1483                 if (failed_start > start)
1484                         clear_extent_bit(tree, start, failed_start - 1,
1485                                          EXTENT_LOCKED, 1, 0, NULL);
1486                 return 0;
1487         }
1488         return 1;
1489 }
1490
1491 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1492 {
1493         unsigned long index = start >> PAGE_SHIFT;
1494         unsigned long end_index = end >> PAGE_SHIFT;
1495         struct page *page;
1496
1497         while (index <= end_index) {
1498                 page = find_get_page(inode->i_mapping, index);
1499                 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1500                 clear_page_dirty_for_io(page);
1501                 put_page(page);
1502                 index++;
1503         }
1504 }
1505
1506 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1507 {
1508         unsigned long index = start >> PAGE_SHIFT;
1509         unsigned long end_index = end >> PAGE_SHIFT;
1510         struct page *page;
1511
1512         while (index <= end_index) {
1513                 page = find_get_page(inode->i_mapping, index);
1514                 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1515                 __set_page_dirty_nobuffers(page);
1516                 account_page_redirty(page);
1517                 put_page(page);
1518                 index++;
1519         }
1520 }
1521
1522 /* find the first state struct with 'bits' set after 'start', and
1523  * return it.  tree->lock must be held.  NULL will returned if
1524  * nothing was found after 'start'
1525  */
1526 static struct extent_state *
1527 find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
1528 {
1529         struct rb_node *node;
1530         struct extent_state *state;
1531
1532         /*
1533          * this search will find all the extents that end after
1534          * our range starts.
1535          */
1536         node = tree_search(tree, start);
1537         if (!node)
1538                 goto out;
1539
1540         while (1) {
1541                 state = rb_entry(node, struct extent_state, rb_node);
1542                 if (state->end >= start && (state->state & bits))
1543                         return state;
1544
1545                 node = rb_next(node);
1546                 if (!node)
1547                         break;
1548         }
1549 out:
1550         return NULL;
1551 }
1552
1553 /*
1554  * Find the first offset in the io tree with one or more @bits set.
1555  *
1556  * Note: If there are multiple bits set in @bits, any of them will match.
1557  *
1558  * Return 0 if we find something, and update @start_ret and @end_ret.
1559  * Return 1 if we found nothing.
1560  */
1561 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1562                           u64 *start_ret, u64 *end_ret, u32 bits,
1563                           struct extent_state **cached_state)
1564 {
1565         struct extent_state *state;
1566         int ret = 1;
1567
1568         spin_lock(&tree->lock);
1569         if (cached_state && *cached_state) {
1570                 state = *cached_state;
1571                 if (state->end == start - 1 && extent_state_in_tree(state)) {
1572                         while ((state = next_state(state)) != NULL) {
1573                                 if (state->state & bits)
1574                                         goto got_it;
1575                         }
1576                         free_extent_state(*cached_state);
1577                         *cached_state = NULL;
1578                         goto out;
1579                 }
1580                 free_extent_state(*cached_state);
1581                 *cached_state = NULL;
1582         }
1583
1584         state = find_first_extent_bit_state(tree, start, bits);
1585 got_it:
1586         if (state) {
1587                 cache_state_if_flags(state, cached_state, 0);
1588                 *start_ret = state->start;
1589                 *end_ret = state->end;
1590                 ret = 0;
1591         }
1592 out:
1593         spin_unlock(&tree->lock);
1594         return ret;
1595 }
1596
1597 /**
1598  * Find a contiguous area of bits
1599  *
1600  * @tree:      io tree to check
1601  * @start:     offset to start the search from
1602  * @start_ret: the first offset we found with the bits set
1603  * @end_ret:   the final contiguous range of the bits that were set
1604  * @bits:      bits to look for
1605  *
1606  * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1607  * to set bits appropriately, and then merge them again.  During this time it
1608  * will drop the tree->lock, so use this helper if you want to find the actual
1609  * contiguous area for given bits.  We will search to the first bit we find, and
1610  * then walk down the tree until we find a non-contiguous area.  The area
1611  * returned will be the full contiguous area with the bits set.
1612  */
1613 int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
1614                                u64 *start_ret, u64 *end_ret, u32 bits)
1615 {
1616         struct extent_state *state;
1617         int ret = 1;
1618
1619         spin_lock(&tree->lock);
1620         state = find_first_extent_bit_state(tree, start, bits);
1621         if (state) {
1622                 *start_ret = state->start;
1623                 *end_ret = state->end;
1624                 while ((state = next_state(state)) != NULL) {
1625                         if (state->start > (*end_ret + 1))
1626                                 break;
1627                         *end_ret = state->end;
1628                 }
1629                 ret = 0;
1630         }
1631         spin_unlock(&tree->lock);
1632         return ret;
1633 }
1634
1635 /**
1636  * Find the first range that has @bits not set. This range could start before
1637  * @start.
1638  *
1639  * @tree:      the tree to search
1640  * @start:     offset at/after which the found extent should start
1641  * @start_ret: records the beginning of the range
1642  * @end_ret:   records the end of the range (inclusive)
1643  * @bits:      the set of bits which must be unset
1644  *
1645  * Since unallocated range is also considered one which doesn't have the bits
1646  * set it's possible that @end_ret contains -1, this happens in case the range
1647  * spans (last_range_end, end of device]. In this case it's up to the caller to
1648  * trim @end_ret to the appropriate size.
1649  */
1650 void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
1651                                  u64 *start_ret, u64 *end_ret, u32 bits)
1652 {
1653         struct extent_state *state;
1654         struct rb_node *node, *prev = NULL, *next;
1655
1656         spin_lock(&tree->lock);
1657
1658         /* Find first extent with bits cleared */
1659         while (1) {
1660                 node = __etree_search(tree, start, &next, &prev, NULL, NULL);
1661                 if (!node && !next && !prev) {
1662                         /*
1663                          * Tree is completely empty, send full range and let
1664                          * caller deal with it
1665                          */
1666                         *start_ret = 0;
1667                         *end_ret = -1;
1668                         goto out;
1669                 } else if (!node && !next) {
1670                         /*
1671                          * We are past the last allocated chunk, set start at
1672                          * the end of the last extent.
1673                          */
1674                         state = rb_entry(prev, struct extent_state, rb_node);
1675                         *start_ret = state->end + 1;
1676                         *end_ret = -1;
1677                         goto out;
1678                 } else if (!node) {
1679                         node = next;
1680                 }
1681                 /*
1682                  * At this point 'node' either contains 'start' or start is
1683                  * before 'node'
1684                  */
1685                 state = rb_entry(node, struct extent_state, rb_node);
1686
1687                 if (in_range(start, state->start, state->end - state->start + 1)) {
1688                         if (state->state & bits) {
1689                                 /*
1690                                  * |--range with bits sets--|
1691                                  *    |
1692                                  *    start
1693                                  */
1694                                 start = state->end + 1;
1695                         } else {
1696                                 /*
1697                                  * 'start' falls within a range that doesn't
1698                                  * have the bits set, so take its start as
1699                                  * the beginning of the desired range
1700                                  *
1701                                  * |--range with bits cleared----|
1702                                  *      |
1703                                  *      start
1704                                  */
1705                                 *start_ret = state->start;
1706                                 break;
1707                         }
1708                 } else {
1709                         /*
1710                          * |---prev range---|---hole/unset---|---node range---|
1711                          *                          |
1712                          *                        start
1713                          *
1714                          *                        or
1715                          *
1716                          * |---hole/unset--||--first node--|
1717                          * 0   |
1718                          *    start
1719                          */
1720                         if (prev) {
1721                                 state = rb_entry(prev, struct extent_state,
1722                                                  rb_node);
1723                                 *start_ret = state->end + 1;
1724                         } else {
1725                                 *start_ret = 0;
1726                         }
1727                         break;
1728                 }
1729         }
1730
1731         /*
1732          * Find the longest stretch from start until an entry which has the
1733          * bits set
1734          */
1735         while (1) {
1736                 state = rb_entry(node, struct extent_state, rb_node);
1737                 if (state->end >= start && !(state->state & bits)) {
1738                         *end_ret = state->end;
1739                 } else {
1740                         *end_ret = state->start - 1;
1741                         break;
1742                 }
1743
1744                 node = rb_next(node);
1745                 if (!node)
1746                         break;
1747         }
1748 out:
1749         spin_unlock(&tree->lock);
1750 }
1751
1752 /*
1753  * find a contiguous range of bytes in the file marked as delalloc, not
1754  * more than 'max_bytes'.  start and end are used to return the range,
1755  *
1756  * true is returned if we find something, false if nothing was in the tree
1757  */
1758 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1759                                u64 *end, u64 max_bytes,
1760                                struct extent_state **cached_state)
1761 {
1762         struct rb_node *node;
1763         struct extent_state *state;
1764         u64 cur_start = *start;
1765         bool found = false;
1766         u64 total_bytes = 0;
1767
1768         spin_lock(&tree->lock);
1769
1770         /*
1771          * this search will find all the extents that end after
1772          * our range starts.
1773          */
1774         node = tree_search(tree, cur_start);
1775         if (!node) {
1776                 *end = (u64)-1;
1777                 goto out;
1778         }
1779
1780         while (1) {
1781                 state = rb_entry(node, struct extent_state, rb_node);
1782                 if (found && (state->start != cur_start ||
1783                               (state->state & EXTENT_BOUNDARY))) {
1784                         goto out;
1785                 }
1786                 if (!(state->state & EXTENT_DELALLOC)) {
1787                         if (!found)
1788                                 *end = state->end;
1789                         goto out;
1790                 }
1791                 if (!found) {
1792                         *start = state->start;
1793                         *cached_state = state;
1794                         refcount_inc(&state->refs);
1795                 }
1796                 found = true;
1797                 *end = state->end;
1798                 cur_start = state->end + 1;
1799                 node = rb_next(node);
1800                 total_bytes += state->end - state->start + 1;
1801                 if (total_bytes >= max_bytes)
1802                         break;
1803                 if (!node)
1804                         break;
1805         }
1806 out:
1807         spin_unlock(&tree->lock);
1808         return found;
1809 }
1810
1811 static int __process_pages_contig(struct address_space *mapping,
1812                                   struct page *locked_page,
1813                                   pgoff_t start_index, pgoff_t end_index,
1814                                   unsigned long page_ops, pgoff_t *index_ret);
1815
1816 static noinline void __unlock_for_delalloc(struct inode *inode,
1817                                            struct page *locked_page,
1818                                            u64 start, u64 end)
1819 {
1820         unsigned long index = start >> PAGE_SHIFT;
1821         unsigned long end_index = end >> PAGE_SHIFT;
1822
1823         ASSERT(locked_page);
1824         if (index == locked_page->index && end_index == index)
1825                 return;
1826
1827         __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1828                                PAGE_UNLOCK, NULL);
1829 }
1830
1831 static noinline int lock_delalloc_pages(struct inode *inode,
1832                                         struct page *locked_page,
1833                                         u64 delalloc_start,
1834                                         u64 delalloc_end)
1835 {
1836         unsigned long index = delalloc_start >> PAGE_SHIFT;
1837         unsigned long index_ret = index;
1838         unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1839         int ret;
1840
1841         ASSERT(locked_page);
1842         if (index == locked_page->index && index == end_index)
1843                 return 0;
1844
1845         ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1846                                      end_index, PAGE_LOCK, &index_ret);
1847         if (ret == -EAGAIN)
1848                 __unlock_for_delalloc(inode, locked_page, delalloc_start,
1849                                       (u64)index_ret << PAGE_SHIFT);
1850         return ret;
1851 }
1852
1853 /*
1854  * Find and lock a contiguous range of bytes in the file marked as delalloc, no
1855  * more than @max_bytes.  @Start and @end are used to return the range,
1856  *
1857  * Return: true if we find something
1858  *         false if nothing was in the tree
1859  */
1860 EXPORT_FOR_TESTS
1861 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
1862                                     struct page *locked_page, u64 *start,
1863                                     u64 *end)
1864 {
1865         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1866         u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
1867         u64 delalloc_start;
1868         u64 delalloc_end;
1869         bool found;
1870         struct extent_state *cached_state = NULL;
1871         int ret;
1872         int loops = 0;
1873
1874 again:
1875         /* step one, find a bunch of delalloc bytes starting at start */
1876         delalloc_start = *start;
1877         delalloc_end = 0;
1878         found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1879                                           max_bytes, &cached_state);
1880         if (!found || delalloc_end <= *start) {
1881                 *start = delalloc_start;
1882                 *end = delalloc_end;
1883                 free_extent_state(cached_state);
1884                 return false;
1885         }
1886
1887         /*
1888          * start comes from the offset of locked_page.  We have to lock
1889          * pages in order, so we can't process delalloc bytes before
1890          * locked_page
1891          */
1892         if (delalloc_start < *start)
1893                 delalloc_start = *start;
1894
1895         /*
1896          * make sure to limit the number of pages we try to lock down
1897          */
1898         if (delalloc_end + 1 - delalloc_start > max_bytes)
1899                 delalloc_end = delalloc_start + max_bytes - 1;
1900
1901         /* step two, lock all the pages after the page that has start */
1902         ret = lock_delalloc_pages(inode, locked_page,
1903                                   delalloc_start, delalloc_end);
1904         ASSERT(!ret || ret == -EAGAIN);
1905         if (ret == -EAGAIN) {
1906                 /* some of the pages are gone, lets avoid looping by
1907                  * shortening the size of the delalloc range we're searching
1908                  */
1909                 free_extent_state(cached_state);
1910                 cached_state = NULL;
1911                 if (!loops) {
1912                         max_bytes = PAGE_SIZE;
1913                         loops = 1;
1914                         goto again;
1915                 } else {
1916                         found = false;
1917                         goto out_failed;
1918                 }
1919         }
1920
1921         /* step three, lock the state bits for the whole range */
1922         lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
1923
1924         /* then test to make sure it is all still delalloc */
1925         ret = test_range_bit(tree, delalloc_start, delalloc_end,
1926                              EXTENT_DELALLOC, 1, cached_state);
1927         if (!ret) {
1928                 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1929                                      &cached_state);
1930                 __unlock_for_delalloc(inode, locked_page,
1931                               delalloc_start, delalloc_end);
1932                 cond_resched();
1933                 goto again;
1934         }
1935         free_extent_state(cached_state);
1936         *start = delalloc_start;
1937         *end = delalloc_end;
1938 out_failed:
1939         return found;
1940 }
1941
1942 static int __process_pages_contig(struct address_space *mapping,
1943                                   struct page *locked_page,
1944                                   pgoff_t start_index, pgoff_t end_index,
1945                                   unsigned long page_ops, pgoff_t *index_ret)
1946 {
1947         unsigned long nr_pages = end_index - start_index + 1;
1948         unsigned long pages_processed = 0;
1949         pgoff_t index = start_index;
1950         struct page *pages[16];
1951         unsigned ret;
1952         int err = 0;
1953         int i;
1954
1955         if (page_ops & PAGE_LOCK) {
1956                 ASSERT(page_ops == PAGE_LOCK);
1957                 ASSERT(index_ret && *index_ret == start_index);
1958         }
1959
1960         if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1961                 mapping_set_error(mapping, -EIO);
1962
1963         while (nr_pages > 0) {
1964                 ret = find_get_pages_contig(mapping, index,
1965                                      min_t(unsigned long,
1966                                      nr_pages, ARRAY_SIZE(pages)), pages);
1967                 if (ret == 0) {
1968                         /*
1969                          * Only if we're going to lock these pages,
1970                          * can we find nothing at @index.
1971                          */
1972                         ASSERT(page_ops & PAGE_LOCK);
1973                         err = -EAGAIN;
1974                         goto out;
1975                 }
1976
1977                 for (i = 0; i < ret; i++) {
1978                         if (page_ops & PAGE_SET_PRIVATE2)
1979                                 SetPagePrivate2(pages[i]);
1980
1981                         if (locked_page && pages[i] == locked_page) {
1982                                 put_page(pages[i]);
1983                                 pages_processed++;
1984                                 continue;
1985                         }
1986                         if (page_ops & PAGE_START_WRITEBACK) {
1987                                 clear_page_dirty_for_io(pages[i]);
1988                                 set_page_writeback(pages[i]);
1989                         }
1990                         if (page_ops & PAGE_SET_ERROR)
1991                                 SetPageError(pages[i]);
1992                         if (page_ops & PAGE_END_WRITEBACK)
1993                                 end_page_writeback(pages[i]);
1994                         if (page_ops & PAGE_UNLOCK)
1995                                 unlock_page(pages[i]);
1996                         if (page_ops & PAGE_LOCK) {
1997                                 lock_page(pages[i]);
1998                                 if (!PageDirty(pages[i]) ||
1999                                     pages[i]->mapping != mapping) {
2000                                         unlock_page(pages[i]);
2001                                         for (; i < ret; i++)
2002                                                 put_page(pages[i]);
2003                                         err = -EAGAIN;
2004                                         goto out;
2005                                 }
2006                         }
2007                         put_page(pages[i]);
2008                         pages_processed++;
2009                 }
2010                 nr_pages -= ret;
2011                 index += ret;
2012                 cond_resched();
2013         }
2014 out:
2015         if (err && index_ret)
2016                 *index_ret = start_index + pages_processed - 1;
2017         return err;
2018 }
2019
2020 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2021                                   struct page *locked_page,
2022                                   u32 clear_bits, unsigned long page_ops)
2023 {
2024         clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
2025
2026         __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
2027                                start >> PAGE_SHIFT, end >> PAGE_SHIFT,
2028                                page_ops, NULL);
2029 }
2030
2031 /*
2032  * count the number of bytes in the tree that have a given bit(s)
2033  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
2034  * cached.  The total number found is returned.
2035  */
2036 u64 count_range_bits(struct extent_io_tree *tree,
2037                      u64 *start, u64 search_end, u64 max_bytes,
2038                      u32 bits, int contig)
2039 {
2040         struct rb_node *node;
2041         struct extent_state *state;
2042         u64 cur_start = *start;
2043         u64 total_bytes = 0;
2044         u64 last = 0;
2045         int found = 0;
2046
2047         if (WARN_ON(search_end <= cur_start))
2048                 return 0;
2049
2050         spin_lock(&tree->lock);
2051         if (cur_start == 0 && bits == EXTENT_DIRTY) {
2052                 total_bytes = tree->dirty_bytes;
2053                 goto out;
2054         }
2055         /*
2056          * this search will find all the extents that end after
2057          * our range starts.
2058          */
2059         node = tree_search(tree, cur_start);
2060         if (!node)
2061                 goto out;
2062
2063         while (1) {
2064                 state = rb_entry(node, struct extent_state, rb_node);
2065                 if (state->start > search_end)
2066                         break;
2067                 if (contig && found && state->start > last + 1)
2068                         break;
2069                 if (state->end >= cur_start && (state->state & bits) == bits) {
2070                         total_bytes += min(search_end, state->end) + 1 -
2071                                        max(cur_start, state->start);
2072                         if (total_bytes >= max_bytes)
2073                                 break;
2074                         if (!found) {
2075                                 *start = max(cur_start, state->start);
2076                                 found = 1;
2077                         }
2078                         last = state->end;
2079                 } else if (contig && found) {
2080                         break;
2081                 }
2082                 node = rb_next(node);
2083                 if (!node)
2084                         break;
2085         }
2086 out:
2087         spin_unlock(&tree->lock);
2088         return total_bytes;
2089 }
2090
2091 /*
2092  * set the private field for a given byte offset in the tree.  If there isn't
2093  * an extent_state there already, this does nothing.
2094  */
2095 int set_state_failrec(struct extent_io_tree *tree, u64 start,
2096                       struct io_failure_record *failrec)
2097 {
2098         struct rb_node *node;
2099         struct extent_state *state;
2100         int ret = 0;
2101
2102         spin_lock(&tree->lock);
2103         /*
2104          * this search will find all the extents that end after
2105          * our range starts.
2106          */
2107         node = tree_search(tree, start);
2108         if (!node) {
2109                 ret = -ENOENT;
2110                 goto out;
2111         }
2112         state = rb_entry(node, struct extent_state, rb_node);
2113         if (state->start != start) {
2114                 ret = -ENOENT;
2115                 goto out;
2116         }
2117         state->failrec = failrec;
2118 out:
2119         spin_unlock(&tree->lock);
2120         return ret;
2121 }
2122
2123 struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
2124 {
2125         struct rb_node *node;
2126         struct extent_state *state;
2127         struct io_failure_record *failrec;
2128
2129         spin_lock(&tree->lock);
2130         /*
2131          * this search will find all the extents that end after
2132          * our range starts.
2133          */
2134         node = tree_search(tree, start);
2135         if (!node) {
2136                 failrec = ERR_PTR(-ENOENT);
2137                 goto out;
2138         }
2139         state = rb_entry(node, struct extent_state, rb_node);
2140         if (state->start != start) {
2141                 failrec = ERR_PTR(-ENOENT);
2142                 goto out;
2143         }
2144
2145         failrec = state->failrec;
2146 out:
2147         spin_unlock(&tree->lock);
2148         return failrec;
2149 }
2150
2151 /*
2152  * searches a range in the state tree for a given mask.
2153  * If 'filled' == 1, this returns 1 only if every extent in the tree
2154  * has the bits set.  Otherwise, 1 is returned if any bit in the
2155  * range is found set.
2156  */
2157 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
2158                    u32 bits, int filled, struct extent_state *cached)
2159 {
2160         struct extent_state *state = NULL;
2161         struct rb_node *node;
2162         int bitset = 0;
2163
2164         spin_lock(&tree->lock);
2165         if (cached && extent_state_in_tree(cached) && cached->start <= start &&
2166             cached->end > start)
2167                 node = &cached->rb_node;
2168         else
2169                 node = tree_search(tree, start);
2170         while (node && start <= end) {
2171                 state = rb_entry(node, struct extent_state, rb_node);
2172
2173                 if (filled && state->start > start) {
2174                         bitset = 0;
2175                         break;
2176                 }
2177
2178                 if (state->start > end)
2179                         break;
2180
2181                 if (state->state & bits) {
2182                         bitset = 1;
2183                         if (!filled)
2184                                 break;
2185                 } else if (filled) {
2186                         bitset = 0;
2187                         break;
2188                 }
2189
2190                 if (state->end == (u64)-1)
2191                         break;
2192
2193                 start = state->end + 1;
2194                 if (start > end)
2195                         break;
2196                 node = rb_next(node);
2197                 if (!node) {
2198                         if (filled)
2199                                 bitset = 0;
2200                         break;
2201                 }
2202         }
2203         spin_unlock(&tree->lock);
2204         return bitset;
2205 }
2206
2207 /*
2208  * helper function to set a given page up to date if all the
2209  * extents in the tree for that page are up to date
2210  */
2211 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
2212 {
2213         u64 start = page_offset(page);
2214         u64 end = start + PAGE_SIZE - 1;
2215         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
2216                 SetPageUptodate(page);
2217 }
2218
2219 int free_io_failure(struct extent_io_tree *failure_tree,
2220                     struct extent_io_tree *io_tree,
2221                     struct io_failure_record *rec)
2222 {
2223         int ret;
2224         int err = 0;
2225
2226         set_state_failrec(failure_tree, rec->start, NULL);
2227         ret = clear_extent_bits(failure_tree, rec->start,
2228                                 rec->start + rec->len - 1,
2229                                 EXTENT_LOCKED | EXTENT_DIRTY);
2230         if (ret)
2231                 err = ret;
2232
2233         ret = clear_extent_bits(io_tree, rec->start,
2234                                 rec->start + rec->len - 1,
2235                                 EXTENT_DAMAGED);
2236         if (ret && !err)
2237                 err = ret;
2238
2239         kfree(rec);
2240         return err;
2241 }
2242
2243 /*
2244  * this bypasses the standard btrfs submit functions deliberately, as
2245  * the standard behavior is to write all copies in a raid setup. here we only
2246  * want to write the one bad copy. so we do the mapping for ourselves and issue
2247  * submit_bio directly.
2248  * to avoid any synchronization issues, wait for the data after writing, which
2249  * actually prevents the read that triggered the error from finishing.
2250  * currently, there can be no more than two copies of every data bit. thus,
2251  * exactly one rewrite is required.
2252  */
2253 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2254                       u64 length, u64 logical, struct page *page,
2255                       unsigned int pg_offset, int mirror_num)
2256 {
2257         struct bio *bio;
2258         struct btrfs_device *dev;
2259         u64 map_length = 0;
2260         u64 sector;
2261         struct btrfs_bio *bbio = NULL;
2262         int ret;
2263
2264         ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
2265         BUG_ON(!mirror_num);
2266
2267         if (btrfs_is_zoned(fs_info))
2268                 return btrfs_repair_one_zone(fs_info, logical);
2269
2270         bio = btrfs_io_bio_alloc(1);
2271         bio->bi_iter.bi_size = 0;
2272         map_length = length;
2273
2274         /*
2275          * Avoid races with device replace and make sure our bbio has devices
2276          * associated to its stripes that don't go away while we are doing the
2277          * read repair operation.
2278          */
2279         btrfs_bio_counter_inc_blocked(fs_info);
2280         if (btrfs_is_parity_mirror(fs_info, logical, length)) {
2281                 /*
2282                  * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2283                  * to update all raid stripes, but here we just want to correct
2284                  * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2285                  * stripe's dev and sector.
2286                  */
2287                 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2288                                       &map_length, &bbio, 0);
2289                 if (ret) {
2290                         btrfs_bio_counter_dec(fs_info);
2291                         bio_put(bio);
2292                         return -EIO;
2293                 }
2294                 ASSERT(bbio->mirror_num == 1);
2295         } else {
2296                 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2297                                       &map_length, &bbio, mirror_num);
2298                 if (ret) {
2299                         btrfs_bio_counter_dec(fs_info);
2300                         bio_put(bio);
2301                         return -EIO;
2302                 }
2303                 BUG_ON(mirror_num != bbio->mirror_num);
2304         }
2305
2306         sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
2307         bio->bi_iter.bi_sector = sector;
2308         dev = bbio->stripes[bbio->mirror_num - 1].dev;
2309         btrfs_put_bbio(bbio);
2310         if (!dev || !dev->bdev ||
2311             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2312                 btrfs_bio_counter_dec(fs_info);
2313                 bio_put(bio);
2314                 return -EIO;
2315         }
2316         bio_set_dev(bio, dev->bdev);
2317         bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
2318         bio_add_page(bio, page, length, pg_offset);
2319
2320         if (btrfsic_submit_bio_wait(bio)) {
2321                 /* try to remap that extent elsewhere? */
2322                 btrfs_bio_counter_dec(fs_info);
2323                 bio_put(bio);
2324                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2325                 return -EIO;
2326         }
2327
2328         btrfs_info_rl_in_rcu(fs_info,
2329                 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
2330                                   ino, start,
2331                                   rcu_str_deref(dev->name), sector);
2332         btrfs_bio_counter_dec(fs_info);
2333         bio_put(bio);
2334         return 0;
2335 }
2336
2337 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
2338 {
2339         struct btrfs_fs_info *fs_info = eb->fs_info;
2340         u64 start = eb->start;
2341         int i, num_pages = num_extent_pages(eb);
2342         int ret = 0;
2343
2344         if (sb_rdonly(fs_info->sb))
2345                 return -EROFS;
2346
2347         for (i = 0; i < num_pages; i++) {
2348                 struct page *p = eb->pages[i];
2349
2350                 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2351                                         start - page_offset(p), mirror_num);
2352                 if (ret)
2353                         break;
2354                 start += PAGE_SIZE;
2355         }
2356
2357         return ret;
2358 }
2359
2360 /*
2361  * each time an IO finishes, we do a fast check in the IO failure tree
2362  * to see if we need to process or clean up an io_failure_record
2363  */
2364 int clean_io_failure(struct btrfs_fs_info *fs_info,
2365                      struct extent_io_tree *failure_tree,
2366                      struct extent_io_tree *io_tree, u64 start,
2367                      struct page *page, u64 ino, unsigned int pg_offset)
2368 {
2369         u64 private;
2370         struct io_failure_record *failrec;
2371         struct extent_state *state;
2372         int num_copies;
2373         int ret;
2374
2375         private = 0;
2376         ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2377                                EXTENT_DIRTY, 0);
2378         if (!ret)
2379                 return 0;
2380
2381         failrec = get_state_failrec(failure_tree, start);
2382         if (IS_ERR(failrec))
2383                 return 0;
2384
2385         BUG_ON(!failrec->this_mirror);
2386
2387         if (sb_rdonly(fs_info->sb))
2388                 goto out;
2389
2390         spin_lock(&io_tree->lock);
2391         state = find_first_extent_bit_state(io_tree,
2392                                             failrec->start,
2393                                             EXTENT_LOCKED);
2394         spin_unlock(&io_tree->lock);
2395
2396         if (state && state->start <= failrec->start &&
2397             state->end >= failrec->start + failrec->len - 1) {
2398                 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2399                                               failrec->len);
2400                 if (num_copies > 1)  {
2401                         repair_io_failure(fs_info, ino, start, failrec->len,
2402                                           failrec->logical, page, pg_offset,
2403                                           failrec->failed_mirror);
2404                 }
2405         }
2406
2407 out:
2408         free_io_failure(failure_tree, io_tree, failrec);
2409
2410         return 0;
2411 }
2412
2413 /*
2414  * Can be called when
2415  * - hold extent lock
2416  * - under ordered extent
2417  * - the inode is freeing
2418  */
2419 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2420 {
2421         struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2422         struct io_failure_record *failrec;
2423         struct extent_state *state, *next;
2424
2425         if (RB_EMPTY_ROOT(&failure_tree->state))
2426                 return;
2427
2428         spin_lock(&failure_tree->lock);
2429         state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2430         while (state) {
2431                 if (state->start > end)
2432                         break;
2433
2434                 ASSERT(state->end <= end);
2435
2436                 next = next_state(state);
2437
2438                 failrec = state->failrec;
2439                 free_extent_state(state);
2440                 kfree(failrec);
2441
2442                 state = next;
2443         }
2444         spin_unlock(&failure_tree->lock);
2445 }
2446
2447 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
2448                                                              u64 start)
2449 {
2450         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2451         struct io_failure_record *failrec;
2452         struct extent_map *em;
2453         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2454         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2455         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2456         const u32 sectorsize = fs_info->sectorsize;
2457         int ret;
2458         u64 logical;
2459
2460         failrec = get_state_failrec(failure_tree, start);
2461         if (!IS_ERR(failrec)) {
2462                 btrfs_debug(fs_info,
2463         "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
2464                         failrec->logical, failrec->start, failrec->len);
2465                 /*
2466                  * when data can be on disk more than twice, add to failrec here
2467                  * (e.g. with a list for failed_mirror) to make
2468                  * clean_io_failure() clean all those errors at once.
2469                  */
2470
2471                 return failrec;
2472         }
2473
2474         failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2475         if (!failrec)
2476                 return ERR_PTR(-ENOMEM);
2477
2478         failrec->start = start;
2479         failrec->len = sectorsize;
2480         failrec->this_mirror = 0;
2481         failrec->bio_flags = 0;
2482
2483         read_lock(&em_tree->lock);
2484         em = lookup_extent_mapping(em_tree, start, failrec->len);
2485         if (!em) {
2486                 read_unlock(&em_tree->lock);
2487                 kfree(failrec);
2488                 return ERR_PTR(-EIO);
2489         }
2490
2491         if (em->start > start || em->start + em->len <= start) {
2492                 free_extent_map(em);
2493                 em = NULL;
2494         }
2495         read_unlock(&em_tree->lock);
2496         if (!em) {
2497                 kfree(failrec);
2498                 return ERR_PTR(-EIO);
2499         }
2500
2501         logical = start - em->start;
2502         logical = em->block_start + logical;
2503         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2504                 logical = em->block_start;
2505                 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2506                 extent_set_compress_type(&failrec->bio_flags, em->compress_type);
2507         }
2508
2509         btrfs_debug(fs_info,
2510                     "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2511                     logical, start, failrec->len);
2512
2513         failrec->logical = logical;
2514         free_extent_map(em);
2515
2516         /* Set the bits in the private failure tree */
2517         ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
2518                               EXTENT_LOCKED | EXTENT_DIRTY);
2519         if (ret >= 0) {
2520                 ret = set_state_failrec(failure_tree, start, failrec);
2521                 /* Set the bits in the inode's tree */
2522                 ret = set_extent_bits(tree, start, start + sectorsize - 1,
2523                                       EXTENT_DAMAGED);
2524         } else if (ret < 0) {
2525                 kfree(failrec);
2526                 return ERR_PTR(ret);
2527         }
2528
2529         return failrec;
2530 }
2531
2532 static bool btrfs_check_repairable(struct inode *inode,
2533                                    struct io_failure_record *failrec,
2534                                    int failed_mirror)
2535 {
2536         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2537         int num_copies;
2538
2539         num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
2540         if (num_copies == 1) {
2541                 /*
2542                  * we only have a single copy of the data, so don't bother with
2543                  * all the retry and error correction code that follows. no
2544                  * matter what the error is, it is very likely to persist.
2545                  */
2546                 btrfs_debug(fs_info,
2547                         "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2548                         num_copies, failrec->this_mirror, failed_mirror);
2549                 return false;
2550         }
2551
2552         /* The failure record should only contain one sector */
2553         ASSERT(failrec->len == fs_info->sectorsize);
2554
2555         /*
2556          * There are two premises:
2557          * a) deliver good data to the caller
2558          * b) correct the bad sectors on disk
2559          *
2560          * Since we're only doing repair for one sector, we only need to get
2561          * a good copy of the failed sector and if we succeed, we have setup
2562          * everything for repair_io_failure to do the rest for us.
2563          */
2564         failrec->failed_mirror = failed_mirror;
2565         failrec->this_mirror++;
2566         if (failrec->this_mirror == failed_mirror)
2567                 failrec->this_mirror++;
2568
2569         if (failrec->this_mirror > num_copies) {
2570                 btrfs_debug(fs_info,
2571                         "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2572                         num_copies, failrec->this_mirror, failed_mirror);
2573                 return false;
2574         }
2575
2576         return true;
2577 }
2578
2579 int btrfs_repair_one_sector(struct inode *inode,
2580                             struct bio *failed_bio, u32 bio_offset,
2581                             struct page *page, unsigned int pgoff,
2582                             u64 start, int failed_mirror,
2583                             submit_bio_hook_t *submit_bio_hook)
2584 {
2585         struct io_failure_record *failrec;
2586         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2587         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2588         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2589         struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
2590         const int icsum = bio_offset >> fs_info->sectorsize_bits;
2591         struct bio *repair_bio;
2592         struct btrfs_io_bio *repair_io_bio;
2593         blk_status_t status;
2594
2595         btrfs_debug(fs_info,
2596                    "repair read error: read error at %llu", start);
2597
2598         BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2599
2600         failrec = btrfs_get_io_failure_record(inode, start);
2601         if (IS_ERR(failrec))
2602                 return PTR_ERR(failrec);
2603
2604
2605         if (!btrfs_check_repairable(inode, failrec, failed_mirror)) {
2606                 free_io_failure(failure_tree, tree, failrec);
2607                 return -EIO;
2608         }
2609
2610         repair_bio = btrfs_io_bio_alloc(1);
2611         repair_io_bio = btrfs_io_bio(repair_bio);
2612         repair_bio->bi_opf = REQ_OP_READ;
2613         repair_bio->bi_end_io = failed_bio->bi_end_io;
2614         repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2615         repair_bio->bi_private = failed_bio->bi_private;
2616
2617         if (failed_io_bio->csum) {
2618                 const u32 csum_size = fs_info->csum_size;
2619
2620                 repair_io_bio->csum = repair_io_bio->csum_inline;
2621                 memcpy(repair_io_bio->csum,
2622                        failed_io_bio->csum + csum_size * icsum, csum_size);
2623         }
2624
2625         bio_add_page(repair_bio, page, failrec->len, pgoff);
2626         repair_io_bio->logical = failrec->start;
2627         repair_io_bio->iter = repair_bio->bi_iter;
2628
2629         btrfs_debug(btrfs_sb(inode->i_sb),
2630                     "repair read error: submitting new read to mirror %d",
2631                     failrec->this_mirror);
2632
2633         status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
2634                                  failrec->bio_flags);
2635         if (status) {
2636                 free_io_failure(failure_tree, tree, failrec);
2637                 bio_put(repair_bio);
2638         }
2639         return blk_status_to_errno(status);
2640 }
2641
2642 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
2643 {
2644         struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2645
2646         ASSERT(page_offset(page) <= start &&
2647                start + len <= page_offset(page) + PAGE_SIZE);
2648
2649         /*
2650          * For subapge metadata case, all btrfs_page_* helpers need page to
2651          * have page::private populated.
2652          * But we can have rare case where the last eb in the page is only
2653          * referred by the IO, and it gets released immedately after it's
2654          * read and verified.
2655          *
2656          * This can detach the page private completely.
2657          * In that case, we can just skip the page status update completely,
2658          * as the page has no eb anymore.
2659          */
2660         if (fs_info->sectorsize < PAGE_SIZE && unlikely(!PagePrivate(page))) {
2661                 ASSERT(!is_data_inode(page->mapping->host));
2662                 return;
2663         }
2664         if (uptodate) {
2665                 btrfs_page_set_uptodate(fs_info, page, start, len);
2666         } else {
2667                 btrfs_page_clear_uptodate(fs_info, page, start, len);
2668                 btrfs_page_set_error(fs_info, page, start, len);
2669         }
2670
2671         if (fs_info->sectorsize == PAGE_SIZE)
2672                 unlock_page(page);
2673         else if (is_data_inode(page->mapping->host))
2674                 /*
2675                  * For subpage data, unlock the page if we're the last reader.
2676                  * For subpage metadata, page lock is not utilized for read.
2677                  */
2678                 btrfs_subpage_end_reader(fs_info, page, start, len);
2679 }
2680
2681 static blk_status_t submit_read_repair(struct inode *inode,
2682                                       struct bio *failed_bio, u32 bio_offset,
2683                                       struct page *page, unsigned int pgoff,
2684                                       u64 start, u64 end, int failed_mirror,
2685                                       unsigned int error_bitmap,
2686                                       submit_bio_hook_t *submit_bio_hook)
2687 {
2688         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2689         const u32 sectorsize = fs_info->sectorsize;
2690         const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
2691         int error = 0;
2692         int i;
2693
2694         BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2695
2696         /* We're here because we had some read errors or csum mismatch */
2697         ASSERT(error_bitmap);
2698
2699         /*
2700          * We only get called on buffered IO, thus page must be mapped and bio
2701          * must not be cloned.
2702          */
2703         ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
2704
2705         /* Iterate through all the sectors in the range */
2706         for (i = 0; i < nr_bits; i++) {
2707                 const unsigned int offset = i * sectorsize;
2708                 struct extent_state *cached = NULL;
2709                 bool uptodate = false;
2710                 int ret;
2711
2712                 if (!(error_bitmap & (1U << i))) {
2713                         /*
2714                          * This sector has no error, just end the page read
2715                          * and unlock the range.
2716                          */
2717                         uptodate = true;
2718                         goto next;
2719                 }
2720
2721                 ret = btrfs_repair_one_sector(inode, failed_bio,
2722                                 bio_offset + offset,
2723                                 page, pgoff + offset, start + offset,
2724                                 failed_mirror, submit_bio_hook);
2725                 if (!ret) {
2726                         /*
2727                          * We have submitted the read repair, the page release
2728                          * will be handled by the endio function of the
2729                          * submitted repair bio.
2730                          * Thus we don't need to do any thing here.
2731                          */
2732                         continue;
2733                 }
2734                 /*
2735                  * Repair failed, just record the error but still continue.
2736                  * Or the remaining sectors will not be properly unlocked.
2737                  */
2738                 if (!error)
2739                         error = ret;
2740 next:
2741                 end_page_read(page, uptodate, start + offset, sectorsize);
2742                 if (uptodate)
2743                         set_extent_uptodate(&BTRFS_I(inode)->io_tree,
2744                                         start + offset,
2745                                         start + offset + sectorsize - 1,
2746                                         &cached, GFP_ATOMIC);
2747                 unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
2748                                 start + offset,
2749                                 start + offset + sectorsize - 1,
2750                                 &cached);
2751         }
2752         return errno_to_blk_status(error);
2753 }
2754
2755 /* lots and lots of room for performance fixes in the end_bio funcs */
2756
2757 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2758 {
2759         int uptodate = (err == 0);
2760         int ret = 0;
2761
2762         btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
2763
2764         if (!uptodate) {
2765                 ClearPageUptodate(page);
2766                 SetPageError(page);
2767                 ret = err < 0 ? err : -EIO;
2768                 mapping_set_error(page->mapping, ret);
2769         }
2770 }
2771
2772 /*
2773  * after a writepage IO is done, we need to:
2774  * clear the uptodate bits on error
2775  * clear the writeback bits in the extent tree for this IO
2776  * end_page_writeback if the page has no more pending IO
2777  *
2778  * Scheduling is not allowed, so the extent state tree is expected
2779  * to have one and only one object corresponding to this IO.
2780  */
2781 static void end_bio_extent_writepage(struct bio *bio)
2782 {
2783         int error = blk_status_to_errno(bio->bi_status);
2784         struct bio_vec *bvec;
2785         u64 start;
2786         u64 end;
2787         struct bvec_iter_all iter_all;
2788         bool first_bvec = true;
2789
2790         ASSERT(!bio_flagged(bio, BIO_CLONED));
2791         bio_for_each_segment_all(bvec, bio, iter_all) {
2792                 struct page *page = bvec->bv_page;
2793                 struct inode *inode = page->mapping->host;
2794                 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2795
2796                 /* We always issue full-page reads, but if some block
2797                  * in a page fails to read, blk_update_request() will
2798                  * advance bv_offset and adjust bv_len to compensate.
2799                  * Print a warning for nonzero offsets, and an error
2800                  * if they don't add up to a full page.  */
2801                 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2802                         if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
2803                                 btrfs_err(fs_info,
2804                                    "partial page write in btrfs with offset %u and length %u",
2805                                         bvec->bv_offset, bvec->bv_len);
2806                         else
2807                                 btrfs_info(fs_info,
2808                                    "incomplete page write in btrfs with offset %u and length %u",
2809                                         bvec->bv_offset, bvec->bv_len);
2810                 }
2811
2812                 start = page_offset(page);
2813                 end = start + bvec->bv_offset + bvec->bv_len - 1;
2814
2815                 if (first_bvec) {
2816                         btrfs_record_physical_zoned(inode, start, bio);
2817                         first_bvec = false;
2818                 }
2819
2820                 end_extent_writepage(page, error, start, end);
2821                 end_page_writeback(page);
2822         }
2823
2824         bio_put(bio);
2825 }
2826
2827 /*
2828  * Record previously processed extent range
2829  *
2830  * For endio_readpage_release_extent() to handle a full extent range, reducing
2831  * the extent io operations.
2832  */
2833 struct processed_extent {
2834         struct btrfs_inode *inode;
2835         /* Start of the range in @inode */
2836         u64 start;
2837         /* End of the range in @inode */
2838         u64 end;
2839         bool uptodate;
2840 };
2841
2842 /*
2843  * Try to release processed extent range
2844  *
2845  * May not release the extent range right now if the current range is
2846  * contiguous to processed extent.
2847  *
2848  * Will release processed extent when any of @inode, @uptodate, the range is
2849  * no longer contiguous to the processed range.
2850  *
2851  * Passing @inode == NULL will force processed extent to be released.
2852  */
2853 static void endio_readpage_release_extent(struct processed_extent *processed,
2854                               struct btrfs_inode *inode, u64 start, u64 end,
2855                               bool uptodate)
2856 {
2857         struct extent_state *cached = NULL;
2858         struct extent_io_tree *tree;
2859
2860         /* The first extent, initialize @processed */
2861         if (!processed->inode)
2862                 goto update;
2863
2864         /*
2865          * Contiguous to processed extent, just uptodate the end.
2866          *
2867          * Several things to notice:
2868          *
2869          * - bio can be merged as long as on-disk bytenr is contiguous
2870          *   This means we can have page belonging to other inodes, thus need to
2871          *   check if the inode still matches.
2872          * - bvec can contain range beyond current page for multi-page bvec
2873          *   Thus we need to do processed->end + 1 >= start check
2874          */
2875         if (processed->inode == inode && processed->uptodate == uptodate &&
2876             processed->end + 1 >= start && end >= processed->end) {
2877                 processed->end = end;
2878                 return;
2879         }
2880
2881         tree = &processed->inode->io_tree;
2882         /*
2883          * Now we don't have range contiguous to the processed range, release
2884          * the processed range now.
2885          */
2886         if (processed->uptodate && tree->track_uptodate)
2887                 set_extent_uptodate(tree, processed->start, processed->end,
2888                                     &cached, GFP_ATOMIC);
2889         unlock_extent_cached_atomic(tree, processed->start, processed->end,
2890                                     &cached);
2891
2892 update:
2893         /* Update processed to current range */
2894         processed->inode = inode;
2895         processed->start = start;
2896         processed->end = end;
2897         processed->uptodate = uptodate;
2898 }
2899
2900 static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2901 {
2902         ASSERT(PageLocked(page));
2903         if (fs_info->sectorsize == PAGE_SIZE)
2904                 return;
2905
2906         ASSERT(PagePrivate(page));
2907         btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2908 }
2909
2910 /*
2911  * Find extent buffer for a givne bytenr.
2912  *
2913  * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
2914  * in endio context.
2915  */
2916 static struct extent_buffer *find_extent_buffer_readpage(
2917                 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2918 {
2919         struct extent_buffer *eb;
2920
2921         /*
2922          * For regular sectorsize, we can use page->private to grab extent
2923          * buffer
2924          */
2925         if (fs_info->sectorsize == PAGE_SIZE) {
2926                 ASSERT(PagePrivate(page) && page->private);
2927                 return (struct extent_buffer *)page->private;
2928         }
2929
2930         /* For subpage case, we need to lookup buffer radix tree */
2931         rcu_read_lock();
2932         eb = radix_tree_lookup(&fs_info->buffer_radix,
2933                                bytenr >> fs_info->sectorsize_bits);
2934         rcu_read_unlock();
2935         ASSERT(eb);
2936         return eb;
2937 }
2938
2939 /*
2940  * after a readpage IO is done, we need to:
2941  * clear the uptodate bits on error
2942  * set the uptodate bits if things worked
2943  * set the page up to date if all extents in the tree are uptodate
2944  * clear the lock bit in the extent tree
2945  * unlock the page if there are no other extents locked for it
2946  *
2947  * Scheduling is not allowed, so the extent state tree is expected
2948  * to have one and only one object corresponding to this IO.
2949  */
2950 static void end_bio_extent_readpage(struct bio *bio)
2951 {
2952         struct bio_vec *bvec;
2953         struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2954         struct extent_io_tree *tree, *failure_tree;
2955         struct processed_extent processed = { 0 };
2956         /*
2957          * The offset to the beginning of a bio, since one bio can never be
2958          * larger than UINT_MAX, u32 here is enough.
2959          */
2960         u32 bio_offset = 0;
2961         int mirror;
2962         int ret;
2963         struct bvec_iter_all iter_all;
2964
2965         ASSERT(!bio_flagged(bio, BIO_CLONED));
2966         bio_for_each_segment_all(bvec, bio, iter_all) {
2967                 bool uptodate = !bio->bi_status;
2968                 struct page *page = bvec->bv_page;
2969                 struct inode *inode = page->mapping->host;
2970                 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2971                 const u32 sectorsize = fs_info->sectorsize;
2972                 unsigned int error_bitmap = (unsigned int)-1;
2973                 u64 start;
2974                 u64 end;
2975                 u32 len;
2976
2977                 btrfs_debug(fs_info,
2978                         "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
2979                         bio->bi_iter.bi_sector, bio->bi_status,
2980                         io_bio->mirror_num);
2981                 tree = &BTRFS_I(inode)->io_tree;
2982                 failure_tree = &BTRFS_I(inode)->io_failure_tree;
2983
2984                 /*
2985                  * We always issue full-sector reads, but if some block in a
2986                  * page fails to read, blk_update_request() will advance
2987                  * bv_offset and adjust bv_len to compensate.  Print a warning
2988                  * for unaligned offsets, and an error if they don't add up to
2989                  * a full sector.
2990                  */
2991                 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2992                         btrfs_err(fs_info,
2993                 "partial page read in btrfs with offset %u and length %u",
2994                                   bvec->bv_offset, bvec->bv_len);
2995                 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
2996                                      sectorsize))
2997                         btrfs_info(fs_info,
2998                 "incomplete page read with offset %u and length %u",
2999                                    bvec->bv_offset, bvec->bv_len);
3000
3001                 start = page_offset(page) + bvec->bv_offset;
3002                 end = start + bvec->bv_len - 1;
3003                 len = bvec->bv_len;
3004
3005                 mirror = io_bio->mirror_num;
3006                 if (likely(uptodate)) {
3007                         if (is_data_inode(inode)) {
3008                                 error_bitmap = btrfs_verify_data_csum(io_bio,
3009                                                 bio_offset, page, start, end);
3010                                 ret = error_bitmap;
3011                         } else {
3012                                 ret = btrfs_validate_metadata_buffer(io_bio,
3013                                         page, start, end, mirror);
3014                         }
3015                         if (ret)
3016                                 uptodate = false;
3017                         else
3018                                 clean_io_failure(BTRFS_I(inode)->root->fs_info,
3019                                                  failure_tree, tree, start,
3020                                                  page,
3021                                                  btrfs_ino(BTRFS_I(inode)), 0);
3022                 }
3023
3024                 if (likely(uptodate))
3025                         goto readpage_ok;
3026
3027                 if (is_data_inode(inode)) {
3028                         /*
3029                          * btrfs_submit_read_repair() will handle all the good
3030                          * and bad sectors, we just continue to the next bvec.
3031                          */
3032                         submit_read_repair(inode, bio, bio_offset, page,
3033                                            start - page_offset(page), start,
3034                                            end, mirror, error_bitmap,
3035                                            btrfs_submit_data_bio);
3036
3037                         ASSERT(bio_offset + len > bio_offset);
3038                         bio_offset += len;
3039                         continue;
3040                 } else {
3041                         struct extent_buffer *eb;
3042
3043                         eb = find_extent_buffer_readpage(fs_info, page, start);
3044                         set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3045                         eb->read_mirror = mirror;
3046                         atomic_dec(&eb->io_pages);
3047                         if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
3048                                                &eb->bflags))
3049                                 btree_readahead_hook(eb, -EIO);
3050                 }
3051 readpage_ok:
3052                 if (likely(uptodate)) {
3053                         loff_t i_size = i_size_read(inode);
3054                         pgoff_t end_index = i_size >> PAGE_SHIFT;
3055
3056                         /*
3057                          * Zero out the remaining part if this range straddles
3058                          * i_size.
3059                          *
3060                          * Here we should only zero the range inside the bvec,
3061                          * not touch anything else.
3062                          *
3063                          * NOTE: i_size is exclusive while end is inclusive.
3064                          */
3065                         if (page->index == end_index && i_size <= end) {
3066                                 u32 zero_start = max(offset_in_page(i_size),
3067                                                      offset_in_page(start));
3068
3069                                 zero_user_segment(page, zero_start,
3070                                                   offset_in_page(end) + 1);
3071                         }
3072                 }
3073                 ASSERT(bio_offset + len > bio_offset);
3074                 bio_offset += len;
3075
3076                 /* Update page status and unlock */
3077                 end_page_read(page, uptodate, start, len);
3078                 endio_readpage_release_extent(&processed, BTRFS_I(inode),
3079                                               start, end, uptodate);
3080         }
3081         /* Release the last extent */
3082         endio_readpage_release_extent(&processed, NULL, 0, 0, false);
3083         btrfs_io_bio_free_csum(io_bio);
3084         bio_put(bio);
3085 }
3086
3087 /*
3088  * Initialize the members up to but not including 'bio'. Use after allocating a
3089  * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
3090  * 'bio' because use of __GFP_ZERO is not supported.
3091  */
3092 static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
3093 {
3094         memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
3095 }
3096
3097 /*
3098  * The following helpers allocate a bio. As it's backed by a bioset, it'll
3099  * never fail.  We're returning a bio right now but you can call btrfs_io_bio
3100  * for the appropriate container_of magic
3101  */
3102 struct bio *btrfs_bio_alloc(u64 first_byte)
3103 {
3104         struct bio *bio;
3105
3106         bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
3107         bio->bi_iter.bi_sector = first_byte >> 9;
3108         btrfs_io_bio_init(btrfs_io_bio(bio));
3109         return bio;
3110 }
3111
3112 struct bio *btrfs_bio_clone(struct bio *bio)
3113 {
3114         struct btrfs_io_bio *btrfs_bio;
3115         struct bio *new;
3116
3117         /* Bio allocation backed by a bioset does not fail */
3118         new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
3119         btrfs_bio = btrfs_io_bio(new);
3120         btrfs_io_bio_init(btrfs_bio);
3121         btrfs_bio->iter = bio->bi_iter;
3122         return new;
3123 }
3124
3125 struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
3126 {
3127         struct bio *bio;
3128
3129         /* Bio allocation backed by a bioset does not fail */
3130         bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
3131         btrfs_io_bio_init(btrfs_io_bio(bio));
3132         return bio;
3133 }
3134
3135 struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
3136 {
3137         struct bio *bio;
3138         struct btrfs_io_bio *btrfs_bio;
3139
3140         /* this will never fail when it's backed by a bioset */
3141         bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
3142         ASSERT(bio);
3143
3144         btrfs_bio = btrfs_io_bio(bio);
3145         btrfs_io_bio_init(btrfs_bio);
3146
3147         bio_trim(bio, offset >> 9, size >> 9);
3148         btrfs_bio->iter = bio->bi_iter;
3149         return bio;
3150 }
3151
3152 /**
3153  * Attempt to add a page to bio
3154  *
3155  * @bio:        destination bio
3156  * @page:       page to add to the bio
3157  * @disk_bytenr:  offset of the new bio or to check whether we are adding
3158  *                a contiguous page to the previous one
3159  * @pg_offset:  starting offset in the page
3160  * @size:       portion of page that we want to write
3161  * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
3162  * @bio_flags:  flags of the current bio to see if we can merge them
3163  * @return:     true if page was added, false otherwise
3164  *
3165  * Attempt to add a page to bio considering stripe alignment etc.
3166  *
3167  * Return true if successfully page added. Otherwise, return false.
3168  */
3169 static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
3170                                struct page *page,
3171                                u64 disk_bytenr, unsigned int size,
3172                                unsigned int pg_offset,
3173                                unsigned long bio_flags)
3174 {
3175         struct bio *bio = bio_ctrl->bio;
3176         u32 bio_size = bio->bi_iter.bi_size;
3177         const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3178         bool contig;
3179         int ret;
3180
3181         ASSERT(bio);
3182         /* The limit should be calculated when bio_ctrl->bio is allocated */
3183         ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
3184         if (bio_ctrl->bio_flags != bio_flags)
3185                 return false;
3186
3187         if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
3188                 contig = bio->bi_iter.bi_sector == sector;
3189         else
3190                 contig = bio_end_sector(bio) == sector;
3191         if (!contig)
3192                 return false;
3193
3194         if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
3195             bio_size + size > bio_ctrl->len_to_stripe_boundary)
3196                 return false;
3197
3198         if (bio_op(bio) == REQ_OP_ZONE_APPEND)
3199                 ret = bio_add_zone_append_page(bio, page, size, pg_offset);
3200         else
3201                 ret = bio_add_page(bio, page, size, pg_offset);
3202
3203         return ret == size;
3204 }
3205
3206 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
3207                                struct btrfs_inode *inode)
3208 {
3209         struct btrfs_fs_info *fs_info = inode->root->fs_info;
3210         struct btrfs_io_geometry geom;
3211         struct btrfs_ordered_extent *ordered;
3212         struct extent_map *em;
3213         u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
3214         int ret;
3215
3216         /*
3217          * Pages for compressed extent are never submitted to disk directly,
3218          * thus it has no real boundary, just set them to U32_MAX.
3219          *
3220          * The split happens for real compressed bio, which happens in
3221          * btrfs_submit_compressed_read/write().
3222          */
3223         if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
3224                 bio_ctrl->len_to_oe_boundary = U32_MAX;
3225                 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3226                 return 0;
3227         }
3228         em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
3229         if (IS_ERR(em))
3230                 return PTR_ERR(em);
3231         ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
3232                                     logical, &geom);
3233         free_extent_map(em);
3234         if (ret < 0) {
3235                 return ret;
3236         }
3237         if (geom.len > U32_MAX)
3238                 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3239         else
3240                 bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
3241
3242         if (!btrfs_is_zoned(fs_info) ||
3243             bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
3244                 bio_ctrl->len_to_oe_boundary = U32_MAX;
3245                 return 0;
3246         }
3247
3248         ASSERT(fs_info->max_zone_append_size > 0);
3249         /* Ordered extent not yet created, so we're good */
3250         ordered = btrfs_lookup_ordered_extent(inode, logical);
3251         if (!ordered) {
3252                 bio_ctrl->len_to_oe_boundary = U32_MAX;
3253                 return 0;
3254         }
3255
3256         bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
3257                 ordered->disk_bytenr + ordered->disk_num_bytes - logical);
3258         btrfs_put_ordered_extent(ordered);
3259         return 0;
3260 }
3261
3262 /*
3263  * @opf:        bio REQ_OP_* and REQ_* flags as one value
3264  * @wbc:        optional writeback control for io accounting
3265  * @page:       page to add to the bio
3266  * @disk_bytenr: logical bytenr where the write will be
3267  * @size:       portion of page that we want to write to
3268  * @pg_offset:  offset of the new bio or to check whether we are adding
3269  *              a contiguous page to the previous one
3270  * @bio_ret:    must be valid pointer, newly allocated bio will be stored there
3271  * @end_io_func:     end_io callback for new bio
3272  * @mirror_num:      desired mirror to read/write
3273  * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
3274  * @bio_flags:  flags of the current bio to see if we can merge them
3275  */
3276 static int submit_extent_page(unsigned int opf,
3277                               struct writeback_control *wbc,
3278                               struct btrfs_bio_ctrl *bio_ctrl,
3279                               struct page *page, u64 disk_bytenr,
3280                               size_t size, unsigned long pg_offset,
3281                               bio_end_io_t end_io_func,
3282                               int mirror_num,
3283                               unsigned long bio_flags,
3284                               bool force_bio_submit)
3285 {
3286         int ret = 0;
3287         struct bio *bio;
3288         size_t io_size = min_t(size_t, size, PAGE_SIZE);
3289         struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3290         struct extent_io_tree *tree = &inode->io_tree;
3291         struct btrfs_fs_info *fs_info = inode->root->fs_info;
3292
3293         ASSERT(bio_ctrl);
3294
3295         ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3296                pg_offset + size <= PAGE_SIZE);
3297         if (bio_ctrl->bio) {
3298                 bio = bio_ctrl->bio;
3299                 if (force_bio_submit ||
3300                     !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
3301                                         pg_offset, bio_flags)) {
3302                         ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
3303                         bio_ctrl->bio = NULL;
3304                         if (ret < 0)
3305                                 return ret;
3306                 } else {
3307                         if (wbc)
3308                                 wbc_account_cgroup_owner(wbc, page, io_size);
3309                         return 0;
3310                 }
3311         }
3312
3313         bio = btrfs_bio_alloc(disk_bytenr);
3314         bio_add_page(bio, page, io_size, pg_offset);
3315         bio->bi_end_io = end_io_func;
3316         bio->bi_private = tree;
3317         bio->bi_write_hint = page->mapping->host->i_write_hint;
3318         bio->bi_opf = opf;
3319         if (wbc) {
3320                 struct block_device *bdev;
3321
3322                 bdev = fs_info->fs_devices->latest_bdev;
3323                 bio_set_dev(bio, bdev);
3324                 wbc_init_bio(wbc, bio);
3325                 wbc_account_cgroup_owner(wbc, page, io_size);
3326         }
3327         if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
3328                 struct btrfs_device *device;
3329
3330                 device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
3331                 if (IS_ERR(device))
3332                         return PTR_ERR(device);
3333
3334                 btrfs_io_bio(bio)->device = device;
3335         }
3336
3337         bio_ctrl->bio = bio;
3338         bio_ctrl->bio_flags = bio_flags;
3339         ret = calc_bio_boundaries(bio_ctrl, inode);
3340
3341         return ret;
3342 }
3343
3344 static int attach_extent_buffer_page(struct extent_buffer *eb,
3345                                      struct page *page,
3346                                      struct btrfs_subpage *prealloc)
3347 {
3348         struct btrfs_fs_info *fs_info = eb->fs_info;
3349         int ret = 0;
3350
3351         /*
3352          * If the page is mapped to btree inode, we should hold the private
3353          * lock to prevent race.
3354          * For cloned or dummy extent buffers, their pages are not mapped and
3355          * will not race with any other ebs.
3356          */
3357         if (page->mapping)
3358                 lockdep_assert_held(&page->mapping->private_lock);
3359
3360         if (fs_info->sectorsize == PAGE_SIZE) {
3361                 if (!PagePrivate(page))
3362                         attach_page_private(page, eb);
3363                 else
3364                         WARN_ON(page->private != (unsigned long)eb);
3365                 return 0;
3366         }
3367
3368         /* Already mapped, just free prealloc */
3369         if (PagePrivate(page)) {
3370                 btrfs_free_subpage(prealloc);
3371                 return 0;
3372         }
3373
3374         if (prealloc)
3375                 /* Has preallocated memory for subpage */
3376                 attach_page_private(page, prealloc);
3377         else
3378                 /* Do new allocation to attach subpage */
3379                 ret = btrfs_attach_subpage(fs_info, page,
3380                                            BTRFS_SUBPAGE_METADATA);
3381         return ret;
3382 }
3383
3384 int set_page_extent_mapped(struct page *page)
3385 {
3386         struct btrfs_fs_info *fs_info;
3387
3388         ASSERT(page->mapping);
3389
3390         if (PagePrivate(page))
3391                 return 0;
3392
3393         fs_info = btrfs_sb(page->mapping->host->i_sb);
3394
3395         if (fs_info->sectorsize < PAGE_SIZE)
3396                 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3397
3398         attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3399         return 0;
3400 }
3401
3402 void clear_page_extent_mapped(struct page *page)
3403 {
3404         struct btrfs_fs_info *fs_info;
3405
3406         ASSERT(page->mapping);
3407
3408         if (!PagePrivate(page))
3409                 return;
3410
3411         fs_info = btrfs_sb(page->mapping->host->i_sb);
3412         if (fs_info->sectorsize < PAGE_SIZE)
3413                 return btrfs_detach_subpage(fs_info, page);
3414
3415         detach_page_private(page);
3416 }
3417
3418 static struct extent_map *
3419 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3420                  u64 start, u64 len, struct extent_map **em_cached)
3421 {
3422         struct extent_map *em;
3423
3424         if (em_cached && *em_cached) {
3425                 em = *em_cached;
3426                 if (extent_map_in_tree(em) && start >= em->start &&
3427                     start < extent_map_end(em)) {
3428                         refcount_inc(&em->refs);
3429                         return em;
3430                 }
3431
3432                 free_extent_map(em);
3433                 *em_cached = NULL;
3434         }
3435
3436         em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
3437         if (em_cached && !IS_ERR_OR_NULL(em)) {
3438                 BUG_ON(*em_cached);
3439                 refcount_inc(&em->refs);
3440                 *em_cached = em;
3441         }
3442         return em;
3443 }
3444 /*
3445  * basic readpage implementation.  Locked extent state structs are inserted
3446  * into the tree that are removed when the IO is done (by the end_io
3447  * handlers)
3448  * XXX JDM: This needs looking at to ensure proper page locking
3449  * return 0 on success, otherwise return error
3450  */
3451 int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
3452                       struct btrfs_bio_ctrl *bio_ctrl,
3453                       unsigned int read_flags, u64 *prev_em_start)
3454 {
3455         struct inode *inode = page->mapping->host;
3456         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3457         u64 start = page_offset(page);
3458         const u64 end = start + PAGE_SIZE - 1;
3459         u64 cur = start;
3460         u64 extent_offset;
3461         u64 last_byte = i_size_read(inode);
3462         u64 block_start;
3463         u64 cur_end;
3464         struct extent_map *em;
3465         int ret = 0;
3466         int nr = 0;
3467         size_t pg_offset = 0;
3468         size_t iosize;
3469         size_t blocksize = inode->i_sb->s_blocksize;
3470         unsigned long this_bio_flag = 0;
3471         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
3472
3473         ret = set_page_extent_mapped(page);
3474         if (ret < 0) {
3475                 unlock_extent(tree, start, end);
3476                 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3477                 unlock_page(page);
3478                 goto out;
3479         }
3480
3481         if (!PageUptodate(page)) {
3482                 if (cleancache_get_page(page) == 0) {
3483                         BUG_ON(blocksize != PAGE_SIZE);
3484                         unlock_extent(tree, start, end);
3485                         unlock_page(page);
3486                         goto out;
3487                 }
3488         }
3489
3490         if (page->index == last_byte >> PAGE_SHIFT) {
3491                 size_t zero_offset = offset_in_page(last_byte);
3492
3493                 if (zero_offset) {
3494                         iosize = PAGE_SIZE - zero_offset;
3495                         memzero_page(page, zero_offset, iosize);
3496                         flush_dcache_page(page);
3497                 }
3498         }
3499         begin_page_read(fs_info, page);
3500         while (cur <= end) {
3501                 bool force_bio_submit = false;
3502                 u64 disk_bytenr;
3503
3504                 if (cur >= last_byte) {
3505                         struct extent_state *cached = NULL;
3506
3507                         iosize = PAGE_SIZE - pg_offset;
3508                         memzero_page(page, pg_offset, iosize);
3509                         flush_dcache_page(page);
3510                         set_extent_uptodate(tree, cur, cur + iosize - 1,
3511                                             &cached, GFP_NOFS);
3512                         unlock_extent_cached(tree, cur,
3513                                              cur + iosize - 1, &cached);
3514                         end_page_read(page, true, cur, iosize);
3515                         break;
3516                 }
3517                 em = __get_extent_map(inode, page, pg_offset, cur,
3518                                       end - cur + 1, em_cached);
3519                 if (IS_ERR_OR_NULL(em)) {
3520                         unlock_extent(tree, cur, end);
3521                         end_page_read(page, false, cur, end + 1 - cur);
3522                         break;
3523                 }
3524                 extent_offset = cur - em->start;
3525                 BUG_ON(extent_map_end(em) <= cur);
3526                 BUG_ON(end < cur);
3527
3528                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3529                         this_bio_flag |= EXTENT_BIO_COMPRESSED;
3530                         extent_set_compress_type(&this_bio_flag,
3531                                                  em->compress_type);
3532                 }
3533
3534                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3535                 cur_end = min(extent_map_end(em) - 1, end);
3536                 iosize = ALIGN(iosize, blocksize);
3537                 if (this_bio_flag & EXTENT_BIO_COMPRESSED)
3538                         disk_bytenr = em->block_start;
3539                 else
3540                         disk_bytenr = em->block_start + extent_offset;
3541                 block_start = em->block_start;
3542                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3543                         block_start = EXTENT_MAP_HOLE;
3544
3545                 /*
3546                  * If we have a file range that points to a compressed extent
3547                  * and it's followed by a consecutive file range that points
3548                  * to the same compressed extent (possibly with a different
3549                  * offset and/or length, so it either points to the whole extent
3550                  * or only part of it), we must make sure we do not submit a
3551                  * single bio to populate the pages for the 2 ranges because
3552                  * this makes the compressed extent read zero out the pages
3553                  * belonging to the 2nd range. Imagine the following scenario:
3554                  *
3555                  *  File layout
3556                  *  [0 - 8K]                     [8K - 24K]
3557                  *    |                               |
3558                  *    |                               |
3559                  * points to extent X,         points to extent X,
3560                  * offset 4K, length of 8K     offset 0, length 16K
3561                  *
3562                  * [extent X, compressed length = 4K uncompressed length = 16K]
3563                  *
3564                  * If the bio to read the compressed extent covers both ranges,
3565                  * it will decompress extent X into the pages belonging to the
3566                  * first range and then it will stop, zeroing out the remaining
3567                  * pages that belong to the other range that points to extent X.
3568                  * So here we make sure we submit 2 bios, one for the first
3569                  * range and another one for the third range. Both will target
3570                  * the same physical extent from disk, but we can't currently
3571                  * make the compressed bio endio callback populate the pages
3572                  * for both ranges because each compressed bio is tightly
3573                  * coupled with a single extent map, and each range can have
3574                  * an extent map with a different offset value relative to the
3575                  * uncompressed data of our extent and different lengths. This
3576                  * is a corner case so we prioritize correctness over
3577                  * non-optimal behavior (submitting 2 bios for the same extent).
3578                  */
3579                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3580                     prev_em_start && *prev_em_start != (u64)-1 &&
3581                     *prev_em_start != em->start)
3582                         force_bio_submit = true;
3583
3584                 if (prev_em_start)
3585                         *prev_em_start = em->start;
3586
3587                 free_extent_map(em);
3588                 em = NULL;
3589
3590                 /* we've found a hole, just zero and go on */
3591                 if (block_start == EXTENT_MAP_HOLE) {
3592                         struct extent_state *cached = NULL;
3593
3594                         memzero_page(page, pg_offset, iosize);
3595                         flush_dcache_page(page);
3596
3597                         set_extent_uptodate(tree, cur, cur + iosize - 1,
3598                                             &cached, GFP_NOFS);
3599                         unlock_extent_cached(tree, cur,
3600                                              cur + iosize - 1, &cached);
3601                         end_page_read(page, true, cur, iosize);
3602                         cur = cur + iosize;
3603                         pg_offset += iosize;
3604                         continue;
3605                 }
3606                 /* the get_extent function already copied into the page */
3607                 if (test_range_bit(tree, cur, cur_end,
3608                                    EXTENT_UPTODATE, 1, NULL)) {
3609                         check_page_uptodate(tree, page);
3610                         unlock_extent(tree, cur, cur + iosize - 1);
3611                         end_page_read(page, true, cur, iosize);
3612                         cur = cur + iosize;
3613                         pg_offset += iosize;
3614                         continue;
3615                 }
3616                 /* we have an inline extent but it didn't get marked up
3617                  * to date.  Error out
3618                  */
3619                 if (block_start == EXTENT_MAP_INLINE) {
3620                         unlock_extent(tree, cur, cur + iosize - 1);
3621                         end_page_read(page, false, cur, iosize);
3622                         cur = cur + iosize;
3623                         pg_offset += iosize;
3624                         continue;
3625                 }
3626
3627                 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
3628                                          bio_ctrl, page, disk_bytenr, iosize,
3629                                          pg_offset,
3630                                          end_bio_extent_readpage, 0,
3631                                          this_bio_flag,
3632                                          force_bio_submit);
3633                 if (!ret) {
3634                         nr++;
3635                 } else {
3636                         unlock_extent(tree, cur, cur + iosize - 1);
3637                         end_page_read(page, false, cur, iosize);
3638                         goto out;
3639                 }
3640                 cur = cur + iosize;
3641                 pg_offset += iosize;
3642         }
3643 out:
3644         return ret;
3645 }
3646
3647 static inline void contiguous_readpages(struct page *pages[], int nr_pages,
3648                                         u64 start, u64 end,
3649                                         struct extent_map **em_cached,
3650                                         struct btrfs_bio_ctrl *bio_ctrl,
3651                                         u64 *prev_em_start)
3652 {
3653         struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3654         int index;
3655
3656         btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3657
3658         for (index = 0; index < nr_pages; index++) {
3659                 btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
3660                                   REQ_RAHEAD, prev_em_start);
3661                 put_page(pages[index]);
3662         }
3663 }
3664
3665 static void update_nr_written(struct writeback_control *wbc,
3666                               unsigned long nr_written)
3667 {
3668         wbc->nr_to_write -= nr_written;
3669 }
3670
3671 /*
3672  * helper for __extent_writepage, doing all of the delayed allocation setup.
3673  *
3674  * This returns 1 if btrfs_run_delalloc_range function did all the work required
3675  * to write the page (copy into inline extent).  In this case the IO has
3676  * been started and the page is already unlocked.
3677  *
3678  * This returns 0 if all went well (page still locked)
3679  * This returns < 0 if there were errors (page still locked)
3680  */
3681 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
3682                 struct page *page, struct writeback_control *wbc,
3683                 u64 delalloc_start, unsigned long *nr_written)
3684 {
3685         u64 page_end = delalloc_start + PAGE_SIZE - 1;
3686         bool found;
3687         u64 delalloc_to_write = 0;
3688         u64 delalloc_end = 0;
3689         int ret;
3690         int page_started = 0;
3691
3692
3693         while (delalloc_end < page_end) {
3694                 found = find_lock_delalloc_range(&inode->vfs_inode, page,
3695                                                &delalloc_start,
3696                                                &delalloc_end);
3697                 if (!found) {
3698                         delalloc_start = delalloc_end + 1;
3699                         continue;
3700                 }
3701                 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3702                                 delalloc_end, &page_started, nr_written, wbc);
3703                 if (ret) {
3704                         SetPageError(page);
3705                         /*
3706                          * btrfs_run_delalloc_range should return < 0 for error
3707                          * but just in case, we use > 0 here meaning the IO is
3708                          * started, so we don't want to return > 0 unless
3709                          * things are going well.
3710                          */
3711                         return ret < 0 ? ret : -EIO;
3712                 }
3713                 /*
3714                  * delalloc_end is already one less than the total length, so
3715                  * we don't subtract one from PAGE_SIZE
3716                  */
3717                 delalloc_to_write += (delalloc_end - delalloc_start +
3718                                       PAGE_SIZE) >> PAGE_SHIFT;
3719                 delalloc_start = delalloc_end + 1;
3720         }
3721         if (wbc->nr_to_write < delalloc_to_write) {
3722                 int thresh = 8192;
3723
3724                 if (delalloc_to_write < thresh * 2)
3725                         thresh = delalloc_to_write;
3726                 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3727                                          thresh);
3728         }
3729
3730         /* did the fill delalloc function already unlock and start
3731          * the IO?
3732          */
3733         if (page_started) {
3734                 /*
3735                  * we've unlocked the page, so we can't update
3736                  * the mapping's writeback index, just update
3737                  * nr_to_write.
3738                  */
3739                 wbc->nr_to_write -= *nr_written;
3740                 return 1;
3741         }
3742
3743         return 0;
3744 }
3745
3746 /*
3747  * helper for __extent_writepage.  This calls the writepage start hooks,
3748  * and does the loop to map the page into extents and bios.
3749  *
3750  * We return 1 if the IO is started and the page is unlocked,
3751  * 0 if all went well (page still locked)
3752  * < 0 if there were errors (page still locked)
3753  */
3754 static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
3755                                  struct page *page,
3756                                  struct writeback_control *wbc,
3757                                  struct extent_page_data *epd,
3758                                  loff_t i_size,
3759                                  unsigned long nr_written,
3760                                  int *nr_ret)
3761 {
3762         struct btrfs_fs_info *fs_info = inode->root->fs_info;
3763         struct extent_io_tree *tree = &inode->io_tree;
3764         u64 start = page_offset(page);
3765         u64 end = start + PAGE_SIZE - 1;
3766         u64 cur = start;
3767         u64 extent_offset;
3768         u64 block_start;
3769         struct extent_map *em;
3770         int ret = 0;
3771         int nr = 0;
3772         u32 opf = REQ_OP_WRITE;
3773         const unsigned int write_flags = wbc_to_write_flags(wbc);
3774         bool compressed;
3775
3776         ret = btrfs_writepage_cow_fixup(page, start, end);
3777         if (ret) {
3778                 /* Fixup worker will requeue */
3779                 redirty_page_for_writepage(wbc, page);
3780                 update_nr_written(wbc, nr_written);
3781                 unlock_page(page);
3782                 return 1;
3783         }
3784
3785         /*
3786          * we don't want to touch the inode after unlocking the page,
3787          * so we update the mapping writeback index now
3788          */
3789         update_nr_written(wbc, nr_written + 1);
3790
3791         while (cur <= end) {
3792                 u64 disk_bytenr;
3793                 u64 em_end;
3794                 u32 iosize;
3795
3796                 if (cur >= i_size) {
3797                         btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
3798                         break;
3799                 }
3800                 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
3801                 if (IS_ERR_OR_NULL(em)) {
3802                         SetPageError(page);
3803                         ret = PTR_ERR_OR_ZERO(em);
3804                         break;
3805                 }
3806
3807                 extent_offset = cur - em->start;
3808                 em_end = extent_map_end(em);
3809                 ASSERT(cur <= em_end);
3810                 ASSERT(cur < end);
3811                 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
3812                 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
3813                 block_start = em->block_start;
3814                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3815                 disk_bytenr = em->block_start + extent_offset;
3816
3817                 /* Note that em_end from extent_map_end() is exclusive */
3818                 iosize = min(em_end, end + 1) - cur;
3819
3820                 if (btrfs_use_zone_append(inode, em->block_start))
3821                         opf = REQ_OP_ZONE_APPEND;
3822
3823                 free_extent_map(em);
3824                 em = NULL;
3825
3826                 /*
3827                  * compressed and inline extents are written through other
3828                  * paths in the FS
3829                  */
3830                 if (compressed || block_start == EXTENT_MAP_HOLE ||
3831                     block_start == EXTENT_MAP_INLINE) {
3832                         if (compressed)
3833                                 nr++;
3834                         else
3835                                 btrfs_writepage_endio_finish_ordered(page, cur,
3836                                                         cur + iosize - 1, 1);
3837                         cur += iosize;
3838                         continue;
3839                 }
3840
3841                 btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
3842                 if (!PageWriteback(page)) {
3843                         btrfs_err(inode->root->fs_info,
3844                                    "page %lu not writeback, cur %llu end %llu",
3845                                page->index, cur, end);
3846                 }
3847
3848                 ret = submit_extent_page(opf | write_flags, wbc,
3849                                          &epd->bio_ctrl, page,
3850                                          disk_bytenr, iosize,
3851                                          cur - page_offset(page),
3852                                          end_bio_extent_writepage,
3853                                          0, 0, false);
3854                 if (ret) {
3855                         SetPageError(page);
3856                         if (PageWriteback(page))
3857                                 end_page_writeback(page);
3858                 }
3859
3860                 cur += iosize;
3861                 nr++;
3862         }
3863         *nr_ret = nr;
3864         return ret;
3865 }
3866
3867 /*
3868  * the writepage semantics are similar to regular writepage.  extent
3869  * records are inserted to lock ranges in the tree, and as dirty areas
3870  * are found, they are marked writeback.  Then the lock bits are removed
3871  * and the end_io handler clears the writeback ranges
3872  *
3873  * Return 0 if everything goes well.
3874  * Return <0 for error.
3875  */
3876 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3877                               struct extent_page_data *epd)
3878 {
3879         struct inode *inode = page->mapping->host;
3880         u64 start = page_offset(page);
3881         u64 page_end = start + PAGE_SIZE - 1;
3882         int ret;
3883         int nr = 0;
3884         size_t pg_offset;
3885         loff_t i_size = i_size_read(inode);
3886         unsigned long end_index = i_size >> PAGE_SHIFT;
3887         unsigned long nr_written = 0;
3888
3889         trace___extent_writepage(page, inode, wbc);
3890
3891         WARN_ON(!PageLocked(page));
3892
3893         ClearPageError(page);
3894
3895         pg_offset = offset_in_page(i_size);
3896         if (page->index > end_index ||
3897            (page->index == end_index && !pg_offset)) {
3898                 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
3899                 unlock_page(page);
3900                 return 0;
3901         }
3902
3903         if (page->index == end_index) {
3904                 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
3905                 flush_dcache_page(page);
3906         }
3907
3908         ret = set_page_extent_mapped(page);
3909         if (ret < 0) {
3910                 SetPageError(page);
3911                 goto done;
3912         }
3913
3914         if (!epd->extent_locked) {
3915                 ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
3916                                          &nr_written);
3917                 if (ret == 1)
3918                         return 0;
3919                 if (ret)
3920                         goto done;
3921         }
3922
3923         ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
3924                                     nr_written, &nr);
3925         if (ret == 1)
3926                 return 0;
3927
3928 done:
3929         if (nr == 0) {
3930                 /* make sure the mapping tag for page dirty gets cleared */
3931                 set_page_writeback(page);
3932                 end_page_writeback(page);
3933         }
3934         if (PageError(page)) {
3935                 ret = ret < 0 ? ret : -EIO;
3936                 end_extent_writepage(page, ret, start, page_end);
3937         }
3938         unlock_page(page);
3939         ASSERT(ret <= 0);
3940         return ret;
3941 }
3942
3943 void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3944 {
3945         wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3946                        TASK_UNINTERRUPTIBLE);
3947 }
3948
3949 static void end_extent_buffer_writeback(struct extent_buffer *eb)
3950 {
3951         clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3952         smp_mb__after_atomic();
3953         wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3954 }
3955
3956 /*
3957  * Lock extent buffer status and pages for writeback.
3958  *
3959  * May try to flush write bio if we can't get the lock.
3960  *
3961  * Return  0 if the extent buffer doesn't need to be submitted.
3962  *           (E.g. the extent buffer is not dirty)
3963  * Return >0 is the extent buffer is submitted to bio.
3964  * Return <0 if something went wrong, no page is locked.
3965  */
3966 static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
3967                           struct extent_page_data *epd)
3968 {
3969         struct btrfs_fs_info *fs_info = eb->fs_info;
3970         int i, num_pages, failed_page_nr;
3971         int flush = 0;
3972         int ret = 0;
3973
3974         if (!btrfs_try_tree_write_lock(eb)) {
3975                 ret = flush_write_bio(epd);
3976                 if (ret < 0)
3977                         return ret;
3978                 flush = 1;
3979                 btrfs_tree_lock(eb);
3980         }
3981
3982         if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3983                 btrfs_tree_unlock(eb);
3984                 if (!epd->sync_io)
3985                         return 0;
3986                 if (!flush) {
3987                         ret = flush_write_bio(epd);
3988                         if (ret < 0)
3989                                 return ret;
3990                         flush = 1;
3991                 }
3992                 while (1) {
3993                         wait_on_extent_buffer_writeback(eb);
3994                         btrfs_tree_lock(eb);
3995                         if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3996                                 break;
3997                         btrfs_tree_unlock(eb);
3998                 }
3999         }
4000
4001         /*
4002          * We need to do this to prevent races in people who check if the eb is
4003          * under IO since we can end up having no IO bits set for a short period
4004          * of time.
4005          */
4006         spin_lock(&eb->refs_lock);
4007         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4008                 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
4009                 spin_unlock(&eb->refs_lock);
4010                 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
4011                 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4012                                          -eb->len,
4013                                          fs_info->dirty_metadata_batch);
4014                 ret = 1;
4015         } else {
4016                 spin_unlock(&eb->refs_lock);
4017         }
4018
4019         btrfs_tree_unlock(eb);
4020
4021         /*
4022          * Either we don't need to submit any tree block, or we're submitting
4023          * subpage eb.
4024          * Subpage metadata doesn't use page locking at all, so we can skip
4025          * the page locking.
4026          */
4027         if (!ret || fs_info->sectorsize < PAGE_SIZE)
4028                 return ret;
4029
4030         num_pages = num_extent_pages(eb);
4031         for (i = 0; i < num_pages; i++) {
4032                 struct page *p = eb->pages[i];
4033
4034                 if (!trylock_page(p)) {
4035                         if (!flush) {
4036                                 int err;
4037
4038                                 err = flush_write_bio(epd);
4039                                 if (err < 0) {
4040                                         ret = err;
4041                                         failed_page_nr = i;
4042                                         goto err_unlock;
4043                                 }
4044                                 flush = 1;
4045                         }
4046                         lock_page(p);
4047                 }
4048         }
4049
4050         return ret;
4051 err_unlock:
4052         /* Unlock already locked pages */
4053         for (i = 0; i < failed_page_nr; i++)
4054                 unlock_page(eb->pages[i]);
4055         /*
4056          * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
4057          * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
4058          * be made and undo everything done before.
4059          */
4060         btrfs_tree_lock(eb);
4061         spin_lock(&eb->refs_lock);
4062         set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4063         end_extent_buffer_writeback(eb);
4064         spin_unlock(&eb->refs_lock);
4065         percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
4066                                  fs_info->dirty_metadata_batch);
4067         btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
4068         btrfs_tree_unlock(eb);
4069         return ret;
4070 }
4071
4072 static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
4073 {
4074         struct btrfs_fs_info *fs_info = eb->fs_info;
4075
4076         btrfs_page_set_error(fs_info, page, eb->start, eb->len);
4077         if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4078                 return;
4079
4080         /*
4081          * If we error out, we should add back the dirty_metadata_bytes
4082          * to make it consistent.
4083          */
4084         percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4085                                  eb->len, fs_info->dirty_metadata_batch);
4086
4087         /*
4088          * If writeback for a btree extent that doesn't belong to a log tree
4089          * failed, increment the counter transaction->eb_write_errors.
4090          * We do this because while the transaction is running and before it's
4091          * committing (when we call filemap_fdata[write|wait]_range against
4092          * the btree inode), we might have
4093          * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
4094          * returns an error or an error happens during writeback, when we're
4095          * committing the transaction we wouldn't know about it, since the pages
4096          * can be no longer dirty nor marked anymore for writeback (if a
4097          * subsequent modification to the extent buffer didn't happen before the
4098          * transaction commit), which makes filemap_fdata[write|wait]_range not
4099          * able to find the pages tagged with SetPageError at transaction
4100          * commit time. So if this happens we must abort the transaction,
4101          * otherwise we commit a super block with btree roots that point to
4102          * btree nodes/leafs whose content on disk is invalid - either garbage
4103          * or the content of some node/leaf from a past generation that got
4104          * cowed or deleted and is no longer valid.
4105          *
4106          * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
4107          * not be enough - we need to distinguish between log tree extents vs
4108          * non-log tree extents, and the next filemap_fdatawait_range() call
4109          * will catch and clear such errors in the mapping - and that call might
4110          * be from a log sync and not from a transaction commit. Also, checking
4111          * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
4112          * not done and would not be reliable - the eb might have been released
4113          * from memory and reading it back again means that flag would not be
4114          * set (since it's a runtime flag, not persisted on disk).
4115          *
4116          * Using the flags below in the btree inode also makes us achieve the
4117          * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
4118          * writeback for all dirty pages and before filemap_fdatawait_range()
4119          * is called, the writeback for all dirty pages had already finished
4120          * with errors - because we were not using AS_EIO/AS_ENOSPC,
4121          * filemap_fdatawait_range() would return success, as it could not know
4122          * that writeback errors happened (the pages were no longer tagged for
4123          * writeback).
4124          */
4125         switch (eb->log_index) {
4126         case -1:
4127                 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
4128                 break;
4129         case 0:
4130                 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
4131                 break;
4132         case 1:
4133                 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
4134                 break;
4135         default:
4136                 BUG(); /* unexpected, logic error */
4137         }
4138 }
4139
4140 /*
4141  * The endio specific version which won't touch any unsafe spinlock in endio
4142  * context.
4143  */
4144 static struct extent_buffer *find_extent_buffer_nolock(
4145                 struct btrfs_fs_info *fs_info, u64 start)
4146 {
4147         struct extent_buffer *eb;
4148
4149         rcu_read_lock();
4150         eb = radix_tree_lookup(&fs_info->buffer_radix,
4151                                start >> fs_info->sectorsize_bits);
4152         if (eb && atomic_inc_not_zero(&eb->refs)) {
4153                 rcu_read_unlock();
4154                 return eb;
4155         }
4156         rcu_read_unlock();
4157         return NULL;
4158 }
4159
4160 /*
4161  * The endio function for subpage extent buffer write.
4162  *
4163  * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
4164  * after all extent buffers in the page has finished their writeback.
4165  */
4166 static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
4167                                          struct bio *bio)
4168 {
4169         struct bio_vec *bvec;
4170         struct bvec_iter_all iter_all;
4171
4172         ASSERT(!bio_flagged(bio, BIO_CLONED));
4173         bio_for_each_segment_all(bvec, bio, iter_all) {
4174                 struct page *page = bvec->bv_page;
4175                 u64 bvec_start = page_offset(page) + bvec->bv_offset;
4176                 u64 bvec_end = bvec_start + bvec->bv_len - 1;
4177                 u64 cur_bytenr = bvec_start;
4178
4179                 ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
4180
4181                 /* Iterate through all extent buffers in the range */
4182                 while (cur_bytenr <= bvec_end) {
4183                         struct extent_buffer *eb;
4184                         int done;
4185
4186                         /*
4187                          * Here we can't use find_extent_buffer(), as it may
4188                          * try to lock eb->refs_lock, which is not safe in endio
4189                          * context.
4190                          */
4191                         eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
4192                         ASSERT(eb);
4193
4194                         cur_bytenr = eb->start + eb->len;
4195
4196                         ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
4197                         done = atomic_dec_and_test(&eb->io_pages);
4198                         ASSERT(done);
4199
4200                         if (bio->bi_status ||
4201                             test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4202                                 ClearPageUptodate(page);
4203                                 set_btree_ioerr(page, eb);
4204                         }
4205
4206                         btrfs_subpage_clear_writeback(fs_info, page, eb->start,
4207                                                       eb->len);
4208                         end_extent_buffer_writeback(eb);
4209                         /*
4210                          * free_extent_buffer() will grab spinlock which is not
4211                          * safe in endio context. Thus here we manually dec
4212                          * the ref.
4213                          */
4214                         atomic_dec(&eb->refs);
4215                 }
4216         }
4217         bio_put(bio);
4218 }
4219
4220 static void end_bio_extent_buffer_writepage(struct bio *bio)
4221 {
4222         struct btrfs_fs_info *fs_info;
4223         struct bio_vec *bvec;
4224         struct extent_buffer *eb;
4225         int done;
4226         struct bvec_iter_all iter_all;
4227
4228         fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
4229         if (fs_info->sectorsize < PAGE_SIZE)
4230                 return end_bio_subpage_eb_writepage(fs_info, bio);
4231
4232         ASSERT(!bio_flagged(bio, BIO_CLONED));
4233         bio_for_each_segment_all(bvec, bio, iter_all) {
4234                 struct page *page = bvec->bv_page;
4235
4236                 eb = (struct extent_buffer *)page->private;
4237                 BUG_ON(!eb);
4238                 done = atomic_dec_and_test(&eb->io_pages);
4239
4240                 if (bio->bi_status ||
4241                     test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4242                         ClearPageUptodate(page);
4243                         set_btree_ioerr(page, eb);
4244                 }
4245
4246                 end_page_writeback(page);
4247
4248                 if (!done)
4249                         continue;
4250
4251                 end_extent_buffer_writeback(eb);
4252         }
4253
4254         bio_put(bio);
4255 }
4256
4257 /*
4258  * Unlike the work in write_one_eb(), we rely completely on extent locking.
4259  * Page locking is only utilized at minimum to keep the VMM code happy.
4260  *
4261  * Caller should still call write_one_eb() other than this function directly.
4262  * As write_one_eb() has extra preparation before submitting the extent buffer.
4263  */
4264 static int write_one_subpage_eb(struct extent_buffer *eb,
4265                                 struct writeback_control *wbc,
4266                                 struct extent_page_data *epd)
4267 {
4268         struct btrfs_fs_info *fs_info = eb->fs_info;
4269         struct page *page = eb->pages[0];
4270         unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
4271         bool no_dirty_ebs = false;
4272         int ret;
4273
4274         /* clear_page_dirty_for_io() in subpage helper needs page locked */
4275         lock_page(page);
4276         btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
4277
4278         /* Check if this is the last dirty bit to update nr_written */
4279         no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
4280                                                           eb->start, eb->len);
4281         if (no_dirty_ebs)
4282                 clear_page_dirty_for_io(page);
4283
4284         ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4285                         &epd->bio_ctrl, page, eb->start, eb->len,
4286                         eb->start - page_offset(page),
4287                         end_bio_extent_buffer_writepage, 0, 0, false);
4288         if (ret) {
4289                 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
4290                 set_btree_ioerr(page, eb);
4291                 unlock_page(page);
4292
4293                 if (atomic_dec_and_test(&eb->io_pages))
4294                         end_extent_buffer_writeback(eb);
4295                 return -EIO;
4296         }
4297         unlock_page(page);
4298         /*
4299          * Submission finished without problem, if no range of the page is
4300          * dirty anymore, we have submitted a page.  Update nr_written in wbc.
4301          */
4302         if (no_dirty_ebs)
4303                 update_nr_written(wbc, 1);
4304         return ret;
4305 }
4306
4307 static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
4308                         struct writeback_control *wbc,
4309                         struct extent_page_data *epd)
4310 {
4311         u64 disk_bytenr = eb->start;
4312         u32 nritems;
4313         int i, num_pages;
4314         unsigned long start, end;
4315         unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
4316         int ret = 0;
4317
4318         clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
4319         num_pages = num_extent_pages(eb);
4320         atomic_set(&eb->io_pages, num_pages);
4321
4322         /* set btree blocks beyond nritems with 0 to avoid stale content. */
4323         nritems = btrfs_header_nritems(eb);
4324         if (btrfs_header_level(eb) > 0) {
4325                 end = btrfs_node_key_ptr_offset(nritems);
4326
4327                 memzero_extent_buffer(eb, end, eb->len - end);
4328         } else {
4329                 /*
4330                  * leaf:
4331                  * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
4332                  */
4333                 start = btrfs_item_nr_offset(nritems);
4334                 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
4335                 memzero_extent_buffer(eb, start, end - start);
4336         }
4337
4338         if (eb->fs_info->sectorsize < PAGE_SIZE)
4339                 return write_one_subpage_eb(eb, wbc, epd);
4340
4341         for (i = 0; i < num_pages; i++) {
4342                 struct page *p = eb->pages[i];
4343
4344                 clear_page_dirty_for_io(p);
4345                 set_page_writeback(p);
4346                 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4347                                          &epd->bio_ctrl, p, disk_bytenr,
4348                                          PAGE_SIZE, 0,
4349                                          end_bio_extent_buffer_writepage,
4350                                          0, 0, false);
4351                 if (ret) {
4352                         set_btree_ioerr(p, eb);
4353                         if (PageWriteback(p))
4354                                 end_page_writeback(p);
4355                         if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4356                                 end_extent_buffer_writeback(eb);
4357                         ret = -EIO;
4358                         break;
4359                 }
4360                 disk_bytenr += PAGE_SIZE;
4361                 update_nr_written(wbc, 1);
4362                 unlock_page(p);
4363         }
4364
4365         if (unlikely(ret)) {
4366                 for (; i < num_pages; i++) {
4367                         struct page *p = eb->pages[i];
4368                         clear_page_dirty_for_io(p);
4369                         unlock_page(p);
4370                 }
4371         }
4372
4373         return ret;
4374 }
4375
4376 /*
4377  * Submit one subpage btree page.
4378  *
4379  * The main difference to submit_eb_page() is:
4380  * - Page locking
4381  *   For subpage, we don't rely on page locking at all.
4382  *
4383  * - Flush write bio
4384  *   We only flush bio if we may be unable to fit current extent buffers into
4385  *   current bio.
4386  *
4387  * Return >=0 for the number of submitted extent buffers.
4388  * Return <0 for fatal error.
4389  */
4390 static int submit_eb_subpage(struct page *page,
4391                              struct writeback_control *wbc,
4392                              struct extent_page_data *epd)
4393 {
4394         struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
4395         int submitted = 0;
4396         u64 page_start = page_offset(page);
4397         int bit_start = 0;
4398         const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
4399         int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
4400         int ret;
4401
4402         /* Lock and write each dirty extent buffers in the range */
4403         while (bit_start < nbits) {
4404                 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
4405                 struct extent_buffer *eb;
4406                 unsigned long flags;
4407                 u64 start;
4408
4409                 /*
4410                  * Take private lock to ensure the subpage won't be detached
4411                  * in the meantime.
4412                  */
4413                 spin_lock(&page->mapping->private_lock);
4414                 if (!PagePrivate(page)) {
4415                         spin_unlock(&page->mapping->private_lock);
4416                         break;
4417                 }
4418                 spin_lock_irqsave(&subpage->lock, flags);
4419                 if (!((1 << bit_start) & subpage->dirty_bitmap)) {
4420                         spin_unlock_irqrestore(&subpage->lock, flags);
4421                         spin_unlock(&page->mapping->private_lock);
4422                         bit_start++;
4423                         continue;
4424                 }
4425
4426                 start = page_start + bit_start * fs_info->sectorsize;
4427                 bit_start += sectors_per_node;
4428
4429                 /*
4430                  * Here we just want to grab the eb without touching extra
4431                  * spin locks, so call find_extent_buffer_nolock().
4432                  */
4433                 eb = find_extent_buffer_nolock(fs_info, start);
4434                 spin_unlock_irqrestore(&subpage->lock, flags);
4435                 spin_unlock(&page->mapping->private_lock);
4436
4437                 /*
4438                  * The eb has already reached 0 refs thus find_extent_buffer()
4439                  * doesn't return it. We don't need to write back such eb
4440                  * anyway.
4441                  */
4442                 if (!eb)
4443                         continue;
4444
4445                 ret = lock_extent_buffer_for_io(eb, epd);
4446                 if (ret == 0) {
4447                         free_extent_buffer(eb);
4448                         continue;
4449                 }
4450                 if (ret < 0) {
4451                         free_extent_buffer(eb);
4452                         goto cleanup;
4453                 }
4454                 ret = write_one_eb(eb, wbc, epd);
4455                 free_extent_buffer(eb);
4456                 if (ret < 0)
4457                         goto cleanup;
4458                 submitted++;
4459         }
4460         return submitted;
4461
4462 cleanup:
4463         /* We hit error, end bio for the submitted extent buffers */
4464         end_write_bio(epd, ret);
4465         return ret;
4466 }
4467
4468 /*
4469  * Submit all page(s) of one extent buffer.
4470  *
4471  * @page:       the page of one extent buffer
4472  * @eb_context: to determine if we need to submit this page, if current page
4473  *              belongs to this eb, we don't need to submit
4474  *
4475  * The caller should pass each page in their bytenr order, and here we use
4476  * @eb_context to determine if we have submitted pages of one extent buffer.
4477  *
4478  * If we have, we just skip until we hit a new page that doesn't belong to
4479  * current @eb_context.
4480  *
4481  * If not, we submit all the page(s) of the extent buffer.
4482  *
4483  * Return >0 if we have submitted the extent buffer successfully.
4484  * Return 0 if we don't need to submit the page, as it's already submitted by
4485  * previous call.
4486  * Return <0 for fatal error.
4487  */
4488 static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4489                           struct extent_page_data *epd,
4490                           struct extent_buffer **eb_context)
4491 {
4492         struct address_space *mapping = page->mapping;
4493         struct btrfs_block_group *cache = NULL;
4494         struct extent_buffer *eb;
4495         int ret;
4496
4497         if (!PagePrivate(page))
4498                 return 0;
4499
4500         if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
4501                 return submit_eb_subpage(page, wbc, epd);
4502
4503         spin_lock(&mapping->private_lock);
4504         if (!PagePrivate(page)) {
4505                 spin_unlock(&mapping->private_lock);
4506                 return 0;
4507         }
4508
4509         eb = (struct extent_buffer *)page->private;
4510
4511         /*
4512          * Shouldn't happen and normally this would be a BUG_ON but no point
4513          * crashing the machine for something we can survive anyway.
4514          */
4515         if (WARN_ON(!eb)) {
4516                 spin_unlock(&mapping->private_lock);
4517                 return 0;
4518         }
4519
4520         if (eb == *eb_context) {
4521                 spin_unlock(&mapping->private_lock);
4522                 return 0;
4523         }
4524         ret = atomic_inc_not_zero(&eb->refs);
4525         spin_unlock(&mapping->private_lock);
4526         if (!ret)
4527                 return 0;
4528
4529         if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4530                 /*
4531                  * If for_sync, this hole will be filled with
4532                  * trasnsaction commit.
4533                  */
4534                 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4535                         ret = -EAGAIN;
4536                 else
4537                         ret = 0;
4538                 free_extent_buffer(eb);
4539                 return ret;
4540         }
4541
4542         *eb_context = eb;
4543
4544         ret = lock_extent_buffer_for_io(eb, epd);
4545         if (ret <= 0) {
4546                 btrfs_revert_meta_write_pointer(cache, eb);
4547                 if (cache)
4548                         btrfs_put_block_group(cache);
4549                 free_extent_buffer(eb);
4550                 return ret;
4551         }
4552         if (cache)
4553                 btrfs_put_block_group(cache);
4554         ret = write_one_eb(eb, wbc, epd);
4555         free_extent_buffer(eb);
4556         if (ret < 0)
4557                 return ret;
4558         return 1;
4559 }
4560
4561 int btree_write_cache_pages(struct address_space *mapping,
4562                                    struct writeback_control *wbc)
4563 {
4564         struct extent_buffer *eb_context = NULL;
4565         struct extent_page_data epd = {
4566                 .bio_ctrl = { 0 },
4567                 .extent_locked = 0,
4568                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4569         };
4570         struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
4571         int ret = 0;
4572         int done = 0;
4573         int nr_to_write_done = 0;
4574         struct pagevec pvec;
4575         int nr_pages;
4576         pgoff_t index;
4577         pgoff_t end;            /* Inclusive */
4578         int scanned = 0;
4579         xa_mark_t tag;
4580
4581         pagevec_init(&pvec);
4582         if (wbc->range_cyclic) {
4583                 index = mapping->writeback_index; /* Start from prev offset */
4584                 end = -1;
4585                 /*
4586                  * Start from the beginning does not need to cycle over the
4587                  * range, mark it as scanned.
4588                  */
4589                 scanned = (index == 0);
4590         } else {
4591                 index = wbc->range_start >> PAGE_SHIFT;
4592                 end = wbc->range_end >> PAGE_SHIFT;
4593                 scanned = 1;
4594         }
4595         if (wbc->sync_mode == WB_SYNC_ALL)
4596                 tag = PAGECACHE_TAG_TOWRITE;
4597         else
4598                 tag = PAGECACHE_TAG_DIRTY;
4599         btrfs_zoned_meta_io_lock(fs_info);
4600 retry:
4601         if (wbc->sync_mode == WB_SYNC_ALL)
4602                 tag_pages_for_writeback(mapping, index, end);
4603         while (!done && !nr_to_write_done && (index <= end) &&
4604                (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
4605                         tag))) {
4606                 unsigned i;
4607
4608                 for (i = 0; i < nr_pages; i++) {
4609                         struct page *page = pvec.pages[i];
4610
4611                         ret = submit_eb_page(page, wbc, &epd, &eb_context);
4612                         if (ret == 0)
4613                                 continue;
4614                         if (ret < 0) {
4615                                 done = 1;
4616                                 break;
4617                         }
4618
4619                         /*
4620                          * the filesystem may choose to bump up nr_to_write.
4621                          * We have to make sure to honor the new nr_to_write
4622                          * at any time
4623                          */
4624                         nr_to_write_done = wbc->nr_to_write <= 0;
4625                 }
4626                 pagevec_release(&pvec);
4627                 cond_resched();
4628         }
4629         if (!scanned && !done) {
4630                 /*
4631                  * We hit the last page and there is more work to be done: wrap
4632                  * back to the start of the file
4633                  */
4634                 scanned = 1;
4635                 index = 0;
4636                 goto retry;
4637         }
4638         if (ret < 0) {
4639                 end_write_bio(&epd, ret);
4640                 goto out;
4641         }
4642         /*
4643          * If something went wrong, don't allow any metadata write bio to be
4644          * submitted.
4645          *
4646          * This would prevent use-after-free if we had dirty pages not
4647          * cleaned up, which can still happen by fuzzed images.
4648          *
4649          * - Bad extent tree
4650          *   Allowing existing tree block to be allocated for other trees.
4651          *
4652          * - Log tree operations
4653          *   Exiting tree blocks get allocated to log tree, bumps its
4654          *   generation, then get cleaned in tree re-balance.
4655          *   Such tree block will not be written back, since it's clean,
4656          *   thus no WRITTEN flag set.
4657          *   And after log writes back, this tree block is not traced by
4658          *   any dirty extent_io_tree.
4659          *
4660          * - Offending tree block gets re-dirtied from its original owner
4661          *   Since it has bumped generation, no WRITTEN flag, it can be
4662          *   reused without COWing. This tree block will not be traced
4663          *   by btrfs_transaction::dirty_pages.
4664          *
4665          *   Now such dirty tree block will not be cleaned by any dirty
4666          *   extent io tree. Thus we don't want to submit such wild eb
4667          *   if the fs already has error.
4668          */
4669         if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
4670                 ret = flush_write_bio(&epd);
4671         } else {
4672                 ret = -EROFS;
4673                 end_write_bio(&epd, ret);
4674         }
4675 out:
4676         btrfs_zoned_meta_io_unlock(fs_info);
4677         return ret;
4678 }
4679
4680 /**
4681  * Walk the list of dirty pages of the given address space and write all of them.
4682  *
4683  * @mapping: address space structure to write
4684  * @wbc:     subtract the number of written pages from *@wbc->nr_to_write
4685  * @epd:     holds context for the write, namely the bio
4686  *
4687  * If a page is already under I/O, write_cache_pages() skips it, even
4688  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
4689  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
4690  * and msync() need to guarantee that all the data which was dirty at the time
4691  * the call was made get new I/O started against them.  If wbc->sync_mode is
4692  * WB_SYNC_ALL then we were called for data integrity and we must wait for
4693  * existing IO to complete.
4694  */
4695 static int extent_write_cache_pages(struct address_space *mapping,
4696                              struct writeback_control *wbc,
4697                              struct extent_page_data *epd)
4698 {
4699         struct inode *inode = mapping->host;
4700         int ret = 0;
4701         int done = 0;
4702         int nr_to_write_done = 0;
4703         struct pagevec pvec;
4704         int nr_pages;
4705         pgoff_t index;
4706         pgoff_t end;            /* Inclusive */
4707         pgoff_t done_index;
4708         int range_whole = 0;
4709         int scanned = 0;
4710         xa_mark_t tag;
4711
4712         /*
4713          * We have to hold onto the inode so that ordered extents can do their
4714          * work when the IO finishes.  The alternative to this is failing to add
4715          * an ordered extent if the igrab() fails there and that is a huge pain
4716          * to deal with, so instead just hold onto the inode throughout the
4717          * writepages operation.  If it fails here we are freeing up the inode
4718          * anyway and we'd rather not waste our time writing out stuff that is
4719          * going to be truncated anyway.
4720          */
4721         if (!igrab(inode))
4722                 return 0;
4723
4724         pagevec_init(&pvec);
4725         if (wbc->range_cyclic) {
4726                 index = mapping->writeback_index; /* Start from prev offset */
4727                 end = -1;
4728                 /*
4729                  * Start from the beginning does not need to cycle over the
4730                  * range, mark it as scanned.
4731                  */
4732                 scanned = (index == 0);
4733         } else {
4734                 index = wbc->range_start >> PAGE_SHIFT;
4735                 end = wbc->range_end >> PAGE_SHIFT;
4736                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4737                         range_whole = 1;
4738                 scanned = 1;
4739         }
4740
4741         /*
4742          * We do the tagged writepage as long as the snapshot flush bit is set
4743          * and we are the first one who do the filemap_flush() on this inode.
4744          *
4745          * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4746          * not race in and drop the bit.
4747          */
4748         if (range_whole && wbc->nr_to_write == LONG_MAX &&
4749             test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4750                                &BTRFS_I(inode)->runtime_flags))
4751                 wbc->tagged_writepages = 1;
4752
4753         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
4754                 tag = PAGECACHE_TAG_TOWRITE;
4755         else
4756                 tag = PAGECACHE_TAG_DIRTY;
4757 retry:
4758         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
4759                 tag_pages_for_writeback(mapping, index, end);
4760         done_index = index;
4761         while (!done && !nr_to_write_done && (index <= end) &&
4762                         (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4763                                                 &index, end, tag))) {
4764                 unsigned i;
4765
4766                 for (i = 0; i < nr_pages; i++) {
4767                         struct page *page = pvec.pages[i];
4768
4769                         done_index = page->index + 1;
4770                         /*
4771                          * At this point we hold neither the i_pages lock nor
4772                          * the page lock: the page may be truncated or
4773                          * invalidated (changing page->mapping to NULL),
4774                          * or even swizzled back from swapper_space to
4775                          * tmpfs file mapping
4776                          */
4777                         if (!trylock_page(page)) {
4778                                 ret = flush_write_bio(epd);
4779                                 BUG_ON(ret < 0);
4780                                 lock_page(page);
4781                         }
4782
4783                         if (unlikely(page->mapping != mapping)) {
4784                                 unlock_page(page);
4785                                 continue;
4786                         }
4787
4788                         if (wbc->sync_mode != WB_SYNC_NONE) {
4789                                 if (PageWriteback(page)) {
4790                                         ret = flush_write_bio(epd);
4791                                         BUG_ON(ret < 0);
4792                                 }
4793                                 wait_on_page_writeback(page);
4794                         }
4795
4796                         if (PageWriteback(page) ||
4797                             !clear_page_dirty_for_io(page)) {
4798                                 unlock_page(page);
4799                                 continue;
4800                         }
4801
4802                         ret = __extent_writepage(page, wbc, epd);
4803                         if (ret < 0) {
4804                                 done = 1;
4805                                 break;
4806                         }
4807
4808                         /*
4809                          * the filesystem may choose to bump up nr_to_write.
4810                          * We have to make sure to honor the new nr_to_write
4811                          * at any time
4812                          */
4813                         nr_to_write_done = wbc->nr_to_write <= 0;
4814                 }
4815                 pagevec_release(&pvec);
4816                 cond_resched();
4817         }
4818         if (!scanned && !done) {
4819                 /*
4820                  * We hit the last page and there is more work to be done: wrap
4821                  * back to the start of the file
4822                  */
4823                 scanned = 1;
4824                 index = 0;
4825
4826                 /*
4827                  * If we're looping we could run into a page that is locked by a
4828                  * writer and that writer could be waiting on writeback for a
4829                  * page in our current bio, and thus deadlock, so flush the
4830                  * write bio here.
4831                  */
4832                 ret = flush_write_bio(epd);
4833                 if (!ret)
4834                         goto retry;
4835         }
4836
4837         if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4838                 mapping->writeback_index = done_index;
4839
4840         btrfs_add_delayed_iput(inode);
4841         return ret;
4842 }
4843
4844 int extent_write_full_page(struct page *page, struct writeback_control *wbc)
4845 {
4846         int ret;
4847         struct extent_page_data epd = {
4848                 .bio_ctrl = { 0 },
4849                 .extent_locked = 0,
4850                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4851         };
4852
4853         ret = __extent_writepage(page, wbc, &epd);
4854         ASSERT(ret <= 0);
4855         if (ret < 0) {
4856                 end_write_bio(&epd, ret);
4857                 return ret;
4858         }
4859
4860         ret = flush_write_bio(&epd);
4861         ASSERT(ret <= 0);
4862         return ret;
4863 }
4864
4865 int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
4866                               int mode)
4867 {
4868         int ret = 0;
4869         struct address_space *mapping = inode->i_mapping;
4870         struct page *page;
4871         unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4872                 PAGE_SHIFT;
4873
4874         struct extent_page_data epd = {
4875                 .bio_ctrl = { 0 },
4876                 .extent_locked = 1,
4877                 .sync_io = mode == WB_SYNC_ALL,
4878         };
4879         struct writeback_control wbc_writepages = {
4880                 .sync_mode      = mode,
4881                 .nr_to_write    = nr_pages * 2,
4882                 .range_start    = start,
4883                 .range_end      = end + 1,
4884                 /* We're called from an async helper function */
4885                 .punt_to_cgroup = 1,
4886                 .no_cgroup_owner = 1,
4887         };
4888
4889         wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
4890         while (start <= end) {
4891                 page = find_get_page(mapping, start >> PAGE_SHIFT);
4892                 if (clear_page_dirty_for_io(page))
4893                         ret = __extent_writepage(page, &wbc_writepages, &epd);
4894                 else {
4895                         btrfs_writepage_endio_finish_ordered(page, start,
4896                                                     start + PAGE_SIZE - 1, 1);
4897                         unlock_page(page);
4898                 }
4899                 put_page(page);
4900                 start += PAGE_SIZE;
4901         }
4902
4903         ASSERT(ret <= 0);
4904         if (ret == 0)
4905                 ret = flush_write_bio(&epd);
4906         else
4907                 end_write_bio(&epd, ret);
4908
4909         wbc_detach_inode(&wbc_writepages);
4910         return ret;
4911 }
4912
4913 int extent_writepages(struct address_space *mapping,
4914                       struct writeback_control *wbc)
4915 {
4916         int ret = 0;
4917         struct extent_page_data epd = {
4918                 .bio_ctrl = { 0 },
4919                 .extent_locked = 0,
4920                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4921         };
4922
4923         ret = extent_write_cache_pages(mapping, wbc, &epd);
4924         ASSERT(ret <= 0);
4925         if (ret < 0) {
4926                 end_write_bio(&epd, ret);
4927                 return ret;
4928         }
4929         ret = flush_write_bio(&epd);
4930         return ret;
4931 }
4932
4933 void extent_readahead(struct readahead_control *rac)
4934 {
4935         struct btrfs_bio_ctrl bio_ctrl = { 0 };
4936         struct page *pagepool[16];
4937         struct extent_map *em_cached = NULL;
4938         u64 prev_em_start = (u64)-1;
4939         int nr;
4940
4941         while ((nr = readahead_page_batch(rac, pagepool))) {
4942                 u64 contig_start = readahead_pos(rac);
4943                 u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
4944
4945                 contiguous_readpages(pagepool, nr, contig_start, contig_end,
4946                                 &em_cached, &bio_ctrl, &prev_em_start);
4947         }
4948
4949         if (em_cached)
4950                 free_extent_map(em_cached);
4951
4952         if (bio_ctrl.bio) {
4953                 if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
4954                         return;
4955         }
4956 }
4957
4958 /*
4959  * basic invalidatepage code, this waits on any locked or writeback
4960  * ranges corresponding to the page, and then deletes any extent state
4961  * records from the tree
4962  */
4963 int extent_invalidatepage(struct extent_io_tree *tree,
4964                           struct page *page, unsigned long offset)
4965 {
4966         struct extent_state *cached_state = NULL;
4967         u64 start = page_offset(page);
4968         u64 end = start + PAGE_SIZE - 1;
4969         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4970
4971         /* This function is only called for the btree inode */
4972         ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
4973
4974         start += ALIGN(offset, blocksize);
4975         if (start > end)
4976                 return 0;
4977
4978         lock_extent_bits(tree, start, end, &cached_state);
4979         wait_on_page_writeback(page);
4980
4981         /*
4982          * Currently for btree io tree, only EXTENT_LOCKED is utilized,
4983          * so here we only need to unlock the extent range to free any
4984          * existing extent state.
4985          */
4986         unlock_extent_cached(tree, start, end, &cached_state);
4987         return 0;
4988 }
4989
4990 /*
4991  * a helper for releasepage, this tests for areas of the page that
4992  * are locked or under IO and drops the related state bits if it is safe
4993  * to drop the page.
4994  */
4995 static int try_release_extent_state(struct extent_io_tree *tree,
4996                                     struct page *page, gfp_t mask)
4997 {
4998         u64 start = page_offset(page);
4999         u64 end = start + PAGE_SIZE - 1;
5000         int ret = 1;
5001
5002         if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
5003                 ret = 0;
5004         } else {
5005                 /*
5006                  * At this point we can safely clear everything except the
5007                  * locked bit, the nodatasum bit and the delalloc new bit.
5008                  * The delalloc new bit will be cleared by ordered extent
5009                  * completion.
5010                  */
5011                 ret = __clear_extent_bit(tree, start, end,
5012                          ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
5013                          0, 0, NULL, mask, NULL);
5014
5015                 /* if clear_extent_bit failed for enomem reasons,
5016                  * we can't allow the release to continue.
5017                  */
5018                 if (ret < 0)
5019                         ret = 0;
5020                 else
5021                         ret = 1;
5022         }
5023         return ret;
5024 }
5025
5026 /*
5027  * a helper for releasepage.  As long as there are no locked extents
5028  * in the range corresponding to the page, both state records and extent
5029  * map records are removed
5030  */
5031 int try_release_extent_mapping(struct page *page, gfp_t mask)
5032 {
5033         struct extent_map *em;
5034         u64 start = page_offset(page);
5035         u64 end = start + PAGE_SIZE - 1;
5036         struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
5037         struct extent_io_tree *tree = &btrfs_inode->io_tree;
5038         struct extent_map_tree *map = &btrfs_inode->extent_tree;
5039
5040         if (gfpflags_allow_blocking(mask) &&
5041             page->mapping->host->i_size > SZ_16M) {
5042                 u64 len;
5043                 while (start <= end) {
5044                         struct btrfs_fs_info *fs_info;
5045                         u64 cur_gen;
5046
5047                         len = end - start + 1;
5048                         write_lock(&map->lock);
5049                         em = lookup_extent_mapping(map, start, len);
5050                         if (!em) {
5051                                 write_unlock(&map->lock);
5052                                 break;
5053                         }
5054                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5055                             em->start != start) {
5056                                 write_unlock(&map->lock);
5057                                 free_extent_map(em);
5058                                 break;
5059                         }
5060                         if (test_range_bit(tree, em->start,
5061                                            extent_map_end(em) - 1,
5062                                            EXTENT_LOCKED, 0, NULL))
5063                                 goto next;
5064                         /*
5065                          * If it's not in the list of modified extents, used
5066                          * by a fast fsync, we can remove it. If it's being
5067                          * logged we can safely remove it since fsync took an
5068                          * extra reference on the em.
5069                          */
5070                         if (list_empty(&em->list) ||
5071                             test_bit(EXTENT_FLAG_LOGGING, &em->flags))
5072                                 goto remove_em;
5073                         /*
5074                          * If it's in the list of modified extents, remove it
5075                          * only if its generation is older then the current one,
5076                          * in which case we don't need it for a fast fsync.
5077                          * Otherwise don't remove it, we could be racing with an
5078                          * ongoing fast fsync that could miss the new extent.
5079                          */
5080                         fs_info = btrfs_inode->root->fs_info;
5081                         spin_lock(&fs_info->trans_lock);
5082                         cur_gen = fs_info->generation;
5083                         spin_unlock(&fs_info->trans_lock);
5084                         if (em->generation >= cur_gen)
5085                                 goto next;
5086 remove_em:
5087                         /*
5088                          * We only remove extent maps that are not in the list of
5089                          * modified extents or that are in the list but with a
5090                          * generation lower then the current generation, so there
5091                          * is no need to set the full fsync flag on the inode (it
5092                          * hurts the fsync performance for workloads with a data
5093                          * size that exceeds or is close to the system's memory).
5094                          */
5095                         remove_extent_mapping(map, em);
5096                         /* once for the rb tree */
5097                         free_extent_map(em);
5098 next:
5099                         start = extent_map_end(em);
5100                         write_unlock(&map->lock);
5101
5102                         /* once for us */
5103                         free_extent_map(em);
5104
5105                         cond_resched(); /* Allow large-extent preemption. */
5106                 }
5107         }
5108         return try_release_extent_state(tree, page, mask);
5109 }
5110
5111 /*
5112  * helper function for fiemap, which doesn't want to see any holes.
5113  * This maps until we find something past 'last'
5114  */
5115 static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
5116                                                 u64 offset, u64 last)
5117 {
5118         u64 sectorsize = btrfs_inode_sectorsize(inode);
5119         struct extent_map *em;
5120         u64 len;
5121
5122         if (offset >= last)
5123                 return NULL;
5124
5125         while (1) {
5126                 len = last - offset;
5127                 if (len == 0)
5128                         break;
5129                 len = ALIGN(len, sectorsize);
5130                 em = btrfs_get_extent_fiemap(inode, offset, len);
5131                 if (IS_ERR_OR_NULL(em))
5132                         return em;
5133
5134                 /* if this isn't a hole return it */
5135                 if (em->block_start != EXTENT_MAP_HOLE)
5136                         return em;
5137
5138                 /* this is a hole, advance to the next extent */
5139                 offset = extent_map_end(em);
5140                 free_extent_map(em);
5141                 if (offset >= last)
5142                         break;
5143         }
5144         return NULL;
5145 }
5146
5147 /*
5148  * To cache previous fiemap extent
5149  *
5150  * Will be used for merging fiemap extent
5151  */
5152 struct fiemap_cache {
5153         u64 offset;
5154         u64 phys;
5155         u64 len;
5156         u32 flags;
5157         bool cached;
5158 };
5159
5160 /*
5161  * Helper to submit fiemap extent.
5162  *
5163  * Will try to merge current fiemap extent specified by @offset, @phys,
5164  * @len and @flags with cached one.
5165  * And only when we fails to merge, cached one will be submitted as
5166  * fiemap extent.
5167  *
5168  * Return value is the same as fiemap_fill_next_extent().
5169  */
5170 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
5171                                 struct fiemap_cache *cache,
5172                                 u64 offset, u64 phys, u64 len, u32 flags)
5173 {
5174         int ret = 0;
5175
5176         if (!cache->cached)
5177                 goto assign;
5178
5179         /*
5180          * Sanity check, extent_fiemap() should have ensured that new
5181          * fiemap extent won't overlap with cached one.
5182          * Not recoverable.
5183          *
5184          * NOTE: Physical address can overlap, due to compression
5185          */
5186         if (cache->offset + cache->len > offset) {
5187                 WARN_ON(1);
5188                 return -EINVAL;
5189         }
5190
5191         /*
5192          * Only merges fiemap extents if
5193          * 1) Their logical addresses are continuous
5194          *
5195          * 2) Their physical addresses are continuous
5196          *    So truly compressed (physical size smaller than logical size)
5197          *    extents won't get merged with each other
5198          *
5199          * 3) Share same flags except FIEMAP_EXTENT_LAST
5200          *    So regular extent won't get merged with prealloc extent
5201          */
5202         if (cache->offset + cache->len  == offset &&
5203             cache->phys + cache->len == phys  &&
5204             (cache->flags & ~FIEMAP_EXTENT_LAST) ==
5205                         (flags & ~FIEMAP_EXTENT_LAST)) {
5206                 cache->len += len;
5207                 cache->flags |= flags;
5208                 goto try_submit_last;
5209         }
5210
5211         /* Not mergeable, need to submit cached one */
5212         ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5213                                       cache->len, cache->flags);
5214         cache->cached = false;
5215         if (ret)
5216                 return ret;
5217 assign:
5218         cache->cached = true;
5219         cache->offset = offset;
5220         cache->phys = phys;
5221         cache->len = len;
5222         cache->flags = flags;
5223 try_submit_last:
5224         if (cache->flags & FIEMAP_EXTENT_LAST) {
5225                 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
5226                                 cache->phys, cache->len, cache->flags);
5227                 cache->cached = false;
5228         }
5229         return ret;
5230 }
5231
5232 /*
5233  * Emit last fiemap cache
5234  *
5235  * The last fiemap cache may still be cached in the following case:
5236  * 0                  4k                    8k
5237  * |<- Fiemap range ->|
5238  * |<------------  First extent ----------->|
5239  *
5240  * In this case, the first extent range will be cached but not emitted.
5241  * So we must emit it before ending extent_fiemap().
5242  */
5243 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
5244                                   struct fiemap_cache *cache)
5245 {
5246         int ret;
5247
5248         if (!cache->cached)
5249                 return 0;
5250
5251         ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5252                                       cache->len, cache->flags);
5253         cache->cached = false;
5254         if (ret > 0)
5255                 ret = 0;
5256         return ret;
5257 }
5258
5259 int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
5260                   u64 start, u64 len)
5261 {
5262         int ret = 0;
5263         u64 off;
5264         u64 max = start + len;
5265         u32 flags = 0;
5266         u32 found_type;
5267         u64 last;
5268         u64 last_for_get_extent = 0;
5269         u64 disko = 0;
5270         u64 isize = i_size_read(&inode->vfs_inode);
5271         struct btrfs_key found_key;
5272         struct extent_map *em = NULL;
5273         struct extent_state *cached_state = NULL;
5274         struct btrfs_path *path;
5275         struct btrfs_root *root = inode->root;
5276         struct fiemap_cache cache = { 0 };
5277         struct ulist *roots;
5278         struct ulist *tmp_ulist;
5279         int end = 0;
5280         u64 em_start = 0;
5281         u64 em_len = 0;
5282         u64 em_end = 0;
5283
5284         if (len == 0)
5285                 return -EINVAL;
5286
5287         path = btrfs_alloc_path();
5288         if (!path)
5289                 return -ENOMEM;
5290
5291         roots = ulist_alloc(GFP_KERNEL);
5292         tmp_ulist = ulist_alloc(GFP_KERNEL);
5293         if (!roots || !tmp_ulist) {
5294                 ret = -ENOMEM;
5295                 goto out_free_ulist;
5296         }
5297
5298         /*
5299          * We can't initialize that to 'start' as this could miss extents due
5300          * to extent item merging
5301          */
5302         off = 0;
5303         start = round_down(start, btrfs_inode_sectorsize(inode));
5304         len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
5305
5306         /*
5307          * lookup the last file extent.  We're not using i_size here
5308          * because there might be preallocation past i_size
5309          */
5310         ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
5311                                        0);
5312         if (ret < 0) {
5313                 goto out_free_ulist;
5314         } else {
5315                 WARN_ON(!ret);
5316                 if (ret == 1)
5317                         ret = 0;
5318         }
5319
5320         path->slots[0]--;
5321         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5322         found_type = found_key.type;
5323
5324         /* No extents, but there might be delalloc bits */
5325         if (found_key.objectid != btrfs_ino(inode) ||
5326             found_type != BTRFS_EXTENT_DATA_KEY) {
5327                 /* have to trust i_size as the end */
5328                 last = (u64)-1;
5329                 last_for_get_extent = isize;
5330         } else {
5331                 /*
5332                  * remember the start of the last extent.  There are a
5333                  * bunch of different factors that go into the length of the
5334                  * extent, so its much less complex to remember where it started
5335                  */
5336                 last = found_key.offset;
5337                 last_for_get_extent = last + 1;
5338         }
5339         btrfs_release_path(path);
5340
5341         /*
5342          * we might have some extents allocated but more delalloc past those
5343          * extents.  so, we trust isize unless the start of the last extent is
5344          * beyond isize
5345          */
5346         if (last < isize) {
5347                 last = (u64)-1;
5348                 last_for_get_extent = isize;
5349         }
5350
5351         lock_extent_bits(&inode->io_tree, start, start + len - 1,
5352                          &cached_state);
5353
5354         em = get_extent_skip_holes(inode, start, last_for_get_extent);
5355         if (!em)
5356                 goto out;
5357         if (IS_ERR(em)) {
5358                 ret = PTR_ERR(em);
5359                 goto out;
5360         }
5361
5362         while (!end) {
5363                 u64 offset_in_extent = 0;
5364
5365                 /* break if the extent we found is outside the range */
5366                 if (em->start >= max || extent_map_end(em) < off)
5367                         break;
5368
5369                 /*
5370                  * get_extent may return an extent that starts before our
5371                  * requested range.  We have to make sure the ranges
5372                  * we return to fiemap always move forward and don't
5373                  * overlap, so adjust the offsets here
5374                  */
5375                 em_start = max(em->start, off);
5376
5377                 /*
5378                  * record the offset from the start of the extent
5379                  * for adjusting the disk offset below.  Only do this if the
5380                  * extent isn't compressed since our in ram offset may be past
5381                  * what we have actually allocated on disk.
5382                  */
5383                 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5384                         offset_in_extent = em_start - em->start;
5385                 em_end = extent_map_end(em);
5386                 em_len = em_end - em_start;
5387                 flags = 0;
5388                 if (em->block_start < EXTENT_MAP_LAST_BYTE)
5389                         disko = em->block_start + offset_in_extent;
5390                 else
5391                         disko = 0;
5392
5393                 /*
5394                  * bump off for our next call to get_extent
5395                  */
5396                 off = extent_map_end(em);
5397                 if (off >= max)
5398                         end = 1;
5399
5400                 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
5401                         end = 1;
5402                         flags |= FIEMAP_EXTENT_LAST;
5403                 } else if (em->block_start == EXTENT_MAP_INLINE) {
5404                         flags |= (FIEMAP_EXTENT_DATA_INLINE |
5405                                   FIEMAP_EXTENT_NOT_ALIGNED);
5406                 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
5407                         flags |= (FIEMAP_EXTENT_DELALLOC |
5408                                   FIEMAP_EXTENT_UNKNOWN);
5409                 } else if (fieinfo->fi_extents_max) {
5410                         u64 bytenr = em->block_start -
5411                                 (em->start - em->orig_start);
5412
5413                         /*
5414                          * As btrfs supports shared space, this information
5415                          * can be exported to userspace tools via
5416                          * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
5417                          * then we're just getting a count and we can skip the
5418                          * lookup stuff.
5419                          */
5420                         ret = btrfs_check_shared(root, btrfs_ino(inode),
5421                                                  bytenr, roots, tmp_ulist);
5422                         if (ret < 0)
5423                                 goto out_free;
5424                         if (ret)
5425                                 flags |= FIEMAP_EXTENT_SHARED;
5426                         ret = 0;
5427                 }
5428                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5429                         flags |= FIEMAP_EXTENT_ENCODED;
5430                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5431                         flags |= FIEMAP_EXTENT_UNWRITTEN;
5432
5433                 free_extent_map(em);
5434                 em = NULL;
5435                 if ((em_start >= last) || em_len == (u64)-1 ||
5436                    (last == (u64)-1 && isize <= em_end)) {
5437                         flags |= FIEMAP_EXTENT_LAST;
5438                         end = 1;
5439                 }
5440
5441                 /* now scan forward to see if this is really the last extent. */
5442                 em = get_extent_skip_holes(inode, off, last_for_get_extent);
5443                 if (IS_ERR(em)) {
5444                         ret = PTR_ERR(em);
5445                         goto out;
5446                 }
5447                 if (!em) {
5448                         flags |= FIEMAP_EXTENT_LAST;
5449                         end = 1;
5450                 }
5451                 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
5452                                            em_len, flags);
5453                 if (ret) {
5454                         if (ret == 1)
5455                                 ret = 0;
5456                         goto out_free;
5457                 }
5458         }
5459 out_free:
5460         if (!ret)
5461                 ret = emit_last_fiemap_cache(fieinfo, &cache);
5462         free_extent_map(em);
5463 out:
5464         unlock_extent_cached(&inode->io_tree, start, start + len - 1,
5465                              &cached_state);
5466
5467 out_free_ulist:
5468         btrfs_free_path(path);
5469         ulist_free(roots);
5470         ulist_free(tmp_ulist);
5471         return ret;
5472 }
5473
/*
 * Give the extent buffer structure itself back to extent_buffer_cache.
 *
 * Only the struct is freed here: this function does not touch eb->pages[]
 * and does not remove the buffer from the leak-debug list.
 */
static void __free_extent_buffer(struct extent_buffer *eb)
{
        kmem_cache_free(extent_buffer_cache, eb);
}
5478
5479 int extent_buffer_under_io(const struct extent_buffer *eb)
5480 {
5481         return (atomic_read(&eb->io_pages) ||
5482                 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5483                 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5484 }
5485
5486 static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
5487 {
5488         struct btrfs_subpage *subpage;
5489
5490         lockdep_assert_held(&page->mapping->private_lock);
5491
5492         if (PagePrivate(page)) {
5493                 subpage = (struct btrfs_subpage *)page->private;
5494                 if (atomic_read(&subpage->eb_refs))
5495                         return true;
5496         }
5497         return false;
5498 }
5499
/*
 * Undo the page-private association between @page and @eb.
 *
 * For a buffer without EXTENT_BUFFER_UNMAPPED the work is done under
 * page->mapping->private_lock; for an UNMAPPED buffer no lock is taken.
 * Nothing happens if the page has no private data.
 *
 * Only page-private state is touched here; the page reference itself is not
 * dropped by this function.
 */
static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);

        /*
         * For mapped eb, we're going to change the page private, which should
         * be done under the private_lock.
         */
        if (mapped)
                spin_lock(&page->mapping->private_lock);

        /* Nothing attached to this page, just undo the locking and leave */
        if (!PagePrivate(page)) {
                if (mapped)
                        spin_unlock(&page->mapping->private_lock);
                return;
        }

        /* Regular case: page->private points directly at one extent buffer */
        if (fs_info->sectorsize == PAGE_SIZE) {
                /*
                 * We do this since we'll remove the pages after we've
                 * removed the eb from the radix tree, so we could race
                 * and have this page now attached to the new eb.  So
                 * only clear page_private if it's still connected to
                 * this eb.
                 */
                if (PagePrivate(page) &&
                    page->private == (unsigned long)eb) {
                        BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                        BUG_ON(PageDirty(page));
                        BUG_ON(PageWriteback(page));
                        /*
                         * We need to make sure we haven't been attached
                         * to a new eb.
                         */
                        detach_page_private(page);
                }
                if (mapped)
                        spin_unlock(&page->mapping->private_lock);
                return;
        }

        /*
         * For subpage, we can have dummy eb with page private.  In this case,
         * we can directly detach the private as such page is only attached to
         * one dummy eb, no sharing.
         */
        if (!mapped) {
                btrfs_detach_subpage(fs_info, page);
                return;
        }

        /* Mapped subpage case: drop this eb's count on the shared page */
        btrfs_page_dec_eb_refs(fs_info, page);

        /*
         * We can only detach the page private if there are no other ebs in the
         * page range.
         */
        if (!page_range_has_eb(fs_info, page))
                btrfs_detach_subpage(fs_info, page);

        spin_unlock(&page->mapping->private_lock);
}
5563
5564 /* Release all pages attached to the extent buffer */
5565 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5566 {
5567         int i;
5568         int num_pages;
5569
5570         ASSERT(!extent_buffer_under_io(eb));
5571
5572         num_pages = num_extent_pages(eb);
5573         for (i = 0; i < num_pages; i++) {
5574                 struct page *page = eb->pages[i];
5575
5576                 if (!page)
5577                         continue;
5578
5579                 detach_extent_buffer_page(eb, page);
5580
5581                 /* One for when we allocated the page */
5582                 put_page(page);
5583         }
5584 }
5585
/*
 * Helper for releasing the extent buffer.
 *
 * Tears the buffer down unconditionally, without looking at eb->refs:
 * releases its pages, removes it from the leak-debug list and frees the
 * structure.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
        btrfs_release_extent_buffer_pages(eb);
        btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
        __free_extent_buffer(eb);
}
5595
5596 static struct extent_buffer *
5597 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
5598                       unsigned long len)
5599 {
5600         struct extent_buffer *eb = NULL;
5601
5602         eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
5603         eb->start = start;
5604         eb->len = len;
5605         eb->fs_info = fs_info;
5606         eb->bflags = 0;
5607         init_rwsem(&eb->lock);
5608
5609         btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5610                              &fs_info->allocated_ebs);
5611         INIT_LIST_HEAD(&eb->release_list);
5612
5613         spin_lock_init(&eb->refs_lock);
5614         atomic_set(&eb->refs, 1);
5615         atomic_set(&eb->io_pages, 0);
5616
5617         ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
5618
5619         return eb;
5620 }
5621
5622 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
5623 {
5624         int i;
5625         struct page *p;
5626         struct extent_buffer *new;
5627         int num_pages = num_extent_pages(src);
5628
5629         new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
5630         if (new == NULL)
5631                 return NULL;
5632
5633         /*
5634          * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5635          * btrfs_release_extent_buffer() have different behavior for
5636          * UNMAPPED subpage extent buffer.
5637          */
5638         set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5639
5640         for (i = 0; i < num_pages; i++) {
5641                 int ret;
5642
5643                 p = alloc_page(GFP_NOFS);
5644                 if (!p) {
5645                         btrfs_release_extent_buffer(new);
5646                         return NULL;
5647                 }
5648                 ret = attach_extent_buffer_page(new, p, NULL);
5649                 if (ret < 0) {
5650                         put_page(p);
5651                         btrfs_release_extent_buffer(new);
5652                         return NULL;
5653                 }
5654                 WARN_ON(PageDirty(p));
5655                 new->pages[i] = p;
5656                 copy_page(page_address(p), page_address(src->pages[i]));
5657         }
5658         set_extent_buffer_uptodate(new);
5659
5660         return new;
5661 }
5662
5663 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5664                                                   u64 start, unsigned long len)
5665 {
5666         struct extent_buffer *eb;
5667         int num_pages;
5668         int i;
5669
5670         eb = __alloc_extent_buffer(fs_info, start, len);
5671         if (!eb)
5672                 return NULL;
5673
5674         num_pages = num_extent_pages(eb);
5675         for (i = 0; i < num_pages; i++) {
5676                 int ret;
5677
5678                 eb->pages[i] = alloc_page(GFP_NOFS);
5679                 if (!eb->pages[i])
5680                         goto err;
5681                 ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
5682                 if (ret < 0)
5683                         goto err;
5684         }
5685         set_extent_buffer_uptodate(eb);
5686         btrfs_set_header_nritems(eb, 0);
5687         set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5688
5689         return eb;
5690 err:
5691         for (; i > 0; i--) {
5692                 detach_extent_buffer_page(eb, eb->pages[i - 1]);
5693                 __free_page(eb->pages[i - 1]);
5694         }
5695         __free_extent_buffer(eb);
5696         return NULL;
5697 }
5698
/*
 * Allocate a dummy extent buffer at @start whose length is the filesystem's
 * nodesize.  Returns whatever __alloc_dummy_extent_buffer() returns, i.e.
 * the new buffer or NULL on failure.
 */
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
                                                u64 start)
{
        return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
}
5704
/*
 * Make sure @eb carries the EXTENT_BUFFER_TREE_REF flag together with the
 * reference that goes with it.  If the flag was not set, it is set and
 * eb->refs is incremented, both under eb->refs_lock.
 */
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
        int refs;
        /*
         * The TREE_REF bit is first set when the extent_buffer is added
         * to the radix tree. It is also reset, if unset, when a new reference
         * is created by find_extent_buffer.
         *
         * It is only cleared in two cases: freeing the last non-tree
         * reference to the extent_buffer when its STALE bit is set or
         * calling releasepage when the tree reference is the only reference.
         *
         * In both cases, care is taken to ensure that the extent_buffer's
         * pages are not under io. However, releasepage can be concurrently
         * called with creating new references, which is prone to race
         * conditions between the calls to check_buffer_tree_ref in those
         * codepaths and clearing TREE_REF in try_release_extent_buffer.
         *
         * The actual lifetime of the extent_buffer in the radix tree is
         * adequately protected by the refcount, but the TREE_REF bit and
         * its corresponding reference are not. To protect against this
         * class of races, we call check_buffer_tree_ref from the codepaths
         * which trigger io after they set eb->io_pages. Note that once io is
         * initiated, TREE_REF can no longer be cleared, so that is the
         * moment at which any such race is best fixed.
         */
        refs = atomic_read(&eb->refs);
        /* Lockless fast path: flag already set and at least two references */
        if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                return;

        /* Slow path: set the flag and take its reference atomically */
        spin_lock(&eb->refs_lock);
        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                atomic_inc(&eb->refs);
        spin_unlock(&eb->refs_lock);
}
5740
5741 static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5742                 struct page *accessed)
5743 {
5744         int num_pages, i;
5745
5746         check_buffer_tree_ref(eb);
5747
5748         num_pages = num_extent_pages(eb);
5749         for (i = 0; i < num_pages; i++) {
5750                 struct page *p = eb->pages[i];
5751
5752                 if (p != accessed)
5753                         mark_page_accessed(p);
5754         }
5755 }
5756
/*
 * Look up the extent buffer at logical address @start.
 *
 * Returns NULL if find_extent_buffer_nolock() finds nothing.  Otherwise the
 * buffer is marked as accessed and returned.
 * NOTE(review): the returned buffer presumably carries a reference taken by
 * find_extent_buffer_nolock() that the caller must drop - confirm against
 * that helper.
 */
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
                                         u64 start)
{
        struct extent_buffer *eb;

        eb = find_extent_buffer_nolock(fs_info, start);
        if (!eb)
                return NULL;
        /*
         * Lock our eb's refs_lock to avoid races with free_extent_buffer().
         * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
         * another task running free_extent_buffer() might have seen that flag
         * set, eb->refs == 2, that the buffer isn't under IO (dirty and
         * writeback flags not set) and it's still in the tree (flag
         * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
         * decrementing the extent buffer's reference count twice.  So here we
         * could race and increment the eb's reference count, clear its stale
         * flag, mark it as dirty and drop our reference before the other task
         * finishes executing free_extent_buffer, which would later result in
         * an attempt to free an extent buffer that is dirty.
         */
        if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
                /*
                 * Empty critical section on purpose: taking and dropping the
                 * lock waits for any current holder of refs_lock to finish.
                 */
                spin_lock(&eb->refs_lock);
                spin_unlock(&eb->refs_lock);
        }
        mark_extent_buffer_accessed(eb, NULL);
        return eb;
}
5785
5786 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5787 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
5788                                         u64 start)
5789 {
5790         struct extent_buffer *eb, *exists = NULL;
5791         int ret;
5792
5793         eb = find_extent_buffer(fs_info, start);
5794         if (eb)
5795                 return eb;
5796         eb = alloc_dummy_extent_buffer(fs_info, start);
5797         if (!eb)
5798                 return ERR_PTR(-ENOMEM);
5799         eb->fs_info = fs_info;
5800 again:
5801         ret = radix_tree_preload(GFP_NOFS);
5802         if (ret) {
5803                 exists = ERR_PTR(ret);
5804                 goto free_eb;
5805         }
5806         spin_lock(&fs_info->buffer_lock);
5807         ret = radix_tree_insert(&fs_info->buffer_radix,
5808                                 start >> fs_info->sectorsize_bits, eb);
5809         spin_unlock(&fs_info->buffer_lock);
5810         radix_tree_preload_end();
5811         if (ret == -EEXIST) {
5812                 exists = find_extent_buffer(fs_info, start);
5813                 if (exists)
5814                         goto free_eb;
5815                 else
5816                         goto again;
5817         }
5818         check_buffer_tree_ref(eb);
5819         set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5820
5821         return eb;
5822 free_eb:
5823         btrfs_release_extent_buffer(eb);
5824         return exists;
5825 }
5826 #endif
5827
5828 static struct extent_buffer *grab_extent_buffer(
5829                 struct btrfs_fs_info *fs_info, struct page *page)
5830 {
5831         struct extent_buffer *exists;
5832
5833         /*
5834          * For subpage case, we completely rely on radix tree to ensure we
5835          * don't try to insert two ebs for the same bytenr.  So here we always
5836          * return NULL and just continue.
5837          */
5838         if (fs_info->sectorsize < PAGE_SIZE)
5839                 return NULL;
5840
5841         /* Page not yet attached to an extent buffer */
5842         if (!PagePrivate(page))
5843                 return NULL;
5844
5845         /*
5846          * We could have already allocated an eb for this page and attached one
5847          * so lets see if we can get a ref on the existing eb, and if we can we
5848          * know it's good and we can just return that one, else we know we can
5849          * just overwrite page->private.
5850          */
5851         exists = (struct extent_buffer *)page->private;
5852         if (atomic_inc_not_zero(&exists->refs))
5853                 return exists;
5854
5855         WARN_ON(PageDirty(page));
5856         detach_page_private(page);
5857         return NULL;
5858 }
5859
/*
 * Find or create the extent buffer for the tree block at @start.
 *
 * The buffer is nodesize bytes long and backed by pages of the btree inode's
 * mapping.  @owner_root and @level are only passed on to
 * btrfs_set_buffer_lockdep_class().
 *
 * Returns the extent buffer (an already existing one if there is one, found
 * either through the lookup or through a page's private pointer), or an
 * ERR_PTR():
 *   -EINVAL     @start is not sectorsize aligned, or (subpage) the block
 *               would cross a page boundary
 *   -EOVERFLOW  32-bit only: @start is at or beyond MAX_LFS_FILESIZE
 *   -ENOMEM     the buffer or a page could not be allocated
 *   other       error returned by btrfs_alloc_subpage() or
 *               radix_tree_preload()
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
                                          u64 start, u64 owner_root, int level)
{
        unsigned long len = fs_info->nodesize;
        int num_pages;
        int i;
        unsigned long index = start >> PAGE_SHIFT;
        struct extent_buffer *eb;
        struct extent_buffer *exists = NULL;
        struct page *p;
        struct address_space *mapping = fs_info->btree_inode->i_mapping;
        int uptodate = 1;
        int ret;

        if (!IS_ALIGNED(start, fs_info->sectorsize)) {
                btrfs_err(fs_info, "bad tree block start %llu", start);
                return ERR_PTR(-EINVAL);
        }

#if BITS_PER_LONG == 32
        if (start >= MAX_LFS_FILESIZE) {
                btrfs_err_rl(fs_info,
                "extent buffer %llu is beyond 32bit page cache limit", start);
                btrfs_err_32bit_limit(fs_info);
                return ERR_PTR(-EOVERFLOW);
        }
        if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
                btrfs_warn_32bit_limit(fs_info);
#endif

        if (fs_info->sectorsize < PAGE_SIZE &&
            offset_in_page(start) + len > PAGE_SIZE) {
                btrfs_err(fs_info,
                "tree block crosses page boundary, start %llu nodesize %lu",
                          start, len);
                return ERR_PTR(-EINVAL);
        }

        /* Fast path: the buffer already exists */
        eb = find_extent_buffer(fs_info, start);
        if (eb)
                return eb;

        eb = __alloc_extent_buffer(fs_info, start, len);
        if (!eb)
                return ERR_PTR(-ENOMEM);
        btrfs_set_buffer_lockdep_class(owner_root, eb, level);

        num_pages = num_extent_pages(eb);
        for (i = 0; i < num_pages; i++, index++) {
                struct btrfs_subpage *prealloc = NULL;

                p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
                if (!p) {
                        exists = ERR_PTR(-ENOMEM);
                        goto free_eb;
                }

                /*
                 * Preallocate page->private for subpage case, so that we won't
                 * allocate memory with private_lock hold.  The memory will be
                 * freed by attach_extent_buffer_page() or freed manually if
                 * we exit earlier.
                 *
                 * Although we have ensured one subpage eb can only have one
                 * page, but it may change in the future for 16K page size
                 * support, so we still preallocate the memory in the loop.
                 */
                ret = btrfs_alloc_subpage(fs_info, &prealloc,
                                          BTRFS_SUBPAGE_METADATA);
                if (ret < 0) {
                        /* Page is not in eb->pages[] yet, release it here */
                        unlock_page(p);
                        put_page(p);
                        exists = ERR_PTR(ret);
                        goto free_eb;
                }

                spin_lock(&mapping->private_lock);
                exists = grab_extent_buffer(fs_info, p);
                if (exists) {
                        /*
                         * The page already belongs to a live eb: hand that
                         * one back and throw away our half-built buffer.
                         */
                        spin_unlock(&mapping->private_lock);
                        unlock_page(p);
                        put_page(p);
                        mark_extent_buffer_accessed(exists, p);
                        btrfs_free_subpage(prealloc);
                        goto free_eb;
                }
                /* Should not fail, as we have preallocated the memory */
                ret = attach_extent_buffer_page(eb, p, prealloc);
                ASSERT(!ret);
                /*
                 * To inform we have extra eb under allocation, so that
                 * detach_extent_buffer_page() won't release the page private
                 * when the eb hasn't yet been inserted into radix tree.
                 *
                 * The ref will be decreased when the eb released the page, in
                 * detach_extent_buffer_page().
                 * Thus needs no special handling in error path.
                 */
                btrfs_page_inc_eb_refs(fs_info, p);
                spin_unlock(&mapping->private_lock);

                WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
                eb->pages[i] = p;
                if (!PageUptodate(p))
                        uptodate = 0;

                /*
                 * We can't unlock the pages just yet since the extent buffer
                 * hasn't been properly inserted in the radix tree, this
                 * opens a race with btree_releasepage which can free a page
                 * while we are still filling in all pages for the buffer and
                 * we could crash.
                 */
        }
        /* The buffer is uptodate only if every one of its pages is */
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
        ret = radix_tree_preload(GFP_NOFS);
        if (ret) {
                exists = ERR_PTR(ret);
                goto free_eb;
        }

        spin_lock(&fs_info->buffer_lock);
        ret = radix_tree_insert(&fs_info->buffer_radix,
                                start >> fs_info->sectorsize_bits, eb);
        spin_unlock(&fs_info->buffer_lock);
        radix_tree_preload_end();
        if (ret == -EEXIST) {
                /*
                 * The slot is occupied.  Return the buffer that is there if
                 * it can still be found, otherwise retry the insertion.
                 */
                exists = find_extent_buffer(fs_info, start);
                if (exists)
                        goto free_eb;
                else
                        goto again;
        }
        /* add one reference for the tree */
        check_buffer_tree_ref(eb);
        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

        /*
         * Now it's safe to unlock the pages because any calls to
         * btree_releasepage will correctly detect that a page belongs to a
         * live buffer and won't free them prematurely.
         */
        for (i = 0; i < num_pages; i++)
                unlock_page(eb->pages[i]);
        return eb;

free_eb:
        /* We must be holding the only reference to the unpublished buffer */
        WARN_ON(!atomic_dec_and_test(&eb->refs));
        /* Only the pages attached so far are recorded (and locked) */
        for (i = 0; i < num_pages; i++) {
                if (eb->pages[i])
                        unlock_page(eb->pages[i]);
        }

        btrfs_release_extent_buffer(eb);
        return exists;
}
6018
/*
 * RCU callback (queued with call_rcu() on eb->rcu_head) that frees the
 * extent buffer structure embedding @head.
 */
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
        struct extent_buffer *eb =
                        container_of(head, struct extent_buffer, rcu_head);

        __free_extent_buffer(eb);
}
6026
/*
 * Drop one reference on @eb.
 *
 * Must be called with eb->refs_lock held; the lock is released on every path
 * before returning.
 *
 * If this was the last reference, the buffer is removed from
 * fs_info->buffer_radix (when it was flagged IN_TREE), taken off the
 * leak-debug list, its pages are released and the structure is freed.
 *
 * Returns 1 if the buffer was torn down, 0 if other references remain.
 */
static int release_extent_buffer(struct extent_buffer *eb)
        __releases(&eb->refs_lock)
{
        lockdep_assert_held(&eb->refs_lock);

        WARN_ON(atomic_read(&eb->refs) == 0);
        if (atomic_dec_and_test(&eb->refs)) {
                if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
                        struct btrfs_fs_info *fs_info = eb->fs_info;

                        /* refs_lock is dropped before buffer_lock is taken */
                        spin_unlock(&eb->refs_lock);

                        spin_lock(&fs_info->buffer_lock);
                        radix_tree_delete(&fs_info->buffer_radix,
                                          eb->start >> fs_info->sectorsize_bits);
                        spin_unlock(&fs_info->buffer_lock);
                } else {
                        spin_unlock(&eb->refs_lock);
                }

                btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
                /* Should be safe to release our pages at this point */
                btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
                /* UNMAPPED buffers are freed right away, without RCU */
                if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
                        __free_extent_buffer(eb);
                        return 1;
                }
#endif
                /* Defer freeing the structure until after an RCU grace period */
                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
                return 1;
        }
        spin_unlock(&eb->refs_lock);

        return 0;
}
6063
/*
 * Drop a reference on @eb.  A NULL @eb is accepted and ignored.
 *
 * While the reference count is above a threshold the reference is dropped
 * locklessly with a compare-and-exchange.  Otherwise the drop goes through
 * release_extent_buffer() under eb->refs_lock, which may free the buffer.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
        int refs;
        int old;
        if (!eb)
                return;

        /*
         * Lockless fast path.  Leave the loop for the locked slow path when
         * the count is 3 or less for a buffer without UNMAPPED, or exactly 1
         * for an UNMAPPED buffer.  Otherwise try to decrement the count with
         * cmpxchg, retrying if it changed under us.
         */
        while (1) {
                refs = atomic_read(&eb->refs);
                if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
                    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
                        refs == 1))
                        break;
                old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
                if (old == refs)
                        return;
        }

        spin_lock(&eb->refs_lock);
        /*
         * A STALE buffer that is not under IO and is down to two references
         * also gives up the reference that belongs to the TREE_REF flag.
         */
        if (atomic_read(&eb->refs) == 2 &&
            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
            !extent_buffer_under_io(eb) &&
            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                atomic_dec(&eb->refs);

        /*
         * I know this is terrible, but it's temporary until we stop tracking
         * the uptodate bits and such for the extent buffers.
         */
        /* Drops our reference and releases refs_lock */
        release_extent_buffer(eb);
}
6095
6096 void free_extent_buffer_stale(struct extent_buffer *eb)
6097 {
6098         if (!eb)
6099                 return;
6100
6101         spin_lock(&eb->refs_lock);
6102         set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
6103
6104         if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
6105             test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6106                 atomic_dec(&eb->refs);
6107         release_extent_buffer(eb);
6108 }
6109
6110 static void btree_clear_page_dirty(struct page *page)
6111 {
6112         ASSERT(PageDirty(page));
6113         ASSERT(PageLocked(page));
6114         clear_page_dirty_for_io(page);
6115         xa_lock_irq(&page->mapping->i_pages);
6116         if (!PageDirty(page))
6117                 __xa_clear_mark(&page->mapping->i_pages,
6118                                 page_index(page), PAGECACHE_TAG_DIRTY);
6119         xa_unlock_irq(&page->mapping->i_pages);
6120 }
6121
6122 static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
6123 {
6124         struct btrfs_fs_info *fs_info = eb->fs_info;
6125         struct page *page = eb->pages[0];
6126         bool last;
6127
6128         /* btree_clear_page_dirty() needs page locked */
6129         lock_page(page);
6130         last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
6131                                                   eb->len);
6132         if (last)
6133                 btree_clear_page_dirty(page);
6134         unlock_page(page);
6135         WARN_ON(atomic_read(&eb->refs) == 0);
6136 }
6137
6138 void clear_extent_buffer_dirty(const struct extent_buffer *eb)
6139 {
6140         int i;
6141         int num_pages;
6142         struct page *page;
6143
6144         if (eb->fs_info->sectorsize < PAGE_SIZE)
6145                 return clear_subpage_extent_buffer_dirty(eb);
6146
6147         num_pages = num_extent_pages(eb);
6148
6149         for (i = 0; i < num_pages; i++) {
6150                 page = eb->pages[i];
6151                 if (!PageDirty(page))
6152                         continue;
6153                 lock_page(page);
6154                 btree_clear_page_dirty(page);
6155                 ClearPageError(page);
6156                 unlock_page(page);
6157         }
6158         WARN_ON(atomic_read(&eb->refs) == 0);
6159 }
6160
6161 bool set_extent_buffer_dirty(struct extent_buffer *eb)
6162 {
6163         int i;
6164         int num_pages;
6165         bool was_dirty;
6166
6167         check_buffer_tree_ref(eb);
6168
6169         was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
6170
6171         num_pages = num_extent_pages(eb);
6172         WARN_ON(atomic_read(&eb->refs) == 0);
6173         WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
6174
6175         if (!was_dirty) {
6176                 bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
6177
6178                 /*
6179                  * For subpage case, we can have other extent buffers in the
6180                  * same page, and in clear_subpage_extent_buffer_dirty() we
6181                  * have to clear page dirty without subpage lock held.
6182                  * This can cause race where our page gets dirty cleared after
6183                  * we just set it.
6184                  *
6185                  * Thankfully, clear_subpage_extent_buffer_dirty() has locked
6186                  * its page for other reasons, we can use page lock to prevent
6187                  * the above race.
6188                  */
6189                 if (subpage)
6190                         lock_page(eb->pages[0]);
6191                 for (i = 0; i < num_pages; i++)
6192                         btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
6193                                              eb->start, eb->len);
6194                 if (subpage)
6195                         unlock_page(eb->pages[0]);
6196         }
6197 #ifdef CONFIG_BTRFS_DEBUG
6198         for (i = 0; i < num_pages; i++)
6199                 ASSERT(PageDirty(eb->pages[i]));
6200 #endif
6201
6202         return was_dirty;
6203 }
6204
6205 void clear_extent_buffer_uptodate(struct extent_buffer *eb)
6206 {
6207         struct btrfs_fs_info *fs_info = eb->fs_info;
6208         struct page *page;
6209         int num_pages;
6210         int i;
6211
6212         clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6213         num_pages = num_extent_pages(eb);
6214         for (i = 0; i < num_pages; i++) {
6215                 page = eb->pages[i];
6216                 if (page)
6217                         btrfs_page_clear_uptodate(fs_info, page,
6218                                                   eb->start, eb->len);
6219         }
6220 }
6221
6222 void set_extent_buffer_uptodate(struct extent_buffer *eb)
6223 {
6224         struct btrfs_fs_info *fs_info = eb->fs_info;
6225         struct page *page;
6226         int num_pages;
6227         int i;
6228
6229         set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6230         num_pages = num_extent_pages(eb);
6231         for (i = 0; i < num_pages; i++) {
6232                 page = eb->pages[i];
6233                 btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
6234         }
6235 }
6236
6237 static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
6238                                       int mirror_num)
6239 {
6240         struct btrfs_fs_info *fs_info = eb->fs_info;
6241         struct extent_io_tree *io_tree;
6242         struct page *page = eb->pages[0];
6243         struct btrfs_bio_ctrl bio_ctrl = { 0 };
6244         int ret = 0;
6245
6246         ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
6247         ASSERT(PagePrivate(page));
6248         io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
6249
6250         if (wait == WAIT_NONE) {
6251                 if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
6252                         return -EAGAIN;
6253         } else {
6254                 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6255                 if (ret < 0)
6256                         return ret;
6257         }
6258
6259         ret = 0;
6260         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
6261             PageUptodate(page) ||
6262             btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
6263                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6264                 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6265                 return ret;
6266         }
6267
6268         clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
6269         eb->read_mirror = 0;
6270         atomic_set(&eb->io_pages, 1);
6271         check_buffer_tree_ref(eb);
6272         btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
6273
6274         ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
6275                                  page, eb->start, eb->len,
6276                                  eb->start - page_offset(page),
6277                                  end_bio_extent_readpage, mirror_num, 0,
6278                                  true);
6279         if (ret) {
6280                 /*
6281                  * In the endio function, if we hit something wrong we will
6282                  * increase the io_pages, so here we need to decrease it for
6283                  * error path.
6284                  */
6285                 atomic_dec(&eb->io_pages);
6286         }
6287         if (bio_ctrl.bio) {
6288                 int tmp;
6289
6290                 tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
6291                 bio_ctrl.bio = NULL;
6292                 if (tmp < 0)
6293                         return tmp;
6294         }
6295         if (ret || wait != WAIT_COMPLETE)
6296                 return ret;
6297
6298         wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
6299         if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
6300                 ret = -EIO;
6301         return ret;
6302 }
6303
6304 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
6305 {
6306         int i;
6307         struct page *page;
6308         int err;
6309         int ret = 0;
6310         int locked_pages = 0;
6311         int all_uptodate = 1;
6312         int num_pages;
6313         unsigned long num_reads = 0;
6314         struct btrfs_bio_ctrl bio_ctrl = { 0 };
6315
6316         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
6317                 return 0;
6318
6319         if (eb->fs_info->sectorsize < PAGE_SIZE)
6320                 return read_extent_buffer_subpage(eb, wait, mirror_num);
6321
6322         num_pages = num_extent_pages(eb);
6323         for (i = 0; i < num_pages; i++) {
6324                 page = eb->pages[i];
6325                 if (wait == WAIT_NONE) {
6326                         /*
6327                          * WAIT_NONE is only utilized by readahead. If we can't
6328                          * acquire the lock atomically it means either the eb
6329                          * is being read out or under modification.
6330                          * Either way the eb will be or has been cached,
6331                          * readahead can exit safely.
6332                          */
6333                         if (!trylock_page(page))
6334                                 goto unlock_exit;
6335                 } else {
6336                         lock_page(page);
6337                 }
6338                 locked_pages++;
6339         }
6340         /*
6341          * We need to firstly lock all pages to make sure that
6342          * the uptodate bit of our pages won't be affected by
6343          * clear_extent_buffer_uptodate().
6344          */
6345         for (i = 0; i < num_pages; i++) {
6346                 page = eb->pages[i];
6347                 if (!PageUptodate(page)) {
6348                         num_reads++;
6349                         all_uptodate = 0;
6350                 }
6351         }
6352
6353         if (all_uptodate) {
6354                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6355                 goto unlock_exit;
6356         }
6357
6358         clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
6359         eb->read_mirror = 0;
6360         atomic_set(&eb->io_pages, num_reads);
6361         /*
6362          * It is possible for releasepage to clear the TREE_REF bit before we
6363          * set io_pages. See check_buffer_tree_ref for a more detailed comment.
6364          */
6365         check_buffer_tree_ref(eb);
6366         for (i = 0; i < num_pages; i++) {
6367                 page = eb->pages[i];
6368
6369                 if (!PageUptodate(page)) {
6370                         if (ret) {
6371                                 atomic_dec(&eb->io_pages);
6372                                 unlock_page(page);
6373                                 continue;
6374                         }
6375
6376                         ClearPageError(page);
6377                         err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
6378                                          &bio_ctrl, page, page_offset(page),
6379                                          PAGE_SIZE, 0, end_bio_extent_readpage,
6380                                          mirror_num, 0, false);
6381                         if (err) {
6382                                 /*
6383                                  * We failed to submit the bio so it's the
6384                                  * caller's responsibility to perform cleanup
6385                                  * i.e unlock page/set error bit.
6386                                  */
6387                                 ret = err;
6388                                 SetPageError(page);
6389                                 unlock_page(page);
6390                                 atomic_dec(&eb->io_pages);
6391                         }
6392                 } else {
6393                         unlock_page(page);
6394                 }
6395         }
6396
6397         if (bio_ctrl.bio) {
6398                 err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
6399                 bio_ctrl.bio = NULL;
6400                 if (err)
6401                         return err;
6402         }
6403
6404         if (ret || wait != WAIT_COMPLETE)
6405                 return ret;
6406
6407         for (i = 0; i < num_pages; i++) {
6408                 page = eb->pages[i];
6409                 wait_on_page_locked(page);
6410                 if (!PageUptodate(page))
6411                         ret = -EIO;
6412         }
6413
6414         return ret;
6415
6416 unlock_exit:
6417         while (locked_pages > 0) {
6418                 locked_pages--;
6419                 page = eb->pages[locked_pages];
6420                 unlock_page(page);
6421         }
6422         return ret;
6423 }
6424
6425 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
6426                             unsigned long len)
6427 {
6428         btrfs_warn(eb->fs_info,
6429                 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
6430                 eb->start, eb->len, start, len);
6431         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6432
6433         return true;
6434 }
6435
6436 /*
6437  * Check if the [start, start + len) range is valid before reading/writing
6438  * the eb.
6439  * NOTE: @start and @len are offset inside the eb, not logical address.
6440  *
6441  * Caller should not touch the dst/src memory if this function returns error.
6442  */
6443 static inline int check_eb_range(const struct extent_buffer *eb,
6444                                  unsigned long start, unsigned long len)
6445 {
6446         unsigned long offset;
6447
6448         /* start, start + len should not go beyond eb->len nor overflow */
6449         if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6450                 return report_eb_range(eb, start, len);
6451
6452         return false;
6453 }
6454
6455 void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6456                         unsigned long start, unsigned long len)
6457 {
6458         size_t cur;
6459         size_t offset;
6460         struct page *page;
6461         char *kaddr;
6462         char *dst = (char *)dstv;
6463         unsigned long i = get_eb_page_index(start);
6464
6465         if (check_eb_range(eb, start, len))
6466                 return;
6467
6468         offset = get_eb_offset_in_page(eb, start);
6469
6470         while (len > 0) {
6471                 page = eb->pages[i];
6472
6473                 cur = min(len, (PAGE_SIZE - offset));
6474                 kaddr = page_address(page);
6475                 memcpy(dst, kaddr + offset, cur);
6476
6477                 dst += cur;
6478                 len -= cur;
6479                 offset = 0;
6480                 i++;
6481         }
6482 }
6483
6484 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6485                                        void __user *dstv,
6486                                        unsigned long start, unsigned long len)
6487 {
6488         size_t cur;
6489         size_t offset;
6490         struct page *page;
6491         char *kaddr;
6492         char __user *dst = (char __user *)dstv;
6493         unsigned long i = get_eb_page_index(start);
6494         int ret = 0;
6495
6496         WARN_ON(start > eb->len);
6497         WARN_ON(start + len > eb->start + eb->len);
6498
6499         offset = get_eb_offset_in_page(eb, start);
6500
6501         while (len > 0) {
6502                 page = eb->pages[i];
6503
6504                 cur = min(len, (PAGE_SIZE - offset));
6505                 kaddr = page_address(page);
6506                 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
6507                         ret = -EFAULT;
6508                         break;
6509                 }
6510
6511                 dst += cur;
6512                 len -= cur;
6513                 offset = 0;
6514                 i++;
6515         }
6516
6517         return ret;
6518 }
6519
6520 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6521                          unsigned long start, unsigned long len)
6522 {
6523         size_t cur;
6524         size_t offset;
6525         struct page *page;
6526         char *kaddr;
6527         char *ptr = (char *)ptrv;
6528         unsigned long i = get_eb_page_index(start);
6529         int ret = 0;
6530
6531         if (check_eb_range(eb, start, len))
6532                 return -EINVAL;
6533
6534         offset = get_eb_offset_in_page(eb, start);
6535
6536         while (len > 0) {
6537                 page = eb->pages[i];
6538
6539                 cur = min(len, (PAGE_SIZE - offset));
6540
6541                 kaddr = page_address(page);
6542                 ret = memcmp(ptr, kaddr + offset, cur);
6543                 if (ret)
6544                         break;
6545
6546                 ptr += cur;
6547                 len -= cur;
6548                 offset = 0;
6549                 i++;
6550         }
6551         return ret;
6552 }
6553
6554 /*
6555  * Check that the extent buffer is uptodate.
6556  *
6557  * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
6558  * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
6559  */
6560 static void assert_eb_page_uptodate(const struct extent_buffer *eb,
6561                                     struct page *page)
6562 {
6563         struct btrfs_fs_info *fs_info = eb->fs_info;
6564
6565         if (fs_info->sectorsize < PAGE_SIZE) {
6566                 bool uptodate;
6567
6568                 uptodate = btrfs_subpage_test_uptodate(fs_info, page,
6569                                                        eb->start, eb->len);
6570                 WARN_ON(!uptodate);
6571         } else {
6572                 WARN_ON(!PageUptodate(page));
6573         }
6574 }
6575
6576 void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
6577                 const void *srcv)
6578 {
6579         char *kaddr;
6580
6581         assert_eb_page_uptodate(eb, eb->pages[0]);
6582         kaddr = page_address(eb->pages[0]) +
6583                 get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
6584                                                    chunk_tree_uuid));
6585         memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6586 }
6587
6588 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
6589 {
6590         char *kaddr;
6591
6592         assert_eb_page_uptodate(eb, eb->pages[0]);
6593         kaddr = page_address(eb->pages[0]) +
6594                 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
6595         memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6596 }
6597
6598 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
6599                          unsigned long start, unsigned long len)
6600 {
6601         size_t cur;
6602         size_t offset;
6603         struct page *page;
6604         char *kaddr;
6605         char *src = (char *)srcv;
6606         unsigned long i = get_eb_page_index(start);
6607
6608         WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6609
6610         if (check_eb_range(eb, start, len))
6611                 return;
6612
6613         offset = get_eb_offset_in_page(eb, start);
6614
6615         while (len > 0) {
6616                 page = eb->pages[i];
6617                 assert_eb_page_uptodate(eb, page);
6618
6619                 cur = min(len, PAGE_SIZE - offset);
6620                 kaddr = page_address(page);
6621                 memcpy(kaddr + offset, src, cur);
6622
6623                 src += cur;
6624                 len -= cur;
6625                 offset = 0;
6626                 i++;
6627         }
6628 }
6629
6630 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
6631                 unsigned long len)
6632 {
6633         size_t cur;
6634         size_t offset;
6635         struct page *page;
6636         char *kaddr;
6637         unsigned long i = get_eb_page_index(start);
6638
6639         if (check_eb_range(eb, start, len))
6640                 return;
6641
6642         offset = get_eb_offset_in_page(eb, start);
6643
6644         while (len > 0) {
6645                 page = eb->pages[i];
6646                 assert_eb_page_uptodate(eb, page);
6647
6648                 cur = min(len, PAGE_SIZE - offset);
6649                 kaddr = page_address(page);
6650                 memset(kaddr + offset, 0, cur);
6651
6652                 len -= cur;
6653                 offset = 0;
6654                 i++;
6655         }
6656 }
6657
6658 void copy_extent_buffer_full(const struct extent_buffer *dst,
6659                              const struct extent_buffer *src)
6660 {
6661         int i;
6662         int num_pages;
6663
6664         ASSERT(dst->len == src->len);
6665
6666         if (dst->fs_info->sectorsize == PAGE_SIZE) {
6667                 num_pages = num_extent_pages(dst);
6668                 for (i = 0; i < num_pages; i++)
6669                         copy_page(page_address(dst->pages[i]),
6670                                   page_address(src->pages[i]));
6671         } else {
6672                 size_t src_offset = get_eb_offset_in_page(src, 0);
6673                 size_t dst_offset = get_eb_offset_in_page(dst, 0);
6674
6675                 ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
6676                 memcpy(page_address(dst->pages[0]) + dst_offset,
6677                        page_address(src->pages[0]) + src_offset,
6678                        src->len);
6679         }
6680 }
6681
6682 void copy_extent_buffer(const struct extent_buffer *dst,
6683                         const struct extent_buffer *src,
6684                         unsigned long dst_offset, unsigned long src_offset,
6685                         unsigned long len)
6686 {
6687         u64 dst_len = dst->len;
6688         size_t cur;
6689         size_t offset;
6690         struct page *page;
6691         char *kaddr;
6692         unsigned long i = get_eb_page_index(dst_offset);
6693
6694         if (check_eb_range(dst, dst_offset, len) ||
6695             check_eb_range(src, src_offset, len))
6696                 return;
6697
6698         WARN_ON(src->len != dst_len);
6699
6700         offset = get_eb_offset_in_page(dst, dst_offset);
6701
6702         while (len > 0) {
6703                 page = dst->pages[i];
6704                 assert_eb_page_uptodate(dst, page);
6705
6706                 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
6707
6708                 kaddr = page_address(page);
6709                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
6710
6711                 src_offset += cur;
6712                 len -= cur;
6713                 offset = 0;
6714                 i++;
6715         }
6716 }
6717
6718 /*
6719  * eb_bitmap_offset() - calculate the page and offset of the byte containing the
6720  * given bit number
6721  * @eb: the extent buffer
6722  * @start: offset of the bitmap item in the extent buffer
6723  * @nr: bit number
6724  * @page_index: return index of the page in the extent buffer that contains the
6725  * given bit number
6726  * @page_offset: return offset into the page given by page_index
6727  *
6728  * This helper hides the ugliness of finding the byte in an extent buffer which
6729  * contains a given bit.
6730  */
6731 static inline void eb_bitmap_offset(const struct extent_buffer *eb,
6732                                     unsigned long start, unsigned long nr,
6733                                     unsigned long *page_index,
6734                                     size_t *page_offset)
6735 {
6736         size_t byte_offset = BIT_BYTE(nr);
6737         size_t offset;
6738
6739         /*
6740          * The byte we want is the offset of the extent buffer + the offset of
6741          * the bitmap item in the extent buffer + the offset of the byte in the
6742          * bitmap item.
6743          */
6744         offset = start + offset_in_page(eb->start) + byte_offset;
6745
6746         *page_index = offset >> PAGE_SHIFT;
6747         *page_offset = offset_in_page(offset);
6748 }
6749
6750 /**
6751  * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
6752  * @eb: the extent buffer
6753  * @start: offset of the bitmap item in the extent buffer
6754  * @nr: bit number to test
6755  */
6756 int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
6757                            unsigned long nr)
6758 {
6759         u8 *kaddr;
6760         struct page *page;
6761         unsigned long i;
6762         size_t offset;
6763
6764         eb_bitmap_offset(eb, start, nr, &i, &offset);
6765         page = eb->pages[i];
6766         assert_eb_page_uptodate(eb, page);
6767         kaddr = page_address(page);
6768         return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
6769 }
6770
6771 /**
6772  * extent_buffer_bitmap_set - set an area of a bitmap
6773  * @eb: the extent buffer
6774  * @start: offset of the bitmap item in the extent buffer
6775  * @pos: bit number of the first bit
6776  * @len: number of bits to set
6777  */
6778 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
6779                               unsigned long pos, unsigned long len)
6780 {
6781         u8 *kaddr;
6782         struct page *page;
6783         unsigned long i;
6784         size_t offset;
6785         const unsigned int size = pos + len;
6786         int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
6787         u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
6788
6789         eb_bitmap_offset(eb, start, pos, &i, &offset);
6790         page = eb->pages[i];
6791         assert_eb_page_uptodate(eb, page);
6792         kaddr = page_address(page);
6793
6794         while (len >= bits_to_set) {
6795                 kaddr[offset] |= mask_to_set;
6796                 len -= bits_to_set;
6797                 bits_to_set = BITS_PER_BYTE;
6798                 mask_to_set = ~0;
6799                 if (++offset >= PAGE_SIZE && len > 0) {
6800                         offset = 0;
6801                         page = eb->pages[++i];
6802                         assert_eb_page_uptodate(eb, page);
6803                         kaddr = page_address(page);
6804                 }
6805         }
6806         if (len) {
6807                 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
6808                 kaddr[offset] |= mask_to_set;
6809         }
6810 }
6811
6812
6813 /**
6814  * extent_buffer_bitmap_clear - clear an area of a bitmap
6815  * @eb: the extent buffer
6816  * @start: offset of the bitmap item in the extent buffer
6817  * @pos: bit number of the first bit
6818  * @len: number of bits to clear
6819  */
6820 void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
6821                                 unsigned long start, unsigned long pos,
6822                                 unsigned long len)
6823 {
6824         u8 *kaddr;
6825         struct page *page;
6826         unsigned long i;
6827         size_t offset;
6828         const unsigned int size = pos + len;
6829         int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
6830         u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
6831
6832         eb_bitmap_offset(eb, start, pos, &i, &offset);
6833         page = eb->pages[i];
6834         assert_eb_page_uptodate(eb, page);
6835         kaddr = page_address(page);
6836
6837         while (len >= bits_to_clear) {
6838                 kaddr[offset] &= ~mask_to_clear;
6839                 len -= bits_to_clear;
6840                 bits_to_clear = BITS_PER_BYTE;
6841                 mask_to_clear = ~0;
6842                 if (++offset >= PAGE_SIZE && len > 0) {
6843                         offset = 0;
6844                         page = eb->pages[++i];
6845                         assert_eb_page_uptodate(eb, page);
6846                         kaddr = page_address(page);
6847                 }
6848         }
6849         if (len) {
6850                 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
6851                 kaddr[offset] &= ~mask_to_clear;
6852         }
6853 }
6854
/*
 * Tell whether two areas of @len bytes starting at offsets @src and @dst
 * overlap, i.e. whether the distance between the two offsets is below @len.
 */
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	if (src > dst)
		return src - dst < len;
	return dst - src < len;
}
6860
/*
 * Copy @len bytes from @src_page at @src_off to @dst_page at @dst_off.
 *
 * memmove() is used only when both pages are the same page and the two areas
 * overlap; every other case goes through memcpy().
 */
static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *to = page_address(dst_page);
	char *from;

	if (dst_page == src_page) {
		from = to;
		if (areas_overlap(src_off, dst_off, len)) {
			memmove(to + dst_off, from + src_off, len);
			return;
		}
	} else {
		from = page_address(src_page);
	}

	memcpy(to + dst_off, from + src_off, len);
}
6882
6883 void memcpy_extent_buffer(const struct extent_buffer *dst,
6884                           unsigned long dst_offset, unsigned long src_offset,
6885                           unsigned long len)
6886 {
6887         size_t cur;
6888         size_t dst_off_in_page;
6889         size_t src_off_in_page;
6890         unsigned long dst_i;
6891         unsigned long src_i;
6892
6893         if (check_eb_range(dst, dst_offset, len) ||
6894             check_eb_range(dst, src_offset, len))
6895                 return;
6896
6897         while (len > 0) {
6898                 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
6899                 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
6900
6901                 dst_i = get_eb_page_index(dst_offset);
6902                 src_i = get_eb_page_index(src_offset);
6903
6904                 cur = min(len, (unsigned long)(PAGE_SIZE -
6905                                                src_off_in_page));
6906                 cur = min_t(unsigned long, cur,
6907                         (unsigned long)(PAGE_SIZE - dst_off_in_page));
6908
6909                 copy_pages(dst->pages[dst_i], dst->pages[src_i],
6910                            dst_off_in_page, src_off_in_page, cur);
6911
6912                 src_offset += cur;
6913                 dst_offset += cur;
6914                 len -= cur;
6915         }
6916 }
6917
6918 void memmove_extent_buffer(const struct extent_buffer *dst,
6919                            unsigned long dst_offset, unsigned long src_offset,
6920                            unsigned long len)
6921 {
6922         size_t cur;
6923         size_t dst_off_in_page;
6924         size_t src_off_in_page;
6925         unsigned long dst_end = dst_offset + len - 1;
6926         unsigned long src_end = src_offset + len - 1;
6927         unsigned long dst_i;
6928         unsigned long src_i;
6929
6930         if (check_eb_range(dst, dst_offset, len) ||
6931             check_eb_range(dst, src_offset, len))
6932                 return;
6933         if (dst_offset < src_offset) {
6934                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
6935                 return;
6936         }
6937         while (len > 0) {
6938                 dst_i = get_eb_page_index(dst_end);
6939                 src_i = get_eb_page_index(src_end);
6940
6941                 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
6942                 src_off_in_page = get_eb_offset_in_page(dst, src_end);
6943
6944                 cur = min_t(unsigned long, len, src_off_in_page + 1);
6945                 cur = min(cur, dst_off_in_page + 1);
6946                 copy_pages(dst->pages[dst_i], dst->pages[src_i],
6947                            dst_off_in_page - cur + 1,
6948                            src_off_in_page - cur + 1, cur);
6949
6950                 dst_end -= cur;
6951                 src_end -= cur;
6952                 len -= cur;
6953         }
6954 }
6955
6956 static struct extent_buffer *get_next_extent_buffer(
6957                 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
6958 {
6959         struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
6960         struct extent_buffer *found = NULL;
6961         u64 page_start = page_offset(page);
6962         int ret;
6963         int i;
6964
6965         ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
6966         ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
6967         lockdep_assert_held(&fs_info->buffer_lock);
6968
6969         ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
6970                         bytenr >> fs_info->sectorsize_bits,
6971                         PAGE_SIZE / fs_info->nodesize);
6972         for (i = 0; i < ret; i++) {
6973                 /* Already beyond page end */
6974                 if (gang[i]->start >= page_start + PAGE_SIZE)
6975                         break;
6976                 /* Found one */
6977                 if (gang[i]->start >= bytenr) {
6978                         found = gang[i];
6979                         break;
6980                 }
6981         }
6982         return found;
6983 }
6984
6985 static int try_release_subpage_extent_buffer(struct page *page)
6986 {
6987         struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
6988         u64 cur = page_offset(page);
6989         const u64 end = page_offset(page) + PAGE_SIZE;
6990         int ret;
6991
6992         while (cur < end) {
6993                 struct extent_buffer *eb = NULL;
6994
6995                 /*
6996                  * Unlike try_release_extent_buffer() which uses page->private
6997                  * to grab buffer, for subpage case we rely on radix tree, thus
6998                  * we need to ensure radix tree consistency.
6999                  *
7000                  * We also want an atomic snapshot of the radix tree, thus go
7001                  * with spinlock rather than RCU.
7002                  */
7003                 spin_lock(&fs_info->buffer_lock);
7004                 eb = get_next_extent_buffer(fs_info, page, cur);
7005                 if (!eb) {
7006                         /* No more eb in the page range after or at cur */
7007                         spin_unlock(&fs_info->buffer_lock);
7008                         break;
7009                 }
7010                 cur = eb->start + eb->len;
7011
7012                 /*
7013                  * The same as try_release_extent_buffer(), to ensure the eb
7014                  * won't disappear out from under us.
7015                  */
7016                 spin_lock(&eb->refs_lock);
7017                 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7018                         spin_unlock(&eb->refs_lock);
7019                         spin_unlock(&fs_info->buffer_lock);
7020                         break;
7021                 }
7022                 spin_unlock(&fs_info->buffer_lock);
7023
7024                 /*
7025                  * If tree ref isn't set then we know the ref on this eb is a
7026                  * real ref, so just return, this eb will likely be freed soon
7027                  * anyway.
7028                  */
7029                 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7030                         spin_unlock(&eb->refs_lock);
7031                         break;
7032                 }
7033
7034                 /*
7035                  * Here we don't care about the return value, we will always
7036                  * check the page private at the end.  And
7037                  * release_extent_buffer() will release the refs_lock.
7038                  */
7039                 release_extent_buffer(eb);
7040         }
7041         /*
7042          * Finally to check if we have cleared page private, as if we have
7043          * released all ebs in the page, the page private should be cleared now.
7044          */
7045         spin_lock(&page->mapping->private_lock);
7046         if (!PagePrivate(page))
7047                 ret = 1;
7048         else
7049                 ret = 0;
7050         spin_unlock(&page->mapping->private_lock);
7051         return ret;
7052
7053 }
7054
7055 int try_release_extent_buffer(struct page *page)
7056 {
7057         struct extent_buffer *eb;
7058
7059         if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
7060                 return try_release_subpage_extent_buffer(page);
7061
7062         /*
7063          * We need to make sure nobody is changing page->private, as we rely on
7064          * page->private as the pointer to extent buffer.
7065          */
7066         spin_lock(&page->mapping->private_lock);
7067         if (!PagePrivate(page)) {
7068                 spin_unlock(&page->mapping->private_lock);
7069                 return 1;
7070         }
7071
7072         eb = (struct extent_buffer *)page->private;
7073         BUG_ON(!eb);
7074
7075         /*
7076          * This is a little awful but should be ok, we need to make sure that
7077          * the eb doesn't disappear out from under us while we're looking at
7078          * this page.
7079          */
7080         spin_lock(&eb->refs_lock);
7081         if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7082                 spin_unlock(&eb->refs_lock);
7083                 spin_unlock(&page->mapping->private_lock);
7084                 return 0;
7085         }
7086         spin_unlock(&page->mapping->private_lock);
7087
7088         /*
7089          * If tree ref isn't set then we know the ref on this eb is a real ref,
7090          * so just return, this page will likely be freed soon anyway.
7091          */
7092         if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7093                 spin_unlock(&eb->refs_lock);
7094                 return 0;
7095         }
7096
7097         return release_extent_buffer(eb);
7098 }
7099
7100 /*
7101  * btrfs_readahead_tree_block - attempt to readahead a child block
7102  * @fs_info:    the fs_info
7103  * @bytenr:     bytenr to read
7104  * @owner_root: objectid of the root that owns this eb
7105  * @gen:        generation for the uptodate check, can be 0
7106  * @level:      level for the eb
7107  *
7108  * Attempt to readahead a tree block at @bytenr.  If @gen is 0 then we do a
7109  * normal uptodate check of the eb, without checking the generation.  If we have
7110  * to read the block we will not block on anything.
7111  */
7112 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
7113                                 u64 bytenr, u64 owner_root, u64 gen, int level)
7114 {
7115         struct extent_buffer *eb;
7116         int ret;
7117
7118         eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
7119         if (IS_ERR(eb))
7120                 return;
7121
7122         if (btrfs_buffer_uptodate(eb, gen, 1)) {
7123                 free_extent_buffer(eb);
7124                 return;
7125         }
7126
7127         ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
7128         if (ret < 0)
7129                 free_extent_buffer_stale(eb);
7130         else
7131                 free_extent_buffer(eb);
7132 }
7133
7134 /*
7135  * btrfs_readahead_node_child - readahead a node's child block
7136  * @node:       parent node we're reading from
7137  * @slot:       slot in the parent node for the child we want to read
7138  *
7139  * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
7140  * the slot in the node provided.
7141  */
7142 void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
7143 {
7144         btrfs_readahead_tree_block(node->fs_info,
7145                                    btrfs_node_blockptr(node, slot),
7146                                    btrfs_header_owner(node),
7147                                    btrfs_node_ptr_generation(node, slot),
7148                                    btrfs_header_level(node) - 1);
7149 }