fs/btrfs/scrub.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
21 #include "raid56.h"
22 #include "block-group.h"
23 #include "zoned.h"
24
25 /*
26  * This is only the first step towards a full-featured scrub. It reads all
27  * extents and super blocks and verifies the checksums. In case a bad checksum
28  * is found or the extent cannot be read, good data will be written back if
29  * any can be found.
30  *
31  * Future enhancements:
32  *  - In case an unrepairable extent is encountered, track which files are
33  *    affected and report them
34  *  - track and record media errors, throw out bad devices
35  *  - add a mode to also read unallocated space
36  */
37
38 struct scrub_block;
39 struct scrub_ctx;
40
41 /*
42  * The following two values only influence the performance.
43  *
44  * The second one configures the number of parallel and outstanding I/O
45  * operations. The first one configures an upper limit for the number
46  * of (dynamically allocated) pages that are added to a bio.
47  */
48 #define SCRUB_SECTORS_PER_BIO   32      /* 128KiB per bio for 4KiB pages */
49 #define SCRUB_BIOS_PER_SCTX     64      /* 8MiB per device in flight for 4KiB pages */
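
/*
 * Worked out for 4KiB sectors (illustrative only): 32 sectors * 4KiB = 128KiB
 * per bio, and 64 bios * 128KiB = 8MiB of scrub I/O in flight per device,
 * matching the notes next to the two defines above.
 */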
50
51 /*
52  * The following value times 4KiB (the minimum sector size) needs to be large
53  * enough to match the largest node/leaf/sector size that shall be supported.
54  */
55 #define SCRUB_MAX_SECTORS_PER_BLOCK     (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
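
/*
 * For example (assuming the usual 64KiB BTRFS_MAX_METADATA_BLOCKSIZE), this
 * evaluates to 16, i.e. a scrub_block holds at most sixteen 4KiB sectors.
 */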
56
57 struct scrub_recover {
58         refcount_t              refs;
59         struct btrfs_io_context *bioc;
60         u64                     map_length;
61 };
62
63 struct scrub_sector {
64         struct scrub_block      *sblock;
65         struct page             *page;
66         struct btrfs_device     *dev;
67         struct list_head        list;
68         u64                     flags;  /* extent flags */
69         u64                     generation;
70         u64                     logical;
71         u64                     physical;
72         u64                     physical_for_dev_replace;
73         atomic_t                refs;
74         u8                      mirror_num;
75         unsigned int            have_csum:1;
76         unsigned int            io_error:1;
77         u8                      csum[BTRFS_CSUM_SIZE];
78
79         struct scrub_recover    *recover;
80 };
81
82 struct scrub_bio {
83         int                     index;
84         struct scrub_ctx        *sctx;
85         struct btrfs_device     *dev;
86         struct bio              *bio;
87         blk_status_t            status;
88         u64                     logical;
89         u64                     physical;
90         struct scrub_sector     *sectors[SCRUB_SECTORS_PER_BIO];
91         int                     sector_count;
92         int                     next_free;
93         struct work_struct      work;
94 };
95
96 struct scrub_block {
97         struct scrub_sector     *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
98         int                     sector_count;
99         atomic_t                outstanding_sectors;
100         refcount_t              refs; /* free mem on transition to zero */
101         struct scrub_ctx        *sctx;
102         struct scrub_parity     *sparity;
103         struct {
104                 unsigned int    header_error:1;
105                 unsigned int    checksum_error:1;
106                 unsigned int    no_io_error_seen:1;
107                 unsigned int    generation_error:1; /* also sets header_error */
108
109                 /* The following is for the data used to check parity. */
110                 /* It only applies to data with a checksum. */
111                 unsigned int    data_corrected:1;
112         };
113         struct work_struct      work;
114 };
115
116 /* Used for the chunks with parity stripes such as RAID5/6 */
117 struct scrub_parity {
118         struct scrub_ctx        *sctx;
119
120         struct btrfs_device     *scrub_dev;
121
122         u64                     logic_start;
123
124         u64                     logic_end;
125
126         int                     nsectors;
127
128         u32                     stripe_len;
129
130         refcount_t              refs;
131
132         struct list_head        sectors_list;
133
134         /* Work of parity check and repair */
135         struct work_struct      work;
136
137         /* Mark the parity blocks which have data */
138         unsigned long           dbitmap;
139
140         /*
141          * Mark the parity blocks which have data, but where errors happened
142          * when reading or checking the data
143          */
144         unsigned long           ebitmap;
145 };
146
147 struct scrub_ctx {
148         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
149         struct btrfs_fs_info    *fs_info;
150         int                     first_free;
151         int                     curr;
152         atomic_t                bios_in_flight;
153         atomic_t                workers_pending;
154         spinlock_t              list_lock;
155         wait_queue_head_t       list_wait;
156         struct list_head        csum_list;
157         atomic_t                cancel_req;
158         int                     readonly;
159         int                     sectors_per_bio;
160
161         /* State of IO submission throttling affecting the associated device */
162         ktime_t                 throttle_deadline;
163         u64                     throttle_sent;
164
165         int                     is_dev_replace;
166         u64                     write_pointer;
167
168         struct scrub_bio        *wr_curr_bio;
169         struct mutex            wr_lock;
170         struct btrfs_device     *wr_tgtdev;
171         bool                    flush_all_writes;
172
173         /*
174          * statistics
175          */
176         struct btrfs_scrub_progress stat;
177         spinlock_t              stat_lock;
178
179         /*
180          * Use a ref counter to avoid use-after-free issues. Scrub workers
181          * decrement bios_in_flight and workers_pending and then do a wakeup
182          * on the list_wait wait queue. We must ensure the main scrub task
183          * doesn't free the scrub context before or while the workers are
184          * doing the wakeup() call.
185          */
186         refcount_t              refs;
187 };
188
189 struct scrub_warning {
190         struct btrfs_path       *path;
191         u64                     extent_item_size;
192         const char              *errstr;
193         u64                     physical;
194         u64                     logical;
195         struct btrfs_device     *dev;
196 };
197
198 struct full_stripe_lock {
199         struct rb_node node;
200         u64 logical;
201         u64 refs;
202         struct mutex mutex;
203 };
204
205 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
206                                      struct scrub_block *sblocks_for_recheck);
207 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
208                                 struct scrub_block *sblock,
209                                 int retry_failed_mirror);
210 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
211 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212                                              struct scrub_block *sblock_good);
213 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
214                                             struct scrub_block *sblock_good,
215                                             int sector_num, int force_write);
216 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
217 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
218                                              int sector_num);
219 static int scrub_checksum_data(struct scrub_block *sblock);
220 static int scrub_checksum_tree_block(struct scrub_block *sblock);
221 static int scrub_checksum_super(struct scrub_block *sblock);
222 static void scrub_block_put(struct scrub_block *sblock);
223 static void scrub_sector_get(struct scrub_sector *sector);
224 static void scrub_sector_put(struct scrub_sector *sector);
225 static void scrub_parity_get(struct scrub_parity *sparity);
226 static void scrub_parity_put(struct scrub_parity *sparity);
227 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
228                          u64 physical, struct btrfs_device *dev, u64 flags,
229                          u64 gen, int mirror_num, u8 *csum,
230                          u64 physical_for_dev_replace);
231 static void scrub_bio_end_io(struct bio *bio);
232 static void scrub_bio_end_io_worker(struct work_struct *work);
233 static void scrub_block_complete(struct scrub_block *sblock);
234 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
235                                  u64 extent_logical, u32 extent_len,
236                                  u64 *extent_physical,
237                                  struct btrfs_device **extent_dev,
238                                  int *extent_mirror_num);
239 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
240                                       struct scrub_sector *sector);
241 static void scrub_wr_submit(struct scrub_ctx *sctx);
242 static void scrub_wr_bio_end_io(struct bio *bio);
243 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
244 static void scrub_put_ctx(struct scrub_ctx *sctx);
245
246 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
247 {
248         return sector->recover &&
249                (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
250 }
251
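/*
 * Each in-flight bio pins the scrub context: the inc helper below takes an
 * extra reference on sctx which the matching dec helper drops only after its
 * wake_up(), so the main scrub task cannot free the context while a worker
 * is still waking it (see the comment above the refs member of scrub_ctx).
 */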
252 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
253 {
254         refcount_inc(&sctx->refs);
255         atomic_inc(&sctx->bios_in_flight);
256 }
257
258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259 {
260         atomic_dec(&sctx->bios_in_flight);
261         wake_up(&sctx->list_wait);
262         scrub_put_ctx(sctx);
263 }
264
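/*
 * Pause protocol: whoever wants scrub to pause raises
 * fs_info->scrub_pause_req.  The helpers below account this scrub as paused
 * (scrubs_paused), wake any waiters, and block until the pause request is
 * dropped again before resuming.
 */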
265 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
266 {
267         while (atomic_read(&fs_info->scrub_pause_req)) {
268                 mutex_unlock(&fs_info->scrub_lock);
269                 wait_event(fs_info->scrub_pause_wait,
270                    atomic_read(&fs_info->scrub_pause_req) == 0);
271                 mutex_lock(&fs_info->scrub_lock);
272         }
273 }
274
275 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
276 {
277         atomic_inc(&fs_info->scrubs_paused);
278         wake_up(&fs_info->scrub_pause_wait);
279 }
280
281 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
282 {
283         mutex_lock(&fs_info->scrub_lock);
284         __scrub_blocked_if_needed(fs_info);
285         atomic_dec(&fs_info->scrubs_paused);
286         mutex_unlock(&fs_info->scrub_lock);
287
288         wake_up(&fs_info->scrub_pause_wait);
289 }
290
291 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
292 {
293         scrub_pause_on(fs_info);
294         scrub_pause_off(fs_info);
295 }
296
297 /*
298  * Insert a new full stripe lock into the full stripe locks tree
299  *
300  * Return pointer to existing or newly inserted full_stripe_lock structure if
301  * everything works well.
302  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
303  *
304  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
305  * function
306  */
307 static struct full_stripe_lock *insert_full_stripe_lock(
308                 struct btrfs_full_stripe_locks_tree *locks_root,
309                 u64 fstripe_logical)
310 {
311         struct rb_node **p;
312         struct rb_node *parent = NULL;
313         struct full_stripe_lock *entry;
314         struct full_stripe_lock *ret;
315
316         lockdep_assert_held(&locks_root->lock);
317
318         p = &locks_root->root.rb_node;
319         while (*p) {
320                 parent = *p;
321                 entry = rb_entry(parent, struct full_stripe_lock, node);
322                 if (fstripe_logical < entry->logical) {
323                         p = &(*p)->rb_left;
324                 } else if (fstripe_logical > entry->logical) {
325                         p = &(*p)->rb_right;
326                 } else {
327                         entry->refs++;
328                         return entry;
329                 }
330         }
331
332         /*
333          * Insert new lock.
334          */
335         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
336         if (!ret)
337                 return ERR_PTR(-ENOMEM);
338         ret->logical = fstripe_logical;
339         ret->refs = 1;
340         mutex_init(&ret->mutex);
341
342         rb_link_node(&ret->node, parent, p);
343         rb_insert_color(&ret->node, &locks_root->root);
344         return ret;
345 }
346
347 /*
348  * Search for a full stripe lock of a block group
349  *
350  * Return pointer to existing full stripe lock if found
351  * Return NULL if not found
352  */
353 static struct full_stripe_lock *search_full_stripe_lock(
354                 struct btrfs_full_stripe_locks_tree *locks_root,
355                 u64 fstripe_logical)
356 {
357         struct rb_node *node;
358         struct full_stripe_lock *entry;
359
360         lockdep_assert_held(&locks_root->lock);
361
362         node = locks_root->root.rb_node;
363         while (node) {
364                 entry = rb_entry(node, struct full_stripe_lock, node);
365                 if (fstripe_logical < entry->logical)
366                         node = node->rb_left;
367                 else if (fstripe_logical > entry->logical)
368                         node = node->rb_right;
369                 else
370                         return entry;
371         }
372         return NULL;
373 }
374
375 /*
376  * Helper to get full stripe logical from a normal bytenr.
377  *
378  * Caller must ensure @cache is a RAID56 block group.
379  */
380 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
381 {
382         u64 ret;
383
384         /*
385          * Due to chunk item size limit, full stripe length should not be
386          * larger than U32_MAX. Just a sanity check here.
387          */
388         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
389
390         /*
391          * round_down() can only handle power of 2, while RAID56 full
392          * stripe length can be 64KiB * n, so we need to manually round down.
393          */
394         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
395                         cache->full_stripe_len + cache->start;
396         return ret;
397 }
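
/*
 * Illustrative example (assumed values): a RAID5 chunk with three data
 * stripes and a 64KiB stripe length has full_stripe_len == 192KiB, which is
 * not a power of two.  With cache->start == 1GiB and bytenr == 1GiB + 200KiB
 * the division yields 1, so the returned full stripe start is 1GiB + 192KiB.
 */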
398
399 /*
400  * Lock a full stripe to avoid concurrency between recovery and read
401  *
402  * It's only used for profiles with parity (RAID5/6); for other profiles it
403  * does nothing.
404  *
405  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
406  * The caller must then call unlock_full_stripe() in the same context.
407  *
408  * Return <0 if an error is encountered.
409  */
410 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
411                             bool *locked_ret)
412 {
413         struct btrfs_block_group *bg_cache;
414         struct btrfs_full_stripe_locks_tree *locks_root;
415         struct full_stripe_lock *existing;
416         u64 fstripe_start;
417         int ret = 0;
418
419         *locked_ret = false;
420         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
421         if (!bg_cache) {
422                 ASSERT(0);
423                 return -ENOENT;
424         }
425
426         /* Profiles not based on parity don't need full stripe lock */
427         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
428                 goto out;
429         locks_root = &bg_cache->full_stripe_locks_root;
430
431         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
432
433         /* Now insert the full stripe lock */
434         mutex_lock(&locks_root->lock);
435         existing = insert_full_stripe_lock(locks_root, fstripe_start);
436         mutex_unlock(&locks_root->lock);
437         if (IS_ERR(existing)) {
438                 ret = PTR_ERR(existing);
439                 goto out;
440         }
441         mutex_lock(&existing->mutex);
442         *locked_ret = true;
443 out:
444         btrfs_put_block_group(bg_cache);
445         return ret;
446 }
447
448 /*
449  * Unlock a full stripe.
450  *
451  * NOTE: The caller must ensure this runs in the same context as the
452  * corresponding lock_full_stripe().
453  *
454  * Return 0 if we unlocked the full stripe without problems.
455  * Return <0 on error.
456  */
457 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
458                               bool locked)
459 {
460         struct btrfs_block_group *bg_cache;
461         struct btrfs_full_stripe_locks_tree *locks_root;
462         struct full_stripe_lock *fstripe_lock;
463         u64 fstripe_start;
464         bool freeit = false;
465         int ret = 0;
466
467         /* If we didn't acquire full stripe lock, no need to continue */
468         if (!locked)
469                 return 0;
470
471         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
472         if (!bg_cache) {
473                 ASSERT(0);
474                 return -ENOENT;
475         }
476         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
477                 goto out;
478
479         locks_root = &bg_cache->full_stripe_locks_root;
480         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
481
482         mutex_lock(&locks_root->lock);
483         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
484         /* Unpaired unlock_full_stripe() detected */
485         if (!fstripe_lock) {
486                 WARN_ON(1);
487                 ret = -ENOENT;
488                 mutex_unlock(&locks_root->lock);
489                 goto out;
490         }
491
492         if (fstripe_lock->refs == 0) {
493                 WARN_ON(1);
494                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
495                         fstripe_lock->logical);
496         } else {
497                 fstripe_lock->refs--;
498         }
499
500         if (fstripe_lock->refs == 0) {
501                 rb_erase(&fstripe_lock->node, &locks_root->root);
502                 freeit = true;
503         }
504         mutex_unlock(&locks_root->lock);
505
506         mutex_unlock(&fstripe_lock->mutex);
507         if (freeit)
508                 kfree(fstripe_lock);
509 out:
510         btrfs_put_block_group(bg_cache);
511         return ret;
512 }
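
/*
 * Typical pairing of the two helpers above (sketch only, mirroring what
 * scrub_handle_errored_block() does further below):
 *
 *	bool locked = false;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	... recheck/repair the block while the full stripe is locked ...
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 *
 * For non-RAID56 block groups both calls boil down to a block group lookup
 * and are effectively no-ops.
 */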
513
514 static void scrub_free_csums(struct scrub_ctx *sctx)
515 {
516         while (!list_empty(&sctx->csum_list)) {
517                 struct btrfs_ordered_sum *sum;
518                 sum = list_first_entry(&sctx->csum_list,
519                                        struct btrfs_ordered_sum, list);
520                 list_del(&sum->list);
521                 kfree(sum);
522         }
523 }
524
525 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
526 {
527         int i;
528
529         if (!sctx)
530                 return;
531
532         /* this can happen when scrub is cancelled */
533         if (sctx->curr != -1) {
534                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
535
536                 for (i = 0; i < sbio->sector_count; i++) {
537                         WARN_ON(!sbio->sectors[i]->page);
538                         scrub_block_put(sbio->sectors[i]->sblock);
539                 }
540                 bio_put(sbio->bio);
541         }
542
543         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
544                 struct scrub_bio *sbio = sctx->bios[i];
545
546                 if (!sbio)
547                         break;
548                 kfree(sbio);
549         }
550
551         kfree(sctx->wr_curr_bio);
552         scrub_free_csums(sctx);
553         kfree(sctx);
554 }
555
556 static void scrub_put_ctx(struct scrub_ctx *sctx)
557 {
558         if (refcount_dec_and_test(&sctx->refs))
559                 scrub_free_ctx(sctx);
560 }
561
562 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
563                 struct btrfs_fs_info *fs_info, int is_dev_replace)
564 {
565         struct scrub_ctx *sctx;
566         int             i;
567
568         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
569         if (!sctx)
570                 goto nomem;
571         refcount_set(&sctx->refs, 1);
572         sctx->is_dev_replace = is_dev_replace;
573         sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
574         sctx->curr = -1;
575         sctx->fs_info = fs_info;
576         INIT_LIST_HEAD(&sctx->csum_list);
577         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
578                 struct scrub_bio *sbio;
579
580                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
581                 if (!sbio)
582                         goto nomem;
583                 sctx->bios[i] = sbio;
584
585                 sbio->index = i;
586                 sbio->sctx = sctx;
587                 sbio->sector_count = 0;
588                 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
589
590                 if (i != SCRUB_BIOS_PER_SCTX - 1)
591                         sctx->bios[i]->next_free = i + 1;
592                 else
593                         sctx->bios[i]->next_free = -1;
594         }
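        /*
         * bios[] now forms a singly linked free list: each entry's next_free
         * holds the index of the next free bio, the last entry terminates the
         * list with -1, and first_free (set below) is its head.
         */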
595         sctx->first_free = 0;
596         atomic_set(&sctx->bios_in_flight, 0);
597         atomic_set(&sctx->workers_pending, 0);
598         atomic_set(&sctx->cancel_req, 0);
599
600         spin_lock_init(&sctx->list_lock);
601         spin_lock_init(&sctx->stat_lock);
602         init_waitqueue_head(&sctx->list_wait);
603         sctx->throttle_deadline = 0;
604
605         WARN_ON(sctx->wr_curr_bio != NULL);
606         mutex_init(&sctx->wr_lock);
607         sctx->wr_curr_bio = NULL;
608         if (is_dev_replace) {
609                 WARN_ON(!fs_info->dev_replace.tgtdev);
610                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
611                 sctx->flush_all_writes = false;
612         }
613
614         return sctx;
615
616 nomem:
617         scrub_free_ctx(sctx);
618         return ERR_PTR(-ENOMEM);
619 }
620
621 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
622                                      void *warn_ctx)
623 {
624         u32 nlink;
625         int ret;
626         int i;
627         unsigned nofs_flag;
628         struct extent_buffer *eb;
629         struct btrfs_inode_item *inode_item;
630         struct scrub_warning *swarn = warn_ctx;
631         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
632         struct inode_fs_paths *ipath = NULL;
633         struct btrfs_root *local_root;
634         struct btrfs_key key;
635
636         local_root = btrfs_get_fs_root(fs_info, root, true);
637         if (IS_ERR(local_root)) {
638                 ret = PTR_ERR(local_root);
639                 goto err;
640         }
641
642         /*
643          * this makes the path point to (inum INODE_ITEM ioff)
644          */
645         key.objectid = inum;
646         key.type = BTRFS_INODE_ITEM_KEY;
647         key.offset = 0;
648
649         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
650         if (ret) {
651                 btrfs_put_root(local_root);
652                 btrfs_release_path(swarn->path);
653                 goto err;
654         }
655
656         eb = swarn->path->nodes[0];
657         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
658                                         struct btrfs_inode_item);
659         nlink = btrfs_inode_nlink(eb, inode_item);
660         btrfs_release_path(swarn->path);
661
662         /*
663          * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
664          * uses GFP_NOFS in this context, so we keep it consistent but it does
665          * not seem to be strictly necessary.
666          */
667         nofs_flag = memalloc_nofs_save();
668         ipath = init_ipath(4096, local_root, swarn->path);
669         memalloc_nofs_restore(nofs_flag);
670         if (IS_ERR(ipath)) {
671                 btrfs_put_root(local_root);
672                 ret = PTR_ERR(ipath);
673                 ipath = NULL;
674                 goto err;
675         }
676         ret = paths_from_inode(inum, ipath);
677
678         if (ret < 0)
679                 goto err;
680
681         /*
682          * we deliberately ignore the fact that ipath might have been too small to
683          * hold all of the paths here
684          */
685         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
686                 btrfs_warn_in_rcu(fs_info,
687 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
688                                   swarn->errstr, swarn->logical,
689                                   rcu_str_deref(swarn->dev->name),
690                                   swarn->physical,
691                                   root, inum, offset,
692                                   fs_info->sectorsize, nlink,
693                                   (char *)(unsigned long)ipath->fspath->val[i]);
694
695         btrfs_put_root(local_root);
696         free_ipath(ipath);
697         return 0;
698
699 err:
700         btrfs_warn_in_rcu(fs_info,
701                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
702                           swarn->errstr, swarn->logical,
703                           rcu_str_deref(swarn->dev->name),
704                           swarn->physical,
705                           root, inum, offset, ret);
706
707         free_ipath(ipath);
708         return 0;
709 }
710
711 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
712 {
713         struct btrfs_device *dev;
714         struct btrfs_fs_info *fs_info;
715         struct btrfs_path *path;
716         struct btrfs_key found_key;
717         struct extent_buffer *eb;
718         struct btrfs_extent_item *ei;
719         struct scrub_warning swarn;
720         unsigned long ptr = 0;
721         u64 extent_item_pos;
722         u64 flags = 0;
723         u64 ref_root;
724         u32 item_size;
725         u8 ref_level = 0;
726         int ret;
727
728         WARN_ON(sblock->sector_count < 1);
729         dev = sblock->sectors[0]->dev;
730         fs_info = sblock->sctx->fs_info;
731
732         path = btrfs_alloc_path();
733         if (!path)
734                 return;
735
736         swarn.physical = sblock->sectors[0]->physical;
737         swarn.logical = sblock->sectors[0]->logical;
738         swarn.errstr = errstr;
739         swarn.dev = NULL;
740
741         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
742                                   &flags);
743         if (ret < 0)
744                 goto out;
745
746         extent_item_pos = swarn.logical - found_key.objectid;
747         swarn.extent_item_size = found_key.offset;
748
749         eb = path->nodes[0];
750         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
751         item_size = btrfs_item_size(eb, path->slots[0]);
752
753         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
754                 do {
755                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
756                                                       item_size, &ref_root,
757                                                       &ref_level);
758                         btrfs_warn_in_rcu(fs_info,
759 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
760                                 errstr, swarn.logical,
761                                 rcu_str_deref(dev->name),
762                                 swarn.physical,
763                                 ref_level ? "node" : "leaf",
764                                 ret < 0 ? -1 : ref_level,
765                                 ret < 0 ? -1 : ref_root);
766                 } while (ret != 1);
767                 btrfs_release_path(path);
768         } else {
769                 btrfs_release_path(path);
770                 swarn.path = path;
771                 swarn.dev = dev;
772                 iterate_extent_inodes(fs_info, found_key.objectid,
773                                         extent_item_pos, 1,
774                                         scrub_print_warning_inode, &swarn, false);
775         }
776
777 out:
778         btrfs_free_path(path);
779 }
780
781 static inline void scrub_get_recover(struct scrub_recover *recover)
782 {
783         refcount_inc(&recover->refs);
784 }
785
786 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
787                                      struct scrub_recover *recover)
788 {
789         if (refcount_dec_and_test(&recover->refs)) {
790                 btrfs_bio_counter_dec(fs_info);
791                 btrfs_put_bioc(recover->bioc);
792                 kfree(recover);
793         }
794 }
795
796 /*
797  * scrub_handle_errored_block gets called when either verification of the
798  * sectors failed or the bio failed to read, e.g. with EIO. In the latter
799  * case, this function handles all sectors in the bio, even though only one
800  * may be bad.
801  * The goal of this function is to repair the errored block by using the
802  * contents of one of the mirrors.
803  */
804 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
805 {
806         struct scrub_ctx *sctx = sblock_to_check->sctx;
807         struct btrfs_device *dev;
808         struct btrfs_fs_info *fs_info;
809         u64 logical;
810         unsigned int failed_mirror_index;
811         unsigned int is_metadata;
812         unsigned int have_csum;
813         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
814         struct scrub_block *sblock_bad;
815         int ret;
816         int mirror_index;
817         int sector_num;
818         int success;
819         bool full_stripe_locked;
820         unsigned int nofs_flag;
821         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
822                                       DEFAULT_RATELIMIT_BURST);
823
824         BUG_ON(sblock_to_check->sector_count < 1);
825         fs_info = sctx->fs_info;
826         if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
827                 /*
828                  * If we find an error in a super block, we just report it.
829                  * It will get written with the next transaction commit
830                  * anyway.
831                  */
832                 spin_lock(&sctx->stat_lock);
833                 ++sctx->stat.super_errors;
834                 spin_unlock(&sctx->stat_lock);
835                 return 0;
836         }
837         logical = sblock_to_check->sectors[0]->logical;
838         BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
839         failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
840         is_metadata = !(sblock_to_check->sectors[0]->flags &
841                         BTRFS_EXTENT_FLAG_DATA);
842         have_csum = sblock_to_check->sectors[0]->have_csum;
843         dev = sblock_to_check->sectors[0]->dev;
844
845         if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
846                 return 0;
847
848         /*
849          * We must use GFP_NOFS because the scrub task might be waiting for a
850          * worker task executing this function and in turn a transaction commit
851          * might be waiting for the scrub task to pause (which needs to wait for all
852          * the worker tasks to complete before pausing).
853          * We do allocations in the workers through insert_full_stripe_lock()
854          * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
855          * this function.
856          */
857         nofs_flag = memalloc_nofs_save();
858         /*
859          * For RAID5/6, a race can happen with the scrub thread of a different
860          * device. On data corruption, the parity and data threads will both
861          * try to recover the data.
862          * The race can lead to doubly counted csum errors, or even an
863          * unrecoverable error.
864          */
865         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
866         if (ret < 0) {
867                 memalloc_nofs_restore(nofs_flag);
868                 spin_lock(&sctx->stat_lock);
869                 if (ret == -ENOMEM)
870                         sctx->stat.malloc_errors++;
871                 sctx->stat.read_errors++;
872                 sctx->stat.uncorrectable_errors++;
873                 spin_unlock(&sctx->stat_lock);
874                 return ret;
875         }
876
877         /*
878          * Read all mirrors one after the other. This includes re-reading
879          * the extent or metadata block that failed (which was the reason
880          * this fixup code is called) another time,
881          * sector by sector this time in order to know which sectors
882          * caused I/O errors and which ones are good (for all mirrors).
883          * It is the goal to handle the situation when more than one
884          * mirror contains I/O errors, but the errors do not
885          * overlap, i.e. the data can be repaired by selecting the
886          * sectors from those mirrors without I/O error on the
887          * particular sectors. One example (with blocks >= 2 * sectorsize)
888          * would be that mirror #1 has an I/O error on the first sector,
889          * the second sector is good, and mirror #2 has an I/O error on
890          * the second sector, but the first sector is good.
891          * Then the first sector of the first mirror can be repaired by
892          * taking the first sector of the second mirror, and the
893          * second sector of the second mirror can be repaired by
894          * copying the contents of the 2nd sector of the 1st mirror.
895          * One more note: if the sectors of one mirror contain I/O
896          * errors, the checksum cannot be verified. In order to get
897          * the best data for repairing, the first attempt is to find
898          * a mirror without I/O errors and with a validated checksum.
899          * Only if this is not possible, the sectors are picked from
900          * mirrors with I/O errors without considering the checksum.
901          * If the latter is the case, at the end, the checksum of the
902          * repaired area is verified in order to correctly maintain
903          * the statistics.
904          */
905
906         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
907                                       sizeof(*sblocks_for_recheck), GFP_KERNEL);
908         if (!sblocks_for_recheck) {
909                 spin_lock(&sctx->stat_lock);
910                 sctx->stat.malloc_errors++;
911                 sctx->stat.read_errors++;
912                 sctx->stat.uncorrectable_errors++;
913                 spin_unlock(&sctx->stat_lock);
914                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
915                 goto out;
916         }
917
918         /* Setup the context, map the logical blocks and alloc the sectors */
919         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
920         if (ret) {
921                 spin_lock(&sctx->stat_lock);
922                 sctx->stat.read_errors++;
923                 sctx->stat.uncorrectable_errors++;
924                 spin_unlock(&sctx->stat_lock);
925                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
926                 goto out;
927         }
928         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
929         sblock_bad = sblocks_for_recheck + failed_mirror_index;
930
931         /* build and submit the bios for the failed mirror, check checksums */
932         scrub_recheck_block(fs_info, sblock_bad, 1);
933
934         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
935             sblock_bad->no_io_error_seen) {
936                 /*
937                  * The error disappeared after reading sector by sector, or
938                  * the area was part of a huge bio and other parts of the
939                  * bio caused I/O errors, or the block layer merged several
940                  * read requests into one and the error is caused by a
941                  * different bio (usually one of the two latter cases is
942                  * the cause)
943                  */
944                 spin_lock(&sctx->stat_lock);
945                 sctx->stat.unverified_errors++;
946                 sblock_to_check->data_corrected = 1;
947                 spin_unlock(&sctx->stat_lock);
948
949                 if (sctx->is_dev_replace)
950                         scrub_write_block_to_dev_replace(sblock_bad);
951                 goto out;
952         }
953
954         if (!sblock_bad->no_io_error_seen) {
955                 spin_lock(&sctx->stat_lock);
956                 sctx->stat.read_errors++;
957                 spin_unlock(&sctx->stat_lock);
958                 if (__ratelimit(&rs))
959                         scrub_print_warning("i/o error", sblock_to_check);
960                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
961         } else if (sblock_bad->checksum_error) {
962                 spin_lock(&sctx->stat_lock);
963                 sctx->stat.csum_errors++;
964                 spin_unlock(&sctx->stat_lock);
965                 if (__ratelimit(&rs))
966                         scrub_print_warning("checksum error", sblock_to_check);
967                 btrfs_dev_stat_inc_and_print(dev,
968                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
969         } else if (sblock_bad->header_error) {
970                 spin_lock(&sctx->stat_lock);
971                 sctx->stat.verify_errors++;
972                 spin_unlock(&sctx->stat_lock);
973                 if (__ratelimit(&rs))
974                         scrub_print_warning("checksum/header error",
975                                             sblock_to_check);
976                 if (sblock_bad->generation_error)
977                         btrfs_dev_stat_inc_and_print(dev,
978                                 BTRFS_DEV_STAT_GENERATION_ERRS);
979                 else
980                         btrfs_dev_stat_inc_and_print(dev,
981                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
982         }
983
984         if (sctx->readonly) {
985                 ASSERT(!sctx->is_dev_replace);
986                 goto out;
987         }
988
989         /*
990          * now build and submit the bios for the other mirrors, check
991          * checksums.
992          * First try to pick the mirror which is completely without I/O
993          * errors and also does not have a checksum error.
994          * If one is found, and if a checksum is present, the full block
995          * that is known to contain an error is rewritten. Afterwards
996          * the block is known to be corrected.
997          * If a mirror is found which is completely correct, and no
998          * checksum is present, only those sectors are rewritten that had
999          * an I/O error in the block to be repaired, since it cannot be
1000          * determined, which copy of the other sectors is better (and it
1001          * could happen otherwise that a correct sector would be
1002          * overwritten by a bad one).
1003          */
1004         for (mirror_index = 0; ;mirror_index++) {
1005                 struct scrub_block *sblock_other;
1006
1007                 if (mirror_index == failed_mirror_index)
1008                         continue;
1009
1010                 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1011                 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1013                                 break;
1014                         if (!sblocks_for_recheck[mirror_index].sector_count)
1015                                 break;
1016
1017                         sblock_other = sblocks_for_recheck + mirror_index;
1018                 } else {
1019                         struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020                         int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1021
1022                         if (mirror_index >= max_allowed)
1023                                 break;
1024                         if (!sblocks_for_recheck[1].sector_count)
1025                                 break;
1026
1027                         ASSERT(failed_mirror_index == 0);
1028                         sblock_other = sblocks_for_recheck + 1;
1029                         sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1030                 }
1031
1032                 /* build and submit the bios, check checksums */
1033                 scrub_recheck_block(fs_info, sblock_other, 0);
1034
1035                 if (!sblock_other->header_error &&
1036                     !sblock_other->checksum_error &&
1037                     sblock_other->no_io_error_seen) {
1038                         if (sctx->is_dev_replace) {
1039                                 scrub_write_block_to_dev_replace(sblock_other);
1040                                 goto corrected_error;
1041                         } else {
1042                                 ret = scrub_repair_block_from_good_copy(
1043                                                 sblock_bad, sblock_other);
1044                                 if (!ret)
1045                                         goto corrected_error;
1046                         }
1047                 }
1048         }
1049
1050         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051                 goto did_not_correct_error;
1052
1053         /*
1054          * In case of I/O errors in the area that is supposed to be
1055          * repaired, continue by picking good copies of those sectors.
1056          * Select the good sectors from mirrors to rewrite bad sectors from
1057          * the area to fix. Afterwards verify the checksum of the block
1058          * that is supposed to be repaired. This verification step is
1059          * only done for the purpose of statistics counting and for the
1060          * final scrub report on whether errors remain.
1061          * A perfect algorithm could make use of the checksum and try
1062          * all possible combinations of sectors from the different mirrors
1063          * until the checksum verification succeeds. For example, when
1064          * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1065          * of mirror #2 is readable but the final checksum test fails,
1066          * then the 2nd sector of mirror #3 could be tried, to see whether
1067          * the final checksum now succeeds. But this would be a rare
1068          * exception and is therefore not implemented. At least it is
1069          * avoided that the good copy is overwritten.
1070          * A more useful improvement would be to pick the sectors
1071          * without I/O error based on sector sizes (512 bytes on legacy
1072          * disks) instead of on sectorsize. Then maybe 512 byte of one
1073          * mirror could be repaired by taking 512 byte of a different
1074          * mirror, even if other 512 byte sectors in the same sectorsize
1075          * area are unreadable.
1076          */
1077         success = 1;
1078         for (sector_num = 0; sector_num < sblock_bad->sector_count;
1079              sector_num++) {
1080                 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081                 struct scrub_block *sblock_other = NULL;
1082
1083                 /* Skip no-io-error sectors in scrub */
1084                 if (!sector_bad->io_error && !sctx->is_dev_replace)
1085                         continue;
1086
1087                 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1088                         /*
1089                          * In case of dev replace, if the raid56 rebuild process
1090                          * did not produce correct data, then copy the content
1091                          * of sblock_bad to make sure the target device is identical
1092                          * to the source device, instead of writing garbage data from
1093                          * the sblock_for_recheck array to the target device.
1094                          */
1095                         sblock_other = NULL;
1096                 } else if (sector_bad->io_error) {
1097                         /* Try to find no-io-error sector in mirrors */
1098                         for (mirror_index = 0;
1099                              mirror_index < BTRFS_MAX_MIRRORS &&
1100                              sblocks_for_recheck[mirror_index].sector_count > 0;
1101                              mirror_index++) {
1102                                 if (!sblocks_for_recheck[mirror_index].
1103                                     sectors[sector_num]->io_error) {
1104                                         sblock_other = sblocks_for_recheck +
1105                                                        mirror_index;
1106                                         break;
1107                                 }
1108                         }
1109                         if (!sblock_other)
1110                                 success = 0;
1111                 }
1112
1113                 if (sctx->is_dev_replace) {
1114                         /*
1115                          * Did not find a mirror to fetch the sector from.
1116                          * scrub_write_sector_to_dev_replace() handles this
1117                          * case (sector->io_error), by filling the block with
1118                          * zeros before submitting the write request
1119                          */
1120                         if (!sblock_other)
1121                                 sblock_other = sblock_bad;
1122
1123                         if (scrub_write_sector_to_dev_replace(sblock_other,
1124                                                               sector_num) != 0) {
1125                                 atomic64_inc(
1126                                         &fs_info->dev_replace.num_write_errors);
1127                                 success = 0;
1128                         }
1129                 } else if (sblock_other) {
1130                         ret = scrub_repair_sector_from_good_copy(sblock_bad,
1131                                                                  sblock_other,
1132                                                                  sector_num, 0);
1133                         if (0 == ret)
1134                                 sector_bad->io_error = 0;
1135                         else
1136                                 success = 0;
1137                 }
1138         }
1139
1140         if (success && !sctx->is_dev_replace) {
1141                 if (is_metadata || have_csum) {
1142                         /*
1143                          * need to verify the checksum now that all
1144                          * sectors on disk are repaired (the write
1145                          * request for data to be repaired is on its way).
1146                          * Just be lazy and use scrub_recheck_block()
1147                          * which re-reads the data before the checksum
1148                          * is verified, but most likely the data comes out
1149                          * of the page cache.
1150                          */
1151                         scrub_recheck_block(fs_info, sblock_bad, 1);
1152                         if (!sblock_bad->header_error &&
1153                             !sblock_bad->checksum_error &&
1154                             sblock_bad->no_io_error_seen)
1155                                 goto corrected_error;
1156                         else
1157                                 goto did_not_correct_error;
1158                 } else {
1159 corrected_error:
1160                         spin_lock(&sctx->stat_lock);
1161                         sctx->stat.corrected_errors++;
1162                         sblock_to_check->data_corrected = 1;
1163                         spin_unlock(&sctx->stat_lock);
1164                         btrfs_err_rl_in_rcu(fs_info,
1165                                 "fixed up error at logical %llu on dev %s",
1166                                 logical, rcu_str_deref(dev->name));
1167                 }
1168         } else {
1169 did_not_correct_error:
1170                 spin_lock(&sctx->stat_lock);
1171                 sctx->stat.uncorrectable_errors++;
1172                 spin_unlock(&sctx->stat_lock);
1173                 btrfs_err_rl_in_rcu(fs_info,
1174                         "unable to fixup (regular) error at logical %llu on dev %s",
1175                         logical, rcu_str_deref(dev->name));
1176         }
1177
1178 out:
1179         if (sblocks_for_recheck) {
1180                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1181                      mirror_index++) {
1182                         struct scrub_block *sblock = sblocks_for_recheck +
1183                                                      mirror_index;
1184                         struct scrub_recover *recover;
1185                         int i;
1186
1187                         for (i = 0; i < sblock->sector_count; i++) {
1188                                 sblock->sectors[i]->sblock = NULL;
1189                                 recover = sblock->sectors[i]->recover;
1190                                 if (recover) {
1191                                         scrub_put_recover(fs_info, recover);
1192                                         sblock->sectors[i]->recover = NULL;
1193                                 }
1194                                 scrub_sector_put(sblock->sectors[i]);
1195                         }
1196                 }
1197                 kfree(sblocks_for_recheck);
1198         }
1199
1200         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201         memalloc_nofs_restore(nofs_flag);
1202         if (ret < 0)
1203                 return ret;
1204         return 0;
1205 }
1206
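/*
 * Number of ways a block can be fetched: for RAID5 the direct read plus one
 * reconstruction from parity, for RAID6 two reconstructions (using P or Q),
 * and for the remaining profiles one copy per stripe of the I/O context.
 */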
1207 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1208 {
1209         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1210                 return 2;
1211         else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1212                 return 3;
1213         else
1214                 return (int)bioc->num_stripes;
1215 }
1216
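/*
 * Map a logical address and mirror number to the stripe that holds the data
 * and the offset inside it.  For RAID5/6 the raid_map is scanned for the
 * stripe covering @logical, skipping the P and Q stripes which never map a
 * logical address directly; for the other profiles the mirror number selects
 * the stripe and the offset is 0.
 */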
1217 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1218                                                  u64 *raid_map,
1219                                                  int nstripes, int mirror,
1220                                                  int *stripe_index,
1221                                                  u64 *stripe_offset)
1222 {
1223         int i;
1224
1225         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1226                 /* RAID5/6 */
1227                 for (i = 0; i < nstripes; i++) {
1228                         if (raid_map[i] == RAID6_Q_STRIPE ||
1229                             raid_map[i] == RAID5_P_STRIPE)
1230                                 continue;
1231
1232                         if (logical >= raid_map[i] &&
1233                             logical < raid_map[i] + BTRFS_STRIPE_LEN)
1234                                 break;
1235                 }
1236
1237                 *stripe_index = i;
1238                 *stripe_offset = logical - raid_map[i];
1239         } else {
1240                 /* The other RAID type */
1241                 *stripe_index = mirror;
1242                 *stripe_offset = 0;
1243         }
1244 }
1245
1246 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1247                                      struct scrub_block *sblocks_for_recheck)
1248 {
1249         struct scrub_ctx *sctx = original_sblock->sctx;
1250         struct btrfs_fs_info *fs_info = sctx->fs_info;
1251         u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1252         u64 logical = original_sblock->sectors[0]->logical;
1253         u64 generation = original_sblock->sectors[0]->generation;
1254         u64 flags = original_sblock->sectors[0]->flags;
1255         u64 have_csum = original_sblock->sectors[0]->have_csum;
1256         struct scrub_recover *recover;
1257         struct btrfs_io_context *bioc;
1258         u64 sublen;
1259         u64 mapped_length;
1260         u64 stripe_offset;
1261         int stripe_index;
1262         int sector_index = 0;
1263         int mirror_index;
1264         int nmirrors;
1265         int ret;
1266
1267         /*
1268          * Note: the two members refs and outstanding_sectors are not used (and
1269          * not set) in the blocks that are used for the recheck procedure.
1270          */
1271
1272         while (length > 0) {
1273                 sublen = min_t(u64, length, fs_info->sectorsize);
1274                 mapped_length = sublen;
1275                 bioc = NULL;
1276
1277                 /*
1278                  * With a length of sectorsize, each returned stripe represents
1279                  * one mirror
1280                  */
1281                 btrfs_bio_counter_inc_blocked(fs_info);
1282                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1283                                        logical, &mapped_length, &bioc);
1284                 if (ret || !bioc || mapped_length < sublen) {
1285                         btrfs_put_bioc(bioc);
1286                         btrfs_bio_counter_dec(fs_info);
1287                         return -EIO;
1288                 }
1289
1290                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1291                 if (!recover) {
1292                         btrfs_put_bioc(bioc);
1293                         btrfs_bio_counter_dec(fs_info);
1294                         return -ENOMEM;
1295                 }
1296
1297                 refcount_set(&recover->refs, 1);
1298                 recover->bioc = bioc;
1299                 recover->map_length = mapped_length;
1300
1301                 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1302
1303                 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1304
1305                 for (mirror_index = 0; mirror_index < nmirrors;
1306                      mirror_index++) {
1307                         struct scrub_block *sblock;
1308                         struct scrub_sector *sector;
1309
1310                         sblock = sblocks_for_recheck + mirror_index;
1311                         sblock->sctx = sctx;
1312
1313                         sector = kzalloc(sizeof(*sector), GFP_NOFS);
1314                         if (!sector) {
1315 leave_nomem:
1316                                 spin_lock(&sctx->stat_lock);
1317                                 sctx->stat.malloc_errors++;
1318                                 spin_unlock(&sctx->stat_lock);
1319                                 scrub_put_recover(fs_info, recover);
1320                                 return -ENOMEM;
1321                         }
1322                         scrub_sector_get(sector);
1323                         sblock->sectors[sector_index] = sector;
1324                         sector->sblock = sblock;
1325                         sector->flags = flags;
1326                         sector->generation = generation;
1327                         sector->logical = logical;
1328                         sector->have_csum = have_csum;
1329                         if (have_csum)
1330                                 memcpy(sector->csum,
1331                                        original_sblock->sectors[0]->csum,
1332                                        sctx->fs_info->csum_size);
1333
1334                         scrub_stripe_index_and_offset(logical,
1335                                                       bioc->map_type,
1336                                                       bioc->raid_map,
1337                                                       bioc->num_stripes -
1338                                                       bioc->num_tgtdevs,
1339                                                       mirror_index,
1340                                                       &stripe_index,
1341                                                       &stripe_offset);
1342                         sector->physical = bioc->stripes[stripe_index].physical +
1343                                          stripe_offset;
1344                         sector->dev = bioc->stripes[stripe_index].dev;
1345
1346                         BUG_ON(sector_index >= original_sblock->sector_count);
1347                         sector->physical_for_dev_replace =
1348                                 original_sblock->sectors[sector_index]->
1349                                 physical_for_dev_replace;
1350                         /* For missing devices, dev->bdev is NULL */
1351                         sector->mirror_num = mirror_index + 1;
1352                         sblock->sector_count++;
1353                         sector->page = alloc_page(GFP_NOFS);
1354                         if (!sector->page)
1355                                 goto leave_nomem;
1356
1357                         scrub_get_recover(recover);
1358                         sector->recover = recover;
1359                 }
1360                 scrub_put_recover(fs_info, recover);
1361                 length -= sublen;
1362                 logical += sublen;
1363                 sector_index++;
1364         }
1365
1366         return 0;
1367 }
1368
1369 static void scrub_bio_wait_endio(struct bio *bio)
1370 {
1371         complete(bio->bi_private);
1372 }
1373
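/*
 * Submit @bio through the RAID56 recovery path to rebuild the data at
 * @sector's logical address and wait for the rebuild to finish.
 *
 * Return 0 on success or a negative errno on failure.
 */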
1374 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1375                                         struct bio *bio,
1376                                         struct scrub_sector *sector)
1377 {
1378         DECLARE_COMPLETION_ONSTACK(done);
1379
1380         bio->bi_iter.bi_sector = sector->logical >> 9;
1381         bio->bi_private = &done;
1382         bio->bi_end_io = scrub_bio_wait_endio;
1383         raid56_parity_recover(bio, sector->recover->bioc,
1384                               sector->sblock->sectors[0]->mirror_num, false);
1385
1386         wait_for_completion_io(&done);
1387         return blk_status_to_errno(bio->bi_status);
1388 }
1389
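/*
 * Reread a block that lives on a RAID56 stripe by going through the parity
 * recovery path and verify its checksum afterwards.  On failure (or if the
 * device is missing) all sectors of the block are marked with an I/O error.
 */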
1390 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1391                                           struct scrub_block *sblock)
1392 {
1393         struct scrub_sector *first_sector = sblock->sectors[0];
1394         struct bio *bio;
1395         int i;
1396
1397         /* All sectors in sblock belong to the same stripe on the same device. */
1398         ASSERT(first_sector->dev);
1399         if (!first_sector->dev->bdev)
1400                 goto out;
1401
1402         bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1403
1404         for (i = 0; i < sblock->sector_count; i++) {
1405                 struct scrub_sector *sector = sblock->sectors[i];
1406
1407                 WARN_ON(!sector->page);
1408                 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1409         }
1410
1411         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1412                 bio_put(bio);
1413                 goto out;
1414         }
1415
1416         bio_put(bio);
1417
1418         scrub_recheck_block_checksum(sblock);
1419
1420         return;
1421 out:
1422         for (i = 0; i < sblock->sector_count; i++)
1423                 sblock->sectors[i]->io_error = 1;
1424
1425         sblock->no_io_error_seen = 0;
1426 }
1427
1428 /*
1429  * This function will check the on disk data for checksum errors, header errors
1430  * and read I/O errors. If any I/O error happens, the exact sectors that
1431  * failed are marked as bad. The goal is to enable scrub to take the sectors
1432  * that are not errored from all the mirrors so that the sectors that are
1433  * errored in the just handled mirror can be repaired.
1434  */
1435 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1436                                 struct scrub_block *sblock,
1437                                 int retry_failed_mirror)
1438 {
1439         int i;
1440
1441         sblock->no_io_error_seen = 1;
1442
1443         /* Shortcut for RAID56 */
1444         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1445                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1446
1447         for (i = 0; i < sblock->sector_count; i++) {
1448                 struct scrub_sector *sector = sblock->sectors[i];
1449                 struct bio bio;
1450                 struct bio_vec bvec;
1451
1452                 if (sector->dev->bdev == NULL) {
1453                         sector->io_error = 1;
1454                         sblock->no_io_error_seen = 0;
1455                         continue;
1456                 }
1457
1458                 WARN_ON(!sector->page);
1459                 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1460                 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1461                 bio.bi_iter.bi_sector = sector->physical >> 9;
1462
1463                 btrfsic_check_bio(&bio);
1464                 if (submit_bio_wait(&bio)) {
1465                         sector->io_error = 1;
1466                         sblock->no_io_error_seen = 0;
1467                 }
1468
1469                 bio_uninit(&bio);
1470         }
1471
1472         if (sblock->no_io_error_seen)
1473                 scrub_recheck_block_checksum(sblock);
1474 }
1475
1476 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1477 {
1478         struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1479         int ret;
1480
1481         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1482         return !ret;
1483 }
1484
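/*
 * Recompute the checksum of a block that was just reread: clear the error
 * flags and verify the sectors either as data or as a tree block, depending
 * on the extent flags of the first sector.
 */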
1485 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1486 {
1487         sblock->header_error = 0;
1488         sblock->checksum_error = 0;
1489         sblock->generation_error = 0;
1490
1491         if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1492                 scrub_checksum_data(sblock);
1493         else
1494                 scrub_checksum_tree_block(sblock);
1495 }
1496
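/*
 * Overwrite every sector of the bad mirror with the corresponding sector of
 * the good mirror.  Returns the status of the last sector that failed to be
 * repaired, or 0 if all sectors were written back successfully.
 */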
1497 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1498                                              struct scrub_block *sblock_good)
1499 {
1500         int i;
1501         int ret = 0;
1502
1503         for (i = 0; i < sblock_bad->sector_count; i++) {
1504                 int ret_sub;
1505
1506                 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1507                                                              sblock_good, i, 1);
1508                 if (ret_sub)
1509                         ret = ret_sub;
1510         }
1511
1512         return ret;
1513 }
1514
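/*
 * Write one sector of the good mirror over the corresponding sector of the
 * bad mirror.  The write is only issued if the bad block or sector actually
 * has an error, unless @force_write is set.
 */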
1515 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1516                                               struct scrub_block *sblock_good,
1517                                               int sector_num, int force_write)
1518 {
1519         struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1520         struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1521         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1522         const u32 sectorsize = fs_info->sectorsize;
1523
1524         BUG_ON(sector_bad->page == NULL);
1525         BUG_ON(sector_good->page == NULL);
1526         if (force_write || sblock_bad->header_error ||
1527             sblock_bad->checksum_error || sector_bad->io_error) {
1528                 struct bio bio;
1529                 struct bio_vec bvec;
1530                 int ret;
1531
1532                 if (!sector_bad->dev->bdev) {
1533                         btrfs_warn_rl(fs_info,
1534                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1535                         return -EIO;
1536                 }
1537
1538                 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1539                 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1540                 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1541
1542                 btrfsic_check_bio(&bio);
1543                 ret = submit_bio_wait(&bio);
1544                 bio_uninit(&bio);
1545
1546                 if (ret) {
1547                         btrfs_dev_stat_inc_and_print(sector_bad->dev,
1548                                 BTRFS_DEV_STAT_WRITE_ERRS);
1549                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1550                         return -EIO;
1551                 }
1552         }
1553
1554         return 0;
1555 }
1556
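/*
 * Queue all sectors of @sblock for writing to the dev-replace target device.
 * Write errors are only accounted for, they do not abort the operation.
 */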
1557 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1558 {
1559         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1560         int i;
1561
1562         /*
1563          * This block is used for checking the parity on the source device,
1564          * so the data needn't be written into the destination device.
1565          */
1566         if (sblock->sparity)
1567                 return;
1568
1569         for (i = 0; i < sblock->sector_count; i++) {
1570                 int ret;
1571
1572                 ret = scrub_write_sector_to_dev_replace(sblock, i);
1573                 if (ret)
1574                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1575         }
1576 }
1577
1578 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1579 {
1580         struct scrub_sector *sector = sblock->sectors[sector_num];
1581
1582         BUG_ON(sector->page == NULL);
1583         if (sector->io_error)
1584                 clear_page(page_address(sector->page));
1585
1586         return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1587 }
1588
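/*
 * On zoned filesystems the dev-replace target must be written sequentially.
 * If the current write pointer is behind @physical, zero out the gap so that
 * the next write starts exactly at @physical.
 */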
1589 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1590 {
1591         int ret = 0;
1592         u64 length;
1593
1594         if (!btrfs_is_zoned(sctx->fs_info))
1595                 return 0;
1596
1597         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1598                 return 0;
1599
1600         if (sctx->write_pointer < physical) {
1601                 length = physical - sctx->write_pointer;
1602
1603                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1604                                                 sctx->write_pointer, length);
1605                 if (!ret)
1606                         sctx->write_pointer = physical;
1607         }
1608         return ret;
1609 }
1610
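/*
 * Add one sector to the write bio that is currently being built for the
 * dev-replace target.  The bio is submitted once it is full or when the next
 * sector is not physically/logically contiguous with it.
 */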
1611 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1612                                       struct scrub_sector *sector)
1613 {
1614         struct scrub_bio *sbio;
1615         int ret;
1616         const u32 sectorsize = sctx->fs_info->sectorsize;
1617
1618         mutex_lock(&sctx->wr_lock);
1619 again:
1620         if (!sctx->wr_curr_bio) {
1621                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1622                                               GFP_KERNEL);
1623                 if (!sctx->wr_curr_bio) {
1624                         mutex_unlock(&sctx->wr_lock);
1625                         return -ENOMEM;
1626                 }
1627                 sctx->wr_curr_bio->sctx = sctx;
1628                 sctx->wr_curr_bio->sector_count = 0;
1629         }
1630         sbio = sctx->wr_curr_bio;
1631         if (sbio->sector_count == 0) {
1632                 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1633                 if (ret) {
1634                         mutex_unlock(&sctx->wr_lock);
1635                         return ret;
1636                 }
1637
1638                 sbio->physical = sector->physical_for_dev_replace;
1639                 sbio->logical = sector->logical;
1640                 sbio->dev = sctx->wr_tgtdev;
1641                 if (!sbio->bio) {
1642                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1643                                               REQ_OP_WRITE, GFP_NOFS);
1644                 }
1645                 sbio->bio->bi_private = sbio;
1646                 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1647                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1648                 sbio->status = 0;
1649         } else if (sbio->physical + sbio->sector_count * sectorsize !=
1650                    sector->physical_for_dev_replace ||
1651                    sbio->logical + sbio->sector_count * sectorsize !=
1652                    sector->logical) {
1653                 scrub_wr_submit(sctx);
1654                 goto again;
1655         }
1656
1657         ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1658         if (ret != sectorsize) {
1659                 if (sbio->sector_count < 1) {
1660                         bio_put(sbio->bio);
1661                         sbio->bio = NULL;
1662                         mutex_unlock(&sctx->wr_lock);
1663                         return -EIO;
1664                 }
1665                 scrub_wr_submit(sctx);
1666                 goto again;
1667         }
1668
1669         sbio->sectors[sbio->sector_count] = sector;
1670         scrub_sector_get(sector);
1671         sbio->sector_count++;
1672         if (sbio->sector_count == sctx->sectors_per_bio)
1673                 scrub_wr_submit(sctx);
1674         mutex_unlock(&sctx->wr_lock);
1675
1676         return 0;
1677 }
1678
1679 static void scrub_wr_submit(struct scrub_ctx *sctx)
1680 {
1681         struct scrub_bio *sbio;
1682
1683         if (!sctx->wr_curr_bio)
1684                 return;
1685
1686         sbio = sctx->wr_curr_bio;
1687         sctx->wr_curr_bio = NULL;
1688         scrub_pending_bio_inc(sctx);
1689         /* Process all writes in a single worker thread. Then the block layer
1690          * orders the requests before sending them to the driver, which
1691          * doubled the write performance on spinning disks when measured
1692          * with Linux 3.5. */
1693         btrfsic_check_bio(sbio->bio);
1694         submit_bio(sbio->bio);
1695
1696         if (btrfs_is_zoned(sctx->fs_info))
1697                 sctx->write_pointer = sbio->physical + sbio->sector_count *
1698                         sctx->fs_info->sectorsize;
1699 }
1700
1701 static void scrub_wr_bio_end_io(struct bio *bio)
1702 {
1703         struct scrub_bio *sbio = bio->bi_private;
1704         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1705
1706         sbio->status = bio->bi_status;
1707         sbio->bio = bio;
1708
1709         INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1710         queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1711 }
1712
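/*
 * Worker for completed dev-replace write bios: account any write errors, drop
 * the references on the sectors attached to the bio and free the scrub_bio.
 */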
1713 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1714 {
1715         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1716         struct scrub_ctx *sctx = sbio->sctx;
1717         int i;
1718
1719         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1720         if (sbio->status) {
1721                 struct btrfs_dev_replace *dev_replace =
1722                         &sbio->sctx->fs_info->dev_replace;
1723
1724                 for (i = 0; i < sbio->sector_count; i++) {
1725                         struct scrub_sector *sector = sbio->sectors[i];
1726
1727                         sector->io_error = 1;
1728                         atomic64_inc(&dev_replace->num_write_errors);
1729                 }
1730         }
1731
1732         for (i = 0; i < sbio->sector_count; i++)
1733                 scrub_sector_put(sbio->sectors[i]);
1734
1735         bio_put(sbio->bio);
1736         kfree(sbio);
1737         scrub_pending_bio_dec(sctx);
1738 }
1739
1740 static int scrub_checksum(struct scrub_block *sblock)
1741 {
1742         u64 flags;
1743         int ret;
1744
1745         /*
1746          * No need to initialize these stats currently,
1747          * because this function only uses the return value
1748          * instead of these stats values.
1749          *
1750          * Todo:
1751          * always use stats
1752          */
1753         sblock->header_error = 0;
1754         sblock->generation_error = 0;
1755         sblock->checksum_error = 0;
1756
1757         WARN_ON(sblock->sector_count < 1);
1758         flags = sblock->sectors[0]->flags;
1759         ret = 0;
1760         if (flags & BTRFS_EXTENT_FLAG_DATA)
1761                 ret = scrub_checksum_data(sblock);
1762         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1763                 ret = scrub_checksum_tree_block(sblock);
1764         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1765                 (void)scrub_checksum_super(sblock);
1766         else
1767                 WARN_ON(1);
1768         if (ret)
1769                 scrub_handle_errored_block(sblock);
1770
1771         return ret;
1772 }
1773
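/*
 * Verify the checksum of a data sector against the csum from the checksum
 * tree.  Returns 1 (and sets sblock->checksum_error) on a mismatch, 0 if the
 * checksum matches or no checksum is available.
 */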
1774 static int scrub_checksum_data(struct scrub_block *sblock)
1775 {
1776         struct scrub_ctx *sctx = sblock->sctx;
1777         struct btrfs_fs_info *fs_info = sctx->fs_info;
1778         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1779         u8 csum[BTRFS_CSUM_SIZE];
1780         struct scrub_sector *sector;
1781         char *kaddr;
1782
1783         BUG_ON(sblock->sector_count < 1);
1784         sector = sblock->sectors[0];
1785         if (!sector->have_csum)
1786                 return 0;
1787
1788         kaddr = page_address(sector->page);
1789
1790         shash->tfm = fs_info->csum_shash;
1791         crypto_shash_init(shash);
1792
1793         /*
1794          * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
1795          * only contains one sector of data.
1796          */
1797         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1798
1799         if (memcmp(csum, sector->csum, fs_info->csum_size))
1800                 sblock->checksum_error = 1;
1801         return sblock->checksum_error;
1802 }
1803
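/*
 * Verify a metadata block: check the header fields (bytenr, generation, fsid
 * and chunk tree uuid) and the checksum computed over the whole node.
 * Returns non-zero if a header or checksum error was found.
 */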
1804 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1805 {
1806         struct scrub_ctx *sctx = sblock->sctx;
1807         struct btrfs_header *h;
1808         struct btrfs_fs_info *fs_info = sctx->fs_info;
1809         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1810         u8 calculated_csum[BTRFS_CSUM_SIZE];
1811         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1812         /*
1813          * This is done in sectorsize steps even for metadata as there's a
1814          * constraint for nodesize to be aligned to sectorsize. This will need
1815          * to change so we don't misuse data and metadata units like that.
1816          */
1817         const u32 sectorsize = sctx->fs_info->sectorsize;
1818         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1819         int i;
1820         struct scrub_sector *sector;
1821         char *kaddr;
1822
1823         BUG_ON(sblock->sector_count < 1);
1824
1825         /* Each member in sectors is just one sector */
1826         ASSERT(sblock->sector_count == num_sectors);
1827
1828         sector = sblock->sectors[0];
1829         kaddr = page_address(sector->page);
1830         h = (struct btrfs_header *)kaddr;
1831         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1832
1833         /*
1834          * we don't use the getter functions here, as we
1835          * a) don't have an extent buffer and
1836          * b) the page is already kmapped
1837          */
1838         if (sector->logical != btrfs_stack_header_bytenr(h))
1839                 sblock->header_error = 1;
1840
1841         if (sector->generation != btrfs_stack_header_generation(h)) {
1842                 sblock->header_error = 1;
1843                 sblock->generation_error = 1;
1844         }
1845
1846         if (!scrub_check_fsid(h->fsid, sector))
1847                 sblock->header_error = 1;
1848
1849         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1850                    BTRFS_UUID_SIZE))
1851                 sblock->header_error = 1;
1852
1853         shash->tfm = fs_info->csum_shash;
1854         crypto_shash_init(shash);
1855         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1856                             sectorsize - BTRFS_CSUM_SIZE);
1857
1858         for (i = 1; i < num_sectors; i++) {
1859                 kaddr = page_address(sblock->sectors[i]->page);
1860                 crypto_shash_update(shash, kaddr, sectorsize);
1861         }
1862
1863         crypto_shash_final(shash, calculated_csum);
1864         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1865                 sblock->checksum_error = 1;
1866
1867         return sblock->header_error || sblock->checksum_error;
1868 }
1869
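/*
 * Verify a super block copy: bytenr, generation, fsid and checksum.  Errors
 * are only reported via the statistics and the device error counters, the
 * super block itself is not repaired here.
 */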
1870 static int scrub_checksum_super(struct scrub_block *sblock)
1871 {
1872         struct btrfs_super_block *s;
1873         struct scrub_ctx *sctx = sblock->sctx;
1874         struct btrfs_fs_info *fs_info = sctx->fs_info;
1875         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1876         u8 calculated_csum[BTRFS_CSUM_SIZE];
1877         struct scrub_sector *sector;
1878         char *kaddr;
1879         int fail_gen = 0;
1880         int fail_cor = 0;
1881
1882         BUG_ON(sblock->sector_count < 1);
1883         sector = sblock->sectors[0];
1884         kaddr = page_address(sector->page);
1885         s = (struct btrfs_super_block *)kaddr;
1886
1887         if (sector->logical != btrfs_super_bytenr(s))
1888                 ++fail_cor;
1889
1890         if (sector->generation != btrfs_super_generation(s))
1891                 ++fail_gen;
1892
1893         if (!scrub_check_fsid(s->fsid, sector))
1894                 ++fail_cor;
1895
1896         shash->tfm = fs_info->csum_shash;
1897         crypto_shash_init(shash);
1898         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1899                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1900
1901         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1902                 ++fail_cor;
1903
1904         if (fail_cor + fail_gen) {
1905                 /*
1906                  * If we find an error in a super block, we just report it.
1907                  * Super blocks get rewritten with the next transaction commit
1908                  * anyway.
1909                  */
1910                 spin_lock(&sctx->stat_lock);
1911                 ++sctx->stat.super_errors;
1912                 spin_unlock(&sctx->stat_lock);
1913                 if (fail_cor)
1914                         btrfs_dev_stat_inc_and_print(sector->dev,
1915                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1916                 else
1917                         btrfs_dev_stat_inc_and_print(sector->dev,
1918                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1919         }
1920
1921         return fail_cor + fail_gen;
1922 }
1923
1924 static void scrub_block_get(struct scrub_block *sblock)
1925 {
1926         refcount_inc(&sblock->refs);
1927 }
1928
1929 static void scrub_block_put(struct scrub_block *sblock)
1930 {
1931         if (refcount_dec_and_test(&sblock->refs)) {
1932                 int i;
1933
1934                 if (sblock->sparity)
1935                         scrub_parity_put(sblock->sparity);
1936
1937                 for (i = 0; i < sblock->sector_count; i++)
1938                         scrub_sector_put(sblock->sectors[i]);
1939                 kfree(sblock);
1940         }
1941 }
1942
1943 static void scrub_sector_get(struct scrub_sector *sector)
1944 {
1945         atomic_inc(&sector->refs);
1946 }
1947
1948 static void scrub_sector_put(struct scrub_sector *sector)
1949 {
1950         if (atomic_dec_and_test(&sector->refs)) {
1951                 if (sector->page)
1952                         __free_page(sector->page);
1953                 kfree(sector);
1954         }
1955 }
1956
1957 /*
1958  * Throttle the IO submission based on a bandwidth limit; the timeslice is 1
1959  * second.  The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1960  */
1961 static void scrub_throttle(struct scrub_ctx *sctx)
1962 {
1963         const int time_slice = 1000;
1964         struct scrub_bio *sbio;
1965         struct btrfs_device *device;
1966         s64 delta;
1967         ktime_t now;
1968         u32 div;
1969         u64 bwlimit;
1970
1971         sbio = sctx->bios[sctx->curr];
1972         device = sbio->dev;
1973         bwlimit = READ_ONCE(device->scrub_speed_max);
1974         if (bwlimit == 0)
1975                 return;
1976
1977         /*
1978          * The slice is divided into intervals in which the IO is submitted; the
1979          * interval count is derived from bwlimit and capped at 64.
1980          */
1981         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1982         div = min_t(u32, 64, div);
1983
1984         /* Start new epoch, set deadline */
1985         now = ktime_get();
1986         if (sctx->throttle_deadline == 0) {
1987                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1988                 sctx->throttle_sent = 0;
1989         }
1990
1991         /* Still within the time slice? */
1992         if (ktime_before(now, sctx->throttle_deadline)) {
1993                 /* If current bio is within the limit, send it */
1994                 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
1995                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
1996                         return;
1997
1998                 /* We're over the limit, sleep for the rest of the time slice */
1999                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2000         } else {
2001                 /* New request after deadline, start new epoch */
2002                 delta = 0;
2003         }
2004
2005         if (delta) {
2006                 long timeout;
2007
2008                 timeout = div_u64(delta * HZ, 1000);
2009                 schedule_timeout_interruptible(timeout);
2010         }
2011
2012         /* Next call will start the deadline period */
2013         sctx->throttle_deadline = 0;
2014 }
2015
2016 static void scrub_submit(struct scrub_ctx *sctx)
2017 {
2018         struct scrub_bio *sbio;
2019
2020         if (sctx->curr == -1)
2021                 return;
2022
2023         scrub_throttle(sctx);
2024
2025         sbio = sctx->bios[sctx->curr];
2026         sctx->curr = -1;
2027         scrub_pending_bio_inc(sctx);
2028         btrfsic_check_bio(sbio->bio);
2029         submit_bio(sbio->bio);
2030 }
2031
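/*
 * Add one sector to the read bio that is currently being built.  Waits for a
 * free scrub_bio if none is available and submits the current bio when it is
 * full, when the sector is not contiguous with it, or when the sector targets
 * a different device.
 */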
2032 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2033                                       struct scrub_sector *sector)
2034 {
2035         struct scrub_block *sblock = sector->sblock;
2036         struct scrub_bio *sbio;
2037         const u32 sectorsize = sctx->fs_info->sectorsize;
2038         int ret;
2039
2040 again:
2041         /*
2042          * Grab a fresh bio or wait for one to become available.
2043          */
2044         while (sctx->curr == -1) {
2045                 spin_lock(&sctx->list_lock);
2046                 sctx->curr = sctx->first_free;
2047                 if (sctx->curr != -1) {
2048                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2049                         sctx->bios[sctx->curr]->next_free = -1;
2050                         sctx->bios[sctx->curr]->sector_count = 0;
2051                         spin_unlock(&sctx->list_lock);
2052                 } else {
2053                         spin_unlock(&sctx->list_lock);
2054                         wait_event(sctx->list_wait, sctx->first_free != -1);
2055                 }
2056         }
2057         sbio = sctx->bios[sctx->curr];
2058         if (sbio->sector_count == 0) {
2059                 sbio->physical = sector->physical;
2060                 sbio->logical = sector->logical;
2061                 sbio->dev = sector->dev;
2062                 if (!sbio->bio) {
2063                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2064                                               REQ_OP_READ, GFP_NOFS);
2065                 }
2066                 sbio->bio->bi_private = sbio;
2067                 sbio->bio->bi_end_io = scrub_bio_end_io;
2068                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2069                 sbio->status = 0;
2070         } else if (sbio->physical + sbio->sector_count * sectorsize !=
2071                    sector->physical ||
2072                    sbio->logical + sbio->sector_count * sectorsize !=
2073                    sector->logical ||
2074                    sbio->dev != sector->dev) {
2075                 scrub_submit(sctx);
2076                 goto again;
2077         }
2078
2079         sbio->sectors[sbio->sector_count] = sector;
2080         ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2081         if (ret != sectorsize) {
2082                 if (sbio->sector_count < 1) {
2083                         bio_put(sbio->bio);
2084                         sbio->bio = NULL;
2085                         return -EIO;
2086                 }
2087                 scrub_submit(sctx);
2088                 goto again;
2089         }
2090
2091         scrub_block_get(sblock); /* one for the page added to the bio */
2092         atomic_inc(&sblock->outstanding_sectors);
2093         sbio->sector_count++;
2094         if (sbio->sector_count == sctx->sectors_per_bio)
2095                 scrub_submit(sctx);
2096
2097         return 0;
2098 }
2099
2100 static void scrub_missing_raid56_end_io(struct bio *bio)
2101 {
2102         struct scrub_block *sblock = bio->bi_private;
2103         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2104
2105         if (bio->bi_status)
2106                 sblock->no_io_error_seen = 0;
2107
2108         bio_put(bio);
2109
2110         queue_work(fs_info->scrub_workers, &sblock->work);
2111 }
2112
2113 static void scrub_missing_raid56_worker(struct work_struct *work)
2114 {
2115         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2116         struct scrub_ctx *sctx = sblock->sctx;
2117         struct btrfs_fs_info *fs_info = sctx->fs_info;
2118         u64 logical;
2119         struct btrfs_device *dev;
2120
2121         logical = sblock->sectors[0]->logical;
2122         dev = sblock->sectors[0]->dev;
2123
2124         if (sblock->no_io_error_seen)
2125                 scrub_recheck_block_checksum(sblock);
2126
2127         if (!sblock->no_io_error_seen) {
2128                 spin_lock(&sctx->stat_lock);
2129                 sctx->stat.read_errors++;
2130                 spin_unlock(&sctx->stat_lock);
2131                 btrfs_err_rl_in_rcu(fs_info,
2132                         "IO error rebuilding logical %llu for dev %s",
2133                         logical, rcu_str_deref(dev->name));
2134         } else if (sblock->header_error || sblock->checksum_error) {
2135                 spin_lock(&sctx->stat_lock);
2136                 sctx->stat.uncorrectable_errors++;
2137                 spin_unlock(&sctx->stat_lock);
2138                 btrfs_err_rl_in_rcu(fs_info,
2139                         "failed to rebuild valid logical %llu for dev %s",
2140                         logical, rcu_str_deref(dev->name));
2141         } else {
2142                 scrub_write_block_to_dev_replace(sblock);
2143         }
2144
2145         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2146                 mutex_lock(&sctx->wr_lock);
2147                 scrub_wr_submit(sctx);
2148                 mutex_unlock(&sctx->wr_lock);
2149         }
2150
2151         scrub_block_put(sblock);
2152         scrub_pending_bio_dec(sctx);
2153 }
2154
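/*
 * The device holding this block is missing.  For RAID5/6 dev-replace, rebuild
 * the block content from the remaining stripes via the RAID56 code; the
 * result is verified and written to the target device in the worker.
 */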
2155 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2156 {
2157         struct scrub_ctx *sctx = sblock->sctx;
2158         struct btrfs_fs_info *fs_info = sctx->fs_info;
2159         u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2160         u64 logical = sblock->sectors[0]->logical;
2161         struct btrfs_io_context *bioc = NULL;
2162         struct bio *bio;
2163         struct btrfs_raid_bio *rbio;
2164         int ret;
2165         int i;
2166
2167         btrfs_bio_counter_inc_blocked(fs_info);
2168         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2169                                &length, &bioc);
2170         if (ret || !bioc || !bioc->raid_map)
2171                 goto bioc_out;
2172
2173         if (WARN_ON(!sctx->is_dev_replace ||
2174                     !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2175                 /*
2176                  * We shouldn't be scrubbing a missing device. Even for dev
2177                  * replace, we should only get here for RAID 5/6. We either
2178                  * managed to mount something with no mirrors remaining or
2179                  * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2180                  */
2181                 goto bioc_out;
2182         }
2183
2184         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2185         bio->bi_iter.bi_sector = logical >> 9;
2186         bio->bi_private = sblock;
2187         bio->bi_end_io = scrub_missing_raid56_end_io;
2188
2189         rbio = raid56_alloc_missing_rbio(bio, bioc);
2190         if (!rbio)
2191                 goto rbio_out;
2192
2193         for (i = 0; i < sblock->sector_count; i++) {
2194                 struct scrub_sector *sector = sblock->sectors[i];
2195
2196                 /*
2197                  * For now, our scrub is still one page per sector, so pgoff
2198                  * is always 0.
2199                  */
2200                 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2201         }
2202
2203         INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2204         scrub_block_get(sblock);
2205         scrub_pending_bio_inc(sctx);
2206         raid56_submit_missing_rbio(rbio);
2207         return;
2208
2209 rbio_out:
2210         bio_put(bio);
2211 bioc_out:
2212         btrfs_bio_counter_dec(fs_info);
2213         btrfs_put_bioc(bioc);
2214         spin_lock(&sctx->stat_lock);
2215         sctx->stat.malloc_errors++;
2216         spin_unlock(&sctx->stat_lock);
2217 }
2218
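/*
 * Split the range [logical, logical + len) into sectors, allocate a
 * scrub_block with one page per sector and queue all sectors for reading.
 * Blocks on missing devices are sent through the RAID56 rebuild path instead.
 */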
2219 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2220                        u64 physical, struct btrfs_device *dev, u64 flags,
2221                        u64 gen, int mirror_num, u8 *csum,
2222                        u64 physical_for_dev_replace)
2223 {
2224         struct scrub_block *sblock;
2225         const u32 sectorsize = sctx->fs_info->sectorsize;
2226         int index;
2227
2228         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2229         if (!sblock) {
2230                 spin_lock(&sctx->stat_lock);
2231                 sctx->stat.malloc_errors++;
2232                 spin_unlock(&sctx->stat_lock);
2233                 return -ENOMEM;
2234         }
2235
2236         /* one ref inside this function, plus one for each page added to
2237          * a bio later on */
2238         refcount_set(&sblock->refs, 1);
2239         sblock->sctx = sctx;
2240         sblock->no_io_error_seen = 1;
2241
2242         for (index = 0; len > 0; index++) {
2243                 struct scrub_sector *sector;
2244                 /*
2245                  * Here we will allocate one page for one sector to scrub.
2246                  * This is fine if PAGE_SIZE == sectorsize, but will cost
2247                  * more memory for PAGE_SIZE > sectorsize case.
2248                  */
2249                 u32 l = min(sectorsize, len);
2250
2251                 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2252                 if (!sector) {
2253 leave_nomem:
2254                         spin_lock(&sctx->stat_lock);
2255                         sctx->stat.malloc_errors++;
2256                         spin_unlock(&sctx->stat_lock);
2257                         scrub_block_put(sblock);
2258                         return -ENOMEM;
2259                 }
2260                 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2261                 scrub_sector_get(sector);
2262                 sblock->sectors[index] = sector;
2263                 sector->sblock = sblock;
2264                 sector->dev = dev;
2265                 sector->flags = flags;
2266                 sector->generation = gen;
2267                 sector->logical = logical;
2268                 sector->physical = physical;
2269                 sector->physical_for_dev_replace = physical_for_dev_replace;
2270                 sector->mirror_num = mirror_num;
2271                 if (csum) {
2272                         sector->have_csum = 1;
2273                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2274                 } else {
2275                         sector->have_csum = 0;
2276                 }
2277                 sblock->sector_count++;
2278                 sector->page = alloc_page(GFP_KERNEL);
2279                 if (!sector->page)
2280                         goto leave_nomem;
2281                 len -= l;
2282                 logical += l;
2283                 physical += l;
2284                 physical_for_dev_replace += l;
2285         }
2286
2287         WARN_ON(sblock->sector_count == 0);
2288         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2289                 /*
2290                  * This case should only be hit for RAID 5/6 device replace. See
2291                  * the comment in scrub_missing_raid56_pages() for details.
2292                  */
2293                 scrub_missing_raid56_pages(sblock);
2294         } else {
2295                 for (index = 0; index < sblock->sector_count; index++) {
2296                         struct scrub_sector *sector = sblock->sectors[index];
2297                         int ret;
2298
2299                         ret = scrub_add_sector_to_rd_bio(sctx, sector);
2300                         if (ret) {
2301                                 scrub_block_put(sblock);
2302                                 return ret;
2303                         }
2304                 }
2305
2306                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2307                         scrub_submit(sctx);
2308         }
2309
2310         /* The last one frees, either here or in bio completion for the last sector */
2311         scrub_block_put(sblock);
2312         return 0;
2313 }
2314
2315 static void scrub_bio_end_io(struct bio *bio)
2316 {
2317         struct scrub_bio *sbio = bio->bi_private;
2318         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2319
2320         sbio->status = bio->bi_status;
2321         sbio->bio = bio;
2322
2323         queue_work(fs_info->scrub_workers, &sbio->work);
2324 }
2325
2326 static void scrub_bio_end_io_worker(struct work_struct *work)
2327 {
2328         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2329         struct scrub_ctx *sctx = sbio->sctx;
2330         int i;
2331
2332         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2333         if (sbio->status) {
2334                 for (i = 0; i < sbio->sector_count; i++) {
2335                         struct scrub_sector *sector = sbio->sectors[i];
2336
2337                         sector->io_error = 1;
2338                         sector->sblock->no_io_error_seen = 0;
2339                 }
2340         }
2341
2342         /* Now complete the scrub_block items that have all sectors completed */
2343         for (i = 0; i < sbio->sector_count; i++) {
2344                 struct scrub_sector *sector = sbio->sectors[i];
2345                 struct scrub_block *sblock = sector->sblock;
2346
2347                 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2348                         scrub_block_complete(sblock);
2349                 scrub_block_put(sblock);
2350         }
2351
2352         bio_put(sbio->bio);
2353         sbio->bio = NULL;
2354         spin_lock(&sctx->list_lock);
2355         sbio->next_free = sctx->first_free;
2356         sctx->first_free = sbio->index;
2357         spin_unlock(&sctx->list_lock);
2358
2359         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2360                 mutex_lock(&sctx->wr_lock);
2361                 scrub_wr_submit(sctx);
2362                 mutex_unlock(&sctx->wr_lock);
2363         }
2364
2365         scrub_pending_bio_dec(sctx);
2366 }
2367
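/*
 * Mark the sectors covered by [start, start + len) in the given per-stripe
 * bitmap, wrapping around at the end of the stripe if necessary.
 */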
2368 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2369                                        unsigned long *bitmap,
2370                                        u64 start, u32 len)
2371 {
2372         u64 offset;
2373         u32 nsectors;
2374         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2375
2376         if (len >= sparity->stripe_len) {
2377                 bitmap_set(bitmap, 0, sparity->nsectors);
2378                 return;
2379         }
2380
2381         start -= sparity->logic_start;
2382         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2383         offset = offset >> sectorsize_bits;
2384         nsectors = len >> sectorsize_bits;
2385
2386         if (offset + nsectors <= sparity->nsectors) {
2387                 bitmap_set(bitmap, offset, nsectors);
2388                 return;
2389         }
2390
2391         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2392         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2393 }
2394
2395 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2396                                                    u64 start, u32 len)
2397 {
2398         __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2399 }
2400
2401 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2402                                                   u64 start, u32 len)
2403 {
2404         __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2405 }
2406
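/*
 * Called when all sectors of a block have finished their read I/O: verify the
 * checksums, kick off repair for corrupted blocks and, for parity scrub,
 * record corrupted ranges in the error bitmap.
 */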
2407 static void scrub_block_complete(struct scrub_block *sblock)
2408 {
2409         int corrupted = 0;
2410
2411         if (!sblock->no_io_error_seen) {
2412                 corrupted = 1;
2413                 scrub_handle_errored_block(sblock);
2414         } else {
2415                 /*
2416                  * In the dev replace case: if there is a checksum error, the
2417                  * data is written via the repair mechanism, otherwise it is
2418                  * written to the target device right here.
2419                  */
2420                 corrupted = scrub_checksum(sblock);
2421                 if (!corrupted && sblock->sctx->is_dev_replace)
2422                         scrub_write_block_to_dev_replace(sblock);
2423         }
2424
2425         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2426                 u64 start = sblock->sectors[0]->logical;
2427                 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2428                           sblock->sctx->fs_info->sectorsize;
2429
2430                 ASSERT(end - start <= U32_MAX);
2431                 scrub_parity_mark_sectors_error(sblock->sparity,
2432                                                 start, end - start);
2433         }
2434 }
2435
2436 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2437 {
2438         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2439         list_del(&sum->list);
2440         kfree(sum);
2441 }
2442
2443 /*
2444  * Find the desired csum for range [logical, logical + sectorsize), and store
2445  * the csum into @csum.
2446  *
2447  * The search source is sctx->csum_list, which is a pre-populated list
2448  * storing bytenr ordered csum ranges.  We're responsible for cleaning up any
2449  * range that is before @logical.
2450  *
2451  * Return 0 if there is no csum for the range.
2452  * Return 1 if there is a csum for the range, which has been copied to @csum.
2453  */
2454 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2455 {
2456         bool found = false;
2457
2458         while (!list_empty(&sctx->csum_list)) {
2459                 struct btrfs_ordered_sum *sum = NULL;
2460                 unsigned long index;
2461                 unsigned long num_sectors;
2462
2463                 sum = list_first_entry(&sctx->csum_list,
2464                                        struct btrfs_ordered_sum, list);
2465                 /* The current csum range is beyond our range, no csum found */
2466                 if (sum->bytenr > logical)
2467                         break;
2468
2469                 /*
2470                  * The current sum is before our bytenr. Since scrub is always
2471                  * done in bytenr order, this csum will never be used again;
2472                  * clean it up so that later calls won't bother with the range,
2473                  * and continue searching the next range.
2474                  */
2475                 if (sum->bytenr + sum->len <= logical) {
2476                         drop_csum_range(sctx, sum);
2477                         continue;
2478                 }
2479
2480                 /* Now the csum range covers our bytenr, copy the csum */
2481                 found = true;
2482                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2483                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2484
2485                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2486                        sctx->fs_info->csum_size);
2487
2488                 /* Cleanup the range if we're at the end of the csum range */
2489                 if (index == num_sectors - 1)
2490                         drop_csum_range(sctx, sum);
2491                 break;
2492         }
2493         if (!found)
2494                 return 0;
2495         return 1;
2496 }
2497
2498 /* scrub extent tries to collect up to 64 kB for each bio */
2499 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2500                         u64 logical, u32 len,
2501                         u64 physical, struct btrfs_device *dev, u64 flags,
2502                         u64 gen, int mirror_num)
2503 {
2504         struct btrfs_device *src_dev = dev;
2505         u64 src_physical = physical;
2506         int src_mirror = mirror_num;
2507         int ret;
2508         u8 csum[BTRFS_CSUM_SIZE];
2509         u32 blocksize;
2510
2511         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2512                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2513                         blocksize = map->stripe_len;
2514                 else
2515                         blocksize = sctx->fs_info->sectorsize;
2516                 spin_lock(&sctx->stat_lock);
2517                 sctx->stat.data_extents_scrubbed++;
2518                 sctx->stat.data_bytes_scrubbed += len;
2519                 spin_unlock(&sctx->stat_lock);
2520         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2521                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522                         blocksize = map->stripe_len;
2523                 else
2524                         blocksize = sctx->fs_info->nodesize;
2525                 spin_lock(&sctx->stat_lock);
2526                 sctx->stat.tree_extents_scrubbed++;
2527                 sctx->stat.tree_bytes_scrubbed += len;
2528                 spin_unlock(&sctx->stat_lock);
2529         } else {
2530                 blocksize = sctx->fs_info->sectorsize;
2531                 WARN_ON(1);
2532         }
2533
2534         /*
2535          * In the dev-replace case, @dev can be a missing device.
2536          * Regular scrub avoids running on a missing device at all,
2537          * as that would trigger tons of read errors.
2538          *
2539          * Reading from a missing device would only cause the read error
2540          * counts to increase unnecessarily.
2541          * So here we change the read source to a good mirror.
2542          */
2543         if (sctx->is_dev_replace && !dev->bdev)
2544                 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2545                                      &src_dev, &src_mirror);
2546         while (len) {
2547                 u32 l = min(len, blocksize);
2548                 int have_csum = 0;
2549
2550                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2551                         /* push csums to sbio */
2552                         have_csum = scrub_find_csum(sctx, logical, csum);
2553                         if (have_csum == 0)
2554                                 ++sctx->stat.no_csum;
2555                 }
2556                 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2557                                     flags, gen, src_mirror,
2558                                     have_csum ? csum : NULL, physical);
2559                 if (ret)
2560                         return ret;
2561                 len -= l;
2562                 logical += l;
2563                 physical += l;
2564                 src_physical += l;
2565         }
2566         return 0;
2567 }
2568
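/*
 * Like scrub_sectors(), but for sectors that are part of a RAID56 parity
 * stripe: each sector is additionally tracked in the scrub_parity so that the
 * parity can be checked once all data sectors have been read.
 */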
2569 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2570                                   u64 logical, u32 len,
2571                                   u64 physical, struct btrfs_device *dev,
2572                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2573 {
2574         struct scrub_ctx *sctx = sparity->sctx;
2575         struct scrub_block *sblock;
2576         const u32 sectorsize = sctx->fs_info->sectorsize;
2577         int index;
2578
2579         ASSERT(IS_ALIGNED(len, sectorsize));
2580
2581         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2582         if (!sblock) {
2583                 spin_lock(&sctx->stat_lock);
2584                 sctx->stat.malloc_errors++;
2585                 spin_unlock(&sctx->stat_lock);
2586                 return -ENOMEM;
2587         }
2588
2589         /* one ref inside this function, plus one for each page added to
2590          * a bio later on */
2591         refcount_set(&sblock->refs, 1);
2592         sblock->sctx = sctx;
2593         sblock->no_io_error_seen = 1;
2594         sblock->sparity = sparity;
2595         scrub_parity_get(sparity);
2596
2597         for (index = 0; len > 0; index++) {
2598                 struct scrub_sector *sector;
2599
2600                 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2601                 if (!sector) {
2602 leave_nomem:
2603                         spin_lock(&sctx->stat_lock);
2604                         sctx->stat.malloc_errors++;
2605                         spin_unlock(&sctx->stat_lock);
2606                         scrub_block_put(sblock);
2607                         return -ENOMEM;
2608                 }
2609                 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2610                 /* For scrub block */
2611                 scrub_sector_get(sector);
2612                 sblock->sectors[index] = sector;
2613                 /* For scrub parity */
2614                 scrub_sector_get(sector);
2615                 list_add_tail(&sector->list, &sparity->sectors_list);
2616                 sector->sblock = sblock;
2617                 sector->dev = dev;
2618                 sector->flags = flags;
2619                 sector->generation = gen;
2620                 sector->logical = logical;
2621                 sector->physical = physical;
2622                 sector->mirror_num = mirror_num;
2623                 if (csum) {
2624                         sector->have_csum = 1;
2625                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2626                 } else {
2627                         sector->have_csum = 0;
2628                 }
2629                 sblock->sector_count++;
2630                 sector->page = alloc_page(GFP_KERNEL);
2631                 if (!sector->page)
2632                         goto leave_nomem;
2633
2634
2635                 /* Iterate over the stripe range in sectorsize steps */
2636                 len -= sectorsize;
2637                 logical += sectorsize;
2638                 physical += sectorsize;
2639         }
2640
2641         WARN_ON(sblock->sector_count == 0);
2642         for (index = 0; index < sblock->sector_count; index++) {
2643                 struct scrub_sector *sector = sblock->sectors[index];
2644                 int ret;
2645
2646                 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2647                 if (ret) {
2648                         scrub_block_put(sblock);
2649                         return ret;
2650                 }
2651         }
2652
2653         /* Last one frees, either here or in bio completion for last sector */
2654         scrub_block_put(sblock);
2655         return 0;
2656 }
2657
2658 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2659                                    u64 logical, u32 len,
2660                                    u64 physical, struct btrfs_device *dev,
2661                                    u64 flags, u64 gen, int mirror_num)
2662 {
2663         struct scrub_ctx *sctx = sparity->sctx;
2664         int ret;
2665         u8 csum[BTRFS_CSUM_SIZE];
2666         u32 blocksize;
2667
2668         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2669                 scrub_parity_mark_sectors_error(sparity, logical, len);
2670                 return 0;
2671         }
2672
2673         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2674                 blocksize = sparity->stripe_len;
2675         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2676                 blocksize = sparity->stripe_len;
2677         } else {
2678                 blocksize = sctx->fs_info->sectorsize;
2679                 WARN_ON(1);
2680         }
2681
2682         while (len) {
2683                 u32 l = min(len, blocksize);
2684                 int have_csum = 0;
2685
2686                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2687                         /* push csums to sbio */
2688                         have_csum = scrub_find_csum(sctx, logical, csum);
2689                         if (have_csum == 0)
2690                                 goto skip;
2691                 }
2692                 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2693                                              flags, gen, mirror_num,
2694                                              have_csum ? csum : NULL);
2695                 if (ret)
2696                         return ret;
2697 skip:
2698                 len -= l;
2699                 logical += l;
2700                 physical += l;
2701         }
2702         return 0;
2703 }
2704
2705 /*
2706  * Given a physical address, this will calculate its
2707  * logical offset. If this is a parity stripe, it will return
2708  * the leftmost data stripe's logical offset.
2709  *
2710  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2711  */
2712 static int get_raid56_logic_offset(u64 physical, int num,
2713                                    struct map_lookup *map, u64 *offset,
2714                                    u64 *stripe_start)
2715 {
2716         int i;
2717         int j = 0;
2718         u64 stripe_nr;
2719         u64 last_offset;
2720         u32 stripe_index;
2721         u32 rot;
2722         const int data_stripes = nr_data_stripes(map);
2723
2724         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2725         if (stripe_start)
2726                 *stripe_start = last_offset;
2727
2728         *offset = last_offset;
2729         for (i = 0; i < data_stripes; i++) {
2730                 *offset = last_offset + i * map->stripe_len;
2731
2732                 stripe_nr = div64_u64(*offset, map->stripe_len);
2733                 stripe_nr = div_u64(stripe_nr, data_stripes);
2734
2735                 /* Work out the disk rotation on this stripe-set */
2736                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2737                 /* Calculate which stripe this data is located on */
2738                 rot += i;
2739                 stripe_index = rot % map->num_stripes;
2740                 if (stripe_index == num)
2741                         return 0;
2742                 if (stripe_index < num)
2743                         j++;
2744         }
2745         *offset = last_offset + j * map->stripe_len;
2746         return 1;
2747 }
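
/*
 * Illustrative sketch (not part of the original file): the same rotation
 * arithmetic as get_raid56_logic_offset() above, reduced to a standalone
 * helper for a hypothetical 3-device RAID5 layout (2 data stripes + 1 parity,
 * 64KiB stripe_len).  Plain division stands in for the div_u64() helpers.
 */
static inline int example_raid5_logic_offset(u64 physical_in_dev, int dev_index,
					     u64 *offset)
{
	const u64 stripe_len = 64 * 1024;
	const int num_stripes = 3;		/* total devices */
	const int data_stripes = 2;		/* num_stripes minus one parity */
	const u64 last_offset = physical_in_dev * data_stripes;
	int i, j = 0;

	for (i = 0; i < data_stripes; i++) {
		u64 cur = last_offset + i * stripe_len;
		u64 stripe_nr = cur / stripe_len / data_stripes;
		int stripe_index = (int)((stripe_nr + i) % num_stripes);

		if (stripe_index == dev_index) {
			*offset = cur;		/* data stripe */
			return 0;
		}
		if (stripe_index < dev_index)
			j++;
	}
	/* Parity stripe: report the left-most data stripe of this full stripe */
	*offset = last_offset + j * stripe_len;
	return 1;
}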
2748
2749 static void scrub_free_parity(struct scrub_parity *sparity)
2750 {
2751         struct scrub_ctx *sctx = sparity->sctx;
2752         struct scrub_sector *curr, *next;
2753         int nbits;
2754
2755         nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2756         if (nbits) {
2757                 spin_lock(&sctx->stat_lock);
2758                 sctx->stat.read_errors += nbits;
2759                 sctx->stat.uncorrectable_errors += nbits;
2760                 spin_unlock(&sctx->stat_lock);
2761         }
2762
2763         list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2764                 list_del_init(&curr->list);
2765                 scrub_sector_put(curr);
2766         }
2767
2768         kfree(sparity);
2769 }
2770
2771 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2772 {
2773         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2774                                                     work);
2775         struct scrub_ctx *sctx = sparity->sctx;
2776
2777         scrub_free_parity(sparity);
2778         scrub_pending_bio_dec(sctx);
2779 }
2780
2781 static void scrub_parity_bio_endio(struct bio *bio)
2782 {
2783         struct scrub_parity *sparity = bio->bi_private;
2784         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2785
2786         if (bio->bi_status)
2787                 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2788                           &sparity->dbitmap, sparity->nsectors);
2789
2790         bio_put(bio);
2791
2792         INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2793         queue_work(fs_info->scrub_parity_workers, &sparity->work);
2794 }
2795
2796 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2797 {
2798         struct scrub_ctx *sctx = sparity->sctx;
2799         struct btrfs_fs_info *fs_info = sctx->fs_info;
2800         struct bio *bio;
2801         struct btrfs_raid_bio *rbio;
2802         struct btrfs_io_context *bioc = NULL;
2803         u64 length;
2804         int ret;
2805
2806         if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2807                            &sparity->ebitmap, sparity->nsectors))
2808                 goto out;
2809
2810         length = sparity->logic_end - sparity->logic_start;
2811
2812         btrfs_bio_counter_inc_blocked(fs_info);
2813         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2814                                &length, &bioc);
2815         if (ret || !bioc || !bioc->raid_map)
2816                 goto bioc_out;
2817
2818         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2819         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2820         bio->bi_private = sparity;
2821         bio->bi_end_io = scrub_parity_bio_endio;
2822
2823         rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
2824                                               sparity->scrub_dev,
2825                                               &sparity->dbitmap,
2826                                               sparity->nsectors);
2827         if (!rbio)
2828                 goto rbio_out;
2829
2830         scrub_pending_bio_inc(sctx);
2831         raid56_parity_submit_scrub_rbio(rbio);
2832         return;
2833
2834 rbio_out:
2835         bio_put(bio);
2836 bioc_out:
2837         btrfs_bio_counter_dec(fs_info);
2838         btrfs_put_bioc(bioc);
2839         bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2840                   sparity->nsectors);
2841         spin_lock(&sctx->stat_lock);
2842         sctx->stat.malloc_errors++;
2843         spin_unlock(&sctx->stat_lock);
2844 out:
2845         scrub_free_parity(sparity);
2846 }
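
/*
 * Bitmap illustration (not part of the original code): dbitmap marks the
 * sectors whose parity still needs to be checked, ebitmap marks sectors that
 * already failed to read.  The bitmap_andnot() call above drops the errored
 * sectors first and skips the repair entirely when nothing is left, as in
 * this hypothetical 4-sector case:
 */
static inline bool example_parity_bitmaps(void)
{
	unsigned long dbitmap = 0x0f;	/* sectors 0-3 have data to verify */
	unsigned long ebitmap = 0x0f;	/* ...but all of them hit read errors */

	/* Returns false: dbitmap is emptied, so check-and-repair is skipped */
	return bitmap_andnot(&dbitmap, &dbitmap, &ebitmap, 4);
}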
2847
2848 static void scrub_parity_get(struct scrub_parity *sparity)
2849 {
2850         refcount_inc(&sparity->refs);
2851 }
2852
2853 static void scrub_parity_put(struct scrub_parity *sparity)
2854 {
2855         if (!refcount_dec_and_test(&sparity->refs))
2856                 return;
2857
2858         scrub_parity_check_and_repair(sparity);
2859 }
2860
2861 /*
2862  * Return 0 if the extent item range covers any byte of the range.
2863  * Return <0 if the extent item is before @search_start.
2864  * Return >0 if the extent item is after @search_start + @search_len.
2865  */
2866 static int compare_extent_item_range(struct btrfs_path *path,
2867                                      u64 search_start, u64 search_len)
2868 {
2869         struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2870         u64 len;
2871         struct btrfs_key key;
2872
2873         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2874         ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2875                key.type == BTRFS_METADATA_ITEM_KEY);
2876         if (key.type == BTRFS_METADATA_ITEM_KEY)
2877                 len = fs_info->nodesize;
2878         else
2879                 len = key.offset;
2880
2881         if (key.objectid + len <= search_start)
2882                 return -1;
2883         if (key.objectid >= search_start + search_len)
2884                 return 1;
2885         return 0;
2886 }
2887
2888 /*
2889  * Locate one extent item which covers any byte in range
2890  * [@search_start, @search_start + @search_length)
2891  *
2892  * If the path is not initialized, we will initialize the search by doing
2893  * a btrfs_search_slot().
2894  * If the path is already initialized, we will use the path as the initial
2895  * slot, to avoid duplicated btrfs_search_slot() calls.
2896  *
2897  * NOTE: If an extent item starts before @search_start, we will still
2898  * return the extent item. This is for data extents crossing stripe boundaries.
2899  *
2900  * Return 0 if we found such extent item, and @path will point to the extent item.
2901  * Return >0 if no such extent item can be found, and @path will be released.
2902  * Return <0 if hit fatal error, and @path will be released.
2903  */
2904 static int find_first_extent_item(struct btrfs_root *extent_root,
2905                                   struct btrfs_path *path,
2906                                   u64 search_start, u64 search_len)
2907 {
2908         struct btrfs_fs_info *fs_info = extent_root->fs_info;
2909         struct btrfs_key key;
2910         int ret;
2911
2912         /* Continue using the existing path */
2913         if (path->nodes[0])
2914                 goto search_forward;
2915
2916         if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2917                 key.type = BTRFS_METADATA_ITEM_KEY;
2918         else
2919                 key.type = BTRFS_EXTENT_ITEM_KEY;
2920         key.objectid = search_start;
2921         key.offset = (u64)-1;
2922
2923         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2924         if (ret < 0)
2925                 return ret;
2926
2927         ASSERT(ret > 0);
2928         /*
2929          * Here we intentionally pass 0 as @min_objectid, as there could be
2930          * an extent item starting before @search_start.
2931          */
2932         ret = btrfs_previous_extent_item(extent_root, path, 0);
2933         if (ret < 0)
2934                 return ret;
2935         /*
2936          * Whether or not we have found an extent item, the loop below will
2937          * properly check the key in every case.
2938          */
2939 search_forward:
2940         while (true) {
2941                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2942                 if (key.objectid >= search_start + search_len)
2943                         break;
2944                 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2945                     key.type != BTRFS_EXTENT_ITEM_KEY)
2946                         goto next;
2947
2948                 ret = compare_extent_item_range(path, search_start, search_len);
2949                 if (ret == 0)
2950                         return ret;
2951                 if (ret > 0)
2952                         break;
2953 next:
2954                 path->slots[0]++;
2955                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2956                         ret = btrfs_next_leaf(extent_root, path);
2957                         if (ret) {
2958                                 /* Either no more item or fatal error */
2959                                 btrfs_release_path(path);
2960                                 return ret;
2961                         }
2962                 }
2963         }
2964         btrfs_release_path(path);
2965         return 1;
2966 }
2967
2968 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2969                             u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2970 {
2971         struct btrfs_key key;
2972         struct btrfs_extent_item *ei;
2973
2974         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2975         ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2976                key.type == BTRFS_EXTENT_ITEM_KEY);
2977         *extent_start_ret = key.objectid;
2978         if (key.type == BTRFS_METADATA_ITEM_KEY)
2979                 *size_ret = path->nodes[0]->fs_info->nodesize;
2980         else
2981                 *size_ret = key.offset;
2982         ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2983         *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2984         *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
2985 }
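
/*
 * Minimal caller sketch (illustration only, not used by the code above):
 * walk every extent item intersecting [start, start + len), the same pattern
 * scrub_simple_mirror() and scrub_raid56_data_stripe_for_parity() follow.
 * The hypothetical helper only counts the items to keep the example small.
 */
static inline int example_count_extent_items(struct btrfs_root *extent_root,
					     u64 start, u64 len)
{
	struct btrfs_path path = { 0 };
	u64 cur = start;
	int count = 0;
	int ret;

	path.search_commit_root = 1;
	path.skip_locking = 1;
	while (cur < start + len) {
		u64 extent_start, extent_size, flags, gen;

		ret = find_first_extent_item(extent_root, &path, cur,
					     start + len - cur);
		if (ret)	/* >0: no more items, <0: error; path released */
			break;
		get_extent_info(&path, &extent_start, &extent_size, &flags, &gen);
		count++;
		/* The item may start before @cur, so only ever advance forward */
		cur = max(extent_start + extent_size, cur + 1);
	}
	btrfs_release_path(&path);
	return count;
}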
2986
2987 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2988                                       u64 boundary_start, u64 boundary_len)
2989 {
2990         return (extent_start < boundary_start &&
2991                 extent_start + extent_len > boundary_start) ||
2992                (extent_start < boundary_start + boundary_len &&
2993                 extent_start + extent_len > boundary_start + boundary_len);
2994 }
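
/*
 * Quick illustration (not part of the original code): with a hypothetical
 * 64KiB boundary starting at 128KiB, an extent [96KiB, 160KiB) crosses the
 * lower edge, while an extent equal to the boundary itself does not.
 */
static inline bool example_boundary_checks(void)
{
	const u64 bstart = 128 * 1024, blen = 64 * 1024;

	return does_range_cross_boundary(96 * 1024, 64 * 1024, bstart, blen) &&
	       !does_range_cross_boundary(bstart, blen, bstart, blen);
}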
2995
2996 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
2997                                                struct scrub_parity *sparity,
2998                                                struct map_lookup *map,
2999                                                struct btrfs_device *sdev,
3000                                                struct btrfs_path *path,
3001                                                u64 logical)
3002 {
3003         struct btrfs_fs_info *fs_info = sctx->fs_info;
3004         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3005         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3006         u64 cur_logical = logical;
3007         int ret;
3008
3009         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3010
3011         /* Path must not be populated */
3012         ASSERT(!path->nodes[0]);
3013
3014         while (cur_logical < logical + map->stripe_len) {
3015                 struct btrfs_io_context *bioc = NULL;
3016                 struct btrfs_device *extent_dev;
3017                 u64 extent_start;
3018                 u64 extent_size;
3019                 u64 mapped_length;
3020                 u64 extent_flags;
3021                 u64 extent_gen;
3022                 u64 extent_physical;
3023                 u64 extent_mirror_num;
3024
3025                 ret = find_first_extent_item(extent_root, path, cur_logical,
3026                                              logical + map->stripe_len - cur_logical);
3027                 /* No more extent item in this data stripe */
3028                 if (ret > 0) {
3029                         ret = 0;
3030                         break;
3031                 }
3032                 if (ret < 0)
3033                         break;
3034                 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3035                                 &extent_gen);
3036
3037                 /* Metadata should not cross stripe boundaries */
3038                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3039                     does_range_cross_boundary(extent_start, extent_size,
3040                                               logical, map->stripe_len)) {
3041                         btrfs_err(fs_info,
3042         "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3043                                   extent_start, logical);
3044                         spin_lock(&sctx->stat_lock);
3045                         sctx->stat.uncorrectable_errors++;
3046                         spin_unlock(&sctx->stat_lock);
3047                         cur_logical += extent_size;
3048                         continue;
3049                 }
3050
3051                 /* Skip hole range which doesn't have any extent */
3052                 cur_logical = max(extent_start, cur_logical);
3053
3054                 /* Truncate the range inside this data stripe */
3055                 extent_size = min(extent_start + extent_size,
3056                                   logical + map->stripe_len) - cur_logical;
3057                 extent_start = cur_logical;
3058                 ASSERT(extent_size <= U32_MAX);
3059
3060                 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3061
3062                 mapped_length = extent_size;
3063                 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3064                                       &mapped_length, &bioc, 0);
3065                 if (!ret && (!bioc || mapped_length < extent_size))
3066                         ret = -EIO;
3067                 if (ret) {
3068                         btrfs_put_bioc(bioc);
3069                         scrub_parity_mark_sectors_error(sparity, extent_start,
3070                                                         extent_size);
3071                         break;
3072                 }
3073                 extent_physical = bioc->stripes[0].physical;
3074                 extent_mirror_num = bioc->mirror_num;
3075                 extent_dev = bioc->stripes[0].dev;
3076                 btrfs_put_bioc(bioc);
3077
3078                 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3079                                                extent_start + extent_size - 1,
3080                                                &sctx->csum_list, 1);
3081                 if (ret) {
3082                         scrub_parity_mark_sectors_error(sparity, extent_start,
3083                                                         extent_size);
3084                         break;
3085                 }
3086
3087                 ret = scrub_extent_for_parity(sparity, extent_start,
3088                                               extent_size, extent_physical,
3089                                               extent_dev, extent_flags,
3090                                               extent_gen, extent_mirror_num);
3091                 scrub_free_csums(sctx);
3092
3093                 if (ret) {
3094                         scrub_parity_mark_sectors_error(sparity, extent_start,
3095                                                         extent_size);
3096                         break;
3097                 }
3098
3099                 cond_resched();
3100                 cur_logical += extent_size;
3101         }
3102         btrfs_release_path(path);
3103         return ret;
3104 }
3105
3106 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3107                                                   struct map_lookup *map,
3108                                                   struct btrfs_device *sdev,
3109                                                   u64 logic_start,
3110                                                   u64 logic_end)
3111 {
3112         struct btrfs_fs_info *fs_info = sctx->fs_info;
3113         struct btrfs_path *path;
3114         u64 cur_logical;
3115         int ret;
3116         struct scrub_parity *sparity;
3117         int nsectors;
3118
3119         path = btrfs_alloc_path();
3120         if (!path) {
3121                 spin_lock(&sctx->stat_lock);
3122                 sctx->stat.malloc_errors++;
3123                 spin_unlock(&sctx->stat_lock);
3124                 return -ENOMEM;
3125         }
3126         path->search_commit_root = 1;
3127         path->skip_locking = 1;
3128
3129         ASSERT(map->stripe_len <= U32_MAX);
3130         nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3131         ASSERT(nsectors <= BITS_PER_LONG);
3132         sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3133         if (!sparity) {
3134                 spin_lock(&sctx->stat_lock);
3135                 sctx->stat.malloc_errors++;
3136                 spin_unlock(&sctx->stat_lock);
3137                 btrfs_free_path(path);
3138                 return -ENOMEM;
3139         }
3140
3141         ASSERT(map->stripe_len <= U32_MAX);
3142         sparity->stripe_len = map->stripe_len;
3143         sparity->nsectors = nsectors;
3144         sparity->sctx = sctx;
3145         sparity->scrub_dev = sdev;
3146         sparity->logic_start = logic_start;
3147         sparity->logic_end = logic_end;
3148         refcount_set(&sparity->refs, 1);
3149         INIT_LIST_HEAD(&sparity->sectors_list);
3150
3151         ret = 0;
3152         for (cur_logical = logic_start; cur_logical < logic_end;
3153              cur_logical += map->stripe_len) {
3154                 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3155                                                           sdev, path, cur_logical);
3156                 if (ret < 0)
3157                         break;
3158         }
3159
3160         scrub_parity_put(sparity);
3161         scrub_submit(sctx);
3162         mutex_lock(&sctx->wr_lock);
3163         scrub_wr_submit(sctx);
3164         mutex_unlock(&sctx->wr_lock);
3165
3166         btrfs_free_path(path);
3167         return ret < 0 ? ret : 0;
3168 }
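
/*
 * Size illustration (not from the original code): with the common 64KiB
 * stripe_len and 4KiB sectorsize, nsectors above is 64K >> 12 == 16, which
 * comfortably fits the single-word dbitmap/ebitmap (BITS_PER_LONG).
 */
static inline int example_parity_nsectors(void)
{
	const u32 stripe_len = 64 * 1024;
	const u32 sectorsize_bits = 12;		/* 4KiB sectors */

	return stripe_len >> sectorsize_bits;	/* 16 */
}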
3169
3170 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3171 {
3172         if (!btrfs_is_zoned(sctx->fs_info))
3173                 return;
3174
3175         sctx->flush_all_writes = true;
3176         scrub_submit(sctx);
3177         mutex_lock(&sctx->wr_lock);
3178         scrub_wr_submit(sctx);
3179         mutex_unlock(&sctx->wr_lock);
3180
3181         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3182 }
3183
3184 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3185                                         u64 physical, u64 physical_end)
3186 {
3187         struct btrfs_fs_info *fs_info = sctx->fs_info;
3188         int ret = 0;
3189
3190         if (!btrfs_is_zoned(fs_info))
3191                 return 0;
3192
3193         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3194
3195         mutex_lock(&sctx->wr_lock);
3196         if (sctx->write_pointer < physical_end) {
3197                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3198                                                     physical,
3199                                                     sctx->write_pointer);
3200                 if (ret)
3201                         btrfs_err(fs_info,
3202                                   "zoned: failed to recover write pointer");
3203         }
3204         mutex_unlock(&sctx->wr_lock);
3205         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3206
3207         return ret;
3208 }
3209
3210 /*
3211  * Scrub one range which can only have a simple mirror based profile.
3212  * (This includes every range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3213  *  RAID0/RAID10).
3214  *
3215  * Since we may need to handle a subset of a block group, we need the
3216  * @logical_start and @logical_length parameters.
3217  */
3218 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3219                                struct btrfs_root *extent_root,
3220                                struct btrfs_root *csum_root,
3221                                struct btrfs_block_group *bg,
3222                                struct map_lookup *map,
3223                                u64 logical_start, u64 logical_length,
3224                                struct btrfs_device *device,
3225                                u64 physical, int mirror_num)
3226 {
3227         struct btrfs_fs_info *fs_info = sctx->fs_info;
3228         const u64 logical_end = logical_start + logical_length;
3229         /* An artificial limit, inherited from the old scrub behavior */
3230         const u32 max_length = SZ_64K;
3231         struct btrfs_path path = { 0 };
3232         u64 cur_logical = logical_start;
3233         int ret;
3234
3235         /* The range must be inside the bg */
3236         ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3237
3238         path.search_commit_root = 1;
3239         path.skip_locking = 1;
3240         /* Go through each extent item inside the logical range */
3241         while (cur_logical < logical_end) {
3242                 u64 extent_start;
3243                 u64 extent_len;
3244                 u64 extent_flags;
3245                 u64 extent_gen;
3246                 u64 scrub_len;
3247
3248                 /* Canceled? */
3249                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3250                     atomic_read(&sctx->cancel_req)) {
3251                         ret = -ECANCELED;
3252                         break;
3253                 }
3254                 /* Paused? */
3255                 if (atomic_read(&fs_info->scrub_pause_req)) {
3256                         /* Push queued extents */
3257                         sctx->flush_all_writes = true;
3258                         scrub_submit(sctx);
3259                         mutex_lock(&sctx->wr_lock);
3260                         scrub_wr_submit(sctx);
3261                         mutex_unlock(&sctx->wr_lock);
3262                         wait_event(sctx->list_wait,
3263                                    atomic_read(&sctx->bios_in_flight) == 0);
3264                         sctx->flush_all_writes = false;
3265                         scrub_blocked_if_needed(fs_info);
3266                 }
3267                 /* Block group removed? */
3268                 spin_lock(&bg->lock);
3269                 if (bg->removed) {
3270                         spin_unlock(&bg->lock);
3271                         ret = 0;
3272                         break;
3273                 }
3274                 spin_unlock(&bg->lock);
3275
3276                 ret = find_first_extent_item(extent_root, &path, cur_logical,
3277                                              logical_end - cur_logical);
3278                 if (ret > 0) {
3279                         /* No more extent, just update the accounting */
3280                         sctx->stat.last_physical = physical + logical_length;
3281                         ret = 0;
3282                         break;
3283                 }
3284                 if (ret < 0)
3285                         break;
3286                 get_extent_info(&path, &extent_start, &extent_len,
3287                                 &extent_flags, &extent_gen);
3288                 /* Skip hole range which doesn't have any extent */
3289                 cur_logical = max(extent_start, cur_logical);
3290
3291                 /*
3292                  * Scrub len has three limits:
3293                  * - Extent size limit
3294                  * - Scrub range limit
3295                  *   This is especially important for RAID0/RAID10 to reuse
3296                  *   this function.
3297                  * - Max scrub size limit
3298                  */
3299                 scrub_len = min(min(extent_start + extent_len,
3300                                     logical_end), cur_logical + max_length) -
3301                             cur_logical;
3302
3303                 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3304                         ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3305                                         cur_logical + scrub_len - 1,
3306                                         &sctx->csum_list, 1);
3307                         if (ret)
3308                                 break;
3309                 }
3310                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3311                     does_range_cross_boundary(extent_start, extent_len,
3312                                               logical_start, logical_length)) {
3313                         btrfs_err(fs_info,
3314 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3315                                   extent_start, logical_start, logical_end);
3316                         spin_lock(&sctx->stat_lock);
3317                         sctx->stat.uncorrectable_errors++;
3318                         spin_unlock(&sctx->stat_lock);
3319                         cur_logical += scrub_len;
3320                         continue;
3321                 }
3322                 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3323                                    cur_logical - logical_start + physical,
3324                                    device, extent_flags, extent_gen,
3325                                    mirror_num);
3326                 scrub_free_csums(sctx);
3327                 if (ret)
3328                         break;
3329                 if (sctx->is_dev_replace)
3330                         sync_replace_for_zoned(sctx);
3331                 cur_logical += scrub_len;
3332                 /* Don't hold the CPU for too long */
3333                 cond_resched();
3334         }
3335         btrfs_release_path(&path);
3336         return ret;
3337 }
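
/*
 * scrub_len illustration (not from the original code): the loop above clamps
 * each submission to min(extent end, range end, cur + 64KiB) - cur.  E.g. for
 * a 1MiB extent starting at the current position, well inside the range, the
 * first chunk is just the artificial 64KiB limit.
 */
static inline u64 example_scrub_len(u64 cur, u64 extent_end, u64 range_end)
{
	const u64 max_length = SZ_64K;	/* same artificial cap as above */

	return min(min(extent_end, range_end), cur + max_length) - cur;
}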
3338
3339 /* Calculate the full stripe length for simple stripe based profiles */
3340 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3341 {
3342         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3343                             BTRFS_BLOCK_GROUP_RAID10));
3344
3345         return map->num_stripes / map->sub_stripes * map->stripe_len;
3346 }
3347
3348 /* Get the logical bytenr for the stripe */
3349 static u64 simple_stripe_get_logical(struct map_lookup *map,
3350                                      struct btrfs_block_group *bg,
3351                                      int stripe_index)
3352 {
3353         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3354                             BTRFS_BLOCK_GROUP_RAID10));
3355         ASSERT(stripe_index < map->num_stripes);
3356
3357         /*
3358          * (stripe_index / sub_stripes) gives how many data stripes we need to
3359          * skip.
3360          */
3361         return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3362 }
3363
3364 /* Get the mirror number for the stripe */
3365 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3366 {
3367         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3368                             BTRFS_BLOCK_GROUP_RAID10));
3369         ASSERT(stripe_index < map->num_stripes);
3370
3371         /* For RAID0 it's always 1; for RAID10 it alternates 1,2,1,2,... */
3372         return stripe_index % map->sub_stripes + 1;
3373 }
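
/*
 * Worked example (illustration only): a hypothetical RAID10 chunk with
 * num_stripes = 4, sub_stripes = 2 and stripe_len = 64KiB.  The three helpers
 * above reduce to the plain arithmetic below; the mirror number is returned.
 */
static inline int example_simple_stripe_math(u64 *full_stripe_len,
					     u64 *logical_offset)
{
	const u64 stripe_len = 64 * 1024;
	const int num_stripes = 4, sub_stripes = 2;
	const int stripe_index = 3;	/* second copy of the second stripe */

	/* simple_stripe_full_stripe_len(): 4 / 2 * 64KiB == 128KiB */
	*full_stripe_len = num_stripes / sub_stripes * stripe_len;
	/* simple_stripe_get_logical() minus bg->start: skip 3 / 2 == 1 stripe */
	*logical_offset = (stripe_index / sub_stripes) * stripe_len;
	/* simple_stripe_mirror_num(): 3 % 2 + 1 == 2 */
	return stripe_index % sub_stripes + 1;
}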
3374
3375 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3376                                struct btrfs_root *extent_root,
3377                                struct btrfs_root *csum_root,
3378                                struct btrfs_block_group *bg,
3379                                struct map_lookup *map,
3380                                struct btrfs_device *device,
3381                                int stripe_index)
3382 {
3383         const u64 logical_increment = simple_stripe_full_stripe_len(map);
3384         const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3385         const u64 orig_physical = map->stripes[stripe_index].physical;
3386         const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3387         u64 cur_logical = orig_logical;
3388         u64 cur_physical = orig_physical;
3389         int ret = 0;
3390
3391         while (cur_logical < bg->start + bg->length) {
3392                 /*
3393                  * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3394                  * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3395                  * this stripe.
3396                  */
3397                 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3398                                           cur_logical, map->stripe_len, device,
3399                                           cur_physical, mirror_num);
3400                 if (ret)
3401                         return ret;
3402                 /* Skip to next stripe which belongs to the target device */
3403                 cur_logical += logical_increment;
3404                 /* For physical offset, we just go to next stripe */
3405                 cur_physical += map->stripe_len;
3406         }
3407         return ret;
3408 }
3409
3410 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3411                                            struct btrfs_block_group *bg,
3412                                            struct extent_map *em,
3413                                            struct btrfs_device *scrub_dev,
3414                                            int stripe_index)
3415 {
3416         struct btrfs_path *path;
3417         struct btrfs_fs_info *fs_info = sctx->fs_info;
3418         struct btrfs_root *root;
3419         struct btrfs_root *csum_root;
3420         struct blk_plug plug;
3421         struct map_lookup *map = em->map_lookup;
3422         const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3423         const u64 chunk_logical = bg->start;
3424         int ret;
3425         u64 physical = map->stripes[stripe_index].physical;
3426         const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
3427         const u64 physical_end = physical + dev_stripe_len;
3428         u64 logical;
3429         u64 logic_end;
3430         /* The logical increment after finishing one stripe */
3431         u64 increment;
3432         /* Offset inside the chunk */
3433         u64 offset;
3434         u64 stripe_logical;
3435         u64 stripe_end;
3436         int stop_loop = 0;
3437
3438         path = btrfs_alloc_path();
3439         if (!path)
3440                 return -ENOMEM;
3441
3442         /*
3443          * Work on the commit root. The related disk blocks are static as
3444          * long as COW is applied. This means it is safe to rewrite
3445          * them to repair disk errors without any race conditions.
3446          */
3447         path->search_commit_root = 1;
3448         path->skip_locking = 1;
3449         path->reada = READA_FORWARD;
3450
3451         wait_event(sctx->list_wait,
3452                    atomic_read(&sctx->bios_in_flight) == 0);
3453         scrub_blocked_if_needed(fs_info);
3454
3455         root = btrfs_extent_root(fs_info, bg->start);
3456         csum_root = btrfs_csum_root(fs_info, bg->start);
3457
3458         /*
3459          * Collect all data csums for the stripe to avoid seeking during
3460          * the scrub. This might currently (crc32) end up being about 1MB.
3461          */
3462         blk_start_plug(&plug);
3463
3464         if (sctx->is_dev_replace &&
3465             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3466                 mutex_lock(&sctx->wr_lock);
3467                 sctx->write_pointer = physical;
3468                 mutex_unlock(&sctx->wr_lock);
3469                 sctx->flush_all_writes = true;
3470         }
3471
3472         /*
3473          * There used to be a big double loop to handle all profiles using the
3474          * same routine, which grew larger and more gross over time.
3475          *
3476          * So here we handle each profile differently, so that simpler profiles
3477          * have simpler scrubbing functions.
3478          */
3479         if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3480                          BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3481                 /*
3482                  * The above check rules out all complex profiles; the remaining
3483                  * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
3484                  * mirrored duplication without striping.
3485                  *
3486                  * Only @physical and @mirror_num need to be calculated using
3487                  * @stripe_index.
3488                  */
3489                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3490                                 bg->start, bg->length, scrub_dev,
3491                                 map->stripes[stripe_index].physical,
3492                                 stripe_index + 1);
3493                 offset = 0;
3494                 goto out;
3495         }
3496         if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3497                 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3498                                           scrub_dev, stripe_index);
3499                 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3500                 goto out;
3501         }
3502
3503         /* Only RAID56 goes through the old code */
3504         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3505         ret = 0;
3506
3507         /* Calculate the logical end of the stripe */
3508         get_raid56_logic_offset(physical_end, stripe_index,
3509                                 map, &logic_end, NULL);
3510         logic_end += chunk_logical;
3511
3512         /* Initialize @offset in case we need to go to out: label */
3513         get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3514         increment = map->stripe_len * nr_data_stripes(map);
3515
3516         /*
3517          * Due to the rotation, for RAID56 it's better to iterate over the
3518          * stripes using their physical offsets.
3519          */
3520         while (physical < physical_end) {
3521                 ret = get_raid56_logic_offset(physical, stripe_index, map,
3522                                               &logical, &stripe_logical);
3523                 logical += chunk_logical;
3524                 if (ret) {
3525                         /* It is a parity stripe */
3526                         stripe_logical += chunk_logical;
3527                         stripe_end = stripe_logical + increment;
3528                         ret = scrub_raid56_parity(sctx, map, scrub_dev,
3529                                                   stripe_logical,
3530                                                   stripe_end);
3531                         if (ret)
3532                                 goto out;
3533                         goto next;
3534                 }
3535
3536                 /*
3537                  * Now we're at a data stripe, scrub each extent in the range.
3538                  *
3539                  * At this stage, if we ignore the repair part, inside each data
3540                  * stripe it is no different from the SINGLE profile.
3541                  * We can reuse scrub_simple_mirror() here, as the repair part
3542                  * is still based on @mirror_num.
3543                  */
3544                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3545                                           logical, map->stripe_len,
3546                                           scrub_dev, physical, 1);
3547                 if (ret < 0)
3548                         goto out;
3549 next:
3550                 logical += increment;
3551                 physical += map->stripe_len;
3552                 spin_lock(&sctx->stat_lock);
3553                 if (stop_loop)
3554                         sctx->stat.last_physical =
3555                                 map->stripes[stripe_index].physical + dev_stripe_len;
3556                 else
3557                         sctx->stat.last_physical = physical;
3558                 spin_unlock(&sctx->stat_lock);
3559                 if (stop_loop)
3560                         break;
3561         }
3562 out:
3563         /* push queued extents */
3564         scrub_submit(sctx);
3565         mutex_lock(&sctx->wr_lock);
3566         scrub_wr_submit(sctx);
3567         mutex_unlock(&sctx->wr_lock);
3568
3569         blk_finish_plug(&plug);
3570         btrfs_free_path(path);
3571
3572         if (sctx->is_dev_replace && ret >= 0) {
3573                 int ret2;
3574
3575                 ret2 = sync_write_pointer_for_zoned(sctx,
3576                                 chunk_logical + offset,
3577                                 map->stripes[stripe_index].physical,
3578                                 physical_end);
3579                 if (ret2)
3580                         ret = ret2;
3581         }
3582
3583         return ret < 0 ? ret : 0;
3584 }
3585
3586 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3587                                           struct btrfs_block_group *bg,
3588                                           struct btrfs_device *scrub_dev,
3589                                           u64 dev_offset,
3590                                           u64 dev_extent_len)
3591 {
3592         struct btrfs_fs_info *fs_info = sctx->fs_info;
3593         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3594         struct map_lookup *map;
3595         struct extent_map *em;
3596         int i;
3597         int ret = 0;
3598
3599         read_lock(&map_tree->lock);
3600         em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3601         read_unlock(&map_tree->lock);
3602
3603         if (!em) {
3604                 /*
3605                  * Might have been an unused block group deleted by the cleaner
3606                  * kthread or relocation.
3607                  */
3608                 spin_lock(&bg->lock);
3609                 if (!bg->removed)
3610                         ret = -EINVAL;
3611                 spin_unlock(&bg->lock);
3612
3613                 return ret;
3614         }
3615         if (em->start != bg->start)
3616                 goto out;
3617         if (em->len < dev_extent_len)
3618                 goto out;
3619
3620         map = em->map_lookup;
3621         for (i = 0; i < map->num_stripes; ++i) {
3622                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3623                     map->stripes[i].physical == dev_offset) {
3624                         ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
3625                         if (ret)
3626                                 goto out;
3627                 }
3628         }
3629 out:
3630         free_extent_map(em);
3631
3632         return ret;
3633 }
3634
3635 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3636                                           struct btrfs_block_group *cache)
3637 {
3638         struct btrfs_fs_info *fs_info = cache->fs_info;
3639         struct btrfs_trans_handle *trans;
3640
3641         if (!btrfs_is_zoned(fs_info))
3642                 return 0;
3643
3644         btrfs_wait_block_group_reservations(cache);
3645         btrfs_wait_nocow_writers(cache);
3646         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3647
3648         trans = btrfs_join_transaction(root);
3649         if (IS_ERR(trans))
3650                 return PTR_ERR(trans);
3651         return btrfs_commit_transaction(trans);
3652 }
3653
3654 static noinline_for_stack
3655 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3656                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3657 {
3658         struct btrfs_dev_extent *dev_extent = NULL;
3659         struct btrfs_path *path;
3660         struct btrfs_fs_info *fs_info = sctx->fs_info;
3661         struct btrfs_root *root = fs_info->dev_root;
3662         u64 chunk_offset;
3663         int ret = 0;
3664         int ro_set;
3665         int slot;
3666         struct extent_buffer *l;
3667         struct btrfs_key key;
3668         struct btrfs_key found_key;
3669         struct btrfs_block_group *cache;
3670         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3671
3672         path = btrfs_alloc_path();
3673         if (!path)
3674                 return -ENOMEM;
3675
3676         path->reada = READA_FORWARD;
3677         path->search_commit_root = 1;
3678         path->skip_locking = 1;
3679
3680         key.objectid = scrub_dev->devid;
3681         key.offset = 0ull;
3682         key.type = BTRFS_DEV_EXTENT_KEY;
3683
3684         while (1) {
3685                 u64 dev_extent_len;
3686
3687                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3688                 if (ret < 0)
3689                         break;
3690                 if (ret > 0) {
3691                         if (path->slots[0] >=
3692                             btrfs_header_nritems(path->nodes[0])) {
3693                                 ret = btrfs_next_leaf(root, path);
3694                                 if (ret < 0)
3695                                         break;
3696                                 if (ret > 0) {
3697                                         ret = 0;
3698                                         break;
3699                                 }
3700                         } else {
3701                                 ret = 0;
3702                         }
3703                 }
3704
3705                 l = path->nodes[0];
3706                 slot = path->slots[0];
3707
3708                 btrfs_item_key_to_cpu(l, &found_key, slot);
3709
3710                 if (found_key.objectid != scrub_dev->devid)
3711                         break;
3712
3713                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3714                         break;
3715
3716                 if (found_key.offset >= end)
3717                         break;
3718
3719                 if (found_key.offset < key.offset)
3720                         break;
3721
3722                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3723                 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3724
3725                 if (found_key.offset + dev_extent_len <= start)
3726                         goto skip;
3727
3728                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3729
3730                 /*
3731                  * get a reference on the corresponding block group to prevent
3732                  * the chunk from going away while we scrub it
3733                  */
3734                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3735
3736                 /* some chunks are removed but not committed to disk yet,
3737                  * continue scrubbing */
3738                 if (!cache)
3739                         goto skip;
3740
3741                 ASSERT(cache->start <= chunk_offset);
3742                 /*
3743                  * We are using the commit root to search for device extents, so
3744                  * that means we could have found a device extent item from a
3745                  * block group that was deleted in the current transaction. The
3746                  * logical start offset of the deleted block group, stored at
3747                  * @chunk_offset, might be part of the logical address range of
3748                  * a new block group (which uses different physical extents).
3749                  * In this case btrfs_lookup_block_group() has returned the new
3750                  * block group, and its start address is less than @chunk_offset.
3751                  *
3752                  * We skip such new block groups, because it's pointless to
3753                  * process them, as we won't find their extents because we search
3754                  * for them using the commit root of the extent tree. For a device
3755                  * replace it's also fine to skip it, we won't miss copying them
3756                  * to the target device because we have the write duplication
3757                  * setup through the regular write path (by btrfs_map_block()),
3758                  * and we have committed a transaction when we started the device
3759                  * replace, right after setting up the device replace state.
3760                  */
3761                 if (cache->start < chunk_offset) {
3762                         btrfs_put_block_group(cache);
3763                         goto skip;
3764                 }
3765
3766                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3767                         spin_lock(&cache->lock);
3768                         if (!cache->to_copy) {
3769                                 spin_unlock(&cache->lock);
3770                                 btrfs_put_block_group(cache);
3771                                 goto skip;
3772                         }
3773                         spin_unlock(&cache->lock);
3774                 }
3775
3776                 /*
3777                  * Make sure that while we are scrubbing the corresponding block
3778                  * group doesn't get its logical address and its device extents
3779                  * reused for another block group, which can possibly be of a
3780                  * different type and different profile. We do this to prevent
3781                  * false error detections and crashes due to bogus attempts to
3782                  * repair extents.
3783                  */
3784                 spin_lock(&cache->lock);
3785                 if (cache->removed) {
3786                         spin_unlock(&cache->lock);
3787                         btrfs_put_block_group(cache);
3788                         goto skip;
3789                 }
3790                 btrfs_freeze_block_group(cache);
3791                 spin_unlock(&cache->lock);
3792
3793                 /*
3794                  * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3795                  * to avoid deadlock caused by:
3796                  * btrfs_inc_block_group_ro()
3797                  * -> btrfs_wait_for_commit()
3798                  * -> btrfs_commit_transaction()
3799                  * -> btrfs_scrub_pause()
3800                  */
3801                 scrub_pause_on(fs_info);
3802
3803                 /*
3804                  * Don't do chunk preallocation for scrub.
3805                  *
3806                  * This is especially important for SYSTEM bgs, or we can hit
3807                  * -EFBIG from btrfs_finish_chunk_alloc() like:
3808                  * 1. The only SYSTEM bg is marked RO.
3809                  *    Since SYSTEM bg is small, that's pretty common.
3810                  * 2. New SYSTEM bg will be allocated
3811                  *    This is because the regular allocation path will create a new chunk.
3812                  * 3. New SYSTEM bg is empty and will get cleaned up
3813                  *    Before cleanup really happens, it's marked RO again.
3814                  * 4. The empty SYSTEM bg gets scrubbed
3815                  *    We go back to 2.
3816                  *
3817                  * This can easily boost the number of SYSTEM chunks if the cleaner
3818                  * thread can't be triggered fast enough, using up all the space
3819                  * in btrfs_super_block::sys_chunk_array.
3820                  *
3821                  * While for dev replace, we need to try our best to mark block
3822                  * group RO, to prevent a race between:
3823                  * - Write duplication
3824                  *   Contains latest data
3825                  * - Scrub copy
3826                  *   Contains data from commit tree
3827                  *
3828                  * If the target block group is not marked RO, nocow writes can
3829                  * be overwritten by the scrub copy, causing data corruption.
3830                  * So for dev-replace, it's not allowed to continue if a block
3831                  * group is not RO.
3832                  */
3833                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3834                 if (!ret && sctx->is_dev_replace) {
3835                         ret = finish_extent_writes_for_zoned(root, cache);
3836                         if (ret) {
3837                                 btrfs_dec_block_group_ro(cache);
3838                                 scrub_pause_off(fs_info);
3839                                 btrfs_put_block_group(cache);
3840                                 break;
3841                         }
3842                 }
3843
3844                 if (ret == 0) {
3845                         ro_set = 1;
3846                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3847                         /*
3848                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3849                          * fails to create a new chunk for metadata.
3850                          * It is not a problem for scrub, because
3851                          * metadata is always COWed, and our scrub has paused
3852                          * transaction commits.
3853                          */
3854                         ro_set = 0;
3855                 } else if (ret == -ETXTBSY) {
3856                         btrfs_warn(fs_info,
3857                    "skipping scrub of block group %llu due to active swapfile",
3858                                    cache->start);
3859                         scrub_pause_off(fs_info);
3860                         ret = 0;
3861                         goto skip_unfreeze;
3862                 } else {
3863                         btrfs_warn(fs_info,
3864                                    "failed setting block group ro: %d", ret);
3865                         btrfs_unfreeze_block_group(cache);
3866                         btrfs_put_block_group(cache);
3867                         scrub_pause_off(fs_info);
3868                         break;
3869                 }
3870
3871                 /*
3872                  * Now the target block group is marked RO, wait for nocow writes
3873                  * to finish before dev-replace.
3874                  * COW is fine, as COW never overwrites extents in the commit tree.
3875                  */
3876                 if (sctx->is_dev_replace) {
3877                         btrfs_wait_nocow_writers(cache);
3878                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3879                                         cache->length);
3880                 }
3881
3882                 scrub_pause_off(fs_info);
3883                 down_write(&dev_replace->rwsem);
3884                 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3885                 dev_replace->cursor_left = found_key.offset;
3886                 dev_replace->item_needs_writeback = 1;
3887                 up_write(&dev_replace->rwsem);
3888
3889                 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3890                                   dev_extent_len);
3891
3892                 /*
3893                  * Flush and submit all pending read and write bios, and afterwards
3894                  * wait for them.
3895                  * Note that in the dev replace case, a read request causes
3896                  * write requests that are submitted in the read completion
3897                  * worker. Therefore in the current situation, it is required
3898                  * that all write requests are flushed, so that all read and
3899                  * write requests are really completed when bios_in_flight
3900                  * changes to 0.
3901                  */
3902                 sctx->flush_all_writes = true;
3903                 scrub_submit(sctx);
3904                 mutex_lock(&sctx->wr_lock);
3905                 scrub_wr_submit(sctx);
3906                 mutex_unlock(&sctx->wr_lock);
3907
3908                 wait_event(sctx->list_wait,
3909                            atomic_read(&sctx->bios_in_flight) == 0);
3910
3911                 scrub_pause_on(fs_info);
3912
3913                 /*
3914                  * This must be done before we decrease @scrub_paused, to
3915                  * make sure we don't block transaction commit while
3916                  * we are waiting for pending workers to finish.
3917                  */
3918                 wait_event(sctx->list_wait,
3919                            atomic_read(&sctx->workers_pending) == 0);
3920                 sctx->flush_all_writes = false;
3921
3922                 scrub_pause_off(fs_info);
3923
3924                 if (sctx->is_dev_replace &&
3925                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3926                                                       cache, found_key.offset))
3927                         ro_set = 0;
3928
3929                 down_write(&dev_replace->rwsem);
3930                 dev_replace->cursor_left = dev_replace->cursor_right;
3931                 dev_replace->item_needs_writeback = 1;
3932                 up_write(&dev_replace->rwsem);
3933
3934                 if (ro_set)
3935                         btrfs_dec_block_group_ro(cache);
3936
3937                 /*
3938                  * We might have prevented the cleaner kthread from deleting
3939                  * this block group if it was already unused because we raced
3940                  * and set it to RO mode first. So add it back to the unused
3941                  * list, otherwise it might not ever be deleted unless a manual
3942                  * balance is triggered or it becomes used and unused again.
3943                  */
3944                 spin_lock(&cache->lock);
3945                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3946                     cache->used == 0) {
3947                         spin_unlock(&cache->lock);
3948                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3949                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
3950                                                          cache);
3951                         else
3952                                 btrfs_mark_bg_unused(cache);
3953                 } else {
3954                         spin_unlock(&cache->lock);
3955                 }
3956 skip_unfreeze:
3957                 btrfs_unfreeze_block_group(cache);
3958                 btrfs_put_block_group(cache);
3959                 if (ret)
3960                         break;
3961                 if (sctx->is_dev_replace &&
3962                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3963                         ret = -EIO;
3964                         break;
3965                 }
3966                 if (sctx->stat.malloc_errors > 0) {
3967                         ret = -ENOMEM;
3968                         break;
3969                 }
3970 skip:
3971                 key.offset = found_key.offset + dev_extent_len;
3972                 btrfs_release_path(path);
3973         }
3974
3975         btrfs_free_path(path);
3976
3977         return ret;
3978 }
3979
3980 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3981                                            struct btrfs_device *scrub_dev)
3982 {
3983         int     i;
3984         u64     bytenr;
3985         u64     gen;
3986         int     ret;
3987         struct btrfs_fs_info *fs_info = sctx->fs_info;
3988
3989         if (BTRFS_FS_ERROR(fs_info))
3990                 return -EROFS;
3991
3992         /* Seed devices of a new filesystem have their own generation. */
3993         if (scrub_dev->fs_devices != fs_info->fs_devices)
3994                 gen = scrub_dev->generation;
3995         else
3996                 gen = fs_info->last_trans_committed;
3997
3998         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3999                 bytenr = btrfs_sb_offset(i);
4000                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4001                     scrub_dev->commit_total_bytes)
4002                         break;
4003                 if (!btrfs_check_super_location(scrub_dev, bytenr))
4004                         continue;
4005
4006                 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4007                                     scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4008                                     NULL, bytenr);
4009                 if (ret)
4010                         return ret;
4011         }
4012         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4013
4014         return 0;
4015 }
4016
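/*
 * Drop one reference on the scrub workqueues.  The last reference detaches
 * them from fs_info under scrub_lock and destroys them after the lock has
 * been released.
 */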
4017 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4018 {
4019         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4020                                         &fs_info->scrub_lock)) {
4021                 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4022                 struct workqueue_struct *scrub_wr_comp =
4023                                                 fs_info->scrub_wr_completion_workers;
4024                 struct workqueue_struct *scrub_parity =
4025                                                 fs_info->scrub_parity_workers;
4026
4027                 fs_info->scrub_workers = NULL;
4028                 fs_info->scrub_wr_completion_workers = NULL;
4029                 fs_info->scrub_parity_workers = NULL;
4030                 mutex_unlock(&fs_info->scrub_lock);
4031
4032                 if (scrub_workers)
4033                         destroy_workqueue(scrub_workers);
4034                 if (scrub_wr_comp)
4035                         destroy_workqueue(scrub_wr_comp);
4036                 if (scrub_parity)
4037                         destroy_workqueue(scrub_parity);
4038         }
4039 }
4040
4041 /*
4042  * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
4043  */
4044 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4045                                                 int is_dev_replace)
4046 {
4047         struct workqueue_struct *scrub_workers = NULL;
4048         struct workqueue_struct *scrub_wr_comp = NULL;
4049         struct workqueue_struct *scrub_parity = NULL;
4050         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4051         int max_active = fs_info->thread_pool_size;
4052         int ret = -ENOMEM;
4053
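        /*
         * Fast path: if the workqueues already exist, just take another
         * reference without touching scrub_lock.
         */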
4054         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4055                 return 0;
4056
4057         scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4058                                         is_dev_replace ? 1 : max_active);
4059         if (!scrub_workers)
4060                 goto fail_scrub_workers;
4061
4062         scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4063         if (!scrub_wr_comp)
4064                 goto fail_scrub_wr_completion_workers;
4065
4066         scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4067         if (!scrub_parity)
4068                 goto fail_scrub_parity_workers;
4069
4070         mutex_lock(&fs_info->scrub_lock);
4071         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4072                 ASSERT(fs_info->scrub_workers == NULL &&
4073                        fs_info->scrub_wr_completion_workers == NULL &&
4074                        fs_info->scrub_parity_workers == NULL);
4075                 fs_info->scrub_workers = scrub_workers;
4076                 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4077                 fs_info->scrub_parity_workers = scrub_parity;
4078                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4079                 mutex_unlock(&fs_info->scrub_lock);
4080                 return 0;
4081         }
4082         /* Another thread raced in and created the workers for us. */
4083         refcount_inc(&fs_info->scrub_workers_refcnt);
4084         mutex_unlock(&fs_info->scrub_lock);
4085
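        /*
         * We lost the race, another thread installed its workqueues first.
         * Fall through the error labels to free our local ones, but report
         * success since usable workers do exist now.
         */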
4086         ret = 0;
4087         destroy_workqueue(scrub_parity);
4088 fail_scrub_parity_workers:
4089         destroy_workqueue(scrub_wr_comp);
4090 fail_scrub_wr_completion_workers:
4091         destroy_workqueue(scrub_workers);
4092 fail_scrub_workers:
4093         return ret;
4094 }
4095
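/*
 * Scrub (or, for device replace, copy) the allocated extents of the device
 * given by @devid between @start and @end.
 *
 * @progress:       if not NULL, the final scrub statistics are copied here
 * @readonly:       run the scrub read-only, reporting errors without repair
 * @is_dev_replace: run as the read side of a device replace operation
 *
 * Returns 0 on success or a negative errno, e.g. -EAGAIN if the filesystem
 * is being closed, -ENODEV if the device cannot be found, or -EINPROGRESS if
 * a scrub or device replace is already running on the device.
 */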
4096 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4097                     u64 end, struct btrfs_scrub_progress *progress,
4098                     int readonly, int is_dev_replace)
4099 {
4100         struct btrfs_dev_lookup_args args = { .devid = devid };
4101         struct scrub_ctx *sctx;
4102         int ret;
4103         struct btrfs_device *dev;
4104         unsigned int nofs_flag;
4105
4106         if (btrfs_fs_closing(fs_info))
4107                 return -EAGAIN;
4108
4109         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4110                 /*
4111                  * In this case scrub is unable to calculate the checksum
4112                  * the way it is currently implemented. Do not handle this
4113                  * situation at all because it won't ever happen.
4114                  */
4115                 btrfs_err(fs_info,
4116                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4117                        fs_info->nodesize,
4118                        BTRFS_STRIPE_LEN);
4119                 return -EINVAL;
4120         }
4121
4122         if (fs_info->nodesize >
4123             SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4124             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4125                 /*
4126                  * Would exhaust the array bounds of the sectors member in
4127                  * struct scrub_block.
4128                  */
4129                 btrfs_err(fs_info,
4130 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4131                        fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4132                        fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4133                 return -EINVAL;
4134         }
4135
4136         /* Allocate outside of device_list_mutex */
4137         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4138         if (IS_ERR(sctx))
4139                 return PTR_ERR(sctx);
4140
4141         ret = scrub_workers_get(fs_info, is_dev_replace);
4142         if (ret)
4143                 goto out_free_ctx;
4144
4145         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4146         dev = btrfs_find_device(fs_info->fs_devices, &args);
4147         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4148                      !is_dev_replace)) {
4149                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4150                 ret = -ENODEV;
4151                 goto out;
4152         }
4153
4154         if (!is_dev_replace && !readonly &&
4155             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4156                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4157                 btrfs_err_in_rcu(fs_info,
4158                         "scrub on devid %llu: filesystem on %s is not writable",
4159                                  devid, rcu_str_deref(dev->name));
4160                 ret = -EROFS;
4161                 goto out;
4162         }
4163
4164         mutex_lock(&fs_info->scrub_lock);
4165         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4166             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4167                 mutex_unlock(&fs_info->scrub_lock);
4168                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169                 ret = -EIO;
4170                 goto out;
4171         }
4172
4173         down_read(&fs_info->dev_replace.rwsem);
4174         if (dev->scrub_ctx ||
4175             (!is_dev_replace &&
4176              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4177                 up_read(&fs_info->dev_replace.rwsem);
4178                 mutex_unlock(&fs_info->scrub_lock);
4179                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4180                 ret = -EINPROGRESS;
4181                 goto out;
4182         }
4183         up_read(&fs_info->dev_replace.rwsem);
4184
4185         sctx->readonly = readonly;
4186         dev->scrub_ctx = sctx;
4187         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4188
4189         /*
4190          * By checking @scrub_pause_req here, we can avoid the
4191          * race between transaction commit and scrubbing.
4192          */
4193         __scrub_blocked_if_needed(fs_info);
4194         atomic_inc(&fs_info->scrubs_running);
4195         mutex_unlock(&fs_info->scrub_lock);
4196
4197         /*
4198          * In order to avoid deadlock with reclaim when there is a transaction
4199          * trying to pause scrub, make sure we use GFP_NOFS for all the
4200          * allocations done at scrub_sectors() and scrub_sectors_for_parity()
4201          * invoked by our callees. The pausing request is done when the
4202          * transaction commit starts, and it blocks the transaction until scrub
4203          * is paused (done at specific points in scrub_stripe() or right above,
4204          * before incrementing fs_info->scrubs_running).
4205          */
4206         nofs_flag = memalloc_nofs_save();
4207         if (!is_dev_replace) {
4208                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4209                 /*
4210                  * Holding the device list mutex serializes us against
4211                  * super block writes, e.g. those kicked off by a log tree sync.
4212                  */
4213                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4214                 ret = scrub_supers(sctx, dev);
4215                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4216         }
4217
4218         if (!ret)
4219                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4220         memalloc_nofs_restore(nofs_flag);
4221
4222         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4223         atomic_dec(&fs_info->scrubs_running);
4224         wake_up(&fs_info->scrub_pause_wait);
4225
4226         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4227
4228         if (progress)
4229                 memcpy(progress, &sctx->stat, sizeof(*progress));
4230
4231         if (!is_dev_replace)
4232                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4233                         ret ? "not finished" : "finished", devid, ret);
4234
4235         mutex_lock(&fs_info->scrub_lock);
4236         dev->scrub_ctx = NULL;
4237         mutex_unlock(&fs_info->scrub_lock);
4238
4239         scrub_workers_put(fs_info);
4240         scrub_put_ctx(sctx);
4241
4242         return ret;
4243 out:
4244         scrub_workers_put(fs_info);
4245 out_free_ctx:
4246         scrub_free_ctx(sctx);
4247
4248         return ret;
4249 }
4250
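/*
 * Ask all running scrubs to pause and wait until every one of them has
 * reached its pause point.  Balanced by btrfs_scrub_continue().
 */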
4251 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4252 {
4253         mutex_lock(&fs_info->scrub_lock);
4254         atomic_inc(&fs_info->scrub_pause_req);
4255         while (atomic_read(&fs_info->scrubs_paused) !=
4256                atomic_read(&fs_info->scrubs_running)) {
4257                 mutex_unlock(&fs_info->scrub_lock);
4258                 wait_event(fs_info->scrub_pause_wait,
4259                            atomic_read(&fs_info->scrubs_paused) ==
4260                            atomic_read(&fs_info->scrubs_running));
4261                 mutex_lock(&fs_info->scrub_lock);
4262         }
4263         mutex_unlock(&fs_info->scrub_lock);
4264 }
4265
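/* Allow paused scrubs to resume, counterpart of btrfs_scrub_pause(). */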
4266 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4267 {
4268         atomic_dec(&fs_info->scrub_pause_req);
4269         wake_up(&fs_info->scrub_pause_wait);
4270 }
4271
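/*
 * Cancel all running scrubs and wait for them to finish.  Returns -ENOTCONN
 * if no scrub was running.
 */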
4272 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4273 {
4274         mutex_lock(&fs_info->scrub_lock);
4275         if (!atomic_read(&fs_info->scrubs_running)) {
4276                 mutex_unlock(&fs_info->scrub_lock);
4277                 return -ENOTCONN;
4278         }
4279
4280         atomic_inc(&fs_info->scrub_cancel_req);
4281         while (atomic_read(&fs_info->scrubs_running)) {
4282                 mutex_unlock(&fs_info->scrub_lock);
4283                 wait_event(fs_info->scrub_pause_wait,
4284                            atomic_read(&fs_info->scrubs_running) == 0);
4285                 mutex_lock(&fs_info->scrub_lock);
4286         }
4287         atomic_dec(&fs_info->scrub_cancel_req);
4288         mutex_unlock(&fs_info->scrub_lock);
4289
4290         return 0;
4291 }
4292
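/*
 * Cancel a scrub running on the given device and wait until its scrub
 * context has been detached.  Returns -ENOTCONN if no scrub is running on
 * the device.
 */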
4293 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4294 {
4295         struct btrfs_fs_info *fs_info = dev->fs_info;
4296         struct scrub_ctx *sctx;
4297
4298         mutex_lock(&fs_info->scrub_lock);
4299         sctx = dev->scrub_ctx;
4300         if (!sctx) {
4301                 mutex_unlock(&fs_info->scrub_lock);
4302                 return -ENOTCONN;
4303         }
4304         atomic_inc(&sctx->cancel_req);
4305         while (dev->scrub_ctx) {
4306                 mutex_unlock(&fs_info->scrub_lock);
4307                 wait_event(fs_info->scrub_pause_wait,
4308                            dev->scrub_ctx == NULL);
4309                 mutex_lock(&fs_info->scrub_lock);
4310         }
4311         mutex_unlock(&fs_info->scrub_lock);
4312
4313         return 0;
4314 }
4315
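/*
 * Copy the current scrub statistics of @devid into @progress.  Returns
 * -ENODEV if the device cannot be found and -ENOTCONN if no scrub context
 * is attached to it.
 */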
4316 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4317                          struct btrfs_scrub_progress *progress)
4318 {
4319         struct btrfs_dev_lookup_args args = { .devid = devid };
4320         struct btrfs_device *dev;
4321         struct scrub_ctx *sctx = NULL;
4322
4323         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4324         dev = btrfs_find_device(fs_info->fs_devices, &args);
4325         if (dev)
4326                 sctx = dev->scrub_ctx;
4327         if (sctx)
4328                 memcpy(progress, &sctx->stat, sizeof(*progress));
4329         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4330
4331         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4332 }
4333
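/*
 * Map a read of @extent_logical and report the physical offset, device and
 * mirror number of the copy that btrfs_map_block() picked.  The output
 * parameters are left untouched if the extent cannot be fully mapped or the
 * chosen device has no bdev.
 */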
4334 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4335                                  u64 extent_logical, u32 extent_len,
4336                                  u64 *extent_physical,
4337                                  struct btrfs_device **extent_dev,
4338                                  int *extent_mirror_num)
4339 {
4340         u64 mapped_length;
4341         struct btrfs_io_context *bioc = NULL;
4342         int ret;
4343
4344         mapped_length = extent_len;
4345         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4346                               &mapped_length, &bioc, 0);
4347         if (ret || !bioc || mapped_length < extent_len ||
4348             !bioc->stripes[0].dev->bdev) {
4349                 btrfs_put_bioc(bioc);
4350                 return;
4351         }
4352
4353         *extent_physical = bioc->stripes[0].physical;
4354         *extent_mirror_num = bioc->mirror_num;
4355         *extent_dev = bioc->stripes[0].dev;
4356         btrfs_put_bioc(bioc);
4357 }