fs/btrfs/scrub.c

   1 /*
   2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 02111-1307, USA.
  17  */
  18
  19 #include <linux/blkdev.h>
  20 #include <linux/ratelimit.h>
  21 #include "ctree.h"
  22 #include "volumes.h"
  23 #include "disk-io.h"
  24 #include "ordered-data.h"
  25 #include "transaction.h"
  26 #include "backref.h"
  27 #include "extent_io.h"
  28 #include "dev-replace.h"
  29 #include "check-integrity.h"
  30 #include "rcu-string.h"
  31 #include "raid56.h"
  32
  33 /*
  34  * This is only the first step towards a full-featured scrub. It reads all
  35  * extents and super blocks and verifies the checksums. In case a bad checksum
  36  * is found or the extent cannot be read, good data will be written back if
  37  * any can be found.
  38  *
  39  * Future enhancements:
  40  *  - In case an unrepairable extent is encountered, track which files are
  41  *    affected and report them
  42  *  - track and record media errors, throw out bad devices
  43  *  - add a mode to also read unallocated space
  44  */
  45
  46 struct scrub_block;
  47 struct scrub_ctx;
  48
  49 /*
  50  * the following three values only influence the performance.
  51  * The last one configures the number of parallel and outstanding I/O
  52  * operations. The first two values configure an upper limit for the number
  53  * of (dynamically allocated) pages that are added to a bio.
  54  */
  55 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
  56 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
  57 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
  58
  59 /*
  60  * the following value times PAGE_SIZE needs to be large enough to match the
  61  * largest node/leaf/sector size that shall be supported.
  62  * Values larger than BTRFS_STRIPE_LEN are not supported.
  63  */
  64 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
  65
  66 struct scrub_recover {
  67         atomic_t                refs;
  68         struct btrfs_bio        *bbio;
  69         u64                     *raid_map;
  70         u64                     map_length;
  71 };
  72
  73 struct scrub_page {
  74         struct scrub_block      *sblock;
  75         struct page             *page;
  76         struct btrfs_device     *dev;
  77         struct list_head        list;
  78         u64                     flags;  /* extent flags */
  79         u64                     generation;
  80         u64                     logical;
  81         u64                     physical;
  82         u64                     physical_for_dev_replace;
  83         atomic_t                ref_count;
  84         struct {
  85                 unsigned int    mirror_num:8;
  86                 unsigned int    have_csum:1;
  87                 unsigned int    io_error:1;
  88         };
  89         u8                      csum[BTRFS_CSUM_SIZE];
  90
  91         struct scrub_recover    *recover;
  92 };
  93
  94 struct scrub_bio {
  95         int                     index;
  96         struct scrub_ctx        *sctx;
  97         struct btrfs_device     *dev;
  98         struct bio              *bio;
  99         int                     err;
 100         u64                     logical;
 101         u64                     physical;
 102 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
 103         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
 104 #else
 105         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
 106 #endif
 107         int                     page_count;
 108         int                     next_free;
 109         struct btrfs_work       work;
 110 };
 111
 112 struct scrub_block {
 113         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 114         int                     page_count;
 115         atomic_t                outstanding_pages;
 116         atomic_t                ref_count; /* free mem on transition to zero */
 117         struct scrub_ctx        *sctx;
 118         struct scrub_parity     *sparity;
 119         struct {
 120                 unsigned int    header_error:1;
 121                 unsigned int    checksum_error:1;
 122                 unsigned int    no_io_error_seen:1;
 123                 unsigned int    generation_error:1; /* also sets header_error */
 124
 125                 /* The following is for the data used to check parity */
 126                 /* It is for the data with checksum */
 127                 unsigned int    data_corrected:1;
 128         };
 129 };
 130
 131 /* Used for the chunks with parity stripe such RAID5/6 */
 132 struct scrub_parity {
 133         struct scrub_ctx        *sctx;
 134
 135         struct btrfs_device     *scrub_dev;
 136
 137         u64                     logic_start;
 138
 139         u64                     logic_end;
 140
 141         int                     nsectors;
 142
 143         int                     stripe_len;
 144
 145         atomic_t                ref_count;
 146
 147         struct list_head        spages;
 148
 149         /* Work of parity check and repair */
 150         struct btrfs_work       work;
 151
 152         /* Mark the parity blocks which have data */
 153         unsigned long           *dbitmap;
 154
 155         /*
 156          * Mark the parity blocks which have data, but errors happen when
 157          * read data or check data
 158          */
 159         unsigned long           *ebitmap;
 160
 161         unsigned long           bitmap[0];
 162 };
 163
 164 struct scrub_wr_ctx {
 165         struct scrub_bio *wr_curr_bio;
 166         struct btrfs_device *tgtdev;
 167         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
 168         atomic_t flush_all_writes;
 169         struct mutex wr_lock;
 170 };
 171
 172 struct scrub_ctx {
 173         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
 174         struct btrfs_root       *dev_root;
 175         int                     first_free;
 176         int                     curr;
 177         atomic_t                bios_in_flight;
 178         atomic_t                workers_pending;
 179         spinlock_t              list_lock;
 180         wait_queue_head_t       list_wait;
 181         u16                     csum_size;
 182         struct list_head        csum_list;
 183         atomic_t                cancel_req;
 184         int                     readonly;
 185         int                     pages_per_rd_bio;
 186         u32                     sectorsize;
 187         u32                     nodesize;
 188
 189         int                     is_dev_replace;
 190         struct scrub_wr_ctx     wr_ctx;
 191
 192         /*
 193          * statistics
 194          */
 195         struct btrfs_scrub_progress stat;
 196         spinlock_t              stat_lock;
 197 };
 198
 199 struct scrub_fixup_nodatasum {
 200         struct scrub_ctx        *sctx;
 201         struct btrfs_device     *dev;
 202         u64                     logical;
 203         struct btrfs_root       *root;
 204         struct btrfs_work       work;
 205         int                     mirror_num;
 206 };
 207
 208 struct scrub_nocow_inode {
 209         u64                     inum;
 210         u64                     offset;
 211         u64                     root;
 212         struct list_head        list;
 213 };
 214
 215 struct scrub_copy_nocow_ctx {
 216         struct scrub_ctx        *sctx;
 217         u64                     logical;
 218         u64                     len;
 219         int                     mirror_num;
 220         u64                     physical_for_dev_replace;
 221         struct list_head        inodes;
 222         struct btrfs_work       work;
 223 };
 224
 225 struct scrub_warning {
 226         struct btrfs_path       *path;
 227         u64                     extent_item_size;
 228         const char              *errstr;
 229         sector_t                sector;
 230         u64                     logical;
 231         struct btrfs_device     *dev;
 232 };
 233
 234 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 235 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 236 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
 237 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 238 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 239 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 240                                      struct btrfs_fs_info *fs_info,
 241                                      struct scrub_block *original_sblock,
 242                                      u64 length, u64 logical,
 243                                      struct scrub_block *sblocks_for_recheck);
 244 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 245                                 struct scrub_block *sblock, int is_metadata,
 246                                 int have_csum, u8 *csum, u64 generation,
 247                                 u16 csum_size, int retry_failed_mirror);
 248 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 249                                          struct scrub_block *sblock,
 250                                          int is_metadata, int have_csum,
 251                                          const u8 *csum, u64 generation,
 252                                          u16 csum_size);
 253 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 254                                              struct scrub_block *sblock_good,
 255                                              int force_write);
 256 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 257                                             struct scrub_block *sblock_good,
 258                                             int page_num, int force_write);
 259 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 260 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 261                                            int page_num);
 262 static int scrub_checksum_data(struct scrub_block *sblock);
 263 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 264 static int scrub_checksum_super(struct scrub_block *sblock);
 265 static void scrub_block_get(struct scrub_block *sblock);
 266 static void scrub_block_put(struct scrub_block *sblock);
 267 static void scrub_page_get(struct scrub_page *spage);
 268 static void scrub_page_put(struct scrub_page *spage);
 269 static void scrub_parity_get(struct scrub_parity *sparity);
 270 static void scrub_parity_put(struct scrub_parity *sparity);
 271 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 272                                     struct scrub_page *spage);
 273 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 274                        u64 physical, struct btrfs_device *dev, u64 flags,
 275                        u64 gen, int mirror_num, u8 *csum, int force,
 276                        u64 physical_for_dev_replace);
 277 static void scrub_bio_end_io(struct bio *bio, int err);
 278 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 279 static void scrub_block_complete(struct scrub_block *sblock);
 280 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
 281                                u64 extent_logical, u64 extent_len,
 282                                u64 *extent_physical,
 283                                struct btrfs_device **extent_dev,
 284                                int *extent_mirror_num);
 285 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
 286                               struct scrub_wr_ctx *wr_ctx,
 287                               struct btrfs_fs_info *fs_info,
 288                               struct btrfs_device *dev,
 289                               int is_dev_replace);
 290 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
 291 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 292                                     struct scrub_page *spage);
 293 static void scrub_wr_submit(struct scrub_ctx *sctx);
 294 static void scrub_wr_bio_end_io(struct bio *bio, int err);
 295 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 296 static int write_page_nocow(struct scrub_ctx *sctx,
 297                             u64 physical_for_dev_replace, struct page *page);
 298 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 299                                       struct scrub_copy_nocow_ctx *ctx);
 300 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 301                             int mirror_num, u64 physical_for_dev_replace);
 302 static void copy_nocow_pages_worker(struct btrfs_work *work);
 303 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 304 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 305
 306
 307 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 308 {
 309         atomic_inc(&sctx->bios_in_flight);
 310 }
 311
 312 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 313 {
 314         atomic_dec(&sctx->bios_in_flight);
 315         wake_up(&sctx->list_wait);
 316 }
 317
 318 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 319 {
 320         while (atomic_read(&fs_info->scrub_pause_req)) {
 321                 mutex_unlock(&fs_info->scrub_lock);
 322                 wait_event(fs_info->scrub_pause_wait,
 323                    atomic_read(&fs_info->scrub_pause_req) == 0);
 324                 mutex_lock(&fs_info->scrub_lock);
 325         }
 326 }
 327
 328 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 329 {
 330         atomic_inc(&fs_info->scrubs_paused);
 331         wake_up(&fs_info->scrub_pause_wait);
 332
 333         mutex_lock(&fs_info->scrub_lock);
 334         __scrub_blocked_if_needed(fs_info);
 335         atomic_dec(&fs_info->scrubs_paused);
 336         mutex_unlock(&fs_info->scrub_lock);
 337
 338         wake_up(&fs_info->scrub_pause_wait);
 339 }
 340
 341 /*
 342  * used for workers that require transaction commits (i.e., for the
 343  * NOCOW case)
 344  */
 345 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 346 {
 347         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 348
 349         /*
 350          * increment scrubs_running to prevent cancel requests from
 351          * completing as long as a worker is running. we must also
 352          * increment scrubs_paused to prevent deadlocking on pause
 353          * requests used for transactions commits (as the worker uses a
 354          * transaction context). it is safe to regard the worker
 355          * as paused for all matters practical. effectively, we only
 356          * avoid cancellation requests from completing.
 357          */
 358         mutex_lock(&fs_info->scrub_lock);
 359         atomic_inc(&fs_info->scrubs_running);
 360         atomic_inc(&fs_info->scrubs_paused);
 361         mutex_unlock(&fs_info->scrub_lock);
 362
 363         /*
 364          * check if @scrubs_running=@scrubs_paused condition
 365          * inside wait_event() is not an atomic operation.
 366          * which means we may inc/dec @scrub_running/paused
 367          * at any time. Let's wake up @scrub_pause_wait as
 368          * much as we can to let commit transaction blocked less.
 369          */
 370         wake_up(&fs_info->scrub_pause_wait);
 371
 372         atomic_inc(&sctx->workers_pending);
 373 }
 374
 375 /* used for workers that require transaction commits */
 376 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 377 {
 378         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 379
 380         /*
 381          * see scrub_pending_trans_workers_inc() why we're pretending
 382          * to be paused in the scrub counters
 383          */
 384         mutex_lock(&fs_info->scrub_lock);
 385         atomic_dec(&fs_info->scrubs_running);
 386         atomic_dec(&fs_info->scrubs_paused);
 387         mutex_unlock(&fs_info->scrub_lock);
 388         atomic_dec(&sctx->workers_pending);
 389         wake_up(&fs_info->scrub_pause_wait);
 390         wake_up(&sctx->list_wait);
 391 }
 392
 393 static void scrub_free_csums(struct scrub_ctx *sctx)
 394 {
 395         while (!list_empty(&sctx->csum_list)) {
 396                 struct btrfs_ordered_sum *sum;
 397                 sum = list_first_entry(&sctx->csum_list,
 398                                        struct btrfs_ordered_sum, list);
 399                 list_del(&sum->list);
 400                 kfree(sum);
 401         }
 402 }
 403
 404 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 405 {
 406         int i;
 407
 408         if (!sctx)
 409                 return;
 410
 411         scrub_free_wr_ctx(&sctx->wr_ctx);
 412
 413         /* this can happen when scrub is cancelled */
 414         if (sctx->curr != -1) {
 415                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
 416
 417                 for (i = 0; i < sbio->page_count; i++) {
 418                         WARN_ON(!sbio->pagev[i]->page);
 419                         scrub_block_put(sbio->pagev[i]->sblock);
 420                 }
 421                 bio_put(sbio->bio);
 422         }
 423
 424         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 425                 struct scrub_bio *sbio = sctx->bios[i];
 426
 427                 if (!sbio)
 428                         break;
 429                 kfree(sbio);
 430         }
 431
 432         scrub_free_csums(sctx);
 433         kfree(sctx);
 434 }
 435
 436 static noinline_for_stack
 437 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 438 {
 439         struct scrub_ctx *sctx;
 440         int             i;
 441         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 442         int pages_per_rd_bio;
 443         int ret;
 444
 445         /*
 446          * the setting of pages_per_rd_bio is correct for scrub but might
 447          * be wrong for the dev_replace code where we might read from
 448          * different devices in the initial huge bios. However, that
 449          * code is able to correctly handle the case when adding a page
 450          * to a bio fails.
 451          */
 452         if (dev->bdev)
 453                 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
 454                                          bio_get_nr_vecs(dev->bdev));
 455         else
 456                 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 457         sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 458         if (!sctx)
 459                 goto nomem;
 460         sctx->is_dev_replace = is_dev_replace;
 461         sctx->pages_per_rd_bio = pages_per_rd_bio;
 462         sctx->curr = -1;
 463         sctx->dev_root = dev->dev_root;
 464         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 465                 struct scrub_bio *sbio;
 466
 467                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 468                 if (!sbio)
 469                         goto nomem;
 470                 sctx->bios[i] = sbio;
 471
 472                 sbio->index = i;
 473                 sbio->sctx = sctx;
 474                 sbio->page_count = 0;
 475                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
 476                                 scrub_bio_end_io_worker, NULL, NULL);
 477
 478                 if (i != SCRUB_BIOS_PER_SCTX - 1)
 479                         sctx->bios[i]->next_free = i + 1;
 480                 else
 481                         sctx->bios[i]->next_free = -1;
 482         }
 483         sctx->first_free = 0;
 484         sctx->nodesize = dev->dev_root->nodesize;
 485         sctx->sectorsize = dev->dev_root->sectorsize;
 486         atomic_set(&sctx->bios_in_flight, 0);
 487         atomic_set(&sctx->workers_pending, 0);
 488         atomic_set(&sctx->cancel_req, 0);
 489         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 490         INIT_LIST_HEAD(&sctx->csum_list);
 491
 492         spin_lock_init(&sctx->list_lock);
 493         spin_lock_init(&sctx->stat_lock);
 494         init_waitqueue_head(&sctx->list_wait);
 495
 496         ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
 497                                  fs_info->dev_replace.tgtdev, is_dev_replace);
 498         if (ret) {
 499                 scrub_free_ctx(sctx);
 500                 return ERR_PTR(ret);
 501         }
 502         return sctx;
 503
 504 nomem:
 505         scrub_free_ctx(sctx);
 506         return ERR_PTR(-ENOMEM);
 507 }
 508
 509 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 510                                      void *warn_ctx)
 511 {
 512         u64 isize;
 513         u32 nlink;
 514         int ret;
 515         int i;
 516         struct extent_buffer *eb;
 517         struct btrfs_inode_item *inode_item;
 518         struct scrub_warning *swarn = warn_ctx;
 519         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 520         struct inode_fs_paths *ipath = NULL;
 521         struct btrfs_root *local_root;
 522         struct btrfs_key root_key;
 523         struct btrfs_key key;
 524
 525         root_key.objectid = root;
 526         root_key.type = BTRFS_ROOT_ITEM_KEY;
 527         root_key.offset = (u64)-1;
 528         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 529         if (IS_ERR(local_root)) {
 530                 ret = PTR_ERR(local_root);
 531                 goto err;
 532         }
 533
 534         /*
 535          * this makes the path point to (inum INODE_ITEM ioff)
 536          */
 537         key.objectid = inum;
 538         key.type = BTRFS_INODE_ITEM_KEY;
 539         key.offset = 0;
 540
 541         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
 542         if (ret) {
 543                 btrfs_release_path(swarn->path);
 544                 goto err;
 545         }
 546
 547         eb = swarn->path->nodes[0];
 548         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 549                                         struct btrfs_inode_item);
 550         isize = btrfs_inode_size(eb, inode_item);
 551         nlink = btrfs_inode_nlink(eb, inode_item);
 552         btrfs_release_path(swarn->path);
 553
 554         ipath = init_ipath(4096, local_root, swarn->path);
 555         if (IS_ERR(ipath)) {
 556                 ret = PTR_ERR(ipath);
 557                 ipath = NULL;
 558                 goto err;
 559         }
 560         ret = paths_from_inode(inum, ipath);
 561
 562         if (ret < 0)
 563                 goto err;
 564
 565         /*
 566          * we deliberately ignore the bit ipath might have been too small to
 567          * hold all of the paths here
 568          */
 569         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 570                 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 571                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
 572                         "length %llu, links %u (path: %s)\n", swarn->errstr,
 573                         swarn->logical, rcu_str_deref(swarn->dev->name),
 574                         (unsigned long long)swarn->sector, root, inum, offset,
 575                         min(isize - offset, (u64)PAGE_SIZE), nlink,
 576                         (char *)(unsigned long)ipath->fspath->val[i]);
 577
 578         free_ipath(ipath);
 579         return 0;
 580
 581 err:
 582         printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 583                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 584                 "resolving failed with ret=%d\n", swarn->errstr,
 585                 swarn->logical, rcu_str_deref(swarn->dev->name),
 586                 (unsigned long long)swarn->sector, root, inum, offset, ret);
 587
 588         free_ipath(ipath);
 589         return 0;
 590 }
 591
 592 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 593 {
 594         struct btrfs_device *dev;
 595         struct btrfs_fs_info *fs_info;
 596         struct btrfs_path *path;
 597         struct btrfs_key found_key;
 598         struct extent_buffer *eb;
 599         struct btrfs_extent_item *ei;
 600         struct scrub_warning swarn;
 601         unsigned long ptr = 0;
 602         u64 extent_item_pos;
 603         u64 flags = 0;
 604         u64 ref_root;
 605         u32 item_size;
 606         u8 ref_level;
 607         int ret;
 608
 609         WARN_ON(sblock->page_count < 1);
 610         dev = sblock->pagev[0]->dev;
 611         fs_info = sblock->sctx->dev_root->fs_info;
 612
 613         path = btrfs_alloc_path();
 614         if (!path)
 615                 return;
 616
 617         swarn.sector = (sblock->pagev[0]->physical) >> 9;
 618         swarn.logical = sblock->pagev[0]->logical;
 619         swarn.errstr = errstr;
 620         swarn.dev = NULL;
 621
 622         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 623                                   &flags);
 624         if (ret < 0)
 625                 goto out;
 626
 627         extent_item_pos = swarn.logical - found_key.objectid;
 628         swarn.extent_item_size = found_key.offset;
 629
 630         eb = path->nodes[0];
 631         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 632         item_size = btrfs_item_size_nr(eb, path->slots[0]);
 633
 634         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 635                 do {
 636                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 637                                                       item_size, &ref_root,
 638                                                       &ref_level);
 639                         printk_in_rcu(KERN_WARNING
 640                                 "BTRFS: %s at logical %llu on dev %s, "
 641                                 "sector %llu: metadata %s (level %d) in tree "
 642                                 "%llu\n", errstr, swarn.logical,
 643                                 rcu_str_deref(dev->name),
 644                                 (unsigned long long)swarn.sector,
 645                                 ref_level ? "node" : "leaf",
 646                                 ret < 0 ? -1 : ref_level,
 647                                 ret < 0 ? -1 : ref_root);
 648                 } while (ret != 1);
 649                 btrfs_release_path(path);
 650         } else {
 651                 btrfs_release_path(path);
 652                 swarn.path = path;
 653                 swarn.dev = dev;
 654                 iterate_extent_inodes(fs_info, found_key.objectid,
 655                                         extent_item_pos, 1,
 656                                         scrub_print_warning_inode, &swarn);
 657         }
 658
 659 out:
 660         btrfs_free_path(path);
 661 }
 662
 663 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 664 {
 665         struct page *page = NULL;
 666         unsigned long index;
 667         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 668         int ret;
 669         int corrected = 0;
 670         struct btrfs_key key;
 671         struct inode *inode = NULL;
 672         struct btrfs_fs_info *fs_info;
 673         u64 end = offset + PAGE_SIZE - 1;
 674         struct btrfs_root *local_root;
 675         int srcu_index;
 676
 677         key.objectid = root;
 678         key.type = BTRFS_ROOT_ITEM_KEY;
 679         key.offset = (u64)-1;
 680
 681         fs_info = fixup->root->fs_info;
 682         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
 683
 684         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
 685         if (IS_ERR(local_root)) {
 686                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 687                 return PTR_ERR(local_root);
 688         }
 689
 690         key.type = BTRFS_INODE_ITEM_KEY;
 691         key.objectid = inum;
 692         key.offset = 0;
 693         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
 694         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 695         if (IS_ERR(inode))
 696                 return PTR_ERR(inode);
 697
 698         index = offset >> PAGE_CACHE_SHIFT;
 699
 700         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 701         if (!page) {
 702                 ret = -ENOMEM;
 703                 goto out;
 704         }
 705
 706         if (PageUptodate(page)) {
 707                 if (PageDirty(page)) {
 708                         /*
 709                          * we need to write the data to the defect sector. the
 710                          * data that was in that sector is not in memory,
 711                          * because the page was modified. we must not write the
 712                          * modified page to that sector.
 713                          *
 714                          * TODO: what could be done here: wait for the delalloc
 715                          *       runner to write out that page (might involve
 716                          *       COW) and see whether the sector is still
 717                          *       referenced afterwards.
 718                          *
 719                          * For the meantime, we'll treat this error
 720                          * incorrectable, although there is a chance that a
 721                          * later scrub will find the bad sector again and that
 722                          * there's no dirty page in memory, then.
 723                          */
 724                         ret = -EIO;
 725                         goto out;
 726                 }
 727                 ret = repair_io_failure(inode, offset, PAGE_SIZE,
 728                                         fixup->logical, page,
 729                                         offset - page_offset(page),
 730                                         fixup->mirror_num);
 731                 unlock_page(page);
 732                 corrected = !ret;
 733         } else {
 734                 /*
 735                  * we need to get good data first. the general readpage path
 736                  * will call repair_io_failure for us, we just have to make
 737                  * sure we read the bad mirror.
 738                  */
 739                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 740                                         EXTENT_DAMAGED, GFP_NOFS);
 741                 if (ret) {
 742                         /* set_extent_bits should give proper error */
 743                         WARN_ON(ret > 0);
 744                         if (ret > 0)
 745                                 ret = -EFAULT;
 746                         goto out;
 747                 }
 748
 749                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 750                                                 btrfs_get_extent,
 751                                                 fixup->mirror_num);
 752                 wait_on_page_locked(page);
 753
 754                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
 755                                                 end, EXTENT_DAMAGED, 0, NULL);
 756                 if (!corrected)
 757                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 758                                                 EXTENT_DAMAGED, GFP_NOFS);
 759         }
 760
 761 out:
 762         if (page)
 763                 put_page(page);
 764
 765         iput(inode);
 766
 767         if (ret < 0)
 768                 return ret;
 769
 770         if (ret == 0 && corrected) {
 771                 /*
 772                  * we only need to call readpage for one of the inodes belonging
 773                  * to this extent. so make iterate_extent_inodes stop
 774                  */
 775                 return 1;
 776         }
 777
 778         return -EIO;
 779 }
 780
 781 static void scrub_fixup_nodatasum(struct btrfs_work *work)
 782 {
 783         int ret;
 784         struct scrub_fixup_nodatasum *fixup;
 785         struct scrub_ctx *sctx;
 786         struct btrfs_trans_handle *trans = NULL;
 787         struct btrfs_path *path;
 788         int uncorrectable = 0;
 789
 790         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
 791         sctx = fixup->sctx;
 792
 793         path = btrfs_alloc_path();
 794         if (!path) {
 795                 spin_lock(&sctx->stat_lock);
 796                 ++sctx->stat.malloc_errors;
 797                 spin_unlock(&sctx->stat_lock);
 798                 uncorrectable = 1;
 799                 goto out;
 800         }
 801
 802         trans = btrfs_join_transaction(fixup->root);
 803         if (IS_ERR(trans)) {
 804                 uncorrectable = 1;
 805                 goto out;
 806         }
 807
 808         /*
 809          * the idea is to trigger a regular read through the standard path. we
 810          * read a page from the (failed) logical address by specifying the
 811          * corresponding copynum of the failed sector. thus, that readpage is
 812          * expected to fail.
 813          * that is the point where on-the-fly error correction will kick in
 814          * (once it's finished) and rewrite the failed sector if a good copy
 815          * can be found.
 816          */
 817         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
 818                                                 path, scrub_fixup_readpage,
 819                                                 fixup);
 820         if (ret < 0) {
 821                 uncorrectable = 1;
 822                 goto out;
 823         }
 824         WARN_ON(ret != 1);
 825
 826         spin_lock(&sctx->stat_lock);
 827         ++sctx->stat.corrected_errors;
 828         spin_unlock(&sctx->stat_lock);
 829
 830 out:
 831         if (trans && !IS_ERR(trans))
 832                 btrfs_end_transaction(trans, fixup->root);
 833         if (uncorrectable) {
 834                 spin_lock(&sctx->stat_lock);
 835                 ++sctx->stat.uncorrectable_errors;
 836                 spin_unlock(&sctx->stat_lock);
 837                 btrfs_dev_replace_stats_inc(
 838                         &sctx->dev_root->fs_info->dev_replace.
 839                         num_uncorrectable_read_errors);
 840                 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
 841                     "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 842                         fixup->logical, rcu_str_deref(fixup->dev->name));
 843         }
 844
 845         btrfs_free_path(path);
 846         kfree(fixup);
 847
 848         scrub_pending_trans_workers_dec(sctx);
 849 }
 850
/*
 * Take one more reference on @recover by atomically incrementing its
 * refs counter.  The counterpart is scrub_put_recover().
 */
static inline void scrub_get_recover(struct scrub_recover *recover)
{
	atomic_inc(&recover->refs);
}
 855
 856 static inline void scrub_put_recover(struct scrub_recover *recover)
 857 {
 858         if (atomic_dec_and_test(&recover->refs)) {
 859                 kfree(recover->bbio);
 860                 kfree(recover->raid_map);
 861                 kfree(recover);
 862         }
 863 }
 864
 865 /*
 866  * scrub_handle_errored_block gets called when either verification of the
 867  * pages failed or the bio failed to read, e.g. with EIO. In the latter
 868  * case, this function handles all pages in the bio, even though only one
 869  * may be bad.
 870  * The goal of this function is to repair the errored block by using the
 871  * contents of one of the mirrors.
 872  */
 873 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 874 {
 875         struct scrub_ctx *sctx = sblock_to_check->sctx;
 876         struct btrfs_device *dev;
 877         struct btrfs_fs_info *fs_info;
 878         u64 length;
 879         u64 logical;
 880         u64 generation;
 881         unsigned int failed_mirror_index;
 882         unsigned int is_metadata;
 883         unsigned int have_csum;
 884         u8 *csum;
 885         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 886         struct scrub_block *sblock_bad;
 887         int ret;
 888         int mirror_index;
 889         int page_num;
 890         int success;
 891         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 892                                       DEFAULT_RATELIMIT_BURST);
 893
 894         BUG_ON(sblock_to_check->page_count < 1);
 895         fs_info = sctx->dev_root->fs_info;
 896         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
 897                 /*
 898                  * if we find an error in a super block, we just report it.
 899                  * They will get written with the next transaction commit
 900                  * anyway
 901                  */
 902                 spin_lock(&sctx->stat_lock);
 903                 ++sctx->stat.super_errors;
 904                 spin_unlock(&sctx->stat_lock);
 905                 return 0;
 906         }
 907         length = sblock_to_check->page_count * PAGE_SIZE;
 908         logical = sblock_to_check->pagev[0]->logical;
 909         generation = sblock_to_check->pagev[0]->generation;
 910         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
 911         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
 912         is_metadata = !(sblock_to_check->pagev[0]->flags &
 913                         BTRFS_EXTENT_FLAG_DATA);
 914         have_csum = sblock_to_check->pagev[0]->have_csum;
 915         csum = sblock_to_check->pagev[0]->csum;
 916         dev = sblock_to_check->pagev[0]->dev;
 917
 918         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 919                 sblocks_for_recheck = NULL;
 920                 goto nodatasum_case;
 921         }
 922
 923         /*
 924          * read all mirrors one after the other. This includes to
 925          * re-read the extent or metadata block that failed (that was
 926          * the cause that this fixup code is called) another time,
 927          * page by page this time in order to know which pages
 928          * caused I/O errors and which ones are good (for all mirrors).
 929          * It is the goal to handle the situation when more than one
 930          * mirror contains I/O errors, but the errors do not
 931          * overlap, i.e. the data can be repaired by selecting the
 932          * pages from those mirrors without I/O error on the
 933          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
 934          * would be that mirror #1 has an I/O error on the first page,
 935          * the second page is good, and mirror #2 has an I/O error on
 936          * the second page, but the first page is good.
 937          * Then the first page of the first mirror can be repaired by
 938          * taking the first page of the second mirror, and the
 939          * second page of the second mirror can be repaired by
 940          * copying the contents of the 2nd page of the 1st mirror.
 941          * One more note: if the pages of one mirror contain I/O
 942          * errors, the checksum cannot be verified. In order to get
 943          * the best data for repairing, the first attempt is to find
 944          * a mirror without I/O errors and with a validated checksum.
 945          * Only if this is not possible, the pages are picked from
 946          * mirrors with I/O errors without considering the checksum.
 947          * If the latter is the case, at the end, the checksum of the
 948          * repaired area is verified in order to correctly maintain
 949          * the statistics.
 950          */
 951
 952         sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
 953                                      sizeof(*sblocks_for_recheck),
 954                                      GFP_NOFS);
 955         if (!sblocks_for_recheck) {
 956                 spin_lock(&sctx->stat_lock);
 957                 sctx->stat.malloc_errors++;
 958                 sctx->stat.read_errors++;
 959                 sctx->stat.uncorrectable_errors++;
 960                 spin_unlock(&sctx->stat_lock);
 961                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 962                 goto out;
 963         }
 964
 965         /* setup the context, map the logical blocks and alloc the pages */
 966         ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 967                                         logical, sblocks_for_recheck);
 968         if (ret) {
 969                 spin_lock(&sctx->stat_lock);
 970                 sctx->stat.read_errors++;
 971                 sctx->stat.uncorrectable_errors++;
 972                 spin_unlock(&sctx->stat_lock);
 973                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 974                 goto out;
 975         }
 976         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 977         sblock_bad = sblocks_for_recheck + failed_mirror_index;
 978
 979         /* build and submit the bios for the failed mirror, check checksums */
 980         scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
 981                             csum, generation, sctx->csum_size, 1);
 982
 983         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 984             sblock_bad->no_io_error_seen) {
 985                 /*
 986                  * the error disappeared after reading page by page, or
 987                  * the area was part of a huge bio and other parts of the
 988                  * bio caused I/O errors, or the block layer merged several
 989                  * read requests into one and the error is caused by a
 990                  * different bio (usually one of the two latter cases is
 991                  * the cause)
 992                  */
 993                 spin_lock(&sctx->stat_lock);
 994                 sctx->stat.unverified_errors++;
 995                 sblock_to_check->data_corrected = 1;
 996                 spin_unlock(&sctx->stat_lock);
 997
 998                 if (sctx->is_dev_replace)
 999                         scrub_write_block_to_dev_replace(sblock_bad);
1000                 goto out;
1001         }
1002
1003         if (!sblock_bad->no_io_error_seen) {
1004                 spin_lock(&sctx->stat_lock);
1005                 sctx->stat.read_errors++;
1006                 spin_unlock(&sctx->stat_lock);
1007                 if (__ratelimit(&_rs))
1008                         scrub_print_warning("i/o error", sblock_to_check);
1009                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1010         } else if (sblock_bad->checksum_error) {
1011                 spin_lock(&sctx->stat_lock);
1012                 sctx->stat.csum_errors++;
1013                 spin_unlock(&sctx->stat_lock);
1014                 if (__ratelimit(&_rs))
1015                         scrub_print_warning("checksum error", sblock_to_check);
1016                 btrfs_dev_stat_inc_and_print(dev,
1017                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1018         } else if (sblock_bad->header_error) {
1019                 spin_lock(&sctx->stat_lock);
1020                 sctx->stat.verify_errors++;
1021                 spin_unlock(&sctx->stat_lock);
1022                 if (__ratelimit(&_rs))
1023                         scrub_print_warning("checksum/header error",
1024                                             sblock_to_check);
1025                 if (sblock_bad->generation_error)
1026                         btrfs_dev_stat_inc_and_print(dev,
1027                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1028                 else
1029                         btrfs_dev_stat_inc_and_print(dev,
1030                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1031         }
1032
1033         if (sctx->readonly) {
1034                 ASSERT(!sctx->is_dev_replace);
1035                 goto out;
1036         }
1037
1038         if (!is_metadata && !have_csum) {
1039                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1040
1041 nodatasum_case:
1042                 WARN_ON(sctx->is_dev_replace);
1043
1044                 /*
1045                  * !is_metadata and !have_csum, this means that the data
1046                  * might not be COW'ed, that it might be modified
1047                  * concurrently. The general strategy to work on the
1048                  * commit root does not help in the case when COW is not
1049                  * used.
1050                  */
1051                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1052                 if (!fixup_nodatasum)
1053                         goto did_not_correct_error;
1054                 fixup_nodatasum->sctx = sctx;
1055                 fixup_nodatasum->dev = dev;
1056                 fixup_nodatasum->logical = logical;
1057                 fixup_nodatasum->root = fs_info->extent_root;
1058                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1059                 scrub_pending_trans_workers_inc(sctx);
1060                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1061                                 scrub_fixup_nodatasum, NULL, NULL);
1062                 btrfs_queue_work(fs_info->scrub_workers,
1063                                  &fixup_nodatasum->work);
1064                 goto out;
1065         }
1066
1067         /*
1068          * now build and submit the bios for the other mirrors, check
1069          * checksums.
1070          * First try to pick the mirror which is completely without I/O
1071          * errors and also does not have a checksum error.
1072          * If one is found, and if a checksum is present, the full block
1073          * that is known to contain an error is rewritten. Afterwards
1074          * the block is known to be corrected.
1075          * If a mirror is found which is completely correct, and no
1076          * checksum is present, only those pages are rewritten that had
1077          * an I/O error in the block to be repaired, since it cannot be
1078          * determined, which copy of the other pages is better (and it
1079          * could happen otherwise that a correct page would be
1080          * overwritten by a bad one).
1081          */
1082         for (mirror_index = 0;
1083              mirror_index < BTRFS_MAX_MIRRORS &&
1084              sblocks_for_recheck[mirror_index].page_count > 0;
1085              mirror_index++) {
1086                 struct scrub_block *sblock_other;
1087
1088                 if (mirror_index == failed_mirror_index)
1089                         continue;
1090                 sblock_other = sblocks_for_recheck + mirror_index;
1091
1092                 /* build and submit the bios, check checksums */
1093                 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1094                                     have_csum, csum, generation,
1095                                     sctx->csum_size, 0);
1096
1097                 if (!sblock_other->header_error &&
1098                     !sblock_other->checksum_error &&
1099                     sblock_other->no_io_error_seen) {
1100                         if (sctx->is_dev_replace) {
1101                                 scrub_write_block_to_dev_replace(sblock_other);
1102                         } else {
1103                                 int force_write = is_metadata || have_csum;
1104
1105                                 ret = scrub_repair_block_from_good_copy(
1106                                                 sblock_bad, sblock_other,
1107                                                 force_write);
1108                         }
1109                         if (0 == ret)
1110                                 goto corrected_error;
1111                 }
1112         }
1113
1114         /*
1115          * for dev_replace, pick good pages and write to the target device.
1116          */
1117         if (sctx->is_dev_replace) {
1118                 success = 1;
1119                 for (page_num = 0; page_num < sblock_bad->page_count;
1120                      page_num++) {
1121                         int sub_success;
1122
1123                         sub_success = 0;
1124                         for (mirror_index = 0;
1125                              mirror_index < BTRFS_MAX_MIRRORS &&
1126                              sblocks_for_recheck[mirror_index].page_count > 0;
1127                              mirror_index++) {
1128                                 struct scrub_block *sblock_other =
1129                                         sblocks_for_recheck + mirror_index;
1130                                 struct scrub_page *page_other =
1131                                         sblock_other->pagev[page_num];
1132
1133                                 if (!page_other->io_error) {
1134                                         ret = scrub_write_page_to_dev_replace(
1135                                                         sblock_other, page_num);
1136                                         if (ret == 0) {
1137                                                 /* succeeded for this page */
1138                                                 sub_success = 1;
1139                                                 break;
1140                                         } else {
1141                                                 btrfs_dev_replace_stats_inc(
1142                                                         &sctx->dev_root->
1143                                                         fs_info->dev_replace.
1144                                                         num_write_errors);
1145                                         }
1146                                 }
1147                         }
1148
1149                         if (!sub_success) {
1150                                 /*
1151                                  * did not find a mirror to fetch the page
1152                                  * from. scrub_write_page_to_dev_replace()
1153                                  * handles this case (page->io_error), by
1154                                  * filling the block with zeros before
1155                                  * submitting the write request
1156                                  */
1157                                 success = 0;
1158                                 ret = scrub_write_page_to_dev_replace(
1159                                                 sblock_bad, page_num);
1160                                 if (ret)
1161                                         btrfs_dev_replace_stats_inc(
1162                                                 &sctx->dev_root->fs_info->
1163                                                 dev_replace.num_write_errors);
1164                         }
1165                 }
1166
1167                 goto out;
1168         }
1169
1170         /*
1171          * for regular scrub, repair those pages that are errored.
1172          * In case of I/O errors in the area that is supposed to be
1173          * repaired, continue by picking good copies of those pages.
1174          * Select the good pages from mirrors to rewrite bad pages from
1175          * the area to fix. Afterwards verify the checksum of the block
1176          * that is supposed to be repaired. This verification step is
1177          * only done for the purpose of statistic counting and for the
1178          * final scrub report, whether errors remain.
1179          * A perfect algorithm could make use of the checksum and try
1180          * all possible combinations of pages from the different mirrors
1181          * until the checksum verification succeeds. For example, when
1182          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1183          * of mirror #2 is readable but the final checksum test fails,
1184          * then the 2nd page of mirror #3 could be tried, whether now
1185          * the final checksum succeedes. But this would be a rare
1186          * exception and is therefore not implemented. At least it is
1187          * avoided that the good copy is overwritten.
1188          * A more useful improvement would be to pick the sectors
1189          * without I/O error based on sector sizes (512 bytes on legacy
1190          * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1191          * mirror could be repaired by taking 512 byte of a different
1192          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1193          * area are unreadable.
1194          */
1195
1196         /* can only fix I/O errors from here on */
1197         if (sblock_bad->no_io_error_seen)
1198                 goto did_not_correct_error;
1199
1200         success = 1;
1201         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1202                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1203
1204                 if (!page_bad->io_error)
1205                         continue;
1206
1207                 for (mirror_index = 0;
1208                      mirror_index < BTRFS_MAX_MIRRORS &&
1209                      sblocks_for_recheck[mirror_index].page_count > 0;
1210                      mirror_index++) {
1211                         struct scrub_block *sblock_other = sblocks_for_recheck +
1212                                                            mirror_index;
1213                         struct scrub_page *page_other = sblock_other->pagev[
1214                                                         page_num];
1215
1216                         if (!page_other->io_error) {
1217                                 ret = scrub_repair_page_from_good_copy(
1218                                         sblock_bad, sblock_other, page_num, 0);
1219                                 if (0 == ret) {
1220                                         page_bad->io_error = 0;
1221                                         break; /* succeeded for this page */
1222                                 }
1223                         }
1224                 }
1225
1226                 if (page_bad->io_error) {
1227                         /* did not find a mirror to copy the page from */
1228                         success = 0;
1229                 }
1230         }
1231
1232         if (success) {
1233                 if (is_metadata || have_csum) {
1234                         /*
1235                          * need to verify the checksum now that all
1236                          * sectors on disk are repaired (the write
1237                          * request for data to be repaired is on its way).
1238                          * Just be lazy and use scrub_recheck_block()
1239                          * which re-reads the data before the checksum
1240                          * is verified, but most likely the data comes out
1241                          * of the page cache.
1242                          */
1243                         scrub_recheck_block(fs_info, sblock_bad,
1244                                             is_metadata, have_csum, csum,
1245                                             generation, sctx->csum_size, 1);
1246                         if (!sblock_bad->header_error &&
1247                             !sblock_bad->checksum_error &&
1248                             sblock_bad->no_io_error_seen)
1249                                 goto corrected_error;
1250                         else
1251                                 goto did_not_correct_error;
1252                 } else {
1253 corrected_error:
1254                         spin_lock(&sctx->stat_lock);
1255                         sctx->stat.corrected_errors++;
1256                         sblock_to_check->data_corrected = 1;
1257                         spin_unlock(&sctx->stat_lock);
1258                         printk_ratelimited_in_rcu(KERN_ERR
1259                                 "BTRFS: fixed up error at logical %llu on dev %s\n",
1260                                 logical, rcu_str_deref(dev->name));
1261                 }
1262         } else {
1263 did_not_correct_error:
1264                 spin_lock(&sctx->stat_lock);
1265                 sctx->stat.uncorrectable_errors++;
1266                 spin_unlock(&sctx->stat_lock);
1267                 printk_ratelimited_in_rcu(KERN_ERR
1268                         "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1269                         logical, rcu_str_deref(dev->name));
1270         }
1271
1272 out:
1273         if (sblocks_for_recheck) {
1274                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1275                      mirror_index++) {
1276                         struct scrub_block *sblock = sblocks_for_recheck +
1277                                                      mirror_index;
1278                         struct scrub_recover *recover;
1279                         int page_index;
1280
1281                         for (page_index = 0; page_index < sblock->page_count;
1282                              page_index++) {
1283                                 sblock->pagev[page_index]->sblock = NULL;
1284                                 recover = sblock->pagev[page_index]->recover;
1285                                 if (recover) {
1286                                         scrub_put_recover(recover);
1287                                         sblock->pagev[page_index]->recover =
1288                                                                         NULL;
1289                                 }
1290                                 scrub_page_put(sblock->pagev[page_index]);
1291                         }
1292                 }
1293                 kfree(sblocks_for_recheck);
1294         }
1295
1296         return 0;
1297 }
1298
1299 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
1300 {
1301         if (raid_map) {
1302                 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
1303                         return 3;
1304                 else
1305                         return 2;
1306         } else {
1307                 return (int)bbio->num_stripes;
1308         }
1309 }
1310
1311 static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1312                                                  u64 mapped_length,
1313                                                  int nstripes, int mirror,
1314                                                  int *stripe_index,
1315                                                  u64 *stripe_offset)
1316 {
1317         int i;
1318
1319         if (raid_map) {
1320                 /* RAID5/6 */
1321                 for (i = 0; i < nstripes; i++) {
1322                         if (raid_map[i] == RAID6_Q_STRIPE ||
1323                             raid_map[i] == RAID5_P_STRIPE)
1324                                 continue;
1325
1326                         if (logical >= raid_map[i] &&
1327                             logical < raid_map[i] + mapped_length)
1328                                 break;
1329                 }
1330
1331                 *stripe_index = i;
1332                 *stripe_offset = logical - raid_map[i];
1333         } else {
1334                 /* The other RAID type */
1335                 *stripe_index = mirror;
1336                 *stripe_offset = 0;
1337         }
1338 }
1339
1340 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1341                                      struct btrfs_fs_info *fs_info,
1342                                      struct scrub_block *original_sblock,
1343                                      u64 length, u64 logical,
1344                                      struct scrub_block *sblocks_for_recheck)
1345 {
1346         struct scrub_recover *recover;
1347         struct btrfs_bio *bbio;
1348         u64 *raid_map;
1349         u64 sublen;
1350         u64 mapped_length;
1351         u64 stripe_offset;
1352         int stripe_index;
1353         int page_index;
1354         int mirror_index;
1355         int nmirrors;
1356         int ret;
1357
1358         /*
1359          * note: the two members ref_count and outstanding_pages
1360          * are not used (and not set) in the blocks that are used for
1361          * the recheck procedure
1362          */
1363
1364         page_index = 0;
1365         while (length > 0) {
1366                 sublen = min_t(u64, length, PAGE_SIZE);
1367                 mapped_length = sublen;
1368                 bbio = NULL;
1369                 raid_map = NULL;
1370
1371                 /*
1372                  * with a length of PAGE_SIZE, each returned stripe
1373                  * represents one mirror
1374                  */
1375                 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1376                                        &mapped_length, &bbio, 0, &raid_map);
1377                 if (ret || !bbio || mapped_length < sublen) {
1378                         kfree(bbio);
1379                         kfree(raid_map);
1380                         return -EIO;
1381                 }
1382
1383                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1384                 if (!recover) {
1385                         kfree(bbio);
1386                         kfree(raid_map);
1387                         return -ENOMEM;
1388                 }
1389
1390                 atomic_set(&recover->refs, 1);
1391                 recover->bbio = bbio;
1392                 recover->raid_map = raid_map;
1393                 recover->map_length = mapped_length;
1394
1395                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1396
1397                 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
1398                 for (mirror_index = 0; mirror_index < nmirrors;
1399                      mirror_index++) {
1400                         struct scrub_block *sblock;
1401                         struct scrub_page *page;
1402
1403                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1404                                 continue;
1405
1406                         sblock = sblocks_for_recheck + mirror_index;
1407                         sblock->sctx = sctx;
1408                         page = kzalloc(sizeof(*page), GFP_NOFS);
1409                         if (!page) {
1410 leave_nomem:
1411                                 spin_lock(&sctx->stat_lock);
1412                                 sctx->stat.malloc_errors++;
1413                                 spin_unlock(&sctx->stat_lock);
1414                                 scrub_put_recover(recover);
1415                                 return -ENOMEM;
1416                         }
1417                         scrub_page_get(page);
1418                         sblock->pagev[page_index] = page;
1419                         page->logical = logical;
1420
1421                         scrub_stripe_index_and_offset(logical, raid_map,
1422                                                       mapped_length,
1423                                                       bbio->num_stripes,
1424                                                       mirror_index,
1425                                                       &stripe_index,
1426                                                       &stripe_offset);
1427                         page->physical = bbio->stripes[stripe_index].physical +
1428                                          stripe_offset;
1429                         page->dev = bbio->stripes[stripe_index].dev;
1430
1431                         BUG_ON(page_index >= original_sblock->page_count);
1432                         page->physical_for_dev_replace =
1433                                 original_sblock->pagev[page_index]->
1434                                 physical_for_dev_replace;
1435                         /* for missing devices, dev->bdev is NULL */
1436                         page->mirror_num = mirror_index + 1;
1437                         sblock->page_count++;
1438                         page->page = alloc_page(GFP_NOFS);
1439                         if (!page->page)
1440                                 goto leave_nomem;
1441
1442                         scrub_get_recover(recover);
1443                         page->recover = recover;
1444                 }
1445                 scrub_put_recover(recover);
1446                 length -= sublen;
1447                 logical += sublen;
1448                 page_index++;
1449         }
1450
1451         return 0;
1452 }
1453
/*
 * Wait context for a bio that is submitted and then waited for synchronously:
 * the end_io handler stores the status it was given in @error and signals
 * @event.
 */
struct scrub_bio_ret {
        struct completion event;        /* signalled by the end_io handler */
        int error;                      /* error value passed to end_io */
};
1458
1459 static void scrub_bio_wait_endio(struct bio *bio, int error)
1460 {
1461         struct scrub_bio_ret *ret = bio->bi_private;
1462
1463         ret->error = error;
1464         complete(&ret->event);
1465 }
1466
1467 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1468 {
1469         return page->recover && page->recover->raid_map;
1470 }
1471
1472 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1473                                         struct bio *bio,
1474                                         struct scrub_page *page)
1475 {
1476         struct scrub_bio_ret done;
1477         int ret;
1478
1479         init_completion(&done.event);
1480         done.error = 0;
1481         bio->bi_iter.bi_sector = page->logical >> 9;
1482         bio->bi_private = &done;
1483         bio->bi_end_io = scrub_bio_wait_endio;
1484
1485         ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1486                                     page->recover->raid_map,
1487                                     page->recover->map_length,
1488                                     page->mirror_num, 0);
1489         if (ret)
1490                 return ret;
1491
1492         wait_for_completion(&done.event);
1493         if (done.error)
1494                 return -EIO;
1495
1496         return 0;
1497 }
1498
1499 /*
1500  * this function will check the on disk data for checksum errors, header
1501  * errors and read I/O errors. If any I/O errors happen, the exact pages
1502  * which are errored are marked as being bad. The goal is to enable scrub
1503  * to take those pages that are not errored from all the mirrors so that
1504  * the pages that are errored in the just handled mirror can be repaired.
1505  */
1506 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1507                                 struct scrub_block *sblock, int is_metadata,
1508                                 int have_csum, u8 *csum, u64 generation,
1509                                 u16 csum_size, int retry_failed_mirror)
1510 {
1511         int page_num;
1512
1513         sblock->no_io_error_seen = 1;
1514         sblock->header_error = 0;
1515         sblock->checksum_error = 0;
1516
1517         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1518                 struct bio *bio;
1519                 struct scrub_page *page = sblock->pagev[page_num];
1520
1521                 if (page->dev->bdev == NULL) {
1522                         page->io_error = 1;
1523                         sblock->no_io_error_seen = 0;
1524                         continue;
1525                 }
1526
1527                 WARN_ON(!page->page);
1528                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1529                 if (!bio) {
1530                         page->io_error = 1;
1531                         sblock->no_io_error_seen = 0;
1532                         continue;
1533                 }
1534                 bio->bi_bdev = page->dev->bdev;
1535
1536                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1537                 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1538                         if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1539                                 sblock->no_io_error_seen = 0;
1540                 } else {
1541                         bio->bi_iter.bi_sector = page->physical >> 9;
1542
1543                         if (btrfsic_submit_bio_wait(READ, bio))
1544                                 sblock->no_io_error_seen = 0;
1545                 }
1546
1547                 bio_put(bio);
1548         }
1549
1550         if (sblock->no_io_error_seen)
1551                 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1552                                              have_csum, csum, generation,
1553                                              csum_size);
1554
1555         return;
1556 }
1557
1558 static inline int scrub_check_fsid(u8 fsid[],
1559                                    struct scrub_page *spage)
1560 {
1561         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1562         int ret;
1563
1564         ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1565         return !ret;
1566 }
1567
1568 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1569                                          struct scrub_block *sblock,
1570                                          int is_metadata, int have_csum,
1571                                          const u8 *csum, u64 generation,
1572                                          u16 csum_size)
1573 {
1574         int page_num;
1575         u8 calculated_csum[BTRFS_CSUM_SIZE];
1576         u32 crc = ~(u32)0;
1577         void *mapped_buffer;
1578
1579         WARN_ON(!sblock->pagev[0]->page);
1580         if (is_metadata) {
1581                 struct btrfs_header *h;
1582
1583                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1584                 h = (struct btrfs_header *)mapped_buffer;
1585
1586                 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1587                     !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1588                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1589                            BTRFS_UUID_SIZE)) {
1590                         sblock->header_error = 1;
1591                 } else if (generation != btrfs_stack_header_generation(h)) {
1592                         sblock->header_error = 1;
1593                         sblock->generation_error = 1;
1594                 }
1595                 csum = h->csum;
1596         } else {
1597                 if (!have_csum)
1598                         return;
1599
1600                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1601         }
1602
1603         for (page_num = 0;;) {
1604                 if (page_num == 0 && is_metadata)
1605                         crc = btrfs_csum_data(
1606                                 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1607                                 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1608                 else
1609                         crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1610
1611                 kunmap_atomic(mapped_buffer);
1612                 page_num++;
1613                 if (page_num >= sblock->page_count)
1614                         break;
1615                 WARN_ON(!sblock->pagev[page_num]->page);
1616
1617                 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1618         }
1619
1620         btrfs_csum_final(crc, calculated_csum);
1621         if (memcmp(calculated_csum, csum, csum_size))
1622                 sblock->checksum_error = 1;
1623 }
1624
1625 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1626                                              struct scrub_block *sblock_good,
1627                                              int force_write)
1628 {
1629         int page_num;
1630         int ret = 0;
1631
1632         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1633                 int ret_sub;
1634
1635                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1636                                                            sblock_good,
1637                                                            page_num,
1638                                                            force_write);
1639                 if (ret_sub)
1640                         ret = ret_sub;
1641         }
1642
1643         return ret;
1644 }
1645
1646 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1647                                             struct scrub_block *sblock_good,
1648                                             int page_num, int force_write)
1649 {
1650         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1651         struct scrub_page *page_good = sblock_good->pagev[page_num];
1652
1653         BUG_ON(page_bad->page == NULL);
1654         BUG_ON(page_good->page == NULL);
1655         if (force_write || sblock_bad->header_error ||
1656             sblock_bad->checksum_error || page_bad->io_error) {
1657                 struct bio *bio;
1658                 int ret;
1659
1660                 if (!page_bad->dev->bdev) {
1661                         printk_ratelimited(KERN_WARNING "BTRFS: "
1662                                 "scrub_repair_page_from_good_copy(bdev == NULL) "
1663                                 "is unexpected!\n");
1664                         return -EIO;
1665                 }
1666
1667                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1668                 if (!bio)
1669                         return -EIO;
1670                 bio->bi_bdev = page_bad->dev->bdev;
1671                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1672
1673                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1674                 if (PAGE_SIZE != ret) {
1675                         bio_put(bio);
1676                         return -EIO;
1677                 }
1678
1679                 if (btrfsic_submit_bio_wait(WRITE, bio)) {
1680                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1681                                 BTRFS_DEV_STAT_WRITE_ERRS);
1682                         btrfs_dev_replace_stats_inc(
1683                                 &sblock_bad->sctx->dev_root->fs_info->
1684                                 dev_replace.num_write_errors);
1685                         bio_put(bio);
1686                         return -EIO;
1687                 }
1688                 bio_put(bio);
1689         }
1690
1691         return 0;
1692 }
1693
1694 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1695 {
1696         int page_num;
1697
1698         /*
1699          * This block is used for the check of the parity on the source device,
1700          * so the data needn't be written into the destination device.
1701          */
1702         if (sblock->sparity)
1703                 return;
1704
1705         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1706                 int ret;
1707
1708                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1709                 if (ret)
1710                         btrfs_dev_replace_stats_inc(
1711                                 &sblock->sctx->dev_root->fs_info->dev_replace.
1712                                 num_write_errors);
1713         }
1714 }
1715
1716 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1717                                            int page_num)
1718 {
1719         struct scrub_page *spage = sblock->pagev[page_num];
1720
1721         BUG_ON(spage->page == NULL);
1722         if (spage->io_error) {
1723                 void *mapped_buffer = kmap_atomic(spage->page);
1724
1725                 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1726                 flush_dcache_page(spage->page);
1727                 kunmap_atomic(mapped_buffer);
1728         }
1729         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1730 }
1731
1732 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1733                                     struct scrub_page *spage)
1734 {
1735         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1736         struct scrub_bio *sbio;
1737         int ret;
1738
1739         mutex_lock(&wr_ctx->wr_lock);
1740 again:
1741         if (!wr_ctx->wr_curr_bio) {
1742                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1743                                               GFP_NOFS);
1744                 if (!wr_ctx->wr_curr_bio) {
1745                         mutex_unlock(&wr_ctx->wr_lock);
1746                         return -ENOMEM;
1747                 }
1748                 wr_ctx->wr_curr_bio->sctx = sctx;
1749                 wr_ctx->wr_curr_bio->page_count = 0;
1750         }
1751         sbio = wr_ctx->wr_curr_bio;
1752         if (sbio->page_count == 0) {
1753                 struct bio *bio;
1754
1755                 sbio->physical = spage->physical_for_dev_replace;
1756                 sbio->logical = spage->logical;
1757                 sbio->dev = wr_ctx->tgtdev;
1758                 bio = sbio->bio;
1759                 if (!bio) {
1760                         bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1761                         if (!bio) {
1762                                 mutex_unlock(&wr_ctx->wr_lock);
1763                                 return -ENOMEM;
1764                         }
1765                         sbio->bio = bio;
1766                 }
1767
1768                 bio->bi_private = sbio;
1769                 bio->bi_end_io = scrub_wr_bio_end_io;
1770                 bio->bi_bdev = sbio->dev->bdev;
1771                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1772                 sbio->err = 0;
1773         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1774                    spage->physical_for_dev_replace ||
1775                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1776                    spage->logical) {
1777                 scrub_wr_submit(sctx);
1778                 goto again;
1779         }
1780
1781         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1782         if (ret != PAGE_SIZE) {
1783                 if (sbio->page_count < 1) {
1784                         bio_put(sbio->bio);
1785                         sbio->bio = NULL;
1786                         mutex_unlock(&wr_ctx->wr_lock);
1787                         return -EIO;
1788                 }
1789                 scrub_wr_submit(sctx);
1790                 goto again;
1791         }
1792
1793         sbio->pagev[sbio->page_count] = spage;
1794         scrub_page_get(spage);
1795         sbio->page_count++;
1796         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1797                 scrub_wr_submit(sctx);
1798         mutex_unlock(&wr_ctx->wr_lock);
1799
1800         return 0;
1801 }
1802
1803 static void scrub_wr_submit(struct scrub_ctx *sctx)
1804 {
1805         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1806         struct scrub_bio *sbio;
1807
1808         if (!wr_ctx->wr_curr_bio)
1809                 return;
1810
1811         sbio = wr_ctx->wr_curr_bio;
1812         wr_ctx->wr_curr_bio = NULL;
1813         WARN_ON(!sbio->bio->bi_bdev);
1814         scrub_pending_bio_inc(sctx);
1815         /* process all writes in a single worker thread. Then the block layer
1816          * orders the requests before sending them to the driver which
1817          * doubled the write performance on spinning disks when measured
1818          * with Linux 3.5 */
1819         btrfsic_submit_bio(WRITE, sbio->bio);
1820 }
1821
1822 static void scrub_wr_bio_end_io(struct bio *bio, int err)
1823 {
1824         struct scrub_bio *sbio = bio->bi_private;
1825         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1826
1827         sbio->err = err;
1828         sbio->bio = bio;
1829
1830         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1831                          scrub_wr_bio_end_io_worker, NULL, NULL);
1832         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1833 }
1834
1835 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1836 {
1837         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1838         struct scrub_ctx *sctx = sbio->sctx;
1839         int i;
1840
1841         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1842         if (sbio->err) {
1843                 struct btrfs_dev_replace *dev_replace =
1844                         &sbio->sctx->dev_root->fs_info->dev_replace;
1845
1846                 for (i = 0; i < sbio->page_count; i++) {
1847                         struct scrub_page *spage = sbio->pagev[i];
1848
1849                         spage->io_error = 1;
1850                         btrfs_dev_replace_stats_inc(&dev_replace->
1851                                                     num_write_errors);
1852                 }
1853         }
1854
1855         for (i = 0; i < sbio->page_count; i++)
1856                 scrub_page_put(sbio->pagev[i]);
1857
1858         bio_put(sbio->bio);
1859         kfree(sbio);
1860         scrub_pending_bio_dec(sctx);
1861 }
1862
1863 static int scrub_checksum(struct scrub_block *sblock)
1864 {
1865         u64 flags;
1866         int ret;
1867
1868         WARN_ON(sblock->page_count < 1);
1869         flags = sblock->pagev[0]->flags;
1870         ret = 0;
1871         if (flags & BTRFS_EXTENT_FLAG_DATA)
1872                 ret = scrub_checksum_data(sblock);
1873         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1874                 ret = scrub_checksum_tree_block(sblock);
1875         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1876                 (void)scrub_checksum_super(sblock);
1877         else
1878                 WARN_ON(1);
1879         if (ret)
1880                 scrub_handle_errored_block(sblock);
1881
1882         return ret;
1883 }
1884
1885 static int scrub_checksum_data(struct scrub_block *sblock)
1886 {
1887         struct scrub_ctx *sctx = sblock->sctx;
1888         u8 csum[BTRFS_CSUM_SIZE];
1889         u8 *on_disk_csum;
1890         struct page *page;
1891         void *buffer;
1892         u32 crc = ~(u32)0;
1893         int fail = 0;
1894         u64 len;
1895         int index;
1896
1897         BUG_ON(sblock->page_count < 1);
1898         if (!sblock->pagev[0]->have_csum)
1899                 return 0;
1900
1901         on_disk_csum = sblock->pagev[0]->csum;
1902         page = sblock->pagev[0]->page;
1903         buffer = kmap_atomic(page);
1904
1905         len = sctx->sectorsize;
1906         index = 0;
1907         for (;;) {
1908                 u64 l = min_t(u64, len, PAGE_SIZE);
1909
1910                 crc = btrfs_csum_data(buffer, crc, l);
1911                 kunmap_atomic(buffer);
1912                 len -= l;
1913                 if (len == 0)
1914                         break;
1915                 index++;
1916                 BUG_ON(index >= sblock->page_count);
1917                 BUG_ON(!sblock->pagev[index]->page);
1918                 page = sblock->pagev[index]->page;
1919                 buffer = kmap_atomic(page);
1920         }
1921
1922         btrfs_csum_final(crc, csum);
1923         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1924                 fail = 1;
1925
1926         return fail;
1927 }
1928
1929 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1930 {
1931         struct scrub_ctx *sctx = sblock->sctx;
1932         struct btrfs_header *h;
1933         struct btrfs_root *root = sctx->dev_root;
1934         struct btrfs_fs_info *fs_info = root->fs_info;
1935         u8 calculated_csum[BTRFS_CSUM_SIZE];
1936         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1937         struct page *page;
1938         void *mapped_buffer;
1939         u64 mapped_size;
1940         void *p;
1941         u32 crc = ~(u32)0;
1942         int fail = 0;
1943         int crc_fail = 0;
1944         u64 len;
1945         int index;
1946
1947         BUG_ON(sblock->page_count < 1);
1948         page = sblock->pagev[0]->page;
1949         mapped_buffer = kmap_atomic(page);
1950         h = (struct btrfs_header *)mapped_buffer;
1951         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1952
1953         /*
1954          * we don't use the getter functions here, as we
1955          * a) don't have an extent buffer and
1956          * b) the page is already kmapped
1957          */
1958
1959         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1960                 ++fail;
1961
1962         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1963                 ++fail;
1964
1965         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1966                 ++fail;
1967
1968         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1969                    BTRFS_UUID_SIZE))
1970                 ++fail;
1971
1972         len = sctx->nodesize - BTRFS_CSUM_SIZE;
1973         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1974         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1975         index = 0;
1976         for (;;) {
1977                 u64 l = min_t(u64, len, mapped_size);
1978
1979                 crc = btrfs_csum_data(p, crc, l);
1980                 kunmap_atomic(mapped_buffer);
1981                 len -= l;
1982                 if (len == 0)
1983                         break;
1984                 index++;
1985                 BUG_ON(index >= sblock->page_count);
1986                 BUG_ON(!sblock->pagev[index]->page);
1987                 page = sblock->pagev[index]->page;
1988                 mapped_buffer = kmap_atomic(page);
1989                 mapped_size = PAGE_SIZE;
1990                 p = mapped_buffer;
1991         }
1992
1993         btrfs_csum_final(crc, calculated_csum);
1994         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1995                 ++crc_fail;
1996
1997         return fail || crc_fail;
1998 }
1999
2000 static int scrub_checksum_super(struct scrub_block *sblock)
2001 {
2002         struct btrfs_super_block *s;
2003         struct scrub_ctx *sctx = sblock->sctx;
2004         u8 calculated_csum[BTRFS_CSUM_SIZE];
2005         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2006         struct page *page;
2007         void *mapped_buffer;
2008         u64 mapped_size;
2009         void *p;
2010         u32 crc = ~(u32)0;
2011         int fail_gen = 0;
2012         int fail_cor = 0;
2013         u64 len;
2014         int index;
2015
2016         BUG_ON(sblock->page_count < 1);
2017         page = sblock->pagev[0]->page;
2018         mapped_buffer = kmap_atomic(page);
2019         s = (struct btrfs_super_block *)mapped_buffer;
2020         memcpy(on_disk_csum, s->csum, sctx->csum_size);
2021
2022         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
2023                 ++fail_cor;
2024
2025         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
2026                 ++fail_gen;
2027
2028         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2029                 ++fail_cor;
2030
2031         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2032         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2033         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2034         index = 0;
2035         for (;;) {
2036                 u64 l = min_t(u64, len, mapped_size);
2037
2038                 crc = btrfs_csum_data(p, crc, l);
2039                 kunmap_atomic(mapped_buffer);
2040                 len -= l;
2041                 if (len == 0)
2042                         break;
2043                 index++;
2044                 BUG_ON(index >= sblock->page_count);
2045                 BUG_ON(!sblock->pagev[index]->page);
2046                 page = sblock->pagev[index]->page;
2047                 mapped_buffer = kmap_atomic(page);
2048                 mapped_size = PAGE_SIZE;
2049                 p = mapped_buffer;
2050         }
2051
2052         btrfs_csum_final(crc, calculated_csum);
2053         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2054                 ++fail_cor;
2055
2056         if (fail_cor + fail_gen) {
2057                 /*
2058                  * if we find an error in a super block, we just report it.
2059                  * They will get written with the next transaction commit
2060                  * anyway
2061                  */
2062                 spin_lock(&sctx->stat_lock);
2063                 ++sctx->stat.super_errors;
2064                 spin_unlock(&sctx->stat_lock);
2065                 if (fail_cor)
2066                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2067                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2068                 else
2069                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2070                                 BTRFS_DEV_STAT_GENERATION_ERRS);
2071         }
2072
2073         return fail_cor + fail_gen;
2074 }
2075
/* Take an additional reference on @sblock; dropped with scrub_block_put(). */
static void scrub_block_get(struct scrub_block *sblock)
{
        atomic_inc(&sblock->ref_count);
}
2080
2081 static void scrub_block_put(struct scrub_block *sblock)
2082 {
2083         if (atomic_dec_and_test(&sblock->ref_count)) {
2084                 int i;
2085
2086                 if (sblock->sparity)
2087                         scrub_parity_put(sblock->sparity);
2088
2089                 for (i = 0; i < sblock->page_count; i++)
2090                         scrub_page_put(sblock->pagev[i]);
2091                 kfree(sblock);
2092         }
2093 }
2094
/* Take an additional reference on @spage; dropped with scrub_page_put(). */
static void scrub_page_get(struct scrub_page *spage)
{
        atomic_inc(&spage->ref_count);
}
2099
2100 static void scrub_page_put(struct scrub_page *spage)
2101 {
2102         if (atomic_dec_and_test(&spage->ref_count)) {
2103                 if (spage->page)
2104                         __free_page(spage->page);
2105                 kfree(spage);
2106         }
2107 }
2108
2109 static void scrub_submit(struct scrub_ctx *sctx)
2110 {
2111         struct scrub_bio *sbio;
2112
2113         if (sctx->curr == -1)
2114                 return;
2115
2116         sbio = sctx->bios[sctx->curr];
2117         sctx->curr = -1;
2118         scrub_pending_bio_inc(sctx);
2119
2120         if (!sbio->bio->bi_bdev) {
2121                 /*
2122                  * this case should not happen. If btrfs_map_block() is
2123                  * wrong, it could happen for dev-replace operations on
2124                  * missing devices when no mirrors are available, but in
2125                  * this case it should already fail the mount.
2126                  * This case is handled correctly (but _very_ slowly).
2127                  */
2128                 printk_ratelimited(KERN_WARNING
2129                         "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
2130                 bio_endio(sbio->bio, -EIO);
2131         } else {
2132                 btrfsic_submit_bio(READ, sbio->bio);
2133         }
2134 }
2135
2136 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2137                                     struct scrub_page *spage)
2138 {
2139         struct scrub_block *sblock = spage->sblock;
2140         struct scrub_bio *sbio;
2141         int ret;
2142
2143 again:
2144         /*
2145          * grab a fresh bio or wait for one to become available
2146          */
2147         while (sctx->curr == -1) {
2148                 spin_lock(&sctx->list_lock);
2149                 sctx->curr = sctx->first_free;
2150                 if (sctx->curr != -1) {
2151                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2152                         sctx->bios[sctx->curr]->next_free = -1;
2153                         sctx->bios[sctx->curr]->page_count = 0;
2154                         spin_unlock(&sctx->list_lock);
2155                 } else {
2156                         spin_unlock(&sctx->list_lock);
2157                         wait_event(sctx->list_wait, sctx->first_free != -1);
2158                 }
2159         }
2160         sbio = sctx->bios[sctx->curr];
2161         if (sbio->page_count == 0) {
2162                 struct bio *bio;
2163
2164                 sbio->physical = spage->physical;
2165                 sbio->logical = spage->logical;
2166                 sbio->dev = spage->dev;
2167                 bio = sbio->bio;
2168                 if (!bio) {
2169                         bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
2170                         if (!bio)
2171                                 return -ENOMEM;
2172                         sbio->bio = bio;
2173                 }
2174
2175                 bio->bi_private = sbio;
2176                 bio->bi_end_io = scrub_bio_end_io;
2177                 bio->bi_bdev = sbio->dev->bdev;
2178                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2179                 sbio->err = 0;
2180         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2181                    spage->physical ||
2182                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2183                    spage->logical ||
2184                    sbio->dev != spage->dev) {
2185                 scrub_submit(sctx);
2186                 goto again;
2187         }
2188
2189         sbio->pagev[sbio->page_count] = spage;
2190         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2191         if (ret != PAGE_SIZE) {
2192                 if (sbio->page_count < 1) {
2193                         bio_put(sbio->bio);
2194                         sbio->bio = NULL;
2195                         return -EIO;
2196                 }
2197                 scrub_submit(sctx);
2198                 goto again;
2199         }
2200
2201         scrub_block_get(sblock); /* one for the page added to the bio */
2202         atomic_inc(&sblock->outstanding_pages);
2203         sbio->page_count++;
2204         if (sbio->page_count == sctx->pages_per_rd_bio)
2205                 scrub_submit(sctx);
2206
2207         return 0;
2208 }
2209
2210 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2211                        u64 physical, struct btrfs_device *dev, u64 flags,
2212                        u64 gen, int mirror_num, u8 *csum, int force,
2213                        u64 physical_for_dev_replace)
2214 {
2215         struct scrub_block *sblock;
2216         int index;
2217
2218         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2219         if (!sblock) {
2220                 spin_lock(&sctx->stat_lock);
2221                 sctx->stat.malloc_errors++;
2222                 spin_unlock(&sctx->stat_lock);
2223                 return -ENOMEM;
2224         }
2225
2226         /* one ref inside this function, plus one for each page added to
2227          * a bio later on */
2228         atomic_set(&sblock->ref_count, 1);
2229         sblock->sctx = sctx;
2230         sblock->no_io_error_seen = 1;
2231
2232         for (index = 0; len > 0; index++) {
2233                 struct scrub_page *spage;
2234                 u64 l = min_t(u64, len, PAGE_SIZE);
2235
2236                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2237                 if (!spage) {
2238 leave_nomem:
2239                         spin_lock(&sctx->stat_lock);
2240                         sctx->stat.malloc_errors++;
2241                         spin_unlock(&sctx->stat_lock);
2242                         scrub_block_put(sblock);
2243                         return -ENOMEM;
2244                 }
2245                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2246                 scrub_page_get(spage);
2247                 sblock->pagev[index] = spage;
2248                 spage->sblock = sblock;
2249                 spage->dev = dev;
2250                 spage->flags = flags;
2251                 spage->generation = gen;
2252                 spage->logical = logical;
2253                 spage->physical = physical;
2254                 spage->physical_for_dev_replace = physical_for_dev_replace;
2255                 spage->mirror_num = mirror_num;
2256                 if (csum) {
2257                         spage->have_csum = 1;
2258                         memcpy(spage->csum, csum, sctx->csum_size);
2259                 } else {
2260                         spage->have_csum = 0;
2261                 }
2262                 sblock->page_count++;
2263                 spage->page = alloc_page(GFP_NOFS);
2264                 if (!spage->page)
2265                         goto leave_nomem;
2266                 len -= l;
2267                 logical += l;
2268                 physical += l;
2269                 physical_for_dev_replace += l;
2270         }
2271
2272         WARN_ON(sblock->page_count == 0);
2273         for (index = 0; index < sblock->page_count; index++) {
2274                 struct scrub_page *spage = sblock->pagev[index];
2275                 int ret;
2276
2277                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2278                 if (ret) {
2279                         scrub_block_put(sblock);
2280                         return ret;
2281                 }
2282         }
2283
2284         if (force)
2285                 scrub_submit(sctx);
2286
2287         /* last one frees, either here or in bio completion for last page */
2288         scrub_block_put(sblock);
2289         return 0;
2290 }
2291
2292 static void scrub_bio_end_io(struct bio *bio, int err)
2293 {
2294         struct scrub_bio *sbio = bio->bi_private;
2295         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2296
2297         sbio->err = err;
2298         sbio->bio = bio;
2299
2300         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2301 }
2302
2303 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2304 {
2305         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2306         struct scrub_ctx *sctx = sbio->sctx;
2307         int i;
2308
2309         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2310         if (sbio->err) {
2311                 for (i = 0; i < sbio->page_count; i++) {
2312                         struct scrub_page *spage = sbio->pagev[i];
2313
2314                         spage->io_error = 1;
2315                         spage->sblock->no_io_error_seen = 0;
2316                 }
2317         }
2318
2319         /* now complete the scrub_block items that have all pages completed */
2320         for (i = 0; i < sbio->page_count; i++) {
2321                 struct scrub_page *spage = sbio->pagev[i];
2322                 struct scrub_block *sblock = spage->sblock;
2323
2324                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2325                         scrub_block_complete(sblock);
2326                 scrub_block_put(sblock);
2327         }
2328
2329         bio_put(sbio->bio);
2330         sbio->bio = NULL;
2331         spin_lock(&sctx->list_lock);
2332         sbio->next_free = sctx->first_free;
2333         sctx->first_free = sbio->index;
2334         spin_unlock(&sctx->list_lock);
2335
2336         if (sctx->is_dev_replace &&
2337             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2338                 mutex_lock(&sctx->wr_ctx.wr_lock);
2339                 scrub_wr_submit(sctx);
2340                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2341         }
2342
2343         scrub_pending_bio_dec(sctx);
2344 }
2345
2346 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2347                                        unsigned long *bitmap,
2348                                        u64 start, u64 len)
2349 {
2350         int offset;
2351         int nsectors;
2352         int sectorsize = sparity->sctx->dev_root->sectorsize;
2353
2354         if (len >= sparity->stripe_len) {
2355                 bitmap_set(bitmap, 0, sparity->nsectors);
2356                 return;
2357         }
2358
2359         start -= sparity->logic_start;
2360         offset = (int)do_div(start, sparity->stripe_len);
2361         offset /= sectorsize;
2362         nsectors = (int)len / sectorsize;
2363
2364         if (offset + nsectors <= sparity->nsectors) {
2365                 bitmap_set(bitmap, offset, nsectors);
2366                 return;
2367         }
2368
2369         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2370         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2371 }
2372
2373 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2374                                                    u64 start, u64 len)
2375 {
2376         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2377 }
2378
2379 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2380                                                   u64 start, u64 len)
2381 {
2382         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2383 }
2384
2385 static void scrub_block_complete(struct scrub_block *sblock)
2386 {
2387         int corrupted = 0;
2388
2389         if (!sblock->no_io_error_seen) {
2390                 corrupted = 1;
2391                 scrub_handle_errored_block(sblock);
2392         } else {
2393                 /*
2394                  * if has checksum error, write via repair mechanism in
2395                  * dev replace case, otherwise write here in dev replace
2396                  * case.
2397                  */
2398                 corrupted = scrub_checksum(sblock);
2399                 if (!corrupted && sblock->sctx->is_dev_replace)
2400                         scrub_write_block_to_dev_replace(sblock);
2401         }
2402
2403         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2404                 u64 start = sblock->pagev[0]->logical;
2405                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2406                           PAGE_SIZE;
2407
2408                 scrub_parity_mark_sectors_error(sblock->sparity,
2409                                                 start, end - start);
2410         }
2411 }
2412
2413 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2414                            u8 *csum)
2415 {
2416         struct btrfs_ordered_sum *sum = NULL;
2417         unsigned long index;
2418         unsigned long num_sectors;
2419
2420         while (!list_empty(&sctx->csum_list)) {
2421                 sum = list_first_entry(&sctx->csum_list,
2422                                        struct btrfs_ordered_sum, list);
2423                 if (sum->bytenr > logical)
2424                         return 0;
2425                 if (sum->bytenr + sum->len > logical)
2426                         break;
2427
2428                 ++sctx->stat.csum_discards;
2429                 list_del(&sum->list);
2430                 kfree(sum);
2431                 sum = NULL;
2432         }
2433         if (!sum)
2434                 return 0;
2435
2436         index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2437         num_sectors = sum->len / sctx->sectorsize;
2438         memcpy(csum, sum->sums + index, sctx->csum_size);
2439         if (index == num_sectors - 1) {
2440                 list_del(&sum->list);
2441                 kfree(sum);
2442         }
2443         return 1;
2444 }
2445
2446 /* scrub extent tries to collect up to 64 kB for each bio */
2447 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2448                         u64 physical, struct btrfs_device *dev, u64 flags,
2449                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2450 {
2451         int ret;
2452         u8 csum[BTRFS_CSUM_SIZE];
2453         u32 blocksize;
2454
2455         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2456                 blocksize = sctx->sectorsize;
2457                 spin_lock(&sctx->stat_lock);
2458                 sctx->stat.data_extents_scrubbed++;
2459                 sctx->stat.data_bytes_scrubbed += len;
2460                 spin_unlock(&sctx->stat_lock);
2461         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2462                 blocksize = sctx->nodesize;
2463                 spin_lock(&sctx->stat_lock);
2464                 sctx->stat.tree_extents_scrubbed++;
2465                 sctx->stat.tree_bytes_scrubbed += len;
2466                 spin_unlock(&sctx->stat_lock);
2467         } else {
2468                 blocksize = sctx->sectorsize;
2469                 WARN_ON(1);
2470         }
2471
2472         while (len) {
2473                 u64 l = min_t(u64, len, blocksize);
2474                 int have_csum = 0;
2475
2476                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2477                         /* push csums to sbio */
2478                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2479                         if (have_csum == 0)
2480                                 ++sctx->stat.no_csum;
2481                         if (sctx->is_dev_replace && !have_csum) {
2482                                 ret = copy_nocow_pages(sctx, logical, l,
2483                                                        mirror_num,
2484                                                       physical_for_dev_replace);
2485                                 goto behind_scrub_pages;
2486                         }
2487                 }
2488                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2489                                   mirror_num, have_csum ? csum : NULL, 0,
2490                                   physical_for_dev_replace);
2491 behind_scrub_pages:
2492                 if (ret)
2493                         return ret;
2494                 len -= l;
2495                 logical += l;
2496                 physical += l;
2497                 physical_for_dev_replace += l;
2498         }
2499         return 0;
2500 }
2501
2502 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2503                                   u64 logical, u64 len,
2504                                   u64 physical, struct btrfs_device *dev,
2505                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2506 {
2507         struct scrub_ctx *sctx = sparity->sctx;
2508         struct scrub_block *sblock;
2509         int index;
2510
2511         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2512         if (!sblock) {
2513                 spin_lock(&sctx->stat_lock);
2514                 sctx->stat.malloc_errors++;
2515                 spin_unlock(&sctx->stat_lock);
2516                 return -ENOMEM;
2517         }
2518
2519         /* one ref inside this function, plus one for each page added to
2520          * a bio later on */
2521         atomic_set(&sblock->ref_count, 1);
2522         sblock->sctx = sctx;
2523         sblock->no_io_error_seen = 1;
2524         sblock->sparity = sparity;
2525         scrub_parity_get(sparity);
2526
2527         for (index = 0; len > 0; index++) {
2528                 struct scrub_page *spage;
2529                 u64 l = min_t(u64, len, PAGE_SIZE);
2530
2531                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2532                 if (!spage) {
2533 leave_nomem:
2534                         spin_lock(&sctx->stat_lock);
2535                         sctx->stat.malloc_errors++;
2536                         spin_unlock(&sctx->stat_lock);
2537                         scrub_block_put(sblock);
2538                         return -ENOMEM;
2539                 }
2540                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2541                 /* For scrub block */
2542                 scrub_page_get(spage);
2543                 sblock->pagev[index] = spage;
2544                 /* For scrub parity */
2545                 scrub_page_get(spage);
2546                 list_add_tail(&spage->list, &sparity->spages);
2547                 spage->sblock = sblock;
2548                 spage->dev = dev;
2549                 spage->flags = flags;
2550                 spage->generation = gen;
2551                 spage->logical = logical;
2552                 spage->physical = physical;
2553                 spage->mirror_num = mirror_num;
2554                 if (csum) {
2555                         spage->have_csum = 1;
2556                         memcpy(spage->csum, csum, sctx->csum_size);
2557                 } else {
2558                         spage->have_csum = 0;
2559                 }
2560                 sblock->page_count++;
2561                 spage->page = alloc_page(GFP_NOFS);
2562                 if (!spage->page)
2563                         goto leave_nomem;
2564                 len -= l;
2565                 logical += l;
2566                 physical += l;
2567         }
2568
2569         WARN_ON(sblock->page_count == 0);
2570         for (index = 0; index < sblock->page_count; index++) {
2571                 struct scrub_page *spage = sblock->pagev[index];
2572                 int ret;
2573
2574                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2575                 if (ret) {
2576                         scrub_block_put(sblock);
2577                         return ret;
2578                 }
2579         }
2580
2581         /* last one frees, either here or in bio completion for last page */
2582         scrub_block_put(sblock);
2583         return 0;
2584 }
2585
2586 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2587                                    u64 logical, u64 len,
2588                                    u64 physical, struct btrfs_device *dev,
2589                                    u64 flags, u64 gen, int mirror_num)
2590 {
2591         struct scrub_ctx *sctx = sparity->sctx;
2592         int ret;
2593         u8 csum[BTRFS_CSUM_SIZE];
2594         u32 blocksize;
2595
2596         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2597                 blocksize = sctx->sectorsize;
2598         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2599                 blocksize = sctx->nodesize;
2600         } else {
2601                 blocksize = sctx->sectorsize;
2602                 WARN_ON(1);
2603         }
2604
2605         while (len) {
2606                 u64 l = min_t(u64, len, blocksize);
2607                 int have_csum = 0;
2608
2609                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2610                         /* push csums to sbio */
2611                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2612                         if (have_csum == 0)
2613                                 goto skip;
2614                 }
2615                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2616                                              flags, gen, mirror_num,
2617                                              have_csum ? csum : NULL);
2618                 if (ret)
2619                         return ret;
2620 skip:
2621                 len -= l;
2622                 logical += l;
2623                 physical += l;
2624         }
2625         return 0;
2626 }
2627
2628 /*
2629  * Given a physical address, this will calculate it's
2630  * logical offset. if this is a parity stripe, it will return
2631  * the most left data stripe's logical offset.
2632  *
2633  * return 0 if it is a data stripe, 1 means parity stripe.
2634  */
2635 static int get_raid56_logic_offset(u64 physical, int num,
2636                                    struct map_lookup *map, u64 *offset,
2637                                    u64 *stripe_start)
2638 {
2639         int i;
2640         int j = 0;
2641         u64 stripe_nr;
2642         u64 last_offset;
2643         int stripe_index;
2644         int rot;
2645
2646         last_offset = (physical - map->stripes[num].physical) *
2647                       nr_data_stripes(map);
2648         if (stripe_start)
2649                 *stripe_start = last_offset;
2650
2651         *offset = last_offset;
2652         for (i = 0; i < nr_data_stripes(map); i++) {
2653                 *offset = last_offset + i * map->stripe_len;
2654
2655                 stripe_nr = *offset;
2656                 do_div(stripe_nr, map->stripe_len);
2657                 do_div(stripe_nr, nr_data_stripes(map));
2658
2659                 /* Work out the disk rotation on this stripe-set */
2660                 rot = do_div(stripe_nr, map->num_stripes);
2661                 /* calculate which stripe this data locates */
2662                 rot += i;
2663                 stripe_index = rot % map->num_stripes;
2664                 if (stripe_index == num)
2665                         return 0;
2666                 if (stripe_index < num)
2667                         j++;
2668         }
2669         *offset = last_offset + j * map->stripe_len;
2670         return 1;
2671 }
2672
2673 static void scrub_free_parity(struct scrub_parity *sparity)
2674 {
2675         struct scrub_ctx *sctx = sparity->sctx;
2676         struct scrub_page *curr, *next;
2677         int nbits;
2678
2679         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2680         if (nbits) {
2681                 spin_lock(&sctx->stat_lock);
2682                 sctx->stat.read_errors += nbits;
2683                 sctx->stat.uncorrectable_errors += nbits;
2684                 spin_unlock(&sctx->stat_lock);
2685         }
2686
2687         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2688                 list_del_init(&curr->list);
2689                 scrub_page_put(curr);
2690         }
2691
2692         kfree(sparity);
2693 }
2694
2695 static void scrub_parity_bio_endio(struct bio *bio, int error)
2696 {
2697         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2698         struct scrub_ctx *sctx = sparity->sctx;
2699
2700         if (error)
2701                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2702                           sparity->nsectors);
2703
2704         scrub_free_parity(sparity);
2705         scrub_pending_bio_dec(sctx);
2706         bio_put(bio);
2707 }
2708
2709 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2710 {
2711         struct scrub_ctx *sctx = sparity->sctx;
2712         struct bio *bio;
2713         struct btrfs_raid_bio *rbio;
2714         struct scrub_page *spage;
2715         struct btrfs_bio *bbio = NULL;
2716         u64 *raid_map = NULL;
2717         u64 length;
2718         int ret;
2719
2720         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2721                            sparity->nsectors))
2722                 goto out;
2723
2724         length = sparity->logic_end - sparity->logic_start + 1;
2725         ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2726                                sparity->logic_start,
2727                                &length, &bbio, 0, &raid_map);
2728         if (ret || !bbio || !raid_map)
2729                 goto bbio_out;
2730
2731         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2732         if (!bio)
2733                 goto bbio_out;
2734
2735         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2736         bio->bi_private = sparity;
2737         bio->bi_end_io = scrub_parity_bio_endio;
2738
2739         rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2740                                               raid_map, length,
2741                                               sparity->scrub_dev,
2742                                               sparity->dbitmap,
2743                                               sparity->nsectors);
2744         if (!rbio)
2745                 goto rbio_out;
2746
2747         list_for_each_entry(spage, &sparity->spages, list)
2748                 raid56_parity_add_scrub_pages(rbio, spage->page,
2749                                               spage->logical);
2750
2751         scrub_pending_bio_inc(sctx);
2752         raid56_parity_submit_scrub_rbio(rbio);
2753         return;
2754
2755 rbio_out:
2756         bio_put(bio);
2757 bbio_out:
2758         kfree(bbio);
2759         kfree(raid_map);
2760         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2761                   sparity->nsectors);
2762         spin_lock(&sctx->stat_lock);
2763         sctx->stat.malloc_errors++;
2764         spin_unlock(&sctx->stat_lock);
2765 out:
2766         scrub_free_parity(sparity);
2767 }
2768
2769 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2770 {
2771         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2772 }
2773
2774 static void scrub_parity_get(struct scrub_parity *sparity)
2775 {
2776         atomic_inc(&sparity->ref_count);
2777 }
2778
2779 static void scrub_parity_put(struct scrub_parity *sparity)
2780 {
2781         if (!atomic_dec_and_test(&sparity->ref_count))
2782                 return;
2783
2784         scrub_parity_check_and_repair(sparity);
2785 }
2786
2787 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2788                                                   struct map_lookup *map,
2789                                                   struct btrfs_device *sdev,
2790                                                   struct btrfs_path *path,
2791                                                   u64 logic_start,
2792                                                   u64 logic_end)
2793 {
2794         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2795         struct btrfs_root *root = fs_info->extent_root;
2796         struct btrfs_root *csum_root = fs_info->csum_root;
2797         struct btrfs_extent_item *extent;
2798         u64 flags;
2799         int ret;
2800         int slot;
2801         struct extent_buffer *l;
2802         struct btrfs_key key;
2803         u64 generation;
2804         u64 extent_logical;
2805         u64 extent_physical;
2806         u64 extent_len;
2807         struct btrfs_device *extent_dev;
2808         struct scrub_parity *sparity;
2809         int nsectors;
2810         int bitmap_len;
2811         int extent_mirror_num;
2812         int stop_loop = 0;
2813
2814         nsectors = map->stripe_len / root->sectorsize;
2815         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2816         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2817                           GFP_NOFS);
2818         if (!sparity) {
2819                 spin_lock(&sctx->stat_lock);
2820                 sctx->stat.malloc_errors++;
2821                 spin_unlock(&sctx->stat_lock);
2822                 return -ENOMEM;
2823         }
2824
2825         sparity->stripe_len = map->stripe_len;
2826         sparity->nsectors = nsectors;
2827         sparity->sctx = sctx;
2828         sparity->scrub_dev = sdev;
2829         sparity->logic_start = logic_start;
2830         sparity->logic_end = logic_end;
2831         atomic_set(&sparity->ref_count, 1);
2832         INIT_LIST_HEAD(&sparity->spages);
2833         sparity->dbitmap = sparity->bitmap;
2834         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2835
2836         ret = 0;
2837         while (logic_start < logic_end) {
2838                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2839                         key.type = BTRFS_METADATA_ITEM_KEY;
2840                 else
2841                         key.type = BTRFS_EXTENT_ITEM_KEY;
2842                 key.objectid = logic_start;
2843                 key.offset = (u64)-1;
2844
2845                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2846                 if (ret < 0)
2847                         goto out;
2848
2849                 if (ret > 0) {
2850                         ret = btrfs_previous_extent_item(root, path, 0);
2851                         if (ret < 0)
2852                                 goto out;
2853                         if (ret > 0) {
2854                                 btrfs_release_path(path);
2855                                 ret = btrfs_search_slot(NULL, root, &key,
2856                                                         path, 0, 0);
2857                                 if (ret < 0)
2858                                         goto out;
2859                         }
2860                 }
2861
2862                 stop_loop = 0;
2863                 while (1) {
2864                         u64 bytes;
2865
2866                         l = path->nodes[0];
2867                         slot = path->slots[0];
2868                         if (slot >= btrfs_header_nritems(l)) {
2869                                 ret = btrfs_next_leaf(root, path);
2870                                 if (ret == 0)
2871                                         continue;
2872                                 if (ret < 0)
2873                                         goto out;
2874
2875                                 stop_loop = 1;
2876                                 break;
2877                         }
2878                         btrfs_item_key_to_cpu(l, &key, slot);
2879
2880                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2881                                 bytes = root->nodesize;
2882                         else
2883                                 bytes = key.offset;
2884
2885                         if (key.objectid + bytes <= logic_start)
2886                                 goto next;
2887
2888                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2889                             key.type != BTRFS_METADATA_ITEM_KEY)
2890                                 goto next;
2891
2892                         if (key.objectid > logic_end) {
2893                                 stop_loop = 1;
2894                                 break;
2895                         }
2896
2897                         while (key.objectid >= logic_start + map->stripe_len)
2898                                 logic_start += map->stripe_len;
2899
2900                         extent = btrfs_item_ptr(l, slot,
2901                                                 struct btrfs_extent_item);
2902                         flags = btrfs_extent_flags(l, extent);
2903                         generation = btrfs_extent_generation(l, extent);
2904
2905                         if (key.objectid < logic_start &&
2906                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2907                                 btrfs_err(fs_info,
2908                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2909                                            key.objectid, logic_start);
2910                                 goto next;
2911                         }
2912 again:
2913                         extent_logical = key.objectid;
2914                         extent_len = bytes;
2915
2916                         if (extent_logical < logic_start) {
2917                                 extent_len -= logic_start - extent_logical;
2918                                 extent_logical = logic_start;
2919                         }
2920
2921                         if (extent_logical + extent_len >
2922                             logic_start + map->stripe_len)
2923                                 extent_len = logic_start + map->stripe_len -
2924                                              extent_logical;
2925
2926                         scrub_parity_mark_sectors_data(sparity, extent_logical,
2927                                                        extent_len);
2928
2929                         scrub_remap_extent(fs_info, extent_logical,
2930                                            extent_len, &extent_physical,
2931                                            &extent_dev,
2932                                            &extent_mirror_num);
2933
2934                         ret = btrfs_lookup_csums_range(csum_root,
2935                                                 extent_logical,
2936                                                 extent_logical + extent_len - 1,
2937                                                 &sctx->csum_list, 1);
2938                         if (ret)
2939                                 goto out;
2940
2941                         ret = scrub_extent_for_parity(sparity, extent_logical,
2942                                                       extent_len,
2943                                                       extent_physical,
2944                                                       extent_dev, flags,
2945                                                       generation,
2946                                                       extent_mirror_num);
2947                         if (ret)
2948                                 goto out;
2949
2950                         scrub_free_csums(sctx);
2951                         if (extent_logical + extent_len <
2952                             key.objectid + bytes) {
2953                                 logic_start += map->stripe_len;
2954
2955                                 if (logic_start >= logic_end) {
2956                                         stop_loop = 1;
2957                                         break;
2958                                 }
2959
2960                                 if (logic_start < key.objectid + bytes) {
2961                                         cond_resched();
2962                                         goto again;
2963                                 }
2964                         }
2965 next:
2966                         path->slots[0]++;
2967                 }
2968
2969                 btrfs_release_path(path);
2970
2971                 if (stop_loop)
2972                         break;
2973
2974                 logic_start += map->stripe_len;
2975         }
2976 out:
2977         if (ret < 0)
2978                 scrub_parity_mark_sectors_error(sparity, logic_start,
2979                                                 logic_end - logic_start + 1);
2980         scrub_parity_put(sparity);
2981         scrub_submit(sctx);
2982         mutex_lock(&sctx->wr_ctx.wr_lock);
2983         scrub_wr_submit(sctx);
2984         mutex_unlock(&sctx->wr_ctx.wr_lock);
2985
2986         btrfs_release_path(path);
2987         return ret < 0 ? ret : 0;
2988 }
2989
2990 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2991                                            struct map_lookup *map,
2992                                            struct btrfs_device *scrub_dev,
2993                                            int num, u64 base, u64 length,
2994                                            int is_dev_replace)
2995 {
2996         struct btrfs_path *path, *ppath;
2997         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2998         struct btrfs_root *root = fs_info->extent_root;
2999         struct btrfs_root *csum_root = fs_info->csum_root;
3000         struct btrfs_extent_item *extent;
3001         struct blk_plug plug;
3002         u64 flags;
3003         int ret;
3004         int slot;
3005         u64 nstripes;
3006         struct extent_buffer *l;
3007         struct btrfs_key key;
3008         u64 physical;
3009         u64 logical;
3010         u64 logic_end;
3011         u64 physical_end;
3012         u64 generation;
3013         int mirror_num;
3014         struct reada_control *reada1;
3015         struct reada_control *reada2;
3016         struct btrfs_key key_start;
3017         struct btrfs_key key_end;
3018         u64 increment = map->stripe_len;
3019         u64 offset;
3020         u64 extent_logical;
3021         u64 extent_physical;
3022         u64 extent_len;
3023         u64 stripe_logical;
3024         u64 stripe_end;
3025         struct btrfs_device *extent_dev;
3026         int extent_mirror_num;
3027         int stop_loop = 0;
3028
3029         nstripes = length;
3030         physical = map->stripes[num].physical;
3031         offset = 0;
3032         do_div(nstripes, map->stripe_len);
3033         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3034                 offset = map->stripe_len * num;
3035                 increment = map->stripe_len * map->num_stripes;
3036                 mirror_num = 1;
3037         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3038                 int factor = map->num_stripes / map->sub_stripes;
3039                 offset = map->stripe_len * (num / map->sub_stripes);
3040                 increment = map->stripe_len * factor;
3041                 mirror_num = num % map->sub_stripes + 1;
3042         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3043                 increment = map->stripe_len;
3044                 mirror_num = num % map->num_stripes + 1;
3045         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3046                 increment = map->stripe_len;
3047                 mirror_num = num % map->num_stripes + 1;
3048         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3049                                 BTRFS_BLOCK_GROUP_RAID6)) {
3050                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3051                 increment = map->stripe_len * nr_data_stripes(map);
3052                 mirror_num = 1;
3053         } else {
3054                 increment = map->stripe_len;
3055                 mirror_num = 1;
3056         }
3057
3058         path = btrfs_alloc_path();
3059         if (!path)
3060                 return -ENOMEM;
3061
3062         ppath = btrfs_alloc_path();
3063         if (!ppath) {
3064                 btrfs_free_path(ppath);
3065                 return -ENOMEM;
3066         }
3067
3068         /*
3069          * work on commit root. The related disk blocks are static as
3070          * long as COW is applied. This means, it is save to rewrite
3071          * them to repair disk errors without any race conditions
3072          */
3073         path->search_commit_root = 1;
3074         path->skip_locking = 1;
3075
3076         /*
3077          * trigger the readahead for extent tree csum tree and wait for
3078          * completion. During readahead, the scrub is officially paused
3079          * to not hold off transaction commits
3080          */
3081         logical = base + offset;
3082         physical_end = physical + nstripes * map->stripe_len;
3083         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3084                          BTRFS_BLOCK_GROUP_RAID6)) {
3085                 get_raid56_logic_offset(physical_end, num,
3086                                         map, &logic_end, NULL);
3087                 logic_end += base;
3088         } else {
3089                 logic_end = logical + increment * nstripes;
3090         }
3091         wait_event(sctx->list_wait,
3092                    atomic_read(&sctx->bios_in_flight) == 0);
3093         scrub_blocked_if_needed(fs_info);
3094
3095         /* FIXME it might be better to start readahead at commit root */
3096         key_start.objectid = logical;
3097         key_start.type = BTRFS_EXTENT_ITEM_KEY;
3098         key_start.offset = (u64)0;
3099         key_end.objectid = logic_end;
3100         key_end.type = BTRFS_METADATA_ITEM_KEY;
3101         key_end.offset = (u64)-1;
3102         reada1 = btrfs_reada_add(root, &key_start, &key_end);
3103
3104         key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3105         key_start.type = BTRFS_EXTENT_CSUM_KEY;
3106         key_start.offset = logical;
3107         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3108         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3109         key_end.offset = logic_end;
3110         reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
3111
3112         if (!IS_ERR(reada1))
3113                 btrfs_reada_wait(reada1);
3114         if (!IS_ERR(reada2))
3115                 btrfs_reada_wait(reada2);
3116
3117
3118         /*
3119          * collect all data csums for the stripe to avoid seeking during
3120          * the scrub. This might currently (crc32) end up to be about 1MB
3121          */
3122         blk_start_plug(&plug);
3123
3124         /*
3125          * now find all extents for each stripe and scrub them
3126          */
3127         ret = 0;
3128         while (physical < physical_end) {
3129                 /* for raid56, we skip parity stripe */
3130                 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3131                                 BTRFS_BLOCK_GROUP_RAID6)) {
3132                         ret = get_raid56_logic_offset(physical, num,
3133                                         map, &logical, &stripe_logical);
3134                         logical += base;
3135                         if (ret) {
3136                                 stripe_logical += base;
3137                                 stripe_end = stripe_logical + increment - 1;
3138                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3139                                                 ppath, stripe_logical,
3140                                                 stripe_end);
3141                                 if (ret)
3142                                         goto out;
3143                                 goto skip;
3144                         }
3145                 }
3146                 /*
3147                  * canceled?
3148                  */
3149                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3150                     atomic_read(&sctx->cancel_req)) {
3151                         ret = -ECANCELED;
3152                         goto out;
3153                 }
3154                 /*
3155                  * check to see if we have to pause
3156                  */
3157                 if (atomic_read(&fs_info->scrub_pause_req)) {
3158                         /* push queued extents */
3159                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3160                         scrub_submit(sctx);
3161                         mutex_lock(&sctx->wr_ctx.wr_lock);
3162                         scrub_wr_submit(sctx);
3163                         mutex_unlock(&sctx->wr_ctx.wr_lock);
3164                         wait_event(sctx->list_wait,
3165                                    atomic_read(&sctx->bios_in_flight) == 0);
3166                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3167                         scrub_blocked_if_needed(fs_info);
3168                 }
3169
3170                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3171                         key.type = BTRFS_METADATA_ITEM_KEY;
3172                 else
3173                         key.type = BTRFS_EXTENT_ITEM_KEY;
3174                 key.objectid = logical;
3175                 key.offset = (u64)-1;
3176
3177                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3178                 if (ret < 0)
3179                         goto out;
3180
3181                 if (ret > 0) {
3182                         ret = btrfs_previous_extent_item(root, path, 0);
3183                         if (ret < 0)
3184                                 goto out;
3185                         if (ret > 0) {
3186                                 /* there's no smaller item, so stick with the
3187                                  * larger one */
3188                                 btrfs_release_path(path);
3189                                 ret = btrfs_search_slot(NULL, root, &key,
3190                                                         path, 0, 0);
3191                                 if (ret < 0)
3192                                         goto out;
3193                         }
3194                 }
3195
3196                 stop_loop = 0;
3197                 while (1) {
3198                         u64 bytes;
3199
3200                         l = path->nodes[0];
3201                         slot = path->slots[0];
3202                         if (slot >= btrfs_header_nritems(l)) {
3203                                 ret = btrfs_next_leaf(root, path);
3204                                 if (ret == 0)
3205                                         continue;
3206                                 if (ret < 0)
3207                                         goto out;
3208
3209                                 stop_loop = 1;
3210                                 break;
3211                         }
3212                         btrfs_item_key_to_cpu(l, &key, slot);
3213
3214                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3215                                 bytes = root->nodesize;
3216                         else
3217                                 bytes = key.offset;
3218
3219                         if (key.objectid + bytes <= logical)
3220                                 goto next;
3221
3222                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3223                             key.type != BTRFS_METADATA_ITEM_KEY)
3224                                 goto next;
3225
3226                         if (key.objectid >= logical + map->stripe_len) {
3227                                 /* out of this device extent */
3228                                 if (key.objectid >= logic_end)
3229                                         stop_loop = 1;
3230                                 break;
3231                         }
3232
3233                         extent = btrfs_item_ptr(l, slot,
3234                                                 struct btrfs_extent_item);
3235                         flags = btrfs_extent_flags(l, extent);
3236                         generation = btrfs_extent_generation(l, extent);
3237
3238                         if (key.objectid < logical &&
3239                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
3240                                 btrfs_err(fs_info,
3241                                            "scrub: tree block %llu spanning "
3242                                            "stripes, ignored. logical=%llu",
3243                                        key.objectid, logical);
3244                                 goto next;
3245                         }
3246
3247 again:
3248                         extent_logical = key.objectid;
3249                         extent_len = bytes;
3250
3251                         /*
3252                          * trim extent to this stripe
3253                          */
3254                         if (extent_logical < logical) {
3255                                 extent_len -= logical - extent_logical;
3256                                 extent_logical = logical;
3257                         }
3258                         if (extent_logical + extent_len >
3259                             logical + map->stripe_len) {
3260                                 extent_len = logical + map->stripe_len -
3261                                              extent_logical;
3262                         }
3263
3264                         extent_physical = extent_logical - logical + physical;
3265                         extent_dev = scrub_dev;
3266                         extent_mirror_num = mirror_num;
3267                         if (is_dev_replace)
3268                                 scrub_remap_extent(fs_info, extent_logical,
3269                                                    extent_len, &extent_physical,
3270                                                    &extent_dev,
3271                                                    &extent_mirror_num);
3272
3273                         ret = btrfs_lookup_csums_range(csum_root, logical,
3274                                                 logical + map->stripe_len - 1,
3275                                                 &sctx->csum_list, 1);
3276                         if (ret)
3277                                 goto out;
3278
3279                         ret = scrub_extent(sctx, extent_logical, extent_len,
3280                                            extent_physical, extent_dev, flags,
3281                                            generation, extent_mirror_num,
3282                                            extent_logical - logical + physical);
3283                         if (ret)
3284                                 goto out;
3285
3286                         scrub_free_csums(sctx);
3287                         if (extent_logical + extent_len <
3288                             key.objectid + bytes) {
3289                                 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3290                                         BTRFS_BLOCK_GROUP_RAID6)) {
3291                                         /*
3292                                          * loop until we find next data stripe
3293                                          * or we have finished all stripes.
3294                                          */
3295 loop:
3296                                         physical += map->stripe_len;
3297                                         ret = get_raid56_logic_offset(physical,
3298                                                         num, map, &logical,
3299                                                         &stripe_logical);
3300                                         logical += base;
3301
3302                                         if (ret && physical < physical_end) {
3303                                                 stripe_logical += base;
3304                                                 stripe_end = stripe_logical +
3305                                                                 increment - 1;
3306                                                 ret = scrub_raid56_parity(sctx,
3307                                                         map, scrub_dev, ppath,
3308                                                         stripe_logical,
3309                                                         stripe_end);
3310                                                 if (ret)
3311                                                         goto out;
3312                                                 goto loop;
3313                                         }
3314                                 } else {
3315                                         physical += map->stripe_len;
3316                                         logical += increment;
3317                                 }
3318                                 if (logical < key.objectid + bytes) {
3319                                         cond_resched();
3320                                         goto again;
3321                                 }
3322
3323                                 if (physical >= physical_end) {
3324                                         stop_loop = 1;
3325                                         break;
3326                                 }
3327                         }
3328 next:
3329                         path->slots[0]++;
3330                 }
3331                 btrfs_release_path(path);
3332 skip:
3333                 logical += increment;
3334                 physical += map->stripe_len;
3335                 spin_lock(&sctx->stat_lock);
3336                 if (stop_loop)
3337                         sctx->stat.last_physical = map->stripes[num].physical +
3338                                                    length;
3339                 else
3340                         sctx->stat.last_physical = physical;
3341                 spin_unlock(&sctx->stat_lock);
3342                 if (stop_loop)
3343                         break;
3344         }
3345 out:
3346         /* push queued extents */
3347         scrub_submit(sctx);
3348         mutex_lock(&sctx->wr_ctx.wr_lock);
3349         scrub_wr_submit(sctx);
3350         mutex_unlock(&sctx->wr_ctx.wr_lock);
3351
3352         blk_finish_plug(&plug);
3353         btrfs_free_path(path);
3354         btrfs_free_path(ppath);
3355         return ret < 0 ? ret : 0;
3356 }
3357
3358 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3359                                           struct btrfs_device *scrub_dev,
3360                                           u64 chunk_tree, u64 chunk_objectid,
3361                                           u64 chunk_offset, u64 length,
3362                                           u64 dev_offset, int is_dev_replace)
3363 {
3364         struct btrfs_mapping_tree *map_tree =
3365                 &sctx->dev_root->fs_info->mapping_tree;
3366         struct map_lookup *map;
3367         struct extent_map *em;
3368         int i;
3369         int ret = 0;
3370
3371         read_lock(&map_tree->map_tree.lock);
3372         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3373         read_unlock(&map_tree->map_tree.lock);
3374
3375         if (!em)
3376                 return -EINVAL;
3377
3378         map = (struct map_lookup *)em->bdev;
3379         if (em->start != chunk_offset)
3380                 goto out;
3381
3382         if (em->len < length)
3383                 goto out;
3384
3385         for (i = 0; i < map->num_stripes; ++i) {
3386                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3387                     map->stripes[i].physical == dev_offset) {
3388                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3389                                            chunk_offset, length,
3390                                            is_dev_replace);
3391                         if (ret)
3392                                 goto out;
3393                 }
3394         }
3395 out:
3396         free_extent_map(em);
3397
3398         return ret;
3399 }
3400
3401 static noinline_for_stack
3402 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3403                            struct btrfs_device *scrub_dev, u64 start, u64 end,
3404                            int is_dev_replace)
3405 {
3406         struct btrfs_dev_extent *dev_extent = NULL;
3407         struct btrfs_path *path;
3408         struct btrfs_root *root = sctx->dev_root;
3409         struct btrfs_fs_info *fs_info = root->fs_info;
3410         u64 length;
3411         u64 chunk_tree;
3412         u64 chunk_objectid;
3413         u64 chunk_offset;
3414         int ret;
3415         int slot;
3416         struct extent_buffer *l;
3417         struct btrfs_key key;
3418         struct btrfs_key found_key;
3419         struct btrfs_block_group_cache *cache;
3420         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3421
3422         path = btrfs_alloc_path();
3423         if (!path)
3424                 return -ENOMEM;
3425
3426         path->reada = 2;
3427         path->search_commit_root = 1;
3428         path->skip_locking = 1;
3429
3430         key.objectid = scrub_dev->devid;
3431         key.offset = 0ull;
3432         key.type = BTRFS_DEV_EXTENT_KEY;
3433
3434         while (1) {
3435                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3436                 if (ret < 0)
3437                         break;
3438                 if (ret > 0) {
3439                         if (path->slots[0] >=
3440                             btrfs_header_nritems(path->nodes[0])) {
3441                                 ret = btrfs_next_leaf(root, path);
3442                                 if (ret)
3443                                         break;
3444                         }
3445                 }
3446
3447                 l = path->nodes[0];
3448                 slot = path->slots[0];
3449
3450                 btrfs_item_key_to_cpu(l, &found_key, slot);
3451
3452                 if (found_key.objectid != scrub_dev->devid)
3453                         break;
3454
3455                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3456                         break;
3457
3458                 if (found_key.offset >= end)
3459                         break;
3460
3461                 if (found_key.offset < key.offset)
3462                         break;
3463
3464                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3465                 length = btrfs_dev_extent_length(l, dev_extent);
3466
3467                 if (found_key.offset + length <= start)
3468                         goto skip;
3469
3470                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3471                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3472                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3473
3474                 /*
3475                  * get a reference on the corresponding block group to prevent
3476                  * the chunk from going away while we scrub it
3477                  */
3478                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3479
3480                 /* some chunks are removed but not committed to disk yet,
3481                  * continue scrubbing */
3482                 if (!cache)
3483                         goto skip;
3484
3485                 dev_replace->cursor_right = found_key.offset + length;
3486                 dev_replace->cursor_left = found_key.offset;
3487                 dev_replace->item_needs_writeback = 1;
3488                 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
3489                                   chunk_offset, length, found_key.offset,
3490                                   is_dev_replace);
3491
3492                 /*
3493                  * flush, submit all pending read and write bios, afterwards
3494                  * wait for them.
3495                  * Note that in the dev replace case, a read request causes
3496                  * write requests that are submitted in the read completion
3497                  * worker. Therefore in the current situation, it is required
3498                  * that all write requests are flushed, so that all read and
3499                  * write requests are really completed when bios_in_flight
3500                  * changes to 0.
3501                  */
3502                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3503                 scrub_submit(sctx);
3504                 mutex_lock(&sctx->wr_ctx.wr_lock);
3505                 scrub_wr_submit(sctx);
3506                 mutex_unlock(&sctx->wr_ctx.wr_lock);
3507
3508                 wait_event(sctx->list_wait,
3509                            atomic_read(&sctx->bios_in_flight) == 0);
3510                 atomic_inc(&fs_info->scrubs_paused);
3511                 wake_up(&fs_info->scrub_pause_wait);
3512
3513                 /*
3514                  * must be called before we decrease @scrub_paused.
3515                  * make sure we don't block transaction commit while
3516                  * we are waiting pending workers finished.
3517                  */
3518                 wait_event(sctx->list_wait,
3519                            atomic_read(&sctx->workers_pending) == 0);
3520                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3521
3522                 mutex_lock(&fs_info->scrub_lock);
3523                 __scrub_blocked_if_needed(fs_info);
3524                 atomic_dec(&fs_info->scrubs_paused);
3525                 mutex_unlock(&fs_info->scrub_lock);
3526                 wake_up(&fs_info->scrub_pause_wait);
3527
3528                 btrfs_put_block_group(cache);
3529                 if (ret)
3530                         break;
3531                 if (is_dev_replace &&
3532                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3533                         ret = -EIO;
3534                         break;
3535                 }
3536                 if (sctx->stat.malloc_errors > 0) {
3537                         ret = -ENOMEM;
3538                         break;
3539                 }
3540
3541                 dev_replace->cursor_left = dev_replace->cursor_right;
3542                 dev_replace->item_needs_writeback = 1;
3543 skip:
3544                 key.offset = found_key.offset + length;
3545                 btrfs_release_path(path);
3546         }
3547
3548         btrfs_free_path(path);
3549
3550         /*
3551          * ret can still be 1 from search_slot or next_leaf,
3552          * that's not an error
3553          */
3554         return ret < 0 ? ret : 0;
3555 }
3556
3557 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3558                                            struct btrfs_device *scrub_dev)
3559 {
3560         int     i;
3561         u64     bytenr;
3562         u64     gen;
3563         int     ret;
3564         struct btrfs_root *root = sctx->dev_root;
3565
3566         if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
3567                 return -EIO;
3568
3569         /* Seed devices of a new filesystem has their own generation. */
3570         if (scrub_dev->fs_devices != root->fs_info->fs_devices)
3571                 gen = scrub_dev->generation;
3572         else
3573                 gen = root->fs_info->last_trans_committed;
3574
3575         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3576                 bytenr = btrfs_sb_offset(i);
3577                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3578                     scrub_dev->commit_total_bytes)
3579                         break;
3580
3581                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3582                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3583                                   NULL, 1, bytenr);
3584                 if (ret)
3585                         return ret;
3586         }
3587         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3588
3589         return 0;
3590 }
3591
3592 /*
3593  * get a reference count on fs_info->scrub_workers. start worker if necessary
3594  */
3595 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3596                                                 int is_dev_replace)
3597 {
3598         int ret = 0;
3599         int flags = WQ_FREEZABLE | WQ_UNBOUND;
3600         int max_active = fs_info->thread_pool_size;
3601
3602         if (fs_info->scrub_workers_refcnt == 0) {
3603                 if (is_dev_replace)
3604                         fs_info->scrub_workers =
3605                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
3606                                                       1, 4);
3607                 else
3608                         fs_info->scrub_workers =
3609                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
3610                                                       max_active, 4);
3611                 if (!fs_info->scrub_workers) {
3612                         ret = -ENOMEM;
3613                         goto out;
3614                 }
3615                 fs_info->scrub_wr_completion_workers =
3616                         btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
3617                                               max_active, 2);
3618                 if (!fs_info->scrub_wr_completion_workers) {
3619                         ret = -ENOMEM;
3620                         goto out;
3621                 }
3622                 fs_info->scrub_nocow_workers =
3623                         btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
3624                 if (!fs_info->scrub_nocow_workers) {
3625                         ret = -ENOMEM;
3626                         goto out;
3627                 }
3628         }
3629         ++fs_info->scrub_workers_refcnt;
3630 out:
3631         return ret;
3632 }
3633
3634 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
3635 {
3636         if (--fs_info->scrub_workers_refcnt == 0) {
3637                 btrfs_destroy_workqueue(fs_info->scrub_workers);
3638                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3639                 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3640         }
3641         WARN_ON(fs_info->scrub_workers_refcnt < 0);
3642 }
3643
3644 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3645                     u64 end, struct btrfs_scrub_progress *progress,
3646                     int readonly, int is_dev_replace)
3647 {
3648         struct scrub_ctx *sctx;
3649         int ret;
3650         struct btrfs_device *dev;
3651         struct rcu_string *name;
3652
3653         if (btrfs_fs_closing(fs_info))
3654                 return -EINVAL;
3655
3656         if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
3657                 /*
3658                  * in this case scrub is unable to calculate the checksum
3659                  * the way scrub is implemented. Do not handle this
3660                  * situation at all because it won't ever happen.
3661                  */
3662                 btrfs_err(fs_info,
3663                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3664                        fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
3665                 return -EINVAL;
3666         }
3667
3668         if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
3669                 /* not supported for data w/o checksums */
3670                 btrfs_err(fs_info,
3671                            "scrub: size assumption sectorsize != PAGE_SIZE "
3672                            "(%d != %lu) fails",
3673                        fs_info->chunk_root->sectorsize, PAGE_SIZE);
3674                 return -EINVAL;
3675         }
3676
3677         if (fs_info->chunk_root->nodesize >
3678             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3679             fs_info->chunk_root->sectorsize >
3680             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3681                 /*
3682                  * would exhaust the array bounds of pagev member in
3683                  * struct scrub_block
3684                  */
3685                 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
3686                            "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3687                        fs_info->chunk_root->nodesize,
3688                        SCRUB_MAX_PAGES_PER_BLOCK,
3689                        fs_info->chunk_root->sectorsize,
3690                        SCRUB_MAX_PAGES_PER_BLOCK);
3691                 return -EINVAL;
3692         }
3693
3694
3695         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3696         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
3697         if (!dev || (dev->missing && !is_dev_replace)) {
3698                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3699                 return -ENODEV;
3700         }
3701
3702         if (!is_dev_replace && !readonly && !dev->writeable) {
3703                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3704                 rcu_read_lock();
3705                 name = rcu_dereference(dev->name);
3706                 btrfs_err(fs_info, "scrub: device %s is not writable",
3707                           name->str);
3708                 rcu_read_unlock();
3709                 return -EROFS;
3710         }
3711
3712         mutex_lock(&fs_info->scrub_lock);
3713         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
3714                 mutex_unlock(&fs_info->scrub_lock);
3715                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3716                 return -EIO;
3717         }
3718
3719         btrfs_dev_replace_lock(&fs_info->dev_replace);
3720         if (dev->scrub_device ||
3721             (!is_dev_replace &&
3722              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3723                 btrfs_dev_replace_unlock(&fs_info->dev_replace);
3724                 mutex_unlock(&fs_info->scrub_lock);
3725                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3726                 return -EINPROGRESS;
3727         }
3728         btrfs_dev_replace_unlock(&fs_info->dev_replace);
3729
3730         ret = scrub_workers_get(fs_info, is_dev_replace);
3731         if (ret) {
3732                 mutex_unlock(&fs_info->scrub_lock);
3733                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3734                 return ret;
3735         }
3736
3737         sctx = scrub_setup_ctx(dev, is_dev_replace);
3738         if (IS_ERR(sctx)) {
3739                 mutex_unlock(&fs_info->scrub_lock);
3740                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3741                 scrub_workers_put(fs_info);
3742                 return PTR_ERR(sctx);
3743         }
3744         sctx->readonly = readonly;
3745         dev->scrub_device = sctx;
3746         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3747
3748         /*
3749          * checking @scrub_pause_req here, we can avoid
3750          * race between committing transaction and scrubbing.
3751          */
3752         __scrub_blocked_if_needed(fs_info);
3753         atomic_inc(&fs_info->scrubs_running);
3754         mutex_unlock(&fs_info->scrub_lock);
3755
3756         if (!is_dev_replace) {
3757                 /*
3758                  * by holding device list mutex, we can
3759                  * kick off writing super in log tree sync.
3760                  */
3761                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3762                 ret = scrub_supers(sctx, dev);
3763                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3764         }
3765
3766         if (!ret)
3767                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
3768                                              is_dev_replace);
3769
3770         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3771         atomic_dec(&fs_info->scrubs_running);
3772         wake_up(&fs_info->scrub_pause_wait);
3773
3774         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3775
3776         if (progress)
3777                 memcpy(progress, &sctx->stat, sizeof(*progress));
3778
3779         mutex_lock(&fs_info->scrub_lock);
3780         dev->scrub_device = NULL;
3781         scrub_workers_put(fs_info);
3782         mutex_unlock(&fs_info->scrub_lock);
3783
3784         scrub_free_ctx(sctx);
3785
3786         return ret;
3787 }
3788
3789 void btrfs_scrub_pause(struct btrfs_root *root)
3790 {
3791         struct btrfs_fs_info *fs_info = root->fs_info;
3792
3793         mutex_lock(&fs_info->scrub_lock);
3794         atomic_inc(&fs_info->scrub_pause_req);
3795         while (atomic_read(&fs_info->scrubs_paused) !=
3796                atomic_read(&fs_info->scrubs_running)) {
3797                 mutex_unlock(&fs_info->scrub_lock);
3798                 wait_event(fs_info->scrub_pause_wait,
3799                            atomic_read(&fs_info->scrubs_paused) ==
3800                            atomic_read(&fs_info->scrubs_running));
3801                 mutex_lock(&fs_info->scrub_lock);
3802         }
3803         mutex_unlock(&fs_info->scrub_lock);
3804 }
3805
3806 void btrfs_scrub_continue(struct btrfs_root *root)
3807 {
3808         struct btrfs_fs_info *fs_info = root->fs_info;
3809
3810         atomic_dec(&fs_info->scrub_pause_req);
3811         wake_up(&fs_info->scrub_pause_wait);
3812 }
3813
3814 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3815 {
3816         mutex_lock(&fs_info->scrub_lock);
3817         if (!atomic_read(&fs_info->scrubs_running)) {
3818                 mutex_unlock(&fs_info->scrub_lock);
3819                 return -ENOTCONN;
3820         }
3821
3822         atomic_inc(&fs_info->scrub_cancel_req);
3823         while (atomic_read(&fs_info->scrubs_running)) {
3824                 mutex_unlock(&fs_info->scrub_lock);
3825                 wait_event(fs_info->scrub_pause_wait,
3826                            atomic_read(&fs_info->scrubs_running) == 0);
3827                 mutex_lock(&fs_info->scrub_lock);
3828         }
3829         atomic_dec(&fs_info->scrub_cancel_req);
3830         mutex_unlock(&fs_info->scrub_lock);
3831
3832         return 0;
3833 }
3834
3835 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3836                            struct btrfs_device *dev)
3837 {
3838         struct scrub_ctx *sctx;
3839
3840         mutex_lock(&fs_info->scrub_lock);
3841         sctx = dev->scrub_device;
3842         if (!sctx) {
3843                 mutex_unlock(&fs_info->scrub_lock);
3844                 return -ENOTCONN;
3845         }
3846         atomic_inc(&sctx->cancel_req);
3847         while (dev->scrub_device) {
3848                 mutex_unlock(&fs_info->scrub_lock);
3849                 wait_event(fs_info->scrub_pause_wait,
3850                            dev->scrub_device == NULL);
3851                 mutex_lock(&fs_info->scrub_lock);
3852         }
3853         mutex_unlock(&fs_info->scrub_lock);
3854
3855         return 0;
3856 }
3857
3858 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3859                          struct btrfs_scrub_progress *progress)
3860 {
3861         struct btrfs_device *dev;
3862         struct scrub_ctx *sctx = NULL;
3863
3864         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3865         dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3866         if (dev)
3867                 sctx = dev->scrub_device;
3868         if (sctx)
3869                 memcpy(progress, &sctx->stat, sizeof(*progress));
3870         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3871
3872         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3873 }
3874
3875 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3876                                u64 extent_logical, u64 extent_len,
3877                                u64 *extent_physical,
3878                                struct btrfs_device **extent_dev,
3879                                int *extent_mirror_num)
3880 {
3881         u64 mapped_length;
3882         struct btrfs_bio *bbio = NULL;
3883         int ret;
3884
3885         mapped_length = extent_len;
3886         ret = btrfs_map_block(fs_info, READ, extent_logical,
3887                               &mapped_length, &bbio, 0);
3888         if (ret || !bbio || mapped_length < extent_len ||
3889             !bbio->stripes[0].dev->bdev) {
3890                 kfree(bbio);
3891                 return;
3892         }
3893
3894         *extent_physical = bbio->stripes[0].physical;
3895         *extent_mirror_num = bbio->mirror_num;
3896         *extent_dev = bbio->stripes[0].dev;
3897         kfree(bbio);
3898 }
3899
3900 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3901                               struct scrub_wr_ctx *wr_ctx,
3902                               struct btrfs_fs_info *fs_info,
3903                               struct btrfs_device *dev,
3904                               int is_dev_replace)
3905 {
3906         WARN_ON(wr_ctx->wr_curr_bio != NULL);
3907
3908         mutex_init(&wr_ctx->wr_lock);
3909         wr_ctx->wr_curr_bio = NULL;
3910         if (!is_dev_replace)
3911                 return 0;
3912
3913         WARN_ON(!dev->bdev);
3914         wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3915                                          bio_get_nr_vecs(dev->bdev));
3916         wr_ctx->tgtdev = dev;
3917         atomic_set(&wr_ctx->flush_all_writes, 0);
3918         return 0;
3919 }
3920
3921 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3922 {
3923         mutex_lock(&wr_ctx->wr_lock);
3924         kfree(wr_ctx->wr_curr_bio);
3925         wr_ctx->wr_curr_bio = NULL;
3926         mutex_unlock(&wr_ctx->wr_lock);
3927 }
3928
3929 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3930                             int mirror_num, u64 physical_for_dev_replace)
3931 {
3932         struct scrub_copy_nocow_ctx *nocow_ctx;
3933         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3934
3935         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3936         if (!nocow_ctx) {
3937                 spin_lock(&sctx->stat_lock);
3938                 sctx->stat.malloc_errors++;
3939                 spin_unlock(&sctx->stat_lock);
3940                 return -ENOMEM;
3941         }
3942
3943         scrub_pending_trans_workers_inc(sctx);
3944
3945         nocow_ctx->sctx = sctx;
3946         nocow_ctx->logical = logical;
3947         nocow_ctx->len = len;
3948         nocow_ctx->mirror_num = mirror_num;
3949         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3950         btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3951                         copy_nocow_pages_worker, NULL, NULL);
3952         INIT_LIST_HEAD(&nocow_ctx->inodes);
3953         btrfs_queue_work(fs_info->scrub_nocow_workers,
3954                          &nocow_ctx->work);
3955
3956         return 0;
3957 }
3958
3959 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3960 {
3961         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3962         struct scrub_nocow_inode *nocow_inode;
3963
3964         nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3965         if (!nocow_inode)
3966                 return -ENOMEM;
3967         nocow_inode->inum = inum;
3968         nocow_inode->offset = offset;
3969         nocow_inode->root = root;
3970         list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3971         return 0;
3972 }
3973
3974 #define COPY_COMPLETE 1
3975
3976 static void copy_nocow_pages_worker(struct btrfs_work *work)
3977 {
3978         struct scrub_copy_nocow_ctx *nocow_ctx =
3979                 container_of(work, struct scrub_copy_nocow_ctx, work);
3980         struct scrub_ctx *sctx = nocow_ctx->sctx;
3981         u64 logical = nocow_ctx->logical;
3982         u64 len = nocow_ctx->len;
3983         int mirror_num = nocow_ctx->mirror_num;
3984         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3985         int ret;
3986         struct btrfs_trans_handle *trans = NULL;
3987         struct btrfs_fs_info *fs_info;
3988         struct btrfs_path *path;
3989         struct btrfs_root *root;
3990         int not_written = 0;
3991
3992         fs_info = sctx->dev_root->fs_info;
3993         root = fs_info->extent_root;
3994
3995         path = btrfs_alloc_path();
3996         if (!path) {
3997                 spin_lock(&sctx->stat_lock);
3998                 sctx->stat.malloc_errors++;
3999                 spin_unlock(&sctx->stat_lock);
4000                 not_written = 1;
4001                 goto out;
4002         }
4003
4004         trans = btrfs_join_transaction(root);
4005         if (IS_ERR(trans)) {
4006                 not_written = 1;
4007                 goto out;
4008         }
4009
4010         ret = iterate_inodes_from_logical(logical, fs_info, path,
4011                                           record_inode_for_nocow, nocow_ctx);
4012         if (ret != 0 && ret != -ENOENT) {
4013                 btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
4014                         "phys %llu, len %llu, mir %u, ret %d",
4015                         logical, physical_for_dev_replace, len, mirror_num,
4016                         ret);
4017                 not_written = 1;
4018                 goto out;
4019         }
4020
4021         btrfs_end_transaction(trans, root);
4022         trans = NULL;
4023         while (!list_empty(&nocow_ctx->inodes)) {
4024                 struct scrub_nocow_inode *entry;
4025                 entry = list_first_entry(&nocow_ctx->inodes,
4026                                          struct scrub_nocow_inode,
4027                                          list);
4028                 list_del_init(&entry->list);
4029                 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4030                                                  entry->root, nocow_ctx);
4031                 kfree(entry);
4032                 if (ret == COPY_COMPLETE) {
4033                         ret = 0;
4034                         break;
4035                 } else if (ret) {
4036                         break;
4037                 }
4038         }
4039 out:
4040         while (!list_empty(&nocow_ctx->inodes)) {
4041                 struct scrub_nocow_inode *entry;
4042                 entry = list_first_entry(&nocow_ctx->inodes,
4043                                          struct scrub_nocow_inode,
4044                                          list);
4045                 list_del_init(&entry->list);
4046                 kfree(entry);
4047         }
4048         if (trans && !IS_ERR(trans))
4049                 btrfs_end_transaction(trans, root);
4050         if (not_written)
4051                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4052                                             num_uncorrectable_read_errors);
4053
4054         btrfs_free_path(path);
4055         kfree(nocow_ctx);
4056
4057         scrub_pending_trans_workers_dec(sctx);
4058 }
4059
4060 static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4061                                  u64 logical)
4062 {
4063         struct extent_state *cached_state = NULL;
4064         struct btrfs_ordered_extent *ordered;
4065         struct extent_io_tree *io_tree;
4066         struct extent_map *em;
4067         u64 lockstart = start, lockend = start + len - 1;
4068         int ret = 0;
4069
4070         io_tree = &BTRFS_I(inode)->io_tree;
4071
4072         lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
4073         ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4074         if (ordered) {
4075                 btrfs_put_ordered_extent(ordered);
4076                 ret = 1;
4077                 goto out_unlock;
4078         }
4079
4080         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4081         if (IS_ERR(em)) {
4082                 ret = PTR_ERR(em);
4083                 goto out_unlock;
4084         }
4085
4086         /*
4087          * This extent does not actually cover the logical extent anymore,
4088          * move on to the next inode.
4089          */
4090         if (em->block_start > logical ||
4091             em->block_start + em->block_len < logical + len) {
4092                 free_extent_map(em);
4093                 ret = 1;
4094                 goto out_unlock;
4095         }
4096         free_extent_map(em);
4097
4098 out_unlock:
4099         unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4100                              GFP_NOFS);
4101         return ret;
4102 }
4103
4104 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4105                                       struct scrub_copy_nocow_ctx *nocow_ctx)
4106 {
4107         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
4108         struct btrfs_key key;
4109         struct inode *inode;
4110         struct page *page;
4111         struct btrfs_root *local_root;
4112         struct extent_io_tree *io_tree;
4113         u64 physical_for_dev_replace;
4114         u64 nocow_ctx_logical;
4115         u64 len = nocow_ctx->len;
4116         unsigned long index;
4117         int srcu_index;
4118         int ret = 0;
4119         int err = 0;
4120
4121         key.objectid = root;
4122         key.type = BTRFS_ROOT_ITEM_KEY;
4123         key.offset = (u64)-1;
4124
4125         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4126
4127         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4128         if (IS_ERR(local_root)) {
4129                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4130                 return PTR_ERR(local_root);
4131         }
4132
4133         key.type = BTRFS_INODE_ITEM_KEY;
4134         key.objectid = inum;
4135         key.offset = 0;
4136         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4137         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4138         if (IS_ERR(inode))
4139                 return PTR_ERR(inode);
4140
4141         /* Avoid truncate/dio/punch hole.. */
4142         mutex_lock(&inode->i_mutex);
4143         inode_dio_wait(inode);
4144
4145         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4146         io_tree = &BTRFS_I(inode)->io_tree;
4147         nocow_ctx_logical = nocow_ctx->logical;
4148
4149         ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
4150         if (ret) {
4151                 ret = ret > 0 ? 0 : ret;
4152                 goto out;
4153         }
4154
4155         while (len >= PAGE_CACHE_SIZE) {
4156                 index = offset >> PAGE_CACHE_SHIFT;
4157 again:
4158                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4159                 if (!page) {
4160                         btrfs_err(fs_info, "find_or_create_page() failed");
4161                         ret = -ENOMEM;
4162                         goto out;
4163                 }
4164
4165                 if (PageUptodate(page)) {
4166                         if (PageDirty(page))
4167                                 goto next_page;
4168                 } else {
4169                         ClearPageError(page);
4170                         err = extent_read_full_page(io_tree, page,
4171                                                            btrfs_get_extent,
4172                                                            nocow_ctx->mirror_num);
4173                         if (err) {
4174                                 ret = err;
4175                                 goto next_page;
4176                         }
4177
4178                         lock_page(page);
4179                         /*
4180                          * If the page has been remove from the page cache,
4181                          * the data on it is meaningless, because it may be
4182                          * old one, the new data may be written into the new
4183                          * page in the page cache.
4184                          */
4185                         if (page->mapping != inode->i_mapping) {
4186                                 unlock_page(page);
4187                                 page_cache_release(page);
4188                                 goto again;
4189                         }
4190                         if (!PageUptodate(page)) {
4191                                 ret = -EIO;
4192                                 goto next_page;
4193                         }
4194                 }
4195
4196                 ret = check_extent_to_block(inode, offset, len,
4197                                             nocow_ctx_logical);
4198                 if (ret) {
4199                         ret = ret > 0 ? 0 : ret;
4200                         goto next_page;
4201                 }
4202
4203                 err = write_page_nocow(nocow_ctx->sctx,
4204                                        physical_for_dev_replace, page);
4205                 if (err)
4206                         ret = err;
4207 next_page:
4208                 unlock_page(page);
4209                 page_cache_release(page);
4210
4211                 if (ret)
4212                         break;
4213
4214                 offset += PAGE_CACHE_SIZE;
4215                 physical_for_dev_replace += PAGE_CACHE_SIZE;
4216                 nocow_ctx_logical += PAGE_CACHE_SIZE;
4217                 len -= PAGE_CACHE_SIZE;
4218         }
4219         ret = COPY_COMPLETE;
4220 out:
4221         mutex_unlock(&inode->i_mutex);
4222         iput(inode);
4223         return ret;
4224 }
4225
4226 static int write_page_nocow(struct scrub_ctx *sctx,
4227                             u64 physical_for_dev_replace, struct page *page)
4228 {
4229         struct bio *bio;
4230         struct btrfs_device *dev;
4231         int ret;
4232
4233         dev = sctx->wr_ctx.tgtdev;
4234         if (!dev)
4235                 return -EIO;
4236         if (!dev->bdev) {
4237                 printk_ratelimited(KERN_WARNING
4238                         "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
4239                 return -EIO;
4240         }
4241         bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
4242         if (!bio) {
4243                 spin_lock(&sctx->stat_lock);
4244                 sctx->stat.malloc_errors++;
4245                 spin_unlock(&sctx->stat_lock);
4246                 return -ENOMEM;
4247         }
4248         bio->bi_iter.bi_size = 0;
4249         bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4250         bio->bi_bdev = dev->bdev;
4251         ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
4252         if (ret != PAGE_CACHE_SIZE) {
4253 leave_with_eio:
4254                 bio_put(bio);
4255                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4256                 return -EIO;
4257         }
4258
4259         if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
4260                 goto leave_with_eio;
4261
4262         bio_put(bio);
4263         return 0;
4264 }