fs/btrfs/scrub.c

   1 /*
   2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18
  19 #include <linux/blkdev.h>
  20 #include <linux/ratelimit.h>
  21 #include "ctree.h"
  22 #include "volumes.h"
  23 #include "disk-io.h"
  24 #include "ordered-data.h"
  25 #include "transaction.h"
  26 #include "backref.h"
  27 #include "extent_io.h"
  28 #include "dev-replace.h"
  29 #include "check-integrity.h"
  30 #include "rcu-string.h"
  31 #include "raid56.h"
  32
  33 /*
  34  * This is only the first step towards a full-featured scrub. It reads all
  35  * extents and super blocks and verifies the checksums. In case a bad checksum
  36  * is found or the extent cannot be read, good data will be written back if
  37  * any can be found.
  38  *
  39  * Future enhancements:
  40  *  - In case an unrepairable extent is encountered, track which files are
  41  *    affected and report them
  42  *  - track and record media errors, throw out bad devices
  43  *  - add a mode to also read unallocated space
  44  */
  45
  46 struct scrub_block;     /* referenced by struct scrub_page before it is defined */
  47 struct scrub_ctx;       /* referenced by struct scrub_bio before it is defined */
  48
  49 /*
  50  * The following three values only influence performance.
  51  * The last one configures the number of parallel and outstanding I/O
  52  * operations. The first two values configure an upper limit for the number
  53  * of (dynamically allocated) pages that are added to a read or write bio.
  54  */
  55 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio (with 4k pages) */
  56 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio (with 4k pages) */
  57 #define SCRUB_BIOS_PER_SCTX     64      /* 64 * 128k = 8MB per device in flight */
  58
  59 /*
  60  * The following value times PAGE_SIZE needs to be large enough to match the
  61  * largest node/leaf/sector size that shall be supported; it is also the size
  62  * of scrub_block::pagev[]. Values larger than BTRFS_STRIPE_LEN are not supported.
  63  */
  64 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
  65
  66 struct scrub_recover {          /* pointed to by scrub_page::recover */
  67         atomic_t                refs;   /* presumably a reference count -- TODO confirm */
  68         struct btrfs_bio        *bbio;
  69         u64                     *raid_map;
  70         u64                     map_length;
  71 };
  72
  73 struct scrub_page {             /* one page tracked by scrub, plus its location */
  74         struct scrub_block      *sblock;        /* block that gets the scrub_block_put() */
  75         struct page             *page;
  76         struct btrfs_device     *dev;           /* device named in scrub warnings */
  77         struct list_head        list;
  78         u64                     flags;  /* extent flags */
  79         u64                     generation;
  80         u64                     logical;        /* logical address printed in warnings */
  81         u64                     physical;       /* >> 9 gives the sector in warnings */
  82         u64                     physical_for_dev_replace;
  83         atomic_t                ref_count;
  84         struct {
  85                 unsigned int    mirror_num:8;
  86                 unsigned int    have_csum:1;
  87                 unsigned int    io_error:1;
  88         };
  89         u8                      csum[BTRFS_CSUM_SIZE];
  90
  91         struct scrub_recover    *recover;
  92 };
  93
  94 struct scrub_bio {              /* one entry of scrub_ctx::bios[] */
  95         int                     index;          /* own slot number in scrub_ctx::bios[] */
  96         struct scrub_ctx        *sctx;          /* context this bio was allocated for */
  97         struct btrfs_device     *dev;
  98         struct bio              *bio;
  99         int                     err;
 100         u64                     logical;
 101         u64                     physical;
 102 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO    /* size pagev[] for the larger limit */
 103         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
 104 #else
 105         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
 106 #endif
 107         int                     page_count;     /* number of used pagev[] entries */
 108         int                     next_free;      /* index of the next bio in the chain, -1 ends it */
 109         struct btrfs_work       work;           /* set up to run scrub_bio_end_io_worker() */
 110 };
 111
 112 struct scrub_block {
 113         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; /* [0] is used for warnings */
 114         int                     page_count;     /* warnings expect at least one page */
 115         atomic_t                outstanding_pages;
 116         atomic_t                ref_count; /* free mem on transition to zero */
 117         struct scrub_ctx        *sctx;          /* its dev_root provides fs_info */
 118         struct scrub_parity     *sparity;
 119         struct {
 120                 unsigned int    header_error:1;
 121                 unsigned int    checksum_error:1;
 122                 unsigned int    no_io_error_seen:1;
 123                 unsigned int    generation_error:1; /* also sets header_error */
 124
 125                 /* The following is for data that is used to check parity. */
 126                 /* It applies to data that has a checksum. */
 127                 unsigned int    data_corrected:1;
 128         };
 129 };
 130
 131 /* Used for the chunks with a parity stripe, such as RAID5/6 */
 132 struct scrub_parity {
 133         struct scrub_ctx        *sctx;
 134
 135         struct btrfs_device     *scrub_dev;
 136
 137         u64                     logic_start;
 138
 139         u64                     logic_end;
 140
 141         int                     nsectors;
 142
 143         int                     stripe_len;
 144
 145         atomic_t                ref_count;
 146
 147         struct list_head        spages;
 148
 149         /* Work item for the parity check and repair */
 150         struct btrfs_work       work;
 151
 152         /* Marks the parity blocks which have data */
 153         unsigned long           *dbitmap;
 154
 155         /*
 156          * Marks the parity blocks which have data, but where errors happened
 157          * while reading or checking that data
 158          */
 159         unsigned long           *ebitmap;
 160
 161         unsigned long           bitmap[0];      /* zero-length tail; presumably backs the bitmaps above -- TODO confirm */
 162 };
 163
 164 struct scrub_wr_ctx {           /* embedded in struct scrub_ctx as wr_ctx */
 165         struct scrub_bio *wr_curr_bio;
 166         struct btrfs_device *tgtdev;
 167         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
 168         atomic_t flush_all_writes;
 169         struct mutex wr_lock;
 170 };
 171
 172 struct scrub_ctx {
 173         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX]; /* filled from index 0 upwards */
 174         struct btrfs_root       *dev_root;      /* dev_root->fs_info is used throughout */
 175         int                     first_free;     /* head of the next_free chain of bios[] */
 176         int                     curr;           /* index into bios[], -1 when there is none */
 177         atomic_t                bios_in_flight; /* see scrub_pending_bio_inc()/_dec() */
 178         atomic_t                workers_pending; /* see scrub_pending_trans_workers_inc()/_dec() */
 179         spinlock_t              list_lock;
 180         wait_queue_head_t       list_wait;      /* woken after either counter above drops */
 181         u16                     csum_size;      /* taken from the super block copy */
 182         struct list_head        csum_list;      /* entries are struct btrfs_ordered_sum */
 183         atomic_t                cancel_req;
 184         int                     readonly;
 185         int                     pages_per_rd_bio; /* <= SCRUB_PAGES_PER_RD_BIO */
 186         u32                     sectorsize;
 187         u32                     nodesize;
 188
 189         int                     is_dev_replace;
 190         struct scrub_wr_ctx     wr_ctx;
 191
 192         /*
 193          * statistics; stat_lock is held while stat is updated
 194          */
 195         struct btrfs_scrub_progress stat;
 196         spinlock_t              stat_lock;
 197 };
 198
 199 struct scrub_fixup_nodatasum {  /* context consumed by scrub_fixup_readpage() */
 200         struct scrub_ctx        *sctx;
 201         struct btrfs_device     *dev;
 202         u64                     logical;        /* logical address handed to repair_io_failure() */
 203         struct btrfs_root       *root;          /* root->fs_info is used for the lookups */
 204         struct btrfs_work       work;           /* embedded; the worker uses container_of() */
 205         int                     mirror_num;     /* mirror handed to the repair/read calls */
 206 };
 207
 208 struct scrub_nocow_inode {      /* same triple as copy_nocow_pages_for_inode() takes */
 209         u64                     inum;
 210         u64                     offset;
 211         u64                     root;
 212         struct list_head        list;   /* presumably linked on scrub_copy_nocow_ctx::inodes -- TODO confirm */
 213 };
 214
 215 struct scrub_copy_nocow_ctx {   /* fields mirror the arguments of copy_nocow_pages() */
 216         struct scrub_ctx        *sctx;
 217         u64                     logical;
 218         u64                     len;
 219         int                     mirror_num;
 220         u64                     physical_for_dev_replace;
 221         struct list_head        inodes; /* presumably of struct scrub_nocow_inode -- TODO confirm */
 222         struct btrfs_work       work;
 223 };
 224
 225 struct scrub_warning {          /* filled by scrub_print_warning() for its inode callback */
 226         struct btrfs_path       *path;          /* path reused for the per-inode lookups */
 227         u64                     extent_item_size; /* key offset of the extent item found */
 228         const char              *errstr;        /* text that starts every warning line */
 229         sector_t                sector;         /* physical >> 9 of the first page */
 230         u64                     logical;        /* logical address of the first page */
 231         struct btrfs_device     *dev;           /* NULL unless the data-extent branch sets it */
 232 };
 233 /* Forward declarations of the file-local (static) helpers. */
 234 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 235 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 236 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
 237 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 238 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 239 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 240                                      struct btrfs_fs_info *fs_info,
 241                                      struct scrub_block *original_sblock,
 242                                      u64 length, u64 logical,
 243                                      struct scrub_block *sblocks_for_recheck);
 244 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 245                                 struct scrub_block *sblock, int is_metadata,
 246                                 int have_csum, u8 *csum, u64 generation,
 247                                 u16 csum_size, int retry_failed_mirror);
 248 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 249                                          struct scrub_block *sblock,
 250                                          int is_metadata, int have_csum,
 251                                          const u8 *csum, u64 generation,
 252                                          u16 csum_size);
 253 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 254                                              struct scrub_block *sblock_good,
 255                                              int force_write);
 256 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 257                                             struct scrub_block *sblock_good,
 258                                             int page_num, int force_write);
 259 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 260 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 261                                            int page_num);
 262 static int scrub_checksum_data(struct scrub_block *sblock);
 263 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 264 static int scrub_checksum_super(struct scrub_block *sblock);
 265 static void scrub_block_get(struct scrub_block *sblock);
 266 static void scrub_block_put(struct scrub_block *sblock);
 267 static void scrub_page_get(struct scrub_page *spage);
 268 static void scrub_page_put(struct scrub_page *spage);
 269 static void scrub_parity_get(struct scrub_parity *sparity);
 270 static void scrub_parity_put(struct scrub_parity *sparity);
 271 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 272                                     struct scrub_page *spage);
 273 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 274                        u64 physical, struct btrfs_device *dev, u64 flags,
 275                        u64 gen, int mirror_num, u8 *csum, int force,
 276                        u64 physical_for_dev_replace);
 277 static void scrub_bio_end_io(struct bio *bio, int err);
 278 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 279 static void scrub_block_complete(struct scrub_block *sblock);
 280 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
 281                                u64 extent_logical, u64 extent_len,
 282                                u64 *extent_physical,
 283                                struct btrfs_device **extent_dev,
 284                                int *extent_mirror_num);
 285 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
 286                               struct scrub_wr_ctx *wr_ctx,
 287                               struct btrfs_fs_info *fs_info,
 288                               struct btrfs_device *dev,
 289                               int is_dev_replace);
 290 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
 291 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 292                                     struct scrub_page *spage);
 293 static void scrub_wr_submit(struct scrub_ctx *sctx);
 294 static void scrub_wr_bio_end_io(struct bio *bio, int err);
 295 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 296 static int write_page_nocow(struct scrub_ctx *sctx,
 297                             u64 physical_for_dev_replace, struct page *page);
 298 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 299                                       struct scrub_copy_nocow_ctx *ctx);
 300 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 301                             int mirror_num, u64 physical_for_dev_replace);
 302 static void copy_nocow_pages_worker(struct btrfs_work *work);
 303 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 304 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 305
 306 /* Count one more scrub bio as in flight on @sctx. */
 307 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 308 {
 309         atomic_inc(&sctx->bios_in_flight);
 310 }
 311
 312 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)  /* undoes scrub_pending_bio_inc() */
 313 {
 314         atomic_dec(&sctx->bios_in_flight);
 315         wake_up(&sctx->list_wait);      /* after the decrement: wake sleepers on list_wait */
 316 }
 317
 318 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 319 {       /* sleeps until scrub_pause_req reads 0; the caller is expected to hold scrub_lock */
 320         while (atomic_read(&fs_info->scrub_pause_req)) {
 321                 mutex_unlock(&fs_info->scrub_lock);     /* do not sleep with the lock held */
 322                 wait_event(fs_info->scrub_pause_wait,
 323                    atomic_read(&fs_info->scrub_pause_req) == 0);
 324                 mutex_lock(&fs_info->scrub_lock);       /* re-check the request under the lock */
 325         }
 326 }
 327
 328 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 329 {       /* counts the caller in scrubs_paused while it waits out a pause request */
 330         atomic_inc(&fs_info->scrubs_paused);
 331         wake_up(&fs_info->scrub_pause_wait);    /* announce the changed counter */
 332
 333         mutex_lock(&fs_info->scrub_lock);
 334         __scrub_blocked_if_needed(fs_info);     /* returns once scrub_pause_req reads 0 */
 335         atomic_dec(&fs_info->scrubs_paused);
 336         mutex_unlock(&fs_info->scrub_lock);
 337
 338         wake_up(&fs_info->scrub_pause_wait);    /* announce the changed counter again */
 339 }
 340
 341 /*
 342  * used for workers that require transaction commits (i.e., for the
 343  * NOCOW case); paired with scrub_pending_trans_workers_dec()
 344  */
 345 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 346 {
 347         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 348
 349         /*
 350          * increment scrubs_running to prevent cancel requests from
 351          * completing as long as a worker is running. we must also
 352          * increment scrubs_paused to prevent deadlocking on pause
 353          * requests used for transaction commits (as the worker uses a
 354          * transaction context). it is safe to regard the worker
 355          * as paused for all practical matters. effectively, we only
 356          * keep cancellation requests from completing.
 357          */
 358         mutex_lock(&fs_info->scrub_lock);
 359         atomic_inc(&fs_info->scrubs_running);
 360         atomic_inc(&fs_info->scrubs_paused);
 361         mutex_unlock(&fs_info->scrub_lock);
 362
 363         /*
 364          * checking the @scrubs_running == @scrubs_paused condition
 365          * inside wait_event() is not an atomic operation, which
 366          * means @scrubs_running/@scrubs_paused may be changed at
 367          * any time. Wake up @scrub_pause_wait as often as we can
 368          * so that a transaction commit is blocked for less time.
 369          */
 370         wake_up(&fs_info->scrub_pause_wait);
 371
 372         atomic_inc(&sctx->workers_pending);
 373 }
 374
 375 /* used for workers that require transaction commits; undoes the _inc() above */
 376 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 377 {
 378         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 379
 380         /*
 381          * see scrub_pending_trans_workers_inc() why we're pretending
 382          * to be paused in the scrub counters
 383          */
 384         mutex_lock(&fs_info->scrub_lock);
 385         atomic_dec(&fs_info->scrubs_running);
 386         atomic_dec(&fs_info->scrubs_paused);
 387         mutex_unlock(&fs_info->scrub_lock);
 388         atomic_dec(&sctx->workers_pending);
 389         wake_up(&fs_info->scrub_pause_wait);    /* the fs-wide counters changed */
 390         wake_up(&sctx->list_wait);              /* workers_pending changed */
 391 }
 392
 393 static void scrub_free_csums(struct scrub_ctx *sctx)
 394 {
 395         struct btrfs_ordered_sum *sum, *next;
 396
 397         /* unlink and free every entry; _safe because entries go away mid-walk */
 398         list_for_each_entry_safe(sum, next, &sctx->csum_list, list) {
 399                 list_del(&sum->list);
 400                 kfree(sum);
 401         }
 402 }
 403
 404 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 405 {
 406         struct scrub_bio *sbio;
 407         int n;
 408
 409         if (!sctx)      /* nothing to release for a NULL context */
 410                 return;
 411
 412         scrub_free_wr_ctx(&sctx->wr_ctx);
 413
 414         /*
 415          * curr != -1 can happen when scrub is cancelled: drop one block
 416          * reference per page of the current bio, then the bio itself
 417          */
 418         if (sctx->curr != -1) {
 419                 sbio = sctx->bios[sctx->curr];
 420                 for (n = 0; n < sbio->page_count; n++) {
 421                         struct scrub_page *spage = sbio->pagev[n];
 422
 423                         WARN_ON(!spage->page);
 424                         scrub_block_put(spage->sblock);
 425                 }
 426                 bio_put(sbio->bio);
 427         }
 428
 429         /* bios[] is filled from index 0 upwards, so stop at the first NULL */
 430         for (n = 0; n < SCRUB_BIOS_PER_SCTX && sctx->bios[n]; n++)
 431                 kfree(sctx->bios[n]);
 432         scrub_free_csums(sctx);
 433         kfree(sctx);
 434 }
 435
 436 static noinline_for_stack
 437 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 438 {
 439         struct scrub_ctx *sctx;
 440         int             i;
 441         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 442         int pages_per_rd_bio;
 443         int ret;
 444
 445         /*
 446          * Set up a scrub context for @dev; returns it, or an ERR_PTR() on
 447          * failure. pages_per_rd_bio as chosen here is correct for scrub but
 448          * might be wrong for dev_replace, where we might read from different
 449          * devices in the initial huge bios. However, that code is able to
 450          * correctly handle the case when adding a page to a bio fails.
 451          */
 452         if (dev->bdev)
 453                 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
 454                                          bio_get_nr_vecs(dev->bdev));
 455         else
 456                 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 457         sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 458         if (!sctx)
 459                 goto nomem;
 460         sctx->is_dev_replace = is_dev_replace;
 461         sctx->pages_per_rd_bio = pages_per_rd_bio;
 462         sctx->curr = -1;
 463         sctx->dev_root = dev->dev_root;
 464         /*
 465          * init first: on a later failure scrub_free_ctx() walks csum_list
 466          */
 467         INIT_LIST_HEAD(&sctx->csum_list);
 468         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 469                 struct scrub_bio *sbio;
 470
 471                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 472                 if (!sbio)
 473                         goto nomem;
 474                 sctx->bios[i] = sbio;
 475
 476                 sbio->index = i;
 477                 sbio->sctx = sctx;
 478                 sbio->page_count = 0;
 479                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
 480                                 scrub_bio_end_io_worker, NULL, NULL);
 481                 /* chain all bios through next_free; -1 marks the last one */
 482                 sbio->next_free = (i != SCRUB_BIOS_PER_SCTX - 1) ? i + 1 : -1;
 483         }
 484         sctx->first_free = 0;
 485         sctx->nodesize = dev->dev_root->nodesize;
 486         sctx->sectorsize = dev->dev_root->sectorsize;
 487         atomic_set(&sctx->bios_in_flight, 0);
 488         atomic_set(&sctx->workers_pending, 0);
 489         atomic_set(&sctx->cancel_req, 0);
 490         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 491
 492         spin_lock_init(&sctx->list_lock);
 493         spin_lock_init(&sctx->stat_lock);
 494         init_waitqueue_head(&sctx->list_wait);
 495
 496         ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
 497                                  fs_info->dev_replace.tgtdev, is_dev_replace);
 498         if (ret) {
 499                 scrub_free_ctx(sctx);
 500                 return ERR_PTR(ret);
 501         }
 502         return sctx;
 503
 504 nomem:
 505         scrub_free_ctx(sctx);
 506         return ERR_PTR(-ENOMEM);
 507 }
 508
 509 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 510                                      void *warn_ctx)
 511 {
 512         struct scrub_warning *swarn = warn_ctx;
 513         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 514         struct btrfs_key root_key = {
 515                 .objectid = root,
 516                 .type = BTRFS_ROOT_ITEM_KEY,
 517                 .offset = (u64)-1,
 518         };
 519         struct btrfs_key ikey = {
 520                 .objectid = inum,
 521                 .type = BTRFS_INODE_ITEM_KEY,
 522                 .offset = 0,
 523         };
 524         struct inode_fs_paths *ipath = NULL;
 525         struct btrfs_inode_item *ii;
 526         struct btrfs_root *local_root;
 527         struct extent_buffer *leaf;
 528         u64 isize;
 529         u32 nlink;
 530         int ret;
 531         int i;
 532
 533         /*
 534          * Per-inode callback handed to iterate_extent_inodes(): prints one
 535          * warning for every path of inode @inum in subvolume @root. Always
 536          * returns 0; a failed lookup is only logged.
 537          */
 538         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 539         if (IS_ERR(local_root)) {
 540                 ret = PTR_ERR(local_root);
 541                 goto err;
 542         }
 543
 544         /* point the path at (inum INODE_ITEM 0) to read size and nlink */
 545         ret = btrfs_search_slot(NULL, local_root, &ikey, swarn->path, 0, 0);
 546         if (ret) {
 547                 btrfs_release_path(swarn->path);
 548                 goto err;
 549         }
 550
 551         leaf = swarn->path->nodes[0];
 552         ii = btrfs_item_ptr(leaf, swarn->path->slots[0],
 553                             struct btrfs_inode_item);
 554         isize = btrfs_inode_size(leaf, ii);
 555         nlink = btrfs_inode_nlink(leaf, ii);
 556         btrfs_release_path(swarn->path);
 557
 558         ipath = init_ipath(4096, local_root, swarn->path);
 559         if (IS_ERR(ipath)) {
 560                 ret = PTR_ERR(ipath);
 561                 ipath = NULL;   /* free_ipath() below must not get the ERR_PTR */
 562                 goto err;
 563         }
 564
 565         ret = paths_from_inode(inum, ipath);
 566         if (ret < 0)
 567                 goto err;
 568
 569         /* ipath may have been too small for all paths; deliberately ignored */
 570         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 571                 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 572                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
 573                         "length %llu, links %u (path: %s)\n", swarn->errstr,
 574                         swarn->logical, rcu_str_deref(swarn->dev->name),
 575                         (unsigned long long)swarn->sector, root, inum, offset,
 576                         min(isize - offset, (u64)PAGE_SIZE), nlink,
 577                         (char *)(unsigned long)ipath->fspath->val[i]);
 578         goto out;
 579
 580 err:
 581         printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 582                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 583                 "resolving failed with ret=%d\n", swarn->errstr,
 584                 swarn->logical, rcu_str_deref(swarn->dev->name),
 585                 (unsigned long long)swarn->sector, root, inum, offset, ret);
 586
 587 out:
 588         free_ipath(ipath);
 589         return 0;
 590 }
 591
 592 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 593 {
 594         /* Log @errstr for @sblock, naming the tree or the file paths affected. */
 595         struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info;
 596         struct btrfs_device *dev;
 597         struct btrfs_path *path;
 598         struct btrfs_key found_key;
 599         struct extent_buffer *eb;
 600         struct btrfs_extent_item *ei;
 601         struct scrub_warning swarn;
 602         unsigned long ptr = 0;
 603         u64 extent_item_pos;
 604         u64 flags = 0;
 605         u64 ref_root = 0;       /* defined even if no backref gets decoded */
 606         u32 item_size;
 607         u8 ref_level = 0;       /* defined even if no backref gets decoded */
 608         int ret;
 609
 610         WARN_ON(sblock->page_count < 1);
 611         dev = sblock->pagev[0]->dev;
 612         path = btrfs_alloc_path();
 613         if (!path)
 614                 return;
 615
 616         swarn.sector = (sblock->pagev[0]->physical) >> 9;
 617         swarn.logical = sblock->pagev[0]->logical;
 618         swarn.errstr = errstr;
 619         swarn.dev = NULL;
 620
 621         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 622                                   &flags);
 623         if (ret < 0)
 624                 goto out;
 625
 626         extent_item_pos = swarn.logical - found_key.objectid;
 627         swarn.extent_item_size = found_key.offset;
 628         eb = path->nodes[0];
 629         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 630         item_size = btrfs_item_size_nr(eb, path->slots[0]);
 631
 632         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 633                 do {
 634                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 635                                                       item_size, &ref_root,
 636                                                       &ref_level);
 637                         if (ret > 0)    /* loop end marker: nothing to report */
 638                                 break;
 639                         printk_in_rcu(KERN_WARNING
 640                                 "BTRFS: %s at logical %llu on dev %s, "
 641                                 "sector %llu: metadata %s (level %d) in tree "
 642                                 "%llu\n", errstr, swarn.logical,
 643                                 rcu_str_deref(dev->name),
 644                                 (unsigned long long)swarn.sector,
 645                                 ref_level ? "node" : "leaf",
 646                                 ret < 0 ? -1 : ref_level,
 647                                 ret < 0 ? -1 : ref_root);
 648                 } while (!ret);         /* an error is reported once, then we stop */
 649                 btrfs_release_path(path);
 650         } else {
 651                 btrfs_release_path(path);
 652                 swarn.path = path;
 653                 swarn.dev = dev;
 654                 iterate_extent_inodes(fs_info, found_key.objectid,
 655                                         extent_item_pos, 1,
 656                                         scrub_print_warning_inode, &swarn);
 657         }
 658
 659 out:
 660         btrfs_free_path(path);
 661 }
 662
 663 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 664 {
 665         /*
 666          * Try to repair the page at @offset of inode @inum in subvolume @root.
 667          * Returns 1 once corrected, which stops the iteration, else a -errno.
 668          */
 669         struct page *page = NULL;
 670         unsigned long index;
 671         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 672         int ret;
 673         int corrected = 0;
 674         struct btrfs_key key;
 675         struct inode *inode = NULL;
 676         struct btrfs_fs_info *fs_info;
 677         u64 end = offset + PAGE_SIZE - 1;
 678         struct btrfs_root *local_root;
 679         int srcu_index;
 680
 681         key.objectid = root;
 682         key.type = BTRFS_ROOT_ITEM_KEY;
 683         key.offset = (u64)-1;
 684
 685         fs_info = fixup->root->fs_info;
 686         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
 687
 688         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
 689         if (IS_ERR(local_root)) {
 690                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 691                 return PTR_ERR(local_root);
 692         }
 693
 694         key.type = BTRFS_INODE_ITEM_KEY;
 695         key.objectid = inum;
 696         key.offset = 0;
 697         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
 698         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 699         if (IS_ERR(inode))
 700                 return PTR_ERR(inode);
 701
 702         index = offset >> PAGE_CACHE_SHIFT;
 703
 704         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 705         if (!page) {
 706                 ret = -ENOMEM;
 707                 goto out;
 708         }
 709
 710         if (PageUptodate(page)) {
 711                 if (PageDirty(page)) {
 712                         /*
 713                          * the page was modified, so the data that belongs in
 714                          * the defect sector is not in memory; the modified
 715                          * page must not be written to that sector.
 716                          * TODO: what could be done here: wait for the delalloc
 717                          *       runner to write out that page (might involve
 718                          *       COW) and see whether the sector is still
 719                          *       referenced afterwards.
 720                          * For the meantime, we'll treat this error as
 721                          * incorrectable, although a later scrub may find the
 722                          * bad sector again with no dirty page in memory.
 723                          */
 724                         ret = -EIO;
 725                         unlock_page(page);      /* find_or_create_page() locked it */
 726                         goto out;
 727                 }
 728                 ret = repair_io_failure(inode, offset, PAGE_SIZE,
 729                                         fixup->logical, page,
 730                                         offset - page_offset(page),
 731                                         fixup->mirror_num);
 732                 unlock_page(page);
 733                 corrected = !ret;
 734         } else {
 735                 /*
 736                  * we need to get good data first. the general readpage path
 737                  * calls repair_io_failure for us if we read the bad mirror.
 738                  */
 739                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 740                                         EXTENT_DAMAGED, GFP_NOFS);
 741                 if (ret) {
 742                         /* set_extent_bits should give proper error */
 743                         if (WARN_ON(ret > 0))
 744                                 ret = -EFAULT;
 745                         unlock_page(page);      /* no read was started to unlock it */
 746                         goto out;
 747                 }
 748
 749                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 750                                                 btrfs_get_extent,
 751                                                 fixup->mirror_num);
 752                 wait_on_page_locked(page);
 753
 754                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
 755                                                 end, EXTENT_DAMAGED, 0, NULL);
 756                 if (!corrected)
 757                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 758                                                 EXTENT_DAMAGED, GFP_NOFS);
 759         }
 760
 761 out:
 762         if (page)
 763                 put_page(page);
 764
 765         iput(inode);
 766
 767         if (ret < 0)
 768                 return ret;
 769
 770         if (ret == 0 && corrected) {
 771                 /*
 772                  * we only need to call readpage for one of the inodes belonging
 773                  * to this extent. so make iterate_extent_inodes stop
 774                  */
 775                 return 1;
 776         }
 777
 778         return -EIO;
 779 }
 780
 781 static void scrub_fixup_nodatasum(struct btrfs_work *work)
 782 {
 783         int ret;
 784         struct scrub_fixup_nodatasum *fixup;
 785         struct scrub_ctx *sctx;
 786         struct btrfs_trans_handle *trans = NULL;
 787         struct btrfs_path *path;
 788         int uncorrectable = 0;
 789
 790         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
 791         sctx = fixup->sctx;
 792
 793         path = btrfs_alloc_path();
 794         if (!path) {
 795                 spin_lock(&sctx->stat_lock);
 796                 ++sctx->stat.malloc_errors;
 797                 spin_unlock(&sctx->stat_lock);
 798                 uncorrectable = 1;
 799                 goto out;
 800         }
 801
 802         trans = btrfs_join_transaction(fixup->root);
 803         if (IS_ERR(trans)) {
 804                 uncorrectable = 1;
 805                 goto out;
 806         }
 807
 808         /*
 809          * the idea is to trigger a regular read through the standard path. we
 810          * read a page from the (failed) logical address by specifying the
 811          * corresponding copynum of the failed sector. thus, that readpage is
 812          * expected to fail.
 813          * that is the point where on-the-fly error correction will kick in
 814          * (once it's finished) and rewrite the failed sector if a good copy
 815          * can be found.
 816          */
 817         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
 818                                                 path, scrub_fixup_readpage,
 819                                                 fixup);
 820         if (ret < 0) {
 821                 uncorrectable = 1;
 822                 goto out;
 823         }
 824         WARN_ON(ret != 1);
 825
 826         spin_lock(&sctx->stat_lock);
 827         ++sctx->stat.corrected_errors;
 828         spin_unlock(&sctx->stat_lock);
 829
 830 out:
 831         if (trans && !IS_ERR(trans))
 832                 btrfs_end_transaction(trans, fixup->root);
 833         if (uncorrectable) {
 834                 spin_lock(&sctx->stat_lock);
 835                 ++sctx->stat.uncorrectable_errors;
 836                 spin_unlock(&sctx->stat_lock);
 837                 btrfs_dev_replace_stats_inc(
 838                         &sctx->dev_root->fs_info->dev_replace.
 839                         num_uncorrectable_read_errors);
 840                 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
 841                     "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 842                         fixup->logical, rcu_str_deref(fixup->dev->name));
 843         }
 844
 845         btrfs_free_path(path);
 846         kfree(fixup);
 847
 848         scrub_pending_trans_workers_dec(sctx);
 849 }
 850
 851 static inline void scrub_get_recover(struct scrub_recover *recover)
 852 {
 853         atomic_inc(&recover->refs);
 854 }
 855
 856 static inline void scrub_put_recover(struct scrub_recover *recover)
 857 {
 858         if (atomic_dec_and_test(&recover->refs)) {
 859                 kfree(recover->bbio);
 860                 kfree(recover->raid_map);
 861                 kfree(recover);
 862         }
 863 }
 864
 865 /*
 866  * scrub_handle_errored_block gets called when either verification of the
 867  * pages failed or the bio failed to read, e.g. with EIO. In the latter
 868  * case, this function handles all pages in the bio, even though only one
 869  * may be bad.
 870  * The goal of this function is to repair the errored block by using the
 871  * contents of one of the mirrors.
      *
      * Outline:
      *  - an errored super block is only counted (stat.super_errors);
      *  - otherwise one scrub_block per mirror is set up and the failed
      *    mirror is read again; if the error does not show up this time it
      *    is counted as an unverified error;
      *  - otherwise the error is classified and counted and, unless the scrub
      *    is read-only, a repair is attempted: data without checksum is
      *    handed to the scrub_fixup_nodatasum() worker, everything else is
      *    repaired from a completely good mirror or, failing that, page by
      *    page from mirrors that had no I/O error on the page in question.
      *
      * Always returns 0. The outcome is reported through the counters in
      * sctx->stat and through sblock_to_check->data_corrected.
 872  */
 873 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 874 {
 875         struct scrub_ctx *sctx = sblock_to_check->sctx;
 876         struct btrfs_device *dev;
 877         struct btrfs_fs_info *fs_info;
 878         u64 length;
 879         u64 logical;
 880         u64 generation;
 881         unsigned int failed_mirror_index;
 882         unsigned int is_metadata;
 883         unsigned int have_csum;
 884         u8 *csum;
 885         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 886         struct scrub_block *sblock_bad;
 887         int ret;
 888         int mirror_index;
 889         int page_num;
 890         int success;
             /* one rate limit shared by all scrub_print_warning() calls below */
 891         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 892                                       DEFAULT_RATELIMIT_BURST);
 893
 894         BUG_ON(sblock_to_check->page_count < 1);
 895         fs_info = sctx->dev_root->fs_info;
 896         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
 897                 /*
 898                  * if we find an error in a super block, we just report it.
 899                  * They will get written with the next transaction commit
 900                  * anyway
 901                  */
 902                 spin_lock(&sctx->stat_lock);
 903                 ++sctx->stat.super_errors;
 904                 spin_unlock(&sctx->stat_lock);
 905                 return 0;
 906         }
             /* everything describing the block is taken from its first page */
 907         length = sblock_to_check->page_count * PAGE_SIZE;
 908         logical = sblock_to_check->pagev[0]->logical;
 909         generation = sblock_to_check->pagev[0]->generation;
 910         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
 911         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
 912         is_metadata = !(sblock_to_check->pagev[0]->flags &
 913                         BTRFS_EXTENT_FLAG_DATA);
 914         have_csum = sblock_to_check->pagev[0]->have_csum;
 915         csum = sblock_to_check->pagev[0]->csum;
 916         dev = sblock_to_check->pagev[0]->dev;
 917
             /*
              * dev-replace of data without checksum: no recheck is done and
              * nothing is allocated; sblocks_for_recheck is set to NULL so
              * that the cleanup at "out" skips it.
              * NOTE(review): this jump lands on WARN_ON(sctx->is_dev_replace)
              * at nodatasum_case with is_dev_replace set - confirm that this
              * is intended.
              */
 918         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 919                 sblocks_for_recheck = NULL;
 920                 goto nodatasum_case;
 921         }
 922
 923         /*
 924          * read all mirrors one after the other. This includes to
 925          * re-read the extent or metadata block that failed (that was
 926          * the cause that this fixup code is called) another time,
 927          * page by page this time in order to know which pages
 928          * caused I/O errors and which ones are good (for all mirrors).
 929          * It is the goal to handle the situation when more than one
 930          * mirror contains I/O errors, but the errors do not
 931          * overlap, i.e. the data can be repaired by selecting the
 932          * pages from those mirrors without I/O error on the
 933          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
 934          * would be that mirror #1 has an I/O error on the first page,
 935          * the second page is good, and mirror #2 has an I/O error on
 936          * the second page, but the first page is good.
 937          * Then the first page of the first mirror can be repaired by
 938          * taking the first page of the second mirror, and the
 939          * second page of the second mirror can be repaired by
 940          * copying the contents of the 2nd page of the 1st mirror.
 941          * One more note: if the pages of one mirror contain I/O
 942          * errors, the checksum cannot be verified. In order to get
 943          * the best data for repairing, the first attempt is to find
 944          * a mirror without I/O errors and with a validated checksum.
 945          * Only if this is not possible, the pages are picked from
 946          * mirrors with I/O errors without considering the checksum.
 947          * If the latter is the case, at the end, the checksum of the
 948          * repaired area is verified in order to correctly maintain
 949          * the statistics.
 950          */
 951
 952         sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
 953                                      sizeof(*sblocks_for_recheck),
 954                                      GFP_NOFS);
 955         if (!sblocks_for_recheck) {
 956                 spin_lock(&sctx->stat_lock);
 957                 sctx->stat.malloc_errors++;
 958                 sctx->stat.read_errors++;
 959                 sctx->stat.uncorrectable_errors++;
 960                 spin_unlock(&sctx->stat_lock);
 961                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 962                 goto out;
 963         }
 964
 965         /* setup the context, map the logical blocks and alloc the pages */
 966         ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 967                                         logical, sblocks_for_recheck);
 968         if (ret) {
 969                 spin_lock(&sctx->stat_lock);
 970                 sctx->stat.read_errors++;
 971                 sctx->stat.uncorrectable_errors++;
 972                 spin_unlock(&sctx->stat_lock);
 973                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 974                 goto out;
 975         }
 976         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 977         sblock_bad = sblocks_for_recheck + failed_mirror_index;
 978
 979         /* build and submit the bios for the failed mirror, check checksums */
 980         scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
 981                             csum, generation, sctx->csum_size, 1);
 982
 983         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 984             sblock_bad->no_io_error_seen) {
 985                 /*
 986                  * the error disappeared after reading page by page, or
 987                  * the area was part of a huge bio and other parts of the
 988                  * bio caused I/O errors, or the block layer merged several
 989                  * read requests into one and the error is caused by a
 990                  * different bio (usually one of the two latter cases is
 991                  * the cause)
 992                  */
 993                 spin_lock(&sctx->stat_lock);
 994                 sctx->stat.unverified_errors++;
 995                 sblock_to_check->data_corrected = 1;
 996                 spin_unlock(&sctx->stat_lock);
 997
 998                 if (sctx->is_dev_replace)
 999                         scrub_write_block_to_dev_replace(sblock_bad);
1000                 goto out;
1001         }
1002
             /*
              * classify the confirmed error: an I/O error takes precedence
              * over a checksum error, which takes precedence over a header
              * error. Each class has its own scrub and device counters.
              */
1003         if (!sblock_bad->no_io_error_seen) {
1004                 spin_lock(&sctx->stat_lock);
1005                 sctx->stat.read_errors++;
1006                 spin_unlock(&sctx->stat_lock);
1007                 if (__ratelimit(&_rs))
1008                         scrub_print_warning("i/o error", sblock_to_check);
1009                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1010         } else if (sblock_bad->checksum_error) {
1011                 spin_lock(&sctx->stat_lock);
1012                 sctx->stat.csum_errors++;
1013                 spin_unlock(&sctx->stat_lock);
1014                 if (__ratelimit(&_rs))
1015                         scrub_print_warning("checksum error", sblock_to_check);
1016                 btrfs_dev_stat_inc_and_print(dev,
1017                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1018         } else if (sblock_bad->header_error) {
1019                 spin_lock(&sctx->stat_lock);
1020                 sctx->stat.verify_errors++;
1021                 spin_unlock(&sctx->stat_lock);
1022                 if (__ratelimit(&_rs))
1023                         scrub_print_warning("checksum/header error",
1024                                             sblock_to_check);
1025                 if (sblock_bad->generation_error)
1026                         btrfs_dev_stat_inc_and_print(dev,
1027                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1028                 else
1029                         btrfs_dev_stat_inc_and_print(dev,
1030                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1031         }
1032
             /*
              * read-only scrub: the error has been counted and reported
              * above, no repair is attempted
              */
1033         if (sctx->readonly) {
1034                 ASSERT(!sctx->is_dev_replace);
1035                 goto out;
1036         }
1037
1038         if (!is_metadata && !have_csum) {
1039                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1040
1041 nodatasum_case:
1042                 WARN_ON(sctx->is_dev_replace);
1043
1044                 /*
1045                  * !is_metadata and !have_csum, this means that the data
1046                  * might not be COW'ed, that it might be modified
1047                  * concurrently. The general strategy to work on the
1048                  * commit root does not help in the case when COW is not
1049                  * used.
1050                  */
1051                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1052                 if (!fixup_nodatasum)
1053                         goto did_not_correct_error;
1054                 fixup_nodatasum->sctx = sctx;
1055                 fixup_nodatasum->dev = dev;
1056                 fixup_nodatasum->logical = logical;
1057                 fixup_nodatasum->root = fs_info->extent_root;
1058                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
                     /*
                      * the repair itself is done by scrub_fixup_nodatasum()
                      * on the scrub work queue; the pending-worker count
                      * taken here is dropped by that function
                      */
1059                 scrub_pending_trans_workers_inc(sctx);
1060                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1061                                 scrub_fixup_nodatasum, NULL, NULL);
1062                 btrfs_queue_work(fs_info->scrub_workers,
1063                                  &fixup_nodatasum->work);
1064                 goto out;
1065         }
1066
1067         /*
1068          * now build and submit the bios for the other mirrors, check
1069          * checksums.
1070          * First try to pick the mirror which is completely without I/O
1071          * errors and also does not have a checksum error.
1072          * If one is found, and if a checksum is present, the full block
1073          * that is known to contain an error is rewritten. Afterwards
1074          * the block is known to be corrected.
1075          * If a mirror is found which is completely correct, and no
1076          * checksum is present, only those pages are rewritten that had
1077          * an I/O error in the block to be repaired, since it cannot be
1078          * determined, which copy of the other pages is better (and it
1079          * could happen otherwise that a correct page would be
1080          * overwritten by a bad one).
1081          */
1082         for (mirror_index = 0;
1083              mirror_index < BTRFS_MAX_MIRRORS &&
1084              sblocks_for_recheck[mirror_index].page_count > 0;
1085              mirror_index++) {
1086                 struct scrub_block *sblock_other;
1087
1088                 if (mirror_index == failed_mirror_index)
1089                         continue;
1090                 sblock_other = sblocks_for_recheck + mirror_index;
1091
1092                 /* build and submit the bios, check checksums */
1093                 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1094                                     have_csum, csum, generation,
1095                                     sctx->csum_size, 0);
1096
1097                 if (!sblock_other->header_error &&
1098                     !sblock_other->checksum_error &&
1099                     sblock_other->no_io_error_seen) {
1100                         if (sctx->is_dev_replace) {
1101                                 scrub_write_block_to_dev_replace(sblock_other);
1102                         } else {
1103                                 int force_write = is_metadata || have_csum;
1104
1105                                 ret = scrub_repair_block_from_good_copy(
1106                                                 sblock_bad, sblock_other,
1107                                                 force_write);
1108                         }
                             /*
                              * ret is only assigned in the non-dev-replace
                              * branch above; in the dev-replace branch it
                              * still holds the 0 returned by
                              * scrub_setup_recheck_block()
                              */
1109                         if (0 == ret)
1110                                 goto corrected_error;
1111                 }
1112         }
1113
1114         /*
1115          * for dev_replace, pick good pages and write to the target device.
1116          */
1117         if (sctx->is_dev_replace) {
1118                 success = 1;
1119                 for (page_num = 0; page_num < sblock_bad->page_count;
1120                      page_num++) {
1121                         int sub_success;
1122
1123                         sub_success = 0;
1124                         for (mirror_index = 0;
1125                              mirror_index < BTRFS_MAX_MIRRORS &&
1126                              sblocks_for_recheck[mirror_index].page_count > 0;
1127                              mirror_index++) {
1128                                 struct scrub_block *sblock_other =
1129                                         sblocks_for_recheck + mirror_index;
1130                                 struct scrub_page *page_other =
1131                                         sblock_other->pagev[page_num];
1132
1133                                 if (!page_other->io_error) {
1134                                         ret = scrub_write_page_to_dev_replace(
1135                                                         sblock_other, page_num);
1136                                         if (ret == 0) {
1137                                                 /* succeeded for this page */
1138                                                 sub_success = 1;
1139                                                 break;
1140                                         } else {
1141                                                 btrfs_dev_replace_stats_inc(
1142                                                         &sctx->dev_root->
1143                                                         fs_info->dev_replace.
1144                                                         num_write_errors);
1145                                         }
1146                                 }
1147                         }
1148
1149                         if (!sub_success) {
1150                                 /*
1151                                  * did not find a mirror to fetch the page
1152                                  * from. scrub_write_page_to_dev_replace()
1153                                  * handles this case (page->io_error), by
1154                                  * filling the block with zeros before
1155                                  * submitting the write request
1156                                  */
1157                                 success = 0;
1158                                 ret = scrub_write_page_to_dev_replace(
1159                                                 sblock_bad, page_num);
1160                                 if (ret)
1161                                         btrfs_dev_replace_stats_inc(
1162                                                 &sctx->dev_root->fs_info->
1163                                                 dev_replace.num_write_errors);
1164                         }
1165                 }
1166
                     /*
                      * success is set above but not evaluated before
                      * leaving on this path
                      */
1167                 goto out;
1168         }
1169
1170         /*
1171          * for regular scrub, repair those pages that are errored.
1172          * In case of I/O errors in the area that is supposed to be
1173          * repaired, continue by picking good copies of those pages.
1174          * Select the good pages from mirrors to rewrite bad pages from
1175          * the area to fix. Afterwards verify the checksum of the block
1176          * that is supposed to be repaired. This verification step is
1177          * only done for the purpose of statistic counting and for the
1178          * final scrub report, whether errors remain.
1179          * A perfect algorithm could make use of the checksum and try
1180          * all possible combinations of pages from the different mirrors
1181          * until the checksum verification succeeds. For example, when
1182          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1183          * of mirror #2 is readable but the final checksum test fails,
1184          * then the 2nd page of mirror #3 could be tried, whether now
1185          * the final checksum succeeds. But this would be a rare
1186          * exception and is therefore not implemented. At least it is
1187          * avoided that the good copy is overwritten.
1188          * A more useful improvement would be to pick the sectors
1189          * without I/O error based on sector sizes (512 bytes on legacy
1190          * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1191          * mirror could be repaired by taking 512 byte of a different
1192          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1193          * area are unreadable.
1194          */
1195
1196         /* can only fix I/O errors from here on */
1197         if (sblock_bad->no_io_error_seen)
1198                 goto did_not_correct_error;
1199
1200         success = 1;
1201         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1202                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1203
1204                 if (!page_bad->io_error)
1205                         continue;
1206
1207                 for (mirror_index = 0;
1208                      mirror_index < BTRFS_MAX_MIRRORS &&
1209                      sblocks_for_recheck[mirror_index].page_count > 0;
1210                      mirror_index++) {
1211                         struct scrub_block *sblock_other = sblocks_for_recheck +
1212                                                            mirror_index;
1213                         struct scrub_page *page_other = sblock_other->pagev[
1214                                                         page_num];
1215
1216                         if (!page_other->io_error) {
1217                                 ret = scrub_repair_page_from_good_copy(
1218                                         sblock_bad, sblock_other, page_num, 0);
1219                                 if (0 == ret) {
1220                                         page_bad->io_error = 0;
1221                                         break; /* succeeded for this page */
1222                                 }
1223                         }
1224                 }
1225
1226                 if (page_bad->io_error) {
1227                         /* did not find a mirror to copy the page from */
1228                         success = 0;
1229                 }
1230         }
1231
1232         if (success) {
1233                 if (is_metadata || have_csum) {
1234                         /*
1235                          * need to verify the checksum now that all
1236                          * sectors on disk are repaired (the write
1237                          * request for data to be repaired is on its way).
1238                          * Just be lazy and use scrub_recheck_block()
1239                          * which re-reads the data before the checksum
1240                          * is verified, but most likely the data comes out
1241                          * of the page cache.
1242                          */
1243                         scrub_recheck_block(fs_info, sblock_bad,
1244                                             is_metadata, have_csum, csum,
1245                                             generation, sctx->csum_size, 1);
1246                         if (!sblock_bad->header_error &&
1247                             !sblock_bad->checksum_error &&
1248                             sblock_bad->no_io_error_seen)
1249                                 goto corrected_error;
1250                         else
1251                                 goto did_not_correct_error;
1252                 } else {
                             /* also the target of the gotos further up */
1253 corrected_error:
1254                         spin_lock(&sctx->stat_lock);
1255                         sctx->stat.corrected_errors++;
1256                         sblock_to_check->data_corrected = 1;
1257                         spin_unlock(&sctx->stat_lock);
1258                         printk_ratelimited_in_rcu(KERN_ERR
1259                                 "BTRFS: fixed up error at logical %llu on dev %s\n",
1260                                 logical, rcu_str_deref(dev->name));
1261                 }
1262         } else {
                     /* also the target of the gotos further up */
1263 did_not_correct_error:
1264                 spin_lock(&sctx->stat_lock);
1265                 sctx->stat.uncorrectable_errors++;
1266                 spin_unlock(&sctx->stat_lock);
1267                 printk_ratelimited_in_rcu(KERN_ERR
1268                         "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1269                         logical, rcu_str_deref(dev->name));
1270         }
1271
1272 out:
             /*
              * drop all pages of all recheck blocks together with the
              * scrub_recover reference each of them holds.
              * sblocks_for_recheck is NULL when its allocation failed or when
              * the early nodatasum jump was taken.
              */
1273         if (sblocks_for_recheck) {
1274                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1275                      mirror_index++) {
1276                         struct scrub_block *sblock = sblocks_for_recheck +
1277                                                      mirror_index;
1278                         struct scrub_recover *recover;
1279                         int page_index;
1280
1281                         for (page_index = 0; page_index < sblock->page_count;
1282                              page_index++) {
1283                                 sblock->pagev[page_index]->sblock = NULL;
1284                                 recover = sblock->pagev[page_index]->recover;
1285                                 if (recover) {
1286                                         scrub_put_recover(recover);
1287                                         sblock->pagev[page_index]->recover =
1288                                                                         NULL;
1289                                 }
1290                                 scrub_page_put(sblock->pagev[page_index]);
1291                         }
1292                 }
1293                 kfree(sblocks_for_recheck);
1294         }
1295
1296         return 0;
1297 }
1298
1299 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
1300 {
1301         if (raid_map) {
1302                 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
1303
1304                 if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
1305                         return 3;
1306                 else
1307                         return 2;
1308         } else {
1309                 return (int)bbio->num_stripes;
1310         }
1311 }
1312
1313 static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1314                                                  u64 mapped_length,
1315                                                  int nstripes, int mirror,
1316                                                  int *stripe_index,
1317                                                  u64 *stripe_offset)
1318 {
1319         int i;
1320
1321         if (raid_map) {
1322                 /* RAID5/6 */
1323                 for (i = 0; i < nstripes; i++) {
1324                         if (raid_map[i] == RAID6_Q_STRIPE ||
1325                             raid_map[i] == RAID5_P_STRIPE)
1326                                 continue;
1327
1328                         if (logical >= raid_map[i] &&
1329                             logical < raid_map[i] + mapped_length)
1330                                 break;
1331                 }
1332
1333                 *stripe_index = i;
1334                 *stripe_offset = logical - raid_map[i];
1335         } else {
1336                 /* The other RAID type */
1337                 *stripe_index = mirror;
1338                 *stripe_offset = 0;
1339         }
1340 }
1341
1342 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1343                                      struct btrfs_fs_info *fs_info,
1344                                      struct scrub_block *original_sblock,
1345                                      u64 length, u64 logical,
1346                                      struct scrub_block *sblocks_for_recheck)
1347 {
1348         struct scrub_recover *recover;
1349         struct btrfs_bio *bbio;
1350         u64 *raid_map;
1351         u64 sublen;
1352         u64 mapped_length;
1353         u64 stripe_offset;
1354         int stripe_index;
1355         int page_index;
1356         int mirror_index;
1357         int nmirrors;
1358         int ret;
1359
1360         /*
1361          * note: the two members ref_count and outstanding_pages
1362          * are not used (and not set) in the blocks that are used for
1363          * the recheck procedure
1364          */
1365
1366         page_index = 0;
1367         while (length > 0) {
1368                 sublen = min_t(u64, length, PAGE_SIZE);
1369                 mapped_length = sublen;
1370                 bbio = NULL;
1371                 raid_map = NULL;
1372
1373                 /*
1374                  * with a length of PAGE_SIZE, each returned stripe
1375                  * represents one mirror
1376                  */
1377                 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1378                                        &mapped_length, &bbio, 0, &raid_map);
1379                 if (ret || !bbio || mapped_length < sublen) {
1380                         kfree(bbio);
1381                         kfree(raid_map);
1382                         return -EIO;
1383                 }
1384
1385                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1386                 if (!recover) {
1387                         kfree(bbio);
1388                         kfree(raid_map);
1389                         return -ENOMEM;
1390                 }
1391
1392                 atomic_set(&recover->refs, 1);
1393                 recover->bbio = bbio;
1394                 recover->raid_map = raid_map;
1395                 recover->map_length = mapped_length;
1396
1397                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1398
1399                 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
1400                 for (mirror_index = 0; mirror_index < nmirrors;
1401                      mirror_index++) {
1402                         struct scrub_block *sblock;
1403                         struct scrub_page *page;
1404
1405                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1406                                 continue;
1407
1408                         sblock = sblocks_for_recheck + mirror_index;
1409                         sblock->sctx = sctx;
1410                         page = kzalloc(sizeof(*page), GFP_NOFS);
1411                         if (!page) {
1412 leave_nomem:
1413                                 spin_lock(&sctx->stat_lock);
1414                                 sctx->stat.malloc_errors++;
1415                                 spin_unlock(&sctx->stat_lock);
1416                                 scrub_put_recover(recover);
1417                                 return -ENOMEM;
1418                         }
1419                         scrub_page_get(page);
1420                         sblock->pagev[page_index] = page;
1421                         page->logical = logical;
1422
1423                         scrub_stripe_index_and_offset(logical, raid_map,
1424                                                       mapped_length,
1425                                                       bbio->num_stripes -
1426                                                       bbio->num_tgtdevs,
1427                                                       mirror_index,
1428                                                       &stripe_index,
1429                                                       &stripe_offset);
1430                         page->physical = bbio->stripes[stripe_index].physical +
1431                                          stripe_offset;
1432                         page->dev = bbio->stripes[stripe_index].dev;
1433
1434                         BUG_ON(page_index >= original_sblock->page_count);
1435                         page->physical_for_dev_replace =
1436                                 original_sblock->pagev[page_index]->
1437                                 physical_for_dev_replace;
1438                         /* for missing devices, dev->bdev is NULL */
1439                         page->mirror_num = mirror_index + 1;
1440                         sblock->page_count++;
1441                         page->page = alloc_page(GFP_NOFS);
1442                         if (!page->page)
1443                                 goto leave_nomem;
1444
1445                         scrub_get_recover(recover);
1446                         page->recover = recover;
1447                 }
1448                 scrub_put_recover(recover);
1449                 length -= sublen;
1450                 logical += sublen;
1451                 page_index++;
1452         }
1453
1454         return 0;
1455 }
1456
1457 struct scrub_bio_ret {
1458         struct completion event;
1459         int error;
1460 };
1461
1462 static void scrub_bio_wait_endio(struct bio *bio, int error)
1463 {
1464         struct scrub_bio_ret *ret = bio->bi_private;
1465
1466         ret->error = error;
1467         complete(&ret->event);
1468 }
1469
1470 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1471 {
1472         return page->recover && page->recover->raid_map;
1473 }
1474
1475 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1476                                         struct bio *bio,
1477                                         struct scrub_page *page)
1478 {
1479         struct scrub_bio_ret done;
1480         int ret;
1481
1482         init_completion(&done.event);
1483         done.error = 0;
1484         bio->bi_iter.bi_sector = page->logical >> 9;
1485         bio->bi_private = &done;
1486         bio->bi_end_io = scrub_bio_wait_endio;
1487
1488         ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1489                                     page->recover->raid_map,
1490                                     page->recover->map_length,
1491                                     page->mirror_num, 0);
1492         if (ret)
1493                 return ret;
1494
1495         wait_for_completion(&done.event);
1496         if (done.error)
1497                 return -EIO;
1498
1499         return 0;
1500 }
1501
1502 /*
1503  * this function will check the on disk data for checksum errors, header
1504  * errors and read I/O errors. If any I/O errors happen, the exact pages
1505  * which are errored are marked as being bad. The goal is to enable scrub
1506  * to take those pages that are not errored from all the mirrors so that
1507  * the pages that are errored in the just handled mirror can be repaired.
1508  */
1509 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1510                                 struct scrub_block *sblock, int is_metadata,
1511                                 int have_csum, u8 *csum, u64 generation,
1512                                 u16 csum_size, int retry_failed_mirror)
1513 {
1514         int page_num;
1515
1516         sblock->no_io_error_seen = 1;
1517         sblock->header_error = 0;
1518         sblock->checksum_error = 0;
1519
1520         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1521                 struct bio *bio;
1522                 struct scrub_page *page = sblock->pagev[page_num];
1523
1524                 if (page->dev->bdev == NULL) {
1525                         page->io_error = 1;
1526                         sblock->no_io_error_seen = 0;
1527                         continue;
1528                 }
1529
1530                 WARN_ON(!page->page);
1531                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1532                 if (!bio) {
1533                         page->io_error = 1;
1534                         sblock->no_io_error_seen = 0;
1535                         continue;
1536                 }
1537                 bio->bi_bdev = page->dev->bdev;
1538
1539                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1540                 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1541                         if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1542                                 sblock->no_io_error_seen = 0;
1543                 } else {
1544                         bio->bi_iter.bi_sector = page->physical >> 9;
1545
1546                         if (btrfsic_submit_bio_wait(READ, bio))
1547                                 sblock->no_io_error_seen = 0;
1548                 }
1549
1550                 bio_put(bio);
1551         }
1552
1553         if (sblock->no_io_error_seen)
1554                 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1555                                              have_csum, csum, generation,
1556                                              csum_size);
1557
1558         return;
1559 }
1560
1561 static inline int scrub_check_fsid(u8 fsid[],
1562                                    struct scrub_page *spage)
1563 {
1564         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1565         int ret;
1566
1567         ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1568         return !ret;
1569 }
1570
1571 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1572                                          struct scrub_block *sblock,
1573                                          int is_metadata, int have_csum,
1574                                          const u8 *csum, u64 generation,
1575                                          u16 csum_size)
1576 {
1577         int page_num;
1578         u8 calculated_csum[BTRFS_CSUM_SIZE];
1579         u32 crc = ~(u32)0;
1580         void *mapped_buffer;
1581
1582         WARN_ON(!sblock->pagev[0]->page);
1583         if (is_metadata) {
1584                 struct btrfs_header *h;
1585
1586                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1587                 h = (struct btrfs_header *)mapped_buffer;
1588
1589                 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1590                     !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1591                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1592                            BTRFS_UUID_SIZE)) {
1593                         sblock->header_error = 1;
1594                 } else if (generation != btrfs_stack_header_generation(h)) {
1595                         sblock->header_error = 1;
1596                         sblock->generation_error = 1;
1597                 }
1598                 csum = h->csum;
1599         } else {
1600                 if (!have_csum)
1601                         return;
1602
1603                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1604         }
1605
1606         for (page_num = 0;;) {
1607                 if (page_num == 0 && is_metadata)
1608                         crc = btrfs_csum_data(
1609                                 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1610                                 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1611                 else
1612                         crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1613
1614                 kunmap_atomic(mapped_buffer);
1615                 page_num++;
1616                 if (page_num >= sblock->page_count)
1617                         break;
1618                 WARN_ON(!sblock->pagev[page_num]->page);
1619
1620                 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1621         }
1622
1623         btrfs_csum_final(crc, calculated_csum);
1624         if (memcmp(calculated_csum, csum, csum_size))
1625                 sblock->checksum_error = 1;
1626 }
1627
1628 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1629                                              struct scrub_block *sblock_good,
1630                                              int force_write)
1631 {
1632         int page_num;
1633         int ret = 0;
1634
1635         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1636                 int ret_sub;
1637
1638                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1639                                                            sblock_good,
1640                                                            page_num,
1641                                                            force_write);
1642                 if (ret_sub)
1643                         ret = ret_sub;
1644         }
1645
1646         return ret;
1647 }
1648
1649 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1650                                             struct scrub_block *sblock_good,
1651                                             int page_num, int force_write)
1652 {
1653         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1654         struct scrub_page *page_good = sblock_good->pagev[page_num];
1655
1656         BUG_ON(page_bad->page == NULL);
1657         BUG_ON(page_good->page == NULL);
1658         if (force_write || sblock_bad->header_error ||
1659             sblock_bad->checksum_error || page_bad->io_error) {
1660                 struct bio *bio;
1661                 int ret;
1662
1663                 if (!page_bad->dev->bdev) {
1664                         printk_ratelimited(KERN_WARNING "BTRFS: "
1665                                 "scrub_repair_page_from_good_copy(bdev == NULL) "
1666                                 "is unexpected!\n");
1667                         return -EIO;
1668                 }
1669
1670                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1671                 if (!bio)
1672                         return -EIO;
1673                 bio->bi_bdev = page_bad->dev->bdev;
1674                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1675
1676                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1677                 if (PAGE_SIZE != ret) {
1678                         bio_put(bio);
1679                         return -EIO;
1680                 }
1681
1682                 if (btrfsic_submit_bio_wait(WRITE, bio)) {
1683                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1684                                 BTRFS_DEV_STAT_WRITE_ERRS);
1685                         btrfs_dev_replace_stats_inc(
1686                                 &sblock_bad->sctx->dev_root->fs_info->
1687                                 dev_replace.num_write_errors);
1688                         bio_put(bio);
1689                         return -EIO;
1690                 }
1691                 bio_put(bio);
1692         }
1693
1694         return 0;
1695 }
1696
1697 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1698 {
1699         int page_num;
1700
1701         /*
1702          * This block is used for the check of the parity on the source device,
1703          * so the data needn't be written into the destination device.
1704          */
1705         if (sblock->sparity)
1706                 return;
1707
1708         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1709                 int ret;
1710
1711                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1712                 if (ret)
1713                         btrfs_dev_replace_stats_inc(
1714                                 &sblock->sctx->dev_root->fs_info->dev_replace.
1715                                 num_write_errors);
1716         }
1717 }
1718
1719 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1720                                            int page_num)
1721 {
1722         struct scrub_page *spage = sblock->pagev[page_num];
1723
1724         BUG_ON(spage->page == NULL);
1725         if (spage->io_error) {
1726                 void *mapped_buffer = kmap_atomic(spage->page);
1727
1728                 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1729                 flush_dcache_page(spage->page);
1730                 kunmap_atomic(mapped_buffer);
1731         }
1732         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1733 }
1734
1735 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1736                                     struct scrub_page *spage)
1737 {
1738         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1739         struct scrub_bio *sbio;
1740         int ret;
1741
1742         mutex_lock(&wr_ctx->wr_lock);
1743 again:
1744         if (!wr_ctx->wr_curr_bio) {
1745                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1746                                               GFP_NOFS);
1747                 if (!wr_ctx->wr_curr_bio) {
1748                         mutex_unlock(&wr_ctx->wr_lock);
1749                         return -ENOMEM;
1750                 }
1751                 wr_ctx->wr_curr_bio->sctx = sctx;
1752                 wr_ctx->wr_curr_bio->page_count = 0;
1753         }
1754         sbio = wr_ctx->wr_curr_bio;
1755         if (sbio->page_count == 0) {
1756                 struct bio *bio;
1757
1758                 sbio->physical = spage->physical_for_dev_replace;
1759                 sbio->logical = spage->logical;
1760                 sbio->dev = wr_ctx->tgtdev;
1761                 bio = sbio->bio;
1762                 if (!bio) {
1763                         bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1764                         if (!bio) {
1765                                 mutex_unlock(&wr_ctx->wr_lock);
1766                                 return -ENOMEM;
1767                         }
1768                         sbio->bio = bio;
1769                 }
1770
1771                 bio->bi_private = sbio;
1772                 bio->bi_end_io = scrub_wr_bio_end_io;
1773                 bio->bi_bdev = sbio->dev->bdev;
1774                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1775                 sbio->err = 0;
1776         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1777                    spage->physical_for_dev_replace ||
1778                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1779                    spage->logical) {
1780                 scrub_wr_submit(sctx);
1781                 goto again;
1782         }
1783
1784         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1785         if (ret != PAGE_SIZE) {
1786                 if (sbio->page_count < 1) {
1787                         bio_put(sbio->bio);
1788                         sbio->bio = NULL;
1789                         mutex_unlock(&wr_ctx->wr_lock);
1790                         return -EIO;
1791                 }
1792                 scrub_wr_submit(sctx);
1793                 goto again;
1794         }
1795
1796         sbio->pagev[sbio->page_count] = spage;
1797         scrub_page_get(spage);
1798         sbio->page_count++;
1799         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1800                 scrub_wr_submit(sctx);
1801         mutex_unlock(&wr_ctx->wr_lock);
1802
1803         return 0;
1804 }
1805
1806 static void scrub_wr_submit(struct scrub_ctx *sctx)
1807 {
1808         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1809         struct scrub_bio *sbio;
1810
1811         if (!wr_ctx->wr_curr_bio)
1812                 return;
1813
1814         sbio = wr_ctx->wr_curr_bio;
1815         wr_ctx->wr_curr_bio = NULL;
1816         WARN_ON(!sbio->bio->bi_bdev);
1817         scrub_pending_bio_inc(sctx);
1818         /* process all writes in a single worker thread. Then the block layer
1819          * orders the requests before sending them to the driver which
1820          * doubled the write performance on spinning disks when measured
1821          * with Linux 3.5 */
1822         btrfsic_submit_bio(WRITE, sbio->bio);
1823 }
1824
1825 static void scrub_wr_bio_end_io(struct bio *bio, int err)
1826 {
1827         struct scrub_bio *sbio = bio->bi_private;
1828         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1829
1830         sbio->err = err;
1831         sbio->bio = bio;
1832
1833         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1834                          scrub_wr_bio_end_io_worker, NULL, NULL);
1835         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1836 }
1837
1838 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1839 {
1840         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1841         struct scrub_ctx *sctx = sbio->sctx;
1842         int i;
1843
1844         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1845         if (sbio->err) {
1846                 struct btrfs_dev_replace *dev_replace =
1847                         &sbio->sctx->dev_root->fs_info->dev_replace;
1848
1849                 for (i = 0; i < sbio->page_count; i++) {
1850                         struct scrub_page *spage = sbio->pagev[i];
1851
1852                         spage->io_error = 1;
1853                         btrfs_dev_replace_stats_inc(&dev_replace->
1854                                                     num_write_errors);
1855                 }
1856         }
1857
1858         for (i = 0; i < sbio->page_count; i++)
1859                 scrub_page_put(sbio->pagev[i]);
1860
1861         bio_put(sbio->bio);
1862         kfree(sbio);
1863         scrub_pending_bio_dec(sctx);
1864 }
1865
1866 static int scrub_checksum(struct scrub_block *sblock)
1867 {
1868         u64 flags;
1869         int ret;
1870
1871         WARN_ON(sblock->page_count < 1);
1872         flags = sblock->pagev[0]->flags;
1873         ret = 0;
1874         if (flags & BTRFS_EXTENT_FLAG_DATA)
1875                 ret = scrub_checksum_data(sblock);
1876         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1877                 ret = scrub_checksum_tree_block(sblock);
1878         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1879                 (void)scrub_checksum_super(sblock);
1880         else
1881                 WARN_ON(1);
1882         if (ret)
1883                 scrub_handle_errored_block(sblock);
1884
1885         return ret;
1886 }
1887
1888 static int scrub_checksum_data(struct scrub_block *sblock)
1889 {
1890         struct scrub_ctx *sctx = sblock->sctx;
1891         u8 csum[BTRFS_CSUM_SIZE];
1892         u8 *on_disk_csum;
1893         struct page *page;
1894         void *buffer;
1895         u32 crc = ~(u32)0;
1896         int fail = 0;
1897         u64 len;
1898         int index;
1899
1900         BUG_ON(sblock->page_count < 1);
1901         if (!sblock->pagev[0]->have_csum)
1902                 return 0;
1903
1904         on_disk_csum = sblock->pagev[0]->csum;
1905         page = sblock->pagev[0]->page;
1906         buffer = kmap_atomic(page);
1907
1908         len = sctx->sectorsize;
1909         index = 0;
1910         for (;;) {
1911                 u64 l = min_t(u64, len, PAGE_SIZE);
1912
1913                 crc = btrfs_csum_data(buffer, crc, l);
1914                 kunmap_atomic(buffer);
1915                 len -= l;
1916                 if (len == 0)
1917                         break;
1918                 index++;
1919                 BUG_ON(index >= sblock->page_count);
1920                 BUG_ON(!sblock->pagev[index]->page);
1921                 page = sblock->pagev[index]->page;
1922                 buffer = kmap_atomic(page);
1923         }
1924
1925         btrfs_csum_final(crc, csum);
1926         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1927                 fail = 1;
1928
1929         return fail;
1930 }
1931
1932 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1933 {
1934         struct scrub_ctx *sctx = sblock->sctx;
1935         struct btrfs_header *h;
1936         struct btrfs_root *root = sctx->dev_root;
1937         struct btrfs_fs_info *fs_info = root->fs_info;
1938         u8 calculated_csum[BTRFS_CSUM_SIZE];
1939         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1940         struct page *page;
1941         void *mapped_buffer;
1942         u64 mapped_size;
1943         void *p;
1944         u32 crc = ~(u32)0;
1945         int fail = 0;
1946         int crc_fail = 0;
1947         u64 len;
1948         int index;
1949
1950         BUG_ON(sblock->page_count < 1);
1951         page = sblock->pagev[0]->page;
1952         mapped_buffer = kmap_atomic(page);
1953         h = (struct btrfs_header *)mapped_buffer;
1954         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1955
1956         /*
1957          * we don't use the getter functions here, as we
1958          * a) don't have an extent buffer and
1959          * b) the page is already kmapped
1960          */
1961
1962         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1963                 ++fail;
1964
1965         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1966                 ++fail;
1967
1968         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1969                 ++fail;
1970
1971         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1972                    BTRFS_UUID_SIZE))
1973                 ++fail;
1974
1975         len = sctx->nodesize - BTRFS_CSUM_SIZE;
1976         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1977         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1978         index = 0;
1979         for (;;) {
1980                 u64 l = min_t(u64, len, mapped_size);
1981
1982                 crc = btrfs_csum_data(p, crc, l);
1983                 kunmap_atomic(mapped_buffer);
1984                 len -= l;
1985                 if (len == 0)
1986                         break;
1987                 index++;
1988                 BUG_ON(index >= sblock->page_count);
1989                 BUG_ON(!sblock->pagev[index]->page);
1990                 page = sblock->pagev[index]->page;
1991                 mapped_buffer = kmap_atomic(page);
1992                 mapped_size = PAGE_SIZE;
1993                 p = mapped_buffer;
1994         }
1995
1996         btrfs_csum_final(crc, calculated_csum);
1997         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1998                 ++crc_fail;
1999
2000         return fail || crc_fail;
2001 }
2002
2003 static int scrub_checksum_super(struct scrub_block *sblock)
2004 {
2005         struct btrfs_super_block *s;
2006         struct scrub_ctx *sctx = sblock->sctx;
2007         u8 calculated_csum[BTRFS_CSUM_SIZE];
2008         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2009         struct page *page;
2010         void *mapped_buffer;
2011         u64 mapped_size;
2012         void *p;
2013         u32 crc = ~(u32)0;
2014         int fail_gen = 0;
2015         int fail_cor = 0;
2016         u64 len;
2017         int index;
2018
2019         BUG_ON(sblock->page_count < 1);
2020         page = sblock->pagev[0]->page;
2021         mapped_buffer = kmap_atomic(page);
2022         s = (struct btrfs_super_block *)mapped_buffer;
2023         memcpy(on_disk_csum, s->csum, sctx->csum_size);
2024
2025         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
2026                 ++fail_cor;
2027
2028         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
2029                 ++fail_gen;
2030
2031         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2032                 ++fail_cor;
2033
2034         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2035         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2036         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2037         index = 0;
2038         for (;;) {
2039                 u64 l = min_t(u64, len, mapped_size);
2040
2041                 crc = btrfs_csum_data(p, crc, l);
2042                 kunmap_atomic(mapped_buffer);
2043                 len -= l;
2044                 if (len == 0)
2045                         break;
2046                 index++;
2047                 BUG_ON(index >= sblock->page_count);
2048                 BUG_ON(!sblock->pagev[index]->page);
2049                 page = sblock->pagev[index]->page;
2050                 mapped_buffer = kmap_atomic(page);
2051                 mapped_size = PAGE_SIZE;
2052                 p = mapped_buffer;
2053         }
2054
2055         btrfs_csum_final(crc, calculated_csum);
2056         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2057                 ++fail_cor;
2058
2059         if (fail_cor + fail_gen) {
2060                 /*
2061                  * if we find an error in a super block, we just report it.
2062                  * They will get written with the next transaction commit
2063                  * anyway
2064                  */
2065                 spin_lock(&sctx->stat_lock);
2066                 ++sctx->stat.super_errors;
2067                 spin_unlock(&sctx->stat_lock);
2068                 if (fail_cor)
2069                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2070                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2071                 else
2072                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2073                                 BTRFS_DEV_STAT_GENERATION_ERRS);
2074         }
2075
2076         return fail_cor + fail_gen;
2077 }
2078
2079 static void scrub_block_get(struct scrub_block *sblock)
2080 {
2081         atomic_inc(&sblock->ref_count);
2082 }
2083
2084 static void scrub_block_put(struct scrub_block *sblock)
2085 {
2086         if (atomic_dec_and_test(&sblock->ref_count)) {
2087                 int i;
2088
2089                 if (sblock->sparity)
2090                         scrub_parity_put(sblock->sparity);
2091
2092                 for (i = 0; i < sblock->page_count; i++)
2093                         scrub_page_put(sblock->pagev[i]);
2094                 kfree(sblock);
2095         }
2096 }
2097
2098 static void scrub_page_get(struct scrub_page *spage)
2099 {
2100         atomic_inc(&spage->ref_count);
2101 }
2102
2103 static void scrub_page_put(struct scrub_page *spage)
2104 {
2105         if (atomic_dec_and_test(&spage->ref_count)) {
2106                 if (spage->page)
2107                         __free_page(spage->page);
2108                 kfree(spage);
2109         }
2110 }
2111
2112 static void scrub_submit(struct scrub_ctx *sctx)
2113 {
2114         struct scrub_bio *sbio;
2115
2116         if (sctx->curr == -1)
2117                 return;
2118
2119         sbio = sctx->bios[sctx->curr];
2120         sctx->curr = -1;
2121         scrub_pending_bio_inc(sctx);
2122
2123         if (!sbio->bio->bi_bdev) {
2124                 /*
2125                  * this case should not happen. If btrfs_map_block() is
2126                  * wrong, it could happen for dev-replace operations on
2127                  * missing devices when no mirrors are available, but in
2128                  * this case it should already fail the mount.
2129                  * This case is handled correctly (but _very_ slowly).
2130                  */
2131                 printk_ratelimited(KERN_WARNING
2132                         "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
2133                 bio_endio(sbio->bio, -EIO);
2134         } else {
2135                 btrfsic_submit_bio(READ, sbio->bio);
2136         }
2137 }
2138
2139 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2140                                     struct scrub_page *spage)
2141 {
2142         struct scrub_block *sblock = spage->sblock;
2143         struct scrub_bio *sbio;
2144         int ret;
2145
2146 again:
2147         /*
2148          * grab a fresh bio or wait for one to become available
2149          */
2150         while (sctx->curr == -1) {
2151                 spin_lock(&sctx->list_lock);
2152                 sctx->curr = sctx->first_free;
2153                 if (sctx->curr != -1) {
2154                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2155                         sctx->bios[sctx->curr]->next_free = -1;
2156                         sctx->bios[sctx->curr]->page_count = 0;
2157                         spin_unlock(&sctx->list_lock);
2158                 } else {
2159                         spin_unlock(&sctx->list_lock);
2160                         wait_event(sctx->list_wait, sctx->first_free != -1);
2161                 }
2162         }
2163         sbio = sctx->bios[sctx->curr];
2164         if (sbio->page_count == 0) {
2165                 struct bio *bio;
2166
2167                 sbio->physical = spage->physical;
2168                 sbio->logical = spage->logical;
2169                 sbio->dev = spage->dev;
2170                 bio = sbio->bio;
2171                 if (!bio) {
2172                         bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
2173                         if (!bio)
2174                                 return -ENOMEM;
2175                         sbio->bio = bio;
2176                 }
2177
2178                 bio->bi_private = sbio;
2179                 bio->bi_end_io = scrub_bio_end_io;
2180                 bio->bi_bdev = sbio->dev->bdev;
2181                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2182                 sbio->err = 0;
2183         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2184                    spage->physical ||
2185                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2186                    spage->logical ||
2187                    sbio->dev != spage->dev) {
2188                 scrub_submit(sctx);
2189                 goto again;
2190         }
2191
2192         sbio->pagev[sbio->page_count] = spage;
2193         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2194         if (ret != PAGE_SIZE) {
2195                 if (sbio->page_count < 1) {
2196                         bio_put(sbio->bio);
2197                         sbio->bio = NULL;
2198                         return -EIO;
2199                 }
2200                 scrub_submit(sctx);
2201                 goto again;
2202         }
2203
2204         scrub_block_get(sblock); /* one for the page added to the bio */
2205         atomic_inc(&sblock->outstanding_pages);
2206         sbio->page_count++;
2207         if (sbio->page_count == sctx->pages_per_rd_bio)
2208                 scrub_submit(sctx);
2209
2210         return 0;
2211 }
2212
2213 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2214                        u64 physical, struct btrfs_device *dev, u64 flags,
2215                        u64 gen, int mirror_num, u8 *csum, int force,
2216                        u64 physical_for_dev_replace)
2217 {
2218         struct scrub_block *sblock;
2219         int index;
2220
2221         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2222         if (!sblock) {
2223                 spin_lock(&sctx->stat_lock);
2224                 sctx->stat.malloc_errors++;
2225                 spin_unlock(&sctx->stat_lock);
2226                 return -ENOMEM;
2227         }
2228
2229         /* one ref inside this function, plus one for each page added to
2230          * a bio later on */
2231         atomic_set(&sblock->ref_count, 1);
2232         sblock->sctx = sctx;
2233         sblock->no_io_error_seen = 1;
2234
2235         for (index = 0; len > 0; index++) {
2236                 struct scrub_page *spage;
2237                 u64 l = min_t(u64, len, PAGE_SIZE);
2238
2239                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2240                 if (!spage) {
2241 leave_nomem:
2242                         spin_lock(&sctx->stat_lock);
2243                         sctx->stat.malloc_errors++;
2244                         spin_unlock(&sctx->stat_lock);
2245                         scrub_block_put(sblock);
2246                         return -ENOMEM;
2247                 }
2248                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2249                 scrub_page_get(spage);
2250                 sblock->pagev[index] = spage;
2251                 spage->sblock = sblock;
2252                 spage->dev = dev;
2253                 spage->flags = flags;
2254                 spage->generation = gen;
2255                 spage->logical = logical;
2256                 spage->physical = physical;
2257                 spage->physical_for_dev_replace = physical_for_dev_replace;
2258                 spage->mirror_num = mirror_num;
2259                 if (csum) {
2260                         spage->have_csum = 1;
2261                         memcpy(spage->csum, csum, sctx->csum_size);
2262                 } else {
2263                         spage->have_csum = 0;
2264                 }
2265                 sblock->page_count++;
2266                 spage->page = alloc_page(GFP_NOFS);
2267                 if (!spage->page)
2268                         goto leave_nomem;
2269                 len -= l;
2270                 logical += l;
2271                 physical += l;
2272                 physical_for_dev_replace += l;
2273         }
2274
2275         WARN_ON(sblock->page_count == 0);
2276         for (index = 0; index < sblock->page_count; index++) {
2277                 struct scrub_page *spage = sblock->pagev[index];
2278                 int ret;
2279
2280                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2281                 if (ret) {
2282                         scrub_block_put(sblock);
2283                         return ret;
2284                 }
2285         }
2286
2287         if (force)
2288                 scrub_submit(sctx);
2289
2290         /* last one frees, either here or in bio completion for last page */
2291         scrub_block_put(sblock);
2292         return 0;
2293 }
2294
2295 static void scrub_bio_end_io(struct bio *bio, int err)
2296 {
2297         struct scrub_bio *sbio = bio->bi_private;
2298         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2299
2300         sbio->err = err;
2301         sbio->bio = bio;
2302
2303         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2304 }
2305
2306 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2307 {
2308         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2309         struct scrub_ctx *sctx = sbio->sctx;
2310         int i;
2311
2312         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2313         if (sbio->err) {
2314                 for (i = 0; i < sbio->page_count; i++) {
2315                         struct scrub_page *spage = sbio->pagev[i];
2316
2317                         spage->io_error = 1;
2318                         spage->sblock->no_io_error_seen = 0;
2319                 }
2320         }
2321
2322         /* now complete the scrub_block items that have all pages completed */
2323         for (i = 0; i < sbio->page_count; i++) {
2324                 struct scrub_page *spage = sbio->pagev[i];
2325                 struct scrub_block *sblock = spage->sblock;
2326
2327                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2328                         scrub_block_complete(sblock);
2329                 scrub_block_put(sblock);
2330         }
2331
2332         bio_put(sbio->bio);
2333         sbio->bio = NULL;
2334         spin_lock(&sctx->list_lock);
2335         sbio->next_free = sctx->first_free;
2336         sctx->first_free = sbio->index;
2337         spin_unlock(&sctx->list_lock);
2338
2339         if (sctx->is_dev_replace &&
2340             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2341                 mutex_lock(&sctx->wr_ctx.wr_lock);
2342                 scrub_wr_submit(sctx);
2343                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2344         }
2345
2346         scrub_pending_bio_dec(sctx);
2347 }
2348
2349 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2350                                        unsigned long *bitmap,
2351                                        u64 start, u64 len)
2352 {
2353         int offset;
2354         int nsectors;
2355         int sectorsize = sparity->sctx->dev_root->sectorsize;
2356
2357         if (len >= sparity->stripe_len) {
2358                 bitmap_set(bitmap, 0, sparity->nsectors);
2359                 return;
2360         }
2361
2362         start -= sparity->logic_start;
2363         offset = (int)do_div(start, sparity->stripe_len);
2364         offset /= sectorsize;
2365         nsectors = (int)len / sectorsize;
2366
2367         if (offset + nsectors <= sparity->nsectors) {
2368                 bitmap_set(bitmap, offset, nsectors);
2369                 return;
2370         }
2371
2372         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2373         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2374 }
2375
2376 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2377                                                    u64 start, u64 len)
2378 {
2379         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2380 }
2381
2382 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2383                                                   u64 start, u64 len)
2384 {
2385         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2386 }
2387
2388 static void scrub_block_complete(struct scrub_block *sblock)
2389 {
2390         int corrupted = 0;
2391
2392         if (!sblock->no_io_error_seen) {
2393                 corrupted = 1;
2394                 scrub_handle_errored_block(sblock);
2395         } else {
2396                 /*
2397                  * if has checksum error, write via repair mechanism in
2398                  * dev replace case, otherwise write here in dev replace
2399                  * case.
2400                  */
2401                 corrupted = scrub_checksum(sblock);
2402                 if (!corrupted && sblock->sctx->is_dev_replace)
2403                         scrub_write_block_to_dev_replace(sblock);
2404         }
2405
2406         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2407                 u64 start = sblock->pagev[0]->logical;
2408                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2409                           PAGE_SIZE;
2410
2411                 scrub_parity_mark_sectors_error(sblock->sparity,
2412                                                 start, end - start);
2413         }
2414 }
2415
2416 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2417                            u8 *csum)
2418 {
2419         struct btrfs_ordered_sum *sum = NULL;
2420         unsigned long index;
2421         unsigned long num_sectors;
2422
2423         while (!list_empty(&sctx->csum_list)) {
2424                 sum = list_first_entry(&sctx->csum_list,
2425                                        struct btrfs_ordered_sum, list);
2426                 if (sum->bytenr > logical)
2427                         return 0;
2428                 if (sum->bytenr + sum->len > logical)
2429                         break;
2430
2431                 ++sctx->stat.csum_discards;
2432                 list_del(&sum->list);
2433                 kfree(sum);
2434                 sum = NULL;
2435         }
2436         if (!sum)
2437                 return 0;
2438
2439         index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2440         num_sectors = sum->len / sctx->sectorsize;
2441         memcpy(csum, sum->sums + index, sctx->csum_size);
2442         if (index == num_sectors - 1) {
2443                 list_del(&sum->list);
2444                 kfree(sum);
2445         }
2446         return 1;
2447 }
2448
2449 /* scrub extent tries to collect up to 64 kB for each bio */
2450 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2451                         u64 physical, struct btrfs_device *dev, u64 flags,
2452                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2453 {
2454         int ret;
2455         u8 csum[BTRFS_CSUM_SIZE];
2456         u32 blocksize;
2457
2458         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2459                 blocksize = sctx->sectorsize;
2460                 spin_lock(&sctx->stat_lock);
2461                 sctx->stat.data_extents_scrubbed++;
2462                 sctx->stat.data_bytes_scrubbed += len;
2463                 spin_unlock(&sctx->stat_lock);
2464         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2465                 blocksize = sctx->nodesize;
2466                 spin_lock(&sctx->stat_lock);
2467                 sctx->stat.tree_extents_scrubbed++;
2468                 sctx->stat.tree_bytes_scrubbed += len;
2469                 spin_unlock(&sctx->stat_lock);
2470         } else {
2471                 blocksize = sctx->sectorsize;
2472                 WARN_ON(1);
2473         }
2474
2475         while (len) {
2476                 u64 l = min_t(u64, len, blocksize);
2477                 int have_csum = 0;
2478
2479                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2480                         /* push csums to sbio */
2481                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2482                         if (have_csum == 0)
2483                                 ++sctx->stat.no_csum;
2484                         if (sctx->is_dev_replace && !have_csum) {
2485                                 ret = copy_nocow_pages(sctx, logical, l,
2486                                                        mirror_num,
2487                                                       physical_for_dev_replace);
2488                                 goto behind_scrub_pages;
2489                         }
2490                 }
2491                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2492                                   mirror_num, have_csum ? csum : NULL, 0,
2493                                   physical_for_dev_replace);
2494 behind_scrub_pages:
2495                 if (ret)
2496                         return ret;
2497                 len -= l;
2498                 logical += l;
2499                 physical += l;
2500                 physical_for_dev_replace += l;
2501         }
2502         return 0;
2503 }
2504
2505 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2506                                   u64 logical, u64 len,
2507                                   u64 physical, struct btrfs_device *dev,
2508                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2509 {
2510         struct scrub_ctx *sctx = sparity->sctx;
2511         struct scrub_block *sblock;
2512         int index;
2513
2514         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2515         if (!sblock) {
2516                 spin_lock(&sctx->stat_lock);
2517                 sctx->stat.malloc_errors++;
2518                 spin_unlock(&sctx->stat_lock);
2519                 return -ENOMEM;
2520         }
2521
2522         /* one ref inside this function, plus one for each page added to
2523          * a bio later on */
2524         atomic_set(&sblock->ref_count, 1);
2525         sblock->sctx = sctx;
2526         sblock->no_io_error_seen = 1;
2527         sblock->sparity = sparity;
2528         scrub_parity_get(sparity);
2529
2530         for (index = 0; len > 0; index++) {
2531                 struct scrub_page *spage;
2532                 u64 l = min_t(u64, len, PAGE_SIZE);
2533
2534                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2535                 if (!spage) {
2536 leave_nomem:
2537                         spin_lock(&sctx->stat_lock);
2538                         sctx->stat.malloc_errors++;
2539                         spin_unlock(&sctx->stat_lock);
2540                         scrub_block_put(sblock);
2541                         return -ENOMEM;
2542                 }
2543                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2544                 /* For scrub block */
2545                 scrub_page_get(spage);
2546                 sblock->pagev[index] = spage;
2547                 /* For scrub parity */
2548                 scrub_page_get(spage);
2549                 list_add_tail(&spage->list, &sparity->spages);
2550                 spage->sblock = sblock;
2551                 spage->dev = dev;
2552                 spage->flags = flags;
2553                 spage->generation = gen;
2554                 spage->logical = logical;
2555                 spage->physical = physical;
2556                 spage->mirror_num = mirror_num;
2557                 if (csum) {
2558                         spage->have_csum = 1;
2559                         memcpy(spage->csum, csum, sctx->csum_size);
2560                 } else {
2561                         spage->have_csum = 0;
2562                 }
2563                 sblock->page_count++;
2564                 spage->page = alloc_page(GFP_NOFS);
2565                 if (!spage->page)
2566                         goto leave_nomem;
2567                 len -= l;
2568                 logical += l;
2569                 physical += l;
2570         }
2571
2572         WARN_ON(sblock->page_count == 0);
2573         for (index = 0; index < sblock->page_count; index++) {
2574                 struct scrub_page *spage = sblock->pagev[index];
2575                 int ret;
2576
2577                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2578                 if (ret) {
2579                         scrub_block_put(sblock);
2580                         return ret;
2581                 }
2582         }
2583
2584         /* last one frees, either here or in bio completion for last page */
2585         scrub_block_put(sblock);
2586         return 0;
2587 }
2588
2589 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2590                                    u64 logical, u64 len,
2591                                    u64 physical, struct btrfs_device *dev,
2592                                    u64 flags, u64 gen, int mirror_num)
2593 {
2594         struct scrub_ctx *sctx = sparity->sctx;
2595         int ret;
2596         u8 csum[BTRFS_CSUM_SIZE];
2597         u32 blocksize;
2598
2599         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2600                 blocksize = sctx->sectorsize;
2601         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2602                 blocksize = sctx->nodesize;
2603         } else {
2604                 blocksize = sctx->sectorsize;
2605                 WARN_ON(1);
2606         }
2607
2608         while (len) {
2609                 u64 l = min_t(u64, len, blocksize);
2610                 int have_csum = 0;
2611
2612                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2613                         /* push csums to sbio */
2614                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2615                         if (have_csum == 0)
2616                                 goto skip;
2617                 }
2618                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2619                                              flags, gen, mirror_num,
2620                                              have_csum ? csum : NULL);
2621                 if (ret)
2622                         return ret;
2623 skip:
2624                 len -= l;
2625                 logical += l;
2626                 physical += l;
2627         }
2628         return 0;
2629 }
2630
2631 /*
2632  * Given a physical address, this will calculate it's
2633  * logical offset. if this is a parity stripe, it will return
2634  * the most left data stripe's logical offset.
2635  *
2636  * return 0 if it is a data stripe, 1 means parity stripe.
2637  */
2638 static int get_raid56_logic_offset(u64 physical, int num,
2639                                    struct map_lookup *map, u64 *offset,
2640                                    u64 *stripe_start)
2641 {
2642         int i;
2643         int j = 0;
2644         u64 stripe_nr;
2645         u64 last_offset;
2646         int stripe_index;
2647         int rot;
2648
2649         last_offset = (physical - map->stripes[num].physical) *
2650                       nr_data_stripes(map);
2651         if (stripe_start)
2652                 *stripe_start = last_offset;
2653
2654         *offset = last_offset;
2655         for (i = 0; i < nr_data_stripes(map); i++) {
2656                 *offset = last_offset + i * map->stripe_len;
2657
2658                 stripe_nr = *offset;
2659                 do_div(stripe_nr, map->stripe_len);
2660                 do_div(stripe_nr, nr_data_stripes(map));
2661
2662                 /* Work out the disk rotation on this stripe-set */
2663                 rot = do_div(stripe_nr, map->num_stripes);
2664                 /* calculate which stripe this data locates */
2665                 rot += i;
2666                 stripe_index = rot % map->num_stripes;
2667                 if (stripe_index == num)
2668                         return 0;
2669                 if (stripe_index < num)
2670                         j++;
2671         }
2672         *offset = last_offset + j * map->stripe_len;
2673         return 1;
2674 }
2675
2676 static void scrub_free_parity(struct scrub_parity *sparity)
2677 {
2678         struct scrub_ctx *sctx = sparity->sctx;
2679         struct scrub_page *curr, *next;
2680         int nbits;
2681
2682         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2683         if (nbits) {
2684                 spin_lock(&sctx->stat_lock);
2685                 sctx->stat.read_errors += nbits;
2686                 sctx->stat.uncorrectable_errors += nbits;
2687                 spin_unlock(&sctx->stat_lock);
2688         }
2689
2690         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2691                 list_del_init(&curr->list);
2692                 scrub_page_put(curr);
2693         }
2694
2695         kfree(sparity);
2696 }
2697
2698 static void scrub_parity_bio_endio(struct bio *bio, int error)
2699 {
2700         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2701         struct scrub_ctx *sctx = sparity->sctx;
2702
2703         if (error)
2704                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2705                           sparity->nsectors);
2706
2707         scrub_free_parity(sparity);
2708         scrub_pending_bio_dec(sctx);
2709         bio_put(bio);
2710 }
2711
2712 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2713 {
2714         struct scrub_ctx *sctx = sparity->sctx;
2715         struct bio *bio;
2716         struct btrfs_raid_bio *rbio;
2717         struct scrub_page *spage;
2718         struct btrfs_bio *bbio = NULL;
2719         u64 *raid_map = NULL;
2720         u64 length;
2721         int ret;
2722
2723         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2724                            sparity->nsectors))
2725                 goto out;
2726
2727         length = sparity->logic_end - sparity->logic_start + 1;
2728         ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2729                                sparity->logic_start,
2730                                &length, &bbio, 0, &raid_map);
2731         if (ret || !bbio || !raid_map)
2732                 goto bbio_out;
2733
2734         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2735         if (!bio)
2736                 goto bbio_out;
2737
2738         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2739         bio->bi_private = sparity;
2740         bio->bi_end_io = scrub_parity_bio_endio;
2741
2742         rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2743                                               raid_map, length,
2744                                               sparity->scrub_dev,
2745                                               sparity->dbitmap,
2746                                               sparity->nsectors);
2747         if (!rbio)
2748                 goto rbio_out;
2749
2750         list_for_each_entry(spage, &sparity->spages, list)
2751                 raid56_parity_add_scrub_pages(rbio, spage->page,
2752                                               spage->logical);
2753
2754         scrub_pending_bio_inc(sctx);
2755         raid56_parity_submit_scrub_rbio(rbio);
2756         return;
2757
2758 rbio_out:
2759         bio_put(bio);
2760 bbio_out:
2761         kfree(bbio);
2762         kfree(raid_map);
2763         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2764                   sparity->nsectors);
2765         spin_lock(&sctx->stat_lock);
2766         sctx->stat.malloc_errors++;
2767         spin_unlock(&sctx->stat_lock);
2768 out:
2769         scrub_free_parity(sparity);
2770 }
2771
2772 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2773 {
2774         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2775 }
2776
2777 static void scrub_parity_get(struct scrub_parity *sparity)
2778 {
2779         atomic_inc(&sparity->ref_count);
2780 }
2781
2782 static void scrub_parity_put(struct scrub_parity *sparity)
2783 {
2784         if (!atomic_dec_and_test(&sparity->ref_count))
2785                 return;
2786
2787         scrub_parity_check_and_repair(sparity);
2788 }
2789
2790 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2791                                                   struct map_lookup *map,
2792                                                   struct btrfs_device *sdev,
2793                                                   struct btrfs_path *path,
2794                                                   u64 logic_start,
2795                                                   u64 logic_end)
2796 {
2797         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2798         struct btrfs_root *root = fs_info->extent_root;
2799         struct btrfs_root *csum_root = fs_info->csum_root;
2800         struct btrfs_extent_item *extent;
2801         u64 flags;
2802         int ret;
2803         int slot;
2804         struct extent_buffer *l;
2805         struct btrfs_key key;
2806         u64 generation;
2807         u64 extent_logical;
2808         u64 extent_physical;
2809         u64 extent_len;
2810         struct btrfs_device *extent_dev;
2811         struct scrub_parity *sparity;
2812         int nsectors;
2813         int bitmap_len;
2814         int extent_mirror_num;
2815         int stop_loop = 0;
2816
2817         nsectors = map->stripe_len / root->sectorsize;
2818         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2819         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2820                           GFP_NOFS);
2821         if (!sparity) {
2822                 spin_lock(&sctx->stat_lock);
2823                 sctx->stat.malloc_errors++;
2824                 spin_unlock(&sctx->stat_lock);
2825                 return -ENOMEM;
2826         }
2827
2828         sparity->stripe_len = map->stripe_len;
2829         sparity->nsectors = nsectors;
2830         sparity->sctx = sctx;
2831         sparity->scrub_dev = sdev;
2832         sparity->logic_start = logic_start;
2833         sparity->logic_end = logic_end;
2834         atomic_set(&sparity->ref_count, 1);
2835         INIT_LIST_HEAD(&sparity->spages);
2836         sparity->dbitmap = sparity->bitmap;
2837         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2838
2839         ret = 0;
2840         while (logic_start < logic_end) {
2841                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2842                         key.type = BTRFS_METADATA_ITEM_KEY;
2843                 else
2844                         key.type = BTRFS_EXTENT_ITEM_KEY;
2845                 key.objectid = logic_start;
2846                 key.offset = (u64)-1;
2847
2848                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2849                 if (ret < 0)
2850                         goto out;
2851
2852                 if (ret > 0) {
2853                         ret = btrfs_previous_extent_item(root, path, 0);
2854                         if (ret < 0)
2855                                 goto out;
2856                         if (ret > 0) {
2857                                 btrfs_release_path(path);
2858                                 ret = btrfs_search_slot(NULL, root, &key,
2859                                                         path, 0, 0);
2860                                 if (ret < 0)
2861                                         goto out;
2862                         }
2863                 }
2864
2865                 stop_loop = 0;
2866                 while (1) {
2867                         u64 bytes;
2868
2869                         l = path->nodes[0];
2870                         slot = path->slots[0];
2871                         if (slot >= btrfs_header_nritems(l)) {
2872                                 ret = btrfs_next_leaf(root, path);
2873                                 if (ret == 0)
2874                                         continue;
2875                                 if (ret < 0)
2876                                         goto out;
2877
2878                                 stop_loop = 1;
2879                                 break;
2880                         }
2881                         btrfs_item_key_to_cpu(l, &key, slot);
2882
2883                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2884                                 bytes = root->nodesize;
2885                         else
2886                                 bytes = key.offset;
2887
2888                         if (key.objectid + bytes <= logic_start)
2889                                 goto next;
2890
2891                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2892                             key.type != BTRFS_METADATA_ITEM_KEY)
2893                                 goto next;
2894
2895                         if (key.objectid > logic_end) {
2896                                 stop_loop = 1;
2897                                 break;
2898                         }
2899
2900                         while (key.objectid >= logic_start + map->stripe_len)
2901                                 logic_start += map->stripe_len;
2902
2903                         extent = btrfs_item_ptr(l, slot,
2904                                                 struct btrfs_extent_item);
2905                         flags = btrfs_extent_flags(l, extent);
2906                         generation = btrfs_extent_generation(l, extent);
2907
2908                         if (key.objectid < logic_start &&
2909                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2910                                 btrfs_err(fs_info,
2911                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2912                                            key.objectid, logic_start);
2913                                 goto next;
2914                         }
2915 again:
2916                         extent_logical = key.objectid;
2917                         extent_len = bytes;
2918
2919                         if (extent_logical < logic_start) {
2920                                 extent_len -= logic_start - extent_logical;
2921                                 extent_logical = logic_start;
2922                         }
2923
2924                         if (extent_logical + extent_len >
2925                             logic_start + map->stripe_len)
2926                                 extent_len = logic_start + map->stripe_len -
2927                                              extent_logical;
2928
2929                         scrub_parity_mark_sectors_data(sparity, extent_logical,
2930                                                        extent_len);
2931
2932                         scrub_remap_extent(fs_info, extent_logical,
2933                                            extent_len, &extent_physical,
2934                                            &extent_dev,
2935                                            &extent_mirror_num);
2936
2937                         ret = btrfs_lookup_csums_range(csum_root,
2938                                                 extent_logical,
2939                                                 extent_logical + extent_len - 1,
2940                                                 &sctx->csum_list, 1);
2941                         if (ret)
2942                                 goto out;
2943
2944                         ret = scrub_extent_for_parity(sparity, extent_logical,
2945                                                       extent_len,
2946                                                       extent_physical,
2947                                                       extent_dev, flags,
2948                                                       generation,
2949                                                       extent_mirror_num);
2950                         if (ret)
2951                                 goto out;
2952
2953                         scrub_free_csums(sctx);
2954                         if (extent_logical + extent_len <
2955                             key.objectid + bytes) {
2956                                 logic_start += map->stripe_len;
2957
2958                                 if (logic_start >= logic_end) {
2959                                         stop_loop = 1;
2960                                         break;
2961                                 }
2962
2963                                 if (logic_start < key.objectid + bytes) {
2964                                         cond_resched();
2965                                         goto again;
2966                                 }
2967                         }
2968 next:
2969                         path->slots[0]++;
2970                 }
2971
2972                 btrfs_release_path(path);
2973
2974                 if (stop_loop)
2975                         break;
2976
2977                 logic_start += map->stripe_len;
2978         }
2979 out:
2980         if (ret < 0)
2981                 scrub_parity_mark_sectors_error(sparity, logic_start,
2982                                                 logic_end - logic_start + 1);
2983         scrub_parity_put(sparity);
2984         scrub_submit(sctx);
2985         mutex_lock(&sctx->wr_ctx.wr_lock);
2986         scrub_wr_submit(sctx);
2987         mutex_unlock(&sctx->wr_ctx.wr_lock);
2988
2989         btrfs_release_path(path);
2990         return ret < 0 ? ret : 0;
2991 }
2992
2993 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2994                                            struct map_lookup *map,
2995                                            struct btrfs_device *scrub_dev,
2996                                            int num, u64 base, u64 length,
2997                                            int is_dev_replace)
2998 {
2999         struct btrfs_path *path, *ppath;
3000         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3001         struct btrfs_root *root = fs_info->extent_root;
3002         struct btrfs_root *csum_root = fs_info->csum_root;
3003         struct btrfs_extent_item *extent;
3004         struct blk_plug plug;
3005         u64 flags;
3006         int ret;
3007         int slot;
3008         u64 nstripes;
3009         struct extent_buffer *l;
3010         struct btrfs_key key;
3011         u64 physical;
3012         u64 logical;
3013         u64 logic_end;
3014         u64 physical_end;
3015         u64 generation;
3016         int mirror_num;
3017         struct reada_control *reada1;
3018         struct reada_control *reada2;
3019         struct btrfs_key key_start;
3020         struct btrfs_key key_end;
3021         u64 increment = map->stripe_len;
3022         u64 offset;
3023         u64 extent_logical;
3024         u64 extent_physical;
3025         u64 extent_len;
3026         u64 stripe_logical;
3027         u64 stripe_end;
3028         struct btrfs_device *extent_dev;
3029         int extent_mirror_num;
3030         int stop_loop = 0;
3031
3032         nstripes = length;
3033         physical = map->stripes[num].physical;
3034         offset = 0;
3035         do_div(nstripes, map->stripe_len);
3036         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3037                 offset = map->stripe_len * num;
3038                 increment = map->stripe_len * map->num_stripes;
3039                 mirror_num = 1;
3040         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3041                 int factor = map->num_stripes / map->sub_stripes;
3042                 offset = map->stripe_len * (num / map->sub_stripes);
3043                 increment = map->stripe_len * factor;
3044                 mirror_num = num % map->sub_stripes + 1;
3045         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3046                 increment = map->stripe_len;
3047                 mirror_num = num % map->num_stripes + 1;
3048         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3049                 increment = map->stripe_len;
3050                 mirror_num = num % map->num_stripes + 1;
3051         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3052                                 BTRFS_BLOCK_GROUP_RAID6)) {
3053                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3054                 increment = map->stripe_len * nr_data_stripes(map);
3055                 mirror_num = 1;
3056         } else {
3057                 increment = map->stripe_len;
3058                 mirror_num = 1;
3059         }
3060
3061         path = btrfs_alloc_path();
3062         if (!path)
3063                 return -ENOMEM;
3064
3065         ppath = btrfs_alloc_path();
3066         if (!ppath) {
3067                 btrfs_free_path(ppath);
3068                 return -ENOMEM;
3069         }
3070
3071         /*
3072          * work on commit root. The related disk blocks are static as
3073          * long as COW is applied. This means, it is save to rewrite
3074          * them to repair disk errors without any race conditions
3075          */
3076         path->search_commit_root = 1;
3077         path->skip_locking = 1;
3078
3079         /*
3080          * trigger the readahead for extent tree csum tree and wait for
3081          * completion. During readahead, the scrub is officially paused
3082          * to not hold off transaction commits
3083          */
3084         logical = base + offset;
3085         physical_end = physical + nstripes * map->stripe_len;
3086         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3087                          BTRFS_BLOCK_GROUP_RAID6)) {
3088                 get_raid56_logic_offset(physical_end, num,
3089                                         map, &logic_end, NULL);
3090                 logic_end += base;
3091         } else {
3092                 logic_end = logical + increment * nstripes;
3093         }
3094         wait_event(sctx->list_wait,
3095                    atomic_read(&sctx->bios_in_flight) == 0);
3096         scrub_blocked_if_needed(fs_info);
3097
3098         /* FIXME it might be better to start readahead at commit root */
3099         key_start.objectid = logical;
3100         key_start.type = BTRFS_EXTENT_ITEM_KEY;
3101         key_start.offset = (u64)0;
3102         key_end.objectid = logic_end;
3103         key_end.type = BTRFS_METADATA_ITEM_KEY;
3104         key_end.offset = (u64)-1;
3105         reada1 = btrfs_reada_add(root, &key_start, &key_end);
3106
3107         key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3108         key_start.type = BTRFS_EXTENT_CSUM_KEY;
3109         key_start.offset = logical;
3110         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3111         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3112         key_end.offset = logic_end;
3113         reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
3114
3115         if (!IS_ERR(reada1))
3116                 btrfs_reada_wait(reada1);
3117         if (!IS_ERR(reada2))
3118                 btrfs_reada_wait(reada2);
3119
3120
3121         /*
3122          * collect all data csums for the stripe to avoid seeking during
3123          * the scrub. This might currently (crc32) end up to be about 1MB
3124          */
3125         blk_start_plug(&plug);
3126
3127         /*
3128          * now find all extents for each stripe and scrub them
3129          */
3130         ret = 0;
3131         while (physical < physical_end) {
3132                 /* for raid56, we skip parity stripe */
3133                 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3134                                 BTRFS_BLOCK_GROUP_RAID6)) {
3135                         ret = get_raid56_logic_offset(physical, num,
3136                                         map, &logical, &stripe_logical);
3137                         logical += base;
3138                         if (ret) {
3139                                 stripe_logical += base;
3140                                 stripe_end = stripe_logical + increment - 1;
3141                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3142                                                 ppath, stripe_logical,
3143                                                 stripe_end);
3144                                 if (ret)
3145                                         goto out;
3146                                 goto skip;
3147                         }
3148                 }
3149                 /*
3150                  * canceled?
3151                  */
3152                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3153                     atomic_read(&sctx->cancel_req)) {
3154                         ret = -ECANCELED;
3155                         goto out;
3156                 }
3157                 /*
3158                  * check to see if we have to pause
3159                  */
3160                 if (atomic_read(&fs_info->scrub_pause_req)) {
3161                         /* push queued extents */
3162                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3163                         scrub_submit(sctx);
3164                         mutex_lock(&sctx->wr_ctx.wr_lock);
3165                         scrub_wr_submit(sctx);
3166                         mutex_unlock(&sctx->wr_ctx.wr_lock);
3167                         wait_event(sctx->list_wait,
3168                                    atomic_read(&sctx->bios_in_flight) == 0);
3169                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3170                         scrub_blocked_if_needed(fs_info);
3171                 }
3172
3173                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3174                         key.type = BTRFS_METADATA_ITEM_KEY;
3175                 else
3176                         key.type = BTRFS_EXTENT_ITEM_KEY;
3177                 key.objectid = logical;
3178                 key.offset = (u64)-1;
3179
3180                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3181                 if (ret < 0)
3182                         goto out;
3183
3184                 if (ret > 0) {
3185                         ret = btrfs_previous_extent_item(root, path, 0);
3186                         if (ret < 0)
3187                                 goto out;
3188                         if (ret > 0) {
3189                                 /* there's no smaller item, so stick with the
3190                                  * larger one */
3191                                 btrfs_release_path(path);
3192                                 ret = btrfs_search_slot(NULL, root, &key,
3193                                                         path, 0, 0);
3194                                 if (ret < 0)
3195                                         goto out;
3196                         }
3197                 }
3198
3199                 stop_loop = 0;
3200                 while (1) {
3201                         u64 bytes;
3202
3203                         l = path->nodes[0];
3204                         slot = path->slots[0];
3205                         if (slot >= btrfs_header_nritems(l)) {
3206                                 ret = btrfs_next_leaf(root, path);
3207                                 if (ret == 0)
3208                                         continue;
3209                                 if (ret < 0)
3210                                         goto out;
3211
3212                                 stop_loop = 1;
3213                                 break;
3214                         }
3215                         btrfs_item_key_to_cpu(l, &key, slot);
3216
3217                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3218                                 bytes = root->nodesize;
3219                         else
3220                                 bytes = key.offset;
3221
3222                         if (key.objectid + bytes <= logical)
3223                                 goto next;
3224
3225                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3226                             key.type != BTRFS_METADATA_ITEM_KEY)
3227                                 goto next;
3228
3229                         if (key.objectid >= logical + map->stripe_len) {
3230                                 /* out of this device extent */
3231                                 if (key.objectid >= logic_end)
3232                                         stop_loop = 1;
3233                                 break;
3234                         }
3235
3236                         extent = btrfs_item_ptr(l, slot,
3237                                                 struct btrfs_extent_item);
3238                         flags = btrfs_extent_flags(l, extent);
3239                         generation = btrfs_extent_generation(l, extent);
3240
3241                         if (key.objectid < logical &&
3242                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
3243                                 btrfs_err(fs_info,
3244                                            "scrub: tree block %llu spanning "
3245                                            "stripes, ignored. logical=%llu",
3246                                        key.objectid, logical);
3247                                 goto next;
3248                         }
3249
3250 again:
3251                         extent_logical = key.objectid;
3252                         extent_len = bytes;
3253
3254                         /*
3255                          * trim extent to this stripe
3256                          */
3257                         if (extent_logical < logical) {
3258                                 extent_len -= logical - extent_logical;
3259                                 extent_logical = logical;
3260                         }
3261                         if (extent_logical + extent_len >
3262                             logical + map->stripe_len) {
3263                                 extent_len = logical + map->stripe_len -
3264                                              extent_logical;
3265                         }
3266
3267                         extent_physical = extent_logical - logical + physical;
3268                         extent_dev = scrub_dev;
3269                         extent_mirror_num = mirror_num;
3270                         if (is_dev_replace)
3271                                 scrub_remap_extent(fs_info, extent_logical,
3272                                                    extent_len, &extent_physical,
3273                                                    &extent_dev,
3274                                                    &extent_mirror_num);
3275
3276                         ret = btrfs_lookup_csums_range(csum_root, logical,
3277                                                 logical + map->stripe_len - 1,
3278                                                 &sctx->csum_list, 1);
3279                         if (ret)
3280                                 goto out;
3281
3282                         ret = scrub_extent(sctx, extent_logical, extent_len,
3283                                            extent_physical, extent_dev, flags,
3284                                            generation, extent_mirror_num,
3285                                            extent_logical - logical + physical);
3286                         if (ret)
3287                                 goto out;
3288
3289                         scrub_free_csums(sctx);
3290                         if (extent_logical + extent_len <
3291                             key.objectid + bytes) {
3292                                 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3293                                         BTRFS_BLOCK_GROUP_RAID6)) {
3294                                         /*
3295                                          * loop until we find next data stripe
3296                                          * or we have finished all stripes.
3297                                          */
3298 loop:
3299                                         physical += map->stripe_len;
3300                                         ret = get_raid56_logic_offset(physical,
3301                                                         num, map, &logical,
3302                                                         &stripe_logical);
3303                                         logical += base;
3304
3305                                         if (ret && physical < physical_end) {
3306                                                 stripe_logical += base;
3307                                                 stripe_end = stripe_logical +
3308                                                                 increment - 1;
3309                                                 ret = scrub_raid56_parity(sctx,
3310                                                         map, scrub_dev, ppath,
3311                                                         stripe_logical,
3312                                                         stripe_end);
3313                                                 if (ret)
3314                                                         goto out;
3315                                                 goto loop;
3316                                         }
3317                                 } else {
3318                                         physical += map->stripe_len;
3319                                         logical += increment;
3320                                 }
3321                                 if (logical < key.objectid + bytes) {
3322                                         cond_resched();
3323                                         goto again;
3324                                 }
3325
3326                                 if (physical >= physical_end) {
3327                                         stop_loop = 1;
3328                                         break;
3329                                 }
3330                         }
3331 next:
3332                         path->slots[0]++;
3333                 }
3334                 btrfs_release_path(path);
3335 skip:
3336                 logical += increment;
3337                 physical += map->stripe_len;
3338                 spin_lock(&sctx->stat_lock);
3339                 if (stop_loop)
3340                         sctx->stat.last_physical = map->stripes[num].physical +
3341                                                    length;
3342                 else
3343                         sctx->stat.last_physical = physical;
3344                 spin_unlock(&sctx->stat_lock);
3345                 if (stop_loop)
3346                         break;
3347         }
3348 out:
3349         /* push queued extents */
3350         scrub_submit(sctx);
3351         mutex_lock(&sctx->wr_ctx.wr_lock);
3352         scrub_wr_submit(sctx);
3353         mutex_unlock(&sctx->wr_ctx.wr_lock);
3354
3355         blk_finish_plug(&plug);
3356         btrfs_free_path(path);
3357         btrfs_free_path(ppath);
3358         return ret < 0 ? ret : 0;
3359 }
3360
3361 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3362                                           struct btrfs_device *scrub_dev,
3363                                           u64 chunk_tree, u64 chunk_objectid,
3364                                           u64 chunk_offset, u64 length,
3365                                           u64 dev_offset, int is_dev_replace)
3366 {
3367         struct btrfs_mapping_tree *map_tree =
3368                 &sctx->dev_root->fs_info->mapping_tree;
3369         struct map_lookup *map;
3370         struct extent_map *em;
3371         int i;
3372         int ret = 0;
3373
3374         read_lock(&map_tree->map_tree.lock);
3375         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3376         read_unlock(&map_tree->map_tree.lock);
3377
3378         if (!em)
3379                 return -EINVAL;
3380
3381         map = (struct map_lookup *)em->bdev;
3382         if (em->start != chunk_offset)
3383                 goto out;
3384
3385         if (em->len < length)
3386                 goto out;
3387
3388         for (i = 0; i < map->num_stripes; ++i) {
3389                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3390                     map->stripes[i].physical == dev_offset) {
3391                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3392                                            chunk_offset, length,
3393                                            is_dev_replace);
3394                         if (ret)
3395                                 goto out;
3396                 }
3397         }
3398 out:
3399         free_extent_map(em);
3400
3401         return ret;
3402 }
3403
3404 static noinline_for_stack
3405 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3406                            struct btrfs_device *scrub_dev, u64 start, u64 end,
3407                            int is_dev_replace)
3408 {
3409         struct btrfs_dev_extent *dev_extent = NULL;
3410         struct btrfs_path *path;
3411         struct btrfs_root *root = sctx->dev_root;
3412         struct btrfs_fs_info *fs_info = root->fs_info;
3413         u64 length;
3414         u64 chunk_tree;
3415         u64 chunk_objectid;
3416         u64 chunk_offset;
3417         int ret;
3418         int slot;
3419         struct extent_buffer *l;
3420         struct btrfs_key key;
3421         struct btrfs_key found_key;
3422         struct btrfs_block_group_cache *cache;
3423         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3424
3425         path = btrfs_alloc_path();
3426         if (!path)
3427                 return -ENOMEM;
3428
3429         path->reada = 2;
3430         path->search_commit_root = 1;
3431         path->skip_locking = 1;
3432
3433         key.objectid = scrub_dev->devid;
3434         key.offset = 0ull;
3435         key.type = BTRFS_DEV_EXTENT_KEY;
3436
3437         while (1) {
3438                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3439                 if (ret < 0)
3440                         break;
3441                 if (ret > 0) {
3442                         if (path->slots[0] >=
3443                             btrfs_header_nritems(path->nodes[0])) {
3444                                 ret = btrfs_next_leaf(root, path);
3445                                 if (ret)
3446                                         break;
3447                         }
3448                 }
3449
3450                 l = path->nodes[0];
3451                 slot = path->slots[0];
3452
3453                 btrfs_item_key_to_cpu(l, &found_key, slot);
3454
3455                 if (found_key.objectid != scrub_dev->devid)
3456                         break;
3457
3458                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3459                         break;
3460
3461                 if (found_key.offset >= end)
3462                         break;
3463
3464                 if (found_key.offset < key.offset)
3465                         break;
3466
3467                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3468                 length = btrfs_dev_extent_length(l, dev_extent);
3469
3470                 if (found_key.offset + length <= start)
3471                         goto skip;
3472
3473                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3474                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3475                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3476
3477                 /*
3478                  * get a reference on the corresponding block group to prevent
3479                  * the chunk from going away while we scrub it
3480                  */
3481                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3482
3483                 /* some chunks are removed but not committed to disk yet,
3484                  * continue scrubbing */
3485                 if (!cache)
3486                         goto skip;
3487
3488                 dev_replace->cursor_right = found_key.offset + length;
3489                 dev_replace->cursor_left = found_key.offset;
3490                 dev_replace->item_needs_writeback = 1;
3491                 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
3492                                   chunk_offset, length, found_key.offset,
3493                                   is_dev_replace);
3494
3495                 /*
3496                  * flush, submit all pending read and write bios, afterwards
3497                  * wait for them.
3498                  * Note that in the dev replace case, a read request causes
3499                  * write requests that are submitted in the read completion
3500                  * worker. Therefore in the current situation, it is required
3501                  * that all write requests are flushed, so that all read and
3502                  * write requests are really completed when bios_in_flight
3503                  * changes to 0.
3504                  */
3505                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3506                 scrub_submit(sctx);
3507                 mutex_lock(&sctx->wr_ctx.wr_lock);
3508                 scrub_wr_submit(sctx);
3509                 mutex_unlock(&sctx->wr_ctx.wr_lock);
3510
3511                 wait_event(sctx->list_wait,
3512                            atomic_read(&sctx->bios_in_flight) == 0);
3513                 atomic_inc(&fs_info->scrubs_paused);
3514                 wake_up(&fs_info->scrub_pause_wait);
3515
3516                 /*
3517                  * must be called before we decrease @scrub_paused.
3518                  * make sure we don't block transaction commit while
3519                  * we are waiting pending workers finished.
3520                  */
3521                 wait_event(sctx->list_wait,
3522                            atomic_read(&sctx->workers_pending) == 0);
3523                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3524
3525                 mutex_lock(&fs_info->scrub_lock);
3526                 __scrub_blocked_if_needed(fs_info);
3527                 atomic_dec(&fs_info->scrubs_paused);
3528                 mutex_unlock(&fs_info->scrub_lock);
3529                 wake_up(&fs_info->scrub_pause_wait);
3530
3531                 btrfs_put_block_group(cache);
3532                 if (ret)
3533                         break;
3534                 if (is_dev_replace &&
3535                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3536                         ret = -EIO;
3537                         break;
3538                 }
3539                 if (sctx->stat.malloc_errors > 0) {
3540                         ret = -ENOMEM;
3541                         break;
3542                 }
3543
3544                 dev_replace->cursor_left = dev_replace->cursor_right;
3545                 dev_replace->item_needs_writeback = 1;
3546 skip:
3547                 key.offset = found_key.offset + length;
3548                 btrfs_release_path(path);
3549         }
3550
3551         btrfs_free_path(path);
3552
3553         /*
3554          * ret can still be 1 from search_slot or next_leaf,
3555          * that's not an error
3556          */
3557         return ret < 0 ? ret : 0;
3558 }
3559
3560 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3561                                            struct btrfs_device *scrub_dev)
3562 {
3563         int     i;
3564         u64     bytenr;
3565         u64     gen;
3566         int     ret;
3567         struct btrfs_root *root = sctx->dev_root;
3568
3569         if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
3570                 return -EIO;
3571
3572         /* Seed devices of a new filesystem has their own generation. */
3573         if (scrub_dev->fs_devices != root->fs_info->fs_devices)
3574                 gen = scrub_dev->generation;
3575         else
3576                 gen = root->fs_info->last_trans_committed;
3577
3578         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3579                 bytenr = btrfs_sb_offset(i);
3580                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3581                     scrub_dev->commit_total_bytes)
3582                         break;
3583
3584                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3585                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3586                                   NULL, 1, bytenr);
3587                 if (ret)
3588                         return ret;
3589         }
3590         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3591
3592         return 0;
3593 }
3594
3595 /*
3596  * get a reference count on fs_info->scrub_workers. start worker if necessary
3597  */
3598 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3599                                                 int is_dev_replace)
3600 {
3601         int ret = 0;
3602         int flags = WQ_FREEZABLE | WQ_UNBOUND;
3603         int max_active = fs_info->thread_pool_size;
3604
3605         if (fs_info->scrub_workers_refcnt == 0) {
3606                 if (is_dev_replace)
3607                         fs_info->scrub_workers =
3608                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
3609                                                       1, 4);
3610                 else
3611                         fs_info->scrub_workers =
3612                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
3613                                                       max_active, 4);
3614                 if (!fs_info->scrub_workers) {
3615                         ret = -ENOMEM;
3616                         goto out;
3617                 }
3618                 fs_info->scrub_wr_completion_workers =
3619                         btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
3620                                               max_active, 2);
3621                 if (!fs_info->scrub_wr_completion_workers) {
3622                         ret = -ENOMEM;
3623                         goto out;
3624                 }
3625                 fs_info->scrub_nocow_workers =
3626                         btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
3627                 if (!fs_info->scrub_nocow_workers) {
3628                         ret = -ENOMEM;
3629                         goto out;
3630                 }
3631         }
3632         ++fs_info->scrub_workers_refcnt;
3633 out:
3634         return ret;
3635 }
3636
3637 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
3638 {
3639         if (--fs_info->scrub_workers_refcnt == 0) {
3640                 btrfs_destroy_workqueue(fs_info->scrub_workers);
3641                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3642                 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3643         }
3644         WARN_ON(fs_info->scrub_workers_refcnt < 0);
3645 }
3646
3647 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3648                     u64 end, struct btrfs_scrub_progress *progress,
3649                     int readonly, int is_dev_replace)
3650 {
3651         struct scrub_ctx *sctx;
3652         int ret;
3653         struct btrfs_device *dev;
3654         struct rcu_string *name;
3655
3656         if (btrfs_fs_closing(fs_info))
3657                 return -EINVAL;
3658
3659         if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
3660                 /*
3661                  * in this case scrub is unable to calculate the checksum
3662                  * the way scrub is implemented. Do not handle this
3663                  * situation at all because it won't ever happen.
3664                  */
3665                 btrfs_err(fs_info,
3666                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3667                        fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
3668                 return -EINVAL;
3669         }
3670
3671         if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
3672                 /* not supported for data w/o checksums */
3673                 btrfs_err(fs_info,
3674                            "scrub: size assumption sectorsize != PAGE_SIZE "
3675                            "(%d != %lu) fails",
3676                        fs_info->chunk_root->sectorsize, PAGE_SIZE);
3677                 return -EINVAL;
3678         }
3679
3680         if (fs_info->chunk_root->nodesize >
3681             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3682             fs_info->chunk_root->sectorsize >
3683             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3684                 /*
3685                  * would exhaust the array bounds of pagev member in
3686                  * struct scrub_block
3687                  */
3688                 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
3689                            "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3690                        fs_info->chunk_root->nodesize,
3691                        SCRUB_MAX_PAGES_PER_BLOCK,
3692                        fs_info->chunk_root->sectorsize,
3693                        SCRUB_MAX_PAGES_PER_BLOCK);
3694                 return -EINVAL;
3695         }
3696
3697
3698         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3699         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
3700         if (!dev || (dev->missing && !is_dev_replace)) {
3701                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3702                 return -ENODEV;
3703         }
3704
3705         if (!is_dev_replace && !readonly && !dev->writeable) {
3706                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3707                 rcu_read_lock();
3708                 name = rcu_dereference(dev->name);
3709                 btrfs_err(fs_info, "scrub: device %s is not writable",
3710                           name->str);
3711                 rcu_read_unlock();
3712                 return -EROFS;
3713         }
3714
3715         mutex_lock(&fs_info->scrub_lock);
3716         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
3717                 mutex_unlock(&fs_info->scrub_lock);
3718                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3719                 return -EIO;
3720         }
3721
3722         btrfs_dev_replace_lock(&fs_info->dev_replace);
3723         if (dev->scrub_device ||
3724             (!is_dev_replace &&
3725              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3726                 btrfs_dev_replace_unlock(&fs_info->dev_replace);
3727                 mutex_unlock(&fs_info->scrub_lock);
3728                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3729                 return -EINPROGRESS;
3730         }
3731         btrfs_dev_replace_unlock(&fs_info->dev_replace);
3732
3733         ret = scrub_workers_get(fs_info, is_dev_replace);
3734         if (ret) {
3735                 mutex_unlock(&fs_info->scrub_lock);
3736                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3737                 return ret;
3738         }
3739
3740         sctx = scrub_setup_ctx(dev, is_dev_replace);
3741         if (IS_ERR(sctx)) {
3742                 mutex_unlock(&fs_info->scrub_lock);
3743                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3744                 scrub_workers_put(fs_info);
3745                 return PTR_ERR(sctx);
3746         }
3747         sctx->readonly = readonly;
3748         dev->scrub_device = sctx;
3749         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3750
3751         /*
3752          * checking @scrub_pause_req here, we can avoid
3753          * race between committing transaction and scrubbing.
3754          */
3755         __scrub_blocked_if_needed(fs_info);
3756         atomic_inc(&fs_info->scrubs_running);
3757         mutex_unlock(&fs_info->scrub_lock);
3758
3759         if (!is_dev_replace) {
3760                 /*
3761                  * by holding device list mutex, we can
3762                  * kick off writing super in log tree sync.
3763                  */
3764                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3765                 ret = scrub_supers(sctx, dev);
3766                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3767         }
3768
3769         if (!ret)
3770                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
3771                                              is_dev_replace);
3772
3773         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3774         atomic_dec(&fs_info->scrubs_running);
3775         wake_up(&fs_info->scrub_pause_wait);
3776
3777         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3778
3779         if (progress)
3780                 memcpy(progress, &sctx->stat, sizeof(*progress));
3781
3782         mutex_lock(&fs_info->scrub_lock);
3783         dev->scrub_device = NULL;
3784         scrub_workers_put(fs_info);
3785         mutex_unlock(&fs_info->scrub_lock);
3786
3787         scrub_free_ctx(sctx);
3788
3789         return ret;
3790 }
3791
3792 void btrfs_scrub_pause(struct btrfs_root *root)
3793 {
3794         struct btrfs_fs_info *fs_info = root->fs_info;
3795
3796         mutex_lock(&fs_info->scrub_lock);
3797         atomic_inc(&fs_info->scrub_pause_req);
3798         while (atomic_read(&fs_info->scrubs_paused) !=
3799                atomic_read(&fs_info->scrubs_running)) {
3800                 mutex_unlock(&fs_info->scrub_lock);
3801                 wait_event(fs_info->scrub_pause_wait,
3802                            atomic_read(&fs_info->scrubs_paused) ==
3803                            atomic_read(&fs_info->scrubs_running));
3804                 mutex_lock(&fs_info->scrub_lock);
3805         }
3806         mutex_unlock(&fs_info->scrub_lock);
3807 }
3808
3809 void btrfs_scrub_continue(struct btrfs_root *root)
3810 {
3811         struct btrfs_fs_info *fs_info = root->fs_info;
3812
3813         atomic_dec(&fs_info->scrub_pause_req);
3814         wake_up(&fs_info->scrub_pause_wait);
3815 }
3816
3817 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3818 {
3819         mutex_lock(&fs_info->scrub_lock);
3820         if (!atomic_read(&fs_info->scrubs_running)) {
3821                 mutex_unlock(&fs_info->scrub_lock);
3822                 return -ENOTCONN;
3823         }
3824
3825         atomic_inc(&fs_info->scrub_cancel_req);
3826         while (atomic_read(&fs_info->scrubs_running)) {
3827                 mutex_unlock(&fs_info->scrub_lock);
3828                 wait_event(fs_info->scrub_pause_wait,
3829                            atomic_read(&fs_info->scrubs_running) == 0);
3830                 mutex_lock(&fs_info->scrub_lock);
3831         }
3832         atomic_dec(&fs_info->scrub_cancel_req);
3833         mutex_unlock(&fs_info->scrub_lock);
3834
3835         return 0;
3836 }
3837
3838 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3839                            struct btrfs_device *dev)
3840 {
3841         struct scrub_ctx *sctx;
3842
3843         mutex_lock(&fs_info->scrub_lock);
3844         sctx = dev->scrub_device;
3845         if (!sctx) {
3846                 mutex_unlock(&fs_info->scrub_lock);
3847                 return -ENOTCONN;
3848         }
3849         atomic_inc(&sctx->cancel_req);
3850         while (dev->scrub_device) {
3851                 mutex_unlock(&fs_info->scrub_lock);
3852                 wait_event(fs_info->scrub_pause_wait,
3853                            dev->scrub_device == NULL);
3854                 mutex_lock(&fs_info->scrub_lock);
3855         }
3856         mutex_unlock(&fs_info->scrub_lock);
3857
3858         return 0;
3859 }
3860
3861 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3862                          struct btrfs_scrub_progress *progress)
3863 {
3864         struct btrfs_device *dev;
3865         struct scrub_ctx *sctx = NULL;
3866
3867         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3868         dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3869         if (dev)
3870                 sctx = dev->scrub_device;
3871         if (sctx)
3872                 memcpy(progress, &sctx->stat, sizeof(*progress));
3873         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3874
3875         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3876 }
3877
3878 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3879                                u64 extent_logical, u64 extent_len,
3880                                u64 *extent_physical,
3881                                struct btrfs_device **extent_dev,
3882                                int *extent_mirror_num)
3883 {
3884         u64 mapped_length;
3885         struct btrfs_bio *bbio = NULL;
3886         int ret;
3887
3888         mapped_length = extent_len;
3889         ret = btrfs_map_block(fs_info, READ, extent_logical,
3890                               &mapped_length, &bbio, 0);
3891         if (ret || !bbio || mapped_length < extent_len ||
3892             !bbio->stripes[0].dev->bdev) {
3893                 kfree(bbio);
3894                 return;
3895         }
3896
3897         *extent_physical = bbio->stripes[0].physical;
3898         *extent_mirror_num = bbio->mirror_num;
3899         *extent_dev = bbio->stripes[0].dev;
3900         kfree(bbio);
3901 }
3902
3903 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3904                               struct scrub_wr_ctx *wr_ctx,
3905                               struct btrfs_fs_info *fs_info,
3906                               struct btrfs_device *dev,
3907                               int is_dev_replace)
3908 {
3909         WARN_ON(wr_ctx->wr_curr_bio != NULL);
3910
3911         mutex_init(&wr_ctx->wr_lock);
3912         wr_ctx->wr_curr_bio = NULL;
3913         if (!is_dev_replace)
3914                 return 0;
3915
3916         WARN_ON(!dev->bdev);
3917         wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3918                                          bio_get_nr_vecs(dev->bdev));
3919         wr_ctx->tgtdev = dev;
3920         atomic_set(&wr_ctx->flush_all_writes, 0);
3921         return 0;
3922 }
3923
/*
 * Drop whatever wr_ctx->wr_curr_bio currently points to.
 *
 * The object is released with kfree() and the pointer is reset to NULL,
 * both while holding wr_ctx->wr_lock.  Nothing else in @wr_ctx is touched.
 */
static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
{
        mutex_lock(&wr_ctx->wr_lock);
        kfree(wr_ctx->wr_curr_bio);
        wr_ctx->wr_curr_bio = NULL;
        mutex_unlock(&wr_ctx->wr_lock);
}
3931
3932 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3933                             int mirror_num, u64 physical_for_dev_replace)
3934 {
3935         struct scrub_copy_nocow_ctx *nocow_ctx;
3936         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3937
3938         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3939         if (!nocow_ctx) {
3940                 spin_lock(&sctx->stat_lock);
3941                 sctx->stat.malloc_errors++;
3942                 spin_unlock(&sctx->stat_lock);
3943                 return -ENOMEM;
3944         }
3945
3946         scrub_pending_trans_workers_inc(sctx);
3947
3948         nocow_ctx->sctx = sctx;
3949         nocow_ctx->logical = logical;
3950         nocow_ctx->len = len;
3951         nocow_ctx->mirror_num = mirror_num;
3952         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3953         btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3954                         copy_nocow_pages_worker, NULL, NULL);
3955         INIT_LIST_HEAD(&nocow_ctx->inodes);
3956         btrfs_queue_work(fs_info->scrub_nocow_workers,
3957                          &nocow_ctx->work);
3958
3959         return 0;
3960 }
3961
3962 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3963 {
3964         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3965         struct scrub_nocow_inode *nocow_inode;
3966
3967         nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3968         if (!nocow_inode)
3969                 return -ENOMEM;
3970         nocow_inode->inum = inum;
3971         nocow_inode->offset = offset;
3972         nocow_inode->root = root;
3973         list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3974         return 0;
3975 }
3976
3977 #define COPY_COMPLETE 1
3978
/*
 * Work function queued by copy_nocow_pages().
 *
 * Phase 1: inside a joined transaction, collect every inode that references
 * the logical address via iterate_inodes_from_logical(), using
 * record_inode_for_nocow() as the callback.  -ENOENT from the iteration is
 * tolerated.
 *
 * Phase 2: after ending the transaction, hand the recorded inodes one by one
 * to copy_nocow_pages_for_inode() and stop at the first one that returns
 * COPY_COMPLETE or any other non-zero value.
 *
 * If the path allocation, the transaction join or the iteration fails, the
 * dev-replace counter num_uncorrectable_read_errors is incremented.
 * NOTE(review): a failure reported by copy_nocow_pages_for_inode() does not
 * set not_written, so it is not counted there - confirm this is intended.
 *
 * In all cases the remaining list entries, the path and the nocow context
 * are freed, and scrub_pending_trans_workers_dec() is called to balance the
 * _inc() done in copy_nocow_pages().
 */
static void copy_nocow_pages_worker(struct btrfs_work *work)
{
        struct scrub_copy_nocow_ctx *nocow_ctx =
                container_of(work, struct scrub_copy_nocow_ctx, work);
        struct scrub_ctx *sctx = nocow_ctx->sctx;
        u64 logical = nocow_ctx->logical;
        u64 len = nocow_ctx->len;
        int mirror_num = nocow_ctx->mirror_num;
        u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        int ret;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_fs_info *fs_info;
        struct btrfs_path *path;
        struct btrfs_root *root;
        int not_written = 0;

        fs_info = sctx->dev_root->fs_info;
        root = fs_info->extent_root;

        path = btrfs_alloc_path();
        if (!path) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
                spin_unlock(&sctx->stat_lock);
                not_written = 1;
                goto out;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                /* trans stays an ERR_PTR; the cleanup below checks for it */
                not_written = 1;
                goto out;
        }

        ret = iterate_inodes_from_logical(logical, fs_info, path,
                                          record_inode_for_nocow, nocow_ctx);
        if (ret != 0 && ret != -ENOENT) {
                btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
                        "phys %llu, len %llu, mir %u, ret %d",
                        logical, physical_for_dev_replace, len, mirror_num,
                        ret);
                not_written = 1;
                goto out;
        }

        /* the transaction is ended before the copy loop below runs */
        btrfs_end_transaction(trans, root);
        trans = NULL;
        while (!list_empty(&nocow_ctx->inodes)) {
                struct scrub_nocow_inode *entry;
                entry = list_first_entry(&nocow_ctx->inodes,
                                         struct scrub_nocow_inode,
                                         list);
                list_del_init(&entry->list);
                ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
                                                 entry->root, nocow_ctx);
                kfree(entry);
                if (ret == COPY_COMPLETE) {
                        /* one inode was enough; skip the remaining ones */
                        ret = 0;
                        break;
                } else if (ret) {
                        break;
                }
        }
out:
        /* free whatever entries were recorded but not consumed above */
        while (!list_empty(&nocow_ctx->inodes)) {
                struct scrub_nocow_inode *entry;
                entry = list_first_entry(&nocow_ctx->inodes,
                                         struct scrub_nocow_inode,
                                         list);
                list_del_init(&entry->list);
                kfree(entry);
        }
        /* trans is NULL, an ERR_PTR, or a transaction that is still open */
        if (trans && !IS_ERR(trans))
                btrfs_end_transaction(trans, root);
        if (not_written)
                btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
                                            num_uncorrectable_read_errors);

        btrfs_free_path(path);
        kfree(nocow_ctx);

        scrub_pending_trans_workers_dec(sctx);
}
4062
4063 static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4064                                  u64 logical)
4065 {
4066         struct extent_state *cached_state = NULL;
4067         struct btrfs_ordered_extent *ordered;
4068         struct extent_io_tree *io_tree;
4069         struct extent_map *em;
4070         u64 lockstart = start, lockend = start + len - 1;
4071         int ret = 0;
4072
4073         io_tree = &BTRFS_I(inode)->io_tree;
4074
4075         lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
4076         ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4077         if (ordered) {
4078                 btrfs_put_ordered_extent(ordered);
4079                 ret = 1;
4080                 goto out_unlock;
4081         }
4082
4083         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4084         if (IS_ERR(em)) {
4085                 ret = PTR_ERR(em);
4086                 goto out_unlock;
4087         }
4088
4089         /*
4090          * This extent does not actually cover the logical extent anymore,
4091          * move on to the next inode.
4092          */
4093         if (em->block_start > logical ||
4094             em->block_start + em->block_len < logical + len) {
4095                 free_extent_map(em);
4096                 ret = 1;
4097                 goto out_unlock;
4098         }
4099         free_extent_map(em);
4100
4101 out_unlock:
4102         unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4103                              GFP_NOFS);
4104         return ret;
4105 }
4106
/*
 * Copy the range described by @nocow_ctx to the write target by going
 * through the page cache of one inode.
 *
 * @inum:      inode number
 * @offset:    file offset inside that inode where the range starts
 * @root:      objectid of the root the inode is looked up in
 * @nocow_ctx: describes logical start, length, mirror and target offset
 *
 * The inode is looked up and pinned, i_mutex is taken and outstanding
 * direct I/O is waited for.  Then, one PAGE_CACHE_SIZE page at a time, the
 * page is found or created in the page cache, read if it is not uptodate,
 * re-validated with check_extent_to_block() and written out with
 * write_page_nocow().  Pages that are uptodate and dirty are skipped.
 *
 * Returns:
 *   COPY_COMPLETE whenever the page loop was left, either because all full
 *                 pages were handled or because of a break;
 *   0             if the first check_extent_to_block() returned a positive
 *                 value (nothing is copied from this inode);
 *   negative errno from the root/inode lookup, from the first
 *                 check_extent_to_block(), or -ENOMEM if a page could not
 *                 be obtained.
 *
 * NOTE(review): a non-zero ret that breaks out of the loop (read error,
 * -EIO, write error) is overwritten by "ret = COPY_COMPLETE" below, so such
 * errors are not visible to the caller - confirm this is intended.
 * NOTE(review): the loop only runs while len >= PAGE_CACHE_SIZE, so a
 * trailing partial page is never copied; presumably len is always page
 * aligned - verify against the caller.
 */
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
                                      struct scrub_copy_nocow_ctx *nocow_ctx)
{
        struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
        struct btrfs_key key;
        struct inode *inode;
        struct page *page;
        struct btrfs_root *local_root;
        struct extent_io_tree *io_tree;
        u64 physical_for_dev_replace;
        u64 nocow_ctx_logical;
        u64 len = nocow_ctx->len;
        unsigned long index;
        int srcu_index;
        int ret = 0;
        int err = 0;

        key.objectid = root;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;

        /* the subvol_srcu read section spans the root and inode lookup */
        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
        if (IS_ERR(local_root)) {
                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
                return PTR_ERR(local_root);
        }

        key.type = BTRFS_INODE_ITEM_KEY;
        key.objectid = inum;
        key.offset = 0;
        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* Avoid truncate/dio/punch hole.. */
        mutex_lock(&inode->i_mutex);
        inode_dio_wait(inode);

        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        io_tree = &BTRFS_I(inode)->io_tree;
        nocow_ctx_logical = nocow_ctx->logical;

        /* check the whole range once before touching any page */
        ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
        if (ret) {
                /* a positive result is reported as 0, errors are passed on */
                ret = ret > 0 ? 0 : ret;
                goto out;
        }

        while (len >= PAGE_CACHE_SIZE) {
                index = offset >> PAGE_CACHE_SHIFT;
again:
                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
                if (!page) {
                        btrfs_err(fs_info, "find_or_create_page() failed");
                        ret = -ENOMEM;
                        goto out;
                }

                if (PageUptodate(page)) {
                        /* uptodate and dirty pages are skipped, not copied */
                        if (PageDirty(page))
                                goto next_page;
                } else {
                        ClearPageError(page);
                        err = extent_read_full_page(io_tree, page,
                                                           btrfs_get_extent,
                                                           nocow_ctx->mirror_num);
                        if (err) {
                                ret = err;
                                goto next_page;
                        }

                        lock_page(page);
                        /*
                         * If the page has been removed from the page cache,
                         * the data on it is meaningless: it may be stale,
                         * and newer data may have been written to a new
                         * page in the page cache.  Look the page up again.
                         */
                        if (page->mapping != inode->i_mapping) {
                                unlock_page(page);
                                page_cache_release(page);
                                goto again;
                        }
                        if (!PageUptodate(page)) {
                                ret = -EIO;
                                goto next_page;
                        }
                }

                /* re-check the remaining range right before writing */
                ret = check_extent_to_block(inode, offset, len,
                                            nocow_ctx_logical);
                if (ret) {
                        ret = ret > 0 ? 0 : ret;
                        goto next_page;
                }

                err = write_page_nocow(nocow_ctx->sctx,
                                       physical_for_dev_replace, page);
                if (err)
                        ret = err;
next_page:
                unlock_page(page);
                page_cache_release(page);

                if (ret)
                        break;

                /* advance file offset, target offset and logical in step */
                offset += PAGE_CACHE_SIZE;
                physical_for_dev_replace += PAGE_CACHE_SIZE;
                nocow_ctx_logical += PAGE_CACHE_SIZE;
                len -= PAGE_CACHE_SIZE;
        }
        /* reached on normal loop exit and on break; replaces any loop ret */
        ret = COPY_COMPLETE;
out:
        mutex_unlock(&inode->i_mutex);
        iput(inode);
        return ret;
}
4228
4229 static int write_page_nocow(struct scrub_ctx *sctx,
4230                             u64 physical_for_dev_replace, struct page *page)
4231 {
4232         struct bio *bio;
4233         struct btrfs_device *dev;
4234         int ret;
4235
4236         dev = sctx->wr_ctx.tgtdev;
4237         if (!dev)
4238                 return -EIO;
4239         if (!dev->bdev) {
4240                 printk_ratelimited(KERN_WARNING
4241                         "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
4242                 return -EIO;
4243         }
4244         bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
4245         if (!bio) {
4246                 spin_lock(&sctx->stat_lock);
4247                 sctx->stat.malloc_errors++;
4248                 spin_unlock(&sctx->stat_lock);
4249                 return -ENOMEM;
4250         }
4251         bio->bi_iter.bi_size = 0;
4252         bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4253         bio->bi_bdev = dev->bdev;
4254         ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
4255         if (ret != PAGE_CACHE_SIZE) {
4256 leave_with_eio:
4257                 bio_put(bio);
4258                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4259                 return -EIO;
4260         }
4261
4262         if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
4263                 goto leave_with_eio;
4264
4265         bio_put(bio);
4266         return 0;
4267 }