fs/btrfs/scrub.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
21 #include "raid56.h"
22 #include "block-group.h"
23 #include "zoned.h"
24
25 /*
26  * This is only the first step towards a full-featured scrub. It reads all
27  * extents and super blocks and verifies the checksums. In case a bad checksum
28  * is found or the extent cannot be read, good data will be written back if
29  * any can be found.
30  *
31  * Future enhancements:
32  *  - In case an unrepairable extent is encountered, track which files are
33  *    affected and report them
34  *  - track and record media errors, throw out bad devices
35  *  - add a mode to also read unallocated space
36  */
37
38 struct scrub_block;
39 struct scrub_ctx;
40
41 /*
42  * The following three values only influence performance.
43  * The last one configures the number of parallel and outstanding I/O
44  * operations. The first two values configure an upper limit for the number
45  * of (dynamically allocated) pages that are added to a bio.
46  */
47 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
48 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
49 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
50
51 /*
52  * the following value times PAGE_SIZE needs to be large enough to match the
53  * largest node/leaf/sector size that shall be supported.
54  * Values larger than BTRFS_STRIPE_LEN are not supported.
55  */
56 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
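
/*
 * Back-of-the-envelope sizing, assuming 4KiB pages: 32 pages * 4KiB = 128KiB
 * per read/write bio, 64 bios * 128KiB = 8MiB in flight per device, and
 * 16 pages * 4KiB = 64KiB, the largest supported node/leaf size.
 */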
57
58 struct scrub_recover {
59         refcount_t              refs;
60         struct btrfs_bio        *bbio;
61         u64                     map_length;
62 };
63
64 struct scrub_page {
65         struct scrub_block      *sblock;
66         struct page             *page;
67         struct btrfs_device     *dev;
68         struct list_head        list;
69         u64                     flags;  /* extent flags */
70         u64                     generation;
71         u64                     logical;
72         u64                     physical;
73         u64                     physical_for_dev_replace;
74         atomic_t                refs;
75         u8                      mirror_num;
76         int                     have_csum:1;
77         int                     io_error:1;
78         u8                      csum[BTRFS_CSUM_SIZE];
79
80         struct scrub_recover    *recover;
81 };
82
83 struct scrub_bio {
84         int                     index;
85         struct scrub_ctx        *sctx;
86         struct btrfs_device     *dev;
87         struct bio              *bio;
88         blk_status_t            status;
89         u64                     logical;
90         u64                     physical;
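        /* Size pagev[] for whichever of the read/write page limits is larger */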
91 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
93 #else
94         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
95 #endif
96         int                     page_count;
97         int                     next_free;
98         struct btrfs_work       work;
99 };
100
101 struct scrub_block {
102         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
103         int                     page_count;
104         atomic_t                outstanding_pages;
105         refcount_t              refs; /* free mem on transition to zero */
106         struct scrub_ctx        *sctx;
107         struct scrub_parity     *sparity;
108         struct {
109                 unsigned int    header_error:1;
110                 unsigned int    checksum_error:1;
111                 unsigned int    no_io_error_seen:1;
112                 unsigned int    generation_error:1; /* also sets header_error */
113
114                 /* The following is for the data used to check parity */
115                 /* It is for the data with checksum */
116                 unsigned int    data_corrected:1;
117         };
118         struct btrfs_work       work;
119 };
120
121 /* Used for the chunks with parity stripes, such as RAID5/6 */
122 struct scrub_parity {
123         struct scrub_ctx        *sctx;
124
125         struct btrfs_device     *scrub_dev;
126
127         u64                     logic_start;
128
129         u64                     logic_end;
130
131         int                     nsectors;
132
133         u32                     stripe_len;
134
135         refcount_t              refs;
136
137         struct list_head        spages;
138
139         /* Work of parity check and repair */
140         struct btrfs_work       work;
141
142         /* Mark the parity blocks which have data */
143         unsigned long           *dbitmap;
144
145         /*
146          * Mark the parity blocks which have data, but where errors happened
147          * when reading or checking that data
148          */
149         unsigned long           *ebitmap;
150
151         unsigned long           bitmap[];
152 };
153
154 struct scrub_ctx {
155         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
156         struct btrfs_fs_info    *fs_info;
157         int                     first_free;
158         int                     curr;
159         atomic_t                bios_in_flight;
160         atomic_t                workers_pending;
161         spinlock_t              list_lock;
162         wait_queue_head_t       list_wait;
163         struct list_head        csum_list;
164         atomic_t                cancel_req;
165         int                     readonly;
166         int                     pages_per_rd_bio;
167
168         int                     is_dev_replace;
169
170         struct scrub_bio        *wr_curr_bio;
171         struct mutex            wr_lock;
172         int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
173         struct btrfs_device     *wr_tgtdev;
174         bool                    flush_all_writes;
175
176         /*
177          * statistics
178          */
179         struct btrfs_scrub_progress stat;
180         spinlock_t              stat_lock;
181
182         /*
183          * Use a ref counter to avoid use-after-free issues. Scrub workers
184          * decrement bios_in_flight and workers_pending and then do a wakeup
185          * on the list_wait wait queue. We must ensure the main scrub task
186          * doesn't free the scrub context before or while the workers are
187          * doing the wakeup() call.
188          */
189         refcount_t              refs;
190 };
191
192 struct scrub_warning {
193         struct btrfs_path       *path;
194         u64                     extent_item_size;
195         const char              *errstr;
196         u64                     physical;
197         u64                     logical;
198         struct btrfs_device     *dev;
199 };
200
201 struct full_stripe_lock {
202         struct rb_node node;
203         u64 logical;
204         u64 refs;
205         struct mutex mutex;
206 };
207
208 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
209 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
210 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
211 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
212                                      struct scrub_block *sblocks_for_recheck);
213 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
214                                 struct scrub_block *sblock,
215                                 int retry_failed_mirror);
216 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
217 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
218                                              struct scrub_block *sblock_good);
219 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
220                                             struct scrub_block *sblock_good,
221                                             int page_num, int force_write);
222 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
223 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
224                                            int page_num);
225 static int scrub_checksum_data(struct scrub_block *sblock);
226 static int scrub_checksum_tree_block(struct scrub_block *sblock);
227 static int scrub_checksum_super(struct scrub_block *sblock);
228 static void scrub_block_get(struct scrub_block *sblock);
229 static void scrub_block_put(struct scrub_block *sblock);
230 static void scrub_page_get(struct scrub_page *spage);
231 static void scrub_page_put(struct scrub_page *spage);
232 static void scrub_parity_get(struct scrub_parity *sparity);
233 static void scrub_parity_put(struct scrub_parity *sparity);
234 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
235                                     struct scrub_page *spage);
236 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
237                        u64 physical, struct btrfs_device *dev, u64 flags,
238                        u64 gen, int mirror_num, u8 *csum,
239                        u64 physical_for_dev_replace);
240 static void scrub_bio_end_io(struct bio *bio);
241 static void scrub_bio_end_io_worker(struct btrfs_work *work);
242 static void scrub_block_complete(struct scrub_block *sblock);
243 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
244                                u64 extent_logical, u32 extent_len,
245                                u64 *extent_physical,
246                                struct btrfs_device **extent_dev,
247                                int *extent_mirror_num);
248 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
249                                     struct scrub_page *spage);
250 static void scrub_wr_submit(struct scrub_ctx *sctx);
251 static void scrub_wr_bio_end_io(struct bio *bio);
252 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
253 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
254 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
255 static void scrub_put_ctx(struct scrub_ctx *sctx);
256
257 static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
258 {
259         return spage->recover &&
260                (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
261 }
262
263 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
264 {
265         refcount_inc(&sctx->refs);
266         atomic_inc(&sctx->bios_in_flight);
267 }
268
269 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
270 {
271         atomic_dec(&sctx->bios_in_flight);
272         wake_up(&sctx->list_wait);
273         scrub_put_ctx(sctx);
274 }
275
276 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
277 {
278         while (atomic_read(&fs_info->scrub_pause_req)) {
279                 mutex_unlock(&fs_info->scrub_lock);
280                 wait_event(fs_info->scrub_pause_wait,
281                    atomic_read(&fs_info->scrub_pause_req) == 0);
282                 mutex_lock(&fs_info->scrub_lock);
283         }
284 }
285
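/*
 * Pause handshake: scrub_pause_on() announces that this scrub reached a pause
 * point by bumping scrubs_paused and waking any waiter; scrub_pause_off()
 * then blocks in __scrub_blocked_if_needed() until scrub_pause_req drops back
 * to zero before the scrub continues.
 */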
286 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
287 {
288         atomic_inc(&fs_info->scrubs_paused);
289         wake_up(&fs_info->scrub_pause_wait);
290 }
291
292 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
293 {
294         mutex_lock(&fs_info->scrub_lock);
295         __scrub_blocked_if_needed(fs_info);
296         atomic_dec(&fs_info->scrubs_paused);
297         mutex_unlock(&fs_info->scrub_lock);
298
299         wake_up(&fs_info->scrub_pause_wait);
300 }
301
302 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
303 {
304         scrub_pause_on(fs_info);
305         scrub_pause_off(fs_info);
306 }
307
308 /*
309  * Insert new full stripe lock into full stripe locks tree
310  *
311  * Return pointer to existing or newly inserted full_stripe_lock structure if
312  * everything works well.
313  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
314  *
315  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
316  * function
317  */
318 static struct full_stripe_lock *insert_full_stripe_lock(
319                 struct btrfs_full_stripe_locks_tree *locks_root,
320                 u64 fstripe_logical)
321 {
322         struct rb_node **p;
323         struct rb_node *parent = NULL;
324         struct full_stripe_lock *entry;
325         struct full_stripe_lock *ret;
326
327         lockdep_assert_held(&locks_root->lock);
328
329         p = &locks_root->root.rb_node;
330         while (*p) {
331                 parent = *p;
332                 entry = rb_entry(parent, struct full_stripe_lock, node);
333                 if (fstripe_logical < entry->logical) {
334                         p = &(*p)->rb_left;
335                 } else if (fstripe_logical > entry->logical) {
336                         p = &(*p)->rb_right;
337                 } else {
338                         entry->refs++;
339                         return entry;
340                 }
341         }
342
343         /*
344          * Insert new lock.
345          */
346         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
347         if (!ret)
348                 return ERR_PTR(-ENOMEM);
349         ret->logical = fstripe_logical;
350         ret->refs = 1;
351         mutex_init(&ret->mutex);
352
353         rb_link_node(&ret->node, parent, p);
354         rb_insert_color(&ret->node, &locks_root->root);
355         return ret;
356 }
357
358 /*
359  * Search for a full stripe lock of a block group
360  *
361  * Return pointer to existing full stripe lock if found
362  * Return NULL if not found
363  */
364 static struct full_stripe_lock *search_full_stripe_lock(
365                 struct btrfs_full_stripe_locks_tree *locks_root,
366                 u64 fstripe_logical)
367 {
368         struct rb_node *node;
369         struct full_stripe_lock *entry;
370
371         lockdep_assert_held(&locks_root->lock);
372
373         node = locks_root->root.rb_node;
374         while (node) {
375                 entry = rb_entry(node, struct full_stripe_lock, node);
376                 if (fstripe_logical < entry->logical)
377                         node = node->rb_left;
378                 else if (fstripe_logical > entry->logical)
379                         node = node->rb_right;
380                 else
381                         return entry;
382         }
383         return NULL;
384 }
385
386 /*
387  * Helper to get full stripe logical from a normal bytenr.
388  *
389  * Caller must ensure @cache is a RAID56 block group.
390  */
391 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
392 {
393         u64 ret;
394
395         /*
396          * Due to chunk item size limit, full stripe length should not be
397          * larger than U32_MAX. Just a sanity check here.
398          */
399         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
400
401         /*
402          * round_down() can only handle powers of 2, while a RAID56 full
403          * stripe length can be 64KiB * n, so we need to round down manually.
404          */
405         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
406                         cache->full_stripe_len + cache->start;
407         return ret;
408 }
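
/*
 * Illustrative sketch of the same round-down with made-up numbers: for a
 * block group starting at 1MiB with a 192KiB full stripe (3 data stripes *
 * 64KiB, not a power of two), a bytenr of 1MiB + 500KiB maps to the full
 * stripe starting at 1MiB + 384KiB.
 */
static inline u64 example_full_stripe_logical(u64 bg_start,
                                              u64 full_stripe_len, u64 bytenr)
{
        /* Truncating division, then scale back up and rebase on the start */
        return div64_u64(bytenr - bg_start, full_stripe_len) *
               full_stripe_len + bg_start;
}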
409
410 /*
411  * Lock a full stripe to avoid concurrency between recovery and read.
412  *
413  * It's only used for profiles with parity (RAID5/6); for other profiles it
414  * does nothing.
415  *
416  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
417  * The caller must then call unlock_full_stripe() in the same context.
418  *
419  * Return <0 if an error is encountered.
420  */
421 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
422                             bool *locked_ret)
423 {
424         struct btrfs_block_group *bg_cache;
425         struct btrfs_full_stripe_locks_tree *locks_root;
426         struct full_stripe_lock *existing;
427         u64 fstripe_start;
428         int ret = 0;
429
430         *locked_ret = false;
431         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
432         if (!bg_cache) {
433                 ASSERT(0);
434                 return -ENOENT;
435         }
436
437         /* Profiles not based on parity don't need full stripe lock */
438         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
439                 goto out;
440         locks_root = &bg_cache->full_stripe_locks_root;
441
442         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
443
444         /* Now insert the full stripe lock */
445         mutex_lock(&locks_root->lock);
446         existing = insert_full_stripe_lock(locks_root, fstripe_start);
447         mutex_unlock(&locks_root->lock);
448         if (IS_ERR(existing)) {
449                 ret = PTR_ERR(existing);
450                 goto out;
451         }
452         mutex_lock(&existing->mutex);
453         *locked_ret = true;
454 out:
455         btrfs_put_block_group(bg_cache);
456         return ret;
457 }
458
459 /*
460  * Unlock a full stripe.
461  *
462  * NOTE: The caller must be in the same context as the corresponding
463  * lock_full_stripe() call.
464  *
465  * Return 0 if we unlocked the full stripe without a problem.
466  * Return <0 for error.
467  */
468 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
469                               bool locked)
470 {
471         struct btrfs_block_group *bg_cache;
472         struct btrfs_full_stripe_locks_tree *locks_root;
473         struct full_stripe_lock *fstripe_lock;
474         u64 fstripe_start;
475         bool freeit = false;
476         int ret = 0;
477
478         /* If we didn't acquire full stripe lock, no need to continue */
479         if (!locked)
480                 return 0;
481
482         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
483         if (!bg_cache) {
484                 ASSERT(0);
485                 return -ENOENT;
486         }
487         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
488                 goto out;
489
490         locks_root = &bg_cache->full_stripe_locks_root;
491         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
492
493         mutex_lock(&locks_root->lock);
494         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
495         /* Unpaired unlock_full_stripe() detected */
496         if (!fstripe_lock) {
497                 WARN_ON(1);
498                 ret = -ENOENT;
499                 mutex_unlock(&locks_root->lock);
500                 goto out;
501         }
502
503         if (fstripe_lock->refs == 0) {
504                 WARN_ON(1);
505                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
506                         fstripe_lock->logical);
507         } else {
508                 fstripe_lock->refs--;
509         }
510
511         if (fstripe_lock->refs == 0) {
512                 rb_erase(&fstripe_lock->node, &locks_root->root);
513                 freeit = true;
514         }
515         mutex_unlock(&locks_root->lock);
516
517         mutex_unlock(&fstripe_lock->mutex);
518         if (freeit)
519                 kfree(fstripe_lock);
520 out:
521         btrfs_put_block_group(bg_cache);
522         return ret;
523 }
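
/*
 * Minimal usage sketch of the pair above (error handling trimmed): the bool
 * returned via @locked_ret must be passed back to unlock_full_stripe() from
 * the same context, which is what scrub_handle_errored_block() does.
 */
static inline int example_repair_full_stripe(struct btrfs_fs_info *fs_info,
                                             u64 bytenr)
{
        bool locked = false;
        int ret;

        ret = lock_full_stripe(fs_info, bytenr, &locked);
        if (ret < 0)
                return ret;

        /* ... recheck/repair the blocks covered by this full stripe ... */

        return unlock_full_stripe(fs_info, bytenr, locked);
}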
524
525 static void scrub_free_csums(struct scrub_ctx *sctx)
526 {
527         while (!list_empty(&sctx->csum_list)) {
528                 struct btrfs_ordered_sum *sum;
529                 sum = list_first_entry(&sctx->csum_list,
530                                        struct btrfs_ordered_sum, list);
531                 list_del(&sum->list);
532                 kfree(sum);
533         }
534 }
535
536 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
537 {
538         int i;
539
540         if (!sctx)
541                 return;
542
543         /* this can happen when scrub is cancelled */
544         if (sctx->curr != -1) {
545                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
546
547                 for (i = 0; i < sbio->page_count; i++) {
548                         WARN_ON(!sbio->pagev[i]->page);
549                         scrub_block_put(sbio->pagev[i]->sblock);
550                 }
551                 bio_put(sbio->bio);
552         }
553
554         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
555                 struct scrub_bio *sbio = sctx->bios[i];
556
557                 if (!sbio)
558                         break;
559                 kfree(sbio);
560         }
561
562         kfree(sctx->wr_curr_bio);
563         scrub_free_csums(sctx);
564         kfree(sctx);
565 }
566
567 static void scrub_put_ctx(struct scrub_ctx *sctx)
568 {
569         if (refcount_dec_and_test(&sctx->refs))
570                 scrub_free_ctx(sctx);
571 }
572
573 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
574                 struct btrfs_fs_info *fs_info, int is_dev_replace)
575 {
576         struct scrub_ctx *sctx;
577         int             i;
578
579         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
580         if (!sctx)
581                 goto nomem;
582         refcount_set(&sctx->refs, 1);
583         sctx->is_dev_replace = is_dev_replace;
584         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
585         sctx->curr = -1;
586         sctx->fs_info = fs_info;
587         INIT_LIST_HEAD(&sctx->csum_list);
588         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
589                 struct scrub_bio *sbio;
590
591                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
592                 if (!sbio)
593                         goto nomem;
594                 sctx->bios[i] = sbio;
595
596                 sbio->index = i;
597                 sbio->sctx = sctx;
598                 sbio->page_count = 0;
599                 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
600                                 NULL);
601
602                 if (i != SCRUB_BIOS_PER_SCTX - 1)
603                         sctx->bios[i]->next_free = i + 1;
604                 else
605                         sctx->bios[i]->next_free = -1;
606         }
607         sctx->first_free = 0;
608         atomic_set(&sctx->bios_in_flight, 0);
609         atomic_set(&sctx->workers_pending, 0);
610         atomic_set(&sctx->cancel_req, 0);
611
612         spin_lock_init(&sctx->list_lock);
613         spin_lock_init(&sctx->stat_lock);
614         init_waitqueue_head(&sctx->list_wait);
615
616         WARN_ON(sctx->wr_curr_bio != NULL);
617         mutex_init(&sctx->wr_lock);
618         sctx->wr_curr_bio = NULL;
619         if (is_dev_replace) {
620                 WARN_ON(!fs_info->dev_replace.tgtdev);
621                 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
622                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
623                 sctx->flush_all_writes = false;
624         }
625
626         return sctx;
627
628 nomem:
629         scrub_free_ctx(sctx);
630         return ERR_PTR(-ENOMEM);
631 }
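
/*
 * Sketch of how the index-based free list built in scrub_setup_ctx() is
 * typically consumed (the real consumers live further down in this file):
 * first_free is the head index and next_free links the remaining bios.
 */
static inline struct scrub_bio *example_pop_free_bio(struct scrub_ctx *sctx)
{
        struct scrub_bio *sbio = NULL;

        spin_lock(&sctx->list_lock);
        if (sctx->first_free != -1) {
                sbio = sctx->bios[sctx->first_free];
                sctx->first_free = sbio->next_free;
                sbio->next_free = -1;
        }
        spin_unlock(&sctx->list_lock);
        return sbio;
}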
632
633 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
634                                      void *warn_ctx)
635 {
636         u64 isize;
637         u32 nlink;
638         int ret;
639         int i;
640         unsigned nofs_flag;
641         struct extent_buffer *eb;
642         struct btrfs_inode_item *inode_item;
643         struct scrub_warning *swarn = warn_ctx;
644         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
645         struct inode_fs_paths *ipath = NULL;
646         struct btrfs_root *local_root;
647         struct btrfs_key key;
648
649         local_root = btrfs_get_fs_root(fs_info, root, true);
650         if (IS_ERR(local_root)) {
651                 ret = PTR_ERR(local_root);
652                 goto err;
653         }
654
655         /*
656          * this makes the path point to (inum INODE_ITEM ioff)
657          */
658         key.objectid = inum;
659         key.type = BTRFS_INODE_ITEM_KEY;
660         key.offset = 0;
661
662         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
663         if (ret) {
664                 btrfs_put_root(local_root);
665                 btrfs_release_path(swarn->path);
666                 goto err;
667         }
668
669         eb = swarn->path->nodes[0];
670         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
671                                         struct btrfs_inode_item);
672         isize = btrfs_inode_size(eb, inode_item);
673         nlink = btrfs_inode_nlink(eb, inode_item);
674         btrfs_release_path(swarn->path);
675
676         /*
677          * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
678          * uses GFP_NOFS in this context, so we keep it consistent but it does
679          * not seem to be strictly necessary.
680          */
681         nofs_flag = memalloc_nofs_save();
682         ipath = init_ipath(4096, local_root, swarn->path);
683         memalloc_nofs_restore(nofs_flag);
684         if (IS_ERR(ipath)) {
685                 btrfs_put_root(local_root);
686                 ret = PTR_ERR(ipath);
687                 ipath = NULL;
688                 goto err;
689         }
690         ret = paths_from_inode(inum, ipath);
691
692         if (ret < 0)
693                 goto err;
694
695         /*
696          * We deliberately ignore the fact that ipath might have been too
697          * small to hold all of the paths here
698          */
699         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
700                 btrfs_warn_in_rcu(fs_info,
701 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
702                                   swarn->errstr, swarn->logical,
703                                   rcu_str_deref(swarn->dev->name),
704                                   swarn->physical,
705                                   root, inum, offset,
706                                   min(isize - offset, (u64)PAGE_SIZE), nlink,
707                                   (char *)(unsigned long)ipath->fspath->val[i]);
708
709         btrfs_put_root(local_root);
710         free_ipath(ipath);
711         return 0;
712
713 err:
714         btrfs_warn_in_rcu(fs_info,
715                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
716                           swarn->errstr, swarn->logical,
717                           rcu_str_deref(swarn->dev->name),
718                           swarn->physical,
719                           root, inum, offset, ret);
720
721         free_ipath(ipath);
722         return 0;
723 }
724
725 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
726 {
727         struct btrfs_device *dev;
728         struct btrfs_fs_info *fs_info;
729         struct btrfs_path *path;
730         struct btrfs_key found_key;
731         struct extent_buffer *eb;
732         struct btrfs_extent_item *ei;
733         struct scrub_warning swarn;
734         unsigned long ptr = 0;
735         u64 extent_item_pos;
736         u64 flags = 0;
737         u64 ref_root;
738         u32 item_size;
739         u8 ref_level = 0;
740         int ret;
741
742         WARN_ON(sblock->page_count < 1);
743         dev = sblock->pagev[0]->dev;
744         fs_info = sblock->sctx->fs_info;
745
746         path = btrfs_alloc_path();
747         if (!path)
748                 return;
749
750         swarn.physical = sblock->pagev[0]->physical;
751         swarn.logical = sblock->pagev[0]->logical;
752         swarn.errstr = errstr;
753         swarn.dev = NULL;
754
755         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
756                                   &flags);
757         if (ret < 0)
758                 goto out;
759
760         extent_item_pos = swarn.logical - found_key.objectid;
761         swarn.extent_item_size = found_key.offset;
762
763         eb = path->nodes[0];
764         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
765         item_size = btrfs_item_size_nr(eb, path->slots[0]);
766
767         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
768                 do {
769                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
770                                                       item_size, &ref_root,
771                                                       &ref_level);
772                         btrfs_warn_in_rcu(fs_info,
773 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
774                                 errstr, swarn.logical,
775                                 rcu_str_deref(dev->name),
776                                 swarn.physical,
777                                 ref_level ? "node" : "leaf",
778                                 ret < 0 ? -1 : ref_level,
779                                 ret < 0 ? -1 : ref_root);
780                 } while (ret != 1);
781                 btrfs_release_path(path);
782         } else {
783                 btrfs_release_path(path);
784                 swarn.path = path;
785                 swarn.dev = dev;
786                 iterate_extent_inodes(fs_info, found_key.objectid,
787                                         extent_item_pos, 1,
788                                         scrub_print_warning_inode, &swarn, false);
789         }
790
791 out:
792         btrfs_free_path(path);
793 }
794
795 static inline void scrub_get_recover(struct scrub_recover *recover)
796 {
797         refcount_inc(&recover->refs);
798 }
799
800 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
801                                      struct scrub_recover *recover)
802 {
803         if (refcount_dec_and_test(&recover->refs)) {
804                 btrfs_bio_counter_dec(fs_info);
805                 btrfs_put_bbio(recover->bbio);
806                 kfree(recover);
807         }
808 }
809
810 /*
811  * scrub_handle_errored_block gets called when either verification of the
812  * pages failed or the bio failed to read, e.g. with EIO. In the latter
813  * case, this function handles all pages in the bio, even though only one
814  * may be bad.
815  * The goal of this function is to repair the errored block by using the
816  * contents of one of the mirrors.
817  */
818 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
819 {
820         struct scrub_ctx *sctx = sblock_to_check->sctx;
821         struct btrfs_device *dev;
822         struct btrfs_fs_info *fs_info;
823         u64 logical;
824         unsigned int failed_mirror_index;
825         unsigned int is_metadata;
826         unsigned int have_csum;
827         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
828         struct scrub_block *sblock_bad;
829         int ret;
830         int mirror_index;
831         int page_num;
832         int success;
833         bool full_stripe_locked;
834         unsigned int nofs_flag;
835         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
836                                       DEFAULT_RATELIMIT_BURST);
837
838         BUG_ON(sblock_to_check->page_count < 1);
839         fs_info = sctx->fs_info;
840         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
841                 /*
842                  * If we find an error in a super block, we just report it.
843                  * Super blocks will get written with the next transaction
844                  * commit anyway.
845                  */
846                 spin_lock(&sctx->stat_lock);
847                 ++sctx->stat.super_errors;
848                 spin_unlock(&sctx->stat_lock);
849                 return 0;
850         }
851         logical = sblock_to_check->pagev[0]->logical;
852         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
853         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
854         is_metadata = !(sblock_to_check->pagev[0]->flags &
855                         BTRFS_EXTENT_FLAG_DATA);
856         have_csum = sblock_to_check->pagev[0]->have_csum;
857         dev = sblock_to_check->pagev[0]->dev;
858
859         /*
860          * We must use GFP_NOFS because the scrub task might be waiting for a
861          * worker task executing this function and in turn a transaction commit
862          * might be waiting for the scrub task to pause (which needs to wait
863          * for all the worker tasks to complete before pausing).
864          * We do allocations in the workers through insert_full_stripe_lock()
865          * and scrub_add_page_to_wr_bio(), which happens down the call chain of
866          * this function.
867          */
868         nofs_flag = memalloc_nofs_save();
869         /*
870          * For RAID5/6, a race can happen between scrub threads of different
871          * devices. For data corruption, the parity and data threads will
872          * both try to recover the data.
873          * The race can lead to doubly counted csum errors, or even an
874          * unrecoverable error.
875          */
876         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
877         if (ret < 0) {
878                 memalloc_nofs_restore(nofs_flag);
879                 spin_lock(&sctx->stat_lock);
880                 if (ret == -ENOMEM)
881                         sctx->stat.malloc_errors++;
882                 sctx->stat.read_errors++;
883                 sctx->stat.uncorrectable_errors++;
884                 spin_unlock(&sctx->stat_lock);
885                 return ret;
886         }
887
888         /*
889          * Read all mirrors one after the other. This includes re-reading
890          * the extent or metadata block that failed (which is the reason
891          * this fixup code is called), this time page by page in order to
892          * know which pages caused I/O errors and which ones are good (for
893          * all mirrors).
894          * It is the goal to handle the situation when more than one
895          * mirror contains I/O errors, but the errors do not
896          * overlap, i.e. the data can be repaired by selecting the
897          * pages from those mirrors without I/O error on the
898          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
899          * would be that mirror #1 has an I/O error on the first page,
900          * the second page is good, and mirror #2 has an I/O error on
901          * the second page, but the first page is good.
902          * Then the first page of the first mirror can be repaired by
903          * taking the first page of the second mirror, and the
904          * second page of the second mirror can be repaired by
905          * copying the contents of the 2nd page of the 1st mirror.
906          * One more note: if the pages of one mirror contain I/O
907          * errors, the checksum cannot be verified. In order to get
908          * the best data for repairing, the first attempt is to find
909          * a mirror without I/O errors and with a validated checksum.
910          * Only if this is not possible, the pages are picked from
911          * mirrors with I/O errors without considering the checksum.
912          * If the latter is the case, at the end, the checksum of the
913          * repaired area is verified in order to correctly maintain
914          * the statistics.
915          */
916
917         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
918                                       sizeof(*sblocks_for_recheck), GFP_KERNEL);
919         if (!sblocks_for_recheck) {
920                 spin_lock(&sctx->stat_lock);
921                 sctx->stat.malloc_errors++;
922                 sctx->stat.read_errors++;
923                 sctx->stat.uncorrectable_errors++;
924                 spin_unlock(&sctx->stat_lock);
925                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
926                 goto out;
927         }
928
929         /* setup the context, map the logical blocks and alloc the pages */
930         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
931         if (ret) {
932                 spin_lock(&sctx->stat_lock);
933                 sctx->stat.read_errors++;
934                 sctx->stat.uncorrectable_errors++;
935                 spin_unlock(&sctx->stat_lock);
936                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
937                 goto out;
938         }
939         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
940         sblock_bad = sblocks_for_recheck + failed_mirror_index;
941
942         /* build and submit the bios for the failed mirror, check checksums */
943         scrub_recheck_block(fs_info, sblock_bad, 1);
944
945         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
946             sblock_bad->no_io_error_seen) {
947                 /*
948                  * the error disappeared after reading page by page, or
949                  * the area was part of a huge bio and other parts of the
950                  * bio caused I/O errors, or the block layer merged several
951                  * read requests into one and the error is caused by a
952                  * different bio (usually one of the two latter cases is
953                  * the cause)
954                  */
955                 spin_lock(&sctx->stat_lock);
956                 sctx->stat.unverified_errors++;
957                 sblock_to_check->data_corrected = 1;
958                 spin_unlock(&sctx->stat_lock);
959
960                 if (sctx->is_dev_replace)
961                         scrub_write_block_to_dev_replace(sblock_bad);
962                 goto out;
963         }
964
965         if (!sblock_bad->no_io_error_seen) {
966                 spin_lock(&sctx->stat_lock);
967                 sctx->stat.read_errors++;
968                 spin_unlock(&sctx->stat_lock);
969                 if (__ratelimit(&rs))
970                         scrub_print_warning("i/o error", sblock_to_check);
971                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
972         } else if (sblock_bad->checksum_error) {
973                 spin_lock(&sctx->stat_lock);
974                 sctx->stat.csum_errors++;
975                 spin_unlock(&sctx->stat_lock);
976                 if (__ratelimit(&rs))
977                         scrub_print_warning("checksum error", sblock_to_check);
978                 btrfs_dev_stat_inc_and_print(dev,
979                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
980         } else if (sblock_bad->header_error) {
981                 spin_lock(&sctx->stat_lock);
982                 sctx->stat.verify_errors++;
983                 spin_unlock(&sctx->stat_lock);
984                 if (__ratelimit(&rs))
985                         scrub_print_warning("checksum/header error",
986                                             sblock_to_check);
987                 if (sblock_bad->generation_error)
988                         btrfs_dev_stat_inc_and_print(dev,
989                                 BTRFS_DEV_STAT_GENERATION_ERRS);
990                 else
991                         btrfs_dev_stat_inc_and_print(dev,
992                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
993         }
994
995         if (sctx->readonly) {
996                 ASSERT(!sctx->is_dev_replace);
997                 goto out;
998         }
999
1000         /*
1001          * now build and submit the bios for the other mirrors, check
1002          * checksums.
1003          * First try to pick the mirror which is completely without I/O
1004          * errors and also does not have a checksum error.
1005          * If one is found, and if a checksum is present, the full block
1006          * that is known to contain an error is rewritten. Afterwards
1007          * the block is known to be corrected.
1008          * If a mirror is found which is completely correct, and no
1009          * checksum is present, only those pages are rewritten that had
1010          * an I/O error in the block to be repaired, since it cannot be
1011          * determined which copy of the other pages is better (and it
1012          * could happen otherwise that a correct page would be
1013          * overwritten by a bad one).
1014          */
1015         for (mirror_index = 0; ;mirror_index++) {
1016                 struct scrub_block *sblock_other;
1017
1018                 if (mirror_index == failed_mirror_index)
1019                         continue;
1020
1021                 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1022                 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1023                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1024                                 break;
1025                         if (!sblocks_for_recheck[mirror_index].page_count)
1026                                 break;
1027
1028                         sblock_other = sblocks_for_recheck + mirror_index;
1029                 } else {
1030                         struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1031                         int max_allowed = r->bbio->num_stripes -
1032                                                 r->bbio->num_tgtdevs;
1033
1034                         if (mirror_index >= max_allowed)
1035                                 break;
1036                         if (!sblocks_for_recheck[1].page_count)
1037                                 break;
1038
1039                         ASSERT(failed_mirror_index == 0);
1040                         sblock_other = sblocks_for_recheck + 1;
1041                         sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1042                 }
1043
1044                 /* build and submit the bios, check checksums */
1045                 scrub_recheck_block(fs_info, sblock_other, 0);
1046
1047                 if (!sblock_other->header_error &&
1048                     !sblock_other->checksum_error &&
1049                     sblock_other->no_io_error_seen) {
1050                         if (sctx->is_dev_replace) {
1051                                 scrub_write_block_to_dev_replace(sblock_other);
1052                                 goto corrected_error;
1053                         } else {
1054                                 ret = scrub_repair_block_from_good_copy(
1055                                                 sblock_bad, sblock_other);
1056                                 if (!ret)
1057                                         goto corrected_error;
1058                         }
1059                 }
1060         }
1061
1062         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1063                 goto did_not_correct_error;
1064
1065         /*
1066          * In case of I/O errors in the area that is supposed to be
1067          * repaired, continue by picking good copies of those pages.
1068          * Select the good pages from mirrors to rewrite bad pages from
1069          * the area to fix. Afterwards verify the checksum of the block
1070          * that is supposed to be repaired. This verification step is
1071          * only done for the purpose of statistic counting and for the
1072          * only done for the purpose of statistics counting and for the
1073          * final scrub report on whether errors remain.
1074          * all possible combinations of pages from the different mirrors
1075          * until the checksum verification succeeds. For example, when
1076          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1077          * of mirror #2 is readable but the final checksum test fails,
1078          * then the 2nd page of mirror #3 could be tried, to see whether
1079          * the final checksum now succeeds. But this would be a rare
1080          * exception and is therefore not implemented. At least it is
1081          * avoided that the good copy is overwritten.
1082          * A more useful improvement would be to pick the sectors
1083          * without I/O error based on sector sizes (512 bytes on legacy
1084          * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1085          * mirror could be repaired by taking 512 bytes of a different
1086          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1087          * area are unreadable.
1088          */
1089         success = 1;
1090         for (page_num = 0; page_num < sblock_bad->page_count;
1091              page_num++) {
1092                 struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1093                 struct scrub_block *sblock_other = NULL;
1094
1095                 /* skip no-io-error page in scrub */
1096                 if (!spage_bad->io_error && !sctx->is_dev_replace)
1097                         continue;
1098
1099                 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1100                         /*
1101                          * In case of dev replace, if the raid56 rebuild process
1102                          * didn't produce correct data, copy the content of
1103                          * sblock_bad to make sure the target device is identical
1104                          * to the source device, instead of writing garbage data
1105                          * from the sblock_for_recheck array to the target device.
1106                          */
1107                         sblock_other = NULL;
1108                 } else if (spage_bad->io_error) {
1109                         /* try to find no-io-error page in mirrors */
1110                         for (mirror_index = 0;
1111                              mirror_index < BTRFS_MAX_MIRRORS &&
1112                              sblocks_for_recheck[mirror_index].page_count > 0;
1113                              mirror_index++) {
1114                                 if (!sblocks_for_recheck[mirror_index].
1115                                     pagev[page_num]->io_error) {
1116                                         sblock_other = sblocks_for_recheck +
1117                                                        mirror_index;
1118                                         break;
1119                                 }
1120                         }
1121                         if (!sblock_other)
1122                                 success = 0;
1123                 }
1124
1125                 if (sctx->is_dev_replace) {
1126                         /*
1127                          * did not find a mirror to fetch the page
1128                          * from. scrub_write_page_to_dev_replace()
1129                          * handles this case (page->io_error), by
1130                          * filling the block with zeros before
1131                          * submitting the write request
1132                          */
1133                         if (!sblock_other)
1134                                 sblock_other = sblock_bad;
1135
1136                         if (scrub_write_page_to_dev_replace(sblock_other,
1137                                                             page_num) != 0) {
1138                                 atomic64_inc(
1139                                         &fs_info->dev_replace.num_write_errors);
1140                                 success = 0;
1141                         }
1142                 } else if (sblock_other) {
1143                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1144                                                                sblock_other,
1145                                                                page_num, 0);
1146                         if (ret == 0)
1147                                 spage_bad->io_error = 0;
1148                         else
1149                                 success = 0;
1150                 }
1151         }
1152
1153         if (success && !sctx->is_dev_replace) {
1154                 if (is_metadata || have_csum) {
1155                         /*
1156                          * need to verify the checksum now that all
1157                          * sectors on disk are repaired (the write
1158                          * request for data to be repaired is on its way).
1159                          * Just be lazy and use scrub_recheck_block()
1160                          * which re-reads the data before the checksum
1161                          * is verified, but most likely the data comes out
1162                          * of the page cache.
1163                          */
1164                         scrub_recheck_block(fs_info, sblock_bad, 1);
1165                         if (!sblock_bad->header_error &&
1166                             !sblock_bad->checksum_error &&
1167                             sblock_bad->no_io_error_seen)
1168                                 goto corrected_error;
1169                         else
1170                                 goto did_not_correct_error;
1171                 } else {
1172 corrected_error:
1173                         spin_lock(&sctx->stat_lock);
1174                         sctx->stat.corrected_errors++;
1175                         sblock_to_check->data_corrected = 1;
1176                         spin_unlock(&sctx->stat_lock);
1177                         btrfs_err_rl_in_rcu(fs_info,
1178                                 "fixed up error at logical %llu on dev %s",
1179                                 logical, rcu_str_deref(dev->name));
1180                 }
1181         } else {
1182 did_not_correct_error:
1183                 spin_lock(&sctx->stat_lock);
1184                 sctx->stat.uncorrectable_errors++;
1185                 spin_unlock(&sctx->stat_lock);
1186                 btrfs_err_rl_in_rcu(fs_info,
1187                         "unable to fixup (regular) error at logical %llu on dev %s",
1188                         logical, rcu_str_deref(dev->name));
1189         }
1190
1191 out:
1192         if (sblocks_for_recheck) {
1193                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1194                      mirror_index++) {
1195                         struct scrub_block *sblock = sblocks_for_recheck +
1196                                                      mirror_index;
1197                         struct scrub_recover *recover;
1198                         int page_index;
1199
1200                         for (page_index = 0; page_index < sblock->page_count;
1201                              page_index++) {
1202                                 sblock->pagev[page_index]->sblock = NULL;
1203                                 recover = sblock->pagev[page_index]->recover;
1204                                 if (recover) {
1205                                         scrub_put_recover(fs_info, recover);
1206                                         sblock->pagev[page_index]->recover =
1207                                                                         NULL;
1208                                 }
1209                                 scrub_page_put(sblock->pagev[page_index]);
1210                         }
1211                 }
1212                 kfree(sblocks_for_recheck);
1213         }
1214
1215         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1216         memalloc_nofs_restore(nofs_flag);
1217         if (ret < 0)
1218                 return ret;
1219         return 0;
1220 }
1221
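/*
 * Number of ways a block in this chunk can be read; an interpretation of the
 * constants below: for RAID5 the data can be read directly or rebuilt from
 * the remaining stripes plus P (2 ways), RAID6 additionally allows a rebuild
 * using Q (3 ways), and for the other profiles every stripe is a full copy.
 */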
1222 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1223 {
1224         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1225                 return 2;
1226         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1227                 return 3;
1228         else
1229                 return (int)bbio->num_stripes;
1230 }
1231
1232 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1233                                                  u64 *raid_map,
1234                                                  u64 mapped_length,
1235                                                  int nstripes, int mirror,
1236                                                  int *stripe_index,
1237                                                  u64 *stripe_offset)
1238 {
1239         int i;
1240
1241         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1242                 /* RAID5/6 */
1243                 for (i = 0; i < nstripes; i++) {
1244                         if (raid_map[i] == RAID6_Q_STRIPE ||
1245                             raid_map[i] == RAID5_P_STRIPE)
1246                                 continue;
1247
1248                         if (logical >= raid_map[i] &&
1249                             logical < raid_map[i] + mapped_length)
1250                                 break;
1251                 }
1252
1253                 *stripe_index = i;
1254                 *stripe_offset = logical - raid_map[i];
1255         } else {
1256                 /* The other RAID type */
1257                 *stripe_index = mirror;
1258                 *stripe_offset = 0;
1259         }
1260 }
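
/*
 * Worked example for the RAID56 branch above, with made-up values: given
 * raid_map[] = { 0, 64K, RAID5_P_STRIPE }, mapped_length = 64K and
 * logical = 64K + 4K, the loop rejects stripe 0 (out of range) and stops at
 * i = 1 before reaching the P stripe, so *stripe_index = 1 and
 * *stripe_offset = 4K.
 */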
1261
1262 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1263                                      struct scrub_block *sblocks_for_recheck)
1264 {
1265         struct scrub_ctx *sctx = original_sblock->sctx;
1266         struct btrfs_fs_info *fs_info = sctx->fs_info;
1267         u64 length = original_sblock->page_count * PAGE_SIZE;
1268         u64 logical = original_sblock->pagev[0]->logical;
1269         u64 generation = original_sblock->pagev[0]->generation;
1270         u64 flags = original_sblock->pagev[0]->flags;
1271         u64 have_csum = original_sblock->pagev[0]->have_csum;
1272         struct scrub_recover *recover;
1273         struct btrfs_bio *bbio;
1274         u64 sublen;
1275         u64 mapped_length;
1276         u64 stripe_offset;
1277         int stripe_index;
1278         int page_index = 0;
1279         int mirror_index;
1280         int nmirrors;
1281         int ret;
1282
1283         /*
1284          * note: the two members refs and outstanding_pages
1285          * are not used (and not set) in the blocks that are used for
1286          * the recheck procedure
1287          */
1288
1289         while (length > 0) {
1290                 sublen = min_t(u64, length, PAGE_SIZE);
1291                 mapped_length = sublen;
1292                 bbio = NULL;
1293
1294                 /*
1295                  * with a length of PAGE_SIZE, each returned stripe
1296                  * represents one mirror
1297                  */
1298                 btrfs_bio_counter_inc_blocked(fs_info);
1299                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1300                                 logical, &mapped_length, &bbio);
1301                 if (ret || !bbio || mapped_length < sublen) {
1302                         btrfs_put_bbio(bbio);
1303                         btrfs_bio_counter_dec(fs_info);
1304                         return -EIO;
1305                 }
1306
1307                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1308                 if (!recover) {
1309                         btrfs_put_bbio(bbio);
1310                         btrfs_bio_counter_dec(fs_info);
1311                         return -ENOMEM;
1312                 }
1313
1314                 refcount_set(&recover->refs, 1);
1315                 recover->bbio = bbio;
1316                 recover->map_length = mapped_length;
1317
1318                 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1319
1320                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1321
1322                 for (mirror_index = 0; mirror_index < nmirrors;
1323                      mirror_index++) {
1324                         struct scrub_block *sblock;
1325                         struct scrub_page *spage;
1326
1327                         sblock = sblocks_for_recheck + mirror_index;
1328                         sblock->sctx = sctx;
1329
1330                         spage = kzalloc(sizeof(*spage), GFP_NOFS);
1331                         if (!spage) {
1332 leave_nomem:
1333                                 spin_lock(&sctx->stat_lock);
1334                                 sctx->stat.malloc_errors++;
1335                                 spin_unlock(&sctx->stat_lock);
1336                                 scrub_put_recover(fs_info, recover);
1337                                 return -ENOMEM;
1338                         }
1339                         scrub_page_get(spage);
1340                         sblock->pagev[page_index] = spage;
1341                         spage->sblock = sblock;
1342                         spage->flags = flags;
1343                         spage->generation = generation;
1344                         spage->logical = logical;
1345                         spage->have_csum = have_csum;
1346                         if (have_csum)
1347                                 memcpy(spage->csum,
1348                                        original_sblock->pagev[0]->csum,
1349                                        sctx->fs_info->csum_size);
1350
1351                         scrub_stripe_index_and_offset(logical,
1352                                                       bbio->map_type,
1353                                                       bbio->raid_map,
1354                                                       mapped_length,
1355                                                       bbio->num_stripes -
1356                                                       bbio->num_tgtdevs,
1357                                                       mirror_index,
1358                                                       &stripe_index,
1359                                                       &stripe_offset);
1360                         spage->physical = bbio->stripes[stripe_index].physical +
1361                                          stripe_offset;
1362                         spage->dev = bbio->stripes[stripe_index].dev;
1363
1364                         BUG_ON(page_index >= original_sblock->page_count);
1365                         spage->physical_for_dev_replace =
1366                                 original_sblock->pagev[page_index]->
1367                                 physical_for_dev_replace;
1368                         /* for missing devices, dev->bdev is NULL */
1369                         spage->mirror_num = mirror_index + 1;
1370                         sblock->page_count++;
1371                         spage->page = alloc_page(GFP_NOFS);
1372                         if (!spage->page)
1373                                 goto leave_nomem;
1374
1375                         scrub_get_recover(recover);
1376                         spage->recover = recover;
1377                 }
1378                 scrub_put_recover(fs_info, recover);
1379                 length -= sublen;
1380                 logical += sublen;
1381                 page_index++;
1382         }
1383
1384         return 0;
1385 }
1386
1387 static void scrub_bio_wait_endio(struct bio *bio)
1388 {
1389         complete(bio->bi_private);
1390 }
1391
1392 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1393                                         struct bio *bio,
1394                                         struct scrub_page *spage)
1395 {
1396         DECLARE_COMPLETION_ONSTACK(done);
1397         int ret;
1398         int mirror_num;
1399
1400         bio->bi_iter.bi_sector = spage->logical >> 9;
1401         bio->bi_private = &done;
1402         bio->bi_end_io = scrub_bio_wait_endio;
1403
1404         mirror_num = spage->sblock->pagev[0]->mirror_num;
1405         ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
1406                                     spage->recover->map_length,
1407                                     mirror_num, 0);
1408         if (ret)
1409                 return ret;
1410
1411         wait_for_completion_io(&done);
1412         return blk_status_to_errno(bio->bi_status);
1413 }
1414
1415 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1416                                           struct scrub_block *sblock)
1417 {
1418         struct scrub_page *first_page = sblock->pagev[0];
1419         struct bio *bio;
1420         int page_num;
1421
1422         /* All pages in sblock belong to the same stripe on the same device. */
1423         ASSERT(first_page->dev);
1424         if (!first_page->dev->bdev)
1425                 goto out;
1426
1427         bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1428         bio_set_dev(bio, first_page->dev->bdev);
1429
1430         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1431                 struct scrub_page *spage = sblock->pagev[page_num];
1432
1433                 WARN_ON(!spage->page);
1434                 bio_add_page(bio, spage->page, PAGE_SIZE, 0);
1435         }
1436
1437         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1438                 bio_put(bio);
1439                 goto out;
1440         }
1441
1442         bio_put(bio);
1443
1444         scrub_recheck_block_checksum(sblock);
1445
1446         return;
1447 out:
1448         for (page_num = 0; page_num < sblock->page_count; page_num++)
1449                 sblock->pagev[page_num]->io_error = 1;
1450
1451         sblock->no_io_error_seen = 0;
1452 }
1453
1454 /*
1455  * This function checks the on-disk data for checksum errors, header
1456  * errors and read I/O errors. If any I/O error happens, the exact pages
1457  * that hit the error are marked as bad. The goal is to enable scrub to
1458  * take the non-errored pages from all the mirrors so that the errored
1459  * pages in the just handled mirror can be repaired.
1460  */
1461 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1462                                 struct scrub_block *sblock,
1463                                 int retry_failed_mirror)
1464 {
1465         int page_num;
1466
1467         sblock->no_io_error_seen = 1;
1468
1469         /* Shortcut for RAID56 */
1470         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1471                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1472
1473         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1474                 struct bio *bio;
1475                 struct scrub_page *spage = sblock->pagev[page_num];
1476
1477                 if (spage->dev->bdev == NULL) {
1478                         spage->io_error = 1;
1479                         sblock->no_io_error_seen = 0;
1480                         continue;
1481                 }
1482
1483                 WARN_ON(!spage->page);
1484                 bio = btrfs_io_bio_alloc(1);
1485                 bio_set_dev(bio, spage->dev->bdev);
1486
1487                 bio_add_page(bio, spage->page, PAGE_SIZE, 0);
1488                 bio->bi_iter.bi_sector = spage->physical >> 9;
1489                 bio->bi_opf = REQ_OP_READ;
1490
1491                 if (btrfsic_submit_bio_wait(bio)) {
1492                         spage->io_error = 1;
1493                         sblock->no_io_error_seen = 0;
1494                 }
1495
1496                 bio_put(bio);
1497         }
1498
1499         if (sblock->no_io_error_seen)
1500                 scrub_recheck_block_checksum(sblock);
1501 }
1502
1503 static inline int scrub_check_fsid(u8 fsid[],
1504                                    struct scrub_page *spage)
1505 {
1506         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1507         int ret;
1508
1509         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1510         return !ret;
1511 }
1512
1513 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1514 {
1515         sblock->header_error = 0;
1516         sblock->checksum_error = 0;
1517         sblock->generation_error = 0;
1518
1519         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1520                 scrub_checksum_data(sblock);
1521         else
1522                 scrub_checksum_tree_block(sblock);
1523 }
1524
1525 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1526                                              struct scrub_block *sblock_good)
1527 {
1528         int page_num;
1529         int ret = 0;
1530
1531         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1532                 int ret_sub;
1533
1534                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1535                                                            sblock_good,
1536                                                            page_num, 1);
1537                 if (ret_sub)
1538                         ret = ret_sub;
1539         }
1540
1541         return ret;
1542 }
1543
1544 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1545                                             struct scrub_block *sblock_good,
1546                                             int page_num, int force_write)
1547 {
1548         struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1549         struct scrub_page *spage_good = sblock_good->pagev[page_num];
1550         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1551
1552         BUG_ON(spage_bad->page == NULL);
1553         BUG_ON(spage_good->page == NULL);
1554         if (force_write || sblock_bad->header_error ||
1555             sblock_bad->checksum_error || spage_bad->io_error) {
1556                 struct bio *bio;
1557                 int ret;
1558
1559                 if (!spage_bad->dev->bdev) {
1560                         btrfs_warn_rl(fs_info,
1561                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1562                         return -EIO;
1563                 }
1564
1565                 bio = btrfs_io_bio_alloc(1);
1566                 bio_set_dev(bio, spage_bad->dev->bdev);
1567                 bio->bi_iter.bi_sector = spage_bad->physical >> 9;
1568                 bio->bi_opf = REQ_OP_WRITE;
1569
1570                 ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
1571                 if (PAGE_SIZE != ret) {
1572                         bio_put(bio);
1573                         return -EIO;
1574                 }
1575
1576                 if (btrfsic_submit_bio_wait(bio)) {
1577                         btrfs_dev_stat_inc_and_print(spage_bad->dev,
1578                                 BTRFS_DEV_STAT_WRITE_ERRS);
1579                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1580                         bio_put(bio);
1581                         return -EIO;
1582                 }
1583                 bio_put(bio);
1584         }
1585
1586         return 0;
1587 }
1588
1589 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1590 {
1591         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1592         int page_num;
1593
1594         /*
1595          * This block is used to check the parity on the source device,
1596          * so the data need not be written to the destination device.
1597          */
1598         if (sblock->sparity)
1599                 return;
1600
1601         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1602                 int ret;
1603
1604                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1605                 if (ret)
1606                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1607         }
1608 }
1609
1610 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1611                                            int page_num)
1612 {
1613         struct scrub_page *spage = sblock->pagev[page_num];
1614
1615         BUG_ON(spage->page == NULL);
1616         if (spage->io_error)
1617                 clear_page(page_address(spage->page));
1618
1619         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1620 }
1621
1622 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1623                                     struct scrub_page *spage)
1624 {
1625         struct scrub_bio *sbio;
1626         int ret;
1627
1628         mutex_lock(&sctx->wr_lock);
1629 again:
1630         if (!sctx->wr_curr_bio) {
1631                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1632                                               GFP_KERNEL);
1633                 if (!sctx->wr_curr_bio) {
1634                         mutex_unlock(&sctx->wr_lock);
1635                         return -ENOMEM;
1636                 }
1637                 sctx->wr_curr_bio->sctx = sctx;
1638                 sctx->wr_curr_bio->page_count = 0;
1639         }
1640         sbio = sctx->wr_curr_bio;
1641         if (sbio->page_count == 0) {
1642                 struct bio *bio;
1643
1644                 sbio->physical = spage->physical_for_dev_replace;
1645                 sbio->logical = spage->logical;
1646                 sbio->dev = sctx->wr_tgtdev;
1647                 bio = sbio->bio;
1648                 if (!bio) {
1649                         bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1650                         sbio->bio = bio;
1651                 }
1652
1653                 bio->bi_private = sbio;
1654                 bio->bi_end_io = scrub_wr_bio_end_io;
1655                 bio_set_dev(bio, sbio->dev->bdev);
1656                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1657                 bio->bi_opf = REQ_OP_WRITE;
1658                 sbio->status = 0;
1659         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1660                    spage->physical_for_dev_replace ||
1661                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1662                    spage->logical) {
1663                 scrub_wr_submit(sctx);
1664                 goto again;
1665         }
1666
1667         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1668         if (ret != PAGE_SIZE) {
1669                 if (sbio->page_count < 1) {
1670                         bio_put(sbio->bio);
1671                         sbio->bio = NULL;
1672                         mutex_unlock(&sctx->wr_lock);
1673                         return -EIO;
1674                 }
1675                 scrub_wr_submit(sctx);
1676                 goto again;
1677         }
1678
1679         sbio->pagev[sbio->page_count] = spage;
1680         scrub_page_get(spage);
1681         sbio->page_count++;
1682         if (sbio->page_count == sctx->pages_per_wr_bio)
1683                 scrub_wr_submit(sctx);
1684         mutex_unlock(&sctx->wr_lock);
1685
1686         return 0;
1687 }
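
/*
 * The contiguity check in scrub_add_page_to_wr_bio() merges writes; as an
 * illustration with PAGE_SIZE = 4K: a bio that already holds three pages
 * starting at physical 1M covers 1M .. 1M + 12K, so the next page is only
 * appended if its physical_for_dev_replace is exactly 1M + 12K (and its
 * logical is equally contiguous); otherwise the current bio is submitted
 * and a fresh one is started.
 */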
1688
1689 static void scrub_wr_submit(struct scrub_ctx *sctx)
1690 {
1691         struct scrub_bio *sbio;
1692
1693         if (!sctx->wr_curr_bio)
1694                 return;
1695
1696         sbio = sctx->wr_curr_bio;
1697         sctx->wr_curr_bio = NULL;
1698         WARN_ON(!sbio->bio->bi_disk);
1699         scrub_pending_bio_inc(sctx);
1700         /* Process all writes in a single worker thread. Then the block layer
1701          * orders the requests before sending them to the driver, which
1702          * doubled the write performance on spinning disks when measured
1703          * with Linux 3.5. */
1704         btrfsic_submit_bio(sbio->bio);
1705 }
1706
1707 static void scrub_wr_bio_end_io(struct bio *bio)
1708 {
1709         struct scrub_bio *sbio = bio->bi_private;
1710         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1711
1712         sbio->status = bio->bi_status;
1713         sbio->bio = bio;
1714
1715         btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1716         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1717 }
1718
1719 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1720 {
1721         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1722         struct scrub_ctx *sctx = sbio->sctx;
1723         int i;
1724
1725         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1726         if (sbio->status) {
1727                 struct btrfs_dev_replace *dev_replace =
1728                         &sbio->sctx->fs_info->dev_replace;
1729
1730                 for (i = 0; i < sbio->page_count; i++) {
1731                         struct scrub_page *spage = sbio->pagev[i];
1732
1733                         spage->io_error = 1;
1734                         atomic64_inc(&dev_replace->num_write_errors);
1735                 }
1736         }
1737
1738         for (i = 0; i < sbio->page_count; i++)
1739                 scrub_page_put(sbio->pagev[i]);
1740
1741         bio_put(sbio->bio);
1742         kfree(sbio);
1743         scrub_pending_bio_dec(sctx);
1744 }
1745
1746 static int scrub_checksum(struct scrub_block *sblock)
1747 {
1748         u64 flags;
1749         int ret;
1750
1751         /*
1752          * No need to initialize these stats currently,
1753          * because this function only uses the return value
1754          * instead of these stats values.
1755          *
1756          * Todo:
1757          * always use stats
1758          */
1759         sblock->header_error = 0;
1760         sblock->generation_error = 0;
1761         sblock->checksum_error = 0;
1762
1763         WARN_ON(sblock->page_count < 1);
1764         flags = sblock->pagev[0]->flags;
1765         ret = 0;
1766         if (flags & BTRFS_EXTENT_FLAG_DATA)
1767                 ret = scrub_checksum_data(sblock);
1768         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1769                 ret = scrub_checksum_tree_block(sblock);
1770         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1771                 (void)scrub_checksum_super(sblock);
1772         else
1773                 WARN_ON(1);
1774         if (ret)
1775                 scrub_handle_errored_block(sblock);
1776
1777         return ret;
1778 }
1779
1780 static int scrub_checksum_data(struct scrub_block *sblock)
1781 {
1782         struct scrub_ctx *sctx = sblock->sctx;
1783         struct btrfs_fs_info *fs_info = sctx->fs_info;
1784         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1785         u8 csum[BTRFS_CSUM_SIZE];
1786         struct scrub_page *spage;
1787         char *kaddr;
1788
1789         BUG_ON(sblock->page_count < 1);
1790         spage = sblock->pagev[0];
1791         if (!spage->have_csum)
1792                 return 0;
1793
1794         kaddr = page_address(spage->page);
1795
1796         shash->tfm = fs_info->csum_shash;
1797         crypto_shash_init(shash);
1798
1799         /*
1800          * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
1801          * only contains one sector of data.
1802          */
1803         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1804
1805         if (memcmp(csum, spage->csum, fs_info->csum_size))
1806                 sblock->checksum_error = 1;
1807         return sblock->checksum_error;
1808 }
1809
1810 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1811 {
1812         struct scrub_ctx *sctx = sblock->sctx;
1813         struct btrfs_header *h;
1814         struct btrfs_fs_info *fs_info = sctx->fs_info;
1815         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1816         u8 calculated_csum[BTRFS_CSUM_SIZE];
1817         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1818         /*
1819          * This is done in sectorsize steps even for metadata as there's a
1820          * constraint for nodesize to be aligned to sectorsize. This will need
1821          * to change so we don't misuse data and metadata units like that.
1822          */
1823         const u32 sectorsize = sctx->fs_info->sectorsize;
1824         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1825         int i;
1826         struct scrub_page *spage;
1827         char *kaddr;
1828
1829         BUG_ON(sblock->page_count < 1);
1830
1831         /* Each member in pagev is just one block, not a full page */
1832         ASSERT(sblock->page_count == num_sectors);
1833
1834         spage = sblock->pagev[0];
1835         kaddr = page_address(spage->page);
1836         h = (struct btrfs_header *)kaddr;
1837         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1838
1839         /*
1840          * we don't use the getter functions here, as we
1841          * a) don't have an extent buffer and
1842          * b) the page is already kmapped
1843          */
1844         if (spage->logical != btrfs_stack_header_bytenr(h))
1845                 sblock->header_error = 1;
1846
1847         if (spage->generation != btrfs_stack_header_generation(h)) {
1848                 sblock->header_error = 1;
1849                 sblock->generation_error = 1;
1850         }
1851
1852         if (!scrub_check_fsid(h->fsid, spage))
1853                 sblock->header_error = 1;
1854
1855         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1856                    BTRFS_UUID_SIZE))
1857                 sblock->header_error = 1;
1858
1859         shash->tfm = fs_info->csum_shash;
1860         crypto_shash_init(shash);
1861         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1862                             sectorsize - BTRFS_CSUM_SIZE);
1863
1864         for (i = 1; i < num_sectors; i++) {
1865                 kaddr = page_address(sblock->pagev[i]->page);
1866                 crypto_shash_update(shash, kaddr, sectorsize);
1867         }
1868
1869         crypto_shash_final(shash, calculated_csum);
1870         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1871                 sblock->checksum_error = 1;
1872
1873         return sblock->header_error || sblock->checksum_error;
1874 }
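
/*
 * Illustration of the sector walk above: with a hypothetical 16K nodesize
 * and 4K sectorsize, num_sectors is 4; the first update hashes
 * sectorsize - BTRFS_CSUM_SIZE bytes of pagev[0] (skipping the stored
 * checksum) and the remaining three sectors are hashed in full 4K steps.
 */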
1875
1876 static int scrub_checksum_super(struct scrub_block *sblock)
1877 {
1878         struct btrfs_super_block *s;
1879         struct scrub_ctx *sctx = sblock->sctx;
1880         struct btrfs_fs_info *fs_info = sctx->fs_info;
1881         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1882         u8 calculated_csum[BTRFS_CSUM_SIZE];
1883         struct scrub_page *spage;
1884         char *kaddr;
1885         int fail_gen = 0;
1886         int fail_cor = 0;
1887
1888         BUG_ON(sblock->page_count < 1);
1889         spage = sblock->pagev[0];
1890         kaddr = page_address(spage->page);
1891         s = (struct btrfs_super_block *)kaddr;
1892
1893         if (spage->logical != btrfs_super_bytenr(s))
1894                 ++fail_cor;
1895
1896         if (spage->generation != btrfs_super_generation(s))
1897                 ++fail_gen;
1898
1899         if (!scrub_check_fsid(s->fsid, spage))
1900                 ++fail_cor;
1901
1902         shash->tfm = fs_info->csum_shash;
1903         crypto_shash_init(shash);
1904         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1905                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1906
1907         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1908                 ++fail_cor;
1909
1910         if (fail_cor + fail_gen) {
1911                 /*
1912                  * If we find an error in a super block, we just report it.
1913                  * Super blocks get rewritten with the next transaction
1914                  * commit anyway.
1915                  */
1916                 spin_lock(&sctx->stat_lock);
1917                 ++sctx->stat.super_errors;
1918                 spin_unlock(&sctx->stat_lock);
1919                 if (fail_cor)
1920                         btrfs_dev_stat_inc_and_print(spage->dev,
1921                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1922                 else
1923                         btrfs_dev_stat_inc_and_print(spage->dev,
1924                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1925         }
1926
1927         return fail_cor + fail_gen;
1928 }
1929
1930 static void scrub_block_get(struct scrub_block *sblock)
1931 {
1932         refcount_inc(&sblock->refs);
1933 }
1934
1935 static void scrub_block_put(struct scrub_block *sblock)
1936 {
1937         if (refcount_dec_and_test(&sblock->refs)) {
1938                 int i;
1939
1940                 if (sblock->sparity)
1941                         scrub_parity_put(sblock->sparity);
1942
1943                 for (i = 0; i < sblock->page_count; i++)
1944                         scrub_page_put(sblock->pagev[i]);
1945                 kfree(sblock);
1946         }
1947 }
1948
1949 static void scrub_page_get(struct scrub_page *spage)
1950 {
1951         atomic_inc(&spage->refs);
1952 }
1953
1954 static void scrub_page_put(struct scrub_page *spage)
1955 {
1956         if (atomic_dec_and_test(&spage->refs)) {
1957                 if (spage->page)
1958                         __free_page(spage->page);
1959                 kfree(spage);
1960         }
1961 }
1962
1963 static void scrub_submit(struct scrub_ctx *sctx)
1964 {
1965         struct scrub_bio *sbio;
1966
1967         if (sctx->curr == -1)
1968                 return;
1969
1970         sbio = sctx->bios[sctx->curr];
1971         sctx->curr = -1;
1972         scrub_pending_bio_inc(sctx);
1973         btrfsic_submit_bio(sbio->bio);
1974 }
1975
1976 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1977                                     struct scrub_page *spage)
1978 {
1979         struct scrub_block *sblock = spage->sblock;
1980         struct scrub_bio *sbio;
1981         int ret;
1982
1983 again:
1984         /*
1985          * grab a fresh bio or wait for one to become available
1986          */
1987         while (sctx->curr == -1) {
1988                 spin_lock(&sctx->list_lock);
1989                 sctx->curr = sctx->first_free;
1990                 if (sctx->curr != -1) {
1991                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
1992                         sctx->bios[sctx->curr]->next_free = -1;
1993                         sctx->bios[sctx->curr]->page_count = 0;
1994                         spin_unlock(&sctx->list_lock);
1995                 } else {
1996                         spin_unlock(&sctx->list_lock);
1997                         wait_event(sctx->list_wait, sctx->first_free != -1);
1998                 }
1999         }
2000         sbio = sctx->bios[sctx->curr];
2001         if (sbio->page_count == 0) {
2002                 struct bio *bio;
2003
2004                 sbio->physical = spage->physical;
2005                 sbio->logical = spage->logical;
2006                 sbio->dev = spage->dev;
2007                 bio = sbio->bio;
2008                 if (!bio) {
2009                         bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2010                         sbio->bio = bio;
2011                 }
2012
2013                 bio->bi_private = sbio;
2014                 bio->bi_end_io = scrub_bio_end_io;
2015                 bio_set_dev(bio, sbio->dev->bdev);
2016                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2017                 bio->bi_opf = REQ_OP_READ;
2018                 sbio->status = 0;
2019         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2020                    spage->physical ||
2021                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2022                    spage->logical ||
2023                    sbio->dev != spage->dev) {
2024                 scrub_submit(sctx);
2025                 goto again;
2026         }
2027
2028         sbio->pagev[sbio->page_count] = spage;
2029         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2030         if (ret != PAGE_SIZE) {
2031                 if (sbio->page_count < 1) {
2032                         bio_put(sbio->bio);
2033                         sbio->bio = NULL;
2034                         return -EIO;
2035                 }
2036                 scrub_submit(sctx);
2037                 goto again;
2038         }
2039
2040         scrub_block_get(sblock); /* one for the page added to the bio */
2041         atomic_inc(&sblock->outstanding_pages);
2042         sbio->page_count++;
2043         if (sbio->page_count == sctx->pages_per_rd_bio)
2044                 scrub_submit(sctx);
2045
2046         return 0;
2047 }
2048
2049 static void scrub_missing_raid56_end_io(struct bio *bio)
2050 {
2051         struct scrub_block *sblock = bio->bi_private;
2052         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2053
2054         if (bio->bi_status)
2055                 sblock->no_io_error_seen = 0;
2056
2057         bio_put(bio);
2058
2059         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2060 }
2061
2062 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2063 {
2064         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2065         struct scrub_ctx *sctx = sblock->sctx;
2066         struct btrfs_fs_info *fs_info = sctx->fs_info;
2067         u64 logical;
2068         struct btrfs_device *dev;
2069
2070         logical = sblock->pagev[0]->logical;
2071         dev = sblock->pagev[0]->dev;
2072
2073         if (sblock->no_io_error_seen)
2074                 scrub_recheck_block_checksum(sblock);
2075
2076         if (!sblock->no_io_error_seen) {
2077                 spin_lock(&sctx->stat_lock);
2078                 sctx->stat.read_errors++;
2079                 spin_unlock(&sctx->stat_lock);
2080                 btrfs_err_rl_in_rcu(fs_info,
2081                         "IO error rebuilding logical %llu for dev %s",
2082                         logical, rcu_str_deref(dev->name));
2083         } else if (sblock->header_error || sblock->checksum_error) {
2084                 spin_lock(&sctx->stat_lock);
2085                 sctx->stat.uncorrectable_errors++;
2086                 spin_unlock(&sctx->stat_lock);
2087                 btrfs_err_rl_in_rcu(fs_info,
2088                         "failed to rebuild valid logical %llu for dev %s",
2089                         logical, rcu_str_deref(dev->name));
2090         } else {
2091                 scrub_write_block_to_dev_replace(sblock);
2092         }
2093
2094         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2095                 mutex_lock(&sctx->wr_lock);
2096                 scrub_wr_submit(sctx);
2097                 mutex_unlock(&sctx->wr_lock);
2098         }
2099
2100         scrub_block_put(sblock);
2101         scrub_pending_bio_dec(sctx);
2102 }
2103
2104 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2105 {
2106         struct scrub_ctx *sctx = sblock->sctx;
2107         struct btrfs_fs_info *fs_info = sctx->fs_info;
2108         u64 length = sblock->page_count * PAGE_SIZE;
2109         u64 logical = sblock->pagev[0]->logical;
2110         struct btrfs_bio *bbio = NULL;
2111         struct bio *bio;
2112         struct btrfs_raid_bio *rbio;
2113         int ret;
2114         int i;
2115
2116         btrfs_bio_counter_inc_blocked(fs_info);
2117         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2118                         &length, &bbio);
2119         if (ret || !bbio || !bbio->raid_map)
2120                 goto bbio_out;
2121
2122         if (WARN_ON(!sctx->is_dev_replace ||
2123                     !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2124                 /*
2125                  * We shouldn't be scrubbing a missing device. Even for dev
2126                  * replace, we should only get here for RAID 5/6. We either
2127                  * managed to mount something with no mirrors remaining or
2128                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2129                  */
2130                 goto bbio_out;
2131         }
2132
2133         bio = btrfs_io_bio_alloc(0);
2134         bio->bi_iter.bi_sector = logical >> 9;
2135         bio->bi_private = sblock;
2136         bio->bi_end_io = scrub_missing_raid56_end_io;
2137
2138         rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2139         if (!rbio)
2140                 goto rbio_out;
2141
2142         for (i = 0; i < sblock->page_count; i++) {
2143                 struct scrub_page *spage = sblock->pagev[i];
2144
2145                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2146         }
2147
2148         btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2149         scrub_block_get(sblock);
2150         scrub_pending_bio_inc(sctx);
2151         raid56_submit_missing_rbio(rbio);
2152         return;
2153
2154 rbio_out:
2155         bio_put(bio);
2156 bbio_out:
2157         btrfs_bio_counter_dec(fs_info);
2158         btrfs_put_bbio(bbio);
2159         spin_lock(&sctx->stat_lock);
2160         sctx->stat.malloc_errors++;
2161         spin_unlock(&sctx->stat_lock);
2162 }
2163
2164 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
2165                        u64 physical, struct btrfs_device *dev, u64 flags,
2166                        u64 gen, int mirror_num, u8 *csum,
2167                        u64 physical_for_dev_replace)
2168 {
2169         struct scrub_block *sblock;
2170         const u32 sectorsize = sctx->fs_info->sectorsize;
2171         int index;
2172
2173         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2174         if (!sblock) {
2175                 spin_lock(&sctx->stat_lock);
2176                 sctx->stat.malloc_errors++;
2177                 spin_unlock(&sctx->stat_lock);
2178                 return -ENOMEM;
2179         }
2180
2181         /* one ref inside this function, plus one for each page added to
2182          * a bio later on */
2183         refcount_set(&sblock->refs, 1);
2184         sblock->sctx = sctx;
2185         sblock->no_io_error_seen = 1;
2186
2187         for (index = 0; len > 0; index++) {
2188                 struct scrub_page *spage;
2189                 /*
2190                  * Here we will allocate one page for one sector to scrub.
2191                  * This is fine if PAGE_SIZE == sectorsize, but will cost
2192                  * more memory in the PAGE_SIZE > sectorsize case.
2193                  */
2194                 u32 l = min(sectorsize, len);
2195
2196                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2197                 if (!spage) {
2198 leave_nomem:
2199                         spin_lock(&sctx->stat_lock);
2200                         sctx->stat.malloc_errors++;
2201                         spin_unlock(&sctx->stat_lock);
2202                         scrub_block_put(sblock);
2203                         return -ENOMEM;
2204                 }
2205                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2206                 scrub_page_get(spage);
2207                 sblock->pagev[index] = spage;
2208                 spage->sblock = sblock;
2209                 spage->dev = dev;
2210                 spage->flags = flags;
2211                 spage->generation = gen;
2212                 spage->logical = logical;
2213                 spage->physical = physical;
2214                 spage->physical_for_dev_replace = physical_for_dev_replace;
2215                 spage->mirror_num = mirror_num;
2216                 if (csum) {
2217                         spage->have_csum = 1;
2218                         memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2219                 } else {
2220                         spage->have_csum = 0;
2221                 }
2222                 sblock->page_count++;
2223                 spage->page = alloc_page(GFP_KERNEL);
2224                 if (!spage->page)
2225                         goto leave_nomem;
2226                 len -= l;
2227                 logical += l;
2228                 physical += l;
2229                 physical_for_dev_replace += l;
2230         }
2231
2232         WARN_ON(sblock->page_count == 0);
2233         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2234                 /*
2235                  * This case should only be hit for RAID 5/6 device replace. See
2236                  * the comment in scrub_missing_raid56_pages() for details.
2237                  */
2238                 scrub_missing_raid56_pages(sblock);
2239         } else {
2240                 for (index = 0; index < sblock->page_count; index++) {
2241                         struct scrub_page *spage = sblock->pagev[index];
2242                         int ret;
2243
2244                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2245                         if (ret) {
2246                                 scrub_block_put(sblock);
2247                                 return ret;
2248                         }
2249                 }
2250
2251                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2252                         scrub_submit(sctx);
2253         }
2254
2255         /* last one frees, either here or in bio completion for last page */
2256         scrub_block_put(sblock);
2257         return 0;
2258 }
2259
2260 static void scrub_bio_end_io(struct bio *bio)
2261 {
2262         struct scrub_bio *sbio = bio->bi_private;
2263         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2264
2265         sbio->status = bio->bi_status;
2266         sbio->bio = bio;
2267
2268         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2269 }
2270
2271 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2272 {
2273         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2274         struct scrub_ctx *sctx = sbio->sctx;
2275         int i;
2276
2277         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2278         if (sbio->status) {
2279                 for (i = 0; i < sbio->page_count; i++) {
2280                         struct scrub_page *spage = sbio->pagev[i];
2281
2282                         spage->io_error = 1;
2283                         spage->sblock->no_io_error_seen = 0;
2284                 }
2285         }
2286
2287         /* now complete the scrub_block items that have all pages completed */
2288         for (i = 0; i < sbio->page_count; i++) {
2289                 struct scrub_page *spage = sbio->pagev[i];
2290                 struct scrub_block *sblock = spage->sblock;
2291
2292                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2293                         scrub_block_complete(sblock);
2294                 scrub_block_put(sblock);
2295         }
2296
2297         bio_put(sbio->bio);
2298         sbio->bio = NULL;
2299         spin_lock(&sctx->list_lock);
2300         sbio->next_free = sctx->first_free;
2301         sctx->first_free = sbio->index;
2302         spin_unlock(&sctx->list_lock);
2303
2304         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2305                 mutex_lock(&sctx->wr_lock);
2306                 scrub_wr_submit(sctx);
2307                 mutex_unlock(&sctx->wr_lock);
2308         }
2309
2310         scrub_pending_bio_dec(sctx);
2311 }
2312
2313 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2314                                        unsigned long *bitmap,
2315                                        u64 start, u32 len)
2316 {
2317         u64 offset;
2318         u32 nsectors;
2319         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2320
2321         if (len >= sparity->stripe_len) {
2322                 bitmap_set(bitmap, 0, sparity->nsectors);
2323                 return;
2324         }
2325
2326         start -= sparity->logic_start;
2327         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2328         offset = offset >> sectorsize_bits;
2329         nsectors = len >> sectorsize_bits;
2330
2331         if (offset + nsectors <= sparity->nsectors) {
2332                 bitmap_set(bitmap, offset, nsectors);
2333                 return;
2334         }
2335
2336         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2337         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2338 }
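
/*
 * Wrap-around example for __scrub_mark_bitmap(), with made-up numbers:
 * for a 64K stripe_len and 4K sectors, sparity->nsectors is 16; a range
 * that starts at sector 14 of the stripe and spans 4 sectors sets bits
 * 14-15 and then wraps around to set bits 0-1.
 */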
2339
2340 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2341                                                    u64 start, u32 len)
2342 {
2343         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2344 }
2345
2346 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2347                                                   u64 start, u32 len)
2348 {
2349         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2350 }
2351
2352 static void scrub_block_complete(struct scrub_block *sblock)
2353 {
2354         int corrupted = 0;
2355
2356         if (!sblock->no_io_error_seen) {
2357                 corrupted = 1;
2358                 scrub_handle_errored_block(sblock);
2359         } else {
2360                 /*
2361                  * If the block has a checksum error, it is written via the
2362                  * repair mechanism; otherwise it is written here (both only
2363                  * in the dev replace case).
2364                  */
2365                 corrupted = scrub_checksum(sblock);
2366                 if (!corrupted && sblock->sctx->is_dev_replace)
2367                         scrub_write_block_to_dev_replace(sblock);
2368         }
2369
2370         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2371                 u64 start = sblock->pagev[0]->logical;
2372                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2373                           PAGE_SIZE;
2374
2375                 ASSERT(end - start <= U32_MAX);
2376                 scrub_parity_mark_sectors_error(sblock->sparity,
2377                                                 start, end - start);
2378         }
2379 }
2380
2381 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2382 {
2383         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2384         list_del(&sum->list);
2385         kfree(sum);
2386 }
2387
2388 /*
2389  * Find the desired csum for range [logical, logical + sectorsize), and store
2390  * the csum into @csum.
2391  *
2392  * The search source is sctx->csum_list, which is a pre-populated list
2393  * storing bytenr ordered csum ranges.  We're responsible for cleaning up any range
2394  * that is before @logical.
2395  *
2396  * Return 0 if there is no csum for the range.
2397  * Return 1 if there is a csum for the range, and it is copied to @csum.
2398  */
2399 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2400 {
2401         bool found = false;
2402
2403         while (!list_empty(&sctx->csum_list)) {
2404                 struct btrfs_ordered_sum *sum = NULL;
2405                 unsigned long index;
2406                 unsigned long num_sectors;
2407
2408                 sum = list_first_entry(&sctx->csum_list,
2409                                        struct btrfs_ordered_sum, list);
2410                 /* The current csum range is beyond our range, no csum found */
2411                 if (sum->bytenr > logical)
2412                         break;
2413
2414                 /*
2415                  * The current sum is before our bytenr. Since scrub is always
2416                  * done in bytenr order, the csum will never be used anymore;
2417                  * clean it up so that later calls won't bother with the range,
2418                  * and continue searching the next range.
2419                  */
2420                 if (sum->bytenr + sum->len <= logical) {
2421                         drop_csum_range(sctx, sum);
2422                         continue;
2423                 }
2424
2425                 /* Now the csum range covers our bytenr, copy the csum */
2426                 found = true;
2427                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2428                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2429
2430                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2431                        sctx->fs_info->csum_size);
2432
2433                 /* Clean up the range if we're at the end of the csum range */
2434                 if (index == num_sectors - 1)
2435                         drop_csum_range(sctx, sum);
2436                 break;
2437         }
2438         if (!found)
2439                 return 0;
2440         return 1;
2441 }
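
/*
 * Index math used in scrub_find_csum(), with illustrative numbers: for a
 * csum range with bytenr = 1M and len = 32K on a 4K-sector filesystem,
 * num_sectors is 8; a @logical of 1M + 20K yields index 5, so the checksum
 * is copied from sum->sums + 5 * csum_size, and the whole range is dropped
 * once the last sector (index 7) has been consumed.
 */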
2442
2443 /* scrub extent tries to collect up to 64 kB for each bio */
2444 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2445                         u64 logical, u32 len,
2446                         u64 physical, struct btrfs_device *dev, u64 flags,
2447                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2448 {
2449         int ret;
2450         u8 csum[BTRFS_CSUM_SIZE];
2451         u32 blocksize;
2452
2453         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2454                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2455                         blocksize = map->stripe_len;
2456                 else
2457                         blocksize = sctx->fs_info->sectorsize;
2458                 spin_lock(&sctx->stat_lock);
2459                 sctx->stat.data_extents_scrubbed++;
2460                 sctx->stat.data_bytes_scrubbed += len;
2461                 spin_unlock(&sctx->stat_lock);
2462         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2463                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2464                         blocksize = map->stripe_len;
2465                 else
2466                         blocksize = sctx->fs_info->nodesize;
2467                 spin_lock(&sctx->stat_lock);
2468                 sctx->stat.tree_extents_scrubbed++;
2469                 sctx->stat.tree_bytes_scrubbed += len;
2470                 spin_unlock(&sctx->stat_lock);
2471         } else {
2472                 blocksize = sctx->fs_info->sectorsize;
2473                 WARN_ON(1);
2474         }
2475
2476         while (len) {
2477                 u32 l = min(len, blocksize);
2478                 int have_csum = 0;
2479
2480                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2481                         /* push csums to sbio */
2482                         have_csum = scrub_find_csum(sctx, logical, csum);
2483                         if (have_csum == 0)
2484                                 ++sctx->stat.no_csum;
2485                 }
2486                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2487                                   mirror_num, have_csum ? csum : NULL,
2488                                   physical_for_dev_replace);
2489                 if (ret)
2490                         return ret;
2491                 len -= l;
2492                 logical += l;
2493                 physical += l;
2494                 physical_for_dev_replace += l;
2495         }
2496         return 0;
2497 }
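
/*
 * Effect of the blocksize selection above, for a hypothetical layout: a
 * 192K data extent on a RAID5 chunk with a 64K stripe_len is handed to
 * scrub_pages() in three 64K pieces, while the same extent on a RAID1
 * chunk is split into sectorsize-sized pieces, each with its own csum
 * lookup.
 */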
2498
2499 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2500                                   u64 logical, u32 len,
2501                                   u64 physical, struct btrfs_device *dev,
2502                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2503 {
2504         struct scrub_ctx *sctx = sparity->sctx;
2505         struct scrub_block *sblock;
2506         const u32 sectorsize = sctx->fs_info->sectorsize;
2507         int index;
2508
2509         ASSERT(IS_ALIGNED(len, sectorsize));
2510
2511         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2512         if (!sblock) {
2513                 spin_lock(&sctx->stat_lock);
2514                 sctx->stat.malloc_errors++;
2515                 spin_unlock(&sctx->stat_lock);
2516                 return -ENOMEM;
2517         }
2518
2519         /* one ref inside this function, plus one for each page added to
2520          * a bio later on */
2521         refcount_set(&sblock->refs, 1);
2522         sblock->sctx = sctx;
2523         sblock->no_io_error_seen = 1;
2524         sblock->sparity = sparity;
2525         scrub_parity_get(sparity);
2526
2527         for (index = 0; len > 0; index++) {
2528                 struct scrub_page *spage;
2529
2530                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2531                 if (!spage) {
2532 leave_nomem:
2533                         spin_lock(&sctx->stat_lock);
2534                         sctx->stat.malloc_errors++;
2535                         spin_unlock(&sctx->stat_lock);
2536                         scrub_block_put(sblock);
2537                         return -ENOMEM;
2538                 }
2539                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2540                 /* For scrub block */
2541                 scrub_page_get(spage);
2542                 sblock->pagev[index] = spage;
2543                 /* For scrub parity */
2544                 scrub_page_get(spage);
2545                 list_add_tail(&spage->list, &sparity->spages);
2546                 spage->sblock = sblock;
2547                 spage->dev = dev;
2548                 spage->flags = flags;
2549                 spage->generation = gen;
2550                 spage->logical = logical;
2551                 spage->physical = physical;
2552                 spage->mirror_num = mirror_num;
2553                 if (csum) {
2554                         spage->have_csum = 1;
2555                         memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2556                 } else {
2557                         spage->have_csum = 0;
2558                 }
2559                 sblock->page_count++;
2560                 spage->page = alloc_page(GFP_KERNEL);
2561                 if (!spage->page)
2562                         goto leave_nomem;
2563
2564
2565                 /* Iterate over the stripe range in sectorsize steps */
2566                 len -= sectorsize;
2567                 logical += sectorsize;
2568                 physical += sectorsize;
2569         }
2570
2571         WARN_ON(sblock->page_count == 0);
2572         for (index = 0; index < sblock->page_count; index++) {
2573                 struct scrub_page *spage = sblock->pagev[index];
2574                 int ret;
2575
2576                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2577                 if (ret) {
2578                         scrub_block_put(sblock);
2579                         return ret;
2580                 }
2581         }
2582
2583         /* last one frees, either here or in bio completion for last page */
2584         scrub_block_put(sblock);
2585         return 0;
2586 }
2587
2588 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2589                                    u64 logical, u32 len,
2590                                    u64 physical, struct btrfs_device *dev,
2591                                    u64 flags, u64 gen, int mirror_num)
2592 {
2593         struct scrub_ctx *sctx = sparity->sctx;
2594         int ret;
2595         u8 csum[BTRFS_CSUM_SIZE];
2596         u32 blocksize;
2597
2598         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2599                 scrub_parity_mark_sectors_error(sparity, logical, len);
2600                 return 0;
2601         }
2602
2603         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2604                 blocksize = sparity->stripe_len;
2605         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2606                 blocksize = sparity->stripe_len;
2607         } else {
2608                 blocksize = sctx->fs_info->sectorsize;
2609                 WARN_ON(1);
2610         }
2611
2612         while (len) {
2613                 u32 l = min(len, blocksize);
2614                 int have_csum = 0;
2615
2616                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2617                         /* push csums to sbio */
2618                         have_csum = scrub_find_csum(sctx, logical, csum);
2619                         if (have_csum == 0)
2620                                 goto skip;
2621                 }
2622                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2623                                              flags, gen, mirror_num,
2624                                              have_csum ? csum : NULL);
2625                 if (ret)
2626                         return ret;
2627 skip:
2628                 len -= l;
2629                 logical += l;
2630                 physical += l;
2631         }
2632         return 0;
2633 }
2634
2635 /*
2636  * Given a physical address, this will calculate its
2637  * logical offset. If this is a parity stripe, it will return
2638  * the leftmost data stripe's logical offset.
2639  *
2640  * return 0 if it is a data stripe, 1 means parity stripe.
2641  */
2642 static int get_raid56_logic_offset(u64 physical, int num,
2643                                    struct map_lookup *map, u64 *offset,
2644                                    u64 *stripe_start)
2645 {
2646         int i;
2647         int j = 0;
2648         u64 stripe_nr;
2649         u64 last_offset;
2650         u32 stripe_index;
2651         u32 rot;
2652         const int data_stripes = nr_data_stripes(map);
2653
2654         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2655         if (stripe_start)
2656                 *stripe_start = last_offset;
2657
2658         *offset = last_offset;
2659         for (i = 0; i < data_stripes; i++) {
2660                 *offset = last_offset + i * map->stripe_len;
2661
2662                 stripe_nr = div64_u64(*offset, map->stripe_len);
2663                 stripe_nr = div_u64(stripe_nr, data_stripes);
2664
2665                 /* Work out the disk rotation on this stripe-set */
2666                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2667                 /* Calculate which stripe this data is located on */
2668                 rot += i;
2669                 stripe_index = rot % map->num_stripes;
2670                 if (stripe_index == num)
2671                         return 0;
2672                 if (stripe_index < num)
2673                         j++;
2674         }
2675         *offset = last_offset + j * map->stripe_len;
2676         return 1;
2677 }
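
/*
 * Worked example, assuming a 3-device RAID5 chunk with a 64K stripe_len
 * (so data_stripes = 2): scrubbing num = 0 at a physical offset of 64K
 * into that device's stripe gives last_offset = 128K.  For i = 0 the full
 * stripe number is 1, rot = 1 and stripe_index = 1; for i = 1,
 * stripe_index = 2.  Neither matches num, so that 64K block is parity and
 * the function returns 1 with *offset = 128K, the logical start (relative
 * to the chunk) of full stripe 1.
 */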
2678
2679 static void scrub_free_parity(struct scrub_parity *sparity)
2680 {
2681         struct scrub_ctx *sctx = sparity->sctx;
2682         struct scrub_page *curr, *next;
2683         int nbits;
2684
2685         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2686         if (nbits) {
2687                 spin_lock(&sctx->stat_lock);
2688                 sctx->stat.read_errors += nbits;
2689                 sctx->stat.uncorrectable_errors += nbits;
2690                 spin_unlock(&sctx->stat_lock);
2691         }
2692
2693         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2694                 list_del_init(&curr->list);
2695                 scrub_page_put(curr);
2696         }
2697
2698         kfree(sparity);
2699 }
2700
2701 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2702 {
2703         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2704                                                     work);
2705         struct scrub_ctx *sctx = sparity->sctx;
2706
2707         scrub_free_parity(sparity);
2708         scrub_pending_bio_dec(sctx);
2709 }
2710
2711 static void scrub_parity_bio_endio(struct bio *bio)
2712 {
2713         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2714         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2715
2716         if (bio->bi_status)
2717                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2718                           sparity->nsectors);
2719
2720         bio_put(bio);
2721
2722         btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2723                         NULL);
2724         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2725 }
2726
2727 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2728 {
2729         struct scrub_ctx *sctx = sparity->sctx;
2730         struct btrfs_fs_info *fs_info = sctx->fs_info;
2731         struct bio *bio;
2732         struct btrfs_raid_bio *rbio;
2733         struct btrfs_bio *bbio = NULL;
2734         u64 length;
2735         int ret;
2736
2737         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2738                            sparity->nsectors))
2739                 goto out;
2740
2741         length = sparity->logic_end - sparity->logic_start;
2742
2743         btrfs_bio_counter_inc_blocked(fs_info);
2744         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2745                                &length, &bbio);
2746         if (ret || !bbio || !bbio->raid_map)
2747                 goto bbio_out;
2748
2749         bio = btrfs_io_bio_alloc(0);
2750         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2751         bio->bi_private = sparity;
2752         bio->bi_end_io = scrub_parity_bio_endio;
2753
2754         rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
2755                                               length, sparity->scrub_dev,
2756                                               sparity->dbitmap,
2757                                               sparity->nsectors);
2758         if (!rbio)
2759                 goto rbio_out;
2760
2761         scrub_pending_bio_inc(sctx);
2762         raid56_parity_submit_scrub_rbio(rbio);
2763         return;
2764
2765 rbio_out:
2766         bio_put(bio);
2767 bbio_out:
2768         btrfs_bio_counter_dec(fs_info);
2769         btrfs_put_bbio(bbio);
2770         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2771                   sparity->nsectors);
2772         spin_lock(&sctx->stat_lock);
2773         sctx->stat.malloc_errors++;
2774         spin_unlock(&sctx->stat_lock);
2775 out:
2776         scrub_free_parity(sparity);
2777 }
2778
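/*
 * Length in bytes of one parity scrub bitmap covering @nsectors sectors,
 * rounded up to whole longs.  E.g. with a 64K stripe_len and 4K sectors,
 * nsectors is 16 and the bitmap fits into a single long (8 bytes on a
 * 64-bit machine).
 */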
2779 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2780 {
2781         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2782 }
2783
2784 static void scrub_parity_get(struct scrub_parity *sparity)
2785 {
2786         refcount_inc(&sparity->refs);
2787 }
2788
2789 static void scrub_parity_put(struct scrub_parity *sparity)
2790 {
2791         if (!refcount_dec_and_test(&sparity->refs))
2792                 return;
2793
2794         scrub_parity_check_and_repair(sparity);
2795 }
2796
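/*
 * Scrub the data stripes covered by the logical range [logic_start,
 * logic_end) of a RAID5/6 full stripe.  The extent tree is walked for
 * extents in that range, their sectors are marked in the data bitmap and
 * read via scrub_extent_for_parity().  Once the last reference to the
 * scrub_parity context is dropped, the parity itself is checked and
 * repaired by scrub_parity_check_and_repair().
 */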
2797 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2798                                                   struct map_lookup *map,
2799                                                   struct btrfs_device *sdev,
2800                                                   struct btrfs_path *path,
2801                                                   u64 logic_start,
2802                                                   u64 logic_end)
2803 {
2804         struct btrfs_fs_info *fs_info = sctx->fs_info;
2805         struct btrfs_root *root = fs_info->extent_root;
2806         struct btrfs_root *csum_root = fs_info->csum_root;
2807         struct btrfs_extent_item *extent;
2808         struct btrfs_bio *bbio = NULL;
2809         u64 flags;
2810         int ret;
2811         int slot;
2812         struct extent_buffer *l;
2813         struct btrfs_key key;
2814         u64 generation;
2815         u64 extent_logical;
2816         u64 extent_physical;
2817         /* Check the comment in scrub_stripe() for why u32 is enough here */
2818         u32 extent_len;
2819         u64 mapped_length;
2820         struct btrfs_device *extent_dev;
2821         struct scrub_parity *sparity;
2822         int nsectors;
2823         int bitmap_len;
2824         int extent_mirror_num;
2825         int stop_loop = 0;
2826
2827         ASSERT(map->stripe_len <= U32_MAX);
2828         nsectors = map->stripe_len >> fs_info->sectorsize_bits;
2829         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2830         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2831                           GFP_NOFS);
2832         if (!sparity) {
2833                 spin_lock(&sctx->stat_lock);
2834                 sctx->stat.malloc_errors++;
2835                 spin_unlock(&sctx->stat_lock);
2836                 return -ENOMEM;
2837         }
2838
2839         ASSERT(map->stripe_len <= U32_MAX);
2840         sparity->stripe_len = map->stripe_len;
2841         sparity->nsectors = nsectors;
2842         sparity->sctx = sctx;
2843         sparity->scrub_dev = sdev;
2844         sparity->logic_start = logic_start;
2845         sparity->logic_end = logic_end;
2846         refcount_set(&sparity->refs, 1);
2847         INIT_LIST_HEAD(&sparity->spages);
2848         sparity->dbitmap = sparity->bitmap;
2849         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2850
2851         ret = 0;
2852         while (logic_start < logic_end) {
2853                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2854                         key.type = BTRFS_METADATA_ITEM_KEY;
2855                 else
2856                         key.type = BTRFS_EXTENT_ITEM_KEY;
2857                 key.objectid = logic_start;
2858                 key.offset = (u64)-1;
2859
2860                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2861                 if (ret < 0)
2862                         goto out;
2863
2864                 if (ret > 0) {
2865                         ret = btrfs_previous_extent_item(root, path, 0);
2866                         if (ret < 0)
2867                                 goto out;
2868                         if (ret > 0) {
2869                                 btrfs_release_path(path);
2870                                 ret = btrfs_search_slot(NULL, root, &key,
2871                                                         path, 0, 0);
2872                                 if (ret < 0)
2873                                         goto out;
2874                         }
2875                 }
2876
2877                 stop_loop = 0;
2878                 while (1) {
2879                         u64 bytes;
2880
2881                         l = path->nodes[0];
2882                         slot = path->slots[0];
2883                         if (slot >= btrfs_header_nritems(l)) {
2884                                 ret = btrfs_next_leaf(root, path);
2885                                 if (ret == 0)
2886                                         continue;
2887                                 if (ret < 0)
2888                                         goto out;
2889
2890                                 stop_loop = 1;
2891                                 break;
2892                         }
2893                         btrfs_item_key_to_cpu(l, &key, slot);
2894
2895                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2896                             key.type != BTRFS_METADATA_ITEM_KEY)
2897                                 goto next;
2898
2899                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2900                                 bytes = fs_info->nodesize;
2901                         else
2902                                 bytes = key.offset;
2903
2904                         if (key.objectid + bytes <= logic_start)
2905                                 goto next;
2906
2907                         if (key.objectid >= logic_end) {
2908                                 stop_loop = 1;
2909                                 break;
2910                         }
2911
2912                         while (key.objectid >= logic_start + map->stripe_len)
2913                                 logic_start += map->stripe_len;
2914
2915                         extent = btrfs_item_ptr(l, slot,
2916                                                 struct btrfs_extent_item);
2917                         flags = btrfs_extent_flags(l, extent);
2918                         generation = btrfs_extent_generation(l, extent);
2919
2920                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
2921                             (key.objectid < logic_start ||
2922                              key.objectid + bytes >
2923                              logic_start + map->stripe_len)) {
2924                                 btrfs_err(fs_info,
2925                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2926                                           key.objectid, logic_start);
2927                                 spin_lock(&sctx->stat_lock);
2928                                 sctx->stat.uncorrectable_errors++;
2929                                 spin_unlock(&sctx->stat_lock);
2930                                 goto next;
2931                         }
2932 again:
2933                         extent_logical = key.objectid;
2934                         ASSERT(bytes <= U32_MAX);
2935                         extent_len = bytes;
2936
2937                         if (extent_logical < logic_start) {
2938                                 extent_len -= logic_start - extent_logical;
2939                                 extent_logical = logic_start;
2940                         }
2941
2942                         if (extent_logical + extent_len >
2943                             logic_start + map->stripe_len)
2944                                 extent_len = logic_start + map->stripe_len -
2945                                              extent_logical;
2946
2947                         scrub_parity_mark_sectors_data(sparity, extent_logical,
2948                                                        extent_len);
2949
2950                         mapped_length = extent_len;
2951                         bbio = NULL;
2952                         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
2953                                         extent_logical, &mapped_length, &bbio,
2954                                         0);
2955                         if (!ret) {
2956                                 if (!bbio || mapped_length < extent_len)
2957                                         ret = -EIO;
2958                         }
2959                         if (ret) {
2960                                 btrfs_put_bbio(bbio);
2961                                 goto out;
2962                         }
2963                         extent_physical = bbio->stripes[0].physical;
2964                         extent_mirror_num = bbio->mirror_num;
2965                         extent_dev = bbio->stripes[0].dev;
2966                         btrfs_put_bbio(bbio);
2967
2968                         ret = btrfs_lookup_csums_range(csum_root,
2969                                                 extent_logical,
2970                                                 extent_logical + extent_len - 1,
2971                                                 &sctx->csum_list, 1);
2972                         if (ret)
2973                                 goto out;
2974
2975                         ret = scrub_extent_for_parity(sparity, extent_logical,
2976                                                       extent_len,
2977                                                       extent_physical,
2978                                                       extent_dev, flags,
2979                                                       generation,
2980                                                       extent_mirror_num);
2981
2982                         scrub_free_csums(sctx);
2983
2984                         if (ret)
2985                                 goto out;
2986
2987                         if (extent_logical + extent_len <
2988                             key.objectid + bytes) {
2989                                 logic_start += map->stripe_len;
2990
2991                                 if (logic_start >= logic_end) {
2992                                         stop_loop = 1;
2993                                         break;
2994                                 }
2995
2996                                 if (logic_start < key.objectid + bytes) {
2997                                         cond_resched();
2998                                         goto again;
2999                                 }
3000                         }
3001 next:
3002                         path->slots[0]++;
3003                 }
3004
3005                 btrfs_release_path(path);
3006
3007                 if (stop_loop)
3008                         break;
3009
3010                 logic_start += map->stripe_len;
3011         }
3012 out:
3013         if (ret < 0) {
3014                 ASSERT(logic_end - logic_start <= U32_MAX);
3015                 scrub_parity_mark_sectors_error(sparity, logic_start,
3016                                                 logic_end - logic_start);
3017         }
3018         scrub_parity_put(sparity);
3019         scrub_submit(sctx);
3020         mutex_lock(&sctx->wr_lock);
3021         scrub_wr_submit(sctx);
3022         mutex_unlock(&sctx->wr_lock);
3023
3024         btrfs_release_path(path);
3025         return ret < 0 ? ret : 0;
3026 }
3027
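/*
 * Scrub the part of a chunk that is stored on @scrub_dev as device
 * stripe @num: walk the extent tree in the commit root stripe by stripe,
 * read and verify every extent found, and hand RAID5/6 parity stripes
 * off to scrub_raid56_parity().
 */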
3028 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3029                                            struct map_lookup *map,
3030                                            struct btrfs_device *scrub_dev,
3031                                            int num, u64 base, u64 length,
3032                                            struct btrfs_block_group *cache)
3033 {
3034         struct btrfs_path *path, *ppath;
3035         struct btrfs_fs_info *fs_info = sctx->fs_info;
3036         struct btrfs_root *root = fs_info->extent_root;
3037         struct btrfs_root *csum_root = fs_info->csum_root;
3038         struct btrfs_extent_item *extent;
3039         struct blk_plug plug;
3040         u64 flags;
3041         int ret;
3042         int slot;
3043         u64 nstripes;
3044         struct extent_buffer *l;
3045         u64 physical;
3046         u64 logical;
3047         u64 logic_end;
3048         u64 physical_end;
3049         u64 generation;
3050         int mirror_num;
3051         struct reada_control *reada1;
3052         struct reada_control *reada2;
3053         struct btrfs_key key;
3054         struct btrfs_key key_end;
3055         u64 increment = map->stripe_len;
3056         u64 offset;
3057         u64 extent_logical;
3058         u64 extent_physical;
3059         /*
3060          * Unlike chunk length, extent length should never go beyond
3061          * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
3062          */
3063         u32 extent_len;
3064         u64 stripe_logical;
3065         u64 stripe_end;
3066         struct btrfs_device *extent_dev;
3067         int extent_mirror_num;
3068         int stop_loop = 0;
3069
3070         physical = map->stripes[num].physical;
3071         offset = 0;
3072         nstripes = div64_u64(length, map->stripe_len);
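        /*
         * Work out, per RAID profile, at which logical offset inside the
         * chunk this device stripe starts, by how much the logical address
         * advances from one device stripe to the next, and which mirror
         * number this copy represents.
         */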
3073         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3074                 offset = map->stripe_len * num;
3075                 increment = map->stripe_len * map->num_stripes;
3076                 mirror_num = 1;
3077         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3078                 int factor = map->num_stripes / map->sub_stripes;
3079                 offset = map->stripe_len * (num / map->sub_stripes);
3080                 increment = map->stripe_len * factor;
3081                 mirror_num = num % map->sub_stripes + 1;
3082         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3083                 increment = map->stripe_len;
3084                 mirror_num = num % map->num_stripes + 1;
3085         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3086                 increment = map->stripe_len;
3087                 mirror_num = num % map->num_stripes + 1;
3088         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3089                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3090                 increment = map->stripe_len * nr_data_stripes(map);
3091                 mirror_num = 1;
3092         } else {
3093                 increment = map->stripe_len;
3094                 mirror_num = 1;
3095         }
3096
3097         path = btrfs_alloc_path();
3098         if (!path)
3099                 return -ENOMEM;
3100
3101         ppath = btrfs_alloc_path();
3102         if (!ppath) {
3103                 btrfs_free_path(path);
3104                 return -ENOMEM;
3105         }
3106
3107         /*
3108          * Work on the commit root. The related disk blocks are static as
3109          * long as COW is applied. This means it is safe to rewrite
3110          * them to repair disk errors without any race conditions.
3111          */
3112         path->search_commit_root = 1;
3113         path->skip_locking = 1;
3114
3115         ppath->search_commit_root = 1;
3116         ppath->skip_locking = 1;
3117         /*
3118          * Trigger the readahead for the extent tree and csum tree and wait
3119          * for completion. During readahead the scrub is officially paused
3120          * so that it does not hold off transaction commits.
3121          */
3122         logical = base + offset;
3123         physical_end = physical + nstripes * map->stripe_len;
3124         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3125                 get_raid56_logic_offset(physical_end, num,
3126                                         map, &logic_end, NULL);
3127                 logic_end += base;
3128         } else {
3129                 logic_end = logical + increment * nstripes;
3130         }
3131         wait_event(sctx->list_wait,
3132                    atomic_read(&sctx->bios_in_flight) == 0);
3133         scrub_blocked_if_needed(fs_info);
3134
3135         /* FIXME it might be better to start readahead at commit root */
3136         key.objectid = logical;
3137         key.type = BTRFS_EXTENT_ITEM_KEY;
3138         key.offset = (u64)0;
3139         key_end.objectid = logic_end;
3140         key_end.type = BTRFS_METADATA_ITEM_KEY;
3141         key_end.offset = (u64)-1;
3142         reada1 = btrfs_reada_add(root, &key, &key_end);
3143
3144         if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3145                 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3146                 key.type = BTRFS_EXTENT_CSUM_KEY;
3147                 key.offset = logical;
3148                 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3149                 key_end.type = BTRFS_EXTENT_CSUM_KEY;
3150                 key_end.offset = logic_end;
3151                 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3152         } else {
3153                 reada2 = NULL;
3154         }
3155
3156         if (!IS_ERR(reada1))
3157                 btrfs_reada_wait(reada1);
3158         if (!IS_ERR_OR_NULL(reada2))
3159                 btrfs_reada_wait(reada2);
3160
3161
3162         /*
3163          * Collect all data csums for the stripe to avoid seeking during
3164          * the scrub. This might currently (crc32) end up being about 1MB.
3165          */
3166         blk_start_plug(&plug);
3167
3168         /*
3169          * now find all extents for each stripe and scrub them
3170          */
3171         ret = 0;
3172         while (physical < physical_end) {
3173                 /*
3174                  * canceled?
3175                  */
3176                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3177                     atomic_read(&sctx->cancel_req)) {
3178                         ret = -ECANCELED;
3179                         goto out;
3180                 }
3181                 /*
3182                  * check to see if we have to pause
3183                  */
3184                 if (atomic_read(&fs_info->scrub_pause_req)) {
3185                         /* push queued extents */
3186                         sctx->flush_all_writes = true;
3187                         scrub_submit(sctx);
3188                         mutex_lock(&sctx->wr_lock);
3189                         scrub_wr_submit(sctx);
3190                         mutex_unlock(&sctx->wr_lock);
3191                         wait_event(sctx->list_wait,
3192                                    atomic_read(&sctx->bios_in_flight) == 0);
3193                         sctx->flush_all_writes = false;
3194                         scrub_blocked_if_needed(fs_info);
3195                 }
3196
3197                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3198                         ret = get_raid56_logic_offset(physical, num, map,
3199                                                       &logical,
3200                                                       &stripe_logical);
3201                         logical += base;
3202                         if (ret) {
3203                                 /* It is a parity stripe */
3204                                 stripe_logical += base;
3205                                 stripe_end = stripe_logical + increment;
3206                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3207                                                           ppath, stripe_logical,
3208                                                           stripe_end);
3209                                 if (ret)
3210                                         goto out;
3211                                 goto skip;
3212                         }
3213                 }
3214
3215                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3216                         key.type = BTRFS_METADATA_ITEM_KEY;
3217                 else
3218                         key.type = BTRFS_EXTENT_ITEM_KEY;
3219                 key.objectid = logical;
3220                 key.offset = (u64)-1;
3221
3222                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3223                 if (ret < 0)
3224                         goto out;
3225
3226                 if (ret > 0) {
3227                         ret = btrfs_previous_extent_item(root, path, 0);
3228                         if (ret < 0)
3229                                 goto out;
3230                         if (ret > 0) {
3231                                 /* there's no smaller item, so stick with the
3232                                  * larger one */
3233                                 btrfs_release_path(path);
3234                                 ret = btrfs_search_slot(NULL, root, &key,
3235                                                         path, 0, 0);
3236                                 if (ret < 0)
3237                                         goto out;
3238                         }
3239                 }
3240
3241                 stop_loop = 0;
3242                 while (1) {
3243                         u64 bytes;
3244
3245                         l = path->nodes[0];
3246                         slot = path->slots[0];
3247                         if (slot >= btrfs_header_nritems(l)) {
3248                                 ret = btrfs_next_leaf(root, path);
3249                                 if (ret == 0)
3250                                         continue;
3251                                 if (ret < 0)
3252                                         goto out;
3253
3254                                 stop_loop = 1;
3255                                 break;
3256                         }
3257                         btrfs_item_key_to_cpu(l, &key, slot);
3258
3259                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3260                             key.type != BTRFS_METADATA_ITEM_KEY)
3261                                 goto next;
3262
3263                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3264                                 bytes = fs_info->nodesize;
3265                         else
3266                                 bytes = key.offset;
3267
3268                         if (key.objectid + bytes <= logical)
3269                                 goto next;
3270
3271                         if (key.objectid >= logical + map->stripe_len) {
3272                                 /* out of this device extent */
3273                                 if (key.objectid >= logic_end)
3274                                         stop_loop = 1;
3275                                 break;
3276                         }
3277
3278                         /*
3279                          * If our block group was removed in the meanwhile, just
3280                          * stop scrubbing since there is no point in continuing.
3281                          * Continuing would prevent reusing its device extents
3282                          * for new block groups for a long time.
3283                          */
3284                         spin_lock(&cache->lock);
3285                         if (cache->removed) {
3286                                 spin_unlock(&cache->lock);
3287                                 ret = 0;
3288                                 goto out;
3289                         }
3290                         spin_unlock(&cache->lock);
3291
3292                         extent = btrfs_item_ptr(l, slot,
3293                                                 struct btrfs_extent_item);
3294                         flags = btrfs_extent_flags(l, extent);
3295                         generation = btrfs_extent_generation(l, extent);
3296
3297                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3298                             (key.objectid < logical ||
3299                              key.objectid + bytes >
3300                              logical + map->stripe_len)) {
3301                                 btrfs_err(fs_info,
3302                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3303                                        key.objectid, logical);
3304                                 spin_lock(&sctx->stat_lock);
3305                                 sctx->stat.uncorrectable_errors++;
3306                                 spin_unlock(&sctx->stat_lock);
3307                                 goto next;
3308                         }
3309
3310 again:
3311                         extent_logical = key.objectid;
3312                         ASSERT(bytes <= U32_MAX);
3313                         extent_len = bytes;
3314
3315                         /*
3316                          * trim extent to this stripe
3317                          */
3318                         if (extent_logical < logical) {
3319                                 extent_len -= logical - extent_logical;
3320                                 extent_logical = logical;
3321                         }
3322                         if (extent_logical + extent_len >
3323                             logical + map->stripe_len) {
3324                                 extent_len = logical + map->stripe_len -
3325                                              extent_logical;
3326                         }
3327
3328                         extent_physical = extent_logical - logical + physical;
3329                         extent_dev = scrub_dev;
3330                         extent_mirror_num = mirror_num;
3331                         if (sctx->is_dev_replace)
3332                                 scrub_remap_extent(fs_info, extent_logical,
3333                                                    extent_len, &extent_physical,
3334                                                    &extent_dev,
3335                                                    &extent_mirror_num);
3336
3337                         if (flags & BTRFS_EXTENT_FLAG_DATA) {
3338                                 ret = btrfs_lookup_csums_range(csum_root,
3339                                                 extent_logical,
3340                                                 extent_logical + extent_len - 1,
3341                                                 &sctx->csum_list, 1);
3342                                 if (ret)
3343                                         goto out;
3344                         }
3345
3346                         ret = scrub_extent(sctx, map, extent_logical, extent_len,
3347                                            extent_physical, extent_dev, flags,
3348                                            generation, extent_mirror_num,
3349                                            extent_logical - logical + physical);
3350
3351                         scrub_free_csums(sctx);
3352
3353                         if (ret)
3354                                 goto out;
3355
3356                         if (extent_logical + extent_len <
3357                             key.objectid + bytes) {
3358                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3359                                         /*
3360                                          * loop until we find next data stripe
3361                                          * or we have finished all stripes.
3362                                          */
3363 loop:
3364                                         physical += map->stripe_len;
3365                                         ret = get_raid56_logic_offset(physical,
3366                                                         num, map, &logical,
3367                                                         &stripe_logical);
3368                                         logical += base;
3369
3370                                         if (ret && physical < physical_end) {
3371                                                 stripe_logical += base;
3372                                                 stripe_end = stripe_logical +
3373                                                                 increment;
3374                                                 ret = scrub_raid56_parity(sctx,
3375                                                         map, scrub_dev, ppath,
3376                                                         stripe_logical,
3377                                                         stripe_end);
3378                                                 if (ret)
3379                                                         goto out;
3380                                                 goto loop;
3381                                         }
3382                                 } else {
3383                                         physical += map->stripe_len;
3384                                         logical += increment;
3385                                 }
3386                                 if (logical < key.objectid + bytes) {
3387                                         cond_resched();
3388                                         goto again;
3389                                 }
3390
3391                                 if (physical >= physical_end) {
3392                                         stop_loop = 1;
3393                                         break;
3394                                 }
3395                         }
3396 next:
3397                         path->slots[0]++;
3398                 }
3399                 btrfs_release_path(path);
3400 skip:
3401                 logical += increment;
3402                 physical += map->stripe_len;
3403                 spin_lock(&sctx->stat_lock);
3404                 if (stop_loop)
3405                         sctx->stat.last_physical = map->stripes[num].physical +
3406                                                    length;
3407                 else
3408                         sctx->stat.last_physical = physical;
3409                 spin_unlock(&sctx->stat_lock);
3410                 if (stop_loop)
3411                         break;
3412         }
3413 out:
3414         /* push queued extents */
3415         scrub_submit(sctx);
3416         mutex_lock(&sctx->wr_lock);
3417         scrub_wr_submit(sctx);
3418         mutex_unlock(&sctx->wr_lock);
3419
3420         blk_finish_plug(&plug);
3421         btrfs_free_path(path);
3422         btrfs_free_path(ppath);
3423         return ret < 0 ? ret : 0;
3424 }
3425
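/*
 * Scrub all stripes of the chunk at @chunk_offset that are stored on
 * @scrub_dev at @dev_offset.  The chunk mapping is taken from the
 * mapping tree; a missing mapping is only an error if the block group
 * still exists (i.e. it was not removed underneath us).
 */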
3426 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3427                                           struct btrfs_device *scrub_dev,
3428                                           u64 chunk_offset, u64 length,
3429                                           u64 dev_offset,
3430                                           struct btrfs_block_group *cache)
3431 {
3432         struct btrfs_fs_info *fs_info = sctx->fs_info;
3433         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3434         struct map_lookup *map;
3435         struct extent_map *em;
3436         int i;
3437         int ret = 0;
3438
3439         read_lock(&map_tree->lock);
3440         em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3441         read_unlock(&map_tree->lock);
3442
3443         if (!em) {
3444                 /*
3445                  * Might have been an unused block group deleted by the cleaner
3446                  * kthread or relocation.
3447                  */
3448                 spin_lock(&cache->lock);
3449                 if (!cache->removed)
3450                         ret = -EINVAL;
3451                 spin_unlock(&cache->lock);
3452
3453                 return ret;
3454         }
3455
3456         map = em->map_lookup;
3457         if (em->start != chunk_offset)
3458                 goto out;
3459
3460         if (em->len < length)
3461                 goto out;
3462
3463         for (i = 0; i < map->num_stripes; ++i) {
3464                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3465                     map->stripes[i].physical == dev_offset) {
3466                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3467                                            chunk_offset, length, cache);
3468                         if (ret)
3469                                 goto out;
3470                 }
3471         }
3472 out:
3473         free_extent_map(em);
3474
3475         return ret;
3476 }
3477
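/*
 * Walk the device tree for all dev extents of @scrub_dev inside
 * [start, end) and scrub the corresponding block groups one by one.
 * Each block group is frozen and, when possible, set read-only for the
 * duration of the scrub to avoid races with allocation, relocation and
 * (for dev-replace) nocow writes.
 */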
3478 static noinline_for_stack
3479 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3480                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3481 {
3482         struct btrfs_dev_extent *dev_extent = NULL;
3483         struct btrfs_path *path;
3484         struct btrfs_fs_info *fs_info = sctx->fs_info;
3485         struct btrfs_root *root = fs_info->dev_root;
3486         u64 length;
3487         u64 chunk_offset;
3488         int ret = 0;
3489         int ro_set;
3490         int slot;
3491         struct extent_buffer *l;
3492         struct btrfs_key key;
3493         struct btrfs_key found_key;
3494         struct btrfs_block_group *cache;
3495         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3496
3497         path = btrfs_alloc_path();
3498         if (!path)
3499                 return -ENOMEM;
3500
3501         path->reada = READA_FORWARD;
3502         path->search_commit_root = 1;
3503         path->skip_locking = 1;
3504
3505         key.objectid = scrub_dev->devid;
3506         key.offset = 0ull;
3507         key.type = BTRFS_DEV_EXTENT_KEY;
3508
3509         while (1) {
3510                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3511                 if (ret < 0)
3512                         break;
3513                 if (ret > 0) {
3514                         if (path->slots[0] >=
3515                             btrfs_header_nritems(path->nodes[0])) {
3516                                 ret = btrfs_next_leaf(root, path);
3517                                 if (ret < 0)
3518                                         break;
3519                                 if (ret > 0) {
3520                                         ret = 0;
3521                                         break;
3522                                 }
3523                         } else {
3524                                 ret = 0;
3525                         }
3526                 }
3527
3528                 l = path->nodes[0];
3529                 slot = path->slots[0];
3530
3531                 btrfs_item_key_to_cpu(l, &found_key, slot);
3532
3533                 if (found_key.objectid != scrub_dev->devid)
3534                         break;
3535
3536                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3537                         break;
3538
3539                 if (found_key.offset >= end)
3540                         break;
3541
3542                 if (found_key.offset < key.offset)
3543                         break;
3544
3545                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3546                 length = btrfs_dev_extent_length(l, dev_extent);
3547
3548                 if (found_key.offset + length <= start)
3549                         goto skip;
3550
3551                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3552
3553                 /*
3554                  * get a reference on the corresponding block group to prevent
3555                  * the chunk from going away while we scrub it
3556                  */
3557                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3558
3559                 /* some chunks are removed but not committed to disk yet,
3560                  * continue scrubbing */
3561                 if (!cache)
3562                         goto skip;
3563
3564                 /*
3565                  * Make sure that while we are scrubbing the corresponding block
3566                  * group doesn't get its logical address and its device extents
3567                  * reused for another block group, which can possibly be of a
3568                  * different type and different profile. We do this to prevent
3569                  * false error detections and crashes due to bogus attempts to
3570                  * repair extents.
3571                  */
3572                 spin_lock(&cache->lock);
3573                 if (cache->removed) {
3574                         spin_unlock(&cache->lock);
3575                         btrfs_put_block_group(cache);
3576                         goto skip;
3577                 }
3578                 btrfs_freeze_block_group(cache);
3579                 spin_unlock(&cache->lock);
3580
3581                 /*
3582                  * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
3583                  * to avoid deadlock caused by:
3584                  * btrfs_inc_block_group_ro()
3585                  * -> btrfs_wait_for_commit()
3586                  * -> btrfs_commit_transaction()
3587                  * -> btrfs_scrub_pause()
3588                  */
3589                 scrub_pause_on(fs_info);
3590
3591                 /*
3592                  * Don't do chunk preallocation for scrub.
3593                  *
3594                  * This is especially important for SYSTEM bgs, or we can hit
3595                  * -EFBIG from btrfs_finish_chunk_alloc() like:
3596                  * 1. The only SYSTEM bg is marked RO.
3597                  *    Since SYSTEM bg is small, that's pretty common.
3598                  * 2. New SYSTEM bg will be allocated
3599                  * 2. A new SYSTEM bg will be allocated,
3600                  *    because the regular version would allocate a new chunk.
3601                  *    Before cleanup really happens, it's marked RO again.
3602                  * 4. Empty SYSTEM bg get scrubbed
3603                  * 4. The empty SYSTEM bg gets scrubbed
3604                  *
3605                  * This can easily inflate the number of SYSTEM chunks if the
3606                  * cleaner thread can't be triggered fast enough, and use up all
3607                  * the space of btrfs_super_block::sys_chunk_array.
3608                  *
3609                  * While for dev replace, we need to try our best to mark block
3610                  * group RO, to prevent race between:
3611                  * - Write duplication
3612                  *   Contains latest data
3613                  * - Scrub copy
3614                  *   Contains data from commit tree
3615                  *
3616                  * If target block group is not marked RO, nocow writes can
3617                  * be overwritten by scrub copy, causing data corruption.
3618                  * So for dev-replace, it's not allowed to continue if a block
3619                  * group is not RO.
3620                  */
3621                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3622                 if (ret == 0) {
3623                         ro_set = 1;
3624                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3625                         /*
3626                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3627                          * fails to create a new chunk for metadata.
3628                          * This is not a problem for scrub, because
3629                          * metadata is always COWed, and our scrub pauses
3630                          * transaction commits.
3631                          */
3632                         ro_set = 0;
3633                 } else {
3634                         btrfs_warn(fs_info,
3635                                    "failed setting block group ro: %d", ret);
3636                         btrfs_unfreeze_block_group(cache);
3637                         btrfs_put_block_group(cache);
3638                         scrub_pause_off(fs_info);
3639                         break;
3640                 }
3641
3642                 /*
3643                  * Now the target block group is marked RO, wait for nocow
3644                  * writes to finish before dev-replace.
3645                  * COW is fine, as COW never overwrites extents in commit tree.
3646                  */
3647                 if (sctx->is_dev_replace) {
3648                         btrfs_wait_nocow_writers(cache);
3649                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3650                                         cache->length);
3651                 }
3652
3653                 scrub_pause_off(fs_info);
3654                 down_write(&dev_replace->rwsem);
3655                 dev_replace->cursor_right = found_key.offset + length;
3656                 dev_replace->cursor_left = found_key.offset;
3657                 dev_replace->item_needs_writeback = 1;
3658                 up_write(&dev_replace->rwsem);
3659
3660                 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3661                                   found_key.offset, cache);
3662
3663                 /*
3664                  * flush, submit all pending read and write bios, afterwards
3665                  * wait for them.
3666                  * Note that in the dev replace case, a read request causes
3667                  * write requests that are submitted in the read completion
3668                  * worker. Therefore in the current situation, it is required
3669                  * that all write requests are flushed, so that all read and
3670                  * write requests are really completed when bios_in_flight
3671                  * changes to 0.
3672                  */
3673                 sctx->flush_all_writes = true;
3674                 scrub_submit(sctx);
3675                 mutex_lock(&sctx->wr_lock);
3676                 scrub_wr_submit(sctx);
3677                 mutex_unlock(&sctx->wr_lock);
3678
3679                 wait_event(sctx->list_wait,
3680                            atomic_read(&sctx->bios_in_flight) == 0);
3681
3682                 scrub_pause_on(fs_info);
3683
3684                 /*
3685                  * Must be called before we decrease @scrub_paused.
3686                  * Make sure we don't block transaction commit while
3687                  * we are waiting for pending workers to finish.
3688                  */
3689                 wait_event(sctx->list_wait,
3690                            atomic_read(&sctx->workers_pending) == 0);
3691                 sctx->flush_all_writes = false;
3692
3693                 scrub_pause_off(fs_info);
3694
3695                 down_write(&dev_replace->rwsem);
3696                 dev_replace->cursor_left = dev_replace->cursor_right;
3697                 dev_replace->item_needs_writeback = 1;
3698                 up_write(&dev_replace->rwsem);
3699
3700                 if (ro_set)
3701                         btrfs_dec_block_group_ro(cache);
3702
3703                 /*
3704                  * We might have prevented the cleaner kthread from deleting
3705                  * this block group if it was already unused because we raced
3706                  * and set it to RO mode first. So add it back to the unused
3707                  * list, otherwise it might not ever be deleted unless a manual
3708                  * balance is triggered or it becomes used and unused again.
3709                  */
3710                 spin_lock(&cache->lock);
3711                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3712                     cache->used == 0) {
3713                         spin_unlock(&cache->lock);
3714                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3715                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
3716                                                          cache);
3717                         else
3718                                 btrfs_mark_bg_unused(cache);
3719                 } else {
3720                         spin_unlock(&cache->lock);
3721                 }
3722
3723                 btrfs_unfreeze_block_group(cache);
3724                 btrfs_put_block_group(cache);
3725                 if (ret)
3726                         break;
3727                 if (sctx->is_dev_replace &&
3728                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3729                         ret = -EIO;
3730                         break;
3731                 }
3732                 if (sctx->stat.malloc_errors > 0) {
3733                         ret = -ENOMEM;
3734                         break;
3735                 }
3736 skip:
3737                 key.offset = found_key.offset + length;
3738                 btrfs_release_path(path);
3739         }
3740
3741         btrfs_free_path(path);
3742
3743         return ret;
3744 }
3745
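/*
 * Scrub all super block copies on @scrub_dev that fit within the
 * committed device size, then wait for the reads to complete.
 */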
3746 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3747                                            struct btrfs_device *scrub_dev)
3748 {
3749         int     i;
3750         u64     bytenr;
3751         u64     gen;
3752         int     ret;
3753         struct btrfs_fs_info *fs_info = sctx->fs_info;
3754
3755         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3756                 return -EROFS;
3757
3758         /* Seed devices of a new filesystem have their own generation. */
3759         if (scrub_dev->fs_devices != fs_info->fs_devices)
3760                 gen = scrub_dev->generation;
3761         else
3762                 gen = fs_info->last_trans_committed;
3763
3764         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3765                 bytenr = btrfs_sb_offset(i);
3766                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3767                     scrub_dev->commit_total_bytes)
3768                         break;
3769                 if (!btrfs_check_super_location(scrub_dev, bytenr))
3770                         continue;
3771
3772                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3773                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3774                                   NULL, bytenr);
3775                 if (ret)
3776                         return ret;
3777         }
3778         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3779
3780         return 0;
3781 }
3782
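/*
 * Drop a reference on the scrub workqueues and destroy them once the
 * last user is gone.  The fs_info pointers are cleared under scrub_lock
 * so the actual destruction can happen outside of the lock.
 */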
3783 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3784 {
3785         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3786                                         &fs_info->scrub_lock)) {
3787                 struct btrfs_workqueue *scrub_workers = NULL;
3788                 struct btrfs_workqueue *scrub_wr_comp = NULL;
3789                 struct btrfs_workqueue *scrub_parity = NULL;
3790
3791                 scrub_workers = fs_info->scrub_workers;
3792                 scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3793                 scrub_parity = fs_info->scrub_parity_workers;
3794
3795                 fs_info->scrub_workers = NULL;
3796                 fs_info->scrub_wr_completion_workers = NULL;
3797                 fs_info->scrub_parity_workers = NULL;
3798                 mutex_unlock(&fs_info->scrub_lock);
3799
3800                 btrfs_destroy_workqueue(scrub_workers);
3801                 btrfs_destroy_workqueue(scrub_wr_comp);
3802                 btrfs_destroy_workqueue(scrub_parity);
3803         }
3804 }
3805
3806 /*
3807  * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
3808  */
3809 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3810                                                 int is_dev_replace)
3811 {
3812         struct btrfs_workqueue *scrub_workers = NULL;
3813         struct btrfs_workqueue *scrub_wr_comp = NULL;
3814         struct btrfs_workqueue *scrub_parity = NULL;
3815         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3816         int max_active = fs_info->thread_pool_size;
3817         int ret = -ENOMEM;
3818
3819         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
3820                 return 0;
3821
3822         scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
3823                                               is_dev_replace ? 1 : max_active, 4);
3824         if (!scrub_workers)
3825                 goto fail_scrub_workers;
3826
3827         scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3828                                               max_active, 2);
3829         if (!scrub_wr_comp)
3830                 goto fail_scrub_wr_completion_workers;
3831
3832         scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3833                                              max_active, 2);
3834         if (!scrub_parity)
3835                 goto fail_scrub_parity_workers;
3836
3837         mutex_lock(&fs_info->scrub_lock);
3838         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
3839                 ASSERT(fs_info->scrub_workers == NULL &&
3840                        fs_info->scrub_wr_completion_workers == NULL &&
3841                        fs_info->scrub_parity_workers == NULL);
3842                 fs_info->scrub_workers = scrub_workers;
3843                 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
3844                 fs_info->scrub_parity_workers = scrub_parity;
3845                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
3846                 mutex_unlock(&fs_info->scrub_lock);
3847                 return 0;
3848         }
3849         /* Other thread raced in and created the workers for us */
3850         refcount_inc(&fs_info->scrub_workers_refcnt);
3851         mutex_unlock(&fs_info->scrub_lock);
3852
3853         ret = 0;
3854         btrfs_destroy_workqueue(scrub_parity);
3855 fail_scrub_parity_workers:
3856         btrfs_destroy_workqueue(scrub_wr_comp);
3857 fail_scrub_wr_completion_workers:
3858         btrfs_destroy_workqueue(scrub_workers);
3859 fail_scrub_workers:
3860         return ret;
3861 }
3862
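/*
 * Entry point for scrub and dev-replace on a single device: verify the
 * super block copies (scrub only) and all allocated chunks of @devid
 * within [start, end).  The accumulated statistics are copied to
 * @progress when it is provided.
 */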
3863 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3864                     u64 end, struct btrfs_scrub_progress *progress,
3865                     int readonly, int is_dev_replace)
3866 {
3867         struct scrub_ctx *sctx;
3868         int ret;
3869         struct btrfs_device *dev;
3870         unsigned int nofs_flag;
3871
3872         if (btrfs_fs_closing(fs_info))
3873                 return -EAGAIN;
3874
3875         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
3876                 /*
3877                  * In this case scrub is unable to calculate the checksum
3878                  * due to the way it is implemented. Do not handle this
3879                  * situation at all because it won't ever happen.
3880                  */
3881                 btrfs_err(fs_info,
3882                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3883                        fs_info->nodesize,
3884                        BTRFS_STRIPE_LEN);
3885                 return -EINVAL;
3886         }
3887
3888         if (fs_info->nodesize >
3889             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3890             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3891                 /*
3892                  * This would exhaust the array bounds of the pagev member
3893                  * in struct scrub_block.
3894                  */
3895                 btrfs_err(fs_info,
3896                           "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3897                        fs_info->nodesize,
3898                        SCRUB_MAX_PAGES_PER_BLOCK,
3899                        fs_info->sectorsize,
3900                        SCRUB_MAX_PAGES_PER_BLOCK);
3901                 return -EINVAL;
3902         }
3903
3904         /* Allocate outside of device_list_mutex */
3905         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
3906         if (IS_ERR(sctx))
3907                 return PTR_ERR(sctx);
3908
3909         ret = scrub_workers_get(fs_info, is_dev_replace);
3910         if (ret)
3911                 goto out_free_ctx;
3912
3913         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3914         dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
3915         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
3916                      !is_dev_replace)) {
3917                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3918                 ret = -ENODEV;
3919                 goto out;
3920         }
3921
3922         if (!is_dev_replace && !readonly &&
3923             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
3924                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3925                 btrfs_err_in_rcu(fs_info,
3926                         "scrub on devid %llu: filesystem on %s is not writable",
3927                                  devid, rcu_str_deref(dev->name));
3928                 ret = -EROFS;
3929                 goto out;
3930         }
3931
3932         mutex_lock(&fs_info->scrub_lock);
3933         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3934             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
3935                 mutex_unlock(&fs_info->scrub_lock);
3936                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3937                 ret = -EIO;
3938                 goto out;
3939         }
3940
3941         down_read(&fs_info->dev_replace.rwsem);
3942         if (dev->scrub_ctx ||
3943             (!is_dev_replace &&
3944              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3945                 up_read(&fs_info->dev_replace.rwsem);
3946                 mutex_unlock(&fs_info->scrub_lock);
3947                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3948                 ret = -EINPROGRESS;
3949                 goto out;
3950         }
3951         up_read(&fs_info->dev_replace.rwsem);
3952
3953         sctx->readonly = readonly;
3954         dev->scrub_ctx = sctx;
3955         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3956
3957         /*
3958          * By checking @scrub_pause_req here, we can avoid a
3959          * race between transaction commit and scrubbing.
3960          */
3961         __scrub_blocked_if_needed(fs_info);
3962         atomic_inc(&fs_info->scrubs_running);
3963         mutex_unlock(&fs_info->scrub_lock);
3964
3965         /*
3966          * In order to avoid deadlock with reclaim when there is a transaction
3967          * trying to pause scrub, make sure we use GFP_NOFS for all the
3968          * allocations done at scrub_pages() and scrub_pages_for_parity()
3969          * invoked by our callees. The pausing request is done when the
3970          * transaction commit starts, and it blocks the transaction until scrub
3971          * is paused (done at specific points at scrub_stripe() or right above
3972          * before incrementing fs_info->scrubs_running).
3973          */
3974         nofs_flag = memalloc_nofs_save();
3975         if (!is_dev_replace) {
3976                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
3977                 /*
3978                  * by holding device list mutex, we can
3979                  * kick off writing super in log tree sync.
3980                  */
3981                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3982                 ret = scrub_supers(sctx, dev);
3983                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3984         }
3985
3986         if (!ret)
3987                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
3988         memalloc_nofs_restore(nofs_flag);
3989
3990         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3991         atomic_dec(&fs_info->scrubs_running);
3992         wake_up(&fs_info->scrub_pause_wait);
3993
3994         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3995
3996         if (progress)
3997                 memcpy(progress, &sctx->stat, sizeof(*progress));
3998
3999         if (!is_dev_replace)
4000                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4001                         ret ? "not finished" : "finished", devid, ret);
4002
4003         mutex_lock(&fs_info->scrub_lock);
4004         dev->scrub_ctx = NULL;
4005         mutex_unlock(&fs_info->scrub_lock);
4006
4007         scrub_workers_put(fs_info);
4008         scrub_put_ctx(sctx);
4009
4010         return ret;
4011 out:
4012         scrub_workers_put(fs_info);
4013 out_free_ctx:
4014         scrub_free_ctx(sctx);
4015
4016         return ret;
4017 }
4018
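/*
 * Ask all running scrubs to pause and wait until every one of them has
 * reached a pause point (scrubs_paused == scrubs_running).  Typically paired
 * with a later btrfs_scrub_continue(), e.g. around a transaction commit.
 */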
4019 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4020 {
4021         mutex_lock(&fs_info->scrub_lock);
4022         atomic_inc(&fs_info->scrub_pause_req);
4023         while (atomic_read(&fs_info->scrubs_paused) !=
4024                atomic_read(&fs_info->scrubs_running)) {
4025                 mutex_unlock(&fs_info->scrub_lock);
4026                 wait_event(fs_info->scrub_pause_wait,
4027                            atomic_read(&fs_info->scrubs_paused) ==
4028                            atomic_read(&fs_info->scrubs_running));
4029                 mutex_lock(&fs_info->scrub_lock);
4030         }
4031         mutex_unlock(&fs_info->scrub_lock);
4032 }
4033
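/*
 * Drop the pause request taken by btrfs_scrub_pause() and wake up any scrubs
 * waiting to resume.
 */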
4034 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4035 {
4036         atomic_dec(&fs_info->scrub_pause_req);
4037         wake_up(&fs_info->scrub_pause_wait);
4038 }
4039
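/*
 * Cancel all scrubs running on this filesystem and wait until they have
 * finished.  Returns -ENOTCONN if no scrub was running.
 */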
4040 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4041 {
4042         mutex_lock(&fs_info->scrub_lock);
4043         if (!atomic_read(&fs_info->scrubs_running)) {
4044                 mutex_unlock(&fs_info->scrub_lock);
4045                 return -ENOTCONN;
4046         }
4047
4048         atomic_inc(&fs_info->scrub_cancel_req);
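	/*
	 * The scrub paths take scrub_lock while pausing and tearing down, so
	 * drop it while we wait for the running scrubs to exit.
	 */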
4049         while (atomic_read(&fs_info->scrubs_running)) {
4050                 mutex_unlock(&fs_info->scrub_lock);
4051                 wait_event(fs_info->scrub_pause_wait,
4052                            atomic_read(&fs_info->scrubs_running) == 0);
4053                 mutex_lock(&fs_info->scrub_lock);
4054         }
4055         atomic_dec(&fs_info->scrub_cancel_req);
4056         mutex_unlock(&fs_info->scrub_lock);
4057
4058         return 0;
4059 }
4060
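/*
 * Cancel the scrub running on a single device and wait until its context has
 * been detached.  Returns -ENOTCONN if the device is not being scrubbed.
 */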
4061 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4062 {
4063         struct btrfs_fs_info *fs_info = dev->fs_info;
4064         struct scrub_ctx *sctx;
4065
4066         mutex_lock(&fs_info->scrub_lock);
4067         sctx = dev->scrub_ctx;
4068         if (!sctx) {
4069                 mutex_unlock(&fs_info->scrub_lock);
4070                 return -ENOTCONN;
4071         }
4072         atomic_inc(&sctx->cancel_req);
4073         while (dev->scrub_ctx) {
4074                 mutex_unlock(&fs_info->scrub_lock);
4075                 wait_event(fs_info->scrub_pause_wait,
4076                            dev->scrub_ctx == NULL);
4077                 mutex_lock(&fs_info->scrub_lock);
4078         }
4079         mutex_unlock(&fs_info->scrub_lock);
4080
4081         return 0;
4082 }
4083
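/*
 * Copy the current scrub statistics for @devid into @progress.  Returns
 * -ENODEV if the device cannot be found and -ENOTCONN if it is not being
 * scrubbed; typically reached via the scrub progress ioctl.
 */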
4084 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4085                          struct btrfs_scrub_progress *progress)
4086 {
4087         struct btrfs_device *dev;
4088         struct scrub_ctx *sctx = NULL;
4089
4090         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4091         dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
4092         if (dev)
4093                 sctx = dev->scrub_ctx;
4094         if (sctx)
4095                 memcpy(progress, &sctx->stat, sizeof(*progress));
4096         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4097
4098         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4099 }
4100
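/*
 * Map a logical extent to the physical offset, device and mirror number of
 * its first stripe.  On any mapping failure the output parameters are left
 * untouched.
 */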
4101 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4102                                u64 extent_logical, u32 extent_len,
4103                                u64 *extent_physical,
4104                                struct btrfs_device **extent_dev,
4105                                int *extent_mirror_num)
4106 {
4107         u64 mapped_length;
4108         struct btrfs_bio *bbio = NULL;
4109         int ret;
4110
4111         mapped_length = extent_len;
4112         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4113                               &mapped_length, &bbio, 0);
4114         if (ret || !bbio || mapped_length < extent_len ||
4115             !bbio->stripes[0].dev->bdev) {
4116                 btrfs_put_bbio(bbio);
4117                 return;
4118         }
4119
4120         *extent_physical = bbio->stripes[0].physical;
4121         *extent_mirror_num = bbio->mirror_num;
4122         *extent_dev = bbio->stripes[0].dev;
4123         btrfs_put_bbio(bbio);
4124 }