1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
21 #include "raid56.h"
22 #include "block-group.h"
23 #include "zoned.h"
24
25 /*
26  * This is only the first step towards a full-featured scrub. It reads all
27  * extents and super blocks and verifies the checksums. In case a bad checksum
28  * is found or the extent cannot be read, good data will be written back if
29  * any can be found.
30  *
31  * Future enhancements:
32  *  - In case an unrepairable extent is encountered, track which files are
33  *    affected and report them
34  *  - track and record media errors, throw out bad devices
35  *  - add a mode to also read unallocated space
36  */
37
38 struct scrub_block;
39 struct scrub_ctx;
40
41 /*
42  * The following two values only influence the performance.
43  *
44  * The second one configures the number of parallel and outstanding I/O
45  * operations. The first one configures an upper limit for the number
46  * of (dynamically allocated) pages that are added to a bio.
47  */
48 #define SCRUB_SECTORS_PER_BIO   32      /* 128KiB per bio for 4KiB pages */
49 #define SCRUB_BIOS_PER_SCTX     64      /* 8MiB per device in flight for 4KiB pages */
50
51 /*
52  * The following value times the minimum sectorsize (4KiB) needs to be large
53  * enough to match the largest node/leaf/sector size that shall be supported.
54  */
55 #define SCRUB_MAX_SECTORS_PER_BLOCK     (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
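/*
 * With the current BTRFS_MAX_METADATA_BLOCKSIZE of 64KiB this evaluates to 16,
 * i.e. a metadata scrub_block spans at most 16 sectors.
 */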
56
57 struct scrub_recover {
58         refcount_t              refs;
59         struct btrfs_io_context *bioc;
60         u64                     map_length;
61 };
62
63 struct scrub_sector {
64         struct scrub_block      *sblock;
65         struct page             *page;
66         struct btrfs_device     *dev;
67         struct list_head        list;
68         u64                     flags;  /* extent flags */
69         u64                     generation;
70         u64                     logical;
71         u64                     physical;
72         u64                     physical_for_dev_replace;
73         atomic_t                refs;
74         u8                      mirror_num;
75         unsigned int            have_csum:1;
76         unsigned int            io_error:1;
77         u8                      csum[BTRFS_CSUM_SIZE];
78
79         struct scrub_recover    *recover;
80 };
81
82 struct scrub_bio {
83         int                     index;
84         struct scrub_ctx        *sctx;
85         struct btrfs_device     *dev;
86         struct bio              *bio;
87         blk_status_t            status;
88         u64                     logical;
89         u64                     physical;
90         struct scrub_sector     *sectors[SCRUB_SECTORS_PER_BIO];
91         int                     sector_count;
92         int                     next_free;
93         struct work_struct      work;
94 };
95
96 struct scrub_block {
97         struct scrub_sector     *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
98         int                     sector_count;
99         atomic_t                outstanding_sectors;
100         refcount_t              refs; /* free mem on transition to zero */
101         struct scrub_ctx        *sctx;
102         struct scrub_parity     *sparity;
103         struct {
104                 unsigned int    header_error:1;
105                 unsigned int    checksum_error:1;
106                 unsigned int    no_io_error_seen:1;
107                 unsigned int    generation_error:1; /* also sets header_error */
108
109                 /* The following is for the data used to check parity, */
110                 /* i.e. data that has a checksum */
111                 unsigned int    data_corrected:1;
112         };
113         struct work_struct      work;
114 };
115
116 /* Used for the chunks with parity stripes, such as RAID5/6 */
117 struct scrub_parity {
118         struct scrub_ctx        *sctx;
119
120         struct btrfs_device     *scrub_dev;
121
122         u64                     logic_start;
123
124         u64                     logic_end;
125
126         int                     nsectors;
127
128         u32                     stripe_len;
129
130         refcount_t              refs;
131
132         struct list_head        sectors_list;
133
134         /* Work of parity check and repair */
135         struct work_struct      work;
136
137         /* Mark the parity blocks which have data */
138         unsigned long           *dbitmap;
139
140         /*
141          * Mark the parity blocks which have data, but where errors happened
142          * when reading or checking that data
143          */
144         unsigned long           *ebitmap;
145
146         unsigned long           bitmap[];
147 };
148
149 struct scrub_ctx {
150         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
151         struct btrfs_fs_info    *fs_info;
152         int                     first_free;
153         int                     curr;
154         atomic_t                bios_in_flight;
155         atomic_t                workers_pending;
156         spinlock_t              list_lock;
157         wait_queue_head_t       list_wait;
158         struct list_head        csum_list;
159         atomic_t                cancel_req;
160         int                     readonly;
161         int                     sectors_per_bio;
162
163         /* State of IO submission throttling affecting the associated device */
164         ktime_t                 throttle_deadline;
165         u64                     throttle_sent;
166
167         int                     is_dev_replace;
168         u64                     write_pointer;
169
170         struct scrub_bio        *wr_curr_bio;
171         struct mutex            wr_lock;
172         struct btrfs_device     *wr_tgtdev;
173         bool                    flush_all_writes;
174
175         /*
176          * statistics
177          */
178         struct btrfs_scrub_progress stat;
179         spinlock_t              stat_lock;
180
181         /*
182          * Use a ref counter to avoid use-after-free issues. Scrub workers
183          * decrement bios_in_flight and workers_pending and then do a wakeup
184          * on the list_wait wait queue. We must ensure the main scrub task
185          * doesn't free the scrub context before or while the workers are
186          * doing the wakeup() call.
187          */
188         refcount_t              refs;
189 };
190
191 struct scrub_warning {
192         struct btrfs_path       *path;
193         u64                     extent_item_size;
194         const char              *errstr;
195         u64                     physical;
196         u64                     logical;
197         struct btrfs_device     *dev;
198 };
199
200 struct full_stripe_lock {
201         struct rb_node node;
202         u64 logical;
203         u64 refs;
204         struct mutex mutex;
205 };
206
207 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
208                                      struct scrub_block *sblocks_for_recheck);
209 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
210                                 struct scrub_block *sblock,
211                                 int retry_failed_mirror);
212 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
213 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
214                                              struct scrub_block *sblock_good);
215 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
216                                             struct scrub_block *sblock_good,
217                                             int sector_num, int force_write);
218 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
219 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
220                                              int sector_num);
221 static int scrub_checksum_data(struct scrub_block *sblock);
222 static int scrub_checksum_tree_block(struct scrub_block *sblock);
223 static int scrub_checksum_super(struct scrub_block *sblock);
224 static void scrub_block_put(struct scrub_block *sblock);
225 static void scrub_sector_get(struct scrub_sector *sector);
226 static void scrub_sector_put(struct scrub_sector *sector);
227 static void scrub_parity_get(struct scrub_parity *sparity);
228 static void scrub_parity_put(struct scrub_parity *sparity);
229 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
230                          u64 physical, struct btrfs_device *dev, u64 flags,
231                          u64 gen, int mirror_num, u8 *csum,
232                          u64 physical_for_dev_replace);
233 static void scrub_bio_end_io(struct bio *bio);
234 static void scrub_bio_end_io_worker(struct work_struct *work);
235 static void scrub_block_complete(struct scrub_block *sblock);
236 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
237                                  u64 extent_logical, u32 extent_len,
238                                  u64 *extent_physical,
239                                  struct btrfs_device **extent_dev,
240                                  int *extent_mirror_num);
241 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
242                                       struct scrub_sector *sector);
243 static void scrub_wr_submit(struct scrub_ctx *sctx);
244 static void scrub_wr_bio_end_io(struct bio *bio);
245 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
246 static void scrub_put_ctx(struct scrub_ctx *sctx);
247
248 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
249 {
250         return sector->recover &&
251                (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
252 }
253
254 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
255 {
256         refcount_inc(&sctx->refs);
257         atomic_inc(&sctx->bios_in_flight);
258 }
259
260 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
261 {
262         atomic_dec(&sctx->bios_in_flight);
263         wake_up(&sctx->list_wait);
264         scrub_put_ctx(sctx);
265 }
266
267 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
268 {
269         while (atomic_read(&fs_info->scrub_pause_req)) {
270                 mutex_unlock(&fs_info->scrub_lock);
271                 wait_event(fs_info->scrub_pause_wait,
272                    atomic_read(&fs_info->scrub_pause_req) == 0);
273                 mutex_lock(&fs_info->scrub_lock);
274         }
275 }
276
277 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
278 {
279         atomic_inc(&fs_info->scrubs_paused);
280         wake_up(&fs_info->scrub_pause_wait);
281 }
282
283 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
284 {
285         mutex_lock(&fs_info->scrub_lock);
286         __scrub_blocked_if_needed(fs_info);
287         atomic_dec(&fs_info->scrubs_paused);
288         mutex_unlock(&fs_info->scrub_lock);
289
290         wake_up(&fs_info->scrub_pause_wait);
291 }
292
293 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
294 {
295         scrub_pause_on(fs_info);
296         scrub_pause_off(fs_info);
297 }
298
299 /*
300  * Insert new full stripe lock into full stripe locks tree
301  *
302  * Return pointer to existing or newly inserted full_stripe_lock structure if
303  * everything works well.
304  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
305  *
306  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
307  * function
308  */
309 static struct full_stripe_lock *insert_full_stripe_lock(
310                 struct btrfs_full_stripe_locks_tree *locks_root,
311                 u64 fstripe_logical)
312 {
313         struct rb_node **p;
314         struct rb_node *parent = NULL;
315         struct full_stripe_lock *entry;
316         struct full_stripe_lock *ret;
317
318         lockdep_assert_held(&locks_root->lock);
319
320         p = &locks_root->root.rb_node;
321         while (*p) {
322                 parent = *p;
323                 entry = rb_entry(parent, struct full_stripe_lock, node);
324                 if (fstripe_logical < entry->logical) {
325                         p = &(*p)->rb_left;
326                 } else if (fstripe_logical > entry->logical) {
327                         p = &(*p)->rb_right;
328                 } else {
329                         entry->refs++;
330                         return entry;
331                 }
332         }
333
334         /*
335          * Insert new lock.
336          */
337         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
338         if (!ret)
339                 return ERR_PTR(-ENOMEM);
340         ret->logical = fstripe_logical;
341         ret->refs = 1;
342         mutex_init(&ret->mutex);
343
344         rb_link_node(&ret->node, parent, p);
345         rb_insert_color(&ret->node, &locks_root->root);
346         return ret;
347 }
348
349 /*
350  * Search for a full stripe lock of a block group
351  *
352  * Return pointer to existing full stripe lock if found
353  * Return NULL if not found
354  */
355 static struct full_stripe_lock *search_full_stripe_lock(
356                 struct btrfs_full_stripe_locks_tree *locks_root,
357                 u64 fstripe_logical)
358 {
359         struct rb_node *node;
360         struct full_stripe_lock *entry;
361
362         lockdep_assert_held(&locks_root->lock);
363
364         node = locks_root->root.rb_node;
365         while (node) {
366                 entry = rb_entry(node, struct full_stripe_lock, node);
367                 if (fstripe_logical < entry->logical)
368                         node = node->rb_left;
369                 else if (fstripe_logical > entry->logical)
370                         node = node->rb_right;
371                 else
372                         return entry;
373         }
374         return NULL;
375 }
376
377 /*
378  * Helper to get full stripe logical from a normal bytenr.
379  *
380  * Caller must ensure @cache is a RAID56 block group.
381  */
382 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
383 {
384         u64 ret;
385
386         /*
387          * Due to chunk item size limit, full stripe length should not be
388          * larger than U32_MAX. Just a sanity check here.
389          */
390         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
391
392         /*
393  * round_down() can only handle powers of 2, while a RAID56 full
394  * stripe length can be 64KiB * n, so we need to manually round down.
395          */
396         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
397                         cache->full_stripe_len + cache->start;
398         return ret;
399 }
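/*
 * Worked example with made-up numbers: for a block group starting at
 * cache->start = 1GiB with full_stripe_len = 192KiB (3 data stripes of 64KiB),
 * bytenr = 1GiB + 500KiB gives div64_u64(500KiB, 192KiB) = 2, so the returned
 * full stripe logical is 1GiB + 2 * 192KiB = 1GiB + 384KiB.
 */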
400
401 /*
402  * Lock a full stripe to avoid concurrent recovery and read
403  *
404  * It's only used for profiles with parity (RAID5/6); for other profiles it
405  * does nothing.
406  *
407  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
408  * The caller must then call unlock_full_stripe() in the same context.
409  *
410  * Return <0 if an error is encountered.
411  */
412 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
413                             bool *locked_ret)
414 {
415         struct btrfs_block_group *bg_cache;
416         struct btrfs_full_stripe_locks_tree *locks_root;
417         struct full_stripe_lock *existing;
418         u64 fstripe_start;
419         int ret = 0;
420
421         *locked_ret = false;
422         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
423         if (!bg_cache) {
424                 ASSERT(0);
425                 return -ENOENT;
426         }
427
428         /* Profiles not based on parity don't need full stripe lock */
429         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
430                 goto out;
431         locks_root = &bg_cache->full_stripe_locks_root;
432
433         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
434
435         /* Now insert the full stripe lock */
436         mutex_lock(&locks_root->lock);
437         existing = insert_full_stripe_lock(locks_root, fstripe_start);
438         mutex_unlock(&locks_root->lock);
439         if (IS_ERR(existing)) {
440                 ret = PTR_ERR(existing);
441                 goto out;
442         }
443         mutex_lock(&existing->mutex);
444         *locked_ret = true;
445 out:
446         btrfs_put_block_group(bg_cache);
447         return ret;
448 }
449
450 /*
451  * Unlock a full stripe.
452  *
453  * NOTE: Caller must ensure it's the same context calling corresponding
454  * lock_full_stripe().
455  *
456  * Return 0 if we unlocked the full stripe without problem.
457  * Return <0 on error.
458  */
459 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
460                               bool locked)
461 {
462         struct btrfs_block_group *bg_cache;
463         struct btrfs_full_stripe_locks_tree *locks_root;
464         struct full_stripe_lock *fstripe_lock;
465         u64 fstripe_start;
466         bool freeit = false;
467         int ret = 0;
468
469         /* If we didn't acquire full stripe lock, no need to continue */
470         if (!locked)
471                 return 0;
472
473         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
474         if (!bg_cache) {
475                 ASSERT(0);
476                 return -ENOENT;
477         }
478         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
479                 goto out;
480
481         locks_root = &bg_cache->full_stripe_locks_root;
482         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
483
484         mutex_lock(&locks_root->lock);
485         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
486         /* Unpaired unlock_full_stripe() detected */
487         if (!fstripe_lock) {
488                 WARN_ON(1);
489                 ret = -ENOENT;
490                 mutex_unlock(&locks_root->lock);
491                 goto out;
492         }
493
494         if (fstripe_lock->refs == 0) {
495                 WARN_ON(1);
496                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
497                         fstripe_lock->logical);
498         } else {
499                 fstripe_lock->refs--;
500         }
501
502         if (fstripe_lock->refs == 0) {
503                 rb_erase(&fstripe_lock->node, &locks_root->root);
504                 freeit = true;
505         }
506         mutex_unlock(&locks_root->lock);
507
508         mutex_unlock(&fstripe_lock->mutex);
509         if (freeit)
510                 kfree(fstripe_lock);
511 out:
512         btrfs_put_block_group(bg_cache);
513         return ret;
514 }
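/*
 * Illustrative pairing sketch (mirroring what scrub_handle_errored_block()
 * does below): lock_full_stripe() and unlock_full_stripe() must be called
 * from the same context with the same @bytenr and the returned @locked value:
 *
 *	bool locked = false;
 *	int ret;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	... recheck/repair the sectors covered by the full stripe ...
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 */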
515
516 static void scrub_free_csums(struct scrub_ctx *sctx)
517 {
518         while (!list_empty(&sctx->csum_list)) {
519                 struct btrfs_ordered_sum *sum;
520                 sum = list_first_entry(&sctx->csum_list,
521                                        struct btrfs_ordered_sum, list);
522                 list_del(&sum->list);
523                 kfree(sum);
524         }
525 }
526
527 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
528 {
529         int i;
530
531         if (!sctx)
532                 return;
533
534         /* this can happen when scrub is cancelled */
535         if (sctx->curr != -1) {
536                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
537
538                 for (i = 0; i < sbio->sector_count; i++) {
539                         WARN_ON(!sbio->sectors[i]->page);
540                         scrub_block_put(sbio->sectors[i]->sblock);
541                 }
542                 bio_put(sbio->bio);
543         }
544
545         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
546                 struct scrub_bio *sbio = sctx->bios[i];
547
548                 if (!sbio)
549                         break;
550                 kfree(sbio);
551         }
552
553         kfree(sctx->wr_curr_bio);
554         scrub_free_csums(sctx);
555         kfree(sctx);
556 }
557
558 static void scrub_put_ctx(struct scrub_ctx *sctx)
559 {
560         if (refcount_dec_and_test(&sctx->refs))
561                 scrub_free_ctx(sctx);
562 }
563
564 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
565                 struct btrfs_fs_info *fs_info, int is_dev_replace)
566 {
567         struct scrub_ctx *sctx;
568         int             i;
569
570         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
571         if (!sctx)
572                 goto nomem;
573         refcount_set(&sctx->refs, 1);
574         sctx->is_dev_replace = is_dev_replace;
575         sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
576         sctx->curr = -1;
577         sctx->fs_info = fs_info;
578         INIT_LIST_HEAD(&sctx->csum_list);
579         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
580                 struct scrub_bio *sbio;
581
582                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
583                 if (!sbio)
584                         goto nomem;
585                 sctx->bios[i] = sbio;
586
587                 sbio->index = i;
588                 sbio->sctx = sctx;
589                 sbio->sector_count = 0;
590                 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
591
592                 if (i != SCRUB_BIOS_PER_SCTX - 1)
593                         sctx->bios[i]->next_free = i + 1;
594                 else
595                         sctx->bios[i]->next_free = -1;
596         }
597         sctx->first_free = 0;
598         atomic_set(&sctx->bios_in_flight, 0);
599         atomic_set(&sctx->workers_pending, 0);
600         atomic_set(&sctx->cancel_req, 0);
601
602         spin_lock_init(&sctx->list_lock);
603         spin_lock_init(&sctx->stat_lock);
604         init_waitqueue_head(&sctx->list_wait);
605         sctx->throttle_deadline = 0;
606
607         WARN_ON(sctx->wr_curr_bio != NULL);
608         mutex_init(&sctx->wr_lock);
609         sctx->wr_curr_bio = NULL;
610         if (is_dev_replace) {
611                 WARN_ON(!fs_info->dev_replace.tgtdev);
612                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
613                 sctx->flush_all_writes = false;
614         }
615
616         return sctx;
617
618 nomem:
619         scrub_free_ctx(sctx);
620         return ERR_PTR(-ENOMEM);
621 }
622
623 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
624                                      void *warn_ctx)
625 {
626         u32 nlink;
627         int ret;
628         int i;
629         unsigned nofs_flag;
630         struct extent_buffer *eb;
631         struct btrfs_inode_item *inode_item;
632         struct scrub_warning *swarn = warn_ctx;
633         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
634         struct inode_fs_paths *ipath = NULL;
635         struct btrfs_root *local_root;
636         struct btrfs_key key;
637
638         local_root = btrfs_get_fs_root(fs_info, root, true);
639         if (IS_ERR(local_root)) {
640                 ret = PTR_ERR(local_root);
641                 goto err;
642         }
643
644         /*
645          * this makes the path point to (inum INODE_ITEM ioff)
646          */
647         key.objectid = inum;
648         key.type = BTRFS_INODE_ITEM_KEY;
649         key.offset = 0;
650
651         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
652         if (ret) {
653                 btrfs_put_root(local_root);
654                 btrfs_release_path(swarn->path);
655                 goto err;
656         }
657
658         eb = swarn->path->nodes[0];
659         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
660                                         struct btrfs_inode_item);
661         nlink = btrfs_inode_nlink(eb, inode_item);
662         btrfs_release_path(swarn->path);
663
664         /*
665          * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
666          * uses GFP_NOFS in this context, so we keep it consistent but it does
667          * not seem to be strictly necessary.
668          */
669         nofs_flag = memalloc_nofs_save();
670         ipath = init_ipath(4096, local_root, swarn->path);
671         memalloc_nofs_restore(nofs_flag);
672         if (IS_ERR(ipath)) {
673                 btrfs_put_root(local_root);
674                 ret = PTR_ERR(ipath);
675                 ipath = NULL;
676                 goto err;
677         }
678         ret = paths_from_inode(inum, ipath);
679
680         if (ret < 0)
681                 goto err;
682
683         /*
684          * we deliberately ignore the fact that ipath might have been too small
685          * to hold all of the paths here
686          */
687         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
688                 btrfs_warn_in_rcu(fs_info,
689 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
690                                   swarn->errstr, swarn->logical,
691                                   rcu_str_deref(swarn->dev->name),
692                                   swarn->physical,
693                                   root, inum, offset,
694                                   fs_info->sectorsize, nlink,
695                                   (char *)(unsigned long)ipath->fspath->val[i]);
696
697         btrfs_put_root(local_root);
698         free_ipath(ipath);
699         return 0;
700
701 err:
702         btrfs_warn_in_rcu(fs_info,
703                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
704                           swarn->errstr, swarn->logical,
705                           rcu_str_deref(swarn->dev->name),
706                           swarn->physical,
707                           root, inum, offset, ret);
708
709         free_ipath(ipath);
710         return 0;
711 }
712
713 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
714 {
715         struct btrfs_device *dev;
716         struct btrfs_fs_info *fs_info;
717         struct btrfs_path *path;
718         struct btrfs_key found_key;
719         struct extent_buffer *eb;
720         struct btrfs_extent_item *ei;
721         struct scrub_warning swarn;
722         unsigned long ptr = 0;
723         u64 extent_item_pos;
724         u64 flags = 0;
725         u64 ref_root;
726         u32 item_size;
727         u8 ref_level = 0;
728         int ret;
729
730         WARN_ON(sblock->sector_count < 1);
731         dev = sblock->sectors[0]->dev;
732         fs_info = sblock->sctx->fs_info;
733
734         path = btrfs_alloc_path();
735         if (!path)
736                 return;
737
738         swarn.physical = sblock->sectors[0]->physical;
739         swarn.logical = sblock->sectors[0]->logical;
740         swarn.errstr = errstr;
741         swarn.dev = NULL;
742
743         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
744                                   &flags);
745         if (ret < 0)
746                 goto out;
747
748         extent_item_pos = swarn.logical - found_key.objectid;
749         swarn.extent_item_size = found_key.offset;
750
751         eb = path->nodes[0];
752         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
753         item_size = btrfs_item_size(eb, path->slots[0]);
754
755         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
756                 do {
757                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
758                                                       item_size, &ref_root,
759                                                       &ref_level);
760                         btrfs_warn_in_rcu(fs_info,
761 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
762                                 errstr, swarn.logical,
763                                 rcu_str_deref(dev->name),
764                                 swarn.physical,
765                                 ref_level ? "node" : "leaf",
766                                 ret < 0 ? -1 : ref_level,
767                                 ret < 0 ? -1 : ref_root);
768                 } while (ret != 1);
769                 btrfs_release_path(path);
770         } else {
771                 btrfs_release_path(path);
772                 swarn.path = path;
773                 swarn.dev = dev;
774                 iterate_extent_inodes(fs_info, found_key.objectid,
775                                         extent_item_pos, 1,
776                                         scrub_print_warning_inode, &swarn, false);
777         }
778
779 out:
780         btrfs_free_path(path);
781 }
782
783 static inline void scrub_get_recover(struct scrub_recover *recover)
784 {
785         refcount_inc(&recover->refs);
786 }
787
788 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
789                                      struct scrub_recover *recover)
790 {
791         if (refcount_dec_and_test(&recover->refs)) {
792                 btrfs_bio_counter_dec(fs_info);
793                 btrfs_put_bioc(recover->bioc);
794                 kfree(recover);
795         }
796 }
797
798 /*
799  * scrub_handle_errored_block gets called when either verification of the
800  * sectors failed or the bio failed to read, e.g. with EIO. In the latter
801  * case, this function handles all sectors in the bio, even though only one
802  * may be bad.
803  * The goal of this function is to repair the errored block by using the
804  * contents of one of the mirrors.
805  */
806 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
807 {
808         struct scrub_ctx *sctx = sblock_to_check->sctx;
809         struct btrfs_device *dev;
810         struct btrfs_fs_info *fs_info;
811         u64 logical;
812         unsigned int failed_mirror_index;
813         unsigned int is_metadata;
814         unsigned int have_csum;
815         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
816         struct scrub_block *sblock_bad;
817         int ret;
818         int mirror_index;
819         int sector_num;
820         int success;
821         bool full_stripe_locked;
822         unsigned int nofs_flag;
823         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
824                                       DEFAULT_RATELIMIT_BURST);
825
826         BUG_ON(sblock_to_check->sector_count < 1);
827         fs_info = sctx->fs_info;
828         if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
829                 /*
830                  * if we find an error in a super block, we just report it.
831                  * It will get written with the next transaction commit
832                  * anyway.
833                  */
834                 spin_lock(&sctx->stat_lock);
835                 ++sctx->stat.super_errors;
836                 spin_unlock(&sctx->stat_lock);
837                 return 0;
838         }
839         logical = sblock_to_check->sectors[0]->logical;
840         BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
841         failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
842         is_metadata = !(sblock_to_check->sectors[0]->flags &
843                         BTRFS_EXTENT_FLAG_DATA);
844         have_csum = sblock_to_check->sectors[0]->have_csum;
845         dev = sblock_to_check->sectors[0]->dev;
846
847         if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
848                 return 0;
849
850         /*
851          * We must use GFP_NOFS because the scrub task might be waiting for a
852          * worker task executing this function and in turn a transaction commit
853          * might be waiting for the scrub task to pause (which needs to wait for all
854          * the worker tasks to complete before pausing).
855          * We do allocations in the workers through insert_full_stripe_lock()
856          * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
857          * this function.
858          */
859         nofs_flag = memalloc_nofs_save();
860         /*
861          * For RAID5/6, a race can happen with the scrub thread of a
862          * different device. On data corruption, the parity and data threads
863          * will both try to recover the data.
864          * The race can lead to a doubly counted csum error, or even an
865          * unrecoverable error.
866          */
867         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
868         if (ret < 0) {
869                 memalloc_nofs_restore(nofs_flag);
870                 spin_lock(&sctx->stat_lock);
871                 if (ret == -ENOMEM)
872                         sctx->stat.malloc_errors++;
873                 sctx->stat.read_errors++;
874                 sctx->stat.uncorrectable_errors++;
875                 spin_unlock(&sctx->stat_lock);
876                 return ret;
877         }
878
879         /*
880          * Read all mirrors one after the other. This includes re-reading
881          * the extent or metadata block that failed (the reason this fixup
882          * code is called), this time sector by sector, in order to know
883          * which sectors caused I/O errors and which ones are good (for
884          * all mirrors).
885          * It is the goal to handle the situation when more than one
886          * mirror contains I/O errors, but the errors do not
887          * overlap, i.e. the data can be repaired by selecting the
888          * sectors from those mirrors without I/O error on the
889          * particular sectors. One example (with blocks >= 2 * sectorsize)
890          * would be that mirror #1 has an I/O error on the first sector,
891          * the second sector is good, and mirror #2 has an I/O error on
892          * the second sector, but the first sector is good.
893          * Then the first sector of the first mirror can be repaired by
894          * taking the first sector of the second mirror, and the
895          * second sector of the second mirror can be repaired by
896          * copying the contents of the 2nd sector of the 1st mirror.
897          * One more note: if the sectors of one mirror contain I/O
898          * errors, the checksum cannot be verified. In order to get
899          * the best data for repairing, the first attempt is to find
900          * a mirror without I/O errors and with a validated checksum.
901          * Only if this is not possible, the sectors are picked from
902          * mirrors with I/O errors without considering the checksum.
903          * If the latter is the case, at the end, the checksum of the
904          * repaired area is verified in order to correctly maintain
905          * the statistics.
906          */
907
908         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
909                                       sizeof(*sblocks_for_recheck), GFP_KERNEL);
910         if (!sblocks_for_recheck) {
911                 spin_lock(&sctx->stat_lock);
912                 sctx->stat.malloc_errors++;
913                 sctx->stat.read_errors++;
914                 sctx->stat.uncorrectable_errors++;
915                 spin_unlock(&sctx->stat_lock);
916                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
917                 goto out;
918         }
919
920         /* Setup the context, map the logical blocks and alloc the sectors */
921         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
922         if (ret) {
923                 spin_lock(&sctx->stat_lock);
924                 sctx->stat.read_errors++;
925                 sctx->stat.uncorrectable_errors++;
926                 spin_unlock(&sctx->stat_lock);
927                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
928                 goto out;
929         }
930         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
931         sblock_bad = sblocks_for_recheck + failed_mirror_index;
932
933         /* build and submit the bios for the failed mirror, check checksums */
934         scrub_recheck_block(fs_info, sblock_bad, 1);
935
936         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
937             sblock_bad->no_io_error_seen) {
938                 /*
939                  * The error disappeared after reading sector by sector, or
940                  * the area was part of a huge bio and other parts of the
941                  * bio caused I/O errors, or the block layer merged several
942                  * read requests into one and the error is caused by a
943                  * different bio (usually one of the two latter cases is
944                  * the cause)
945                  */
946                 spin_lock(&sctx->stat_lock);
947                 sctx->stat.unverified_errors++;
948                 sblock_to_check->data_corrected = 1;
949                 spin_unlock(&sctx->stat_lock);
950
951                 if (sctx->is_dev_replace)
952                         scrub_write_block_to_dev_replace(sblock_bad);
953                 goto out;
954         }
955
956         if (!sblock_bad->no_io_error_seen) {
957                 spin_lock(&sctx->stat_lock);
958                 sctx->stat.read_errors++;
959                 spin_unlock(&sctx->stat_lock);
960                 if (__ratelimit(&rs))
961                         scrub_print_warning("i/o error", sblock_to_check);
962                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
963         } else if (sblock_bad->checksum_error) {
964                 spin_lock(&sctx->stat_lock);
965                 sctx->stat.csum_errors++;
966                 spin_unlock(&sctx->stat_lock);
967                 if (__ratelimit(&rs))
968                         scrub_print_warning("checksum error", sblock_to_check);
969                 btrfs_dev_stat_inc_and_print(dev,
970                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
971         } else if (sblock_bad->header_error) {
972                 spin_lock(&sctx->stat_lock);
973                 sctx->stat.verify_errors++;
974                 spin_unlock(&sctx->stat_lock);
975                 if (__ratelimit(&rs))
976                         scrub_print_warning("checksum/header error",
977                                             sblock_to_check);
978                 if (sblock_bad->generation_error)
979                         btrfs_dev_stat_inc_and_print(dev,
980                                 BTRFS_DEV_STAT_GENERATION_ERRS);
981                 else
982                         btrfs_dev_stat_inc_and_print(dev,
983                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
984         }
985
986         if (sctx->readonly) {
987                 ASSERT(!sctx->is_dev_replace);
988                 goto out;
989         }
990
991         /*
992          * now build and submit the bios for the other mirrors, check
993          * checksums.
994          * First try to pick the mirror which is completely without I/O
995          * errors and also does not have a checksum error.
996          * If one is found, and if a checksum is present, the full block
997          * that is known to contain an error is rewritten. Afterwards
998          * the block is known to be corrected.
999          * If a mirror is found which is completely correct, and no
1000          * checksum is present, only those sectors are rewritten that had
1001          * an I/O error in the block to be repaired, since it cannot be
1002          * determined which copy of the other sectors is better (and it
1003          * could happen otherwise that a correct sector would be
1004          * overwritten by a bad one).
1005          */
1006         for (mirror_index = 0; ; mirror_index++) {
1007                 struct scrub_block *sblock_other;
1008
1009                 if (mirror_index == failed_mirror_index)
1010                         continue;
1011
1012                 /* raid56's mirror count can be more than BTRFS_MAX_MIRRORS */
1013                 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1014                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1015                                 break;
1016                         if (!sblocks_for_recheck[mirror_index].sector_count)
1017                                 break;
1018
1019                         sblock_other = sblocks_for_recheck + mirror_index;
1020                 } else {
1021                         struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1022                         int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1023
1024                         if (mirror_index >= max_allowed)
1025                                 break;
1026                         if (!sblocks_for_recheck[1].sector_count)
1027                                 break;
1028
1029                         ASSERT(failed_mirror_index == 0);
1030                         sblock_other = sblocks_for_recheck + 1;
1031                         sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1032                 }
1033
1034                 /* build and submit the bios, check checksums */
1035                 scrub_recheck_block(fs_info, sblock_other, 0);
1036
1037                 if (!sblock_other->header_error &&
1038                     !sblock_other->checksum_error &&
1039                     sblock_other->no_io_error_seen) {
1040                         if (sctx->is_dev_replace) {
1041                                 scrub_write_block_to_dev_replace(sblock_other);
1042                                 goto corrected_error;
1043                         } else {
1044                                 ret = scrub_repair_block_from_good_copy(
1045                                                 sblock_bad, sblock_other);
1046                                 if (!ret)
1047                                         goto corrected_error;
1048                         }
1049                 }
1050         }
1051
1052         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1053                 goto did_not_correct_error;
1054
1055         /*
1056          * In case of I/O errors in the area that is supposed to be
1057          * repaired, continue by picking good copies of those sectors.
1058          * Select the good sectors from mirrors to rewrite bad sectors from
1059          * the area to fix. Afterwards verify the checksum of the block
1060          * that is supposed to be repaired. This verification step is
1061          * only done for the purpose of statistics counting and for the
1062          * final scrub report on whether errors remain.
1063          * A perfect algorithm could make use of the checksum and try
1064          * all possible combinations of sectors from the different mirrors
1065          * until the checksum verification succeeds. For example, when
1066          * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1067          * of mirror #2 is readable but the final checksum test fails,
1068          * then the 2nd sector of mirror #3 could be tried to see whether
1069          * the final checksum now succeeds. But this would be a rare
1070          * exception and is therefore not implemented. At least it is
1071          * avoided that the good copy is overwritten.
1072          * A more useful improvement would be to pick the sectors
1073          * without I/O error based on sector sizes (512 bytes on legacy
1074          * disks) instead of on sectorsize. Then maybe 512 bytes of one
1075          * mirror could be repaired by taking 512 bytes of a different
1076          * mirror, even if other 512-byte sectors in the same sectorsize
1077          * area are unreadable.
1078          */
1079         success = 1;
1080         for (sector_num = 0; sector_num < sblock_bad->sector_count;
1081              sector_num++) {
1082                 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1083                 struct scrub_block *sblock_other = NULL;
1084
1085                 /* Skip no-io-error sectors in scrub */
1086                 if (!sector_bad->io_error && !sctx->is_dev_replace)
1087                         continue;
1088
1089                 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1090                         /*
1091                          * In case of dev replace, if the raid56 rebuild process
1092                          * did not produce correct data, then copy the content
1093                          * of sblock_bad to make sure the target device is identical
1094                          * to the source device, instead of writing garbage from the
1095                          * sblock_for_recheck array to the target device.
1096                          */
1097                         sblock_other = NULL;
1098                 } else if (sector_bad->io_error) {
1099                         /* Try to find no-io-error sector in mirrors */
1100                         for (mirror_index = 0;
1101                              mirror_index < BTRFS_MAX_MIRRORS &&
1102                              sblocks_for_recheck[mirror_index].sector_count > 0;
1103                              mirror_index++) {
1104                                 if (!sblocks_for_recheck[mirror_index].
1105                                     sectors[sector_num]->io_error) {
1106                                         sblock_other = sblocks_for_recheck +
1107                                                        mirror_index;
1108                                         break;
1109                                 }
1110                         }
1111                         if (!sblock_other)
1112                                 success = 0;
1113                 }
1114
1115                 if (sctx->is_dev_replace) {
1116                         /*
1117                          * Did not find a mirror to fetch the sector from.
1118                          * scrub_write_sector_to_dev_replace() handles this
1119                          * case (sector->io_error), by filling the block with
1120                          * zeros before submitting the write request
1121                          */
1122                         if (!sblock_other)
1123                                 sblock_other = sblock_bad;
1124
1125                         if (scrub_write_sector_to_dev_replace(sblock_other,
1126                                                               sector_num) != 0) {
1127                                 atomic64_inc(
1128                                         &fs_info->dev_replace.num_write_errors);
1129                                 success = 0;
1130                         }
1131                 } else if (sblock_other) {
1132                         ret = scrub_repair_sector_from_good_copy(sblock_bad,
1133                                                                  sblock_other,
1134                                                                  sector_num, 0);
1135                         if (ret == 0)
1136                                 sector_bad->io_error = 0;
1137                         else
1138                                 success = 0;
1139                 }
1140         }
1141
1142         if (success && !sctx->is_dev_replace) {
1143                 if (is_metadata || have_csum) {
1144                         /*
1145                          * need to verify the checksum now that all
1146                          * sectors on disk are repaired (the write
1147                          * request for data to be repaired is on its way).
1148                          * Just be lazy and use scrub_recheck_block()
1149                          * which re-reads the data before the checksum
1150                          * is verified, but most likely the data comes out
1151                          * of the page cache.
1152                          */
1153                         scrub_recheck_block(fs_info, sblock_bad, 1);
1154                         if (!sblock_bad->header_error &&
1155                             !sblock_bad->checksum_error &&
1156                             sblock_bad->no_io_error_seen)
1157                                 goto corrected_error;
1158                         else
1159                                 goto did_not_correct_error;
1160                 } else {
1161 corrected_error:
1162                         spin_lock(&sctx->stat_lock);
1163                         sctx->stat.corrected_errors++;
1164                         sblock_to_check->data_corrected = 1;
1165                         spin_unlock(&sctx->stat_lock);
1166                         btrfs_err_rl_in_rcu(fs_info,
1167                                 "fixed up error at logical %llu on dev %s",
1168                                 logical, rcu_str_deref(dev->name));
1169                 }
1170         } else {
1171 did_not_correct_error:
1172                 spin_lock(&sctx->stat_lock);
1173                 sctx->stat.uncorrectable_errors++;
1174                 spin_unlock(&sctx->stat_lock);
1175                 btrfs_err_rl_in_rcu(fs_info,
1176                         "unable to fixup (regular) error at logical %llu on dev %s",
1177                         logical, rcu_str_deref(dev->name));
1178         }
1179
1180 out:
1181         if (sblocks_for_recheck) {
1182                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1183                      mirror_index++) {
1184                         struct scrub_block *sblock = sblocks_for_recheck +
1185                                                      mirror_index;
1186                         struct scrub_recover *recover;
1187                         int i;
1188
1189                         for (i = 0; i < sblock->sector_count; i++) {
1190                                 sblock->sectors[i]->sblock = NULL;
1191                                 recover = sblock->sectors[i]->recover;
1192                                 if (recover) {
1193                                         scrub_put_recover(fs_info, recover);
1194                                         sblock->sectors[i]->recover = NULL;
1195                                 }
1196                                 scrub_sector_put(sblock->sectors[i]);
1197                         }
1198                 }
1199                 kfree(sblocks_for_recheck);
1200         }
1201
1202         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1203         memalloc_nofs_restore(nofs_flag);
1204         if (ret < 0)
1205                 return ret;
1206         return 0;
1207 }
1208
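/*
 * Number of copies ("mirrors") that can be tried for a sector: 2 for RAID5
 * (read the data stripe directly, or rebuild it from the remaining stripes),
 * 3 for RAID6 (which has two parity stripes to rebuild from), otherwise the
 * number of stripes in the chunk.
 */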
1209 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1210 {
1211         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1212                 return 2;
1213         else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1214                 return 3;
1215         else
1216                 return (int)bioc->num_stripes;
1217 }
1218
1219 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1220                                                  u64 *raid_map,
1221                                                  u64 mapped_length,
1222                                                  int nstripes, int mirror,
1223                                                  int *stripe_index,
1224                                                  u64 *stripe_offset)
1225 {
1226         int i;
1227
1228         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1229                 /* RAID5/6 */
1230                 for (i = 0; i < nstripes; i++) {
1231                         if (raid_map[i] == RAID6_Q_STRIPE ||
1232                             raid_map[i] == RAID5_P_STRIPE)
1233                                 continue;
1234
1235                         if (logical >= raid_map[i] &&
1236                             logical < raid_map[i] + mapped_length)
1237                                 break;
1238                 }
1239
1240                 *stripe_index = i;
1241                 *stripe_offset = logical - raid_map[i];
1242         } else {
1243                 /* The other RAID type */
1244                 *stripe_index = mirror;
1245                 *stripe_offset = 0;
1246         }
1247 }
1248
1249 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1250                                      struct scrub_block *sblocks_for_recheck)
1251 {
1252         struct scrub_ctx *sctx = original_sblock->sctx;
1253         struct btrfs_fs_info *fs_info = sctx->fs_info;
1254         u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1255         u64 logical = original_sblock->sectors[0]->logical;
1256         u64 generation = original_sblock->sectors[0]->generation;
1257         u64 flags = original_sblock->sectors[0]->flags;
1258         u64 have_csum = original_sblock->sectors[0]->have_csum;
1259         struct scrub_recover *recover;
1260         struct btrfs_io_context *bioc;
1261         u64 sublen;
1262         u64 mapped_length;
1263         u64 stripe_offset;
1264         int stripe_index;
1265         int sector_index = 0;
1266         int mirror_index;
1267         int nmirrors;
1268         int ret;
1269
1270         /*
1271          * Note: the two members refs and outstanding_sectors are not used (and
1272          * not set) in the blocks that are used for the recheck procedure.
1273          */
1274
1275         while (length > 0) {
1276                 sublen = min_t(u64, length, fs_info->sectorsize);
1277                 mapped_length = sublen;
1278                 bioc = NULL;
1279
1280                 /*
1281                  * With a length of sectorsize, each returned stripe represents
1282                  * one mirror
1283                  */
1284                 btrfs_bio_counter_inc_blocked(fs_info);
1285                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1286                                        logical, &mapped_length, &bioc);
1287                 if (ret || !bioc || mapped_length < sublen) {
1288                         btrfs_put_bioc(bioc);
1289                         btrfs_bio_counter_dec(fs_info);
1290                         return -EIO;
1291                 }
1292
1293                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1294                 if (!recover) {
1295                         btrfs_put_bioc(bioc);
1296                         btrfs_bio_counter_dec(fs_info);
1297                         return -ENOMEM;
1298                 }
1299
1300                 refcount_set(&recover->refs, 1);
1301                 recover->bioc = bioc;
1302                 recover->map_length = mapped_length;
1303
1304                 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1305
1306                 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1307
1308                 for (mirror_index = 0; mirror_index < nmirrors;
1309                      mirror_index++) {
1310                         struct scrub_block *sblock;
1311                         struct scrub_sector *sector;
1312
1313                         sblock = sblocks_for_recheck + mirror_index;
1314                         sblock->sctx = sctx;
1315
1316                         sector = kzalloc(sizeof(*sector), GFP_NOFS);
1317                         if (!sector) {
1318 leave_nomem:
1319                                 spin_lock(&sctx->stat_lock);
1320                                 sctx->stat.malloc_errors++;
1321                                 spin_unlock(&sctx->stat_lock);
1322                                 scrub_put_recover(fs_info, recover);
1323                                 return -ENOMEM;
1324                         }
1325                         scrub_sector_get(sector);
1326                         sblock->sectors[sector_index] = sector;
1327                         sector->sblock = sblock;
1328                         sector->flags = flags;
1329                         sector->generation = generation;
1330                         sector->logical = logical;
1331                         sector->have_csum = have_csum;
1332                         if (have_csum)
1333                                 memcpy(sector->csum,
1334                                        original_sblock->sectors[0]->csum,
1335                                        sctx->fs_info->csum_size);
1336
1337                         scrub_stripe_index_and_offset(logical,
1338                                                       bioc->map_type,
1339                                                       bioc->raid_map,
1340                                                       mapped_length,
1341                                                       bioc->num_stripes -
1342                                                       bioc->num_tgtdevs,
1343                                                       mirror_index,
1344                                                       &stripe_index,
1345                                                       &stripe_offset);
1346                         sector->physical = bioc->stripes[stripe_index].physical +
1347                                          stripe_offset;
1348                         sector->dev = bioc->stripes[stripe_index].dev;
1349
1350                         BUG_ON(sector_index >= original_sblock->sector_count);
1351                         sector->physical_for_dev_replace =
1352                                 original_sblock->sectors[sector_index]->
1353                                 physical_for_dev_replace;
1354                         /* For missing devices, dev->bdev is NULL */
1355                         sector->mirror_num = mirror_index + 1;
1356                         sblock->sector_count++;
1357                         sector->page = alloc_page(GFP_NOFS);
1358                         if (!sector->page)
1359                                 goto leave_nomem;
1360
1361                         scrub_get_recover(recover);
1362                         sector->recover = recover;
1363                 }
1364                 scrub_put_recover(fs_info, recover);
1365                 length -= sublen;
1366                 logical += sublen;
1367                 sector_index++;
1368         }
1369
1370         return 0;
1371 }
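
/*
 * Illustrative example for the recheck setup above (hypothetical layout, not
 * taken from this file): on a two-copy RAID1 chunk with no replace target
 * running, btrfs_map_sblock() called with a sectorsize-sized range returns a
 * bioc with two stripes, so nmirrors == 2.  The loop then fills
 * sblocks_for_recheck[0] and sblocks_for_recheck[1], each with one
 * scrub_sector whose physical address comes from the matching bioc->stripes[]
 * entry, i.e. one recheck block per mirror.
 */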
1372
1373 static void scrub_bio_wait_endio(struct bio *bio)
1374 {
1375         complete(bio->bi_private);
1376 }
1377
1378 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1379                                         struct bio *bio,
1380                                         struct scrub_sector *sector)
1381 {
1382         DECLARE_COMPLETION_ONSTACK(done);
1383         int ret;
1384         int mirror_num;
1385
1386         bio->bi_iter.bi_sector = sector->logical >> 9;
1387         bio->bi_private = &done;
1388         bio->bi_end_io = scrub_bio_wait_endio;
1389
1390         mirror_num = sector->sblock->sectors[0]->mirror_num;
1391         ret = raid56_parity_recover(bio, sector->recover->bioc,
1392                                     sector->recover->map_length,
1393                                     mirror_num, 0);
1394         if (ret)
1395                 return ret;
1396
1397         wait_for_completion_io(&done);
1398         return blk_status_to_errno(bio->bi_status);
1399 }
1400
1401 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1402                                           struct scrub_block *sblock)
1403 {
1404         struct scrub_sector *first_sector = sblock->sectors[0];
1405         struct bio *bio;
1406         int i;
1407
1408         /* All sectors in sblock belong to the same stripe on the same device. */
1409         ASSERT(first_sector->dev);
1410         if (!first_sector->dev->bdev)
1411                 goto out;
1412
1413         bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1414
1415         for (i = 0; i < sblock->sector_count; i++) {
1416                 struct scrub_sector *sector = sblock->sectors[i];
1417
1418                 WARN_ON(!sector->page);
1419                 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1420         }
1421
1422         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1423                 bio_put(bio);
1424                 goto out;
1425         }
1426
1427         bio_put(bio);
1428
1429         scrub_recheck_block_checksum(sblock);
1430
1431         return;
1432 out:
1433         for (i = 0; i < sblock->sector_count; i++)
1434                 sblock->sectors[i]->io_error = 1;
1435
1436         sblock->no_io_error_seen = 0;
1437 }
1438
1439 /*
1440  * This function checks the on-disk data for checksum errors, header errors
1441  * and read I/O errors. If any I/O error happens, the exact sectors on which
1442  * it occurred are marked as bad. The goal is to enable scrub to take the
1443  * sectors that are not errored from all the mirrors so that the sectors that
1444  * are errored in the just handled mirror can be repaired.
1445  */
1446 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1447                                 struct scrub_block *sblock,
1448                                 int retry_failed_mirror)
1449 {
1450         int i;
1451
1452         sblock->no_io_error_seen = 1;
1453
1454         /* Shortcut for RAID56 */
1455         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1456                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1457
1458         for (i = 0; i < sblock->sector_count; i++) {
1459                 struct scrub_sector *sector = sblock->sectors[i];
1460                 struct bio bio;
1461                 struct bio_vec bvec;
1462
1463                 if (sector->dev->bdev == NULL) {
1464                         sector->io_error = 1;
1465                         sblock->no_io_error_seen = 0;
1466                         continue;
1467                 }
1468
1469                 WARN_ON(!sector->page);
1470                 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1471                 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1472                 bio.bi_iter.bi_sector = sector->physical >> 9;
1473
1474                 btrfsic_check_bio(&bio);
1475                 if (submit_bio_wait(&bio)) {
1476                         sector->io_error = 1;
1477                         sblock->no_io_error_seen = 0;
1478                 }
1479
1480                 bio_uninit(&bio);
1481         }
1482
1483         if (sblock->no_io_error_seen)
1484                 scrub_recheck_block_checksum(sblock);
1485 }
1486
1487 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1488 {
1489         struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1490         int ret;
1491
1492         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1493         return !ret;
1494 }
1495
1496 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1497 {
1498         sblock->header_error = 0;
1499         sblock->checksum_error = 0;
1500         sblock->generation_error = 0;
1501
1502         if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1503                 scrub_checksum_data(sblock);
1504         else
1505                 scrub_checksum_tree_block(sblock);
1506 }
1507
1508 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1509                                              struct scrub_block *sblock_good)
1510 {
1511         int i;
1512         int ret = 0;
1513
1514         for (i = 0; i < sblock_bad->sector_count; i++) {
1515                 int ret_sub;
1516
1517                 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1518                                                              sblock_good, i, 1);
1519                 if (ret_sub)
1520                         ret = ret_sub;
1521         }
1522
1523         return ret;
1524 }
1525
1526 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1527                                               struct scrub_block *sblock_good,
1528                                               int sector_num, int force_write)
1529 {
1530         struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1531         struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1532         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1533         const u32 sectorsize = fs_info->sectorsize;
1534
1535         BUG_ON(sector_bad->page == NULL);
1536         BUG_ON(sector_good->page == NULL);
1537         if (force_write || sblock_bad->header_error ||
1538             sblock_bad->checksum_error || sector_bad->io_error) {
1539                 struct bio bio;
1540                 struct bio_vec bvec;
1541                 int ret;
1542
1543                 if (!sector_bad->dev->bdev) {
1544                         btrfs_warn_rl(fs_info,
1545                                 "scrub_repair_sector_from_good_copy(bdev == NULL) is unexpected");
1546                         return -EIO;
1547                 }
1548
1549                 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1550                 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1551                 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1552
1553                 btrfsic_check_bio(&bio);
1554                 ret = submit_bio_wait(&bio);
1555                 bio_uninit(&bio);
1556
1557                 if (ret) {
1558                         btrfs_dev_stat_inc_and_print(sector_bad->dev,
1559                                 BTRFS_DEV_STAT_WRITE_ERRS);
1560                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1561                         return -EIO;
1562                 }
1563         }
1564
1565         return 0;
1566 }
1567
1568 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1569 {
1570         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1571         int i;
1572
1573         /*
1574          * This block is used to check the parity on the source device, so
1575          * the data does not need to be written to the destination device.
1576          */
1577         if (sblock->sparity)
1578                 return;
1579
1580         for (i = 0; i < sblock->sector_count; i++) {
1581                 int ret;
1582
1583                 ret = scrub_write_sector_to_dev_replace(sblock, i);
1584                 if (ret)
1585                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1586         }
1587 }
1588
1589 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1590 {
1591         struct scrub_sector *sector = sblock->sectors[sector_num];
1592
1593         BUG_ON(sector->page == NULL);
1594         if (sector->io_error)
1595                 clear_page(page_address(sector->page));
1596
1597         return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1598 }
1599
1600 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1601 {
1602         int ret = 0;
1603         u64 length;
1604
1605         if (!btrfs_is_zoned(sctx->fs_info))
1606                 return 0;
1607
1608         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1609                 return 0;
1610
1611         if (sctx->write_pointer < physical) {
1612                 length = physical - sctx->write_pointer;
1613
1614                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1615                                                 sctx->write_pointer, length);
1616                 if (!ret)
1617                         sctx->write_pointer = physical;
1618         }
1619         return ret;
1620 }
1621
1622 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1623                                       struct scrub_sector *sector)
1624 {
1625         struct scrub_bio *sbio;
1626         int ret;
1627         const u32 sectorsize = sctx->fs_info->sectorsize;
1628
1629         mutex_lock(&sctx->wr_lock);
1630 again:
1631         if (!sctx->wr_curr_bio) {
1632                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1633                                               GFP_KERNEL);
1634                 if (!sctx->wr_curr_bio) {
1635                         mutex_unlock(&sctx->wr_lock);
1636                         return -ENOMEM;
1637                 }
1638                 sctx->wr_curr_bio->sctx = sctx;
1639                 sctx->wr_curr_bio->sector_count = 0;
1640         }
1641         sbio = sctx->wr_curr_bio;
1642         if (sbio->sector_count == 0) {
1643                 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1644                 if (ret) {
1645                         mutex_unlock(&sctx->wr_lock);
1646                         return ret;
1647                 }
1648
1649                 sbio->physical = sector->physical_for_dev_replace;
1650                 sbio->logical = sector->logical;
1651                 sbio->dev = sctx->wr_tgtdev;
1652                 if (!sbio->bio) {
1653                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1654                                               REQ_OP_WRITE, GFP_NOFS);
1655                 }
1656                 sbio->bio->bi_private = sbio;
1657                 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1658                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1659                 sbio->status = 0;
1660         } else if (sbio->physical + sbio->sector_count * sectorsize !=
1661                    sector->physical_for_dev_replace ||
1662                    sbio->logical + sbio->sector_count * sectorsize !=
1663                    sector->logical) {
1664                 scrub_wr_submit(sctx);
1665                 goto again;
1666         }
1667
1668         ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1669         if (ret != sectorsize) {
1670                 if (sbio->sector_count < 1) {
1671                         bio_put(sbio->bio);
1672                         sbio->bio = NULL;
1673                         mutex_unlock(&sctx->wr_lock);
1674                         return -EIO;
1675                 }
1676                 scrub_wr_submit(sctx);
1677                 goto again;
1678         }
1679
1680         sbio->sectors[sbio->sector_count] = sector;
1681         scrub_sector_get(sector);
1682         sbio->sector_count++;
1683         if (sbio->sector_count == sctx->sectors_per_bio)
1684                 scrub_wr_submit(sctx);
1685         mutex_unlock(&sctx->wr_lock);
1686
1687         return 0;
1688 }
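
/*
 * A sketch of the contiguity check in scrub_add_sector_to_wr_bio() above
 * (same names as in the function): a sector is appended to wr_curr_bio only
 * if
 *
 *	sector->physical_for_dev_replace ==
 *		sbio->physical + sbio->sector_count * sectorsize &&
 *	sector->logical == sbio->logical + sbio->sector_count * sectorsize
 *
 * i.e. it directly continues both the physical and the logical range already
 * queued; otherwise the current write bio is submitted and a fresh one is
 * started via the "goto again" path.
 */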
1689
1690 static void scrub_wr_submit(struct scrub_ctx *sctx)
1691 {
1692         struct scrub_bio *sbio;
1693
1694         if (!sctx->wr_curr_bio)
1695                 return;
1696
1697         sbio = sctx->wr_curr_bio;
1698         sctx->wr_curr_bio = NULL;
1699         scrub_pending_bio_inc(sctx);
1700         /* Process all writes in a single worker thread so that the block layer
1701          * can order the requests before sending them to the driver, which
1702          * doubled the write performance on spinning disks when measured
1703          * with Linux 3.5. */
1704         btrfsic_check_bio(sbio->bio);
1705         submit_bio(sbio->bio);
1706
1707         if (btrfs_is_zoned(sctx->fs_info))
1708                 sctx->write_pointer = sbio->physical + sbio->sector_count *
1709                         sctx->fs_info->sectorsize;
1710 }
1711
1712 static void scrub_wr_bio_end_io(struct bio *bio)
1713 {
1714         struct scrub_bio *sbio = bio->bi_private;
1715         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1716
1717         sbio->status = bio->bi_status;
1718         sbio->bio = bio;
1719
1720         INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1721         queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1722 }
1723
1724 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1725 {
1726         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1727         struct scrub_ctx *sctx = sbio->sctx;
1728         int i;
1729
1730         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1731         if (sbio->status) {
1732                 struct btrfs_dev_replace *dev_replace =
1733                         &sbio->sctx->fs_info->dev_replace;
1734
1735                 for (i = 0; i < sbio->sector_count; i++) {
1736                         struct scrub_sector *sector = sbio->sectors[i];
1737
1738                         sector->io_error = 1;
1739                         atomic64_inc(&dev_replace->num_write_errors);
1740                 }
1741         }
1742
1743         for (i = 0; i < sbio->sector_count; i++)
1744                 scrub_sector_put(sbio->sectors[i]);
1745
1746         bio_put(sbio->bio);
1747         kfree(sbio);
1748         scrub_pending_bio_dec(sctx);
1749 }
1750
1751 static int scrub_checksum(struct scrub_block *sblock)
1752 {
1753         u64 flags;
1754         int ret;
1755
1756         /*
1757          * No need to initialize these stats currently,
1758          * because this function only uses the return value
1759          * instead of these stat values.
1760          *
1761          * Todo:
1762          * always use the stats
1763          */
1764         sblock->header_error = 0;
1765         sblock->generation_error = 0;
1766         sblock->checksum_error = 0;
1767
1768         WARN_ON(sblock->sector_count < 1);
1769         flags = sblock->sectors[0]->flags;
1770         ret = 0;
1771         if (flags & BTRFS_EXTENT_FLAG_DATA)
1772                 ret = scrub_checksum_data(sblock);
1773         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1774                 ret = scrub_checksum_tree_block(sblock);
1775         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1776                 (void)scrub_checksum_super(sblock);
1777         else
1778                 WARN_ON(1);
1779         if (ret)
1780                 scrub_handle_errored_block(sblock);
1781
1782         return ret;
1783 }
1784
1785 static int scrub_checksum_data(struct scrub_block *sblock)
1786 {
1787         struct scrub_ctx *sctx = sblock->sctx;
1788         struct btrfs_fs_info *fs_info = sctx->fs_info;
1789         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1790         u8 csum[BTRFS_CSUM_SIZE];
1791         struct scrub_sector *sector;
1792         char *kaddr;
1793
1794         BUG_ON(sblock->sector_count < 1);
1795         sector = sblock->sectors[0];
1796         if (!sector->have_csum)
1797                 return 0;
1798
1799         kaddr = page_address(sector->page);
1800
1801         shash->tfm = fs_info->csum_shash;
1802         crypto_shash_init(shash);
1803
1804         /*
1805          * In scrub_sectors() and scrub_sectors_for_parity() we ensure that each
1806          * scrub_sector holds exactly one sector of data in its page.
1807          */
1808         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1809
1810         if (memcmp(csum, sector->csum, fs_info->csum_size))
1811                 sblock->checksum_error = 1;
1812         return sblock->checksum_error;
1813 }
1814
1815 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1816 {
1817         struct scrub_ctx *sctx = sblock->sctx;
1818         struct btrfs_header *h;
1819         struct btrfs_fs_info *fs_info = sctx->fs_info;
1820         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1821         u8 calculated_csum[BTRFS_CSUM_SIZE];
1822         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1823         /*
1824          * This is done in sectorsize steps even for metadata as there's a
1825          * constraint for nodesize to be aligned to sectorsize. This will need
1826          * to change so we don't misuse data and metadata units like that.
1827          */
1828         const u32 sectorsize = sctx->fs_info->sectorsize;
1829         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1830         int i;
1831         struct scrub_sector *sector;
1832         char *kaddr;
1833
1834         BUG_ON(sblock->sector_count < 1);
1835
1836         /* Each member in sectors is just one sector */
1837         ASSERT(sblock->sector_count == num_sectors);
1838
1839         sector = sblock->sectors[0];
1840         kaddr = page_address(sector->page);
1841         h = (struct btrfs_header *)kaddr;
1842         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1843
1844         /*
1845          * we don't use the getter functions here, as we
1846          * a) don't have an extent buffer and
1847          * b) the page is already kmapped
1848          */
1849         if (sector->logical != btrfs_stack_header_bytenr(h))
1850                 sblock->header_error = 1;
1851
1852         if (sector->generation != btrfs_stack_header_generation(h)) {
1853                 sblock->header_error = 1;
1854                 sblock->generation_error = 1;
1855         }
1856
1857         if (!scrub_check_fsid(h->fsid, sector))
1858                 sblock->header_error = 1;
1859
1860         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1861                    BTRFS_UUID_SIZE))
1862                 sblock->header_error = 1;
1863
1864         shash->tfm = fs_info->csum_shash;
1865         crypto_shash_init(shash);
1866         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1867                             sectorsize - BTRFS_CSUM_SIZE);
1868
1869         for (i = 1; i < num_sectors; i++) {
1870                 kaddr = page_address(sblock->sectors[i]->page);
1871                 crypto_shash_update(shash, kaddr, sectorsize);
1872         }
1873
1874         crypto_shash_final(shash, calculated_csum);
1875         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1876                 sblock->checksum_error = 1;
1877
1878         return sblock->header_error || sblock->checksum_error;
1879 }
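
/*
 * Worked example for the tree block checksum above (assumed geometry, not
 * mandated by this file): with nodesize == 16KiB and sectorsize == 4KiB,
 * num_sectors is 4 and the digest covers
 *
 *	(sectorsize - BTRFS_CSUM_SIZE) + 3 * sectorsize
 *	= nodesize - BTRFS_CSUM_SIZE
 *
 * bytes: the first update skips the on-disk csum stored at the start of the
 * header, the remaining three sectors are hashed in full, which matches how
 * the checksum was generated at write time.
 */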
1880
1881 static int scrub_checksum_super(struct scrub_block *sblock)
1882 {
1883         struct btrfs_super_block *s;
1884         struct scrub_ctx *sctx = sblock->sctx;
1885         struct btrfs_fs_info *fs_info = sctx->fs_info;
1886         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1887         u8 calculated_csum[BTRFS_CSUM_SIZE];
1888         struct scrub_sector *sector;
1889         char *kaddr;
1890         int fail_gen = 0;
1891         int fail_cor = 0;
1892
1893         BUG_ON(sblock->sector_count < 1);
1894         sector = sblock->sectors[0];
1895         kaddr = page_address(sector->page);
1896         s = (struct btrfs_super_block *)kaddr;
1897
1898         if (sector->logical != btrfs_super_bytenr(s))
1899                 ++fail_cor;
1900
1901         if (sector->generation != btrfs_super_generation(s))
1902                 ++fail_gen;
1903
1904         if (!scrub_check_fsid(s->fsid, sector))
1905                 ++fail_cor;
1906
1907         shash->tfm = fs_info->csum_shash;
1908         crypto_shash_init(shash);
1909         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1910                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1911
1912         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1913                 ++fail_cor;
1914
1915         if (fail_cor + fail_gen) {
1916                 /*
1917                  * If we find an error in a super block, we only report it.
1918                  * Super blocks get rewritten with the next transaction
1919                  * commit anyway.
1920                  */
1921                 spin_lock(&sctx->stat_lock);
1922                 ++sctx->stat.super_errors;
1923                 spin_unlock(&sctx->stat_lock);
1924                 if (fail_cor)
1925                         btrfs_dev_stat_inc_and_print(sector->dev,
1926                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1927                 else
1928                         btrfs_dev_stat_inc_and_print(sector->dev,
1929                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1930         }
1931
1932         return fail_cor + fail_gen;
1933 }
1934
1935 static void scrub_block_get(struct scrub_block *sblock)
1936 {
1937         refcount_inc(&sblock->refs);
1938 }
1939
1940 static void scrub_block_put(struct scrub_block *sblock)
1941 {
1942         if (refcount_dec_and_test(&sblock->refs)) {
1943                 int i;
1944
1945                 if (sblock->sparity)
1946                         scrub_parity_put(sblock->sparity);
1947
1948                 for (i = 0; i < sblock->sector_count; i++)
1949                         scrub_sector_put(sblock->sectors[i]);
1950                 kfree(sblock);
1951         }
1952 }
1953
1954 static void scrub_sector_get(struct scrub_sector *sector)
1955 {
1956         atomic_inc(&sector->refs);
1957 }
1958
1959 static void scrub_sector_put(struct scrub_sector *sector)
1960 {
1961         if (atomic_dec_and_test(&sector->refs)) {
1962                 if (sector->page)
1963                         __free_page(sector->page);
1964                 kfree(sector);
1965         }
1966 }
1967
1968 /*
1969  * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
1970  * second.  Limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1971  */
1972 static void scrub_throttle(struct scrub_ctx *sctx)
1973 {
1974         const int time_slice = 1000;
1975         struct scrub_bio *sbio;
1976         struct btrfs_device *device;
1977         s64 delta;
1978         ktime_t now;
1979         u32 div;
1980         u64 bwlimit;
1981
1982         sbio = sctx->bios[sctx->curr];
1983         device = sbio->dev;
1984         bwlimit = READ_ONCE(device->scrub_speed_max);
1985         if (bwlimit == 0)
1986                 return;
1987
1988         /*
1989          * The slice is divided into intervals as the IO is submitted; the
1990          * interval count is derived from bwlimit and capped at 64.
1991          */
1992         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1993         div = min_t(u32, 64, div);
1994
1995         /* Start new epoch, set deadline */
1996         now = ktime_get();
1997         if (sctx->throttle_deadline == 0) {
1998                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1999                 sctx->throttle_sent = 0;
2000         }
2001
2002         /* Still within the current time slice? */
2003         if (ktime_before(now, sctx->throttle_deadline)) {
2004                 /* If current bio is within the limit, send it */
2005                 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2006                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2007                         return;
2008
2009                 /* We're over the limit, sleep until the end of the slice */
2010                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2011         } else {
2012                 /* New request after deadline, start new epoch */
2013                 delta = 0;
2014         }
2015
2016         if (delta) {
2017                 long timeout;
2018
2019                 timeout = div_u64(delta * HZ, 1000);
2020                 schedule_timeout_interruptible(timeout);
2021         }
2022
2023         /* Next call will start the deadline period */
2024         sctx->throttle_deadline = 0;
2025 }
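
/*
 * Worked example of the throttling math above (the bwlimit value is assumed):
 * with scrub_speed_max set to 100MiB/s,
 *
 *	div = min(64, max(1, 100MiB / 16MiB)) = 6
 *
 * so each epoch lasts 1000 / 6 ms (roughly 166ms) and up to
 * div_u64(bwlimit, div) (roughly 16.7MiB) may be submitted within it.  Once
 * throttle_sent exceeds that budget, the thread sleeps for the remainder of
 * the epoch before submitting the next bio.
 */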
2026
2027 static void scrub_submit(struct scrub_ctx *sctx)
2028 {
2029         struct scrub_bio *sbio;
2030
2031         if (sctx->curr == -1)
2032                 return;
2033
2034         scrub_throttle(sctx);
2035
2036         sbio = sctx->bios[sctx->curr];
2037         sctx->curr = -1;
2038         scrub_pending_bio_inc(sctx);
2039         btrfsic_check_bio(sbio->bio);
2040         submit_bio(sbio->bio);
2041 }
2042
2043 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2044                                       struct scrub_sector *sector)
2045 {
2046         struct scrub_block *sblock = sector->sblock;
2047         struct scrub_bio *sbio;
2048         const u32 sectorsize = sctx->fs_info->sectorsize;
2049         int ret;
2050
2051 again:
2052         /*
2053          * grab a fresh bio or wait for one to become available
2054          */
2055         while (sctx->curr == -1) {
2056                 spin_lock(&sctx->list_lock);
2057                 sctx->curr = sctx->first_free;
2058                 if (sctx->curr != -1) {
2059                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2060                         sctx->bios[sctx->curr]->next_free = -1;
2061                         sctx->bios[sctx->curr]->sector_count = 0;
2062                         spin_unlock(&sctx->list_lock);
2063                 } else {
2064                         spin_unlock(&sctx->list_lock);
2065                         wait_event(sctx->list_wait, sctx->first_free != -1);
2066                 }
2067         }
2068         sbio = sctx->bios[sctx->curr];
2069         if (sbio->sector_count == 0) {
2070                 sbio->physical = sector->physical;
2071                 sbio->logical = sector->logical;
2072                 sbio->dev = sector->dev;
2073                 if (!sbio->bio) {
2074                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2075                                               REQ_OP_READ, GFP_NOFS);
2076                 }
2077                 sbio->bio->bi_private = sbio;
2078                 sbio->bio->bi_end_io = scrub_bio_end_io;
2079                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2080                 sbio->status = 0;
2081         } else if (sbio->physical + sbio->sector_count * sectorsize !=
2082                    sector->physical ||
2083                    sbio->logical + sbio->sector_count * sectorsize !=
2084                    sector->logical ||
2085                    sbio->dev != sector->dev) {
2086                 scrub_submit(sctx);
2087                 goto again;
2088         }
2089
2090         sbio->sectors[sbio->sector_count] = sector;
2091         ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2092         if (ret != sectorsize) {
2093                 if (sbio->sector_count < 1) {
2094                         bio_put(sbio->bio);
2095                         sbio->bio = NULL;
2096                         return -EIO;
2097                 }
2098                 scrub_submit(sctx);
2099                 goto again;
2100         }
2101
2102         scrub_block_get(sblock); /* one for the page added to the bio */
2103         atomic_inc(&sblock->outstanding_sectors);
2104         sbio->sector_count++;
2105         if (sbio->sector_count == sctx->sectors_per_bio)
2106                 scrub_submit(sctx);
2107
2108         return 0;
2109 }
2110
2111 static void scrub_missing_raid56_end_io(struct bio *bio)
2112 {
2113         struct scrub_block *sblock = bio->bi_private;
2114         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2115
2116         if (bio->bi_status)
2117                 sblock->no_io_error_seen = 0;
2118
2119         bio_put(bio);
2120
2121         queue_work(fs_info->scrub_workers, &sblock->work);
2122 }
2123
2124 static void scrub_missing_raid56_worker(struct work_struct *work)
2125 {
2126         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2127         struct scrub_ctx *sctx = sblock->sctx;
2128         struct btrfs_fs_info *fs_info = sctx->fs_info;
2129         u64 logical;
2130         struct btrfs_device *dev;
2131
2132         logical = sblock->sectors[0]->logical;
2133         dev = sblock->sectors[0]->dev;
2134
2135         if (sblock->no_io_error_seen)
2136                 scrub_recheck_block_checksum(sblock);
2137
2138         if (!sblock->no_io_error_seen) {
2139                 spin_lock(&sctx->stat_lock);
2140                 sctx->stat.read_errors++;
2141                 spin_unlock(&sctx->stat_lock);
2142                 btrfs_err_rl_in_rcu(fs_info,
2143                         "IO error rebuilding logical %llu for dev %s",
2144                         logical, rcu_str_deref(dev->name));
2145         } else if (sblock->header_error || sblock->checksum_error) {
2146                 spin_lock(&sctx->stat_lock);
2147                 sctx->stat.uncorrectable_errors++;
2148                 spin_unlock(&sctx->stat_lock);
2149                 btrfs_err_rl_in_rcu(fs_info,
2150                         "failed to rebuild valid logical %llu for dev %s",
2151                         logical, rcu_str_deref(dev->name));
2152         } else {
2153                 scrub_write_block_to_dev_replace(sblock);
2154         }
2155
2156         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2157                 mutex_lock(&sctx->wr_lock);
2158                 scrub_wr_submit(sctx);
2159                 mutex_unlock(&sctx->wr_lock);
2160         }
2161
2162         scrub_block_put(sblock);
2163         scrub_pending_bio_dec(sctx);
2164 }
2165
2166 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2167 {
2168         struct scrub_ctx *sctx = sblock->sctx;
2169         struct btrfs_fs_info *fs_info = sctx->fs_info;
2170         u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2171         u64 logical = sblock->sectors[0]->logical;
2172         struct btrfs_io_context *bioc = NULL;
2173         struct bio *bio;
2174         struct btrfs_raid_bio *rbio;
2175         int ret;
2176         int i;
2177
2178         btrfs_bio_counter_inc_blocked(fs_info);
2179         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2180                                &length, &bioc);
2181         if (ret || !bioc || !bioc->raid_map)
2182                 goto bioc_out;
2183
2184         if (WARN_ON(!sctx->is_dev_replace ||
2185                     !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2186                 /*
2187                  * We shouldn't be scrubbing a missing device. Even for dev
2188                  * replace, we should only get here for RAID 5/6. We either
2189                  * managed to mount something with no mirrors remaining or
2190                  * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2191                  */
2192                 goto bioc_out;
2193         }
2194
2195         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2196         bio->bi_iter.bi_sector = logical >> 9;
2197         bio->bi_private = sblock;
2198         bio->bi_end_io = scrub_missing_raid56_end_io;
2199
2200         rbio = raid56_alloc_missing_rbio(bio, bioc, length);
2201         if (!rbio)
2202                 goto rbio_out;
2203
2204         for (i = 0; i < sblock->sector_count; i++) {
2205                 struct scrub_sector *sector = sblock->sectors[i];
2206
2207                 /*
2208                  * For now, our scrub is still one page per sector, so pgoff
2209                  * is always 0.
2210                  */
2211                 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2212         }
2213
2214         INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2215         scrub_block_get(sblock);
2216         scrub_pending_bio_inc(sctx);
2217         raid56_submit_missing_rbio(rbio);
2218         return;
2219
2220 rbio_out:
2221         bio_put(bio);
2222 bioc_out:
2223         btrfs_bio_counter_dec(fs_info);
2224         btrfs_put_bioc(bioc);
2225         spin_lock(&sctx->stat_lock);
2226         sctx->stat.malloc_errors++;
2227         spin_unlock(&sctx->stat_lock);
2228 }
2229
2230 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2231                        u64 physical, struct btrfs_device *dev, u64 flags,
2232                        u64 gen, int mirror_num, u8 *csum,
2233                        u64 physical_for_dev_replace)
2234 {
2235         struct scrub_block *sblock;
2236         const u32 sectorsize = sctx->fs_info->sectorsize;
2237         int index;
2238
2239         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2240         if (!sblock) {
2241                 spin_lock(&sctx->stat_lock);
2242                 sctx->stat.malloc_errors++;
2243                 spin_unlock(&sctx->stat_lock);
2244                 return -ENOMEM;
2245         }
2246
2247         /* one ref inside this function, plus one for each page added to
2248          * a bio later on */
2249         refcount_set(&sblock->refs, 1);
2250         sblock->sctx = sctx;
2251         sblock->no_io_error_seen = 1;
2252
2253         for (index = 0; len > 0; index++) {
2254                 struct scrub_sector *sector;
2255                 /*
2256                  * Here we will allocate one page for one sector to scrub.
2257                  * This is fine if PAGE_SIZE == sectorsize, but will cost
2258                  * more memory for PAGE_SIZE > sectorsize case.
2259                  */
2260                 u32 l = min(sectorsize, len);
2261
2262                 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2263                 if (!sector) {
2264 leave_nomem:
2265                         spin_lock(&sctx->stat_lock);
2266                         sctx->stat.malloc_errors++;
2267                         spin_unlock(&sctx->stat_lock);
2268                         scrub_block_put(sblock);
2269                         return -ENOMEM;
2270                 }
2271                 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2272                 scrub_sector_get(sector);
2273                 sblock->sectors[index] = sector;
2274                 sector->sblock = sblock;
2275                 sector->dev = dev;
2276                 sector->flags = flags;
2277                 sector->generation = gen;
2278                 sector->logical = logical;
2279                 sector->physical = physical;
2280                 sector->physical_for_dev_replace = physical_for_dev_replace;
2281                 sector->mirror_num = mirror_num;
2282                 if (csum) {
2283                         sector->have_csum = 1;
2284                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2285                 } else {
2286                         sector->have_csum = 0;
2287                 }
2288                 sblock->sector_count++;
2289                 sector->page = alloc_page(GFP_KERNEL);
2290                 if (!sector->page)
2291                         goto leave_nomem;
2292                 len -= l;
2293                 logical += l;
2294                 physical += l;
2295                 physical_for_dev_replace += l;
2296         }
2297
2298         WARN_ON(sblock->sector_count == 0);
2299         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2300                 /*
2301                  * This case should only be hit for RAID 5/6 device replace. See
2302                  * the comment in scrub_missing_raid56_pages() for details.
2303                  */
2304                 scrub_missing_raid56_pages(sblock);
2305         } else {
2306                 for (index = 0; index < sblock->sector_count; index++) {
2307                         struct scrub_sector *sector = sblock->sectors[index];
2308                         int ret;
2309
2310                         ret = scrub_add_sector_to_rd_bio(sctx, sector);
2311                         if (ret) {
2312                                 scrub_block_put(sblock);
2313                                 return ret;
2314                         }
2315                 }
2316
2317                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2318                         scrub_submit(sctx);
2319         }
2320
2321         /* Last one frees, either here or in bio completion for the last sector */
2322         scrub_block_put(sblock);
2323         return 0;
2324 }
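
/*
 * Illustrative layout produced by scrub_sectors() (assumed sizes): scrubbing
 * a 16KiB tree block with sectorsize == 4KiB creates one scrub_block with
 * sector_count == 4, each scrub_sector getting its own page while the
 * logical, physical and physical_for_dev_replace offsets advance in 4KiB
 * steps.  The sectors are then queued through scrub_add_sector_to_rd_bio()
 * and the block completes once its outstanding_sectors count drops to zero.
 */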
2325
2326 static void scrub_bio_end_io(struct bio *bio)
2327 {
2328         struct scrub_bio *sbio = bio->bi_private;
2329         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2330
2331         sbio->status = bio->bi_status;
2332         sbio->bio = bio;
2333
2334         queue_work(fs_info->scrub_workers, &sbio->work);
2335 }
2336
2337 static void scrub_bio_end_io_worker(struct work_struct *work)
2338 {
2339         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2340         struct scrub_ctx *sctx = sbio->sctx;
2341         int i;
2342
2343         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2344         if (sbio->status) {
2345                 for (i = 0; i < sbio->sector_count; i++) {
2346                         struct scrub_sector *sector = sbio->sectors[i];
2347
2348                         sector->io_error = 1;
2349                         sector->sblock->no_io_error_seen = 0;
2350                 }
2351         }
2352
2353         /* Now complete the scrub_block items that have all sectors completed */
2354         for (i = 0; i < sbio->sector_count; i++) {
2355                 struct scrub_sector *sector = sbio->sectors[i];
2356                 struct scrub_block *sblock = sector->sblock;
2357
2358                 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2359                         scrub_block_complete(sblock);
2360                 scrub_block_put(sblock);
2361         }
2362
2363         bio_put(sbio->bio);
2364         sbio->bio = NULL;
2365         spin_lock(&sctx->list_lock);
2366         sbio->next_free = sctx->first_free;
2367         sctx->first_free = sbio->index;
2368         spin_unlock(&sctx->list_lock);
2369
2370         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2371                 mutex_lock(&sctx->wr_lock);
2372                 scrub_wr_submit(sctx);
2373                 mutex_unlock(&sctx->wr_lock);
2374         }
2375
2376         scrub_pending_bio_dec(sctx);
2377 }
2378
2379 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2380                                        unsigned long *bitmap,
2381                                        u64 start, u32 len)
2382 {
2383         u64 offset;
2384         u32 nsectors;
2385         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2386
2387         if (len >= sparity->stripe_len) {
2388                 bitmap_set(bitmap, 0, sparity->nsectors);
2389                 return;
2390         }
2391
2392         start -= sparity->logic_start;
2393         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2394         offset = offset >> sectorsize_bits;
2395         nsectors = len >> sectorsize_bits;
2396
2397         if (offset + nsectors <= sparity->nsectors) {
2398                 bitmap_set(bitmap, offset, nsectors);
2399                 return;
2400         }
2401
2402         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2403         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2404 }
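
/*
 * Worked example for the wrap-around case above (assumed geometry): with
 * stripe_len == 64KiB and sectorsize == 4KiB there are 16 bits per stripe.
 * Marking a 32KiB range that starts 48KiB into the stripe gives offset == 12
 * and nsectors == 8, so the first bitmap_set() sets bits 12-15 and the
 * remaining four bits wrap around to bits 0-3 via the second one.
 */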
2405
2406 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2407                                                    u64 start, u32 len)
2408 {
2409         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2410 }
2411
2412 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2413                                                   u64 start, u32 len)
2414 {
2415         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2416 }
2417
2418 static void scrub_block_complete(struct scrub_block *sblock)
2419 {
2420         int corrupted = 0;
2421
2422         if (!sblock->no_io_error_seen) {
2423                 corrupted = 1;
2424                 scrub_handle_errored_block(sblock);
2425         } else {
2426                 /*
2427                  * In the dev-replace case, if the block has a checksum
2428                  * error it is written via the repair mechanism, otherwise
2429                  * it is written here.
2430                  */
2431                 corrupted = scrub_checksum(sblock);
2432                 if (!corrupted && sblock->sctx->is_dev_replace)
2433                         scrub_write_block_to_dev_replace(sblock);
2434         }
2435
2436         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2437                 u64 start = sblock->sectors[0]->logical;
2438                 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2439                           sblock->sctx->fs_info->sectorsize;
2440
2441                 ASSERT(end - start <= U32_MAX);
2442                 scrub_parity_mark_sectors_error(sblock->sparity,
2443                                                 start, end - start);
2444         }
2445 }
2446
2447 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2448 {
2449         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2450         list_del(&sum->list);
2451         kfree(sum);
2452 }
2453
2454 /*
2455  * Find the desired csum for range [logical, logical + sectorsize), and store
2456  * the csum into @csum.
2457  *
2458  * The search source is sctx->csum_list, which is a pre-populated list
2459  * storing bytenr ordered csum ranges.  We're responsible for cleaning up any
2460  * range that is before @logical.
2461  *
2462  * Return 0 if there is no csum for the range.
2463  * Return 1 if there is a csum for the range and it was copied to @csum.
2464  */
2465 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2466 {
2467         bool found = false;
2468
2469         while (!list_empty(&sctx->csum_list)) {
2470                 struct btrfs_ordered_sum *sum = NULL;
2471                 unsigned long index;
2472                 unsigned long num_sectors;
2473
2474                 sum = list_first_entry(&sctx->csum_list,
2475                                        struct btrfs_ordered_sum, list);
2476                 /* The current csum range is beyond our range, no csum found */
2477                 if (sum->bytenr > logical)
2478                         break;
2479
2480                 /*
2481                  * The current sum is before our bytenr, since scrub is always
2482                  * done in bytenr order, the csum will never be used anymore,
2483                  * clean it up so that later calls won't bother with the range,
2484                  * and continue searching the next range.
2485                  */
2486                 if (sum->bytenr + sum->len <= logical) {
2487                         drop_csum_range(sctx, sum);
2488                         continue;
2489                 }
2490
2491                 /* Now the csum range covers our bytenr, copy the csum */
2492                 found = true;
2493                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2494                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2495
2496                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2497                        sctx->fs_info->csum_size);
2498
2499                 /* Cleanup the range if we're at the end of the csum range */
2500                 if (index == num_sectors - 1)
2501                         drop_csum_range(sctx, sum);
2502                 break;
2503         }
2504         if (!found)
2505                 return 0;
2506         return 1;
2507 }
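
/*
 * Example lookup (hypothetical numbers): with a btrfs_ordered_sum covering
 * bytenr == 1MiB, len == 64KiB and sectorsize == 4KiB, a call for
 * logical == 1MiB + 8KiB lands inside that range, so index == 2 and the
 * copied csum starts at sum->sums + 2 * csum_size.  num_sectors is 16, so
 * the entry is dropped from csum_list once the csum for the last sector
 * (index == 15) has been consumed, or once a later lookup has moved past
 * the whole range.
 */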
2508
2509 /* Scrub one extent, batching sectors into bios of up to sctx->sectors_per_bio sectors */
2510 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2511                         u64 logical, u32 len,
2512                         u64 physical, struct btrfs_device *dev, u64 flags,
2513                         u64 gen, int mirror_num)
2514 {
2515         struct btrfs_device *src_dev = dev;
2516         u64 src_physical = physical;
2517         int src_mirror = mirror_num;
2518         int ret;
2519         u8 csum[BTRFS_CSUM_SIZE];
2520         u32 blocksize;
2521
2522         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2523                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2524                         blocksize = map->stripe_len;
2525                 else
2526                         blocksize = sctx->fs_info->sectorsize;
2527                 spin_lock(&sctx->stat_lock);
2528                 sctx->stat.data_extents_scrubbed++;
2529                 sctx->stat.data_bytes_scrubbed += len;
2530                 spin_unlock(&sctx->stat_lock);
2531         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2532                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2533                         blocksize = map->stripe_len;
2534                 else
2535                         blocksize = sctx->fs_info->nodesize;
2536                 spin_lock(&sctx->stat_lock);
2537                 sctx->stat.tree_extents_scrubbed++;
2538                 sctx->stat.tree_bytes_scrubbed += len;
2539                 spin_unlock(&sctx->stat_lock);
2540         } else {
2541                 blocksize = sctx->fs_info->sectorsize;
2542                 WARN_ON(1);
2543         }
2544
2545         /*
2546          * In the dev-replace case, @dev can be a missing device.
2547          * Regular scrub avoids running on a missing device at all,
2548          * as that would trigger tons of read errors.
2549          *
2550          * Reading from a missing device would only cause the read error
2551          * counts to increase unnecessarily.
2552          * So here we change the read source to a good mirror.
2553          */
2554         if (sctx->is_dev_replace && !dev->bdev)
2555                 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2556                                      &src_dev, &src_mirror);
2557         while (len) {
2558                 u32 l = min(len, blocksize);
2559                 int have_csum = 0;
2560
2561                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2562                         /* push csums to sbio */
2563                         have_csum = scrub_find_csum(sctx, logical, csum);
2564                         if (have_csum == 0)
2565                                 ++sctx->stat.no_csum;
2566                 }
2567                 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2568                                     flags, gen, src_mirror,
2569                                     have_csum ? csum : NULL, physical);
2570                 if (ret)
2571                         return ret;
2572                 len -= l;
2573                 logical += l;
2574                 physical += l;
2575                 src_physical += l;
2576         }
2577         return 0;
2578 }
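
/*
 * Example of the chunking above (profile and sizes assumed): for a data
 * extent on a non-RAID56 profile, blocksize is the sectorsize (4KiB here),
 * so each loop iteration looks up one csum and hands exactly one sector to
 * scrub_sectors().  On RAID5/6 the same extent is instead cut into
 * stripe_len sized pieces (typically 64KiB) to match the stripe layout.
 */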
2579
2580 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2581                                   u64 logical, u32 len,
2582                                   u64 physical, struct btrfs_device *dev,
2583                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2584 {
2585         struct scrub_ctx *sctx = sparity->sctx;
2586         struct scrub_block *sblock;
2587         const u32 sectorsize = sctx->fs_info->sectorsize;
2588         int index;
2589
2590         ASSERT(IS_ALIGNED(len, sectorsize));
2591
2592         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2593         if (!sblock) {
2594                 spin_lock(&sctx->stat_lock);
2595                 sctx->stat.malloc_errors++;
2596                 spin_unlock(&sctx->stat_lock);
2597                 return -ENOMEM;
2598         }
2599
2600         /* one ref inside this function, plus one for each page added to
2601          * a bio later on */
2602         refcount_set(&sblock->refs, 1);
2603         sblock->sctx = sctx;
2604         sblock->no_io_error_seen = 1;
2605         sblock->sparity = sparity;
2606         scrub_parity_get(sparity);
2607
2608         for (index = 0; len > 0; index++) {
2609                 struct scrub_sector *sector;
2610
2611                 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2612                 if (!sector) {
2613 leave_nomem:
2614                         spin_lock(&sctx->stat_lock);
2615                         sctx->stat.malloc_errors++;
2616                         spin_unlock(&sctx->stat_lock);
2617                         scrub_block_put(sblock);
2618                         return -ENOMEM;
2619                 }
2620                 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2621                 /* For scrub block */
2622                 scrub_sector_get(sector);
2623                 sblock->sectors[index] = sector;
2624                 /* For scrub parity */
2625                 scrub_sector_get(sector);
2626                 list_add_tail(&sector->list, &sparity->sectors_list);
2627                 sector->sblock = sblock;
2628                 sector->dev = dev;
2629                 sector->flags = flags;
2630                 sector->generation = gen;
2631                 sector->logical = logical;
2632                 sector->physical = physical;
2633                 sector->mirror_num = mirror_num;
2634                 if (csum) {
2635                         sector->have_csum = 1;
2636                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2637                 } else {
2638                         sector->have_csum = 0;
2639                 }
2640                 sblock->sector_count++;
2641                 sector->page = alloc_page(GFP_KERNEL);
2642                 if (!sector->page)
2643                         goto leave_nomem;
2644
2645
2646                 /* Iterate over the stripe range in sectorsize steps */
2647                 len -= sectorsize;
2648                 logical += sectorsize;
2649                 physical += sectorsize;
2650         }
2651
2652         WARN_ON(sblock->sector_count == 0);
2653         for (index = 0; index < sblock->sector_count; index++) {
2654                 struct scrub_sector *sector = sblock->sectors[index];
2655                 int ret;
2656
2657                 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2658                 if (ret) {
2659                         scrub_block_put(sblock);
2660                         return ret;
2661                 }
2662         }
2663
2664         /* Last one frees, either here or in bio completion for last sector */
2665         scrub_block_put(sblock);
2666         return 0;
2667 }
2668
2669 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2670                                    u64 logical, u32 len,
2671                                    u64 physical, struct btrfs_device *dev,
2672                                    u64 flags, u64 gen, int mirror_num)
2673 {
2674         struct scrub_ctx *sctx = sparity->sctx;
2675         int ret;
2676         u8 csum[BTRFS_CSUM_SIZE];
2677         u32 blocksize;
2678
2679         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2680                 scrub_parity_mark_sectors_error(sparity, logical, len);
2681                 return 0;
2682         }
2683
2684         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2685                 blocksize = sparity->stripe_len;
2686         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2687                 blocksize = sparity->stripe_len;
2688         } else {
2689                 blocksize = sctx->fs_info->sectorsize;
2690                 WARN_ON(1);
2691         }
2692
2693         while (len) {
2694                 u32 l = min(len, blocksize);
2695                 int have_csum = 0;
2696
2697                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2698                         /* push csums to sbio */
2699                         have_csum = scrub_find_csum(sctx, logical, csum);
2700                         if (have_csum == 0)
2701                                 goto skip;
2702                 }
2703                 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2704                                              flags, gen, mirror_num,
2705                                              have_csum ? csum : NULL);
2706                 if (ret)
2707                         return ret;
2708 skip:
2709                 len -= l;
2710                 logical += l;
2711                 physical += l;
2712         }
2713         return 0;
2714 }
2715
2716 /*
2717  * Given a physical address, this will calculate its
2718  * logical offset. If this is a parity stripe, it will return
2719  * the leftmost data stripe's logical offset.
2720  *
2721  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2722  */
2723 static int get_raid56_logic_offset(u64 physical, int num,
2724                                    struct map_lookup *map, u64 *offset,
2725                                    u64 *stripe_start)
2726 {
2727         int i;
2728         int j = 0;
2729         u64 stripe_nr;
2730         u64 last_offset;
2731         u32 stripe_index;
2732         u32 rot;
2733         const int data_stripes = nr_data_stripes(map);
2734
2735         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2736         if (stripe_start)
2737                 *stripe_start = last_offset;
2738
2739         *offset = last_offset;
2740         for (i = 0; i < data_stripes; i++) {
2741                 *offset = last_offset + i * map->stripe_len;
2742
2743                 stripe_nr = div64_u64(*offset, map->stripe_len);
2744                 stripe_nr = div_u64(stripe_nr, data_stripes);
2745
2746                 /* Work out the disk rotation on this stripe-set */
2747                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2748                 /* Calculate which stripe this data is located on */
2749                 rot += i;
2750                 stripe_index = rot % map->num_stripes;
2751                 if (stripe_index == num)
2752                         return 0;
2753                 if (stripe_index < num)
2754                         j++;
2755         }
2756         *offset = last_offset + j * map->stripe_len;
2757         return 1;
2758 }
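/*
 * Illustrative walk-through (not from the original code), assuming a RAID5
 * chunk with 3 stripes (2 data + 1 parity) and stripe_len = 64K:
 *
 *   Row 0:  dev0 = D0 (logical 0),   dev1 = D1 (64K),    dev2 = P
 *   Row 1:  dev0 = P,                dev1 = D2 (128K),   dev2 = D3 (192K)
 *
 * For num = 1 and physical = stripes[1].physical + 64K, last_offset is
 * 64K * 2 = 128K; at i = 0 the rotation gives stripe_index == 1 == num,
 * so we return 0 with *offset = 128K (the chunk-relative logical of D2).
 * For num = 0 and the same per-device offset no data stripe matches, so
 * device 0 holds parity for that row and we return 1 with *offset = 128K,
 * the leftmost data stripe of that full stripe.
 */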
2759
2760 static void scrub_free_parity(struct scrub_parity *sparity)
2761 {
2762         struct scrub_ctx *sctx = sparity->sctx;
2763         struct scrub_sector *curr, *next;
2764         int nbits;
2765
2766         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2767         if (nbits) {
2768                 spin_lock(&sctx->stat_lock);
2769                 sctx->stat.read_errors += nbits;
2770                 sctx->stat.uncorrectable_errors += nbits;
2771                 spin_unlock(&sctx->stat_lock);
2772         }
2773
2774         list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2775                 list_del_init(&curr->list);
2776                 scrub_sector_put(curr);
2777         }
2778
2779         kfree(sparity);
2780 }
2781
2782 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2783 {
2784         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2785                                                     work);
2786         struct scrub_ctx *sctx = sparity->sctx;
2787
2788         scrub_free_parity(sparity);
2789         scrub_pending_bio_dec(sctx);
2790 }
2791
2792 static void scrub_parity_bio_endio(struct bio *bio)
2793 {
2794         struct scrub_parity *sparity = bio->bi_private;
2795         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2796
2797         if (bio->bi_status)
2798                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2799                           sparity->nsectors);
2800
2801         bio_put(bio);
2802
2803         INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2804         queue_work(fs_info->scrub_parity_workers, &sparity->work);
2805 }
2806
2807 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2808 {
2809         struct scrub_ctx *sctx = sparity->sctx;
2810         struct btrfs_fs_info *fs_info = sctx->fs_info;
2811         struct bio *bio;
2812         struct btrfs_raid_bio *rbio;
2813         struct btrfs_io_context *bioc = NULL;
2814         u64 length;
2815         int ret;
2816
2817         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2818                            sparity->nsectors))
2819                 goto out;
2820
2821         length = sparity->logic_end - sparity->logic_start;
2822
2823         btrfs_bio_counter_inc_blocked(fs_info);
2824         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2825                                &length, &bioc);
2826         if (ret || !bioc || !bioc->raid_map)
2827                 goto bioc_out;
2828
2829         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2830         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2831         bio->bi_private = sparity;
2832         bio->bi_end_io = scrub_parity_bio_endio;
2833
2834         rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
2835                                               sparity->scrub_dev,
2836                                               sparity->dbitmap,
2837                                               sparity->nsectors);
2838         if (!rbio)
2839                 goto rbio_out;
2840
2841         scrub_pending_bio_inc(sctx);
2842         raid56_parity_submit_scrub_rbio(rbio);
2843         return;
2844
2845 rbio_out:
2846         bio_put(bio);
2847 bioc_out:
2848         btrfs_bio_counter_dec(fs_info);
2849         btrfs_put_bioc(bioc);
2850         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2851                   sparity->nsectors);
2852         spin_lock(&sctx->stat_lock);
2853         sctx->stat.malloc_errors++;
2854         spin_unlock(&sctx->stat_lock);
2855 out:
2856         scrub_free_parity(sparity);
2857 }
2858
2859 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2860 {
2861         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2862 }
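/*
 * Example with illustrative numbers: with stripe_len = 64K and a 4K sector
 * size, nsectors = 16; on a 64-bit machine DIV_ROUND_UP(16, 64) = 1, so a
 * single unsigned long (8 bytes) per bitmap is enough.  The caller,
 * scrub_raid56_parity(), allocates 2 * bitmap_len so that dbitmap and
 * ebitmap can share one buffer.
 */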
2863
2864 static void scrub_parity_get(struct scrub_parity *sparity)
2865 {
2866         refcount_inc(&sparity->refs);
2867 }
2868
2869 static void scrub_parity_put(struct scrub_parity *sparity)
2870 {
2871         if (!refcount_dec_and_test(&sparity->refs))
2872                 return;
2873
2874         scrub_parity_check_and_repair(sparity);
2875 }
2876
2877 /*
2878  * Return 0 if the extent item covers any byte of the search range.
2879  * Return <0 if the extent item ends at or before @search_start.
2880  * Return >0 if the extent item starts at or beyond @search_start + @search_len.
2881  */
2882 static int compare_extent_item_range(struct btrfs_path *path,
2883                                      u64 search_start, u64 search_len)
2884 {
2885         struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2886         u64 len;
2887         struct btrfs_key key;
2888
2889         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2890         ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2891                key.type == BTRFS_METADATA_ITEM_KEY);
2892         if (key.type == BTRFS_METADATA_ITEM_KEY)
2893                 len = fs_info->nodesize;
2894         else
2895                 len = key.offset;
2896
2897         if (key.objectid + len <= search_start)
2898                 return -1;
2899         if (key.objectid >= search_start + search_len)
2900                 return 1;
2901         return 0;
2902 }
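/*
 * Example with illustrative numbers: when searching [16M, 16M + 64K), an
 * EXTENT_ITEM with objectid 16M - 8K and a 16K length still overlaps the
 * first 8K of the range and compares as 0; an item ending exactly at 16M
 * compares as -1; an item starting at 16M + 64K or beyond compares as 1.
 */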
2903
2904 /*
2905  * Locate one extent item which covers any byte in range
2906  * [@search_start, @search_start + @search_len)
2907  *
2908  * If the path is not initialized, we will initialize the search by doing
2909  * a btrfs_search_slot().
2910  * If the path is already initialized, we will use the path as the initial
2911  * slot, to avoid duplicated btrfs_search_slot() calls.
2912  *
2913  * NOTE: If an extent item starts before @search_start, we will still
2914  * return the extent item. This is for data extents crossing stripe boundaries.
2915  *
2916  * Return 0 if we found such extent item, and @path will point to the extent item.
2917  * Return >0 if no such extent item can be found, and @path will be released.
2918  * Return <0 if we hit a fatal error, and @path will be released.
2919  */
2920 static int find_first_extent_item(struct btrfs_root *extent_root,
2921                                   struct btrfs_path *path,
2922                                   u64 search_start, u64 search_len)
2923 {
2924         struct btrfs_fs_info *fs_info = extent_root->fs_info;
2925         struct btrfs_key key;
2926         int ret;
2927
2928         /* Continue using the existing path */
2929         if (path->nodes[0])
2930                 goto search_forward;
2931
2932         if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2933                 key.type = BTRFS_METADATA_ITEM_KEY;
2934         else
2935                 key.type = BTRFS_EXTENT_ITEM_KEY;
2936         key.objectid = search_start;
2937         key.offset = (u64)-1;
2938
2939         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2940         if (ret < 0)
2941                 return ret;
2942
2943         ASSERT(ret > 0);
2944         /*
2945          * Here we intentionally pass 0 as @min_objectid, as there could be
2946          * an extent item starting before @search_start.
2947          */
2948         ret = btrfs_previous_extent_item(extent_root, path, 0);
2949         if (ret < 0)
2950                 return ret;
2951         /*
2952          * No matter whether we have found an extent item, the next loop will
2953          * properly do every check on the key.
2954          */
2955 search_forward:
2956         while (true) {
2957                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2958                 if (key.objectid >= search_start + search_len)
2959                         break;
2960                 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2961                     key.type != BTRFS_EXTENT_ITEM_KEY)
2962                         goto next;
2963
2964                 ret = compare_extent_item_range(path, search_start, search_len);
2965                 if (ret == 0)
2966                         return ret;
2967                 if (ret > 0)
2968                         break;
2969 next:
2970                 path->slots[0]++;
2971                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2972                         ret = btrfs_next_leaf(extent_root, path);
2973                         if (ret) {
2974                                 /* Either no more item or fatal error */
2975                                 btrfs_release_path(path);
2976                                 return ret;
2977                         }
2978                 }
2979         }
2980         btrfs_release_path(path);
2981         return 1;
2982 }
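/*
 * Typical usage sketch, condensed from the callers below (illustrative
 * only, error handling trimmed):
 *
 *	while (cur_logical < range_end) {
 *		ret = find_first_extent_item(extent_root, path, cur_logical,
 *					     range_end - cur_logical);
 *		if (ret)	// >0: no more extents, <0: fatal error
 *			break;
 *		get_extent_info(path, &extent_start, &extent_len,
 *				&extent_flags, &extent_gen);
 *		// Scrub [max(extent_start, cur_logical), extent_start + extent_len)
 *		cur_logical = max(extent_start, cur_logical) + scrubbed_len;
 *	}
 *	btrfs_release_path(path);
 *
 * The names above (range_end, scrubbed_len, ...) are placeholders; see
 * scrub_simple_mirror() and scrub_raid56_data_stripe_for_parity() for the
 * real loops.
 */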
2983
2984 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2985                             u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2986 {
2987         struct btrfs_key key;
2988         struct btrfs_extent_item *ei;
2989
2990         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2991         ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2992                key.type == BTRFS_EXTENT_ITEM_KEY);
2993         *extent_start_ret = key.objectid;
2994         if (key.type == BTRFS_METADATA_ITEM_KEY)
2995                 *size_ret = path->nodes[0]->fs_info->nodesize;
2996         else
2997                 *size_ret = key.offset;
2998         ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2999         *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
3000         *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
3001 }
3002
3003 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
3004                                       u64 boundary_start, u64 boundary_len)
3005 {
3006         return (extent_start < boundary_start &&
3007                 extent_start + extent_len > boundary_start) ||
3008                (extent_start < boundary_start + boundary_len &&
3009                 extent_start + extent_len > boundary_start + boundary_len);
3010 }
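/*
 * Example with illustrative numbers: for a boundary of [64K, 128K), an
 * extent [60K, 72K) crosses the start and an extent [120K, 136K) crosses
 * the end, so both return true; an extent fully inside, e.g. [64K, 96K),
 * or fully outside the boundary returns false.
 */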
3011
3012 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
3013                                                struct scrub_parity *sparity,
3014                                                struct map_lookup *map,
3015                                                struct btrfs_device *sdev,
3016                                                struct btrfs_path *path,
3017                                                u64 logical)
3018 {
3019         struct btrfs_fs_info *fs_info = sctx->fs_info;
3020         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3021         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3022         u64 cur_logical = logical;
3023         int ret;
3024
3025         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3026
3027         /* Path must not be populated */
3028         ASSERT(!path->nodes[0]);
3029
3030         while (cur_logical < logical + map->stripe_len) {
3031                 struct btrfs_io_context *bioc = NULL;
3032                 struct btrfs_device *extent_dev;
3033                 u64 extent_start;
3034                 u64 extent_size;
3035                 u64 mapped_length;
3036                 u64 extent_flags;
3037                 u64 extent_gen;
3038                 u64 extent_physical;
3039                 u64 extent_mirror_num;
3040
3041                 ret = find_first_extent_item(extent_root, path, cur_logical,
3042                                              logical + map->stripe_len - cur_logical);
3043                 /* No more extent item in this data stripe */
3044                 if (ret > 0) {
3045                         ret = 0;
3046                         break;
3047                 }
3048                 if (ret < 0)
3049                         break;
3050                 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3051                                 &extent_gen);
3052
3053                 /* Metadata should not cross stripe boundaries */
3054                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3055                     does_range_cross_boundary(extent_start, extent_size,
3056                                               logical, map->stripe_len)) {
3057                         btrfs_err(fs_info,
3058         "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3059                                   extent_start, logical);
3060                         spin_lock(&sctx->stat_lock);
3061                         sctx->stat.uncorrectable_errors++;
3062                         spin_unlock(&sctx->stat_lock);
3063                         cur_logical += extent_size;
3064                         continue;
3065                 }
3066
3067                 /* Skip hole range which doesn't have any extent */
3068                 cur_logical = max(extent_start, cur_logical);
3069
3070                 /* Truncate the range inside this data stripe */
3071                 extent_size = min(extent_start + extent_size,
3072                                   logical + map->stripe_len) - cur_logical;
3073                 extent_start = cur_logical;
3074                 ASSERT(extent_size <= U32_MAX);
3075
3076                 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3077
3078                 mapped_length = extent_size;
3079                 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3080                                       &mapped_length, &bioc, 0);
3081                 if (!ret && (!bioc || mapped_length < extent_size))
3082                         ret = -EIO;
3083                 if (ret) {
3084                         btrfs_put_bioc(bioc);
3085                         scrub_parity_mark_sectors_error(sparity, extent_start,
3086                                                         extent_size);
3087                         break;
3088                 }
3089                 extent_physical = bioc->stripes[0].physical;
3090                 extent_mirror_num = bioc->mirror_num;
3091                 extent_dev = bioc->stripes[0].dev;
3092                 btrfs_put_bioc(bioc);
3093
3094                 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3095                                                extent_start + extent_size - 1,
3096                                                &sctx->csum_list, 1);
3097                 if (ret) {
3098                         scrub_parity_mark_sectors_error(sparity, extent_start,
3099                                                         extent_size);
3100                         break;
3101                 }
3102
3103                 ret = scrub_extent_for_parity(sparity, extent_start,
3104                                               extent_size, extent_physical,
3105                                               extent_dev, extent_flags,
3106                                               extent_gen, extent_mirror_num);
3107                 scrub_free_csums(sctx);
3108
3109                 if (ret) {
3110                         scrub_parity_mark_sectors_error(sparity, extent_start,
3111                                                         extent_size);
3112                         break;
3113                 }
3114
3115                 cond_resched();
3116                 cur_logical += extent_size;
3117         }
3118         btrfs_release_path(path);
3119         return ret;
3120 }
3121
3122 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3123                                                   struct map_lookup *map,
3124                                                   struct btrfs_device *sdev,
3125                                                   u64 logic_start,
3126                                                   u64 logic_end)
3127 {
3128         struct btrfs_fs_info *fs_info = sctx->fs_info;
3129         struct btrfs_path *path;
3130         u64 cur_logical;
3131         int ret;
3132         struct scrub_parity *sparity;
3133         int nsectors;
3134         int bitmap_len;
3135
3136         path = btrfs_alloc_path();
3137         if (!path) {
3138                 spin_lock(&sctx->stat_lock);
3139                 sctx->stat.malloc_errors++;
3140                 spin_unlock(&sctx->stat_lock);
3141                 return -ENOMEM;
3142         }
3143         path->search_commit_root = 1;
3144         path->skip_locking = 1;
3145
3146         ASSERT(map->stripe_len <= U32_MAX);
3147         nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3148         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3149         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3150                           GFP_NOFS);
3151         if (!sparity) {
3152                 spin_lock(&sctx->stat_lock);
3153                 sctx->stat.malloc_errors++;
3154                 spin_unlock(&sctx->stat_lock);
3155                 btrfs_free_path(path);
3156                 return -ENOMEM;
3157         }
3158
3159         ASSERT(map->stripe_len <= U32_MAX);
3160         sparity->stripe_len = map->stripe_len;
3161         sparity->nsectors = nsectors;
3162         sparity->sctx = sctx;
3163         sparity->scrub_dev = sdev;
3164         sparity->logic_start = logic_start;
3165         sparity->logic_end = logic_end;
3166         refcount_set(&sparity->refs, 1);
3167         INIT_LIST_HEAD(&sparity->sectors_list);
3168         sparity->dbitmap = sparity->bitmap;
3169         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3170
3171         ret = 0;
3172         for (cur_logical = logic_start; cur_logical < logic_end;
3173              cur_logical += map->stripe_len) {
3174                 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3175                                                           sdev, path, cur_logical);
3176                 if (ret < 0)
3177                         break;
3178         }
3179
3180         scrub_parity_put(sparity);
3181         scrub_submit(sctx);
3182         mutex_lock(&sctx->wr_lock);
3183         scrub_wr_submit(sctx);
3184         mutex_unlock(&sctx->wr_lock);
3185
3186         btrfs_free_path(path);
3187         return ret < 0 ? ret : 0;
3188 }
3189
3190 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3191 {
3192         if (!btrfs_is_zoned(sctx->fs_info))
3193                 return;
3194
3195         sctx->flush_all_writes = true;
3196         scrub_submit(sctx);
3197         mutex_lock(&sctx->wr_lock);
3198         scrub_wr_submit(sctx);
3199         mutex_unlock(&sctx->wr_lock);
3200
3201         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3202 }
3203
3204 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3205                                         u64 physical, u64 physical_end)
3206 {
3207         struct btrfs_fs_info *fs_info = sctx->fs_info;
3208         int ret = 0;
3209
3210         if (!btrfs_is_zoned(fs_info))
3211                 return 0;
3212
3213         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3214
3215         mutex_lock(&sctx->wr_lock);
3216         if (sctx->write_pointer < physical_end) {
3217                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3218                                                     physical,
3219                                                     sctx->write_pointer);
3220                 if (ret)
3221                         btrfs_err(fs_info,
3222                                   "zoned: failed to recover write pointer");
3223         }
3224         mutex_unlock(&sctx->wr_lock);
3225         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3226
3227         return ret;
3228 }
3229
3230 /*
3231  * Scrub one range which can only have a simple mirror based profile.
3232  * (This includes the whole range of SINGLE/DUP/RAID1/RAID1C*, and each
3233  *  stripe of RAID0/RAID10.)
3234  *
3235  * Since we may need to handle a subset of a block group, we need the
3236  * @logical_start and @logical_length parameters.
3237  */
3238 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3239                                struct btrfs_root *extent_root,
3240                                struct btrfs_root *csum_root,
3241                                struct btrfs_block_group *bg,
3242                                struct map_lookup *map,
3243                                u64 logical_start, u64 logical_length,
3244                                struct btrfs_device *device,
3245                                u64 physical, int mirror_num)
3246 {
3247         struct btrfs_fs_info *fs_info = sctx->fs_info;
3248         const u64 logical_end = logical_start + logical_length;
3249         /* An artificial limit, inherited from the old scrub behavior */
3250         const u32 max_length = SZ_64K;
3251         struct btrfs_path path = { 0 };
3252         u64 cur_logical = logical_start;
3253         int ret;
3254
3255         /* The range must be inside the bg */
3256         ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3257
3258         path.search_commit_root = 1;
3259         path.skip_locking = 1;
3260         /* Go through each extent item inside the logical range */
3261         while (cur_logical < logical_end) {
3262                 u64 extent_start;
3263                 u64 extent_len;
3264                 u64 extent_flags;
3265                 u64 extent_gen;
3266                 u64 scrub_len;
3267
3268                 /* Canceled? */
3269                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3270                     atomic_read(&sctx->cancel_req)) {
3271                         ret = -ECANCELED;
3272                         break;
3273                 }
3274                 /* Paused? */
3275                 if (atomic_read(&fs_info->scrub_pause_req)) {
3276                         /* Push queued extents */
3277                         sctx->flush_all_writes = true;
3278                         scrub_submit(sctx);
3279                         mutex_lock(&sctx->wr_lock);
3280                         scrub_wr_submit(sctx);
3281                         mutex_unlock(&sctx->wr_lock);
3282                         wait_event(sctx->list_wait,
3283                                    atomic_read(&sctx->bios_in_flight) == 0);
3284                         sctx->flush_all_writes = false;
3285                         scrub_blocked_if_needed(fs_info);
3286                 }
3287                 /* Block group removed? */
3288                 spin_lock(&bg->lock);
3289                 if (bg->removed) {
3290                         spin_unlock(&bg->lock);
3291                         ret = 0;
3292                         break;
3293                 }
3294                 spin_unlock(&bg->lock);
3295
3296                 ret = find_first_extent_item(extent_root, &path, cur_logical,
3297                                              logical_end - cur_logical);
3298                 if (ret > 0) {
3299                         /* No more extent, just update the accounting */
3300                         sctx->stat.last_physical = physical + logical_length;
3301                         ret = 0;
3302                         break;
3303                 }
3304                 if (ret < 0)
3305                         break;
3306                 get_extent_info(&path, &extent_start, &extent_len,
3307                                 &extent_flags, &extent_gen);
3308                 /* Skip hole range which doesn't have any extent */
3309                 cur_logical = max(extent_start, cur_logical);
3310
3311                 /*
3312                  * Scrub len has three limits:
3313                  * - Extent size limit
3314                  * - Scrub range limit
3315                  *   This is especially important for RAID0/RAID10 to reuse
3316                  *   this function.
3317                  * - Max scrub size limit
3318                  */
3319                 scrub_len = min(min(extent_start + extent_len,
3320                                     logical_end), cur_logical + max_length) -
3321                             cur_logical;
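                /*
                 * For example (illustrative numbers): with a 1M data extent
                 * starting at @cur_logical and plenty of room before
                 * @logical_end, the 64K @max_length wins, so scrub_len is 64K
                 * and the extent is scrubbed in 64K chunks over several
                 * iterations of this loop.
                 */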
3322
3323                 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3324                         ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3325                                         cur_logical + scrub_len - 1,
3326                                         &sctx->csum_list, 1);
3327                         if (ret)
3328                                 break;
3329                 }
3330                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3331                     does_range_cross_boundary(extent_start, extent_len,
3332                                               logical_start, logical_length)) {
3333                         btrfs_err(fs_info,
3334 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3335                                   extent_start, logical_start, logical_end);
3336                         spin_lock(&sctx->stat_lock);
3337                         sctx->stat.uncorrectable_errors++;
3338                         spin_unlock(&sctx->stat_lock);
3339                         cur_logical += scrub_len;
3340                         continue;
3341                 }
3342                 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3343                                    cur_logical - logical_start + physical,
3344                                    device, extent_flags, extent_gen,
3345                                    mirror_num);
3346                 scrub_free_csums(sctx);
3347                 if (ret)
3348                         break;
3349                 if (sctx->is_dev_replace)
3350                         sync_replace_for_zoned(sctx);
3351                 cur_logical += scrub_len;
3352                 /* Don't hold the CPU for too long */
3353                 cond_resched();
3354         }
3355         btrfs_release_path(&path);
3356         return ret;
3357 }
3358
3359 /* Calculate the full stripe length for simple stripe based profiles */
3360 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3361 {
3362         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3363                             BTRFS_BLOCK_GROUP_RAID10));
3364
3365         return map->num_stripes / map->sub_stripes * map->stripe_len;
3366 }
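/*
 * Example with illustrative numbers: a 4-device RAID10 chunk (sub_stripes = 2)
 * with stripe_len = 64K covers 4 / 2 * 64K = 128K of logical space per full
 * stripe, while a 3-device RAID0 chunk covers 3 * 64K = 192K.
 */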
3367
3368 /* Get the logical bytenr for the stripe */
3369 static u64 simple_stripe_get_logical(struct map_lookup *map,
3370                                      struct btrfs_block_group *bg,
3371                                      int stripe_index)
3372 {
3373         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3374                             BTRFS_BLOCK_GROUP_RAID10));
3375         ASSERT(stripe_index < map->num_stripes);
3376
3377         /*
3378          * (stripe_index / sub_stripes) gives how many data stripes we need to
3379          * skip.
3380          */
3381         return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3382 }
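/*
 * Example with illustrative numbers: in the 4-device RAID10 chunk above,
 * stripes 0 and 1 mirror the data stripe at bg->start, while stripes 2 and 3
 * mirror the data stripe at bg->start + 64K (stripe_index / sub_stripes == 1).
 */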
3383
3384 /* Get the mirror number for the stripe */
3385 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3386 {
3387         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3388                             BTRFS_BLOCK_GROUP_RAID10));
3389         ASSERT(stripe_index < map->num_stripes);
3390
3391         /* For RAID0 it's always 1; for RAID10 the mirror alternates 1,2,1,2,... */
3392         return stripe_index % map->sub_stripes + 1;
3393 }
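/*
 * Example: for RAID0 every stripe returns mirror 1; for the 4-device RAID10
 * chunk above, stripe indexes 0, 1, 2, 3 map to mirror numbers 1, 2, 1, 2.
 */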
3394
3395 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3396                                struct btrfs_root *extent_root,
3397                                struct btrfs_root *csum_root,
3398                                struct btrfs_block_group *bg,
3399                                struct map_lookup *map,
3400                                struct btrfs_device *device,
3401                                int stripe_index)
3402 {
3403         const u64 logical_increment = simple_stripe_full_stripe_len(map);
3404         const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3405         const u64 orig_physical = map->stripes[stripe_index].physical;
3406         const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3407         u64 cur_logical = orig_logical;
3408         u64 cur_physical = orig_physical;
3409         int ret = 0;
3410
3411         while (cur_logical < bg->start + bg->length) {
3412                 /*
3413                  * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3414                  * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3415                  * this stripe.
3416                  */
3417                 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3418                                           cur_logical, map->stripe_len, device,
3419                                           cur_physical, mirror_num);
3420                 if (ret)
3421                         return ret;
3422                 /* Skip to next stripe which belongs to the target device */
3423                 cur_logical += logical_increment;
3424                 /* For physical offset, we just go to next stripe */
3425                 cur_physical += map->stripe_len;
3426         }
3427         return ret;
3428 }
3429
3430 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3431                                            struct btrfs_block_group *bg,
3432                                            struct map_lookup *map,
3433                                            struct btrfs_device *scrub_dev,
3434                                            int stripe_index, u64 dev_extent_len)
3435 {
3436         struct btrfs_path *path;
3437         struct btrfs_fs_info *fs_info = sctx->fs_info;
3438         struct btrfs_root *root;
3439         struct btrfs_root *csum_root;
3440         struct blk_plug plug;
3441         const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3442         const u64 chunk_logical = bg->start;
3443         int ret;
3444         u64 physical = map->stripes[stripe_index].physical;
3445         const u64 physical_end = physical + dev_extent_len;
3446         u64 logical;
3447         u64 logic_end;
3448         /* The logical increment after finishing one stripe */
3449         u64 increment;
3450         /* Offset inside the chunk */
3451         u64 offset;
3452         u64 stripe_logical;
3453         u64 stripe_end;
3454         int stop_loop = 0;
3455
3456         path = btrfs_alloc_path();
3457         if (!path)
3458                 return -ENOMEM;
3459
3460         /*
3461          * Work on the commit root. The related disk blocks are static as
3462          * long as COW is applied. This means it is safe to rewrite them
3463          * to repair disk errors without any race conditions.
3464          */
3465         path->search_commit_root = 1;
3466         path->skip_locking = 1;
3467         path->reada = READA_FORWARD;
3468
3469         wait_event(sctx->list_wait,
3470                    atomic_read(&sctx->bios_in_flight) == 0);
3471         scrub_blocked_if_needed(fs_info);
3472
3473         root = btrfs_extent_root(fs_info, bg->start);
3474         csum_root = btrfs_csum_root(fs_info, bg->start);
3475
3476         /*
3477          * Collect all data csums for the stripe to avoid seeking during
3478          * the scrub. This might currently (crc32) end up being about 1MB.
3479          */
3480         blk_start_plug(&plug);
3481
3482         if (sctx->is_dev_replace &&
3483             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3484                 mutex_lock(&sctx->wr_lock);
3485                 sctx->write_pointer = physical;
3486                 mutex_unlock(&sctx->wr_lock);
3487                 sctx->flush_all_writes = true;
3488         }
3489
3490         /*
3491          * There used to be a big double loop to handle all profiles using the
3492          * same routine, which grows larger and more gross over time.
3493          *
3494          * So here we handle each profile differently, and simpler profiles
3495          * have simpler scrubbing functions.
3496          */
3497         if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3498                          BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3499                 /*
3500                  * The above check rules out all complex profiles, the
3501                  * remaining profiles are SINGLE|DUP|RAID1|RAID1C*, which are
3502                  * simple mirrored duplication without striping.
3503                  *
3504                  * Only @physical and @mirror_num need to be calculated using
3505                  * @stripe_index.
3506                  */
3507                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3508                                 bg->start, bg->length, scrub_dev,
3509                                 map->stripes[stripe_index].physical,
3510                                 stripe_index + 1);
3511                 offset = 0;
3512                 goto out;
3513         }
3514         if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3515                 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3516                                           scrub_dev, stripe_index);
3517                 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3518                 goto out;
3519         }
3520
3521         /* Only RAID56 goes through the old code */
3522         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3523         ret = 0;
3524
3525         /* Calculate the logical end of the stripe */
3526         get_raid56_logic_offset(physical_end, stripe_index,
3527                                 map, &logic_end, NULL);
3528         logic_end += chunk_logical;
3529
3530         /* Initialize @offset in case we need to go to out: label */
3531         get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3532         increment = map->stripe_len * nr_data_stripes(map);
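        /*
         * For example (illustrative numbers): on a 6-device RAID6 chunk with
         * stripe_len = 64K there are 4 data stripes, so every 64K step of
         * @physical below advances the logical position by 4 * 64K = 256K.
         */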
3533
3534         /*
3535          * Due to the rotation, for RAID56 it's better to iterate each stripe
3536          * using their physical offset.
3537          */
3538         while (physical < physical_end) {
3539                 ret = get_raid56_logic_offset(physical, stripe_index, map,
3540                                               &logical, &stripe_logical);
3541                 logical += chunk_logical;
3542                 if (ret) {
3543                         /* It is a parity stripe */
3544                         stripe_logical += chunk_logical;
3545                         stripe_end = stripe_logical + increment;
3546                         ret = scrub_raid56_parity(sctx, map, scrub_dev,
3547                                                   stripe_logical,
3548                                                   stripe_end);
3549                         if (ret)
3550                                 goto out;
3551                         goto next;
3552                 }
3553
3554                 /*
3555                  * Now we're at a data stripe, scrub each extent in the range.
3556                  *
3557                  * At this stage, if we ignore the repair part, inside each data
3558                  * stripe it is no different than SINGLE profile.
3559                  * We can reuse scrub_simple_mirror() here, as the repair part
3560                  * is still based on @mirror_num.
3561                  */
3562                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3563                                           logical, map->stripe_len,
3564                                           scrub_dev, physical, 1);
3565                 if (ret < 0)
3566                         goto out;
3567 next:
3568                 logical += increment;
3569                 physical += map->stripe_len;
3570                 spin_lock(&sctx->stat_lock);
3571                 if (stop_loop)
3572                         sctx->stat.last_physical = map->stripes[stripe_index].physical +
3573                                                    dev_extent_len;
3574                 else
3575                         sctx->stat.last_physical = physical;
3576                 spin_unlock(&sctx->stat_lock);
3577                 if (stop_loop)
3578                         break;
3579         }
3580 out:
3581         /* push queued extents */
3582         scrub_submit(sctx);
3583         mutex_lock(&sctx->wr_lock);
3584         scrub_wr_submit(sctx);
3585         mutex_unlock(&sctx->wr_lock);
3586
3587         blk_finish_plug(&plug);
3588         btrfs_free_path(path);
3589
3590         if (sctx->is_dev_replace && ret >= 0) {
3591                 int ret2;
3592
3593                 ret2 = sync_write_pointer_for_zoned(sctx,
3594                                 chunk_logical + offset,
3595                                 map->stripes[stripe_index].physical,
3596                                 physical_end);
3597                 if (ret2)
3598                         ret = ret2;
3599         }
3600
3601         return ret < 0 ? ret : 0;
3602 }
3603
3604 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3605                                           struct btrfs_block_group *bg,
3606                                           struct btrfs_device *scrub_dev,
3607                                           u64 dev_offset,
3608                                           u64 dev_extent_len)
3609 {
3610         struct btrfs_fs_info *fs_info = sctx->fs_info;
3611         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3612         struct map_lookup *map;
3613         struct extent_map *em;
3614         int i;
3615         int ret = 0;
3616
3617         read_lock(&map_tree->lock);
3618         em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3619         read_unlock(&map_tree->lock);
3620
3621         if (!em) {
3622                 /*
3623                  * Might have been an unused block group deleted by the cleaner
3624                  * kthread or relocation.
3625                  */
3626                 spin_lock(&bg->lock);
3627                 if (!bg->removed)
3628                         ret = -EINVAL;
3629                 spin_unlock(&bg->lock);
3630
3631                 return ret;
3632         }
3633         if (em->start != bg->start)
3634                 goto out;
3635         if (em->len < dev_extent_len)
3636                 goto out;
3637
3638         map = em->map_lookup;
3639         for (i = 0; i < map->num_stripes; ++i) {
3640                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3641                     map->stripes[i].physical == dev_offset) {
3642                         ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
3643                                            dev_extent_len);
3644                         if (ret)
3645                                 goto out;
3646                 }
3647         }
3648 out:
3649         free_extent_map(em);
3650
3651         return ret;
3652 }
3653
3654 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3655                                           struct btrfs_block_group *cache)
3656 {
3657         struct btrfs_fs_info *fs_info = cache->fs_info;
3658         struct btrfs_trans_handle *trans;
3659
3660         if (!btrfs_is_zoned(fs_info))
3661                 return 0;
3662
3663         btrfs_wait_block_group_reservations(cache);
3664         btrfs_wait_nocow_writers(cache);
3665         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3666
3667         trans = btrfs_join_transaction(root);
3668         if (IS_ERR(trans))
3669                 return PTR_ERR(trans);
3670         return btrfs_commit_transaction(trans);
3671 }
3672
3673 static noinline_for_stack
3674 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3675                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3676 {
3677         struct btrfs_dev_extent *dev_extent = NULL;
3678         struct btrfs_path *path;
3679         struct btrfs_fs_info *fs_info = sctx->fs_info;
3680         struct btrfs_root *root = fs_info->dev_root;
3681         u64 chunk_offset;
3682         int ret = 0;
3683         int ro_set;
3684         int slot;
3685         struct extent_buffer *l;
3686         struct btrfs_key key;
3687         struct btrfs_key found_key;
3688         struct btrfs_block_group *cache;
3689         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3690
3691         path = btrfs_alloc_path();
3692         if (!path)
3693                 return -ENOMEM;
3694
3695         path->reada = READA_FORWARD;
3696         path->search_commit_root = 1;
3697         path->skip_locking = 1;
3698
3699         key.objectid = scrub_dev->devid;
3700         key.offset = 0ull;
3701         key.type = BTRFS_DEV_EXTENT_KEY;
3702
3703         while (1) {
3704                 u64 dev_extent_len;
3705
3706                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3707                 if (ret < 0)
3708                         break;
3709                 if (ret > 0) {
3710                         if (path->slots[0] >=
3711                             btrfs_header_nritems(path->nodes[0])) {
3712                                 ret = btrfs_next_leaf(root, path);
3713                                 if (ret < 0)
3714                                         break;
3715                                 if (ret > 0) {
3716                                         ret = 0;
3717                                         break;
3718                                 }
3719                         } else {
3720                                 ret = 0;
3721                         }
3722                 }
3723
3724                 l = path->nodes[0];
3725                 slot = path->slots[0];
3726
3727                 btrfs_item_key_to_cpu(l, &found_key, slot);
3728
3729                 if (found_key.objectid != scrub_dev->devid)
3730                         break;
3731
3732                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3733                         break;
3734
3735                 if (found_key.offset >= end)
3736                         break;
3737
3738                 if (found_key.offset < key.offset)
3739                         break;
3740
3741                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3742                 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3743
3744                 if (found_key.offset + dev_extent_len <= start)
3745                         goto skip;
3746
3747                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3748
3749                 /*
3750                  * get a reference on the corresponding block group to prevent
3751                  * the chunk from going away while we scrub it
3752                  */
3753                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3754
3755                 /* some chunks are removed but not committed to disk yet,
3756                  * continue scrubbing */
3757                 if (!cache)
3758                         goto skip;
3759
3760                 ASSERT(cache->start <= chunk_offset);
3761                 /*
3762                  * We are using the commit root to search for device extents, so
3763                  * that means we could have found a device extent item from a
3764                  * block group that was deleted in the current transaction. The
3765                  * logical start offset of the deleted block group, stored at
3766                  * @chunk_offset, might be part of the logical address range of
3767                  * a new block group (which uses different physical extents).
3768                  * In this case btrfs_lookup_block_group() has returned the new
3769                  * block group, and its start address is less than @chunk_offset.
3770                  *
3771                  * We skip such new block groups, because it's pointless to
3772                  * process them, as we won't find their extents because we search
3773                  * for them using the commit root of the extent tree. For a device
3774                  * replace it's also fine to skip it, we won't miss copying them
3775                  * to the target device because we have the write duplication
3776                  * setup through the regular write path (by btrfs_map_block()),
3777                  * and we have committed a transaction when we started the device
3778                  * replace, right after setting up the device replace state.
3779                  */
3780                 if (cache->start < chunk_offset) {
3781                         btrfs_put_block_group(cache);
3782                         goto skip;
3783                 }
3784
3785                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3786                         spin_lock(&cache->lock);
3787                         if (!cache->to_copy) {
3788                                 spin_unlock(&cache->lock);
3789                                 btrfs_put_block_group(cache);
3790                                 goto skip;
3791                         }
3792                         spin_unlock(&cache->lock);
3793                 }
3794
3795                 /*
3796                  * Make sure that while we are scrubbing the corresponding block
3797                  * group doesn't get its logical address and its device extents
3798                  * reused for another block group, which can possibly be of a
3799                  * different type and different profile. We do this to prevent
3800                  * false error detections and crashes due to bogus attempts to
3801                  * repair extents.
3802                  */
3803                 spin_lock(&cache->lock);
3804                 if (cache->removed) {
3805                         spin_unlock(&cache->lock);
3806                         btrfs_put_block_group(cache);
3807                         goto skip;
3808                 }
3809                 btrfs_freeze_block_group(cache);
3810                 spin_unlock(&cache->lock);
3811
3812                 /*
3813                  * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
3814                  * to avoid a deadlock caused by:
3815                  * btrfs_inc_block_group_ro()
3816                  * -> btrfs_wait_for_commit()
3817                  * -> btrfs_commit_transaction()
3818                  * -> btrfs_scrub_pause()
3819                  */
3820                 scrub_pause_on(fs_info);
3821
3822                 /*
3823                  * Don't do chunk preallocation for scrub.
3824                  *
3825                  * This is especially important for SYSTEM bgs, or we can hit
3826                  * -EFBIG from btrfs_finish_chunk_alloc() like:
3827                  * 1. The only SYSTEM bg is marked RO.
3828                  *    Since SYSTEM bg is small, that's pretty common.
3829                  * 2. A new SYSTEM bg will be allocated,
3830                  *    because the regular chunk allocation path allocates one.
3831                  * 3. The new SYSTEM bg is empty and will get cleaned up.
3832                  *    Before the cleanup really happens, it's marked RO again.
3833                  * 4. The empty SYSTEM bg gets scrubbed.
3834                  *    We go back to 2.
3835                  *
3836                  * This can easily boost the number of SYSTEM chunks if the
3837                  * cleaner thread can't be triggered fast enough, and use up
3838                  * all the space of btrfs_super_block::sys_chunk_array.
3839                  *
3840                  * While for dev replace, we need to try our best to mark block
3841                  * group RO, to prevent race between:
3842                  * - Write duplication
3843                  *   Contains latest data
3844                  * - Scrub copy
3845                  *   Contains data from commit tree
3846                  *
3847                  * If target block group is not marked RO, nocow writes can
3848                  * be overwritten by scrub copy, causing data corruption.
3849                  * So for dev-replace, it's not allowed to continue if a block
3850                  * group is not RO.
3851                  */
3852                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3853                 if (!ret && sctx->is_dev_replace) {
3854                         ret = finish_extent_writes_for_zoned(root, cache);
3855                         if (ret) {
3856                                 btrfs_dec_block_group_ro(cache);
3857                                 scrub_pause_off(fs_info);
3858                                 btrfs_put_block_group(cache);
3859                                 break;
3860                         }
3861                 }
3862
3863                 if (ret == 0) {
3864                         ro_set = 1;
3865                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3866                         /*
3867                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3868                          * fails to create a new chunk for metadata.
3869                          * It is not a problem for scrub, because
3870                          * metadata is always COWed, and the scrub is paused
3871                          * during transaction commits.
3872                          */
3873                         ro_set = 0;
3874                 } else if (ret == -ETXTBSY) {
3875                         btrfs_warn(fs_info,
3876                    "skipping scrub of block group %llu due to active swapfile",
3877                                    cache->start);
3878                         scrub_pause_off(fs_info);
3879                         ret = 0;
3880                         goto skip_unfreeze;
3881                 } else {
3882                         btrfs_warn(fs_info,
3883                                    "failed setting block group ro: %d", ret);
3884                         btrfs_unfreeze_block_group(cache);
3885                         btrfs_put_block_group(cache);
3886                         scrub_pause_off(fs_info);
3887                         break;
3888                 }
3889
3890                 /*
3891                  * Now the target block group is marked RO, wait for nocow writes to
3892                  * finish before dev-replace.
3893                  * COW is fine, as COW never overwrites extents in commit tree.
3894                  */
3895                 if (sctx->is_dev_replace) {
3896                         btrfs_wait_nocow_writers(cache);
3897                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3898                                         cache->length);
3899                 }
3900
3901                 scrub_pause_off(fs_info);
3902                 down_write(&dev_replace->rwsem);
3903                 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3904                 dev_replace->cursor_left = found_key.offset;
3905                 dev_replace->item_needs_writeback = 1;
3906                 up_write(&dev_replace->rwsem);
3907
3908                 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3909                                   dev_extent_len);
3910
3911                 /*
3912                  * flush, submit all pending read and write bios, afterwards
3913                  * wait for them.
3914                  * Note that in the dev replace case, a read request causes
3915                  * write requests that are submitted in the read completion
3916                  * worker. Therefore in the current situation, it is required
3917                  * that all write requests are flushed, so that all read and
3918                  * write requests are really completed when bios_in_flight
3919                  * changes to 0.
3920                  */
3921                 sctx->flush_all_writes = true;
3922                 scrub_submit(sctx);
3923                 mutex_lock(&sctx->wr_lock);
3924                 scrub_wr_submit(sctx);
3925                 mutex_unlock(&sctx->wr_lock);
3926
3927                 wait_event(sctx->list_wait,
3928                            atomic_read(&sctx->bios_in_flight) == 0);
3929
3930                 scrub_pause_on(fs_info);
3931
3932                  * This wait must happen before we decrease @scrub_paused, to
3933                  * make sure we don't block transaction commit while we are
3934                  * waiting for pending workers to finish.
3935                  * we are waiting pending workers finished.
3936                  */
3937                 wait_event(sctx->list_wait,
3938                            atomic_read(&sctx->workers_pending) == 0);
3939                 sctx->flush_all_writes = false;
3940
3941                 scrub_pause_off(fs_info);
3942
3943                 if (sctx->is_dev_replace &&
3944                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3945                                                       cache, found_key.offset))
3946                         ro_set = 0;
3947
3948                 down_write(&dev_replace->rwsem);
3949                 dev_replace->cursor_left = dev_replace->cursor_right;
3950                 dev_replace->item_needs_writeback = 1;
3951                 up_write(&dev_replace->rwsem);
3952
3953                 if (ro_set)
3954                         btrfs_dec_block_group_ro(cache);
3955
3956                 /*
3957                  * We might have prevented the cleaner kthread from deleting
3958                  * this block group if it was already unused because we raced
3959                  * and set it to RO mode first. So add it back to the unused
3960                  * list, otherwise it might not ever be deleted unless a manual
3961                  * balance is triggered or it becomes used and unused again.
3962                  */
3963                 spin_lock(&cache->lock);
3964                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3965                     cache->used == 0) {
3966                         spin_unlock(&cache->lock);
3967                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3968                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
3969                                                          cache);
3970                         else
3971                                 btrfs_mark_bg_unused(cache);
3972                 } else {
3973                         spin_unlock(&cache->lock);
3974                 }
3975 skip_unfreeze:
3976                 btrfs_unfreeze_block_group(cache);
3977                 btrfs_put_block_group(cache);
3978                 if (ret)
3979                         break;
3980                 if (sctx->is_dev_replace &&
3981                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3982                         ret = -EIO;
3983                         break;
3984                 }
3985                 if (sctx->stat.malloc_errors > 0) {
3986                         ret = -ENOMEM;
3987                         break;
3988                 }
3989 skip:
3990                 key.offset = found_key.offset + dev_extent_len;
3991                 btrfs_release_path(path);
3992         }
3993
3994         btrfs_free_path(path);
3995
3996         return ret;
3997 }
3998
3999 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
4000                                            struct btrfs_device *scrub_dev)
4001 {
4002         int     i;
4003         u64     bytenr;
4004         u64     gen;
4005         int     ret;
4006         struct btrfs_fs_info *fs_info = sctx->fs_info;
4007
4008         if (BTRFS_FS_ERROR(fs_info))
4009                 return -EROFS;
4010
4011         /* Seed devices of a new filesystem have their own generation. */
4012         if (scrub_dev->fs_devices != fs_info->fs_devices)
4013                 gen = scrub_dev->generation;
4014         else
4015                 gen = fs_info->last_trans_committed;
4016
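             /*
              * Queue a scrub of each super block copy on the device, stopping
              * at the committed device size and skipping locations rejected
              * by btrfs_check_super_location().
              */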
4017         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4018                 bytenr = btrfs_sb_offset(i);
4019                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4020                     scrub_dev->commit_total_bytes)
4021                         break;
4022                 if (!btrfs_check_super_location(scrub_dev, bytenr))
4023                         continue;
4024
4025                 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4026                                     scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4027                                     NULL, bytenr);
4028                 if (ret)
4029                         return ret;
4030         }
4031         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4032
4033         return 0;
4034 }
4035
4036 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4037 {
4038         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4039                                         &fs_info->scrub_lock)) {
4040                 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4041                 struct workqueue_struct *scrub_wr_comp =
4042                                                 fs_info->scrub_wr_completion_workers;
4043                 struct workqueue_struct *scrub_parity =
4044                                                 fs_info->scrub_parity_workers;
4045
4046                 fs_info->scrub_workers = NULL;
4047                 fs_info->scrub_wr_completion_workers = NULL;
4048                 fs_info->scrub_parity_workers = NULL;
4049                 mutex_unlock(&fs_info->scrub_lock);
4050
4051                 if (scrub_workers)
4052                         destroy_workqueue(scrub_workers);
4053                 if (scrub_wr_comp)
4054                         destroy_workqueue(scrub_wr_comp);
4055                 if (scrub_parity)
4056                         destroy_workqueue(scrub_parity);
4057         }
4058 }
4059
4060 /*
4061  * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
4062  */
4063 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4064                                                 int is_dev_replace)
4065 {
4066         struct workqueue_struct *scrub_workers = NULL;
4067         struct workqueue_struct *scrub_wr_comp = NULL;
4068         struct workqueue_struct *scrub_parity = NULL;
4069         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4070         int max_active = fs_info->thread_pool_size;
4071         int ret = -ENOMEM;
4072
4073         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4074                 return 0;
4075
4076         scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4077                                         is_dev_replace ? 1 : max_active);
4078         if (!scrub_workers)
4079                 goto fail_scrub_workers;
4080
4081         scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4082         if (!scrub_wr_comp)
4083                 goto fail_scrub_wr_completion_workers;
4084
4085         scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4086         if (!scrub_parity)
4087                 goto fail_scrub_parity_workers;
4088
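             /*
              * Recheck the refcount under scrub_lock: only the first caller
              * installs its workqueues, later callers just take a reference.
              */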
4089         mutex_lock(&fs_info->scrub_lock);
4090         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4091                 ASSERT(fs_info->scrub_workers == NULL &&
4092                        fs_info->scrub_wr_completion_workers == NULL &&
4093                        fs_info->scrub_parity_workers == NULL);
4094                 fs_info->scrub_workers = scrub_workers;
4095                 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4096                 fs_info->scrub_parity_workers = scrub_parity;
4097                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4098                 mutex_unlock(&fs_info->scrub_lock);
4099                 return 0;
4100         }
4101         /* Other thread raced in and created the workers for us */
4102         refcount_inc(&fs_info->scrub_workers_refcnt);
4103         mutex_unlock(&fs_info->scrub_lock);
4104
4105         ret = 0;
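             /*
              * We lost the race: fall through the error labels to destroy
              * the workqueues we allocated, as they are no longer needed.
              */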
4106         destroy_workqueue(scrub_parity);
4107 fail_scrub_parity_workers:
4108         destroy_workqueue(scrub_wr_comp);
4109 fail_scrub_wr_completion_workers:
4110         destroy_workqueue(scrub_workers);
4111 fail_scrub_workers:
4112         return ret;
4113 }
4114
4115 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4116                     u64 end, struct btrfs_scrub_progress *progress,
4117                     int readonly, int is_dev_replace)
4118 {
4119         struct btrfs_dev_lookup_args args = { .devid = devid };
4120         struct scrub_ctx *sctx;
4121         int ret;
4122         struct btrfs_device *dev;
4123         unsigned int nofs_flag;
4124
4125         if (btrfs_fs_closing(fs_info))
4126                 return -EAGAIN;
4127
4128         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4129                 /*
4130                  * In this case scrub is unable to calculate the checksum
4131                  * the way it is implemented. Do not handle this situation
4132                  * at all because it can never happen.
4133                  */
4134                 btrfs_err(fs_info,
4135                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4136                        fs_info->nodesize,
4137                        BTRFS_STRIPE_LEN);
4138                 return -EINVAL;
4139         }
4140
4141         if (fs_info->nodesize >
4142             SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4143             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4144                 /*
4145                  * Would exhaust the array bounds of the sectors member in
4146                  * struct scrub_block.
4147                  */
4148                 btrfs_err(fs_info,
4149 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4150                        fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4151                        fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4152                 return -EINVAL;
4153         }
4154
4155         /* Allocate outside of device_list_mutex */
4156         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4157         if (IS_ERR(sctx))
4158                 return PTR_ERR(sctx);
4159
4160         ret = scrub_workers_get(fs_info, is_dev_replace);
4161         if (ret)
4162                 goto out_free_ctx;
4163
4164         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4165         dev = btrfs_find_device(fs_info->fs_devices, &args);
4166         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4167                      !is_dev_replace)) {
4168                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169                 ret = -ENODEV;
4170                 goto out;
4171         }
4172
4173         if (!is_dev_replace && !readonly &&
4174             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4175                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4176                 btrfs_err_in_rcu(fs_info,
4177                         "scrub on devid %llu: filesystem on %s is not writable",
4178                                  devid, rcu_str_deref(dev->name));
4179                 ret = -EROFS;
4180                 goto out;
4181         }
4182
4183         mutex_lock(&fs_info->scrub_lock);
4184         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4185             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4186                 mutex_unlock(&fs_info->scrub_lock);
4187                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4188                 ret = -EIO;
4189                 goto out;
4190         }
4191
4192         down_read(&fs_info->dev_replace.rwsem);
4193         if (dev->scrub_ctx ||
4194             (!is_dev_replace &&
4195              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4196                 up_read(&fs_info->dev_replace.rwsem);
4197                 mutex_unlock(&fs_info->scrub_lock);
4198                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4199                 ret = -EINPROGRESS;
4200                 goto out;
4201         }
4202         up_read(&fs_info->dev_replace.rwsem);
4203
4204         sctx->readonly = readonly;
4205         dev->scrub_ctx = sctx;
4206         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4207
4208         /*
4209          * By checking @scrub_pause_req here we can avoid a race between
4210          * transaction commit and scrubbing.
4211          */
4212         __scrub_blocked_if_needed(fs_info);
4213         atomic_inc(&fs_info->scrubs_running);
4214         mutex_unlock(&fs_info->scrub_lock);
4215
4216         /*
4217          * In order to avoid deadlock with reclaim when there is a transaction
4218          * trying to pause scrub, make sure we use GFP_NOFS for all the
4219          * allocations done at scrub_sectors() and scrub_sectors_for_parity()
4220          * invoked by our callees. The pausing request is done when the
4221          * transaction commit starts, and it blocks the transaction until scrub
4222          * is paused (done at specific points in scrub_stripe() or right above,
4223          * before incrementing fs_info->scrubs_running).
4224          */
4225         nofs_flag = memalloc_nofs_save();
4226         if (!is_dev_replace) {
4227                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4228                 /*
4229                  * By holding the device list mutex we serialize against
4230                  * super block writes kicked off by a log tree sync.
4231                  */
4232                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4233                 ret = scrub_supers(sctx, dev);
4234                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4235         }
4236
4237         if (!ret)
4238                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4239         memalloc_nofs_restore(nofs_flag);
4240
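             /*
              * Wait for any remaining in-flight bios, drop our running
              * accounting and then wait for the worker items that those bios
              * may still have queued.
              */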
4241         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4242         atomic_dec(&fs_info->scrubs_running);
4243         wake_up(&fs_info->scrub_pause_wait);
4244
4245         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4246
4247         if (progress)
4248                 memcpy(progress, &sctx->stat, sizeof(*progress));
4249
4250         if (!is_dev_replace)
4251                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4252                         ret ? "not finished" : "finished", devid, ret);
4253
4254         mutex_lock(&fs_info->scrub_lock);
4255         dev->scrub_ctx = NULL;
4256         mutex_unlock(&fs_info->scrub_lock);
4257
4258         scrub_workers_put(fs_info);
4259         scrub_put_ctx(sctx);
4260
4261         return ret;
4262 out:
4263         scrub_workers_put(fs_info);
4264 out_free_ctx:
4265         scrub_free_ctx(sctx);
4266
4267         return ret;
4268 }
4269
4270 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4271 {
4272         mutex_lock(&fs_info->scrub_lock);
4273         atomic_inc(&fs_info->scrub_pause_req);
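             /*
              * Wait until every running scrub has reached one of its pause
              * points and accounted itself as paused.
              */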
4274         while (atomic_read(&fs_info->scrubs_paused) !=
4275                atomic_read(&fs_info->scrubs_running)) {
4276                 mutex_unlock(&fs_info->scrub_lock);
4277                 wait_event(fs_info->scrub_pause_wait,
4278                            atomic_read(&fs_info->scrubs_paused) ==
4279                            atomic_read(&fs_info->scrubs_running));
4280                 mutex_lock(&fs_info->scrub_lock);
4281         }
4282         mutex_unlock(&fs_info->scrub_lock);
4283 }
4284
4285 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4286 {
4287         atomic_dec(&fs_info->scrub_pause_req);
4288         wake_up(&fs_info->scrub_pause_wait);
4289 }
4290
4291 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4292 {
4293         mutex_lock(&fs_info->scrub_lock);
4294         if (!atomic_read(&fs_info->scrubs_running)) {
4295                 mutex_unlock(&fs_info->scrub_lock);
4296                 return -ENOTCONN;
4297         }
4298
4299         atomic_inc(&fs_info->scrub_cancel_req);
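             /*
              * Running scrubs check the cancel request and abort; wait here
              * until all of them have finished.
              */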
4300         while (atomic_read(&fs_info->scrubs_running)) {
4301                 mutex_unlock(&fs_info->scrub_lock);
4302                 wait_event(fs_info->scrub_pause_wait,
4303                            atomic_read(&fs_info->scrubs_running) == 0);
4304                 mutex_lock(&fs_info->scrub_lock);
4305         }
4306         atomic_dec(&fs_info->scrub_cancel_req);
4307         mutex_unlock(&fs_info->scrub_lock);
4308
4309         return 0;
4310 }
4311
4312 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4313 {
4314         struct btrfs_fs_info *fs_info = dev->fs_info;
4315         struct scrub_ctx *sctx;
4316
4317         mutex_lock(&fs_info->scrub_lock);
4318         sctx = dev->scrub_ctx;
4319         if (!sctx) {
4320                 mutex_unlock(&fs_info->scrub_lock);
4321                 return -ENOTCONN;
4322         }
4323         atomic_inc(&sctx->cancel_req);
4324         while (dev->scrub_ctx) {
4325                 mutex_unlock(&fs_info->scrub_lock);
4326                 wait_event(fs_info->scrub_pause_wait,
4327                            dev->scrub_ctx == NULL);
4328                 mutex_lock(&fs_info->scrub_lock);
4329         }
4330         mutex_unlock(&fs_info->scrub_lock);
4331
4332         return 0;
4333 }
4334
4335 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4336                          struct btrfs_scrub_progress *progress)
4337 {
4338         struct btrfs_dev_lookup_args args = { .devid = devid };
4339         struct btrfs_device *dev;
4340         struct scrub_ctx *sctx = NULL;
4341
4342         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4343         dev = btrfs_find_device(fs_info->fs_devices, &args);
4344         if (dev)
4345                 sctx = dev->scrub_ctx;
4346         if (sctx)
4347                 memcpy(progress, &sctx->stat, sizeof(*progress));
4348         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4349
4350         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4351 }
4352
4353 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4354                                  u64 extent_logical, u32 extent_len,
4355                                  u64 *extent_physical,
4356                                  struct btrfs_device **extent_dev,
4357                                  int *extent_mirror_num)
4358 {
4359         u64 mapped_length;
4360         struct btrfs_io_context *bioc = NULL;
4361         int ret;
4362
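             /*
              * Map the logical range and report the physical location, device
              * and mirror number of the first stripe. Bail out silently if the
              * mapping fails, is shorter than the extent, or the first stripe
              * has no usable block device.
              */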
4363         mapped_length = extent_len;
4364         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4365                               &mapped_length, &bioc, 0);
4366         if (ret || !bioc || mapped_length < extent_len ||
4367             !bioc->stripes[0].dev->bdev) {
4368                 btrfs_put_bioc(bioc);
4369                 return;
4370         }
4371
4372         *extent_physical = bioc->stripes[0].physical;
4373         *extent_mirror_num = bioc->mirror_num;
4374         *extent_dev = bioc->stripes[0].dev;
4375         btrfs_put_bioc(bioc);
4376 }