fs/btrfs/scrub.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
21 #include "raid56.h"
22 #include "block-group.h"
23 #include "zoned.h"
24
25 /*
26  * This is only the first step towards a full-featured scrub. It reads all
27  * extents and super blocks and verifies the checksums. In case a bad checksum
28  * is found or the extent cannot be read, good data will be written back if
29  * any can be found.
30  *
31  * Future enhancements:
32  *  - In case an unrepairable extent is encountered, track which files are
33  *    affected and report them
34  *  - track and record media errors, throw out bad devices
35  *  - add a mode to also read unallocated space
36  */
37
38 struct scrub_block;
39 struct scrub_ctx;
40
41 /*
42  * The following two values only influence the performance.
43  *
44  * The second one configures the number of parallel and outstanding I/O
45  * operations. The first one configures an upper limit for the number
46  * of (dynamically allocated) pages that are added to a bio.
47  */
48 #define SCRUB_SECTORS_PER_BIO   32      /* 128KiB per bio for 4KiB pages */
49 #define SCRUB_BIOS_PER_SCTX     64      /* 8MiB per device in flight for 4KiB pages */
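
/*
 * Worked out for 4KiB sectors (illustrative only): 32 sectors * 4KiB = 128KiB
 * per bio, and 64 bios * 128KiB = 8MiB of scrub I/O in flight per device,
 * matching the notes next to the two defines above.
 */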
50
51 /*
52  * The following value times 4KiB (the minimum sector size) needs to be large
53  * enough to match the largest node/leaf/sector size that shall be supported.
54  */
55 #define SCRUB_MAX_SECTORS_PER_BLOCK     (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
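
/*
 * For example (assuming the usual 64KiB BTRFS_MAX_METADATA_BLOCKSIZE), this
 * evaluates to 16, i.e. a scrub_block holds at most sixteen 4KiB sectors.
 */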
56
57 struct scrub_recover {
58         refcount_t              refs;
59         struct btrfs_io_context *bioc;
60         u64                     map_length;
61 };
62
63 struct scrub_sector {
64         struct scrub_block      *sblock;
65         struct page             *page;
66         struct btrfs_device     *dev;
67         struct list_head        list;
68         u64                     flags;  /* extent flags */
69         u64                     generation;
70         u64                     logical;
71         u64                     physical;
72         u64                     physical_for_dev_replace;
73         atomic_t                refs;
74         u8                      mirror_num;
75         unsigned int            have_csum:1;
76         unsigned int            io_error:1;
77         u8                      csum[BTRFS_CSUM_SIZE];
78
79         struct scrub_recover    *recover;
80 };
81
82 struct scrub_bio {
83         int                     index;
84         struct scrub_ctx        *sctx;
85         struct btrfs_device     *dev;
86         struct bio              *bio;
87         blk_status_t            status;
88         u64                     logical;
89         u64                     physical;
90         struct scrub_sector     *sectors[SCRUB_SECTORS_PER_BIO];
91         int                     sector_count;
92         int                     next_free;
93         struct work_struct      work;
94 };
95
96 struct scrub_block {
97         struct scrub_sector     *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
98         int                     sector_count;
99         atomic_t                outstanding_sectors;
100         refcount_t              refs; /* free mem on transition to zero */
101         struct scrub_ctx        *sctx;
102         struct scrub_parity     *sparity;
103         struct {
104                 unsigned int    header_error:1;
105                 unsigned int    checksum_error:1;
106                 unsigned int    no_io_error_seen:1;
107                 unsigned int    generation_error:1; /* also sets header_error */
108
109                 /* The following is for the data used to check parity. */
110                 /* It only applies to data with a checksum. */
111                 unsigned int    data_corrected:1;
112         };
113         struct work_struct      work;
114 };
115
116 /* Used for the chunks with parity stripes such as RAID5/6 */
117 struct scrub_parity {
118         struct scrub_ctx        *sctx;
119
120         struct btrfs_device     *scrub_dev;
121
122         u64                     logic_start;
123
124         u64                     logic_end;
125
126         int                     nsectors;
127
128         u32                     stripe_len;
129
130         refcount_t              refs;
131
132         struct list_head        sectors_list;
133
134         /* Work of parity check and repair */
135         struct work_struct      work;
136
137         /* Mark the parity blocks which have data */
138         unsigned long           dbitmap;
139
140         /*
141          * Mark the parity blocks which have data, but where errors happened
142          * when reading or checking the data
143          */
144         unsigned long           ebitmap;
145 };
146
147 struct scrub_ctx {
148         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
149         struct btrfs_fs_info    *fs_info;
150         int                     first_free;
151         int                     curr;
152         atomic_t                bios_in_flight;
153         atomic_t                workers_pending;
154         spinlock_t              list_lock;
155         wait_queue_head_t       list_wait;
156         struct list_head        csum_list;
157         atomic_t                cancel_req;
158         int                     readonly;
159         int                     sectors_per_bio;
160
161         /* State of IO submission throttling affecting the associated device */
162         ktime_t                 throttle_deadline;
163         u64                     throttle_sent;
164
165         int                     is_dev_replace;
166         u64                     write_pointer;
167
168         struct scrub_bio        *wr_curr_bio;
169         struct mutex            wr_lock;
170         struct btrfs_device     *wr_tgtdev;
171         bool                    flush_all_writes;
172
173         /*
174          * statistics
175          */
176         struct btrfs_scrub_progress stat;
177         spinlock_t              stat_lock;
178
179         /*
180          * Use a ref counter to avoid use-after-free issues. Scrub workers
181          * decrement bios_in_flight and workers_pending and then do a wakeup
182          * on the list_wait wait queue. We must ensure the main scrub task
183          * doesn't free the scrub context before or while the workers are
184          * doing the wakeup() call.
185          */
186         refcount_t              refs;
187 };
188
189 struct scrub_warning {
190         struct btrfs_path       *path;
191         u64                     extent_item_size;
192         const char              *errstr;
193         u64                     physical;
194         u64                     logical;
195         struct btrfs_device     *dev;
196 };
197
198 struct full_stripe_lock {
199         struct rb_node node;
200         u64 logical;
201         u64 refs;
202         struct mutex mutex;
203 };
204
205 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
206                                      struct scrub_block *sblocks_for_recheck);
207 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
208                                 struct scrub_block *sblock,
209                                 int retry_failed_mirror);
210 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
211 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212                                              struct scrub_block *sblock_good);
213 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
214                                             struct scrub_block *sblock_good,
215                                             int sector_num, int force_write);
216 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
217 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
218                                              int sector_num);
219 static int scrub_checksum_data(struct scrub_block *sblock);
220 static int scrub_checksum_tree_block(struct scrub_block *sblock);
221 static int scrub_checksum_super(struct scrub_block *sblock);
222 static void scrub_block_put(struct scrub_block *sblock);
223 static void scrub_sector_get(struct scrub_sector *sector);
224 static void scrub_sector_put(struct scrub_sector *sector);
225 static void scrub_parity_get(struct scrub_parity *sparity);
226 static void scrub_parity_put(struct scrub_parity *sparity);
227 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
228                          u64 physical, struct btrfs_device *dev, u64 flags,
229                          u64 gen, int mirror_num, u8 *csum,
230                          u64 physical_for_dev_replace);
231 static void scrub_bio_end_io(struct bio *bio);
232 static void scrub_bio_end_io_worker(struct work_struct *work);
233 static void scrub_block_complete(struct scrub_block *sblock);
234 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
235                                  u64 extent_logical, u32 extent_len,
236                                  u64 *extent_physical,
237                                  struct btrfs_device **extent_dev,
238                                  int *extent_mirror_num);
239 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
240                                       struct scrub_sector *sector);
241 static void scrub_wr_submit(struct scrub_ctx *sctx);
242 static void scrub_wr_bio_end_io(struct bio *bio);
243 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
244 static void scrub_put_ctx(struct scrub_ctx *sctx);
245
246 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
247 {
248         return sector->recover &&
249                (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
250 }
251
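/*
 * Each in-flight bio pins the scrub context: the inc helper below takes an
 * extra reference on sctx which the matching dec helper drops only after its
 * wake_up(), so the main scrub task cannot free the context while a worker
 * is still waking it (see the comment above the refs member of scrub_ctx).
 */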
252 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
253 {
254         refcount_inc(&sctx->refs);
255         atomic_inc(&sctx->bios_in_flight);
256 }
257
258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259 {
260         atomic_dec(&sctx->bios_in_flight);
261         wake_up(&sctx->list_wait);
262         scrub_put_ctx(sctx);
263 }
264
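/*
 * Pause protocol: whoever wants scrub to pause raises
 * fs_info->scrub_pause_req.  The helpers below account this scrub as paused
 * (scrubs_paused), wake any waiters, and block until the pause request is
 * dropped again before resuming.
 */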
265 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
266 {
267         while (atomic_read(&fs_info->scrub_pause_req)) {
268                 mutex_unlock(&fs_info->scrub_lock);
269                 wait_event(fs_info->scrub_pause_wait,
270                    atomic_read(&fs_info->scrub_pause_req) == 0);
271                 mutex_lock(&fs_info->scrub_lock);
272         }
273 }
274
275 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
276 {
277         atomic_inc(&fs_info->scrubs_paused);
278         wake_up(&fs_info->scrub_pause_wait);
279 }
280
281 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
282 {
283         mutex_lock(&fs_info->scrub_lock);
284         __scrub_blocked_if_needed(fs_info);
285         atomic_dec(&fs_info->scrubs_paused);
286         mutex_unlock(&fs_info->scrub_lock);
287
288         wake_up(&fs_info->scrub_pause_wait);
289 }
290
291 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
292 {
293         scrub_pause_on(fs_info);
294         scrub_pause_off(fs_info);
295 }
296
297 /*
298  * Insert a new full stripe lock into the full stripe locks tree
299  *
300  * Return pointer to existing or newly inserted full_stripe_lock structure if
301  * everything works well.
302  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
303  *
304  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
305  * function
306  */
307 static struct full_stripe_lock *insert_full_stripe_lock(
308                 struct btrfs_full_stripe_locks_tree *locks_root,
309                 u64 fstripe_logical)
310 {
311         struct rb_node **p;
312         struct rb_node *parent = NULL;
313         struct full_stripe_lock *entry;
314         struct full_stripe_lock *ret;
315
316         lockdep_assert_held(&locks_root->lock);
317
318         p = &locks_root->root.rb_node;
319         while (*p) {
320                 parent = *p;
321                 entry = rb_entry(parent, struct full_stripe_lock, node);
322                 if (fstripe_logical < entry->logical) {
323                         p = &(*p)->rb_left;
324                 } else if (fstripe_logical > entry->logical) {
325                         p = &(*p)->rb_right;
326                 } else {
327                         entry->refs++;
328                         return entry;
329                 }
330         }
331
332         /*
333          * Insert new lock.
334          */
335         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
336         if (!ret)
337                 return ERR_PTR(-ENOMEM);
338         ret->logical = fstripe_logical;
339         ret->refs = 1;
340         mutex_init(&ret->mutex);
341
342         rb_link_node(&ret->node, parent, p);
343         rb_insert_color(&ret->node, &locks_root->root);
344         return ret;
345 }
346
347 /*
348  * Search for a full stripe lock of a block group
349  *
350  * Return pointer to existing full stripe lock if found
351  * Return NULL if not found
352  */
353 static struct full_stripe_lock *search_full_stripe_lock(
354                 struct btrfs_full_stripe_locks_tree *locks_root,
355                 u64 fstripe_logical)
356 {
357         struct rb_node *node;
358         struct full_stripe_lock *entry;
359
360         lockdep_assert_held(&locks_root->lock);
361
362         node = locks_root->root.rb_node;
363         while (node) {
364                 entry = rb_entry(node, struct full_stripe_lock, node);
365                 if (fstripe_logical < entry->logical)
366                         node = node->rb_left;
367                 else if (fstripe_logical > entry->logical)
368                         node = node->rb_right;
369                 else
370                         return entry;
371         }
372         return NULL;
373 }
374
375 /*
376  * Helper to get full stripe logical from a normal bytenr.
377  *
378  * Caller must ensure @cache is a RAID56 block group.
379  */
380 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
381 {
382         u64 ret;
383
384         /*
385          * Due to chunk item size limit, full stripe length should not be
386          * larger than U32_MAX. Just a sanity check here.
387          */
388         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
389
390         /*
391          * round_down() can only handle power of 2, while RAID56 full
392          * stripe length can be 64KiB * n, so we need to manually round down.
393          */
394         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
395                         cache->full_stripe_len + cache->start;
396         return ret;
397 }
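
/*
 * Illustrative example (assumed values): a RAID5 chunk with three data
 * stripes and a 64KiB stripe length has full_stripe_len == 192KiB, which is
 * not a power of two.  With cache->start == 1GiB and bytenr == 1GiB + 200KiB
 * the division yields 1, so the returned full stripe start is 1GiB + 192KiB.
 */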
398
399 /*
400  * Lock a full stripe to avoid concurrency between recovery and read
401  *
402  * It's only used for profiles with parity (RAID5/6); for other profiles it
403  * does nothing.
404  *
405  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
406  * The caller must then call unlock_full_stripe() in the same context.
407  *
408  * Return <0 if an error is encountered.
409  */
410 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
411                             bool *locked_ret)
412 {
413         struct btrfs_block_group *bg_cache;
414         struct btrfs_full_stripe_locks_tree *locks_root;
415         struct full_stripe_lock *existing;
416         u64 fstripe_start;
417         int ret = 0;
418
419         *locked_ret = false;
420         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
421         if (!bg_cache) {
422                 ASSERT(0);
423                 return -ENOENT;
424         }
425
426         /* Profiles not based on parity don't need full stripe lock */
427         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
428                 goto out;
429         locks_root = &bg_cache->full_stripe_locks_root;
430
431         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
432
433         /* Now insert the full stripe lock */
434         mutex_lock(&locks_root->lock);
435         existing = insert_full_stripe_lock(locks_root, fstripe_start);
436         mutex_unlock(&locks_root->lock);
437         if (IS_ERR(existing)) {
438                 ret = PTR_ERR(existing);
439                 goto out;
440         }
441         mutex_lock(&existing->mutex);
442         *locked_ret = true;
443 out:
444         btrfs_put_block_group(bg_cache);
445         return ret;
446 }
447
448 /*
449  * Unlock a full stripe.
450  *
451  * NOTE: The caller must ensure this runs in the same context as the
452  * corresponding lock_full_stripe().
453  *
454  * Return 0 if we unlocked the full stripe without problems.
455  * Return <0 on error.
456  */
457 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
458                               bool locked)
459 {
460         struct btrfs_block_group *bg_cache;
461         struct btrfs_full_stripe_locks_tree *locks_root;
462         struct full_stripe_lock *fstripe_lock;
463         u64 fstripe_start;
464         bool freeit = false;
465         int ret = 0;
466
467         /* If we didn't acquire full stripe lock, no need to continue */
468         if (!locked)
469                 return 0;
470
471         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
472         if (!bg_cache) {
473                 ASSERT(0);
474                 return -ENOENT;
475         }
476         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
477                 goto out;
478
479         locks_root = &bg_cache->full_stripe_locks_root;
480         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
481
482         mutex_lock(&locks_root->lock);
483         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
484         /* Unpaired unlock_full_stripe() detected */
485         if (!fstripe_lock) {
486                 WARN_ON(1);
487                 ret = -ENOENT;
488                 mutex_unlock(&locks_root->lock);
489                 goto out;
490         }
491
492         if (fstripe_lock->refs == 0) {
493                 WARN_ON(1);
494                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
495                         fstripe_lock->logical);
496         } else {
497                 fstripe_lock->refs--;
498         }
499
500         if (fstripe_lock->refs == 0) {
501                 rb_erase(&fstripe_lock->node, &locks_root->root);
502                 freeit = true;
503         }
504         mutex_unlock(&locks_root->lock);
505
506         mutex_unlock(&fstripe_lock->mutex);
507         if (freeit)
508                 kfree(fstripe_lock);
509 out:
510         btrfs_put_block_group(bg_cache);
511         return ret;
512 }
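
/*
 * Typical pairing of the two helpers above (sketch only, mirroring what
 * scrub_handle_errored_block() does further below):
 *
 *	bool locked = false;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	... recheck/repair the block while the full stripe is locked ...
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 *
 * For non-RAID56 block groups both calls boil down to a block group lookup
 * and are effectively no-ops.
 */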
513
514 static void scrub_free_csums(struct scrub_ctx *sctx)
515 {
516         while (!list_empty(&sctx->csum_list)) {
517                 struct btrfs_ordered_sum *sum;
518                 sum = list_first_entry(&sctx->csum_list,
519                                        struct btrfs_ordered_sum, list);
520                 list_del(&sum->list);
521                 kfree(sum);
522         }
523 }
524
525 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
526 {
527         int i;
528
529         if (!sctx)
530                 return;
531
532         /* this can happen when scrub is cancelled */
533         if (sctx->curr != -1) {
534                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
535
536                 for (i = 0; i < sbio->sector_count; i++) {
537                         WARN_ON(!sbio->sectors[i]->page);
538                         scrub_block_put(sbio->sectors[i]->sblock);
539                 }
540                 bio_put(sbio->bio);
541         }
542
543         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
544                 struct scrub_bio *sbio = sctx->bios[i];
545
546                 if (!sbio)
547                         break;
548                 kfree(sbio);
549         }
550
551         kfree(sctx->wr_curr_bio);
552         scrub_free_csums(sctx);
553         kfree(sctx);
554 }
555
556 static void scrub_put_ctx(struct scrub_ctx *sctx)
557 {
558         if (refcount_dec_and_test(&sctx->refs))
559                 scrub_free_ctx(sctx);
560 }
561
562 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
563                 struct btrfs_fs_info *fs_info, int is_dev_replace)
564 {
565         struct scrub_ctx *sctx;
566         int             i;
567
568         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
569         if (!sctx)
570                 goto nomem;
571         refcount_set(&sctx->refs, 1);
572         sctx->is_dev_replace = is_dev_replace;
573         sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
574         sctx->curr = -1;
575         sctx->fs_info = fs_info;
576         INIT_LIST_HEAD(&sctx->csum_list);
577         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
578                 struct scrub_bio *sbio;
579
580                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
581                 if (!sbio)
582                         goto nomem;
583                 sctx->bios[i] = sbio;
584
585                 sbio->index = i;
586                 sbio->sctx = sctx;
587                 sbio->sector_count = 0;
588                 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
589
590                 if (i != SCRUB_BIOS_PER_SCTX - 1)
591                         sctx->bios[i]->next_free = i + 1;
592                 else
593                         sctx->bios[i]->next_free = -1;
594         }
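        /*
         * bios[] now forms a singly linked free list: each entry's next_free
         * holds the index of the next free bio, the last entry terminates the
         * list with -1, and first_free (set below) is its head.
         */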
595         sctx->first_free = 0;
596         atomic_set(&sctx->bios_in_flight, 0);
597         atomic_set(&sctx->workers_pending, 0);
598         atomic_set(&sctx->cancel_req, 0);
599
600         spin_lock_init(&sctx->list_lock);
601         spin_lock_init(&sctx->stat_lock);
602         init_waitqueue_head(&sctx->list_wait);
603         sctx->throttle_deadline = 0;
604
605         WARN_ON(sctx->wr_curr_bio != NULL);
606         mutex_init(&sctx->wr_lock);
607         sctx->wr_curr_bio = NULL;
608         if (is_dev_replace) {
609                 WARN_ON(!fs_info->dev_replace.tgtdev);
610                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
611                 sctx->flush_all_writes = false;
612         }
613
614         return sctx;
615
616 nomem:
617         scrub_free_ctx(sctx);
618         return ERR_PTR(-ENOMEM);
619 }
620
621 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
622                                      void *warn_ctx)
623 {
624         u32 nlink;
625         int ret;
626         int i;
627         unsigned nofs_flag;
628         struct extent_buffer *eb;
629         struct btrfs_inode_item *inode_item;
630         struct scrub_warning *swarn = warn_ctx;
631         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
632         struct inode_fs_paths *ipath = NULL;
633         struct btrfs_root *local_root;
634         struct btrfs_key key;
635
636         local_root = btrfs_get_fs_root(fs_info, root, true);
637         if (IS_ERR(local_root)) {
638                 ret = PTR_ERR(local_root);
639                 goto err;
640         }
641
642         /*
643          * this makes the path point to (inum INODE_ITEM ioff)
644          */
645         key.objectid = inum;
646         key.type = BTRFS_INODE_ITEM_KEY;
647         key.offset = 0;
648
649         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
650         if (ret) {
651                 btrfs_put_root(local_root);
652                 btrfs_release_path(swarn->path);
653                 goto err;
654         }
655
656         eb = swarn->path->nodes[0];
657         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
658                                         struct btrfs_inode_item);
659         nlink = btrfs_inode_nlink(eb, inode_item);
660         btrfs_release_path(swarn->path);
661
662         /*
663          * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
664          * uses GFP_NOFS in this context, so we keep it consistent but it does
665          * not seem to be strictly necessary.
666          */
667         nofs_flag = memalloc_nofs_save();
668         ipath = init_ipath(4096, local_root, swarn->path);
669         memalloc_nofs_restore(nofs_flag);
670         if (IS_ERR(ipath)) {
671                 btrfs_put_root(local_root);
672                 ret = PTR_ERR(ipath);
673                 ipath = NULL;
674                 goto err;
675         }
676         ret = paths_from_inode(inum, ipath);
677
678         if (ret < 0)
679                 goto err;
680
681         /*
682          * we deliberately ignore the fact that ipath might have been too small to
683          * hold all of the paths here
684          */
685         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
686                 btrfs_warn_in_rcu(fs_info,
687 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
688                                   swarn->errstr, swarn->logical,
689                                   rcu_str_deref(swarn->dev->name),
690                                   swarn->physical,
691                                   root, inum, offset,
692                                   fs_info->sectorsize, nlink,
693                                   (char *)(unsigned long)ipath->fspath->val[i]);
694
695         btrfs_put_root(local_root);
696         free_ipath(ipath);
697         return 0;
698
699 err:
700         btrfs_warn_in_rcu(fs_info,
701                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
702                           swarn->errstr, swarn->logical,
703                           rcu_str_deref(swarn->dev->name),
704                           swarn->physical,
705                           root, inum, offset, ret);
706
707         free_ipath(ipath);
708         return 0;
709 }
710
711 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
712 {
713         struct btrfs_device *dev;
714         struct btrfs_fs_info *fs_info;
715         struct btrfs_path *path;
716         struct btrfs_key found_key;
717         struct extent_buffer *eb;
718         struct btrfs_extent_item *ei;
719         struct scrub_warning swarn;
720         unsigned long ptr = 0;
721         u64 extent_item_pos;
722         u64 flags = 0;
723         u64 ref_root;
724         u32 item_size;
725         u8 ref_level = 0;
726         int ret;
727
728         WARN_ON(sblock->sector_count < 1);
729         dev = sblock->sectors[0]->dev;
730         fs_info = sblock->sctx->fs_info;
731
732         path = btrfs_alloc_path();
733         if (!path)
734                 return;
735
736         swarn.physical = sblock->sectors[0]->physical;
737         swarn.logical = sblock->sectors[0]->logical;
738         swarn.errstr = errstr;
739         swarn.dev = NULL;
740
741         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
742                                   &flags);
743         if (ret < 0)
744                 goto out;
745
746         extent_item_pos = swarn.logical - found_key.objectid;
747         swarn.extent_item_size = found_key.offset;
748
749         eb = path->nodes[0];
750         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
751         item_size = btrfs_item_size(eb, path->slots[0]);
752
753         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
754                 do {
755                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
756                                                       item_size, &ref_root,
757                                                       &ref_level);
758                         btrfs_warn_in_rcu(fs_info,
759 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
760                                 errstr, swarn.logical,
761                                 rcu_str_deref(dev->name),
762                                 swarn.physical,
763                                 ref_level ? "node" : "leaf",
764                                 ret < 0 ? -1 : ref_level,
765                                 ret < 0 ? -1 : ref_root);
766                 } while (ret != 1);
767                 btrfs_release_path(path);
768         } else {
769                 btrfs_release_path(path);
770                 swarn.path = path;
771                 swarn.dev = dev;
772                 iterate_extent_inodes(fs_info, found_key.objectid,
773                                         extent_item_pos, 1,
774                                         scrub_print_warning_inode, &swarn, false);
775         }
776
777 out:
778         btrfs_free_path(path);
779 }
780
781 static inline void scrub_get_recover(struct scrub_recover *recover)
782 {
783         refcount_inc(&recover->refs);
784 }
785
786 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
787                                      struct scrub_recover *recover)
788 {
789         if (refcount_dec_and_test(&recover->refs)) {
790                 btrfs_bio_counter_dec(fs_info);
791                 btrfs_put_bioc(recover->bioc);
792                 kfree(recover);
793         }
794 }
795
796 /*
797  * scrub_handle_errored_block gets called when either verification of the
798  * sectors failed or the bio failed to read, e.g. with EIO. In the latter
799  * case, this function handles all sectors in the bio, even though only one
800  * may be bad.
801  * The goal of this function is to repair the errored block by using the
802  * contents of one of the mirrors.
803  */
804 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
805 {
806         struct scrub_ctx *sctx = sblock_to_check->sctx;
807         struct btrfs_device *dev;
808         struct btrfs_fs_info *fs_info;
809         u64 logical;
810         unsigned int failed_mirror_index;
811         unsigned int is_metadata;
812         unsigned int have_csum;
813         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
814         struct scrub_block *sblock_bad;
815         int ret;
816         int mirror_index;
817         int sector_num;
818         int success;
819         bool full_stripe_locked;
820         unsigned int nofs_flag;
821         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
822                                       DEFAULT_RATELIMIT_BURST);
823
824         BUG_ON(sblock_to_check->sector_count < 1);
825         fs_info = sctx->fs_info;
826         if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
827                 /*
828                  * If we find an error in a super block, we just report it.
829                  * It will get written with the next transaction commit
830                  * anyway.
831                  */
832                 spin_lock(&sctx->stat_lock);
833                 ++sctx->stat.super_errors;
834                 spin_unlock(&sctx->stat_lock);
835                 return 0;
836         }
837         logical = sblock_to_check->sectors[0]->logical;
838         BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
839         failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
840         is_metadata = !(sblock_to_check->sectors[0]->flags &
841                         BTRFS_EXTENT_FLAG_DATA);
842         have_csum = sblock_to_check->sectors[0]->have_csum;
843         dev = sblock_to_check->sectors[0]->dev;
844
845         if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
846                 return 0;
847
848         /*
849          * We must use GFP_NOFS because the scrub task might be waiting for a
850          * worker task executing this function and in turn a transaction commit
851          * might be waiting for the scrub task to pause (which needs to wait for all
852          * the worker tasks to complete before pausing).
853          * We do allocations in the workers through insert_full_stripe_lock()
854          * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
855          * this function.
856          */
857         nofs_flag = memalloc_nofs_save();
858         /*
859          * For RAID5/6, a race can happen with the scrub thread of a different
860          * device. On data corruption, the parity and data threads will both
861          * try to recover the data.
862          * The race can lead to doubly counted csum errors, or even an
863          * unrecoverable error.
864          */
865         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
866         if (ret < 0) {
867                 memalloc_nofs_restore(nofs_flag);
868                 spin_lock(&sctx->stat_lock);
869                 if (ret == -ENOMEM)
870                         sctx->stat.malloc_errors++;
871                 sctx->stat.read_errors++;
872                 sctx->stat.uncorrectable_errors++;
873                 spin_unlock(&sctx->stat_lock);
874                 return ret;
875         }
876
877         /*
878          * Read all mirrors one after the other. This includes re-reading
879          * the extent or metadata block that failed (which was the reason
880          * this fixup code is called) another time,
881          * sector by sector this time in order to know which sectors
882          * caused I/O errors and which ones are good (for all mirrors).
883          * It is the goal to handle the situation when more than one
884          * mirror contains I/O errors, but the errors do not
885          * overlap, i.e. the data can be repaired by selecting the
886          * sectors from those mirrors without I/O error on the
887          * particular sectors. One example (with blocks >= 2 * sectorsize)
888          * would be that mirror #1 has an I/O error on the first sector,
889          * the second sector is good, and mirror #2 has an I/O error on
890          * the second sector, but the first sector is good.
891          * Then the first sector of the first mirror can be repaired by
892          * taking the first sector of the second mirror, and the
893          * second sector of the second mirror can be repaired by
894          * copying the contents of the 2nd sector of the 1st mirror.
895          * One more note: if the sectors of one mirror contain I/O
896          * errors, the checksum cannot be verified. In order to get
897          * the best data for repairing, the first attempt is to find
898          * a mirror without I/O errors and with a validated checksum.
899          * Only if this is not possible, the sectors are picked from
900          * mirrors with I/O errors without considering the checksum.
901          * If the latter is the case, at the end, the checksum of the
902          * repaired area is verified in order to correctly maintain
903          * the statistics.
904          */
905
906         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
907                                       sizeof(*sblocks_for_recheck), GFP_KERNEL);
908         if (!sblocks_for_recheck) {
909                 spin_lock(&sctx->stat_lock);
910                 sctx->stat.malloc_errors++;
911                 sctx->stat.read_errors++;
912                 sctx->stat.uncorrectable_errors++;
913                 spin_unlock(&sctx->stat_lock);
914                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
915                 goto out;
916         }
917
918         /* Setup the context, map the logical blocks and alloc the sectors */
919         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
920         if (ret) {
921                 spin_lock(&sctx->stat_lock);
922                 sctx->stat.read_errors++;
923                 sctx->stat.uncorrectable_errors++;
924                 spin_unlock(&sctx->stat_lock);
925                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
926                 goto out;
927         }
928         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
929         sblock_bad = sblocks_for_recheck + failed_mirror_index;
930
931         /* build and submit the bios for the failed mirror, check checksums */
932         scrub_recheck_block(fs_info, sblock_bad, 1);
933
934         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
935             sblock_bad->no_io_error_seen) {
936                 /*
937                  * The error disappeared after reading sector by sector, or
938                  * the area was part of a huge bio and other parts of the
939                  * bio caused I/O errors, or the block layer merged several
940                  * read requests into one and the error is caused by a
941                  * different bio (usually one of the two latter cases is
942                  * the cause)
943                  */
944                 spin_lock(&sctx->stat_lock);
945                 sctx->stat.unverified_errors++;
946                 sblock_to_check->data_corrected = 1;
947                 spin_unlock(&sctx->stat_lock);
948
949                 if (sctx->is_dev_replace)
950                         scrub_write_block_to_dev_replace(sblock_bad);
951                 goto out;
952         }
953
954         if (!sblock_bad->no_io_error_seen) {
955                 spin_lock(&sctx->stat_lock);
956                 sctx->stat.read_errors++;
957                 spin_unlock(&sctx->stat_lock);
958                 if (__ratelimit(&rs))
959                         scrub_print_warning("i/o error", sblock_to_check);
960                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
961         } else if (sblock_bad->checksum_error) {
962                 spin_lock(&sctx->stat_lock);
963                 sctx->stat.csum_errors++;
964                 spin_unlock(&sctx->stat_lock);
965                 if (__ratelimit(&rs))
966                         scrub_print_warning("checksum error", sblock_to_check);
967                 btrfs_dev_stat_inc_and_print(dev,
968                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
969         } else if (sblock_bad->header_error) {
970                 spin_lock(&sctx->stat_lock);
971                 sctx->stat.verify_errors++;
972                 spin_unlock(&sctx->stat_lock);
973                 if (__ratelimit(&rs))
974                         scrub_print_warning("checksum/header error",
975                                             sblock_to_check);
976                 if (sblock_bad->generation_error)
977                         btrfs_dev_stat_inc_and_print(dev,
978                                 BTRFS_DEV_STAT_GENERATION_ERRS);
979                 else
980                         btrfs_dev_stat_inc_and_print(dev,
981                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
982         }
983
984         if (sctx->readonly) {
985                 ASSERT(!sctx->is_dev_replace);
986                 goto out;
987         }
988
989         /*
990          * now build and submit the bios for the other mirrors, check
991          * checksums.
992          * First try to pick the mirror which is completely without I/O
993          * errors and also does not have a checksum error.
994          * If one is found, and if a checksum is present, the full block
995          * that is known to contain an error is rewritten. Afterwards
996          * the block is known to be corrected.
997          * If a mirror is found which is completely correct, and no
998          * checksum is present, only those sectors are rewritten that had
999          * an I/O error in the block to be repaired, since it cannot be
1000          * determined, which copy of the other sectors is better (and it
1001          * could happen otherwise that a correct sector would be
1002          * overwritten by a bad one).
1003          */
1004         for (mirror_index = 0; ;mirror_index++) {
1005                 struct scrub_block *sblock_other;
1006
1007                 if (mirror_index == failed_mirror_index)
1008                         continue;
1009
1010                 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1011                 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1013                                 break;
1014                         if (!sblocks_for_recheck[mirror_index].sector_count)
1015                                 break;
1016
1017                         sblock_other = sblocks_for_recheck + mirror_index;
1018                 } else {
1019                         struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020                         int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1021
1022                         if (mirror_index >= max_allowed)
1023                                 break;
1024                         if (!sblocks_for_recheck[1].sector_count)
1025                                 break;
1026
1027                         ASSERT(failed_mirror_index == 0);
1028                         sblock_other = sblocks_for_recheck + 1;
1029                         sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1030                 }
1031
1032                 /* build and submit the bios, check checksums */
1033                 scrub_recheck_block(fs_info, sblock_other, 0);
1034
1035                 if (!sblock_other->header_error &&
1036                     !sblock_other->checksum_error &&
1037                     sblock_other->no_io_error_seen) {
1038                         if (sctx->is_dev_replace) {
1039                                 scrub_write_block_to_dev_replace(sblock_other);
1040                                 goto corrected_error;
1041                         } else {
1042                                 ret = scrub_repair_block_from_good_copy(
1043                                                 sblock_bad, sblock_other);
1044                                 if (!ret)
1045                                         goto corrected_error;
1046                         }
1047                 }
1048         }
1049
1050         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051                 goto did_not_correct_error;
1052
1053         /*
1054          * In case of I/O errors in the area that is supposed to be
1055          * repaired, continue by picking good copies of those sectors.
1056          * Select the good sectors from mirrors to rewrite bad sectors from
1057          * the area to fix. Afterwards verify the checksum of the block
1058          * that is supposed to be repaired. This verification step is
1059          * only done for the purpose of statistics counting and for the
1060          * final scrub report on whether errors remain.
1061          * A perfect algorithm could make use of the checksum and try
1062          * all possible combinations of sectors from the different mirrors
1063          * until the checksum verification succeeds. For example, when
1064          * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1065          * of mirror #2 is readable but the final checksum test fails,
1066          * then the 2nd sector of mirror #3 could be tried, to see whether
1067          * the final checksum now succeeds. But this would be a rare
1068          * exception and is therefore not implemented. At least it is
1069          * avoided that the good copy is overwritten.
1070          * A more useful improvement would be to pick the sectors
1071          * without I/O error based on sector sizes (512 bytes on legacy
1072          * disks) instead of on sectorsize. Then maybe 512 byte of one
1073          * mirror could be repaired by taking 512 byte of a different
1074          * mirror, even if other 512 byte sectors in the same sectorsize
1075          * area are unreadable.
1076          */
1077         success = 1;
1078         for (sector_num = 0; sector_num < sblock_bad->sector_count;
1079              sector_num++) {
1080                 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081                 struct scrub_block *sblock_other = NULL;
1082
1083                 /* Skip no-io-error sectors in scrub */
1084                 if (!sector_bad->io_error && !sctx->is_dev_replace)
1085                         continue;
1086
1087                 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1088                         /*
1089                          * In case of dev replace, if the raid56 rebuild process
1090                          * did not produce correct data, then copy the content
1091                          * of sblock_bad to make sure the target device is identical
1092                          * to the source device, instead of writing garbage data from
1093                          * the sblock_for_recheck array to the target device.
1094                          */
1095                         sblock_other = NULL;
1096                 } else if (sector_bad->io_error) {
1097                         /* Try to find no-io-error sector in mirrors */
1098                         for (mirror_index = 0;
1099                              mirror_index < BTRFS_MAX_MIRRORS &&
1100                              sblocks_for_recheck[mirror_index].sector_count > 0;
1101                              mirror_index++) {
1102                                 if (!sblocks_for_recheck[mirror_index].
1103                                     sectors[sector_num]->io_error) {
1104                                         sblock_other = sblocks_for_recheck +
1105                                                        mirror_index;
1106                                         break;
1107                                 }
1108                         }
1109                         if (!sblock_other)
1110                                 success = 0;
1111                 }
1112
1113                 if (sctx->is_dev_replace) {
1114                         /*
1115                          * Did not find a mirror to fetch the sector from.
1116                          * scrub_write_sector_to_dev_replace() handles this
1117                          * case (sector->io_error), by filling the block with
1118                          * zeros before submitting the write request
1119                          */
1120                         if (!sblock_other)
1121                                 sblock_other = sblock_bad;
1122
1123                         if (scrub_write_sector_to_dev_replace(sblock_other,
1124                                                               sector_num) != 0) {
1125                                 atomic64_inc(
1126                                         &fs_info->dev_replace.num_write_errors);
1127                                 success = 0;
1128                         }
1129                 } else if (sblock_other) {
1130                         ret = scrub_repair_sector_from_good_copy(sblock_bad,
1131                                                                  sblock_other,
1132                                                                  sector_num, 0);
1133                         if (0 == ret)
1134                                 sector_bad->io_error = 0;
1135                         else
1136                                 success = 0;
1137                 }
1138         }
1139
1140         if (success && !sctx->is_dev_replace) {
1141                 if (is_metadata || have_csum) {
1142                         /*
1143                          * need to verify the checksum now that all
1144                          * sectors on disk are repaired (the write
1145                          * request for data to be repaired is on its way).
1146                          * Just be lazy and use scrub_recheck_block()
1147                          * which re-reads the data before the checksum
1148                          * is verified, but most likely the data comes out
1149                          * of the page cache.
1150                          */
1151                         scrub_recheck_block(fs_info, sblock_bad, 1);
1152                         if (!sblock_bad->header_error &&
1153                             !sblock_bad->checksum_error &&
1154                             sblock_bad->no_io_error_seen)
1155                                 goto corrected_error;
1156                         else
1157                                 goto did_not_correct_error;
1158                 } else {
1159 corrected_error:
1160                         spin_lock(&sctx->stat_lock);
1161                         sctx->stat.corrected_errors++;
1162                         sblock_to_check->data_corrected = 1;
1163                         spin_unlock(&sctx->stat_lock);
1164                         btrfs_err_rl_in_rcu(fs_info,
1165                                 "fixed up error at logical %llu on dev %s",
1166                                 logical, rcu_str_deref(dev->name));
1167                 }
1168         } else {
1169 did_not_correct_error:
1170                 spin_lock(&sctx->stat_lock);
1171                 sctx->stat.uncorrectable_errors++;
1172                 spin_unlock(&sctx->stat_lock);
1173                 btrfs_err_rl_in_rcu(fs_info,
1174                         "unable to fixup (regular) error at logical %llu on dev %s",
1175                         logical, rcu_str_deref(dev->name));
1176         }
1177
1178 out:
1179         if (sblocks_for_recheck) {
1180                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1181                      mirror_index++) {
1182                         struct scrub_block *sblock = sblocks_for_recheck +
1183                                                      mirror_index;
1184                         struct scrub_recover *recover;
1185                         int i;
1186
1187                         for (i = 0; i < sblock->sector_count; i++) {
1188                                 sblock->sectors[i]->sblock = NULL;
1189                                 recover = sblock->sectors[i]->recover;
1190                                 if (recover) {
1191                                         scrub_put_recover(fs_info, recover);
1192                                         sblock->sectors[i]->recover = NULL;
1193                                 }
1194                                 scrub_sector_put(sblock->sectors[i]);
1195                         }
1196                 }
1197                 kfree(sblocks_for_recheck);
1198         }
1199
1200         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201         memalloc_nofs_restore(nofs_flag);
1202         if (ret < 0)
1203                 return ret;
1204         return 0;
1205 }
1206
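/*
 * Number of ways a block can be fetched: for RAID5 the direct read plus one
 * reconstruction from parity, for RAID6 two reconstructions (using P or Q),
 * and for the remaining profiles one copy per stripe of the I/O context.
 */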
1207 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1208 {
1209         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1210                 return 2;
1211         else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1212                 return 3;
1213         else
1214                 return (int)bioc->num_stripes;
1215 }
1216
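/*
 * Map a logical address and mirror number to the stripe that holds the data
 * and the offset inside it.  For RAID5/6 the raid_map is scanned for the
 * stripe covering @logical, skipping the P and Q stripes which never map a
 * logical address directly; for the other profiles the mirror number selects
 * the stripe and the offset is 0.
 */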
1217 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1218                                                  u64 *raid_map,
1219                                                  int nstripes, int mirror,
1220                                                  int *stripe_index,
1221                                                  u64 *stripe_offset)
1222 {
1223         int i;
1224
1225         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1226                 /* RAID5/6 */
1227                 for (i = 0; i < nstripes; i++) {
1228                         if (raid_map[i] == RAID6_Q_STRIPE ||
1229                             raid_map[i] == RAID5_P_STRIPE)
1230                                 continue;
1231
1232                         if (logical >= raid_map[i] &&
1233                             logical < raid_map[i] + BTRFS_STRIPE_LEN)
1234                                 break;
1235                 }
1236
1237                 *stripe_index = i;
1238                 *stripe_offset = logical - raid_map[i];
1239         } else {
1240                 /* The other RAID type */
1241                 *stripe_index = mirror;
1242                 *stripe_offset = 0;
1243         }
1244 }
1245
1246 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1247                                      struct scrub_block *sblocks_for_recheck)
1248 {
1249         struct scrub_ctx *sctx = original_sblock->sctx;
1250         struct btrfs_fs_info *fs_info = sctx->fs_info;
1251         u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1252         u64 logical = original_sblock->sectors[0]->logical;
1253         u64 generation = original_sblock->sectors[0]->generation;
1254         u64 flags = original_sblock->sectors[0]->flags;
1255         u64 have_csum = original_sblock->sectors[0]->have_csum;
1256         struct scrub_recover *recover;
1257         struct btrfs_io_context *bioc;
1258         u64 sublen;
1259         u64 mapped_length;
1260         u64 stripe_offset;
1261         int stripe_index;
1262         int sector_index = 0;
1263         int mirror_index;
1264         int nmirrors;
1265         int ret;
1266
1267         /*
1268          * Note: the two members refs and outstanding_sectors are not used (and
1269          * not set) in the blocks that are used for the recheck procedure.
1270          */
1271
1272         while (length > 0) {
1273                 sublen = min_t(u64, length, fs_info->sectorsize);
1274                 mapped_length = sublen;
1275                 bioc = NULL;
1276
1277                 /*
1278                  * With a length of sectorsize, each returned stripe represents
1279                  * one mirror
1280                  */
1281                 btrfs_bio_counter_inc_blocked(fs_info);
1282                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1283                                        logical, &mapped_length, &bioc);
1284                 if (ret || !bioc || mapped_length < sublen) {
1285                         btrfs_put_bioc(bioc);
1286                         btrfs_bio_counter_dec(fs_info);
1287                         return -EIO;
1288                 }
1289
1290                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1291                 if (!recover) {
1292                         btrfs_put_bioc(bioc);
1293                         btrfs_bio_counter_dec(fs_info);
1294                         return -ENOMEM;
1295                 }
1296
1297                 refcount_set(&recover->refs, 1);
1298                 recover->bioc = bioc;
1299                 recover->map_length = mapped_length;
1300
1301                 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1302
1303                 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1304
1305                 for (mirror_index = 0; mirror_index < nmirrors;
1306                      mirror_index++) {
1307                         struct scrub_block *sblock;
1308                         struct scrub_sector *sector;
1309
1310                         sblock = sblocks_for_recheck + mirror_index;
1311                         sblock->sctx = sctx;
1312
1313                         sector = kzalloc(sizeof(*sector), GFP_NOFS);
1314                         if (!sector) {
1315 leave_nomem:
1316                                 spin_lock(&sctx->stat_lock);
1317                                 sctx->stat.malloc_errors++;
1318                                 spin_unlock(&sctx->stat_lock);
1319                                 scrub_put_recover(fs_info, recover);
1320                                 return -ENOMEM;
1321                         }
1322                         scrub_sector_get(sector);
1323                         sblock->sectors[sector_index] = sector;
1324                         sector->sblock = sblock;
1325                         sector->flags = flags;
1326                         sector->generation = generation;
1327                         sector->logical = logical;
1328                         sector->have_csum = have_csum;
1329                         if (have_csum)
1330                                 memcpy(sector->csum,
1331                                        original_sblock->sectors[0]->csum,
1332                                        sctx->fs_info->csum_size);
1333
1334                         scrub_stripe_index_and_offset(logical,
1335                                                       bioc->map_type,
1336                                                       bioc->raid_map,
1337                                                       bioc->num_stripes -
1338                                                       bioc->num_tgtdevs,
1339                                                       mirror_index,
1340                                                       &stripe_index,
1341                                                       &stripe_offset);
1342                         sector->physical = bioc->stripes[stripe_index].physical +
1343                                          stripe_offset;
1344                         sector->dev = bioc->stripes[stripe_index].dev;
1345
1346                         BUG_ON(sector_index >= original_sblock->sector_count);
1347                         sector->physical_for_dev_replace =
1348                                 original_sblock->sectors[sector_index]->
1349                                 physical_for_dev_replace;
1350                         /* For missing devices, dev->bdev is NULL */
1351                         sector->mirror_num = mirror_index + 1;
1352                         sblock->sector_count++;
1353                         sector->page = alloc_page(GFP_NOFS);
1354                         if (!sector->page)
1355                                 goto leave_nomem;
1356
1357                         scrub_get_recover(recover);
1358                         sector->recover = recover;
1359                 }
1360                 scrub_put_recover(fs_info, recover);
1361                 length -= sublen;
1362                 logical += sublen;
1363                 sector_index++;
1364         }
1365
1366         return 0;
1367 }
1368
1369 static void scrub_bio_wait_endio(struct bio *bio)
1370 {
1371         complete(bio->bi_private);
1372 }
1373
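/*
 * Submit @bio through the RAID56 recovery path to rebuild the data at
 * @sector's logical address and wait for the rebuild to finish.
 *
 * Return 0 on success or a negative errno on failure.
 */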
1374 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1375                                         struct bio *bio,
1376                                         struct scrub_sector *sector)
1377 {
1378         DECLARE_COMPLETION_ONSTACK(done);
1379
1380         bio->bi_iter.bi_sector = sector->logical >> 9;
1381         bio->bi_private = &done;
1382         bio->bi_end_io = scrub_bio_wait_endio;
1383         raid56_parity_recover(bio, sector->recover->bioc,
1384                               sector->sblock->sectors[0]->mirror_num, false);
1385
1386         wait_for_completion_io(&done);
1387         return blk_status_to_errno(bio->bi_status);
1388 }
1389
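/*
 * Reread a block that lives on a RAID56 stripe by going through the parity
 * recovery path and verify its checksum afterwards.  On failure (or if the
 * device is missing) all sectors of the block are marked with an I/O error.
 */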
1390 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1391                                           struct scrub_block *sblock)
1392 {
1393         struct scrub_sector *first_sector = sblock->sectors[0];
1394         struct bio *bio;
1395         int i;
1396
1397         /* All sectors in sblock belong to the same stripe on the same device. */
1398         ASSERT(first_sector->dev);
1399         if (!first_sector->dev->bdev)
1400                 goto out;
1401
1402         bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1403
1404         for (i = 0; i < sblock->sector_count; i++) {
1405                 struct scrub_sector *sector = sblock->sectors[i];
1406
1407                 WARN_ON(!sector->page);
1408                 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1409         }
1410
1411         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1412                 bio_put(bio);
1413                 goto out;
1414         }
1415
1416         bio_put(bio);
1417
1418         scrub_recheck_block_checksum(sblock);
1419
1420         return;
1421 out:
1422         for (i = 0; i < sblock->sector_count; i++)
1423                 sblock->sectors[i]->io_error = 1;
1424
1425         sblock->no_io_error_seen = 0;
1426 }
1427
1428 /*
1429  * This function will check the on disk data for checksum errors, header errors
1430  * and read I/O errors. If any I/O error happens, the exact sectors that
1431  * failed are marked as bad. The goal is to enable scrub to take the sectors
1432  * that are not errored from all the mirrors so that the sectors that are
1433  * errored in the just handled mirror can be repaired.
1434  */
1435 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1436                                 struct scrub_block *sblock,
1437                                 int retry_failed_mirror)
1438 {
1439         int i;
1440
1441         sblock->no_io_error_seen = 1;
1442
1443         /* Shortcut for RAID56 */
1444         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1445                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1446
1447         for (i = 0; i < sblock->sector_count; i++) {
1448                 struct scrub_sector *sector = sblock->sectors[i];
1449                 struct bio bio;
1450                 struct bio_vec bvec;
1451
1452                 if (sector->dev->bdev == NULL) {
1453                         sector->io_error = 1;
1454                         sblock->no_io_error_seen = 0;
1455                         continue;
1456                 }
1457
1458                 WARN_ON(!sector->page);
1459                 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1460                 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1461                 bio.bi_iter.bi_sector = sector->physical >> 9;
1462
1463                 btrfsic_check_bio(&bio);
1464                 if (submit_bio_wait(&bio)) {
1465                         sector->io_error = 1;
1466                         sblock->no_io_error_seen = 0;
1467                 }
1468
1469                 bio_uninit(&bio);
1470         }
1471
1472         if (sblock->no_io_error_seen)
1473                 scrub_recheck_block_checksum(sblock);
1474 }
1475
1476 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1477 {
1478         struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1479         int ret;
1480
1481         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1482         return !ret;
1483 }
1484
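/*
 * Recompute the checksum of a block that was just reread: clear the error
 * flags and verify the sectors either as data or as a tree block, depending
 * on the extent flags of the first sector.
 */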
1485 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1486 {
1487         sblock->header_error = 0;
1488         sblock->checksum_error = 0;
1489         sblock->generation_error = 0;
1490
1491         if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1492                 scrub_checksum_data(sblock);
1493         else
1494                 scrub_checksum_tree_block(sblock);
1495 }
1496
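/*
 * Overwrite every sector of the bad mirror with the corresponding sector of
 * the good mirror.  Returns the status of the last sector that failed to be
 * repaired, or 0 if all sectors were written back successfully.
 */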
1497 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1498                                              struct scrub_block *sblock_good)
1499 {
1500         int i;
1501         int ret = 0;
1502
1503         for (i = 0; i < sblock_bad->sector_count; i++) {
1504                 int ret_sub;
1505
1506                 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1507                                                              sblock_good, i, 1);
1508                 if (ret_sub)
1509                         ret = ret_sub;
1510         }
1511
1512         return ret;
1513 }
1514
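/*
 * Write one sector of the good mirror over the corresponding sector of the
 * bad mirror.  The write is only issued if the bad block or sector actually
 * has an error, unless @force_write is set.
 */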
1515 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1516                                               struct scrub_block *sblock_good,
1517                                               int sector_num, int force_write)
1518 {
1519         struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1520         struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1521         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1522         const u32 sectorsize = fs_info->sectorsize;
1523
1524         BUG_ON(sector_bad->page == NULL);
1525         BUG_ON(sector_good->page == NULL);
1526         if (force_write || sblock_bad->header_error ||
1527             sblock_bad->checksum_error || sector_bad->io_error) {
1528                 struct bio bio;
1529                 struct bio_vec bvec;
1530                 int ret;
1531
1532                 if (!sector_bad->dev->bdev) {
1533                         btrfs_warn_rl(fs_info,
1534                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1535                         return -EIO;
1536                 }
1537
1538                 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1539                 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1540                 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1541
1542                 btrfsic_check_bio(&bio);
1543                 ret = submit_bio_wait(&bio);
1544                 bio_uninit(&bio);
1545
1546                 if (ret) {
1547                         btrfs_dev_stat_inc_and_print(sector_bad->dev,
1548                                 BTRFS_DEV_STAT_WRITE_ERRS);
1549                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1550                         return -EIO;
1551                 }
1552         }
1553
1554         return 0;
1555 }
1556
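/*
 * Queue all sectors of @sblock for writing to the dev-replace target device.
 * Write errors are only accounted for, they do not abort the operation.
 */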
1557 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1558 {
1559         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1560         int i;
1561
1562         /*
1563          * This block is used for checking the parity on the source device,
1564          * so the data needn't be written into the destination device.
1565          */
1566         if (sblock->sparity)
1567                 return;
1568
1569         for (i = 0; i < sblock->sector_count; i++) {
1570                 int ret;
1571
1572                 ret = scrub_write_sector_to_dev_replace(sblock, i);
1573                 if (ret)
1574                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1575         }
1576 }
1577
1578 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1579 {
1580         struct scrub_sector *sector = sblock->sectors[sector_num];
1581
1582         BUG_ON(sector->page == NULL);
1583         if (sector->io_error)
1584                 clear_page(page_address(sector->page));
1585
1586         return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1587 }
1588
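/*
 * On zoned filesystems the dev-replace target must be written sequentially.
 * If the current write pointer is behind @physical, zero out the gap so that
 * the next write starts exactly at @physical.
 */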
1589 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1590 {
1591         int ret = 0;
1592         u64 length;
1593
1594         if (!btrfs_is_zoned(sctx->fs_info))
1595                 return 0;
1596
1597         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1598                 return 0;
1599
1600         if (sctx->write_pointer < physical) {
1601                 length = physical - sctx->write_pointer;
1602
1603                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1604                                                 sctx->write_pointer, length);
1605                 if (!ret)
1606                         sctx->write_pointer = physical;
1607         }
1608         return ret;
1609 }
1610
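/*
 * Add one sector to the write bio that is currently being built for the
 * dev-replace target.  The bio is submitted once it is full or when the next
 * sector is not physically/logically contiguous with it.
 */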
1611 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1612                                       struct scrub_sector *sector)
1613 {
1614         struct scrub_bio *sbio;
1615         int ret;
1616         const u32 sectorsize = sctx->fs_info->sectorsize;
1617
1618         mutex_lock(&sctx->wr_lock);
1619 again:
1620         if (!sctx->wr_curr_bio) {
1621                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1622                                               GFP_KERNEL);
1623                 if (!sctx->wr_curr_bio) {
1624                         mutex_unlock(&sctx->wr_lock);
1625                         return -ENOMEM;
1626                 }
1627                 sctx->wr_curr_bio->sctx = sctx;
1628                 sctx->wr_curr_bio->sector_count = 0;
1629         }
1630         sbio = sctx->wr_curr_bio;
1631         if (sbio->sector_count == 0) {
1632                 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1633                 if (ret) {
1634                         mutex_unlock(&sctx->wr_lock);
1635                         return ret;
1636                 }
1637
1638                 sbio->physical = sector->physical_for_dev_replace;
1639                 sbio->logical = sector->logical;
1640                 sbio->dev = sctx->wr_tgtdev;
1641                 if (!sbio->bio) {
1642                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1643                                               REQ_OP_WRITE, GFP_NOFS);
1644                 }
1645                 sbio->bio->bi_private = sbio;
1646                 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1647                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1648                 sbio->status = 0;
1649         } else if (sbio->physical + sbio->sector_count * sectorsize !=
1650                    sector->physical_for_dev_replace ||
1651                    sbio->logical + sbio->sector_count * sectorsize !=
1652                    sector->logical) {
1653                 scrub_wr_submit(sctx);
1654                 goto again;
1655         }
1656
1657         ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1658         if (ret != sectorsize) {
1659                 if (sbio->sector_count < 1) {
1660                         bio_put(sbio->bio);
1661                         sbio->bio = NULL;
1662                         mutex_unlock(&sctx->wr_lock);
1663                         return -EIO;
1664                 }
1665                 scrub_wr_submit(sctx);
1666                 goto again;
1667         }
1668
1669         sbio->sectors[sbio->sector_count] = sector;
1670         scrub_sector_get(sector);
1671         sbio->sector_count++;
1672         if (sbio->sector_count == sctx->sectors_per_bio)
1673                 scrub_wr_submit(sctx);
1674         mutex_unlock(&sctx->wr_lock);
1675
1676         return 0;
1677 }
1678
1679 static void scrub_wr_submit(struct scrub_ctx *sctx)
1680 {
1681         struct scrub_bio *sbio;
1682
1683         if (!sctx->wr_curr_bio)
1684                 return;
1685
1686         sbio = sctx->wr_curr_bio;
1687         sctx->wr_curr_bio = NULL;
1688         scrub_pending_bio_inc(sctx);
1689         /* Process all writes in a single worker thread. Then the block layer
1690          * orders the requests before sending them to the driver, which
1691          * doubled the write performance on spinning disks when measured
1692          * with Linux 3.5. */
1693         btrfsic_check_bio(sbio->bio);
1694         submit_bio(sbio->bio);
1695
1696         if (btrfs_is_zoned(sctx->fs_info))
1697                 sctx->write_pointer = sbio->physical + sbio->sector_count *
1698                         sctx->fs_info->sectorsize;
1699 }
1700
1701 static void scrub_wr_bio_end_io(struct bio *bio)
1702 {
1703         struct scrub_bio *sbio = bio->bi_private;
1704         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1705
1706         sbio->status = bio->bi_status;
1707         sbio->bio = bio;
1708
1709         INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1710         queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1711 }
1712
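/*
 * Worker for completed dev-replace write bios: account any write errors, drop
 * the references on the sectors attached to the bio and free the scrub_bio.
 */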
1713 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1714 {
1715         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1716         struct scrub_ctx *sctx = sbio->sctx;
1717         int i;
1718
1719         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1720         if (sbio->status) {
1721                 struct btrfs_dev_replace *dev_replace =
1722                         &sbio->sctx->fs_info->dev_replace;
1723
1724                 for (i = 0; i < sbio->sector_count; i++) {
1725                         struct scrub_sector *sector = sbio->sectors[i];
1726
1727                         sector->io_error = 1;
1728                         atomic64_inc(&dev_replace->num_write_errors);
1729                 }
1730         }
1731
1732         for (i = 0; i < sbio->sector_count; i++)
1733                 scrub_sector_put(sbio->sectors[i]);
1734
1735         bio_put(sbio->bio);
1736         kfree(sbio);
1737         scrub_pending_bio_dec(sctx);
1738 }
1739
1740 static int scrub_checksum(struct scrub_block *sblock)
1741 {
1742         u64 flags;
1743         int ret;
1744
1745         /*
1746          * No need to initialize these stats currently,
1747          * because this function only uses the return value
1748          * instead of these stats values.
1749          *
1750          * Todo:
1751          * always use stats
1752          */
1753         sblock->header_error = 0;
1754         sblock->generation_error = 0;
1755         sblock->checksum_error = 0;
1756
1757         WARN_ON(sblock->sector_count < 1);
1758         flags = sblock->sectors[0]->flags;
1759         ret = 0;
1760         if (flags & BTRFS_EXTENT_FLAG_DATA)
1761                 ret = scrub_checksum_data(sblock);
1762         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1763                 ret = scrub_checksum_tree_block(sblock);
1764         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1765                 (void)scrub_checksum_super(sblock);
1766         else
1767                 WARN_ON(1);
1768         if (ret)
1769                 scrub_handle_errored_block(sblock);
1770
1771         return ret;
1772 }
1773
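/*
 * Verify the checksum of a data sector against the csum from the checksum
 * tree.  Returns 1 (and sets sblock->checksum_error) on a mismatch, 0 if the
 * checksum matches or no checksum is available.
 */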
1774 static int scrub_checksum_data(struct scrub_block *sblock)
1775 {
1776         struct scrub_ctx *sctx = sblock->sctx;
1777         struct btrfs_fs_info *fs_info = sctx->fs_info;
1778         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1779         u8 csum[BTRFS_CSUM_SIZE];
1780         struct scrub_sector *sector;
1781         char *kaddr;
1782
1783         BUG_ON(sblock->sector_count < 1);
1784         sector = sblock->sectors[0];
1785         if (!sector->have_csum)
1786                 return 0;
1787
1788         kaddr = page_address(sector->page);
1789
1790         shash->tfm = fs_info->csum_shash;
1791         crypto_shash_init(shash);
1792
1793         /*
1794          * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
1795          * only contains one sector of data.
1796          */
1797         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1798
1799         if (memcmp(csum, sector->csum, fs_info->csum_size))
1800                 sblock->checksum_error = 1;
1801         return sblock->checksum_error;
1802 }
1803
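/*
 * Verify a metadata block: check the header fields (bytenr, generation, fsid
 * and chunk tree uuid) and the checksum computed over the whole node.
 * Returns non-zero if a header or checksum error was found.
 */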
1804 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1805 {
1806         struct scrub_ctx *sctx = sblock->sctx;
1807         struct btrfs_header *h;
1808         struct btrfs_fs_info *fs_info = sctx->fs_info;
1809         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1810         u8 calculated_csum[BTRFS_CSUM_SIZE];
1811         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1812         /*
1813          * This is done in sectorsize steps even for metadata as there's a
1814          * constraint for nodesize to be aligned to sectorsize. This will need
1815          * to change so we don't misuse data and metadata units like that.
1816          */
1817         const u32 sectorsize = sctx->fs_info->sectorsize;
1818         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1819         int i;
1820         struct scrub_sector *sector;
1821         char *kaddr;
1822
1823         BUG_ON(sblock->sector_count < 1);
1824
1825         /* Each member in sectors is just one sector */
1826         ASSERT(sblock->sector_count == num_sectors);
1827
1828         sector = sblock->sectors[0];
1829         kaddr = page_address(sector->page);
1830         h = (struct btrfs_header *)kaddr;
1831         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1832
1833         /*
1834          * we don't use the getter functions here, as we
1835          * a) don't have an extent buffer and
1836          * b) the page is already kmapped
1837          */
1838         if (sector->logical != btrfs_stack_header_bytenr(h))
1839                 sblock->header_error = 1;
1840
1841         if (sector->generation != btrfs_stack_header_generation(h)) {
1842                 sblock->header_error = 1;
1843                 sblock->generation_error = 1;
1844         }
1845
1846         if (!scrub_check_fsid(h->fsid, sector))
1847                 sblock->header_error = 1;
1848
1849         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1850                    BTRFS_UUID_SIZE))
1851                 sblock->header_error = 1;
1852
1853         shash->tfm = fs_info->csum_shash;
1854         crypto_shash_init(shash);
1855         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1856                             sectorsize - BTRFS_CSUM_SIZE);
1857
1858         for (i = 1; i < num_sectors; i++) {
1859                 kaddr = page_address(sblock->sectors[i]->page);
1860                 crypto_shash_update(shash, kaddr, sectorsize);
1861         }
1862
1863         crypto_shash_final(shash, calculated_csum);
1864         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1865                 sblock->checksum_error = 1;
1866
1867         return sblock->header_error || sblock->checksum_error;
1868 }
1869
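/*
 * Verify a super block copy: bytenr, generation, fsid and checksum.  Errors
 * are only reported via the statistics and the device error counters, the
 * super block itself is not repaired here.
 */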
1870 static int scrub_checksum_super(struct scrub_block *sblock)
1871 {
1872         struct btrfs_super_block *s;
1873         struct scrub_ctx *sctx = sblock->sctx;
1874         struct btrfs_fs_info *fs_info = sctx->fs_info;
1875         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1876         u8 calculated_csum[BTRFS_CSUM_SIZE];
1877         struct scrub_sector *sector;
1878         char *kaddr;
1879         int fail_gen = 0;
1880         int fail_cor = 0;
1881
1882         BUG_ON(sblock->sector_count < 1);
1883         sector = sblock->sectors[0];
1884         kaddr = page_address(sector->page);
1885         s = (struct btrfs_super_block *)kaddr;
1886
1887         if (sector->logical != btrfs_super_bytenr(s))
1888                 ++fail_cor;
1889
1890         if (sector->generation != btrfs_super_generation(s))
1891                 ++fail_gen;
1892
1893         if (!scrub_check_fsid(s->fsid, sector))
1894                 ++fail_cor;
1895
1896         shash->tfm = fs_info->csum_shash;
1897         crypto_shash_init(shash);
1898         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1899                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1900
1901         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1902                 ++fail_cor;
1903
1904         if (fail_cor + fail_gen) {
1905                 /*
1906                  * If we find an error in a super block, we just report it.
1907                  * Super blocks get rewritten with the next transaction commit
1908                  * anyway.
1909                  */
1910                 spin_lock(&sctx->stat_lock);
1911                 ++sctx->stat.super_errors;
1912                 spin_unlock(&sctx->stat_lock);
1913                 if (fail_cor)
1914                         btrfs_dev_stat_inc_and_print(sector->dev,
1915                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1916                 else
1917                         btrfs_dev_stat_inc_and_print(sector->dev,
1918                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1919         }
1920
1921         return fail_cor + fail_gen;
1922 }
1923
1924 static void scrub_block_get(struct scrub_block *sblock)
1925 {
1926         refcount_inc(&sblock->refs);
1927 }
1928
1929 static void scrub_block_put(struct scrub_block *sblock)
1930 {
1931         if (refcount_dec_and_test(&sblock->refs)) {
1932                 int i;
1933
1934                 if (sblock->sparity)
1935                         scrub_parity_put(sblock->sparity);
1936
1937                 for (i = 0; i < sblock->sector_count; i++)
1938                         scrub_sector_put(sblock->sectors[i]);
1939                 kfree(sblock);
1940         }
1941 }
1942
1943 static void scrub_sector_get(struct scrub_sector *sector)
1944 {
1945         atomic_inc(&sector->refs);
1946 }
1947
1948 static void scrub_sector_put(struct scrub_sector *sector)
1949 {
1950         if (atomic_dec_and_test(&sector->refs)) {
1951                 if (sector->page)
1952                         __free_page(sector->page);
1953                 kfree(sector);
1954         }
1955 }
1956
1957 /*
1958  * Throttle the IO submission based on a bandwidth limit; the timeslice is 1
1959  * second.  The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1960  */
1961 static void scrub_throttle(struct scrub_ctx *sctx)
1962 {
1963         const int time_slice = 1000;
1964         struct scrub_bio *sbio;
1965         struct btrfs_device *device;
1966         s64 delta;
1967         ktime_t now;
1968         u32 div;
1969         u64 bwlimit;
1970
1971         sbio = sctx->bios[sctx->curr];
1972         device = sbio->dev;
1973         bwlimit = READ_ONCE(device->scrub_speed_max);
1974         if (bwlimit == 0)
1975                 return;
1976
1977         /*
1978          * The slice is divided into intervals in which the IO is submitted; the
1979          * interval count is derived from bwlimit and capped at 64.
1980          */
1981         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1982         div = min_t(u32, 64, div);
1983
1984         /* Start new epoch, set deadline */
1985         now = ktime_get();
1986         if (sctx->throttle_deadline == 0) {
1987                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1988                 sctx->throttle_sent = 0;
1989         }
1990
1991         /* Still within the time slice? */
1992         if (ktime_before(now, sctx->throttle_deadline)) {
1993                 /* If current bio is within the limit, send it */
1994                 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
1995                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
1996                         return;
1997
1998                 /* We're over the limit, sleep for the rest of the time slice */
1999                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2000         } else {
2001                 /* New request after deadline, start new epoch */
2002                 delta = 0;
2003         }
2004
2005         if (delta) {
2006                 long timeout;
2007
2008                 timeout = div_u64(delta * HZ, 1000);
2009                 schedule_timeout_interruptible(timeout);
2010         }
2011
2012         /* Next call will start the deadline period */
2013         sctx->throttle_deadline = 0;
2014 }
2015
2016 static void scrub_submit(struct scrub_ctx *sctx)
2017 {
2018         struct scrub_bio *sbio;
2019
2020         if (sctx->curr == -1)
2021                 return;
2022
2023         scrub_throttle(sctx);
2024
2025         sbio = sctx->bios[sctx->curr];
2026         sctx->curr = -1;
2027         scrub_pending_bio_inc(sctx);
2028         btrfsic_check_bio(sbio->bio);
2029         submit_bio(sbio->bio);
2030 }
2031
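/*
 * Add one sector to the read bio that is currently being built.  Waits for a
 * free scrub_bio if none is available and submits the current bio when it is
 * full, when the sector is not contiguous with it, or when the sector targets
 * a different device.
 */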
2032 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2033                                       struct scrub_sector *sector)
2034 {
2035         struct scrub_block *sblock = sector->sblock;
2036         struct scrub_bio *sbio;
2037         const u32 sectorsize = sctx->fs_info->sectorsize;
2038         int ret;
2039
2040 again:
2041         /*
2042          * Grab a fresh bio or wait for one to become available.
2043          */
2044         while (sctx->curr == -1) {
2045                 spin_lock(&sctx->list_lock);
2046                 sctx->curr = sctx->first_free;
2047                 if (sctx->curr != -1) {
2048                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2049                         sctx->bios[sctx->curr]->next_free = -1;
2050                         sctx->bios[sctx->curr]->sector_count = 0;
2051                         spin_unlock(&sctx->list_lock);
2052                 } else {
2053                         spin_unlock(&sctx->list_lock);
2054                         wait_event(sctx->list_wait, sctx->first_free != -1);
2055                 }
2056         }
2057         sbio = sctx->bios[sctx->curr];
2058         if (sbio->sector_count == 0) {
2059                 sbio->physical = sector->physical;
2060                 sbio->logical = sector->logical;
2061                 sbio->dev = sector->dev;
2062                 if (!sbio->bio) {
2063                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2064                                               REQ_OP_READ, GFP_NOFS);
2065                 }
2066                 sbio->bio->bi_private = sbio;
2067                 sbio->bio->bi_end_io = scrub_bio_end_io;
2068                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2069                 sbio->status = 0;
2070         } else if (sbio->physical + sbio->sector_count * sectorsize !=
2071                    sector->physical ||
2072                    sbio->logical + sbio->sector_count * sectorsize !=
2073                    sector->logical ||
2074                    sbio->dev != sector->dev) {
2075                 scrub_submit(sctx);
2076                 goto again;
2077         }
2078
2079         sbio->sectors[sbio->sector_count] = sector;
2080         ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2081         if (ret != sectorsize) {
2082                 if (sbio->sector_count < 1) {
2083                         bio_put(sbio->bio);
2084                         sbio->bio = NULL;
2085                         return -EIO;
2086                 }
2087                 scrub_submit(sctx);
2088                 goto again;
2089         }
2090
2091         scrub_block_get(sblock); /* one for the page added to the bio */
2092         atomic_inc(&sblock->outstanding_sectors);
2093         sbio->sector_count++;
2094         if (sbio->sector_count == sctx->sectors_per_bio)
2095                 scrub_submit(sctx);
2096
2097         return 0;
2098 }
2099
2100 static void scrub_missing_raid56_end_io(struct bio *bio)
2101 {
2102         struct scrub_block *sblock = bio->bi_private;
2103         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2104
2105         if (bio->bi_status)
2106                 sblock->no_io_error_seen = 0;
2107
2108         bio_put(bio);
2109
2110         queue_work(fs_info->scrub_workers, &sblock->work);
2111 }
2112
2113 static void scrub_missing_raid56_worker(struct work_struct *work)
2114 {
2115         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2116         struct scrub_ctx *sctx = sblock->sctx;
2117         struct btrfs_fs_info *fs_info = sctx->fs_info;
2118         u64 logical;
2119         struct btrfs_device *dev;
2120
2121         logical = sblock->sectors[0]->logical;
2122         dev = sblock->sectors[0]->dev;
2123
2124         if (sblock->no_io_error_seen)
2125                 scrub_recheck_block_checksum(sblock);
2126
2127         if (!sblock->no_io_error_seen) {
2128                 spin_lock(&sctx->stat_lock);
2129                 sctx->stat.read_errors++;
2130                 spin_unlock(&sctx->stat_lock);
2131                 btrfs_err_rl_in_rcu(fs_info,
2132                         "IO error rebuilding logical %llu for dev %s",
2133                         logical, rcu_str_deref(dev->name));
2134         } else if (sblock->header_error || sblock->checksum_error) {
2135                 spin_lock(&sctx->stat_lock);
2136                 sctx->stat.uncorrectable_errors++;
2137                 spin_unlock(&sctx->stat_lock);
2138                 btrfs_err_rl_in_rcu(fs_info,
2139                         "failed to rebuild valid logical %llu for dev %s",
2140                         logical, rcu_str_deref(dev->name));
2141         } else {
2142                 scrub_write_block_to_dev_replace(sblock);
2143         }
2144
2145         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2146                 mutex_lock(&sctx->wr_lock);
2147                 scrub_wr_submit(sctx);
2148                 mutex_unlock(&sctx->wr_lock);
2149         }
2150
2151         scrub_block_put(sblock);
2152         scrub_pending_bio_dec(sctx);
2153 }
2154
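/*
 * The device holding this block is missing.  For RAID5/6 dev-replace, rebuild
 * the block content from the remaining stripes via the RAID56 code; the
 * result is verified and written to the target device in the worker.
 */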
2155 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2156 {
2157         struct scrub_ctx *sctx = sblock->sctx;
2158         struct btrfs_fs_info *fs_info = sctx->fs_info;
2159         u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2160         u64 logical = sblock->sectors[0]->logical;
2161         struct btrfs_io_context *bioc = NULL;
2162         struct bio *bio;
2163         struct btrfs_raid_bio *rbio;
2164         int ret;
2165         int i;
2166
2167         btrfs_bio_counter_inc_blocked(fs_info);
2168         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2169                                &length, &bioc);
2170         if (ret || !bioc || !bioc->raid_map)
2171                 goto bioc_out;
2172
2173         if (WARN_ON(!sctx->is_dev_replace ||
2174                     !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2175                 /*
2176                  * We shouldn't be scrubbing a missing device. Even for dev
2177                  * replace, we should only get here for RAID 5/6. We either
2178                  * managed to mount something with no mirrors remaining or
2179                  * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2180                  */
2181                 goto bioc_out;
2182         }
2183
2184         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2185         bio->bi_iter.bi_sector = logical >> 9;
2186         bio->bi_private = sblock;
2187         bio->bi_end_io = scrub_missing_raid56_end_io;
2188
2189         rbio = raid56_alloc_missing_rbio(bio, bioc);
2190         if (!rbio)
2191                 goto rbio_out;
2192
2193         for (i = 0; i < sblock->sector_count; i++) {
2194                 struct scrub_sector *sector = sblock->sectors[i];
2195
2196                 /*
2197                  * For now, our scrub is still one page per sector, so pgoff
2198                  * is always 0.
2199                  */
2200                 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2201         }
2202
2203         INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2204         scrub_block_get(sblock);
2205         scrub_pending_bio_inc(sctx);
2206         raid56_submit_missing_rbio(rbio);
2207         return;
2208
2209 rbio_out:
2210         bio_put(bio);
2211 bioc_out:
2212         btrfs_bio_counter_dec(fs_info);
2213         btrfs_put_bioc(bioc);
2214         spin_lock(&sctx->stat_lock);
2215         sctx->stat.malloc_errors++;
2216         spin_unlock(&sctx->stat_lock);
2217 }
2218
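/*
 * Split the range [logical, logical + len) into sectors, allocate a
 * scrub_block with one page per sector and queue all sectors for reading.
 * Blocks on missing devices are sent through the RAID56 rebuild path instead.
 */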
2219 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2220                        u64 physical, struct btrfs_device *dev, u64 flags,
2221                        u64 gen, int mirror_num, u8 *csum,
2222                        u64 physical_for_dev_replace)
2223 {
2224         struct scrub_block *sblock;
2225         const u32 sectorsize = sctx->fs_info->sectorsize;
2226         int index;
2227
2228         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2229         if (!sblock) {
2230                 spin_lock(&sctx->stat_lock);
2231                 sctx->stat.malloc_errors++;
2232                 spin_unlock(&sctx->stat_lock);
2233                 return -ENOMEM;
2234         }
2235
2236         /* one ref inside this function, plus one for each page added to
2237          * a bio later on */
2238         refcount_set(&sblock->refs, 1);
2239         sblock->sctx = sctx;
2240         sblock->no_io_error_seen = 1;
2241
2242         for (index = 0; len > 0; index++) {
2243                 struct scrub_sector *sector;
2244                 /*
2245                  * Here we will allocate one page for one sector to scrub.
2246                  * This is fine if PAGE_SIZE == sectorsize, but will cost
2247                  * more memory for PAGE_SIZE > sectorsize case.
2248                  */
2249                 u32 l = min(sectorsize, len);
2250
2251                 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2252                 if (!sector) {
2253 leave_nomem:
2254                         spin_lock(&sctx->stat_lock);
2255                         sctx->stat.malloc_errors++;
2256                         spin_unlock(&sctx->stat_lock);
2257                         scrub_block_put(sblock);
2258                         return -ENOMEM;
2259                 }
2260                 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2261                 scrub_sector_get(sector);
2262                 sblock->sectors[index] = sector;
2263                 sector->sblock = sblock;
2264                 sector->dev = dev;
2265                 sector->flags = flags;
2266                 sector->generation = gen;
2267                 sector->logical = logical;
2268                 sector->physical = physical;
2269                 sector->physical_for_dev_replace = physical_for_dev_replace;
2270                 sector->mirror_num = mirror_num;
2271                 if (csum) {
2272                         sector->have_csum = 1;
2273                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2274                 } else {
2275                         sector->have_csum = 0;
2276                 }
2277                 sblock->sector_count++;
2278                 sector->page = alloc_page(GFP_KERNEL);
2279                 if (!sector->page)
2280                         goto leave_nomem;
2281                 len -= l;
2282                 logical += l;
2283                 physical += l;
2284                 physical_for_dev_replace += l;
2285         }
2286
2287         WARN_ON(sblock->sector_count == 0);
2288         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2289                 /*
2290                  * This case should only be hit for RAID 5/6 device replace. See
2291                  * the comment in scrub_missing_raid56_pages() for details.
2292                  */
2293                 scrub_missing_raid56_pages(sblock);
2294         } else {
2295                 for (index = 0; index < sblock->sector_count; index++) {
2296                         struct scrub_sector *sector = sblock->sectors[index];
2297                         int ret;
2298
2299                         ret = scrub_add_sector_to_rd_bio(sctx, sector);
2300                         if (ret) {
2301                                 scrub_block_put(sblock);
2302                                 return ret;
2303                         }
2304                 }
2305
2306                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2307                         scrub_submit(sctx);
2308         }
2309
2310         /* The last one frees, either here or in bio completion for the last sector */
2311         scrub_block_put(sblock);
2312         return 0;
2313 }
2314
2315 static void scrub_bio_end_io(struct bio *bio)
2316 {
2317         struct scrub_bio *sbio = bio->bi_private;
2318         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2319
2320         sbio->status = bio->bi_status;
2321         sbio->bio = bio;
2322
2323         queue_work(fs_info->scrub_workers, &sbio->work);
2324 }
2325
2326 static void scrub_bio_end_io_worker(struct work_struct *work)
2327 {
2328         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2329         struct scrub_ctx *sctx = sbio->sctx;
2330         int i;
2331
2332         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2333         if (sbio->status) {
2334                 for (i = 0; i < sbio->sector_count; i++) {
2335                         struct scrub_sector *sector = sbio->sectors[i];
2336
2337                         sector->io_error = 1;
2338                         sector->sblock->no_io_error_seen = 0;
2339                 }
2340         }
2341
2342         /* Now complete the scrub_block items that have all sectors completed */
2343         for (i = 0; i < sbio->sector_count; i++) {
2344                 struct scrub_sector *sector = sbio->sectors[i];
2345                 struct scrub_block *sblock = sector->sblock;
2346
2347                 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2348                         scrub_block_complete(sblock);
2349                 scrub_block_put(sblock);
2350         }
2351
2352         bio_put(sbio->bio);
2353         sbio->bio = NULL;
2354         spin_lock(&sctx->list_lock);
2355         sbio->next_free = sctx->first_free;
2356         sctx->first_free = sbio->index;
2357         spin_unlock(&sctx->list_lock);
2358
2359         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2360                 mutex_lock(&sctx->wr_lock);
2361                 scrub_wr_submit(sctx);
2362                 mutex_unlock(&sctx->wr_lock);
2363         }
2364
2365         scrub_pending_bio_dec(sctx);
2366 }
2367
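/*
 * Mark the sectors covered by [start, start + len) in the given per-stripe
 * bitmap, wrapping around at the end of the stripe if necessary.
 */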
2368 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2369                                        unsigned long *bitmap,
2370                                        u64 start, u32 len)
2371 {
2372         u64 offset;
2373         u32 nsectors;
2374         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2375
2376         if (len >= sparity->stripe_len) {
2377                 bitmap_set(bitmap, 0, sparity->nsectors);
2378                 return;
2379         }
2380
2381         start -= sparity->logic_start;
2382         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2383         offset = offset >> sectorsize_bits;
2384         nsectors = len >> sectorsize_bits;
2385
2386         if (offset + nsectors <= sparity->nsectors) {
2387                 bitmap_set(bitmap, offset, nsectors);
2388                 return;
2389         }
2390
2391         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2392         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2393 }
2394
2395 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2396                                                    u64 start, u32 len)
2397 {
2398         __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2399 }
2400
2401 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2402                                                   u64 start, u32 len)
2403 {
2404         __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2405 }
2406
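/*
 * Called when all sectors of a block have finished their read I/O: verify the
 * checksums, kick off repair for corrupted blocks and, for parity scrub,
 * record corrupted ranges in the error bitmap.
 */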
2407 static void scrub_block_complete(struct scrub_block *sblock)
2408 {
2409         int corrupted = 0;
2410
2411         if (!sblock->no_io_error_seen) {
2412                 corrupted = 1;
2413                 scrub_handle_errored_block(sblock);
2414         } else {
2415                 /*
2416                  * In the dev replace case: if there is a checksum error, the
2417                  * data is written via the repair mechanism, otherwise it is
2418                  * written to the target device right here.
2419                  */
2420                 corrupted = scrub_checksum(sblock);
2421                 if (!corrupted && sblock->sctx->is_dev_replace)
2422                         scrub_write_block_to_dev_replace(sblock);
2423         }
2424
2425         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2426                 u64 start = sblock->sectors[0]->logical;
2427                 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2428                           sblock->sctx->fs_info->sectorsize;
2429
2430                 ASSERT(end - start <= U32_MAX);
2431                 scrub_parity_mark_sectors_error(sblock->sparity,
2432                                                 start, end - start);
2433         }
2434 }
2435
2436 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2437 {
2438         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2439         list_del(&sum->list);
2440         kfree(sum);
2441 }
2442
2443 /*
2444  * Find the desired csum for range [logical, logical + sectorsize), and store
2445  * the csum into @csum.
2446  *
2447  * The search source is sctx->csum_list, which is a pre-populated list
2448  * storing bytenr ordered csum ranges.  We're responsible for cleaning up any
2449  * range that is before @logical.
2450  *
2451  * Return 0 if there is no csum for the range.
2452  * Return 1 if there is a csum for the range, which has been copied to @csum.
2453  */
2454 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2455 {
2456         bool found = false;
2457
2458         while (!list_empty(&sctx->csum_list)) {
2459                 struct btrfs_ordered_sum *sum = NULL;
2460                 unsigned long index;
2461                 unsigned long num_sectors;
2462
2463                 sum = list_first_entry(&sctx->csum_list,
2464                                        struct btrfs_ordered_sum, list);
2465                 /* The current csum range is beyond our range, no csum found */
2466                 if (sum->bytenr > logical)
2467                         break;
2468
2469                 /*
2470                  * The current sum is before our bytenr. Since scrub is always
2471                  * done in bytenr order, this csum will never be used again;
2472                  * clean it up so that later calls won't bother with the range,
2473                  * and continue searching the next range.
2474                  */
2475                 if (sum->bytenr + sum->len <= logical) {
2476                         drop_csum_range(sctx, sum);
2477                         continue;
2478                 }
2479
2480                 /* Now the csum range covers our bytenr, copy the csum */
2481                 found = true;
2482                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2483                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2484
2485                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2486                        sctx->fs_info->csum_size);
2487
2488                 /* Cleanup the range if we're at the end of the csum range */
2489                 if (index == num_sectors - 1)
2490                         drop_csum_range(sctx, sum);
2491                 break;
2492         }
2493         if (!found)
2494                 return 0;
2495         return 1;
2496 }
2497
2498 /* scrub extent tries to collect up to 64 kB for each bio */
2499 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2500                         u64 logical, u32 len,
2501                         u64 physical, struct btrfs_device *dev, u64 flags,
2502                         u64 gen, int mirror_num)
2503 {
2504         struct btrfs_device *src_dev = dev;
2505         u64 src_physical = physical;
2506         int src_mirror = mirror_num;
2507         int ret;
2508         u8 csum[BTRFS_CSUM_SIZE];
2509         u32 blocksize;
2510
2511         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2512                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2513                         blocksize = map->stripe_len;
2514                 else
2515                         blocksize = sctx->fs_info->sectorsize;
2516                 spin_lock(&sctx->stat_lock);
2517                 sctx->stat.data_extents_scrubbed++;
2518                 sctx->stat.data_bytes_scrubbed += len;
2519                 spin_unlock(&sctx->stat_lock);
2520         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2521                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522                         blocksize = map->stripe_len;
2523                 else
2524                         blocksize = sctx->fs_info->nodesize;
2525                 spin_lock(&sctx->stat_lock);
2526                 sctx->stat.tree_extents_scrubbed++;
2527                 sctx->stat.tree_bytes_scrubbed += len;
2528                 spin_unlock(&sctx->stat_lock);
2529         } else {
2530                 blocksize = sctx->fs_info->sectorsize;
2531                 WARN_ON(1);
2532         }
2533
2534         /*
2535          * In the dev-replace case, @dev can be a missing device.
2536          * Regular scrub avoids running on a missing device at all,
2537          * as that would trigger tons of read errors.
2538          *
2539          * Reading from a missing device would only cause the read error
2540          * counts to increase unnecessarily.
2541          * So here we change the read source to a good mirror.
2542          */
2543         if (sctx->is_dev_replace && !dev->bdev)
2544                 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2545                                      &src_dev, &src_mirror);
2546         while (len) {
2547                 u32 l = min(len, blocksize);
2548                 int have_csum = 0;
2549
2550                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2551                         /* push csums to sbio */
2552                         have_csum = scrub_find_csum(sctx, logical, csum);
2553                         if (have_csum == 0)
2554                                 ++sctx->stat.no_csum;
2555                 }
2556                 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2557                                     flags, gen, src_mirror,
2558                                     have_csum ? csum : NULL, physical);
2559                 if (ret)
2560                         return ret;
2561                 len -= l;
2562                 logical += l;
2563                 physical += l;
2564                 src_physical += l;
2565         }
2566         return 0;
2567 }
2568
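/*
 * Like scrub_sectors(), but for sectors that are part of a RAID56 parity
 * stripe: each sector is additionally tracked in the scrub_parity so that the
 * parity can be checked once all data sectors have been read.
 */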
2569 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2570                                   u64 logical, u32 len,
2571                                   u64 physical, struct btrfs_device *dev,
2572                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2573 {
2574         struct scrub_ctx *sctx = sparity->sctx;
2575         struct scrub_block *sblock;
2576         const u32 sectorsize = sctx->fs_info->sectorsize;
2577         int index;
2578
2579         ASSERT(IS_ALIGNED(len, sectorsize));
2580
2581         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2582         if (!sblock) {
2583                 spin_lock(&sctx->stat_lock);
2584                 sctx->stat.malloc_errors++;
2585                 spin_unlock(&sctx->stat_lock);
2586                 return -ENOMEM;
2587         }
2588
2589         /* one ref inside this function, plus one for each page added to
2590          * a bio later on */
2591         refcount_set(&sblock->refs, 1);
2592         sblock->sctx = sctx;
2593         sblock->no_io_error_seen = 1;
2594         sblock->sparity = sparity;
2595         scrub_parity_get(sparity);
2596
2597         for (index = 0; len > 0; index++) {
2598                 struct scrub_sector *sector;
2599
2600                 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2601                 if (!sector) {
2602 leave_nomem:
2603                         spin_lock(&sctx->stat_lock);
2604                         sctx->stat.malloc_errors++;
2605                         spin_unlock(&sctx->stat_lock);
2606                         scrub_block_put(sblock);
2607                         return -ENOMEM;
2608                 }
2609                 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2610                 /* For scrub block */
2611                 scrub_sector_get(sector);
2612                 sblock->sectors[index] = sector;
2613                 /* For scrub parity */
2614                 scrub_sector_get(sector);
2615                 list_add_tail(&sector->list, &sparity->sectors_list);
2616                 sector->sblock = sblock;
2617                 sector->dev = dev;
2618                 sector->flags = flags;
2619                 sector->generation = gen;
2620                 sector->logical = logical;
2621                 sector->physical = physical;
2622                 sector->mirror_num = mirror_num;
2623                 if (csum) {
2624                         sector->have_csum = 1;
2625                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2626                 } else {
2627                         sector->have_csum = 0;
2628                 }
2629                 sblock->sector_count++;
2630                 sector->page = alloc_page(GFP_KERNEL);
2631                 if (!sector->page)
2632                         goto leave_nomem;
2633
2634
2635                 /* Iterate over the stripe range in sectorsize steps */
2636                 len -= sectorsize;
2637                 logical += sectorsize;
2638                 physical += sectorsize;
2639         }
2640
2641         WARN_ON(sblock->sector_count == 0);
2642         for (index = 0; index < sblock->sector_count; index++) {
2643                 struct scrub_sector *sector = sblock->sectors[index];
2644                 int ret;
2645
2646                 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2647                 if (ret) {
2648                         scrub_block_put(sblock);
2649                         return ret;
2650                 }
2651         }
2652
2653         /* Last one frees, either here or in bio completion for last sector */
2654         scrub_block_put(sblock);
2655         return 0;
2656 }
2657
2658 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2659                                    u64 logical, u32 len,
2660                                    u64 physical, struct btrfs_device *dev,
2661                                    u64 flags, u64 gen, int mirror_num)
2662 {
2663         struct scrub_ctx *sctx = sparity->sctx;
2664         int ret;
2665         u8 csum[BTRFS_CSUM_SIZE];
2666         u32 blocksize;
2667
2668         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2669                 scrub_parity_mark_sectors_error(sparity, logical, len);
2670                 return 0;
2671         }
2672
2673         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2674                 blocksize = sparity->stripe_len;
2675         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2676                 blocksize = sparity->stripe_len;
2677         } else {
2678                 blocksize = sctx->fs_info->sectorsize;
2679                 WARN_ON(1);
2680         }
2681
2682         while (len) {
2683                 u32 l = min(len, blocksize);
2684                 int have_csum = 0;
2685
2686                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2687                         /* push csums to sbio */
2688                         have_csum = scrub_find_csum(sctx, logical, csum);
2689                         if (have_csum == 0)
2690                                 goto skip;
2691                 }
2692                 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2693                                              flags, gen, mirror_num,
2694                                              have_csum ? csum : NULL);
2695                 if (ret)
2696                         return ret;
2697 skip:
2698                 len -= l;
2699                 logical += l;
2700                 physical += l;
2701         }
2702         return 0;
2703 }
2704
2705 /*
2706  * Given a physical address, this will calculate its
2707  * logical offset. If this is a parity stripe, it will return
2708  * the leftmost data stripe's logical offset.
2709  *
2710  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2711  */
2712 static int get_raid56_logic_offset(u64 physical, int num,
2713                                    struct map_lookup *map, u64 *offset,
2714                                    u64 *stripe_start)
2715 {
2716         int i;
2717         int j = 0;
2718         u64 stripe_nr;
2719         u64 last_offset;
2720         u32 stripe_index;
2721         u32 rot;
2722         const int data_stripes = nr_data_stripes(map);
2723
2724         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2725         if (stripe_start)
2726                 *stripe_start = last_offset;
2727
2728         *offset = last_offset;
2729         for (i = 0; i < data_stripes; i++) {
2730                 *offset = last_offset + i * map->stripe_len;
2731
2732                 stripe_nr = div64_u64(*offset, map->stripe_len);
2733                 stripe_nr = div_u64(stripe_nr, data_stripes);
2734
2735                 /* Work out the disk rotation on this stripe-set */
2736                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2737                 /* Calculate which stripe this data is located on */
2738                 rot += i;
2739                 stripe_index = rot % map->num_stripes;
2740                 if (stripe_index == num)
2741                         return 0;
2742                 if (stripe_index < num)
2743                         j++;
2744         }
2745         *offset = last_offset + j * map->stripe_len;
2746         return 1;
2747 }
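
/*
 * Illustrative sketch (not part of the original file): the same rotation
 * arithmetic as get_raid56_logic_offset() above, reduced to a standalone
 * helper for a hypothetical 3-device RAID5 layout (2 data stripes + 1 parity,
 * 64KiB stripe_len).  Plain division stands in for the div_u64() helpers.
 */
static inline int example_raid5_logic_offset(u64 physical_in_dev, int dev_index,
					     u64 *offset)
{
	const u64 stripe_len = 64 * 1024;
	const int num_stripes = 3;		/* total devices */
	const int data_stripes = 2;		/* num_stripes minus one parity */
	const u64 last_offset = physical_in_dev * data_stripes;
	int i, j = 0;

	for (i = 0; i < data_stripes; i++) {
		u64 cur = last_offset + i * stripe_len;
		u64 stripe_nr = cur / stripe_len / data_stripes;
		int stripe_index = (int)((stripe_nr + i) % num_stripes);

		if (stripe_index == dev_index) {
			*offset = cur;		/* data stripe */
			return 0;
		}
		if (stripe_index < dev_index)
			j++;
	}
	/* Parity stripe: report the left-most data stripe of this full stripe */
	*offset = last_offset + j * stripe_len;
	return 1;
}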
2748
2749 static void scrub_free_parity(struct scrub_parity *sparity)
2750 {
2751         struct scrub_ctx *sctx = sparity->sctx;
2752         struct scrub_sector *curr, *next;
2753         int nbits;
2754
2755         nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2756         if (nbits) {
2757                 spin_lock(&sctx->stat_lock);
2758                 sctx->stat.read_errors += nbits;
2759                 sctx->stat.uncorrectable_errors += nbits;
2760                 spin_unlock(&sctx->stat_lock);
2761         }
2762
2763         list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2764                 list_del_init(&curr->list);
2765                 scrub_sector_put(curr);
2766         }
2767
2768         kfree(sparity);
2769 }
2770
2771 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2772 {
2773         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2774                                                     work);
2775         struct scrub_ctx *sctx = sparity->sctx;
2776
2777         scrub_free_parity(sparity);
2778         scrub_pending_bio_dec(sctx);
2779 }
2780
2781 static void scrub_parity_bio_endio(struct bio *bio)
2782 {
2783         struct scrub_parity *sparity = bio->bi_private;
2784         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2785
2786         if (bio->bi_status)
2787                 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2788                           &sparity->dbitmap, sparity->nsectors);
2789
2790         bio_put(bio);
2791
2792         INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2793         queue_work(fs_info->scrub_parity_workers, &sparity->work);
2794 }
2795
2796 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2797 {
2798         struct scrub_ctx *sctx = sparity->sctx;
2799         struct btrfs_fs_info *fs_info = sctx->fs_info;
2800         struct bio *bio;
2801         struct btrfs_raid_bio *rbio;
2802         struct btrfs_io_context *bioc = NULL;
2803         u64 length;
2804         int ret;
2805
2806         if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2807                            &sparity->ebitmap, sparity->nsectors))
2808                 goto out;
2809
2810         length = sparity->logic_end - sparity->logic_start;
2811
2812         btrfs_bio_counter_inc_blocked(fs_info);
2813         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2814                                &length, &bioc);
2815         if (ret || !bioc || !bioc->raid_map)
2816                 goto bioc_out;
2817
2818         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2819         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2820         bio->bi_private = sparity;
2821         bio->bi_end_io = scrub_parity_bio_endio;
2822
2823         rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
2824                                               sparity->scrub_dev,
2825                                               &sparity->dbitmap,
2826                                               sparity->nsectors);
2827         if (!rbio)
2828                 goto rbio_out;
2829
2830         scrub_pending_bio_inc(sctx);
2831         raid56_parity_submit_scrub_rbio(rbio);
2832         return;
2833
2834 rbio_out:
2835         bio_put(bio);
2836 bioc_out:
2837         btrfs_bio_counter_dec(fs_info);
2838         btrfs_put_bioc(bioc);
2839         bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2840                   sparity->nsectors);
2841         spin_lock(&sctx->stat_lock);
2842         sctx->stat.malloc_errors++;
2843         spin_unlock(&sctx->stat_lock);
2844 out:
2845         scrub_free_parity(sparity);
2846 }
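
/*
 * Bitmap illustration (not part of the original code): dbitmap marks the
 * sectors whose parity still needs to be checked, ebitmap marks sectors that
 * already failed to read.  The bitmap_andnot() call above drops the errored
 * sectors first and skips the repair entirely when nothing is left, as in
 * this hypothetical 4-sector case:
 */
static inline bool example_parity_bitmaps(void)
{
	unsigned long dbitmap = 0x0f;	/* sectors 0-3 have data to verify */
	unsigned long ebitmap = 0x0f;	/* ...but all of them hit read errors */

	/* Returns false: dbitmap is emptied, so check-and-repair is skipped */
	return bitmap_andnot(&dbitmap, &dbitmap, &ebitmap, 4);
}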
2847
2848 static void scrub_parity_get(struct scrub_parity *sparity)
2849 {
2850         refcount_inc(&sparity->refs);
2851 }
2852
2853 static void scrub_parity_put(struct scrub_parity *sparity)
2854 {
2855         if (!refcount_dec_and_test(&sparity->refs))
2856                 return;
2857
2858         scrub_parity_check_and_repair(sparity);
2859 }
2860
2861 /*
2862  * Return 0 if the extent item range covers any byte of the range.
2863  * Return <0 if the extent item is before @search_start.
2864  * Return >0 if the extent item is after @search_start + @search_len.
2865  */
2866 static int compare_extent_item_range(struct btrfs_path *path,
2867                                      u64 search_start, u64 search_len)
2868 {
2869         struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2870         u64 len;
2871         struct btrfs_key key;
2872
2873         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2874         ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2875                key.type == BTRFS_METADATA_ITEM_KEY);
2876         if (key.type == BTRFS_METADATA_ITEM_KEY)
2877                 len = fs_info->nodesize;
2878         else
2879                 len = key.offset;
2880
2881         if (key.objectid + len <= search_start)
2882                 return -1;
2883         if (key.objectid >= search_start + search_len)
2884                 return 1;
2885         return 0;
2886 }
2887
2888 /*
2889  * Locate one extent item which covers any byte in range
2890  * [@search_start, @search_start + @search_length)
2891  *
2892  * If the path is not initialized, we will initialize the search by doing
2893  * a btrfs_search_slot().
2894  * If the path is already initialized, we will use the path as the initial
2895  * slot, to avoid duplicated btrfs_search_slot() calls.
2896  *
2897  * NOTE: If an extent item starts before @search_start, we will still
2898  * return the extent item. This is for data extents crossing stripe boundaries.
2899  *
2900  * Return 0 if we found such extent item, and @path will point to the extent item.
2901  * Return >0 if no such extent item can be found, and @path will be released.
2902  * Return <0 if hit fatal error, and @path will be released.
2903  */
2904 static int find_first_extent_item(struct btrfs_root *extent_root,
2905                                   struct btrfs_path *path,
2906                                   u64 search_start, u64 search_len)
2907 {
2908         struct btrfs_fs_info *fs_info = extent_root->fs_info;
2909         struct btrfs_key key;
2910         int ret;
2911
2912         /* Continue using the existing path */
2913         if (path->nodes[0])
2914                 goto search_forward;
2915
2916         if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2917                 key.type = BTRFS_METADATA_ITEM_KEY;
2918         else
2919                 key.type = BTRFS_EXTENT_ITEM_KEY;
2920         key.objectid = search_start;
2921         key.offset = (u64)-1;
2922
2923         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2924         if (ret < 0)
2925                 return ret;
2926
2927         ASSERT(ret > 0);
2928         /*
2929          * Here we intentionally pass 0 as @min_objectid, as there could be
2930          * an extent item starting before @search_start.
2931          */
2932         ret = btrfs_previous_extent_item(extent_root, path, 0);
2933         if (ret < 0)
2934                 return ret;
2935         /*
2936          * Whether or not we have found an extent item, the loop below will
2937          * properly check the key in every case.
2938          */
2939 search_forward:
2940         while (true) {
2941                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2942                 if (key.objectid >= search_start + search_len)
2943                         break;
2944                 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2945                     key.type != BTRFS_EXTENT_ITEM_KEY)
2946                         goto next;
2947
2948                 ret = compare_extent_item_range(path, search_start, search_len);
2949                 if (ret == 0)
2950                         return ret;
2951                 if (ret > 0)
2952                         break;
2953 next:
2954                 path->slots[0]++;
2955                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2956                         ret = btrfs_next_leaf(extent_root, path);
2957                         if (ret) {
2958                                 /* Either no more item or fatal error */
2959                                 btrfs_release_path(path);
2960                                 return ret;
2961                         }
2962                 }
2963         }
2964         btrfs_release_path(path);
2965         return 1;
2966 }
2967
2968 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2969                             u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2970 {
2971         struct btrfs_key key;
2972         struct btrfs_extent_item *ei;
2973
2974         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2975         ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2976                key.type == BTRFS_EXTENT_ITEM_KEY);
2977         *extent_start_ret = key.objectid;
2978         if (key.type == BTRFS_METADATA_ITEM_KEY)
2979                 *size_ret = path->nodes[0]->fs_info->nodesize;
2980         else
2981                 *size_ret = key.offset;
2982         ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2983         *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2984         *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
2985 }
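
/*
 * Minimal caller sketch (illustration only, not used by the code above):
 * walk every extent item intersecting [start, start + len), the same pattern
 * scrub_simple_mirror() and scrub_raid56_data_stripe_for_parity() follow.
 * The hypothetical helper only counts the items to keep the example small.
 */
static inline int example_count_extent_items(struct btrfs_root *extent_root,
					     u64 start, u64 len)
{
	struct btrfs_path path = { 0 };
	u64 cur = start;
	int count = 0;
	int ret;

	path.search_commit_root = 1;
	path.skip_locking = 1;
	while (cur < start + len) {
		u64 extent_start, extent_size, flags, gen;

		ret = find_first_extent_item(extent_root, &path, cur,
					     start + len - cur);
		if (ret)	/* >0: no more items, <0: error; path released */
			break;
		get_extent_info(&path, &extent_start, &extent_size, &flags, &gen);
		count++;
		/* The item may start before @cur, so only ever advance forward */
		cur = max(extent_start + extent_size, cur + 1);
	}
	btrfs_release_path(&path);
	return count;
}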
2986
2987 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2988                                       u64 boundary_start, u64 boundary_len)
2989 {
2990         return (extent_start < boundary_start &&
2991                 extent_start + extent_len > boundary_start) ||
2992                (extent_start < boundary_start + boundary_len &&
2993                 extent_start + extent_len > boundary_start + boundary_len);
2994 }
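
/*
 * Quick illustration (not part of the original code): with a hypothetical
 * 64KiB boundary starting at 128KiB, an extent [96KiB, 160KiB) crosses the
 * lower edge, while an extent equal to the boundary itself does not.
 */
static inline bool example_boundary_checks(void)
{
	const u64 bstart = 128 * 1024, blen = 64 * 1024;

	return does_range_cross_boundary(96 * 1024, 64 * 1024, bstart, blen) &&
	       !does_range_cross_boundary(bstart, blen, bstart, blen);
}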
2995
2996 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
2997                                                struct scrub_parity *sparity,
2998                                                struct map_lookup *map,
2999                                                struct btrfs_device *sdev,
3000                                                struct btrfs_path *path,
3001                                                u64 logical)
3002 {
3003         struct btrfs_fs_info *fs_info = sctx->fs_info;
3004         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3005         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3006         u64 cur_logical = logical;
3007         int ret;
3008
3009         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3010
3011         /* Path must not be populated */
3012         ASSERT(!path->nodes[0]);
3013
3014         while (cur_logical < logical + map->stripe_len) {
3015                 struct btrfs_io_context *bioc = NULL;
3016                 struct btrfs_device *extent_dev;
3017                 u64 extent_start;
3018                 u64 extent_size;
3019                 u64 mapped_length;
3020                 u64 extent_flags;
3021                 u64 extent_gen;
3022                 u64 extent_physical;
3023                 u64 extent_mirror_num;
3024
3025                 ret = find_first_extent_item(extent_root, path, cur_logical,
3026                                              logical + map->stripe_len - cur_logical);
3027                 /* No more extent item in this data stripe */
3028                 if (ret > 0) {
3029                         ret = 0;
3030                         break;
3031                 }
3032                 if (ret < 0)
3033                         break;
3034                 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3035                                 &extent_gen);
3036
3037                 /* Metadata should not cross stripe boundaries */
3038                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3039                     does_range_cross_boundary(extent_start, extent_size,
3040                                               logical, map->stripe_len)) {
3041                         btrfs_err(fs_info,
3042         "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3043                                   extent_start, logical);
3044                         spin_lock(&sctx->stat_lock);
3045                         sctx->stat.uncorrectable_errors++;
3046                         spin_unlock(&sctx->stat_lock);
3047                         cur_logical += extent_size;
3048                         continue;
3049                 }
3050
3051                 /* Skip hole range which doesn't have any extent */
3052                 cur_logical = max(extent_start, cur_logical);
3053
3054                 /* Truncate the range inside this data stripe */
3055                 extent_size = min(extent_start + extent_size,
3056                                   logical + map->stripe_len) - cur_logical;
3057                 extent_start = cur_logical;
3058                 ASSERT(extent_size <= U32_MAX);
3059
3060                 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3061
3062                 mapped_length = extent_size;
3063                 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3064                                       &mapped_length, &bioc, 0);
3065                 if (!ret && (!bioc || mapped_length < extent_size))
3066                         ret = -EIO;
3067                 if (ret) {
3068                         btrfs_put_bioc(bioc);
3069                         scrub_parity_mark_sectors_error(sparity, extent_start,
3070                                                         extent_size);
3071                         break;
3072                 }
3073                 extent_physical = bioc->stripes[0].physical;
3074                 extent_mirror_num = bioc->mirror_num;
3075                 extent_dev = bioc->stripes[0].dev;
3076                 btrfs_put_bioc(bioc);
3077
3078                 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3079                                                extent_start + extent_size - 1,
3080                                                &sctx->csum_list, 1);
3081                 if (ret) {
3082                         scrub_parity_mark_sectors_error(sparity, extent_start,
3083                                                         extent_size);
3084                         break;
3085                 }
3086
3087                 ret = scrub_extent_for_parity(sparity, extent_start,
3088                                               extent_size, extent_physical,
3089                                               extent_dev, extent_flags,
3090                                               extent_gen, extent_mirror_num);
3091                 scrub_free_csums(sctx);
3092
3093                 if (ret) {
3094                         scrub_parity_mark_sectors_error(sparity, extent_start,
3095                                                         extent_size);
3096                         break;
3097                 }
3098
3099                 cond_resched();
3100                 cur_logical += extent_size;
3101         }
3102         btrfs_release_path(path);
3103         return ret;
3104 }
3105
3106 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3107                                                   struct map_lookup *map,
3108                                                   struct btrfs_device *sdev,
3109                                                   u64 logic_start,
3110                                                   u64 logic_end)
3111 {
3112         struct btrfs_fs_info *fs_info = sctx->fs_info;
3113         struct btrfs_path *path;
3114         u64 cur_logical;
3115         int ret;
3116         struct scrub_parity *sparity;
3117         int nsectors;
3118
3119         path = btrfs_alloc_path();
3120         if (!path) {
3121                 spin_lock(&sctx->stat_lock);
3122                 sctx->stat.malloc_errors++;
3123                 spin_unlock(&sctx->stat_lock);
3124                 return -ENOMEM;
3125         }
3126         path->search_commit_root = 1;
3127         path->skip_locking = 1;
3128
3129         ASSERT(map->stripe_len <= U32_MAX);
3130         nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3131         ASSERT(nsectors <= BITS_PER_LONG);
3132         sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3133         if (!sparity) {
3134                 spin_lock(&sctx->stat_lock);
3135                 sctx->stat.malloc_errors++;
3136                 spin_unlock(&sctx->stat_lock);
3137                 btrfs_free_path(path);
3138                 return -ENOMEM;
3139         }
3140
3141         ASSERT(map->stripe_len <= U32_MAX);
3142         sparity->stripe_len = map->stripe_len;
3143         sparity->nsectors = nsectors;
3144         sparity->sctx = sctx;
3145         sparity->scrub_dev = sdev;
3146         sparity->logic_start = logic_start;
3147         sparity->logic_end = logic_end;
3148         refcount_set(&sparity->refs, 1);
3149         INIT_LIST_HEAD(&sparity->sectors_list);
3150
3151         ret = 0;
3152         for (cur_logical = logic_start; cur_logical < logic_end;
3153              cur_logical += map->stripe_len) {
3154                 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3155                                                           sdev, path, cur_logical);
3156                 if (ret < 0)
3157                         break;
3158         }
3159
3160         scrub_parity_put(sparity);
3161         scrub_submit(sctx);
3162         mutex_lock(&sctx->wr_lock);
3163         scrub_wr_submit(sctx);
3164         mutex_unlock(&sctx->wr_lock);
3165
3166         btrfs_free_path(path);
3167         return ret < 0 ? ret : 0;
3168 }
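
/*
 * Size illustration (not from the original code): with the common 64KiB
 * stripe_len and 4KiB sectorsize, nsectors above is 64K >> 12 == 16, which
 * comfortably fits the single-word dbitmap/ebitmap (BITS_PER_LONG).
 */
static inline int example_parity_nsectors(void)
{
	const u32 stripe_len = 64 * 1024;
	const u32 sectorsize_bits = 12;		/* 4KiB sectors */

	return stripe_len >> sectorsize_bits;	/* 16 */
}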
3169
3170 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3171 {
3172         if (!btrfs_is_zoned(sctx->fs_info))
3173                 return;
3174
3175         sctx->flush_all_writes = true;
3176         scrub_submit(sctx);
3177         mutex_lock(&sctx->wr_lock);
3178         scrub_wr_submit(sctx);
3179         mutex_unlock(&sctx->wr_lock);
3180
3181         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3182 }
3183
3184 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3185                                         u64 physical, u64 physical_end)
3186 {
3187         struct btrfs_fs_info *fs_info = sctx->fs_info;
3188         int ret = 0;
3189
3190         if (!btrfs_is_zoned(fs_info))
3191                 return 0;
3192
3193         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3194
3195         mutex_lock(&sctx->wr_lock);
3196         if (sctx->write_pointer < physical_end) {
3197                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3198                                                     physical,
3199                                                     sctx->write_pointer);
3200                 if (ret)
3201                         btrfs_err(fs_info,
3202                                   "zoned: failed to recover write pointer");
3203         }
3204         mutex_unlock(&sctx->wr_lock);
3205         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3206
3207         return ret;
3208 }
3209
3210 /*
3211  * Scrub one range which can only have a simple mirror based profile.
3212  * (This includes every range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3213  *  RAID0/RAID10).
3214  *
3215  * Since we may need to handle a subset of a block group, we need the
3216  * @logical_start and @logical_length parameters.
3217  */
3218 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3219                                struct btrfs_root *extent_root,
3220                                struct btrfs_root *csum_root,
3221                                struct btrfs_block_group *bg,
3222                                struct map_lookup *map,
3223                                u64 logical_start, u64 logical_length,
3224                                struct btrfs_device *device,
3225                                u64 physical, int mirror_num)
3226 {
3227         struct btrfs_fs_info *fs_info = sctx->fs_info;
3228         const u64 logical_end = logical_start + logical_length;
3229         /* An artificial limit, inherited from the old scrub behavior */
3230         const u32 max_length = SZ_64K;
3231         struct btrfs_path path = { 0 };
3232         u64 cur_logical = logical_start;
3233         int ret;
3234
3235         /* The range must be inside the bg */
3236         ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3237
3238         path.search_commit_root = 1;
3239         path.skip_locking = 1;
3240         /* Go through each extent item inside the logical range */
3241         while (cur_logical < logical_end) {
3242                 u64 extent_start;
3243                 u64 extent_len;
3244                 u64 extent_flags;
3245                 u64 extent_gen;
3246                 u64 scrub_len;
3247
3248                 /* Canceled? */
3249                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3250                     atomic_read(&sctx->cancel_req)) {
3251                         ret = -ECANCELED;
3252                         break;
3253                 }
3254                 /* Paused? */
3255                 if (atomic_read(&fs_info->scrub_pause_req)) {
3256                         /* Push queued extents */
3257                         sctx->flush_all_writes = true;
3258                         scrub_submit(sctx);
3259                         mutex_lock(&sctx->wr_lock);
3260                         scrub_wr_submit(sctx);
3261                         mutex_unlock(&sctx->wr_lock);
3262                         wait_event(sctx->list_wait,
3263                                    atomic_read(&sctx->bios_in_flight) == 0);
3264                         sctx->flush_all_writes = false;
3265                         scrub_blocked_if_needed(fs_info);
3266                 }
3267                 /* Block group removed? */
3268                 spin_lock(&bg->lock);
3269                 if (bg->removed) {
3270                         spin_unlock(&bg->lock);
3271                         ret = 0;
3272                         break;
3273                 }
3274                 spin_unlock(&bg->lock);
3275
3276                 ret = find_first_extent_item(extent_root, &path, cur_logical,
3277                                              logical_end - cur_logical);
3278                 if (ret > 0) {
3279                         /* No more extent, just update the accounting */
3280                         sctx->stat.last_physical = physical + logical_length;
3281                         ret = 0;
3282                         break;
3283                 }
3284                 if (ret < 0)
3285                         break;
3286                 get_extent_info(&path, &extent_start, &extent_len,
3287                                 &extent_flags, &extent_gen);
3288                 /* Skip hole range which doesn't have any extent */
3289                 cur_logical = max(extent_start, cur_logical);
3290
3291                 /*
3292                  * Scrub len has three limits:
3293                  * - Extent size limit
3294                  * - Scrub range limit
3295                  *   This is especially important for RAID0/RAID10 to reuse
3296                  *   this function.
3297                  * - Max scrub size limit
3298                  */
3299                 scrub_len = min(min(extent_start + extent_len,
3300                                     logical_end), cur_logical + max_length) -
3301                             cur_logical;
3302
3303                 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3304                         ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3305                                         cur_logical + scrub_len - 1,
3306                                         &sctx->csum_list, 1);
3307                         if (ret)
3308                                 break;
3309                 }
3310                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3311                     does_range_cross_boundary(extent_start, extent_len,
3312                                               logical_start, logical_length)) {
3313                         btrfs_err(fs_info,
3314 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3315                                   extent_start, logical_start, logical_end);
3316                         spin_lock(&sctx->stat_lock);
3317                         sctx->stat.uncorrectable_errors++;
3318                         spin_unlock(&sctx->stat_lock);
3319                         cur_logical += scrub_len;
3320                         continue;
3321                 }
3322                 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3323                                    cur_logical - logical_start + physical,
3324                                    device, extent_flags, extent_gen,
3325                                    mirror_num);
3326                 scrub_free_csums(sctx);
3327                 if (ret)
3328                         break;
3329                 if (sctx->is_dev_replace)
3330                         sync_replace_for_zoned(sctx);
3331                 cur_logical += scrub_len;
3332                 /* Don't hold the CPU for too long */
3333                 cond_resched();
3334         }
3335         btrfs_release_path(&path);
3336         return ret;
3337 }
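
/*
 * scrub_len illustration (not from the original code): the loop above clamps
 * each submission to min(extent end, range end, cur + 64KiB) - cur.  E.g. for
 * a 1MiB extent starting at the current position, well inside the range, the
 * first chunk is just the artificial 64KiB limit.
 */
static inline u64 example_scrub_len(u64 cur, u64 extent_end, u64 range_end)
{
	const u64 max_length = SZ_64K;	/* same artificial cap as above */

	return min(min(extent_end, range_end), cur + max_length) - cur;
}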
3338
3339 /* Calculate the full stripe length for simple stripe based profiles */
3340 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3341 {
3342         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3343                             BTRFS_BLOCK_GROUP_RAID10));
3344
3345         return map->num_stripes / map->sub_stripes * map->stripe_len;
3346 }
3347
3348 /* Get the logical bytenr for the stripe */
3349 static u64 simple_stripe_get_logical(struct map_lookup *map,
3350                                      struct btrfs_block_group *bg,
3351                                      int stripe_index)
3352 {
3353         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3354                             BTRFS_BLOCK_GROUP_RAID10));
3355         ASSERT(stripe_index < map->num_stripes);
3356
3357         /*
3358          * (stripe_index / sub_stripes) gives how many data stripes we need to
3359          * skip.
3360          */
3361         return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3362 }
3363
3364 /* Get the mirror number for the stripe */
3365 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3366 {
3367         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3368                             BTRFS_BLOCK_GROUP_RAID10));
3369         ASSERT(stripe_index < map->num_stripes);
3370
3371         /* For RAID0 it's always 1; for RAID10 it alternates 1,2,1,2,... */
3372         return stripe_index % map->sub_stripes + 1;
3373 }
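
/*
 * Worked example (illustration only): a hypothetical RAID10 chunk with
 * num_stripes = 4, sub_stripes = 2 and stripe_len = 64KiB.  The three helpers
 * above reduce to the plain arithmetic below; the mirror number is returned.
 */
static inline int example_simple_stripe_math(u64 *full_stripe_len,
					     u64 *logical_offset)
{
	const u64 stripe_len = 64 * 1024;
	const int num_stripes = 4, sub_stripes = 2;
	const int stripe_index = 3;	/* second copy of the second stripe */

	/* simple_stripe_full_stripe_len(): 4 / 2 * 64KiB == 128KiB */
	*full_stripe_len = num_stripes / sub_stripes * stripe_len;
	/* simple_stripe_get_logical() minus bg->start: skip 3 / 2 == 1 stripe */
	*logical_offset = (stripe_index / sub_stripes) * stripe_len;
	/* simple_stripe_mirror_num(): 3 % 2 + 1 == 2 */
	return stripe_index % sub_stripes + 1;
}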
3374
3375 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3376                                struct btrfs_root *extent_root,
3377                                struct btrfs_root *csum_root,
3378                                struct btrfs_block_group *bg,
3379                                struct map_lookup *map,
3380                                struct btrfs_device *device,
3381                                int stripe_index)
3382 {
3383         const u64 logical_increment = simple_stripe_full_stripe_len(map);
3384         const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3385         const u64 orig_physical = map->stripes[stripe_index].physical;
3386         const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3387         u64 cur_logical = orig_logical;
3388         u64 cur_physical = orig_physical;
3389         int ret = 0;
3390
3391         while (cur_logical < bg->start + bg->length) {
3392                 /*
3393                  * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3394                  * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3395                  * this stripe.
3396                  */
3397                 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3398                                           cur_logical, map->stripe_len, device,
3399                                           cur_physical, mirror_num);
3400                 if (ret)
3401                         return ret;
3402                 /* Skip to next stripe which belongs to the target device */
3403                 cur_logical += logical_increment;
3404                 /* For physical offset, we just go to next stripe */
3405                 cur_physical += map->stripe_len;
3406         }
3407         return ret;
3408 }
3409
3410 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3411                                            struct btrfs_block_group *bg,
3412                                            struct extent_map *em,
3413                                            struct btrfs_device *scrub_dev,
3414                                            int stripe_index)
3415 {
3416         struct btrfs_path *path;
3417         struct btrfs_fs_info *fs_info = sctx->fs_info;
3418         struct btrfs_root *root;
3419         struct btrfs_root *csum_root;
3420         struct blk_plug plug;
3421         struct map_lookup *map = em->map_lookup;
3422         const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3423         const u64 chunk_logical = bg->start;
3424         int ret;
3425         u64 physical = map->stripes[stripe_index].physical;
3426         const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
3427         const u64 physical_end = physical + dev_stripe_len;
3428         u64 logical;
3429         u64 logic_end;
3430         /* The logical increment after finishing one stripe */
3431         u64 increment;
3432         /* Offset inside the chunk */
3433         u64 offset;
3434         u64 stripe_logical;
3435         u64 stripe_end;
3436         int stop_loop = 0;
3437
3438         path = btrfs_alloc_path();
3439         if (!path)
3440                 return -ENOMEM;
3441
3442         /*
3443          * Work on the commit root. The related disk blocks are static as
3444          * long as COW is applied. This means it is safe to rewrite
3445          * them to repair disk errors without any race conditions.
3446          */
3447         path->search_commit_root = 1;
3448         path->skip_locking = 1;
3449         path->reada = READA_FORWARD;
3450
3451         wait_event(sctx->list_wait,
3452                    atomic_read(&sctx->bios_in_flight) == 0);
3453         scrub_blocked_if_needed(fs_info);
3454
3455         root = btrfs_extent_root(fs_info, bg->start);
3456         csum_root = btrfs_csum_root(fs_info, bg->start);
3457
3458         /*
3459          * Collect all data csums for the stripe to avoid seeking during
3460          * the scrub. This might currently (crc32) end up being about 1MB.
3461          */
3462         blk_start_plug(&plug);
3463
3464         if (sctx->is_dev_replace &&
3465             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3466                 mutex_lock(&sctx->wr_lock);
3467                 sctx->write_pointer = physical;
3468                 mutex_unlock(&sctx->wr_lock);
3469                 sctx->flush_all_writes = true;
3470         }
3471
3472         /*
3473          * There used to be a big double loop to handle all profiles using the
3474          * same routine, which grew larger and more gross over time.
3475          *
3476          * So here we handle each profile differently, so that simpler profiles
3477          * have simpler scrubbing functions.
3478          */
3479         if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3480                          BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3481                 /*
3482                  * The above check rules out all complex profiles; the remaining
3483                  * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
3484                  * mirrored duplication without striping.
3485                  *
3486                  * Only @physical and @mirror_num need to be calculated using
3487                  * @stripe_index.
3488                  */
3489                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3490                                 bg->start, bg->length, scrub_dev,
3491                                 map->stripes[stripe_index].physical,
3492                                 stripe_index + 1);
3493                 offset = 0;
3494                 goto out;
3495         }
3496         if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3497                 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3498                                           scrub_dev, stripe_index);
3499                 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3500                 goto out;
3501         }
3502
3503         /* Only RAID56 goes through the old code */
3504         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3505         ret = 0;
3506
3507         /* Calculate the logical end of the stripe */
3508         get_raid56_logic_offset(physical_end, stripe_index,
3509                                 map, &logic_end, NULL);
3510         logic_end += chunk_logical;
3511
3512         /* Initialize @offset in case we need to go to out: label */
3513         get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3514         increment = map->stripe_len * nr_data_stripes(map);
3515
3516         /*
3517          * Due to the rotation, for RAID56 it's better to iterate over the
3518          * stripes using their physical offsets.
3519          */
3520         while (physical < physical_end) {
3521                 ret = get_raid56_logic_offset(physical, stripe_index, map,
3522                                               &logical, &stripe_logical);
3523                 logical += chunk_logical;
3524                 if (ret) {
3525                         /* It is a parity stripe */
3526                         stripe_logical += chunk_logical;
3527                         stripe_end = stripe_logical + increment;
3528                         ret = scrub_raid56_parity(sctx, map, scrub_dev,
3529                                                   stripe_logical,
3530                                                   stripe_end);
3531                         if (ret)
3532                                 goto out;
3533                         goto next;
3534                 }
3535
3536                 /*
3537                  * Now we're at a data stripe, scrub each extent in the range.
3538                  *
3539                  * At this stage, if we ignore the repair part, inside each data
3540                  * stripe it is no different from the SINGLE profile.
3541                  * We can reuse scrub_simple_mirror() here, as the repair part
3542                  * is still based on @mirror_num.
3543                  */
3544                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3545                                           logical, map->stripe_len,
3546                                           scrub_dev, physical, 1);
3547                 if (ret < 0)
3548                         goto out;
3549 next:
3550                 logical += increment;
3551                 physical += map->stripe_len;
3552                 spin_lock(&sctx->stat_lock);
3553                 if (stop_loop)
3554                         sctx->stat.last_physical =
3555                                 map->stripes[stripe_index].physical + dev_stripe_len;
3556                 else
3557                         sctx->stat.last_physical = physical;
3558                 spin_unlock(&sctx->stat_lock);
3559                 if (stop_loop)
3560                         break;
3561         }
3562 out:
3563         /* push queued extents */
3564         scrub_submit(sctx);
3565         mutex_lock(&sctx->wr_lock);
3566         scrub_wr_submit(sctx);
3567         mutex_unlock(&sctx->wr_lock);
3568
3569         blk_finish_plug(&plug);
3570         btrfs_free_path(path);
3571
3572         if (sctx->is_dev_replace && ret >= 0) {
3573                 int ret2;
3574
3575                 ret2 = sync_write_pointer_for_zoned(sctx,
3576                                 chunk_logical + offset,
3577                                 map->stripes[stripe_index].physical,
3578                                 physical_end);
3579                 if (ret2)
3580                         ret = ret2;
3581         }
3582
3583         return ret < 0 ? ret : 0;
3584 }
3585
3586 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3587                                           struct btrfs_block_group *bg,
3588                                           struct btrfs_device *scrub_dev,
3589                                           u64 dev_offset,
3590                                           u64 dev_extent_len)
3591 {
3592         struct btrfs_fs_info *fs_info = sctx->fs_info;
3593         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3594         struct map_lookup *map;
3595         struct extent_map *em;
3596         int i;
3597         int ret = 0;
3598
3599         read_lock(&map_tree->lock);
3600         em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3601         read_unlock(&map_tree->lock);
3602
3603         if (!em) {
3604                 /*
3605                  * Might have been an unused block group deleted by the cleaner
3606                  * kthread or relocation.
3607                  */
3608                 spin_lock(&bg->lock);
3609                 if (!bg->removed)
3610                         ret = -EINVAL;
3611                 spin_unlock(&bg->lock);
3612
3613                 return ret;
3614         }
3615         if (em->start != bg->start)
3616                 goto out;
3617         if (em->len < dev_extent_len)
3618                 goto out;
3619
3620         map = em->map_lookup;
3621         for (i = 0; i < map->num_stripes; ++i) {
3622                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3623                     map->stripes[i].physical == dev_offset) {
3624                         ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
3625                         if (ret)
3626                                 goto out;
3627                 }
3628         }
3629 out:
3630         free_extent_map(em);
3631
3632         return ret;
3633 }
3634
3635 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3636                                           struct btrfs_block_group *cache)
3637 {
3638         struct btrfs_fs_info *fs_info = cache->fs_info;
3639         struct btrfs_trans_handle *trans;
3640
3641         if (!btrfs_is_zoned(fs_info))
3642                 return 0;
3643
3644         btrfs_wait_block_group_reservations(cache);
3645         btrfs_wait_nocow_writers(cache);
3646         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3647
3648         trans = btrfs_join_transaction(root);
3649         if (IS_ERR(trans))
3650                 return PTR_ERR(trans);
3651         return btrfs_commit_transaction(trans);
3652 }
3653
3654 static noinline_for_stack
3655 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3656                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3657 {
3658         struct btrfs_dev_extent *dev_extent = NULL;
3659         struct btrfs_path *path;
3660         struct btrfs_fs_info *fs_info = sctx->fs_info;
3661         struct btrfs_root *root = fs_info->dev_root;
3662         u64 chunk_offset;
3663         int ret = 0;
3664         int ro_set;
3665         int slot;
3666         struct extent_buffer *l;
3667         struct btrfs_key key;
3668         struct btrfs_key found_key;
3669         struct btrfs_block_group *cache;
3670         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3671
3672         path = btrfs_alloc_path();
3673         if (!path)
3674                 return -ENOMEM;
3675
3676         path->reada = READA_FORWARD;
3677         path->search_commit_root = 1;
3678         path->skip_locking = 1;
3679
3680         key.objectid = scrub_dev->devid;
3681         key.offset = 0ull;
3682         key.type = BTRFS_DEV_EXTENT_KEY;
3683
3684         while (1) {
3685                 u64 dev_extent_len;
3686
3687                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3688                 if (ret < 0)
3689                         break;
3690                 if (ret > 0) {
3691                         if (path->slots[0] >=
3692                             btrfs_header_nritems(path->nodes[0])) {
3693                                 ret = btrfs_next_leaf(root, path);
3694                                 if (ret < 0)
3695                                         break;
3696                                 if (ret > 0) {
3697                                         ret = 0;
3698                                         break;
3699                                 }
3700                         } else {
3701                                 ret = 0;
3702                         }
3703                 }
3704
3705                 l = path->nodes[0];
3706                 slot = path->slots[0];
3707
3708                 btrfs_item_key_to_cpu(l, &found_key, slot);
3709
3710                 if (found_key.objectid != scrub_dev->devid)
3711                         break;
3712
3713                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3714                         break;
3715
3716                 if (found_key.offset >= end)
3717                         break;
3718
3719                 if (found_key.offset < key.offset)
3720                         break;
3721
3722                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3723                 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3724
3725                 if (found_key.offset + dev_extent_len <= start)
3726                         goto skip;
3727
3728                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3729
3730                 /*
3731                  * get a reference on the corresponding block group to prevent
3732                  * the chunk from going away while we scrub it
3733                  */
3734                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3735
3736                 /* some chunks are removed but not committed to disk yet,
3737                  * continue scrubbing */
3738                 if (!cache)
3739                         goto skip;
3740
3741                 ASSERT(cache->start <= chunk_offset);
3742                 /*
3743                  * We are using the commit root to search for device extents, so
3744                  * that means we could have found a device extent item from a
3745                  * block group that was deleted in the current transaction. The
3746                  * logical start offset of the deleted block group, stored at
3747                  * @chunk_offset, might be part of the logical address range of
3748                  * a new block group (which uses different physical extents).
3749                  * In this case btrfs_lookup_block_group() has returned the new
3750                  * block group, and its start address is less than @chunk_offset.
3751                  *
3752                  * We skip such new block groups, because it's pointless to
3753                  * process them, as we won't find their extents because we search
3754                  * for them using the commit root of the extent tree. For a device
3755                  * replace it's also fine to skip it, we won't miss copying them
3756                  * to the target device because we have the write duplication
3757                  * setup through the regular write path (by btrfs_map_block()),
3758                  * and we have committed a transaction when we started the device
3759                  * replace, right after setting up the device replace state.
3760                  */
3761                 if (cache->start < chunk_offset) {
3762                         btrfs_put_block_group(cache);
3763                         goto skip;
3764                 }
3765
3766                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3767                         spin_lock(&cache->lock);
3768                         if (!cache->to_copy) {
3769                                 spin_unlock(&cache->lock);
3770                                 btrfs_put_block_group(cache);
3771                                 goto skip;
3772                         }
3773                         spin_unlock(&cache->lock);
3774                 }
3775
3776                 /*
3777                  * Make sure that while we are scrubbing the corresponding block
3778                  * group doesn't get its logical address and its device extents
3779                  * reused for another block group, which can possibly be of a
3780                  * different type and different profile. We do this to prevent
3781                  * false error detections and crashes due to bogus attempts to
3782                  * repair extents.
3783                  */
3784                 spin_lock(&cache->lock);
3785                 if (cache->removed) {
3786                         spin_unlock(&cache->lock);
3787                         btrfs_put_block_group(cache);
3788                         goto skip;
3789                 }
3790                 btrfs_freeze_block_group(cache);
3791                 spin_unlock(&cache->lock);
3792
3793                 /*
3794                  * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3795                  * to avoid deadlock caused by:
3796                  * btrfs_inc_block_group_ro()
3797                  * -> btrfs_wait_for_commit()
3798                  * -> btrfs_commit_transaction()
3799                  * -> btrfs_scrub_pause()
3800                  */
3801                 scrub_pause_on(fs_info);
3802
3803                 /*
3804                  * Don't do chunk preallocation for scrub.
3805                  *
3806                  * This is especially important for SYSTEM bgs, or we can hit
3807                  * -EFBIG from btrfs_finish_chunk_alloc() like:
3808                  * 1. The only SYSTEM bg is marked RO.
3809                  *    Since SYSTEM bg is small, that's pretty common.
3810                  * 2. New SYSTEM bg will be allocated
3811                  *    This is because the regular allocation path will create a new chunk.
3812                  * 3. New SYSTEM bg is empty and will get cleaned up
3813                  *    Before cleanup really happens, it's marked RO again.
3814                  * 4. The empty SYSTEM bg gets scrubbed
3815                  *    We go back to 2.
3816                  *
3817                  * This can easily boost the number of SYSTEM chunks if the cleaner
3818                  * thread can't be triggered fast enough, using up all the space
3819                  * in btrfs_super_block::sys_chunk_array.
3820                  *
3821                  * While for dev replace, we need to try our best to mark block
3822                  * group RO, to prevent a race between:
3823                  * - Write duplication
3824                  *   Contains latest data
3825                  * - Scrub copy
3826                  *   Contains data from commit tree
3827                  *
3828                  * If the target block group is not marked RO, nocow writes can
3829                  * be overwritten by the scrub copy, causing data corruption.
3830                  * So for dev-replace, it's not allowed to continue if a block
3831                  * group is not RO.
3832                  */
3833                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3834                 if (!ret && sctx->is_dev_replace) {
3835                         ret = finish_extent_writes_for_zoned(root, cache);
3836                         if (ret) {
3837                                 btrfs_dec_block_group_ro(cache);
3838                                 scrub_pause_off(fs_info);
3839                                 btrfs_put_block_group(cache);
3840                                 break;
3841                         }
3842                 }
3843
3844                 if (ret == 0) {
3845                         ro_set = 1;
3846                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3847                         /*
3848                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3849                          * fails to create a new chunk for metadata.
3850                          * It is not a problem for scrub, because
3851                          * metadata is always COWed, and our scrub has paused
3852                          * transaction commits.
3853                          */
3854                         ro_set = 0;
3855                 } else if (ret == -ETXTBSY) {
3856                         btrfs_warn(fs_info,
3857                    "skipping scrub of block group %llu due to active swapfile",
3858                                    cache->start);
3859                         scrub_pause_off(fs_info);
3860                         ret = 0;
3861                         goto skip_unfreeze;
3862                 } else {
3863                         btrfs_warn(fs_info,
3864                                    "failed setting block group ro: %d", ret);
3865                         btrfs_unfreeze_block_group(cache);
3866                         btrfs_put_block_group(cache);
3867                         scrub_pause_off(fs_info);
3868                         break;
3869                 }
3870
3871                 /*
3872                  * Now the target block group is marked RO, wait for nocow writes
3873                  * to finish before dev-replace.
3874                  * COW is fine, as COW never overwrites extents in the commit tree.
3875                  */
3876                 if (sctx->is_dev_replace) {
3877                         btrfs_wait_nocow_writers(cache);
3878                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3879                                         cache->length);
3880                 }
3881
3882                 scrub_pause_off(fs_info);
3883                 down_write(&dev_replace->rwsem);
3884                 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3885                 dev_replace->cursor_left = found_key.offset;
3886                 dev_replace->item_needs_writeback = 1;
3887                 up_write(&dev_replace->rwsem);
3888
3889                 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3890                                   dev_extent_len);
3891
3892                 /*
3893                  * Flush and submit all pending read and write bios, and afterwards
3894                  * wait for them.
3895                  * Note that in the dev replace case, a read request causes
3896                  * write requests that are submitted in the read completion
3897                  * worker. Therefore in the current situation, it is required
3898                  * that all write requests are flushed, so that all read and
3899                  * write requests are really completed when bios_in_flight
3900                  * changes to 0.
3901                  */
3902                 sctx->flush_all_writes = true;
3903                 scrub_submit(sctx);
3904                 mutex_lock(&sctx->wr_lock);
3905                 scrub_wr_submit(sctx);
3906                 mutex_unlock(&sctx->wr_lock);
3907
3908                 wait_event(sctx->list_wait,
3909                            atomic_read(&sctx->bios_in_flight) == 0);
3910
3911                 scrub_pause_on(fs_info);
3912
3913                 /*
3914                  * This must be done before we decrease @scrub_paused, to
3915                  * make sure we don't block transaction commit while
3916                  * we are waiting for pending workers to finish.
3917                  */
3918                 wait_event(sctx->list_wait,
3919                            atomic_read(&sctx->workers_pending) == 0);
3920                 sctx->flush_all_writes = false;
3921
3922                 scrub_pause_off(fs_info);
3923
3924                 if (sctx->is_dev_replace &&
3925                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3926                                                       cache, found_key.offset))
3927                         ro_set = 0;
3928
3929                 down_write(&dev_replace->rwsem);
3930                 dev_replace->cursor_left = dev_replace->cursor_right;
3931                 dev_replace->item_needs_writeback = 1;
3932                 up_write(&dev_replace->rwsem);
3933
3934                 if (ro_set)
3935                         btrfs_dec_block_group_ro(cache);
3936
3937                 /*
3938                  * We might have prevented the cleaner kthread from deleting
3939                  * this block group if it was already unused because we raced
3940                  * and set it to RO mode first. So add it back to the unused
3941                  * list, otherwise it might not ever be deleted unless a manual
3942                  * balance is triggered or it becomes used and unused again.
3943                  */
3944                 spin_lock(&cache->lock);
3945                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3946                     cache->used == 0) {
3947                         spin_unlock(&cache->lock);
3948                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3949                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
3950                                                          cache);
3951                         else
3952                                 btrfs_mark_bg_unused(cache);
3953                 } else {
3954                         spin_unlock(&cache->lock);
3955                 }
3956 skip_unfreeze:
3957                 btrfs_unfreeze_block_group(cache);
3958                 btrfs_put_block_group(cache);
3959                 if (ret)
3960                         break;
3961                 if (sctx->is_dev_replace &&
3962                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3963                         ret = -EIO;
3964                         break;
3965                 }
3966                 if (sctx->stat.malloc_errors > 0) {
3967                         ret = -ENOMEM;
3968                         break;
3969                 }
3970 skip:
3971                 key.offset = found_key.offset + dev_extent_len;
3972                 btrfs_release_path(path);
3973         }
3974
3975         btrfs_free_path(path);
3976
3977         return ret;
3978 }
3979
3980 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3981                                            struct btrfs_device *scrub_dev)
3982 {
3983         int     i;
3984         u64     bytenr;
3985         u64     gen;
3986         int     ret;
3987         struct btrfs_fs_info *fs_info = sctx->fs_info;
3988
3989         if (BTRFS_FS_ERROR(fs_info))
3990                 return -EROFS;
3991
3992         /* Seed devices of a new filesystem have their own generation. */
3993         if (scrub_dev->fs_devices != fs_info->fs_devices)
3994                 gen = scrub_dev->generation;
3995         else
3996                 gen = fs_info->last_trans_committed;
3997
3998         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3999                 bytenr = btrfs_sb_offset(i);
4000                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4001                     scrub_dev->commit_total_bytes)
4002                         break;
4003                 if (!btrfs_check_super_location(scrub_dev, bytenr))
4004                         continue;
4005
4006                 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4007                                     scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4008                                     NULL, bytenr);
4009                 if (ret)
4010                         return ret;
4011         }
4012         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4013
4014         return 0;
4015 }
4016
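/*
 * Drop one reference on the scrub workqueues.  The last reference detaches
 * them from fs_info under scrub_lock and destroys them after the lock has
 * been released.
 */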
4017 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4018 {
4019         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4020                                         &fs_info->scrub_lock)) {
4021                 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4022                 struct workqueue_struct *scrub_wr_comp =
4023                                                 fs_info->scrub_wr_completion_workers;
4024                 struct workqueue_struct *scrub_parity =
4025                                                 fs_info->scrub_parity_workers;
4026
4027                 fs_info->scrub_workers = NULL;
4028                 fs_info->scrub_wr_completion_workers = NULL;
4029                 fs_info->scrub_parity_workers = NULL;
4030                 mutex_unlock(&fs_info->scrub_lock);
4031
4032                 if (scrub_workers)
4033                         destroy_workqueue(scrub_workers);
4034                 if (scrub_wr_comp)
4035                         destroy_workqueue(scrub_wr_comp);
4036                 if (scrub_parity)
4037                         destroy_workqueue(scrub_parity);
4038         }
4039 }
4040
4041 /*
4042  * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
4043  */
4044 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4045                                                 int is_dev_replace)
4046 {
4047         struct workqueue_struct *scrub_workers = NULL;
4048         struct workqueue_struct *scrub_wr_comp = NULL;
4049         struct workqueue_struct *scrub_parity = NULL;
4050         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4051         int max_active = fs_info->thread_pool_size;
4052         int ret = -ENOMEM;
4053
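        /*
         * Fast path: if the workqueues already exist, just take another
         * reference without touching scrub_lock.
         */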
4054         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4055                 return 0;
4056
4057         scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4058                                         is_dev_replace ? 1 : max_active);
4059         if (!scrub_workers)
4060                 goto fail_scrub_workers;
4061
4062         scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4063         if (!scrub_wr_comp)
4064                 goto fail_scrub_wr_completion_workers;
4065
4066         scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4067         if (!scrub_parity)
4068                 goto fail_scrub_parity_workers;
4069
4070         mutex_lock(&fs_info->scrub_lock);
4071         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4072                 ASSERT(fs_info->scrub_workers == NULL &&
4073                        fs_info->scrub_wr_completion_workers == NULL &&
4074                        fs_info->scrub_parity_workers == NULL);
4075                 fs_info->scrub_workers = scrub_workers;
4076                 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4077                 fs_info->scrub_parity_workers = scrub_parity;
4078                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4079                 mutex_unlock(&fs_info->scrub_lock);
4080                 return 0;
4081         }
4082         /* Another thread raced in and created the workers for us. */
4083         refcount_inc(&fs_info->scrub_workers_refcnt);
4084         mutex_unlock(&fs_info->scrub_lock);
4085
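        /*
         * We lost the race, another thread installed its workqueues first.
         * Fall through the error labels to free our local ones, but report
         * success since usable workers do exist now.
         */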
4086         ret = 0;
4087         destroy_workqueue(scrub_parity);
4088 fail_scrub_parity_workers:
4089         destroy_workqueue(scrub_wr_comp);
4090 fail_scrub_wr_completion_workers:
4091         destroy_workqueue(scrub_workers);
4092 fail_scrub_workers:
4093         return ret;
4094 }
4095
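/*
 * Scrub (or, for device replace, copy) the allocated extents of the device
 * given by @devid between @start and @end.
 *
 * @progress:       if not NULL, the final scrub statistics are copied here
 * @readonly:       run the scrub read-only, reporting errors without repair
 * @is_dev_replace: run as the read side of a device replace operation
 *
 * Returns 0 on success or a negative errno, e.g. -EAGAIN if the filesystem
 * is being closed, -ENODEV if the device cannot be found, or -EINPROGRESS if
 * a scrub or device replace is already running on the device.
 */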
4096 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4097                     u64 end, struct btrfs_scrub_progress *progress,
4098                     int readonly, int is_dev_replace)
4099 {
4100         struct btrfs_dev_lookup_args args = { .devid = devid };
4101         struct scrub_ctx *sctx;
4102         int ret;
4103         struct btrfs_device *dev;
4104         unsigned int nofs_flag;
4105
4106         if (btrfs_fs_closing(fs_info))
4107                 return -EAGAIN;
4108
4109         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4110                 /*
4111                  * In this case scrub is unable to calculate the checksum
4112                  * the way it is currently implemented. Do not handle this
4113                  * situation at all because it won't ever happen.
4114                  */
4115                 btrfs_err(fs_info,
4116                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4117                        fs_info->nodesize,
4118                        BTRFS_STRIPE_LEN);
4119                 return -EINVAL;
4120         }
4121
4122         if (fs_info->nodesize >
4123             SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4124             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4125                 /*
4126                  * Would exhaust the array bounds of the sectors member in
4127                  * struct scrub_block.
4128                  */
4129                 btrfs_err(fs_info,
4130 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4131                        fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4132                        fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4133                 return -EINVAL;
4134         }
4135
4136         /* Allocate outside of device_list_mutex */
4137         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4138         if (IS_ERR(sctx))
4139                 return PTR_ERR(sctx);
4140
4141         ret = scrub_workers_get(fs_info, is_dev_replace);
4142         if (ret)
4143                 goto out_free_ctx;
4144
4145         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4146         dev = btrfs_find_device(fs_info->fs_devices, &args);
4147         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4148                      !is_dev_replace)) {
4149                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4150                 ret = -ENODEV;
4151                 goto out;
4152         }
4153
4154         if (!is_dev_replace && !readonly &&
4155             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4156                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4157                 btrfs_err_in_rcu(fs_info,
4158                         "scrub on devid %llu: filesystem on %s is not writable",
4159                                  devid, rcu_str_deref(dev->name));
4160                 ret = -EROFS;
4161                 goto out;
4162         }
4163
4164         mutex_lock(&fs_info->scrub_lock);
4165         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4166             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4167                 mutex_unlock(&fs_info->scrub_lock);
4168                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169                 ret = -EIO;
4170                 goto out;
4171         }
4172
4173         down_read(&fs_info->dev_replace.rwsem);
4174         if (dev->scrub_ctx ||
4175             (!is_dev_replace &&
4176              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4177                 up_read(&fs_info->dev_replace.rwsem);
4178                 mutex_unlock(&fs_info->scrub_lock);
4179                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4180                 ret = -EINPROGRESS;
4181                 goto out;
4182         }
4183         up_read(&fs_info->dev_replace.rwsem);
4184
4185         sctx->readonly = readonly;
4186         dev->scrub_ctx = sctx;
4187         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4188
4189         /*
4190          * By checking @scrub_pause_req here, we can avoid the
4191          * race between transaction commit and scrubbing.
4192          */
4193         __scrub_blocked_if_needed(fs_info);
4194         atomic_inc(&fs_info->scrubs_running);
4195         mutex_unlock(&fs_info->scrub_lock);
4196
4197         /*
4198          * In order to avoid deadlock with reclaim when there is a transaction
4199          * trying to pause scrub, make sure we use GFP_NOFS for all the
4200          * allocations done at scrub_sectors() and scrub_sectors_for_parity()
4201          * invoked by our callees. The pausing request is done when the
4202          * transaction commit starts, and it blocks the transaction until scrub
4203          * is paused (done at specific points in scrub_stripe() or right above,
4204          * before incrementing fs_info->scrubs_running).
4205          */
4206         nofs_flag = memalloc_nofs_save();
4207         if (!is_dev_replace) {
4208                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4209                 /*
4210                  * Holding the device list mutex serializes us against
4211                  * super block writes, e.g. those kicked off by a log tree sync.
4212                  */
4213                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4214                 ret = scrub_supers(sctx, dev);
4215                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4216         }
4217
4218         if (!ret)
4219                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4220         memalloc_nofs_restore(nofs_flag);
4221
4222         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4223         atomic_dec(&fs_info->scrubs_running);
4224         wake_up(&fs_info->scrub_pause_wait);
4225
4226         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4227
4228         if (progress)
4229                 memcpy(progress, &sctx->stat, sizeof(*progress));
4230
4231         if (!is_dev_replace)
4232                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4233                         ret ? "not finished" : "finished", devid, ret);
4234
4235         mutex_lock(&fs_info->scrub_lock);
4236         dev->scrub_ctx = NULL;
4237         mutex_unlock(&fs_info->scrub_lock);
4238
4239         scrub_workers_put(fs_info);
4240         scrub_put_ctx(sctx);
4241
4242         return ret;
4243 out:
4244         scrub_workers_put(fs_info);
4245 out_free_ctx:
4246         scrub_free_ctx(sctx);
4247
4248         return ret;
4249 }
4250
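/*
 * Ask all running scrubs to pause and wait until every one of them has
 * reached its pause point.  Balanced by btrfs_scrub_continue().
 */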
4251 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4252 {
4253         mutex_lock(&fs_info->scrub_lock);
4254         atomic_inc(&fs_info->scrub_pause_req);
4255         while (atomic_read(&fs_info->scrubs_paused) !=
4256                atomic_read(&fs_info->scrubs_running)) {
4257                 mutex_unlock(&fs_info->scrub_lock);
4258                 wait_event(fs_info->scrub_pause_wait,
4259                            atomic_read(&fs_info->scrubs_paused) ==
4260                            atomic_read(&fs_info->scrubs_running));
4261                 mutex_lock(&fs_info->scrub_lock);
4262         }
4263         mutex_unlock(&fs_info->scrub_lock);
4264 }
4265
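/* Allow paused scrubs to resume, counterpart of btrfs_scrub_pause(). */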
4266 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4267 {
4268         atomic_dec(&fs_info->scrub_pause_req);
4269         wake_up(&fs_info->scrub_pause_wait);
4270 }
4271
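/*
 * Cancel all running scrubs and wait for them to finish.  Returns -ENOTCONN
 * if no scrub was running.
 */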
4272 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4273 {
4274         mutex_lock(&fs_info->scrub_lock);
4275         if (!atomic_read(&fs_info->scrubs_running)) {
4276                 mutex_unlock(&fs_info->scrub_lock);
4277                 return -ENOTCONN;
4278         }
4279
4280         atomic_inc(&fs_info->scrub_cancel_req);
4281         while (atomic_read(&fs_info->scrubs_running)) {
4282                 mutex_unlock(&fs_info->scrub_lock);
4283                 wait_event(fs_info->scrub_pause_wait,
4284                            atomic_read(&fs_info->scrubs_running) == 0);
4285                 mutex_lock(&fs_info->scrub_lock);
4286         }
4287         atomic_dec(&fs_info->scrub_cancel_req);
4288         mutex_unlock(&fs_info->scrub_lock);
4289
4290         return 0;
4291 }
4292
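/*
 * Cancel a scrub running on the given device and wait until its scrub
 * context has been detached.  Returns -ENOTCONN if no scrub is running on
 * the device.
 */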
4293 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4294 {
4295         struct btrfs_fs_info *fs_info = dev->fs_info;
4296         struct scrub_ctx *sctx;
4297
4298         mutex_lock(&fs_info->scrub_lock);
4299         sctx = dev->scrub_ctx;
4300         if (!sctx) {
4301                 mutex_unlock(&fs_info->scrub_lock);
4302                 return -ENOTCONN;
4303         }
4304         atomic_inc(&sctx->cancel_req);
4305         while (dev->scrub_ctx) {
4306                 mutex_unlock(&fs_info->scrub_lock);
4307                 wait_event(fs_info->scrub_pause_wait,
4308                            dev->scrub_ctx == NULL);
4309                 mutex_lock(&fs_info->scrub_lock);
4310         }
4311         mutex_unlock(&fs_info->scrub_lock);
4312
4313         return 0;
4314 }
4315
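/*
 * Copy the current scrub statistics of @devid into @progress.  Returns
 * -ENODEV if the device cannot be found and -ENOTCONN if no scrub context
 * is attached to it.
 */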
4316 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4317                          struct btrfs_scrub_progress *progress)
4318 {
4319         struct btrfs_dev_lookup_args args = { .devid = devid };
4320         struct btrfs_device *dev;
4321         struct scrub_ctx *sctx = NULL;
4322
4323         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4324         dev = btrfs_find_device(fs_info->fs_devices, &args);
4325         if (dev)
4326                 sctx = dev->scrub_ctx;
4327         if (sctx)
4328                 memcpy(progress, &sctx->stat, sizeof(*progress));
4329         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4330
4331         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4332 }
4333
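/*
 * Map a read of @extent_logical and report the physical offset, device and
 * mirror number of the copy that btrfs_map_block() picked.  The output
 * parameters are left untouched if the extent cannot be fully mapped or the
 * chosen device has no bdev.
 */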
4334 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4335                                  u64 extent_logical, u32 extent_len,
4336                                  u64 *extent_physical,
4337                                  struct btrfs_device **extent_dev,
4338                                  int *extent_mirror_num)
4339 {
4340         u64 mapped_length;
4341         struct btrfs_io_context *bioc = NULL;
4342         int ret;
4343
4344         mapped_length = extent_len;
4345         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4346                               &mapped_length, &bioc, 0);
4347         if (ret || !bioc || mapped_length < extent_len ||
4348             !bioc->stripes[0].dev->bdev) {
4349                 btrfs_put_bioc(bioc);
4350                 return;
4351         }
4352
4353         *extent_physical = bioc->stripes[0].physical;
4354         *extent_mirror_num = bioc->mirror_num;
4355         *extent_dev = bioc->stripes[0].dev;
4356         btrfs_put_bioc(bioc);
4357 }