Merge tag 'pwm/for-4.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/thierry...
[linux-2.6-microblaze.git] / fs / btrfs / scrub.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include "ctree.h"
10 #include "volumes.h"
11 #include "disk-io.h"
12 #include "ordered-data.h"
13 #include "transaction.h"
14 #include "backref.h"
15 #include "extent_io.h"
16 #include "dev-replace.h"
17 #include "check-integrity.h"
18 #include "rcu-string.h"
19 #include "raid56.h"
20
21 /*
22  * This is only the first step towards a full-featured scrub. It reads all
23  * extents and super blocks and verifies the checksums. In case a bad checksum
24  * is found or the extent cannot be read, good data will be written back if
25  * any can be found.
26  *
27  * Future enhancements:
28  *  - In case an unrepairable extent is encountered, track which files are
29  *    affected and report them
30  *  - track and record media errors, throw out bad devices
31  *  - add a mode to also read unallocated space
32  */
33
/* Forward declarations; both structures are defined further down. */
struct scrub_block;
struct scrub_ctx;

/*
 * Performance tunables (they only influence performance):
 *  - SCRUB_PAGES_PER_RD_BIO / SCRUB_PAGES_PER_WR_BIO are upper limits for
 *    the number of (dynamically allocated) pages added to one read / write
 *    bio.
 *  - SCRUB_BIOS_PER_SCTX configures the number of parallel and outstanding
 *    I/O operations.
 * The size notes below correspond to 4KiB pages.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * SCRUB_MAX_PAGES_PER_BLOCK * PAGE_SIZE has to be large enough to hold the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
53
54 struct scrub_recover {
55         refcount_t              refs;
56         struct btrfs_bio        *bbio;
57         u64                     map_length;
58 };
59
60 struct scrub_page {
61         struct scrub_block      *sblock;
62         struct page             *page;
63         struct btrfs_device     *dev;
64         struct list_head        list;
65         u64                     flags;  /* extent flags */
66         u64                     generation;
67         u64                     logical;
68         u64                     physical;
69         u64                     physical_for_dev_replace;
70         atomic_t                refs;
71         struct {
72                 unsigned int    mirror_num:8;
73                 unsigned int    have_csum:1;
74                 unsigned int    io_error:1;
75         };
76         u8                      csum[BTRFS_CSUM_SIZE];
77
78         struct scrub_recover    *recover;
79 };
80
81 struct scrub_bio {
82         int                     index;
83         struct scrub_ctx        *sctx;
84         struct btrfs_device     *dev;
85         struct bio              *bio;
86         blk_status_t            status;
87         u64                     logical;
88         u64                     physical;
89 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
90         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
91 #else
92         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
93 #endif
94         int                     page_count;
95         int                     next_free;
96         struct btrfs_work       work;
97 };
98
99 struct scrub_block {
100         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
101         int                     page_count;
102         atomic_t                outstanding_pages;
103         refcount_t              refs; /* free mem on transition to zero */
104         struct scrub_ctx        *sctx;
105         struct scrub_parity     *sparity;
106         struct {
107                 unsigned int    header_error:1;
108                 unsigned int    checksum_error:1;
109                 unsigned int    no_io_error_seen:1;
110                 unsigned int    generation_error:1; /* also sets header_error */
111
112                 /* The following is for the data used to check parity */
113                 /* It is for the data with checksum */
114                 unsigned int    data_corrected:1;
115         };
116         struct btrfs_work       work;
117 };
118
119 /* Used for the chunks with parity stripe such RAID5/6 */
120 struct scrub_parity {
121         struct scrub_ctx        *sctx;
122
123         struct btrfs_device     *scrub_dev;
124
125         u64                     logic_start;
126
127         u64                     logic_end;
128
129         int                     nsectors;
130
131         u64                     stripe_len;
132
133         refcount_t              refs;
134
135         struct list_head        spages;
136
137         /* Work of parity check and repair */
138         struct btrfs_work       work;
139
140         /* Mark the parity blocks which have data */
141         unsigned long           *dbitmap;
142
143         /*
144          * Mark the parity blocks which have data, but errors happen when
145          * read data or check data
146          */
147         unsigned long           *ebitmap;
148
149         unsigned long           bitmap[0];
150 };
151
152 struct scrub_ctx {
153         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
154         struct btrfs_fs_info    *fs_info;
155         int                     first_free;
156         int                     curr;
157         atomic_t                bios_in_flight;
158         atomic_t                workers_pending;
159         spinlock_t              list_lock;
160         wait_queue_head_t       list_wait;
161         u16                     csum_size;
162         struct list_head        csum_list;
163         atomic_t                cancel_req;
164         int                     readonly;
165         int                     pages_per_rd_bio;
166
167         int                     is_dev_replace;
168
169         struct scrub_bio        *wr_curr_bio;
170         struct mutex            wr_lock;
171         int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
172         struct btrfs_device     *wr_tgtdev;
173         bool                    flush_all_writes;
174
175         /*
176          * statistics
177          */
178         struct btrfs_scrub_progress stat;
179         spinlock_t              stat_lock;
180
181         /*
182          * Use a ref counter to avoid use-after-free issues. Scrub workers
183          * decrement bios_in_flight and workers_pending and then do a wakeup
184          * on the list_wait wait queue. We must ensure the main scrub task
185          * doesn't free the scrub context before or while the workers are
186          * doing the wakeup() call.
187          */
188         refcount_t              refs;
189 };
190
191 struct scrub_fixup_nodatasum {
192         struct scrub_ctx        *sctx;
193         struct btrfs_device     *dev;
194         u64                     logical;
195         struct btrfs_root       *root;
196         struct btrfs_work       work;
197         int                     mirror_num;
198 };
199
200 struct scrub_nocow_inode {
201         u64                     inum;
202         u64                     offset;
203         u64                     root;
204         struct list_head        list;
205 };
206
207 struct scrub_copy_nocow_ctx {
208         struct scrub_ctx        *sctx;
209         u64                     logical;
210         u64                     len;
211         int                     mirror_num;
212         u64                     physical_for_dev_replace;
213         struct list_head        inodes;
214         struct btrfs_work       work;
215 };
216
217 struct scrub_warning {
218         struct btrfs_path       *path;
219         u64                     extent_item_size;
220         const char              *errstr;
221         u64                     physical;
222         u64                     logical;
223         struct btrfs_device     *dev;
224 };
225
226 struct full_stripe_lock {
227         struct rb_node node;
228         u64 logical;
229         u64 refs;
230         struct mutex mutex;
231 };
232
233 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
234 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
235 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
236 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
237 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
238 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
239                                      struct scrub_block *sblocks_for_recheck);
240 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
241                                 struct scrub_block *sblock,
242                                 int retry_failed_mirror);
243 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
244 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
245                                              struct scrub_block *sblock_good);
246 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
247                                             struct scrub_block *sblock_good,
248                                             int page_num, int force_write);
249 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
250 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
251                                            int page_num);
252 static int scrub_checksum_data(struct scrub_block *sblock);
253 static int scrub_checksum_tree_block(struct scrub_block *sblock);
254 static int scrub_checksum_super(struct scrub_block *sblock);
255 static void scrub_block_get(struct scrub_block *sblock);
256 static void scrub_block_put(struct scrub_block *sblock);
257 static void scrub_page_get(struct scrub_page *spage);
258 static void scrub_page_put(struct scrub_page *spage);
259 static void scrub_parity_get(struct scrub_parity *sparity);
260 static void scrub_parity_put(struct scrub_parity *sparity);
261 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
262                                     struct scrub_page *spage);
263 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
264                        u64 physical, struct btrfs_device *dev, u64 flags,
265                        u64 gen, int mirror_num, u8 *csum, int force,
266                        u64 physical_for_dev_replace);
267 static void scrub_bio_end_io(struct bio *bio);
268 static void scrub_bio_end_io_worker(struct btrfs_work *work);
269 static void scrub_block_complete(struct scrub_block *sblock);
270 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
271                                u64 extent_logical, u64 extent_len,
272                                u64 *extent_physical,
273                                struct btrfs_device **extent_dev,
274                                int *extent_mirror_num);
275 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
276                                     struct scrub_page *spage);
277 static void scrub_wr_submit(struct scrub_ctx *sctx);
278 static void scrub_wr_bio_end_io(struct bio *bio);
279 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
280 static int write_page_nocow(struct scrub_ctx *sctx,
281                             u64 physical_for_dev_replace, struct page *page);
282 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
283                                       struct scrub_copy_nocow_ctx *ctx);
284 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
285                             int mirror_num, u64 physical_for_dev_replace);
286 static void copy_nocow_pages_worker(struct btrfs_work *work);
287 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
288 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
289 static void scrub_put_ctx(struct scrub_ctx *sctx);
290
291 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
292 {
293         return page->recover &&
294                (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
295 }
296
297 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
298 {
299         refcount_inc(&sctx->refs);
300         atomic_inc(&sctx->bios_in_flight);
301 }
302
303 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
304 {
305         atomic_dec(&sctx->bios_in_flight);
306         wake_up(&sctx->list_wait);
307         scrub_put_ctx(sctx);
308 }
309
310 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
311 {
312         while (atomic_read(&fs_info->scrub_pause_req)) {
313                 mutex_unlock(&fs_info->scrub_lock);
314                 wait_event(fs_info->scrub_pause_wait,
315                    atomic_read(&fs_info->scrub_pause_req) == 0);
316                 mutex_lock(&fs_info->scrub_lock);
317         }
318 }
319
320 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
321 {
322         atomic_inc(&fs_info->scrubs_paused);
323         wake_up(&fs_info->scrub_pause_wait);
324 }
325
326 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
327 {
328         mutex_lock(&fs_info->scrub_lock);
329         __scrub_blocked_if_needed(fs_info);
330         atomic_dec(&fs_info->scrubs_paused);
331         mutex_unlock(&fs_info->scrub_lock);
332
333         wake_up(&fs_info->scrub_pause_wait);
334 }
335
/*
 * Mark this scrub as paused, then leave the paused state again, which
 * blocks for as long as a pause request is pending.
 */
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}
341
342 /*
343  * Insert new full stripe lock into full stripe locks tree
344  *
345  * Return pointer to existing or newly inserted full_stripe_lock structure if
346  * everything works well.
347  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
348  *
349  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
350  * function
351  */
352 static struct full_stripe_lock *insert_full_stripe_lock(
353                 struct btrfs_full_stripe_locks_tree *locks_root,
354                 u64 fstripe_logical)
355 {
356         struct rb_node **p;
357         struct rb_node *parent = NULL;
358         struct full_stripe_lock *entry;
359         struct full_stripe_lock *ret;
360
361         lockdep_assert_held(&locks_root->lock);
362
363         p = &locks_root->root.rb_node;
364         while (*p) {
365                 parent = *p;
366                 entry = rb_entry(parent, struct full_stripe_lock, node);
367                 if (fstripe_logical < entry->logical) {
368                         p = &(*p)->rb_left;
369                 } else if (fstripe_logical > entry->logical) {
370                         p = &(*p)->rb_right;
371                 } else {
372                         entry->refs++;
373                         return entry;
374                 }
375         }
376
377         /* Insert new lock */
378         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
379         if (!ret)
380                 return ERR_PTR(-ENOMEM);
381         ret->logical = fstripe_logical;
382         ret->refs = 1;
383         mutex_init(&ret->mutex);
384
385         rb_link_node(&ret->node, parent, p);
386         rb_insert_color(&ret->node, &locks_root->root);
387         return ret;
388 }
389
390 /*
391  * Search for a full stripe lock of a block group
392  *
393  * Return pointer to existing full stripe lock if found
394  * Return NULL if not found
395  */
396 static struct full_stripe_lock *search_full_stripe_lock(
397                 struct btrfs_full_stripe_locks_tree *locks_root,
398                 u64 fstripe_logical)
399 {
400         struct rb_node *node;
401         struct full_stripe_lock *entry;
402
403         lockdep_assert_held(&locks_root->lock);
404
405         node = locks_root->root.rb_node;
406         while (node) {
407                 entry = rb_entry(node, struct full_stripe_lock, node);
408                 if (fstripe_logical < entry->logical)
409                         node = node->rb_left;
410                 else if (fstripe_logical > entry->logical)
411                         node = node->rb_right;
412                 else
413                         return entry;
414         }
415         return NULL;
416 }
417
418 /*
419  * Helper to get full stripe logical from a normal bytenr.
420  *
421  * Caller must ensure @cache is a RAID56 block group.
422  */
423 static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
424                                    u64 bytenr)
425 {
426         u64 ret;
427
428         /*
429          * Due to chunk item size limit, full stripe length should not be
430          * larger than U32_MAX. Just a sanity check here.
431          */
432         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
433
434         /*
435          * round_down() can only handle power of 2, while RAID56 full
436          * stripe length can be 64KiB * n, so we need to manually round down.
437          */
438         ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
439                 cache->full_stripe_len + cache->key.objectid;
440         return ret;
441 }
442
443 /*
444  * Lock a full stripe to avoid concurrency of recovery and read
445  *
446  * It's only used for profiles with parities (RAID5/6), for other profiles it
447  * does nothing.
448  *
449  * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
450  * So caller must call unlock_full_stripe() at the same context.
451  *
452  * Return <0 if encounters error.
453  */
454 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
455                             bool *locked_ret)
456 {
457         struct btrfs_block_group_cache *bg_cache;
458         struct btrfs_full_stripe_locks_tree *locks_root;
459         struct full_stripe_lock *existing;
460         u64 fstripe_start;
461         int ret = 0;
462
463         *locked_ret = false;
464         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
465         if (!bg_cache) {
466                 ASSERT(0);
467                 return -ENOENT;
468         }
469
470         /* Profiles not based on parity don't need full stripe lock */
471         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
472                 goto out;
473         locks_root = &bg_cache->full_stripe_locks_root;
474
475         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
476
477         /* Now insert the full stripe lock */
478         mutex_lock(&locks_root->lock);
479         existing = insert_full_stripe_lock(locks_root, fstripe_start);
480         mutex_unlock(&locks_root->lock);
481         if (IS_ERR(existing)) {
482                 ret = PTR_ERR(existing);
483                 goto out;
484         }
485         mutex_lock(&existing->mutex);
486         *locked_ret = true;
487 out:
488         btrfs_put_block_group(bg_cache);
489         return ret;
490 }
491
492 /*
493  * Unlock a full stripe.
494  *
495  * NOTE: Caller must ensure it's the same context calling corresponding
496  * lock_full_stripe().
497  *
498  * Return 0 if we unlock full stripe without problem.
499  * Return <0 for error
500  */
501 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
502                               bool locked)
503 {
504         struct btrfs_block_group_cache *bg_cache;
505         struct btrfs_full_stripe_locks_tree *locks_root;
506         struct full_stripe_lock *fstripe_lock;
507         u64 fstripe_start;
508         bool freeit = false;
509         int ret = 0;
510
511         /* If we didn't acquire full stripe lock, no need to continue */
512         if (!locked)
513                 return 0;
514
515         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
516         if (!bg_cache) {
517                 ASSERT(0);
518                 return -ENOENT;
519         }
520         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
521                 goto out;
522
523         locks_root = &bg_cache->full_stripe_locks_root;
524         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
525
526         mutex_lock(&locks_root->lock);
527         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
528         /* Unpaired unlock_full_stripe() detected */
529         if (!fstripe_lock) {
530                 WARN_ON(1);
531                 ret = -ENOENT;
532                 mutex_unlock(&locks_root->lock);
533                 goto out;
534         }
535
536         if (fstripe_lock->refs == 0) {
537                 WARN_ON(1);
538                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
539                         fstripe_lock->logical);
540         } else {
541                 fstripe_lock->refs--;
542         }
543
544         if (fstripe_lock->refs == 0) {
545                 rb_erase(&fstripe_lock->node, &locks_root->root);
546                 freeit = true;
547         }
548         mutex_unlock(&locks_root->lock);
549
550         mutex_unlock(&fstripe_lock->mutex);
551         if (freeit)
552                 kfree(fstripe_lock);
553 out:
554         btrfs_put_block_group(bg_cache);
555         return ret;
556 }
557
558 /*
559  * used for workers that require transaction commits (i.e., for the
560  * NOCOW case)
561  */
562 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
563 {
564         struct btrfs_fs_info *fs_info = sctx->fs_info;
565
566         refcount_inc(&sctx->refs);
567         /*
568          * increment scrubs_running to prevent cancel requests from
569          * completing as long as a worker is running. we must also
570          * increment scrubs_paused to prevent deadlocking on pause
571          * requests used for transactions commits (as the worker uses a
572          * transaction context). it is safe to regard the worker
573          * as paused for all matters practical. effectively, we only
574          * avoid cancellation requests from completing.
575          */
576         mutex_lock(&fs_info->scrub_lock);
577         atomic_inc(&fs_info->scrubs_running);
578         atomic_inc(&fs_info->scrubs_paused);
579         mutex_unlock(&fs_info->scrub_lock);
580
581         /*
582          * check if @scrubs_running=@scrubs_paused condition
583          * inside wait_event() is not an atomic operation.
584          * which means we may inc/dec @scrub_running/paused
585          * at any time. Let's wake up @scrub_pause_wait as
586          * much as we can to let commit transaction blocked less.
587          */
588         wake_up(&fs_info->scrub_pause_wait);
589
590         atomic_inc(&sctx->workers_pending);
591 }
592
593 /* used for workers that require transaction commits */
594 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
595 {
596         struct btrfs_fs_info *fs_info = sctx->fs_info;
597
598         /*
599          * see scrub_pending_trans_workers_inc() why we're pretending
600          * to be paused in the scrub counters
601          */
602         mutex_lock(&fs_info->scrub_lock);
603         atomic_dec(&fs_info->scrubs_running);
604         atomic_dec(&fs_info->scrubs_paused);
605         mutex_unlock(&fs_info->scrub_lock);
606         atomic_dec(&sctx->workers_pending);
607         wake_up(&fs_info->scrub_pause_wait);
608         wake_up(&sctx->list_wait);
609         scrub_put_ctx(sctx);
610 }
611
612 static void scrub_free_csums(struct scrub_ctx *sctx)
613 {
614         while (!list_empty(&sctx->csum_list)) {
615                 struct btrfs_ordered_sum *sum;
616                 sum = list_first_entry(&sctx->csum_list,
617                                        struct btrfs_ordered_sum, list);
618                 list_del(&sum->list);
619                 kfree(sum);
620         }
621 }
622
623 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
624 {
625         int i;
626
627         if (!sctx)
628                 return;
629
630         /* this can happen when scrub is cancelled */
631         if (sctx->curr != -1) {
632                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
633
634                 for (i = 0; i < sbio->page_count; i++) {
635                         WARN_ON(!sbio->pagev[i]->page);
636                         scrub_block_put(sbio->pagev[i]->sblock);
637                 }
638                 bio_put(sbio->bio);
639         }
640
641         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
642                 struct scrub_bio *sbio = sctx->bios[i];
643
644                 if (!sbio)
645                         break;
646                 kfree(sbio);
647         }
648
649         kfree(sctx->wr_curr_bio);
650         scrub_free_csums(sctx);
651         kfree(sctx);
652 }
653
654 static void scrub_put_ctx(struct scrub_ctx *sctx)
655 {
656         if (refcount_dec_and_test(&sctx->refs))
657                 scrub_free_ctx(sctx);
658 }
659
660 static noinline_for_stack
661 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
662 {
663         struct scrub_ctx *sctx;
664         int             i;
665         struct btrfs_fs_info *fs_info = dev->fs_info;
666
667         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
668         if (!sctx)
669                 goto nomem;
670         refcount_set(&sctx->refs, 1);
671         sctx->is_dev_replace = is_dev_replace;
672         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
673         sctx->curr = -1;
674         sctx->fs_info = dev->fs_info;
675         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
676                 struct scrub_bio *sbio;
677
678                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
679                 if (!sbio)
680                         goto nomem;
681                 sctx->bios[i] = sbio;
682
683                 sbio->index = i;
684                 sbio->sctx = sctx;
685                 sbio->page_count = 0;
686                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
687                                 scrub_bio_end_io_worker, NULL, NULL);
688
689                 if (i != SCRUB_BIOS_PER_SCTX - 1)
690                         sctx->bios[i]->next_free = i + 1;
691                 else
692                         sctx->bios[i]->next_free = -1;
693         }
694         sctx->first_free = 0;
695         atomic_set(&sctx->bios_in_flight, 0);
696         atomic_set(&sctx->workers_pending, 0);
697         atomic_set(&sctx->cancel_req, 0);
698         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
699         INIT_LIST_HEAD(&sctx->csum_list);
700
701         spin_lock_init(&sctx->list_lock);
702         spin_lock_init(&sctx->stat_lock);
703         init_waitqueue_head(&sctx->list_wait);
704
705         WARN_ON(sctx->wr_curr_bio != NULL);
706         mutex_init(&sctx->wr_lock);
707         sctx->wr_curr_bio = NULL;
708         if (is_dev_replace) {
709                 WARN_ON(!fs_info->dev_replace.tgtdev);
710                 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
711                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
712                 sctx->flush_all_writes = false;
713         }
714
715         return sctx;
716
717 nomem:
718         scrub_free_ctx(sctx);
719         return ERR_PTR(-ENOMEM);
720 }
721
722 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
723                                      void *warn_ctx)
724 {
725         u64 isize;
726         u32 nlink;
727         int ret;
728         int i;
729         unsigned nofs_flag;
730         struct extent_buffer *eb;
731         struct btrfs_inode_item *inode_item;
732         struct scrub_warning *swarn = warn_ctx;
733         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
734         struct inode_fs_paths *ipath = NULL;
735         struct btrfs_root *local_root;
736         struct btrfs_key root_key;
737         struct btrfs_key key;
738
739         root_key.objectid = root;
740         root_key.type = BTRFS_ROOT_ITEM_KEY;
741         root_key.offset = (u64)-1;
742         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
743         if (IS_ERR(local_root)) {
744                 ret = PTR_ERR(local_root);
745                 goto err;
746         }
747
748         /*
749          * this makes the path point to (inum INODE_ITEM ioff)
750          */
751         key.objectid = inum;
752         key.type = BTRFS_INODE_ITEM_KEY;
753         key.offset = 0;
754
755         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
756         if (ret) {
757                 btrfs_release_path(swarn->path);
758                 goto err;
759         }
760
761         eb = swarn->path->nodes[0];
762         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
763                                         struct btrfs_inode_item);
764         isize = btrfs_inode_size(eb, inode_item);
765         nlink = btrfs_inode_nlink(eb, inode_item);
766         btrfs_release_path(swarn->path);
767
768         /*
769          * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
770          * uses GFP_NOFS in this context, so we keep it consistent but it does
771          * not seem to be strictly necessary.
772          */
773         nofs_flag = memalloc_nofs_save();
774         ipath = init_ipath(4096, local_root, swarn->path);
775         memalloc_nofs_restore(nofs_flag);
776         if (IS_ERR(ipath)) {
777                 ret = PTR_ERR(ipath);
778                 ipath = NULL;
779                 goto err;
780         }
781         ret = paths_from_inode(inum, ipath);
782
783         if (ret < 0)
784                 goto err;
785
786         /*
787          * we deliberately ignore the bit ipath might have been too small to
788          * hold all of the paths here
789          */
790         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
791                 btrfs_warn_in_rcu(fs_info,
792 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
793                                   swarn->errstr, swarn->logical,
794                                   rcu_str_deref(swarn->dev->name),
795                                   swarn->physical,
796                                   root, inum, offset,
797                                   min(isize - offset, (u64)PAGE_SIZE), nlink,
798                                   (char *)(unsigned long)ipath->fspath->val[i]);
799
800         free_ipath(ipath);
801         return 0;
802
803 err:
804         btrfs_warn_in_rcu(fs_info,
805                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
806                           swarn->errstr, swarn->logical,
807                           rcu_str_deref(swarn->dev->name),
808                           swarn->physical,
809                           root, inum, offset, ret);
810
811         free_ipath(ipath);
812         return 0;
813 }
814
815 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
816 {
817         struct btrfs_device *dev;
818         struct btrfs_fs_info *fs_info;
819         struct btrfs_path *path;
820         struct btrfs_key found_key;
821         struct extent_buffer *eb;
822         struct btrfs_extent_item *ei;
823         struct scrub_warning swarn;
824         unsigned long ptr = 0;
825         u64 extent_item_pos;
826         u64 flags = 0;
827         u64 ref_root;
828         u32 item_size;
829         u8 ref_level = 0;
830         int ret;
831
832         WARN_ON(sblock->page_count < 1);
833         dev = sblock->pagev[0]->dev;
834         fs_info = sblock->sctx->fs_info;
835
836         path = btrfs_alloc_path();
837         if (!path)
838                 return;
839
840         swarn.physical = sblock->pagev[0]->physical;
841         swarn.logical = sblock->pagev[0]->logical;
842         swarn.errstr = errstr;
843         swarn.dev = NULL;
844
845         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
846                                   &flags);
847         if (ret < 0)
848                 goto out;
849
850         extent_item_pos = swarn.logical - found_key.objectid;
851         swarn.extent_item_size = found_key.offset;
852
853         eb = path->nodes[0];
854         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
855         item_size = btrfs_item_size_nr(eb, path->slots[0]);
856
857         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
858                 do {
859                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
860                                                       item_size, &ref_root,
861                                                       &ref_level);
862                         btrfs_warn_in_rcu(fs_info,
863 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
864                                 errstr, swarn.logical,
865                                 rcu_str_deref(dev->name),
866                                 swarn.physical,
867                                 ref_level ? "node" : "leaf",
868                                 ret < 0 ? -1 : ref_level,
869                                 ret < 0 ? -1 : ref_root);
870                 } while (ret != 1);
871                 btrfs_release_path(path);
872         } else {
873                 btrfs_release_path(path);
874                 swarn.path = path;
875                 swarn.dev = dev;
876                 iterate_extent_inodes(fs_info, found_key.objectid,
877                                         extent_item_pos, 1,
878                                         scrub_print_warning_inode, &swarn, false);
879         }
880
881 out:
882         btrfs_free_path(path);
883 }
884
885 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
886 {
887         struct page *page = NULL;
888         unsigned long index;
889         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
890         int ret;
891         int corrected = 0;
892         struct btrfs_key key;
893         struct inode *inode = NULL;
894         struct btrfs_fs_info *fs_info;
895         u64 end = offset + PAGE_SIZE - 1;
896         struct btrfs_root *local_root;
897         int srcu_index;
898
899         key.objectid = root;
900         key.type = BTRFS_ROOT_ITEM_KEY;
901         key.offset = (u64)-1;
902
903         fs_info = fixup->root->fs_info;
904         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
905
906         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
907         if (IS_ERR(local_root)) {
908                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
909                 return PTR_ERR(local_root);
910         }
911
912         key.type = BTRFS_INODE_ITEM_KEY;
913         key.objectid = inum;
914         key.offset = 0;
915         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
916         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
917         if (IS_ERR(inode))
918                 return PTR_ERR(inode);
919
920         index = offset >> PAGE_SHIFT;
921
922         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
923         if (!page) {
924                 ret = -ENOMEM;
925                 goto out;
926         }
927
928         if (PageUptodate(page)) {
929                 if (PageDirty(page)) {
930                         /*
931                          * we need to write the data to the defect sector. the
932                          * data that was in that sector is not in memory,
933                          * because the page was modified. we must not write the
934                          * modified page to that sector.
935                          *
936                          * TODO: what could be done here: wait for the delalloc
937                          *       runner to write out that page (might involve
938                          *       COW) and see whether the sector is still
939                          *       referenced afterwards.
940                          *
941                          * For the meantime, we'll treat this error
942                          * incorrectable, although there is a chance that a
943                          * later scrub will find the bad sector again and that
944                          * there's no dirty page in memory, then.
945                          */
946                         ret = -EIO;
947                         goto out;
948                 }
949                 ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
950                                         fixup->logical, page,
951                                         offset - page_offset(page),
952                                         fixup->mirror_num);
953                 unlock_page(page);
954                 corrected = !ret;
955         } else {
956                 /*
957                  * we need to get good data first. the general readpage path
958                  * will call repair_io_failure for us, we just have to make
959                  * sure we read the bad mirror.
960                  */
961                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
962                                         EXTENT_DAMAGED);
963                 if (ret) {
964                         /* set_extent_bits should give proper error */
965                         WARN_ON(ret > 0);
966                         if (ret > 0)
967                                 ret = -EFAULT;
968                         goto out;
969                 }
970
971                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
972                                                 btrfs_get_extent,
973                                                 fixup->mirror_num);
974                 wait_on_page_locked(page);
975
976                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
977                                                 end, EXTENT_DAMAGED, 0, NULL);
978                 if (!corrected)
979                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
980                                                 EXTENT_DAMAGED);
981         }
982
983 out:
984         if (page)
985                 put_page(page);
986
987         iput(inode);
988
989         if (ret < 0)
990                 return ret;
991
992         if (ret == 0 && corrected) {
993                 /*
994                  * we only need to call readpage for one of the inodes belonging
995                  * to this extent. so make iterate_extent_inodes stop
996                  */
997                 return 1;
998         }
999
1000         return -EIO;
1001 }
1002
1003 static void scrub_fixup_nodatasum(struct btrfs_work *work)
1004 {
1005         struct btrfs_fs_info *fs_info;
1006         int ret;
1007         struct scrub_fixup_nodatasum *fixup;
1008         struct scrub_ctx *sctx;
1009         struct btrfs_trans_handle *trans = NULL;
1010         struct btrfs_path *path;
1011         int uncorrectable = 0;
1012
1013         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
1014         sctx = fixup->sctx;
1015         fs_info = fixup->root->fs_info;
1016
1017         path = btrfs_alloc_path();
1018         if (!path) {
1019                 spin_lock(&sctx->stat_lock);
1020                 ++sctx->stat.malloc_errors;
1021                 spin_unlock(&sctx->stat_lock);
1022                 uncorrectable = 1;
1023                 goto out;
1024         }
1025
1026         trans = btrfs_join_transaction(fixup->root);
1027         if (IS_ERR(trans)) {
1028                 uncorrectable = 1;
1029                 goto out;
1030         }
1031
1032         /*
1033          * the idea is to trigger a regular read through the standard path. we
1034          * read a page from the (failed) logical address by specifying the
1035          * corresponding copynum of the failed sector. thus, that readpage is
1036          * expected to fail.
1037          * that is the point where on-the-fly error correction will kick in
1038          * (once it's finished) and rewrite the failed sector if a good copy
1039          * can be found.
1040          */
1041         ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
1042                                           scrub_fixup_readpage, fixup, false);
1043         if (ret < 0) {
1044                 uncorrectable = 1;
1045                 goto out;
1046         }
1047         WARN_ON(ret != 1);
1048
1049         spin_lock(&sctx->stat_lock);
1050         ++sctx->stat.corrected_errors;
1051         spin_unlock(&sctx->stat_lock);
1052
1053 out:
1054         if (trans && !IS_ERR(trans))
1055                 btrfs_end_transaction(trans);
1056         if (uncorrectable) {
1057                 spin_lock(&sctx->stat_lock);
1058                 ++sctx->stat.uncorrectable_errors;
1059                 spin_unlock(&sctx->stat_lock);
1060                 btrfs_dev_replace_stats_inc(
1061                         &fs_info->dev_replace.num_uncorrectable_read_errors);
1062                 btrfs_err_rl_in_rcu(fs_info,
1063                     "unable to fixup (nodatasum) error at logical %llu on dev %s",
1064                         fixup->logical, rcu_str_deref(fixup->dev->name));
1065         }
1066
1067         btrfs_free_path(path);
1068         kfree(fixup);
1069
1070         scrub_pending_trans_workers_dec(sctx);
1071 }
1072
1073 static inline void scrub_get_recover(struct scrub_recover *recover)
1074 {
1075         refcount_inc(&recover->refs);
1076 }
1077
1078 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
1079                                      struct scrub_recover *recover)
1080 {
1081         if (refcount_dec_and_test(&recover->refs)) {
1082                 btrfs_bio_counter_dec(fs_info);
1083                 btrfs_put_bbio(recover->bbio);
1084                 kfree(recover);
1085         }
1086 }
1087
1088 /*
1089  * scrub_handle_errored_block gets called when either verification of the
1090  * pages failed or the bio failed to read, e.g. with EIO. In the latter
1091  * case, this function handles all pages in the bio, even though only one
1092  * may be bad.
1093  * The goal of this function is to repair the errored block by using the
1094  * contents of one of the mirrors.
      *
      * @sblock_to_check: the block whose read or verification failed; its
      *                   first page supplies the logical address, mirror
      *                   number, flags, checksum presence and device used
      *                   below.
      *
      * Returns 0 for a super block error (which is only counted) and after a
      * repair attempt, whether or not the repair succeeded; the outcome is
      * recorded in sctx->stat. A negative value is returned only when
      * lock_full_stripe() or unlock_full_stripe() fails.
1095  */
1096 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1097 {
1098         struct scrub_ctx *sctx = sblock_to_check->sctx;
1099         struct btrfs_device *dev;
1100         struct btrfs_fs_info *fs_info;
1101         u64 logical;
1102         unsigned int failed_mirror_index;
1103         unsigned int is_metadata;
1104         unsigned int have_csum;
1105         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
1106         struct scrub_block *sblock_bad;
1107         int ret;
1108         int mirror_index;
1109         int page_num;
1110         int success;
1111         bool full_stripe_locked;
             /*
              * static: one rate limit state shared by every call; it gates
              * the scrub_print_warning() calls further down.
              */
1112         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1113                                       DEFAULT_RATELIMIT_BURST);
1114
1115         BUG_ON(sblock_to_check->page_count < 1);
1116         fs_info = sctx->fs_info;
1117         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1118                 /*
1119                  * if we find an error in a super block, we just report it.
1120                  * They will get written with the next transaction commit
1121                  * anyway
1122                  */
1123                 spin_lock(&sctx->stat_lock);
1124                 ++sctx->stat.super_errors;
1125                 spin_unlock(&sctx->stat_lock);
1126                 return 0;
1127         }
1128         logical = sblock_to_check->pagev[0]->logical;
1129         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
             /* mirror numbers are 1-based, the recheck array is 0-based */
1130         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
1131         is_metadata = !(sblock_to_check->pagev[0]->flags &
1132                         BTRFS_EXTENT_FLAG_DATA);
1133         have_csum = sblock_to_check->pagev[0]->have_csum;
1134         dev = sblock_to_check->pagev[0]->dev;
1135
1136         /*
1137          * For RAID5/6, race can happen for a different device scrub thread.
1138          * For data corruption, Parity and Data threads will both try
1139          * to recover the data.
1140          * Race can lead to doubly added csum error, or even unrecoverable
1141          * error.
1142          */
1143         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1144         if (ret < 0) {
1145                 spin_lock(&sctx->stat_lock);
1146                 if (ret == -ENOMEM)
1147                         sctx->stat.malloc_errors++;
1148                 sctx->stat.read_errors++;
1149                 sctx->stat.uncorrectable_errors++;
1150                 spin_unlock(&sctx->stat_lock);
1151                 return ret;
1152         }
1153
             /*
              * dev-replace of data without a checksum: skip the recheck and
              * go straight to the nodatasum worker. sblocks_for_recheck is
              * set to NULL so the cleanup at "out" has nothing to release.
              */
1154         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
1155                 sblocks_for_recheck = NULL;
1156                 goto nodatasum_case;
1157         }
1158
1159         /*
1160          * read all mirrors one after the other. This includes to
1161          * re-read the extent or metadata block that failed (that was
1162          * the cause that this fixup code is called) another time,
1163          * page by page this time in order to know which pages
1164          * caused I/O errors and which ones are good (for all mirrors).
1165          * It is the goal to handle the situation when more than one
1166          * mirror contains I/O errors, but the errors do not
1167          * overlap, i.e. the data can be repaired by selecting the
1168          * pages from those mirrors without I/O error on the
1169          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
1170          * would be that mirror #1 has an I/O error on the first page,
1171          * the second page is good, and mirror #2 has an I/O error on
1172          * the second page, but the first page is good.
1173          * Then the first page of the first mirror can be repaired by
1174          * taking the first page of the second mirror, and the
1175          * second page of the second mirror can be repaired by
1176          * copying the contents of the 2nd page of the 1st mirror.
1177          * One more note: if the pages of one mirror contain I/O
1178          * errors, the checksum cannot be verified. In order to get
1179          * the best data for repairing, the first attempt is to find
1180          * a mirror without I/O errors and with a validated checksum.
1181          * Only if this is not possible, the pages are picked from
1182          * mirrors with I/O errors without considering the checksum.
1183          * If the latter is the case, at the end, the checksum of the
1184          * repaired area is verified in order to correctly maintain
1185          * the statistics.
1186          */
1187
1188         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
1189                                       sizeof(*sblocks_for_recheck), GFP_NOFS);
1190         if (!sblocks_for_recheck) {
1191                 spin_lock(&sctx->stat_lock);
1192                 sctx->stat.malloc_errors++;
1193                 sctx->stat.read_errors++;
1194                 sctx->stat.uncorrectable_errors++;
1195                 spin_unlock(&sctx->stat_lock);
1196                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1197                 goto out;
1198         }
1199
1200         /* setup the context, map the logical blocks and alloc the pages */
1201         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
1202         if (ret) {
1203                 spin_lock(&sctx->stat_lock);
1204                 sctx->stat.read_errors++;
1205                 sctx->stat.uncorrectable_errors++;
1206                 spin_unlock(&sctx->stat_lock);
1207                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1208                 goto out;
1209         }
1210         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1211         sblock_bad = sblocks_for_recheck + failed_mirror_index;
1212
1213         /* build and submit the bios for the failed mirror, check checksums */
1214         scrub_recheck_block(fs_info, sblock_bad, 1);
1215
1216         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1217             sblock_bad->no_io_error_seen) {
1218                 /*
1219                  * the error disappeared after reading page by page, or
1220                  * the area was part of a huge bio and other parts of the
1221                  * bio caused I/O errors, or the block layer merged several
1222                  * read requests into one and the error is caused by a
1223                  * different bio (usually one of the two latter cases is
1224                  * the cause)
1225                  */
1226                 spin_lock(&sctx->stat_lock);
1227                 sctx->stat.unverified_errors++;
1228                 sblock_to_check->data_corrected = 1;
1229                 spin_unlock(&sctx->stat_lock);
1230
1231                 if (sctx->is_dev_replace)
1232                         scrub_write_block_to_dev_replace(sblock_bad);
1233                 goto out;
1234         }
1235
             /*
              * Classify the failure for the statistics. Only the first
              * matching class is counted: I/O error, then checksum error,
              * then header error.
              */
1236         if (!sblock_bad->no_io_error_seen) {
1237                 spin_lock(&sctx->stat_lock);
1238                 sctx->stat.read_errors++;
1239                 spin_unlock(&sctx->stat_lock);
1240                 if (__ratelimit(&_rs))
1241                         scrub_print_warning("i/o error", sblock_to_check);
1242                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1243         } else if (sblock_bad->checksum_error) {
1244                 spin_lock(&sctx->stat_lock);
1245                 sctx->stat.csum_errors++;
1246                 spin_unlock(&sctx->stat_lock);
1247                 if (__ratelimit(&_rs))
1248                         scrub_print_warning("checksum error", sblock_to_check);
1249                 btrfs_dev_stat_inc_and_print(dev,
1250                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1251         } else if (sblock_bad->header_error) {
1252                 spin_lock(&sctx->stat_lock);
1253                 sctx->stat.verify_errors++;
1254                 spin_unlock(&sctx->stat_lock);
1255                 if (__ratelimit(&_rs))
1256                         scrub_print_warning("checksum/header error",
1257                                             sblock_to_check);
1258                 if (sblock_bad->generation_error)
1259                         btrfs_dev_stat_inc_and_print(dev,
1260                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1261                 else
1262                         btrfs_dev_stat_inc_and_print(dev,
1263                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1264         }
1265
             /* a read-only scrub stops after counting: nothing is rewritten */
1266         if (sctx->readonly) {
1267                 ASSERT(!sctx->is_dev_replace);
1268                 goto out;
1269         }
1270
1271         if (!is_metadata && !have_csum) {
1272                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1273
1274                 WARN_ON(sctx->is_dev_replace);
1275
                     /*
                      * Also entered by goto from the dev-replace shortcut near
                      * the top; on that path sblocks_for_recheck is NULL and
                      * sblock_bad has not been assigned.
                      */
1276 nodatasum_case:
1277
1278                 /*
1279                  * !is_metadata and !have_csum, this means that the data
1280                  * might not be COWed, that it might be modified
1281                  * concurrently. The general strategy to work on the
1282                  * commit root does not help in the case when COW is not
1283                  * used.
1284                  */
1285                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1286                 if (!fixup_nodatasum)
1287                         goto did_not_correct_error;
1288                 fixup_nodatasum->sctx = sctx;
1289                 fixup_nodatasum->dev = dev;
1290                 fixup_nodatasum->logical = logical;
1291                 fixup_nodatasum->root = fs_info->extent_root;
1292                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
                     /* the worker drops this count and frees the structure */
1293                 scrub_pending_trans_workers_inc(sctx);
1294                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1295                                 scrub_fixup_nodatasum, NULL, NULL);
1296                 btrfs_queue_work(fs_info->scrub_workers,
1297                                  &fixup_nodatasum->work);
1298                 goto out;
1299         }
1300
1301         /*
1302          * now build and submit the bios for the other mirrors, check
1303          * checksums.
1304          * First try to pick the mirror which is completely without I/O
1305          * errors and also does not have a checksum error.
1306          * If one is found, and if a checksum is present, the full block
1307          * that is known to contain an error is rewritten. Afterwards
1308          * the block is known to be corrected.
1309          * If a mirror is found which is completely correct, and no
1310          * checksum is present, only those pages are rewritten that had
1311          * an I/O error in the block to be repaired, since it cannot be
1312          * determined, which copy of the other pages is better (and it
1313          * could happen otherwise that a correct page would be
1314          * overwritten by a bad one).
1315          */
             /*
              * No loop condition: the loop ends through one of the break
              * statements or the goto corrected_error below.
              */
1316         for (mirror_index = 0; ;mirror_index++) {
1317                 struct scrub_block *sblock_other;
1318
1319                 if (mirror_index == failed_mirror_index)
1320                         continue;
1321
1322                 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1323                 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1324                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1325                                 break;
1326                         if (!sblocks_for_recheck[mirror_index].page_count)
1327                                 break;
1328
1329                         sblock_other = sblocks_for_recheck + mirror_index;
1330                 } else {
1331                         struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1332                         int max_allowed = r->bbio->num_stripes -
1333                                                 r->bbio->num_tgtdevs;
1334
1335                         if (mirror_index >= max_allowed)
1336                                 break;
1337                         if (!sblocks_for_recheck[1].page_count)
1338                                 break;
1339
                             /*
                              * raid56 reuses recheck slot 1 for every attempt
                              * and only changes the mirror number stored in
                              * its first page.
                              */
1340                         ASSERT(failed_mirror_index == 0);
1341                         sblock_other = sblocks_for_recheck + 1;
1342                         sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1343                 }
1344
1345                 /* build and submit the bios, check checksums */
1346                 scrub_recheck_block(fs_info, sblock_other, 0);
1347
1348                 if (!sblock_other->header_error &&
1349                     !sblock_other->checksum_error &&
1350                     sblock_other->no_io_error_seen) {
1351                         if (sctx->is_dev_replace) {
1352                                 scrub_write_block_to_dev_replace(sblock_other);
1353                                 goto corrected_error;
1354                         } else {
1355                                 ret = scrub_repair_block_from_good_copy(
1356                                                 sblock_bad, sblock_other);
1357                                 if (!ret)
1358                                         goto corrected_error;
1359                         }
1360                 }
1361         }
1362
             /*
              * No fully good mirror was found. Outside dev-replace, the
              * page-wise repair below only helps when the bad block had I/O
              * errors.
              */
1363         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1364                 goto did_not_correct_error;
1365
1366         /*
1367          * In case of I/O errors in the area that is supposed to be
1368          * repaired, continue by picking good copies of those pages.
1369          * Select the good pages from mirrors to rewrite bad pages from
1370          * the area to fix. Afterwards verify the checksum of the block
1371          * that is supposed to be repaired. This verification step is
1372          * only done for the purpose of statistic counting and for the
1373          * final scrub report, whether errors remain.
1374          * A perfect algorithm could make use of the checksum and try
1375          * all possible combinations of pages from the different mirrors
1376          * until the checksum verification succeeds. For example, when
1377          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1378          * of mirror #2 is readable but the final checksum test fails,
1379          * then the 2nd page of mirror #3 could be tried, whether now
1380          * the final checksum succeeds. But this would be a rare
1381          * exception and is therefore not implemented. At least it is
1382          * avoided that the good copy is overwritten.
1383          * A more useful improvement would be to pick the sectors
1384          * without I/O error based on sector sizes (512 bytes on legacy
1385          * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1386          * mirror could be repaired by taking 512 byte of a different
1387          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1388          * area are unreadable.
1389          */
1390         success = 1;
1391         for (page_num = 0; page_num < sblock_bad->page_count;
1392              page_num++) {
1393                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1394                 struct scrub_block *sblock_other = NULL;
1395
1396                 /* skip no-io-error page in scrub */
1397                 if (!page_bad->io_error && !sctx->is_dev_replace)
1398                         continue;
1399
1400                 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1401                         /*
1402                          * In case of dev replace, if raid56 rebuild process
1403                          * didn't work out correct data, then copy the content
1404                          * in sblock_bad to make sure target device is identical
1405                          * to source device, instead of writing garbage data in
1406                          * sblock_for_recheck array to target device.
1407                          */
1408                         sblock_other = NULL;
1409                 } else if (page_bad->io_error) {
1410                         /* try to find no-io-error page in mirrors */
1411                         for (mirror_index = 0;
1412                              mirror_index < BTRFS_MAX_MIRRORS &&
1413                              sblocks_for_recheck[mirror_index].page_count > 0;
1414                              mirror_index++) {
1415                                 if (!sblocks_for_recheck[mirror_index].
1416                                     pagev[page_num]->io_error) {
1417                                         sblock_other = sblocks_for_recheck +
1418                                                        mirror_index;
1419                                         break;
1420                                 }
1421                         }
1422                         if (!sblock_other)
1423                                 success = 0;
1424                 }
1425
1426                 if (sctx->is_dev_replace) {
1427                         /*
1428                          * did not find a mirror to fetch the page
1429                          * from. scrub_write_page_to_dev_replace()
1430                          * handles this case (page->io_error), by
1431                          * filling the block with zeros before
1432                          * submitting the write request
1433                          */
1434                         if (!sblock_other)
1435                                 sblock_other = sblock_bad;
1436
1437                         if (scrub_write_page_to_dev_replace(sblock_other,
1438                                                             page_num) != 0) {
1439                                 btrfs_dev_replace_stats_inc(
1440                                         &fs_info->dev_replace.num_write_errors);
1441                                 success = 0;
1442                         }
1443                 } else if (sblock_other) {
1444                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1445                                                                sblock_other,
1446                                                                page_num, 0);
1447                         if (0 == ret)
1448                                 page_bad->io_error = 0;
1449                         else
1450                                 success = 0;
1451                 }
1452         }
1453
             /*
              * In dev-replace mode this condition is false, so the
              * did_not_correct_error arm below is taken even if every page
              * was written.
              */
1454         if (success && !sctx->is_dev_replace) {
1455                 if (is_metadata || have_csum) {
1456                         /*
1457                          * need to verify the checksum now that all
1458                          * sectors on disk are repaired (the write
1459                          * request for data to be repaired is on its way).
1460                          * Just be lazy and use scrub_recheck_block()
1461                          * which re-reads the data before the checksum
1462                          * is verified, but most likely the data comes out
1463                          * of the page cache.
1464                          */
1465                         scrub_recheck_block(fs_info, sblock_bad, 1);
1466                         if (!sblock_bad->header_error &&
1467                             !sblock_bad->checksum_error &&
1468                             sblock_bad->no_io_error_seen)
1469                                 goto corrected_error;
1470                         else
1471                                 goto did_not_correct_error;
1472                 } else {
                             /* also a goto target of the mirror loop and of the recheck just above */
1473 corrected_error:
1474                         spin_lock(&sctx->stat_lock);
1475                         sctx->stat.corrected_errors++;
1476                         sblock_to_check->data_corrected = 1;
1477                         spin_unlock(&sctx->stat_lock);
1478                         btrfs_err_rl_in_rcu(fs_info,
1479                                 "fixed up error at logical %llu on dev %s",
1480                                 logical, rcu_str_deref(dev->name));
1481                 }
1482         } else {
                     /* also a goto target of the earlier failure paths */
1483 did_not_correct_error:
1484                 spin_lock(&sctx->stat_lock);
1485                 sctx->stat.uncorrectable_errors++;
1486                 spin_unlock(&sctx->stat_lock);
1487                 btrfs_err_rl_in_rcu(fs_info,
1488                         "unable to fixup (regular) error at logical %llu on dev %s",
1489                         logical, rcu_str_deref(dev->name));
1490         }
1491
1492 out:
             /*
              * Release every page of every recheck block, dropping the page's
              * recover reference first if it has one. Skipped when the array
              * was never allocated.
              */
1493         if (sblocks_for_recheck) {
1494                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1495                      mirror_index++) {
1496                         struct scrub_block *sblock = sblocks_for_recheck +
1497                                                      mirror_index;
1498                         struct scrub_recover *recover;
1499                         int page_index;
1500
1501                         for (page_index = 0; page_index < sblock->page_count;
1502                              page_index++) {
1503                                 sblock->pagev[page_index]->sblock = NULL;
1504                                 recover = sblock->pagev[page_index]->recover;
1505                                 if (recover) {
1506                                         scrub_put_recover(fs_info, recover);
1507                                         sblock->pagev[page_index]->recover =
1508                                                                         NULL;
1509                                 }
1510                                 scrub_page_put(sblock->pagev[page_index]);
1511                         }
1512                 }
1513                 kfree(sblocks_for_recheck);
1514         }
1515
             /* pairs with the lock_full_stripe() call near the top */
1516         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1517         if (ret < 0)
1518                 return ret;
1519         return 0;
1520 }
1521
1522 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1523 {
1524         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1525                 return 2;
1526         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1527                 return 3;
1528         else
1529                 return (int)bbio->num_stripes;
1530 }
1531
1532 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1533                                                  u64 *raid_map,
1534                                                  u64 mapped_length,
1535                                                  int nstripes, int mirror,
1536                                                  int *stripe_index,
1537                                                  u64 *stripe_offset)
1538 {
1539         int i;
1540
1541         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1542                 /* RAID5/6 */
1543                 for (i = 0; i < nstripes; i++) {
1544                         if (raid_map[i] == RAID6_Q_STRIPE ||
1545                             raid_map[i] == RAID5_P_STRIPE)
1546                                 continue;
1547
1548                         if (logical >= raid_map[i] &&
1549                             logical < raid_map[i] + mapped_length)
1550                                 break;
1551                 }
1552
1553                 *stripe_index = i;
1554                 *stripe_offset = logical - raid_map[i];
1555         } else {
1556                 /* The other RAID type */
1557                 *stripe_index = mirror;
1558                 *stripe_offset = 0;
1559         }
1560 }
1561
1562 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1563                                      struct scrub_block *sblocks_for_recheck)
1564 {
1565         struct scrub_ctx *sctx = original_sblock->sctx;
1566         struct btrfs_fs_info *fs_info = sctx->fs_info;
1567         u64 length = original_sblock->page_count * PAGE_SIZE;
1568         u64 logical = original_sblock->pagev[0]->logical;
1569         u64 generation = original_sblock->pagev[0]->generation;
1570         u64 flags = original_sblock->pagev[0]->flags;
1571         u64 have_csum = original_sblock->pagev[0]->have_csum;
1572         struct scrub_recover *recover;
1573         struct btrfs_bio *bbio;
1574         u64 sublen;
1575         u64 mapped_length;
1576         u64 stripe_offset;
1577         int stripe_index;
1578         int page_index = 0;
1579         int mirror_index;
1580         int nmirrors;
1581         int ret;
1582
1583         /*
1584          * note: the two members refs and outstanding_pages
1585          * are not used (and not set) in the blocks that are used for
1586          * the recheck procedure
1587          */
1588
1589         while (length > 0) {
1590                 sublen = min_t(u64, length, PAGE_SIZE);
1591                 mapped_length = sublen;
1592                 bbio = NULL;
1593
1594                 /*
1595                  * with a length of PAGE_SIZE, each returned stripe
1596                  * represents one mirror
1597                  */
1598                 btrfs_bio_counter_inc_blocked(fs_info);
1599                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1600                                 logical, &mapped_length, &bbio);
1601                 if (ret || !bbio || mapped_length < sublen) {
1602                         btrfs_put_bbio(bbio);
1603                         btrfs_bio_counter_dec(fs_info);
1604                         return -EIO;
1605                 }
1606
1607                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1608                 if (!recover) {
1609                         btrfs_put_bbio(bbio);
1610                         btrfs_bio_counter_dec(fs_info);
1611                         return -ENOMEM;
1612                 }
1613
1614                 refcount_set(&recover->refs, 1);
1615                 recover->bbio = bbio;
1616                 recover->map_length = mapped_length;
1617
1618                 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1619
1620                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1621
1622                 for (mirror_index = 0; mirror_index < nmirrors;
1623                      mirror_index++) {
1624                         struct scrub_block *sblock;
1625                         struct scrub_page *page;
1626
1627                         sblock = sblocks_for_recheck + mirror_index;
1628                         sblock->sctx = sctx;
1629
1630                         page = kzalloc(sizeof(*page), GFP_NOFS);
1631                         if (!page) {
1632 leave_nomem:
1633                                 spin_lock(&sctx->stat_lock);
1634                                 sctx->stat.malloc_errors++;
1635                                 spin_unlock(&sctx->stat_lock);
1636                                 scrub_put_recover(fs_info, recover);
1637                                 return -ENOMEM;
1638                         }
1639                         scrub_page_get(page);
1640                         sblock->pagev[page_index] = page;
1641                         page->sblock = sblock;
1642                         page->flags = flags;
1643                         page->generation = generation;
1644                         page->logical = logical;
1645                         page->have_csum = have_csum;
1646                         if (have_csum)
1647                                 memcpy(page->csum,
1648                                        original_sblock->pagev[0]->csum,
1649                                        sctx->csum_size);
1650
1651                         scrub_stripe_index_and_offset(logical,
1652                                                       bbio->map_type,
1653                                                       bbio->raid_map,
1654                                                       mapped_length,
1655                                                       bbio->num_stripes -
1656                                                       bbio->num_tgtdevs,
1657                                                       mirror_index,
1658                                                       &stripe_index,
1659                                                       &stripe_offset);
1660                         page->physical = bbio->stripes[stripe_index].physical +
1661                                          stripe_offset;
1662                         page->dev = bbio->stripes[stripe_index].dev;
1663
1664                         BUG_ON(page_index >= original_sblock->page_count);
1665                         page->physical_for_dev_replace =
1666                                 original_sblock->pagev[page_index]->
1667                                 physical_for_dev_replace;
1668                         /* for missing devices, dev->bdev is NULL */
1669                         page->mirror_num = mirror_index + 1;
1670                         sblock->page_count++;
1671                         page->page = alloc_page(GFP_NOFS);
1672                         if (!page->page)
1673                                 goto leave_nomem;
1674
1675                         scrub_get_recover(recover);
1676                         page->recover = recover;
1677                 }
1678                 scrub_put_recover(fs_info, recover);
1679                 length -= sublen;
1680                 logical += sublen;
1681                 page_index++;
1682         }
1683
1684         return 0;
1685 }
1686
1687 static void scrub_bio_wait_endio(struct bio *bio)
1688 {
1689         complete(bio->bi_private);
1690 }
1691
1692 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1693                                         struct bio *bio,
1694                                         struct scrub_page *page)
1695 {
1696         DECLARE_COMPLETION_ONSTACK(done);
1697         int ret;
1698         int mirror_num;
1699
1700         bio->bi_iter.bi_sector = page->logical >> 9;
1701         bio->bi_private = &done;
1702         bio->bi_end_io = scrub_bio_wait_endio;
1703
1704         mirror_num = page->sblock->pagev[0]->mirror_num;
1705         ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1706                                     page->recover->map_length,
1707                                     mirror_num, 0);
1708         if (ret)
1709                 return ret;
1710
1711         wait_for_completion_io(&done);
1712         return blk_status_to_errno(bio->bi_status);
1713 }
1714
1715 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1716                                           struct scrub_block *sblock)
1717 {
1718         struct scrub_page *first_page = sblock->pagev[0];
1719         struct bio *bio;
1720         int page_num;
1721
1722         /* All pages in sblock belong to the same stripe on the same device. */
1723         ASSERT(first_page->dev);
1724         if (!first_page->dev->bdev)
1725                 goto out;
1726
1727         bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1728         bio_set_dev(bio, first_page->dev->bdev);
1729
1730         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1731                 struct scrub_page *page = sblock->pagev[page_num];
1732
1733                 WARN_ON(!page->page);
1734                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1735         }
1736
1737         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1738                 bio_put(bio);
1739                 goto out;
1740         }
1741
1742         bio_put(bio);
1743
1744         scrub_recheck_block_checksum(sblock);
1745
1746         return;
1747 out:
1748         for (page_num = 0; page_num < sblock->page_count; page_num++)
1749                 sblock->pagev[page_num]->io_error = 1;
1750
1751         sblock->no_io_error_seen = 0;
1752 }
1753
1754 /*
1755  * this function will check the on disk data for checksum errors, header
1756  * errors and read I/O errors. If any I/O errors happen, the exact pages
1757  * which are errored are marked as being bad. The goal is to enable scrub
1758  * to take those pages that are not errored from all the mirrors so that
1759  * the pages that are errored in the just handled mirror can be repaired.
1760  */
1761 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1762                                 struct scrub_block *sblock,
1763                                 int retry_failed_mirror)
1764 {
1765         int page_num;
1766
1767         sblock->no_io_error_seen = 1;
1768
1769         /* short cut for raid56 */
1770         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1771                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1772
1773         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1774                 struct bio *bio;
1775                 struct scrub_page *page = sblock->pagev[page_num];
1776
1777                 if (page->dev->bdev == NULL) {
1778                         page->io_error = 1;
1779                         sblock->no_io_error_seen = 0;
1780                         continue;
1781                 }
1782
1783                 WARN_ON(!page->page);
1784                 bio = btrfs_io_bio_alloc(1);
1785                 bio_set_dev(bio, page->dev->bdev);
1786
1787                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1788                 bio->bi_iter.bi_sector = page->physical >> 9;
1789                 bio->bi_opf = REQ_OP_READ;
1790
1791                 if (btrfsic_submit_bio_wait(bio)) {
1792                         page->io_error = 1;
1793                         sblock->no_io_error_seen = 0;
1794                 }
1795
1796                 bio_put(bio);
1797         }
1798
1799         if (sblock->no_io_error_seen)
1800                 scrub_recheck_block_checksum(sblock);
1801 }
1802
1803 static inline int scrub_check_fsid(u8 fsid[],
1804                                    struct scrub_page *spage)
1805 {
1806         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1807         int ret;
1808
1809         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1810         return !ret;
1811 }
1812
1813 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1814 {
1815         sblock->header_error = 0;
1816         sblock->checksum_error = 0;
1817         sblock->generation_error = 0;
1818
1819         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1820                 scrub_checksum_data(sblock);
1821         else
1822                 scrub_checksum_tree_block(sblock);
1823 }
1824
1825 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1826                                              struct scrub_block *sblock_good)
1827 {
1828         int page_num;
1829         int ret = 0;
1830
1831         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1832                 int ret_sub;
1833
1834                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1835                                                            sblock_good,
1836                                                            page_num, 1);
1837                 if (ret_sub)
1838                         ret = ret_sub;
1839         }
1840
1841         return ret;
1842 }
1843
1844 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1845                                             struct scrub_block *sblock_good,
1846                                             int page_num, int force_write)
1847 {
1848         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1849         struct scrub_page *page_good = sblock_good->pagev[page_num];
1850         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1851
1852         BUG_ON(page_bad->page == NULL);
1853         BUG_ON(page_good->page == NULL);
1854         if (force_write || sblock_bad->header_error ||
1855             sblock_bad->checksum_error || page_bad->io_error) {
1856                 struct bio *bio;
1857                 int ret;
1858
1859                 if (!page_bad->dev->bdev) {
1860                         btrfs_warn_rl(fs_info,
1861                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1862                         return -EIO;
1863                 }
1864
1865                 bio = btrfs_io_bio_alloc(1);
1866                 bio_set_dev(bio, page_bad->dev->bdev);
1867                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1868                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1869
1870                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1871                 if (PAGE_SIZE != ret) {
1872                         bio_put(bio);
1873                         return -EIO;
1874                 }
1875
1876                 if (btrfsic_submit_bio_wait(bio)) {
1877                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1878                                 BTRFS_DEV_STAT_WRITE_ERRS);
1879                         btrfs_dev_replace_stats_inc(
1880                                 &fs_info->dev_replace.num_write_errors);
1881                         bio_put(bio);
1882                         return -EIO;
1883                 }
1884                 bio_put(bio);
1885         }
1886
1887         return 0;
1888 }
1889
1890 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1891 {
1892         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1893         int page_num;
1894
1895         /*
1896          * This block is used for the check of the parity on the source device,
1897          * so the data needn't be written into the destination device.
1898          */
1899         if (sblock->sparity)
1900                 return;
1901
1902         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1903                 int ret;
1904
1905                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1906                 if (ret)
1907                         btrfs_dev_replace_stats_inc(
1908                                 &fs_info->dev_replace.num_write_errors);
1909         }
1910 }
1911
1912 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1913                                            int page_num)
1914 {
1915         struct scrub_page *spage = sblock->pagev[page_num];
1916
1917         BUG_ON(spage->page == NULL);
1918         if (spage->io_error) {
1919                 void *mapped_buffer = kmap_atomic(spage->page);
1920
1921                 clear_page(mapped_buffer);
1922                 flush_dcache_page(spage->page);
1923                 kunmap_atomic(mapped_buffer);
1924         }
1925         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1926 }
1927
1928 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1929                                     struct scrub_page *spage)
1930 {
1931         struct scrub_bio *sbio;
1932         int ret;
1933
1934         mutex_lock(&sctx->wr_lock);
1935 again:
1936         if (!sctx->wr_curr_bio) {
1937                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1938                                               GFP_KERNEL);
1939                 if (!sctx->wr_curr_bio) {
1940                         mutex_unlock(&sctx->wr_lock);
1941                         return -ENOMEM;
1942                 }
1943                 sctx->wr_curr_bio->sctx = sctx;
1944                 sctx->wr_curr_bio->page_count = 0;
1945         }
1946         sbio = sctx->wr_curr_bio;
1947         if (sbio->page_count == 0) {
1948                 struct bio *bio;
1949
1950                 sbio->physical = spage->physical_for_dev_replace;
1951                 sbio->logical = spage->logical;
1952                 sbio->dev = sctx->wr_tgtdev;
1953                 bio = sbio->bio;
1954                 if (!bio) {
1955                         bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1956                         sbio->bio = bio;
1957                 }
1958
1959                 bio->bi_private = sbio;
1960                 bio->bi_end_io = scrub_wr_bio_end_io;
1961                 bio_set_dev(bio, sbio->dev->bdev);
1962                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1963                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1964                 sbio->status = 0;
1965         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1966                    spage->physical_for_dev_replace ||
1967                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1968                    spage->logical) {
1969                 scrub_wr_submit(sctx);
1970                 goto again;
1971         }
1972
1973         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1974         if (ret != PAGE_SIZE) {
1975                 if (sbio->page_count < 1) {
1976                         bio_put(sbio->bio);
1977                         sbio->bio = NULL;
1978                         mutex_unlock(&sctx->wr_lock);
1979                         return -EIO;
1980                 }
1981                 scrub_wr_submit(sctx);
1982                 goto again;
1983         }
1984
1985         sbio->pagev[sbio->page_count] = spage;
1986         scrub_page_get(spage);
1987         sbio->page_count++;
1988         if (sbio->page_count == sctx->pages_per_wr_bio)
1989                 scrub_wr_submit(sctx);
1990         mutex_unlock(&sctx->wr_lock);
1991
1992         return 0;
1993 }
1994
1995 static void scrub_wr_submit(struct scrub_ctx *sctx)
1996 {
1997         struct scrub_bio *sbio;
1998
1999         if (!sctx->wr_curr_bio)
2000                 return;
2001
2002         sbio = sctx->wr_curr_bio;
2003         sctx->wr_curr_bio = NULL;
2004         WARN_ON(!sbio->bio->bi_disk);
2005         scrub_pending_bio_inc(sctx);
2006         /* process all writes in a single worker thread. Then the block layer
2007          * orders the requests before sending them to the driver which
2008          * doubled the write performance on spinning disks when measured
2009          * with Linux 3.5 */
2010         btrfsic_submit_bio(sbio->bio);
2011 }
2012
2013 static void scrub_wr_bio_end_io(struct bio *bio)
2014 {
2015         struct scrub_bio *sbio = bio->bi_private;
2016         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2017
2018         sbio->status = bio->bi_status;
2019         sbio->bio = bio;
2020
2021         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
2022                          scrub_wr_bio_end_io_worker, NULL, NULL);
2023         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
2024 }
2025
2026 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
2027 {
2028         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2029         struct scrub_ctx *sctx = sbio->sctx;
2030         int i;
2031
2032         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
2033         if (sbio->status) {
2034                 struct btrfs_dev_replace *dev_replace =
2035                         &sbio->sctx->fs_info->dev_replace;
2036
2037                 for (i = 0; i < sbio->page_count; i++) {
2038                         struct scrub_page *spage = sbio->pagev[i];
2039
2040                         spage->io_error = 1;
2041                         btrfs_dev_replace_stats_inc(&dev_replace->
2042                                                     num_write_errors);
2043                 }
2044         }
2045
2046         for (i = 0; i < sbio->page_count; i++)
2047                 scrub_page_put(sbio->pagev[i]);
2048
2049         bio_put(sbio->bio);
2050         kfree(sbio);
2051         scrub_pending_bio_dec(sctx);
2052 }
2053
2054 static int scrub_checksum(struct scrub_block *sblock)
2055 {
2056         u64 flags;
2057         int ret;
2058
2059         /*
2060          * No need to initialize these stats currently,
2061          * because this function only use return value
2062          * instead of these stats value.
2063          *
2064          * Todo:
2065          * always use stats
2066          */
2067         sblock->header_error = 0;
2068         sblock->generation_error = 0;
2069         sblock->checksum_error = 0;
2070
2071         WARN_ON(sblock->page_count < 1);
2072         flags = sblock->pagev[0]->flags;
2073         ret = 0;
2074         if (flags & BTRFS_EXTENT_FLAG_DATA)
2075                 ret = scrub_checksum_data(sblock);
2076         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
2077                 ret = scrub_checksum_tree_block(sblock);
2078         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
2079                 (void)scrub_checksum_super(sblock);
2080         else
2081                 WARN_ON(1);
2082         if (ret)
2083                 scrub_handle_errored_block(sblock);
2084
2085         return ret;
2086 }
2087
2088 static int scrub_checksum_data(struct scrub_block *sblock)
2089 {
2090         struct scrub_ctx *sctx = sblock->sctx;
2091         u8 csum[BTRFS_CSUM_SIZE];
2092         u8 *on_disk_csum;
2093         struct page *page;
2094         void *buffer;
2095         u32 crc = ~(u32)0;
2096         u64 len;
2097         int index;
2098
2099         BUG_ON(sblock->page_count < 1);
2100         if (!sblock->pagev[0]->have_csum)
2101                 return 0;
2102
2103         on_disk_csum = sblock->pagev[0]->csum;
2104         page = sblock->pagev[0]->page;
2105         buffer = kmap_atomic(page);
2106
2107         len = sctx->fs_info->sectorsize;
2108         index = 0;
2109         for (;;) {
2110                 u64 l = min_t(u64, len, PAGE_SIZE);
2111
2112                 crc = btrfs_csum_data(buffer, crc, l);
2113                 kunmap_atomic(buffer);
2114                 len -= l;
2115                 if (len == 0)
2116                         break;
2117                 index++;
2118                 BUG_ON(index >= sblock->page_count);
2119                 BUG_ON(!sblock->pagev[index]->page);
2120                 page = sblock->pagev[index]->page;
2121                 buffer = kmap_atomic(page);
2122         }
2123
2124         btrfs_csum_final(crc, csum);
2125         if (memcmp(csum, on_disk_csum, sctx->csum_size))
2126                 sblock->checksum_error = 1;
2127
2128         return sblock->checksum_error;
2129 }
2130
2131 static int scrub_checksum_tree_block(struct scrub_block *sblock)
2132 {
2133         struct scrub_ctx *sctx = sblock->sctx;
2134         struct btrfs_header *h;
2135         struct btrfs_fs_info *fs_info = sctx->fs_info;
2136         u8 calculated_csum[BTRFS_CSUM_SIZE];
2137         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2138         struct page *page;
2139         void *mapped_buffer;
2140         u64 mapped_size;
2141         void *p;
2142         u32 crc = ~(u32)0;
2143         u64 len;
2144         int index;
2145
2146         BUG_ON(sblock->page_count < 1);
2147         page = sblock->pagev[0]->page;
2148         mapped_buffer = kmap_atomic(page);
2149         h = (struct btrfs_header *)mapped_buffer;
2150         memcpy(on_disk_csum, h->csum, sctx->csum_size);
2151
2152         /*
2153          * we don't use the getter functions here, as we
2154          * a) don't have an extent buffer and
2155          * b) the page is already kmapped
2156          */
2157         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
2158                 sblock->header_error = 1;
2159
2160         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
2161                 sblock->header_error = 1;
2162                 sblock->generation_error = 1;
2163         }
2164
2165         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
2166                 sblock->header_error = 1;
2167
2168         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2169                    BTRFS_UUID_SIZE))
2170                 sblock->header_error = 1;
2171
2172         len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
2173         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2174         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2175         index = 0;
2176         for (;;) {
2177                 u64 l = min_t(u64, len, mapped_size);
2178
2179                 crc = btrfs_csum_data(p, crc, l);
2180                 kunmap_atomic(mapped_buffer);
2181                 len -= l;
2182                 if (len == 0)
2183                         break;
2184                 index++;
2185                 BUG_ON(index >= sblock->page_count);
2186                 BUG_ON(!sblock->pagev[index]->page);
2187                 page = sblock->pagev[index]->page;
2188                 mapped_buffer = kmap_atomic(page);
2189                 mapped_size = PAGE_SIZE;
2190                 p = mapped_buffer;
2191         }
2192
2193         btrfs_csum_final(crc, calculated_csum);
2194         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2195                 sblock->checksum_error = 1;
2196
2197         return sblock->header_error || sblock->checksum_error;
2198 }
2199
2200 static int scrub_checksum_super(struct scrub_block *sblock)
2201 {
2202         struct btrfs_super_block *s;
2203         struct scrub_ctx *sctx = sblock->sctx;
2204         u8 calculated_csum[BTRFS_CSUM_SIZE];
2205         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2206         struct page *page;
2207         void *mapped_buffer;
2208         u64 mapped_size;
2209         void *p;
2210         u32 crc = ~(u32)0;
2211         int fail_gen = 0;
2212         int fail_cor = 0;
2213         u64 len;
2214         int index;
2215
2216         BUG_ON(sblock->page_count < 1);
2217         page = sblock->pagev[0]->page;
2218         mapped_buffer = kmap_atomic(page);
2219         s = (struct btrfs_super_block *)mapped_buffer;
2220         memcpy(on_disk_csum, s->csum, sctx->csum_size);
2221
2222         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
2223                 ++fail_cor;
2224
2225         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
2226                 ++fail_gen;
2227
2228         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2229                 ++fail_cor;
2230
2231         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2232         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2233         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2234         index = 0;
2235         for (;;) {
2236                 u64 l = min_t(u64, len, mapped_size);
2237
2238                 crc = btrfs_csum_data(p, crc, l);
2239                 kunmap_atomic(mapped_buffer);
2240                 len -= l;
2241                 if (len == 0)
2242                         break;
2243                 index++;
2244                 BUG_ON(index >= sblock->page_count);
2245                 BUG_ON(!sblock->pagev[index]->page);
2246                 page = sblock->pagev[index]->page;
2247                 mapped_buffer = kmap_atomic(page);
2248                 mapped_size = PAGE_SIZE;
2249                 p = mapped_buffer;
2250         }
2251
2252         btrfs_csum_final(crc, calculated_csum);
2253         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2254                 ++fail_cor;
2255
2256         if (fail_cor + fail_gen) {
2257                 /*
2258                  * if we find an error in a super block, we just report it.
2259                  * They will get written with the next transaction commit
2260                  * anyway
2261                  */
2262                 spin_lock(&sctx->stat_lock);
2263                 ++sctx->stat.super_errors;
2264                 spin_unlock(&sctx->stat_lock);
2265                 if (fail_cor)
2266                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2267                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2268                 else
2269                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2270                                 BTRFS_DEV_STAT_GENERATION_ERRS);
2271         }
2272
2273         return fail_cor + fail_gen;
2274 }
2275
2276 static void scrub_block_get(struct scrub_block *sblock)
2277 {
2278         refcount_inc(&sblock->refs);
2279 }
2280
2281 static void scrub_block_put(struct scrub_block *sblock)
2282 {
2283         if (refcount_dec_and_test(&sblock->refs)) {
2284                 int i;
2285
2286                 if (sblock->sparity)
2287                         scrub_parity_put(sblock->sparity);
2288
2289                 for (i = 0; i < sblock->page_count; i++)
2290                         scrub_page_put(sblock->pagev[i]);
2291                 kfree(sblock);
2292         }
2293 }
2294
2295 static void scrub_page_get(struct scrub_page *spage)
2296 {
2297         atomic_inc(&spage->refs);
2298 }
2299
2300 static void scrub_page_put(struct scrub_page *spage)
2301 {
2302         if (atomic_dec_and_test(&spage->refs)) {
2303                 if (spage->page)
2304                         __free_page(spage->page);
2305                 kfree(spage);
2306         }
2307 }
2308
2309 static void scrub_submit(struct scrub_ctx *sctx)
2310 {
2311         struct scrub_bio *sbio;
2312
2313         if (sctx->curr == -1)
2314                 return;
2315
2316         sbio = sctx->bios[sctx->curr];
2317         sctx->curr = -1;
2318         scrub_pending_bio_inc(sctx);
2319         btrfsic_submit_bio(sbio->bio);
2320 }
2321
2322 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2323                                     struct scrub_page *spage)
2324 {
2325         struct scrub_block *sblock = spage->sblock;
2326         struct scrub_bio *sbio;
2327         int ret;
2328
2329 again:
2330         /*
2331          * grab a fresh bio or wait for one to become available
2332          */
2333         while (sctx->curr == -1) {
2334                 spin_lock(&sctx->list_lock);
2335                 sctx->curr = sctx->first_free;
2336                 if (sctx->curr != -1) {
2337                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2338                         sctx->bios[sctx->curr]->next_free = -1;
2339                         sctx->bios[sctx->curr]->page_count = 0;
2340                         spin_unlock(&sctx->list_lock);
2341                 } else {
2342                         spin_unlock(&sctx->list_lock);
2343                         wait_event(sctx->list_wait, sctx->first_free != -1);
2344                 }
2345         }
2346         sbio = sctx->bios[sctx->curr];
2347         if (sbio->page_count == 0) {
2348                 struct bio *bio;
2349
2350                 sbio->physical = spage->physical;
2351                 sbio->logical = spage->logical;
2352                 sbio->dev = spage->dev;
2353                 bio = sbio->bio;
2354                 if (!bio) {
2355                         bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2356                         sbio->bio = bio;
2357                 }
2358
2359                 bio->bi_private = sbio;
2360                 bio->bi_end_io = scrub_bio_end_io;
2361                 bio_set_dev(bio, sbio->dev->bdev);
2362                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2363                 bio_set_op_attrs(bio, REQ_OP_READ, 0);
2364                 sbio->status = 0;
2365         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2366                    spage->physical ||
2367                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2368                    spage->logical ||
2369                    sbio->dev != spage->dev) {
2370                 scrub_submit(sctx);
2371                 goto again;
2372         }
2373
2374         sbio->pagev[sbio->page_count] = spage;
2375         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2376         if (ret != PAGE_SIZE) {
2377                 if (sbio->page_count < 1) {
2378                         bio_put(sbio->bio);
2379                         sbio->bio = NULL;
2380                         return -EIO;
2381                 }
2382                 scrub_submit(sctx);
2383                 goto again;
2384         }
2385
2386         scrub_block_get(sblock); /* one for the page added to the bio */
2387         atomic_inc(&sblock->outstanding_pages);
2388         sbio->page_count++;
2389         if (sbio->page_count == sctx->pages_per_rd_bio)
2390                 scrub_submit(sctx);
2391
2392         return 0;
2393 }
2394
2395 static void scrub_missing_raid56_end_io(struct bio *bio)
2396 {
2397         struct scrub_block *sblock = bio->bi_private;
2398         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2399
2400         if (bio->bi_status)
2401                 sblock->no_io_error_seen = 0;
2402
2403         bio_put(bio);
2404
2405         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2406 }
2407
2408 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2409 {
2410         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2411         struct scrub_ctx *sctx = sblock->sctx;
2412         struct btrfs_fs_info *fs_info = sctx->fs_info;
2413         u64 logical;
2414         struct btrfs_device *dev;
2415
2416         logical = sblock->pagev[0]->logical;
2417         dev = sblock->pagev[0]->dev;
2418
2419         if (sblock->no_io_error_seen)
2420                 scrub_recheck_block_checksum(sblock);
2421
2422         if (!sblock->no_io_error_seen) {
2423                 spin_lock(&sctx->stat_lock);
2424                 sctx->stat.read_errors++;
2425                 spin_unlock(&sctx->stat_lock);
2426                 btrfs_err_rl_in_rcu(fs_info,
2427                         "IO error rebuilding logical %llu for dev %s",
2428                         logical, rcu_str_deref(dev->name));
2429         } else if (sblock->header_error || sblock->checksum_error) {
2430                 spin_lock(&sctx->stat_lock);
2431                 sctx->stat.uncorrectable_errors++;
2432                 spin_unlock(&sctx->stat_lock);
2433                 btrfs_err_rl_in_rcu(fs_info,
2434                         "failed to rebuild valid logical %llu for dev %s",
2435                         logical, rcu_str_deref(dev->name));
2436         } else {
2437                 scrub_write_block_to_dev_replace(sblock);
2438         }
2439
2440         scrub_block_put(sblock);
2441
2442         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2443                 mutex_lock(&sctx->wr_lock);
2444                 scrub_wr_submit(sctx);
2445                 mutex_unlock(&sctx->wr_lock);
2446         }
2447
2448         scrub_pending_bio_dec(sctx);
2449 }
2450
2451 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2452 {
2453         struct scrub_ctx *sctx = sblock->sctx;
2454         struct btrfs_fs_info *fs_info = sctx->fs_info;
2455         u64 length = sblock->page_count * PAGE_SIZE;
2456         u64 logical = sblock->pagev[0]->logical;
2457         struct btrfs_bio *bbio = NULL;
2458         struct bio *bio;
2459         struct btrfs_raid_bio *rbio;
2460         int ret;
2461         int i;
2462
2463         btrfs_bio_counter_inc_blocked(fs_info);
2464         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2465                         &length, &bbio);
2466         if (ret || !bbio || !bbio->raid_map)
2467                 goto bbio_out;
2468
2469         if (WARN_ON(!sctx->is_dev_replace ||
2470                     !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2471                 /*
2472                  * We shouldn't be scrubbing a missing device. Even for dev
2473                  * replace, we should only get here for RAID 5/6. We either
2474                  * managed to mount something with no mirrors remaining or
2475                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2476                  */
2477                 goto bbio_out;
2478         }
2479
2480         bio = btrfs_io_bio_alloc(0);
2481         bio->bi_iter.bi_sector = logical >> 9;
2482         bio->bi_private = sblock;
2483         bio->bi_end_io = scrub_missing_raid56_end_io;
2484
2485         rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2486         if (!rbio)
2487                 goto rbio_out;
2488
2489         for (i = 0; i < sblock->page_count; i++) {
2490                 struct scrub_page *spage = sblock->pagev[i];
2491
2492                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2493         }
2494
2495         btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2496                         scrub_missing_raid56_worker, NULL, NULL);
2497         scrub_block_get(sblock);
2498         scrub_pending_bio_inc(sctx);
2499         raid56_submit_missing_rbio(rbio);
2500         return;
2501
2502 rbio_out:
2503         bio_put(bio);
2504 bbio_out:
2505         btrfs_bio_counter_dec(fs_info);
2506         btrfs_put_bbio(bbio);
2507         spin_lock(&sctx->stat_lock);
2508         sctx->stat.malloc_errors++;
2509         spin_unlock(&sctx->stat_lock);
2510 }
2511
2512 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2513                        u64 physical, struct btrfs_device *dev, u64 flags,
2514                        u64 gen, int mirror_num, u8 *csum, int force,
2515                        u64 physical_for_dev_replace)
2516 {
2517         struct scrub_block *sblock;
2518         int index;
2519
2520         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2521         if (!sblock) {
2522                 spin_lock(&sctx->stat_lock);
2523                 sctx->stat.malloc_errors++;
2524                 spin_unlock(&sctx->stat_lock);
2525                 return -ENOMEM;
2526         }
2527
2528         /* one ref inside this function, plus one for each page added to
2529          * a bio later on */
2530         refcount_set(&sblock->refs, 1);
2531         sblock->sctx = sctx;
2532         sblock->no_io_error_seen = 1;
2533
2534         for (index = 0; len > 0; index++) {
2535                 struct scrub_page *spage;
2536                 u64 l = min_t(u64, len, PAGE_SIZE);
2537
2538                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2539                 if (!spage) {
2540 leave_nomem:
2541                         spin_lock(&sctx->stat_lock);
2542                         sctx->stat.malloc_errors++;
2543                         spin_unlock(&sctx->stat_lock);
2544                         scrub_block_put(sblock);
2545                         return -ENOMEM;
2546                 }
2547                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2548                 scrub_page_get(spage);
2549                 sblock->pagev[index] = spage;
2550                 spage->sblock = sblock;
2551                 spage->dev = dev;
2552                 spage->flags = flags;
2553                 spage->generation = gen;
2554                 spage->logical = logical;
2555                 spage->physical = physical;
2556                 spage->physical_for_dev_replace = physical_for_dev_replace;
2557                 spage->mirror_num = mirror_num;
2558                 if (csum) {
2559                         spage->have_csum = 1;
2560                         memcpy(spage->csum, csum, sctx->csum_size);
2561                 } else {
2562                         spage->have_csum = 0;
2563                 }
2564                 sblock->page_count++;
2565                 spage->page = alloc_page(GFP_KERNEL);
2566                 if (!spage->page)
2567                         goto leave_nomem;
2568                 len -= l;
2569                 logical += l;
2570                 physical += l;
2571                 physical_for_dev_replace += l;
2572         }
2573
2574         WARN_ON(sblock->page_count == 0);
2575         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2576                 /*
2577                  * This case should only be hit for RAID 5/6 device replace. See
2578                  * the comment in scrub_missing_raid56_pages() for details.
2579                  */
2580                 scrub_missing_raid56_pages(sblock);
2581         } else {
2582                 for (index = 0; index < sblock->page_count; index++) {
2583                         struct scrub_page *spage = sblock->pagev[index];
2584                         int ret;
2585
2586                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2587                         if (ret) {
2588                                 scrub_block_put(sblock);
2589                                 return ret;
2590                         }
2591                 }
2592
2593                 if (force)
2594                         scrub_submit(sctx);
2595         }
2596
2597         /* last one frees, either here or in bio completion for last page */
2598         scrub_block_put(sblock);
2599         return 0;
2600 }
2601
2602 static void scrub_bio_end_io(struct bio *bio)
2603 {
2604         struct scrub_bio *sbio = bio->bi_private;
2605         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2606
2607         sbio->status = bio->bi_status;
2608         sbio->bio = bio;
2609
2610         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2611 }
2612
2613 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2614 {
2615         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2616         struct scrub_ctx *sctx = sbio->sctx;
2617         int i;
2618
2619         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2620         if (sbio->status) {
2621                 for (i = 0; i < sbio->page_count; i++) {
2622                         struct scrub_page *spage = sbio->pagev[i];
2623
2624                         spage->io_error = 1;
2625                         spage->sblock->no_io_error_seen = 0;
2626                 }
2627         }
2628
2629         /* now complete the scrub_block items that have all pages completed */
2630         for (i = 0; i < sbio->page_count; i++) {
2631                 struct scrub_page *spage = sbio->pagev[i];
2632                 struct scrub_block *sblock = spage->sblock;
2633
2634                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2635                         scrub_block_complete(sblock);
2636                 scrub_block_put(sblock);
2637         }
2638
2639         bio_put(sbio->bio);
2640         sbio->bio = NULL;
2641         spin_lock(&sctx->list_lock);
2642         sbio->next_free = sctx->first_free;
2643         sctx->first_free = sbio->index;
2644         spin_unlock(&sctx->list_lock);
2645
2646         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2647                 mutex_lock(&sctx->wr_lock);
2648                 scrub_wr_submit(sctx);
2649                 mutex_unlock(&sctx->wr_lock);
2650         }
2651
2652         scrub_pending_bio_dec(sctx);
2653 }
2654
2655 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2656                                        unsigned long *bitmap,
2657                                        u64 start, u64 len)
2658 {
2659         u64 offset;
2660         u64 nsectors64;
2661         u32 nsectors;
2662         int sectorsize = sparity->sctx->fs_info->sectorsize;
2663
2664         if (len >= sparity->stripe_len) {
2665                 bitmap_set(bitmap, 0, sparity->nsectors);
2666                 return;
2667         }
2668
2669         start -= sparity->logic_start;
2670         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2671         offset = div_u64(offset, sectorsize);
2672         nsectors64 = div_u64(len, sectorsize);
2673
2674         ASSERT(nsectors64 < UINT_MAX);
2675         nsectors = (u32)nsectors64;
2676
2677         if (offset + nsectors <= sparity->nsectors) {
2678                 bitmap_set(bitmap, offset, nsectors);
2679                 return;
2680         }
2681
2682         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2683         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2684 }
2685
2686 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2687                                                    u64 start, u64 len)
2688 {
2689         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2690 }
2691
2692 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2693                                                   u64 start, u64 len)
2694 {
2695         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2696 }
2697
2698 static void scrub_block_complete(struct scrub_block *sblock)
2699 {
2700         int corrupted = 0;
2701
2702         if (!sblock->no_io_error_seen) {
2703                 corrupted = 1;
2704                 scrub_handle_errored_block(sblock);
2705         } else {
2706                 /*
2707                  * if has checksum error, write via repair mechanism in
2708                  * dev replace case, otherwise write here in dev replace
2709                  * case.
2710                  */
2711                 corrupted = scrub_checksum(sblock);
2712                 if (!corrupted && sblock->sctx->is_dev_replace)
2713                         scrub_write_block_to_dev_replace(sblock);
2714         }
2715
2716         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2717                 u64 start = sblock->pagev[0]->logical;
2718                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2719                           PAGE_SIZE;
2720
2721                 scrub_parity_mark_sectors_error(sblock->sparity,
2722                                                 start, end - start);
2723         }
2724 }
2725
2726 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2727 {
2728         struct btrfs_ordered_sum *sum = NULL;
2729         unsigned long index;
2730         unsigned long num_sectors;
2731
2732         while (!list_empty(&sctx->csum_list)) {
2733                 sum = list_first_entry(&sctx->csum_list,
2734                                        struct btrfs_ordered_sum, list);
2735                 if (sum->bytenr > logical)
2736                         return 0;
2737                 if (sum->bytenr + sum->len > logical)
2738                         break;
2739
2740                 ++sctx->stat.csum_discards;
2741                 list_del(&sum->list);
2742                 kfree(sum);
2743                 sum = NULL;
2744         }
2745         if (!sum)
2746                 return 0;
2747
2748         index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2749         ASSERT(index < UINT_MAX);
2750
2751         num_sectors = sum->len / sctx->fs_info->sectorsize;
2752         memcpy(csum, sum->sums + index, sctx->csum_size);
2753         if (index == num_sectors - 1) {
2754                 list_del(&sum->list);
2755                 kfree(sum);
2756         }
2757         return 1;
2758 }
2759
2760 /* scrub extent tries to collect up to 64 kB for each bio */
2761 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2762                         u64 logical, u64 len,
2763                         u64 physical, struct btrfs_device *dev, u64 flags,
2764                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2765 {
2766         int ret;
2767         u8 csum[BTRFS_CSUM_SIZE];
2768         u32 blocksize;
2769
2770         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2771                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2772                         blocksize = map->stripe_len;
2773                 else
2774                         blocksize = sctx->fs_info->sectorsize;
2775                 spin_lock(&sctx->stat_lock);
2776                 sctx->stat.data_extents_scrubbed++;
2777                 sctx->stat.data_bytes_scrubbed += len;
2778                 spin_unlock(&sctx->stat_lock);
2779         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2780                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2781                         blocksize = map->stripe_len;
2782                 else
2783                         blocksize = sctx->fs_info->nodesize;
2784                 spin_lock(&sctx->stat_lock);
2785                 sctx->stat.tree_extents_scrubbed++;
2786                 sctx->stat.tree_bytes_scrubbed += len;
2787                 spin_unlock(&sctx->stat_lock);
2788         } else {
2789                 blocksize = sctx->fs_info->sectorsize;
2790                 WARN_ON(1);
2791         }
2792
2793         while (len) {
2794                 u64 l = min_t(u64, len, blocksize);
2795                 int have_csum = 0;
2796
2797                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2798                         /* push csums to sbio */
2799                         have_csum = scrub_find_csum(sctx, logical, csum);
2800                         if (have_csum == 0)
2801                                 ++sctx->stat.no_csum;
2802                         if (sctx->is_dev_replace && !have_csum) {
2803                                 ret = copy_nocow_pages(sctx, logical, l,
2804                                                        mirror_num,
2805                                                       physical_for_dev_replace);
2806                                 goto behind_scrub_pages;
2807                         }
2808                 }
2809                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2810                                   mirror_num, have_csum ? csum : NULL, 0,
2811                                   physical_for_dev_replace);
2812 behind_scrub_pages:
2813                 if (ret)
2814                         return ret;
2815                 len -= l;
2816                 logical += l;
2817                 physical += l;
2818                 physical_for_dev_replace += l;
2819         }
2820         return 0;
2821 }
2822
2823 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2824                                   u64 logical, u64 len,
2825                                   u64 physical, struct btrfs_device *dev,
2826                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2827 {
2828         struct scrub_ctx *sctx = sparity->sctx;
2829         struct scrub_block *sblock;
2830         int index;
2831
2832         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2833         if (!sblock) {
2834                 spin_lock(&sctx->stat_lock);
2835                 sctx->stat.malloc_errors++;
2836                 spin_unlock(&sctx->stat_lock);
2837                 return -ENOMEM;
2838         }
2839
2840         /* one ref inside this function, plus one for each page added to
2841          * a bio later on */
2842         refcount_set(&sblock->refs, 1);
2843         sblock->sctx = sctx;
2844         sblock->no_io_error_seen = 1;
2845         sblock->sparity = sparity;
2846         scrub_parity_get(sparity);
2847
2848         for (index = 0; len > 0; index++) {
2849                 struct scrub_page *spage;
2850                 u64 l = min_t(u64, len, PAGE_SIZE);
2851
2852                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2853                 if (!spage) {
2854 leave_nomem:
2855                         spin_lock(&sctx->stat_lock);
2856                         sctx->stat.malloc_errors++;
2857                         spin_unlock(&sctx->stat_lock);
2858                         scrub_block_put(sblock);
2859                         return -ENOMEM;
2860                 }
2861                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2862                 /* For scrub block */
2863                 scrub_page_get(spage);
2864                 sblock->pagev[index] = spage;
2865                 /* For scrub parity */
2866                 scrub_page_get(spage);
2867                 list_add_tail(&spage->list, &sparity->spages);
2868                 spage->sblock = sblock;
2869                 spage->dev = dev;
2870                 spage->flags = flags;
2871                 spage->generation = gen;
2872                 spage->logical = logical;
2873                 spage->physical = physical;
2874                 spage->mirror_num = mirror_num;
2875                 if (csum) {
2876                         spage->have_csum = 1;
2877                         memcpy(spage->csum, csum, sctx->csum_size);
2878                 } else {
2879                         spage->have_csum = 0;
2880                 }
2881                 sblock->page_count++;
2882                 spage->page = alloc_page(GFP_KERNEL);
2883                 if (!spage->page)
2884                         goto leave_nomem;
2885                 len -= l;
2886                 logical += l;
2887                 physical += l;
2888         }
2889
2890         WARN_ON(sblock->page_count == 0);
2891         for (index = 0; index < sblock->page_count; index++) {
2892                 struct scrub_page *spage = sblock->pagev[index];
2893                 int ret;
2894
2895                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2896                 if (ret) {
2897                         scrub_block_put(sblock);
2898                         return ret;
2899                 }
2900         }
2901
2902         /* last one frees, either here or in bio completion for last page */
2903         scrub_block_put(sblock);
2904         return 0;
2905 }
2906
2907 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2908                                    u64 logical, u64 len,
2909                                    u64 physical, struct btrfs_device *dev,
2910                                    u64 flags, u64 gen, int mirror_num)
2911 {
2912         struct scrub_ctx *sctx = sparity->sctx;
2913         int ret;
2914         u8 csum[BTRFS_CSUM_SIZE];
2915         u32 blocksize;
2916
2917         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2918                 scrub_parity_mark_sectors_error(sparity, logical, len);
2919                 return 0;
2920         }
2921
2922         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2923                 blocksize = sparity->stripe_len;
2924         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2925                 blocksize = sparity->stripe_len;
2926         } else {
2927                 blocksize = sctx->fs_info->sectorsize;
2928                 WARN_ON(1);
2929         }
2930
2931         while (len) {
2932                 u64 l = min_t(u64, len, blocksize);
2933                 int have_csum = 0;
2934
2935                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2936                         /* push csums to sbio */
2937                         have_csum = scrub_find_csum(sctx, logical, csum);
2938                         if (have_csum == 0)
2939                                 goto skip;
2940                 }
2941                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2942                                              flags, gen, mirror_num,
2943                                              have_csum ? csum : NULL);
2944                 if (ret)
2945                         return ret;
2946 skip:
2947                 len -= l;
2948                 logical += l;
2949                 physical += l;
2950         }
2951         return 0;
2952 }
2953
2954 /*
2955  * Given a physical address, this will calculate it's
2956  * logical offset. if this is a parity stripe, it will return
2957  * the most left data stripe's logical offset.
2958  *
2959  * return 0 if it is a data stripe, 1 means parity stripe.
2960  */
2961 static int get_raid56_logic_offset(u64 physical, int num,
2962                                    struct map_lookup *map, u64 *offset,
2963                                    u64 *stripe_start)
2964 {
2965         int i;
2966         int j = 0;
2967         u64 stripe_nr;
2968         u64 last_offset;
2969         u32 stripe_index;
2970         u32 rot;
2971
2972         last_offset = (physical - map->stripes[num].physical) *
2973                       nr_data_stripes(map);
2974         if (stripe_start)
2975                 *stripe_start = last_offset;
2976
2977         *offset = last_offset;
2978         for (i = 0; i < nr_data_stripes(map); i++) {
2979                 *offset = last_offset + i * map->stripe_len;
2980
2981                 stripe_nr = div64_u64(*offset, map->stripe_len);
2982                 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2983
2984                 /* Work out the disk rotation on this stripe-set */
2985                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2986                 /* calculate which stripe this data locates */
2987                 rot += i;
2988                 stripe_index = rot % map->num_stripes;
2989                 if (stripe_index == num)
2990                         return 0;
2991                 if (stripe_index < num)
2992                         j++;
2993         }
2994         *offset = last_offset + j * map->stripe_len;
2995         return 1;
2996 }
2997
2998 static void scrub_free_parity(struct scrub_parity *sparity)
2999 {
3000         struct scrub_ctx *sctx = sparity->sctx;
3001         struct scrub_page *curr, *next;
3002         int nbits;
3003
3004         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
3005         if (nbits) {
3006                 spin_lock(&sctx->stat_lock);
3007                 sctx->stat.read_errors += nbits;
3008                 sctx->stat.uncorrectable_errors += nbits;
3009                 spin_unlock(&sctx->stat_lock);
3010         }
3011
3012         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
3013                 list_del_init(&curr->list);
3014                 scrub_page_put(curr);
3015         }
3016
3017         kfree(sparity);
3018 }
3019
3020 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
3021 {
3022         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
3023                                                     work);
3024         struct scrub_ctx *sctx = sparity->sctx;
3025
3026         scrub_free_parity(sparity);
3027         scrub_pending_bio_dec(sctx);
3028 }
3029
3030 static void scrub_parity_bio_endio(struct bio *bio)
3031 {
3032         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
3033         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
3034
3035         if (bio->bi_status)
3036                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3037                           sparity->nsectors);
3038
3039         bio_put(bio);
3040
3041         btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
3042                         scrub_parity_bio_endio_worker, NULL, NULL);
3043         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
3044 }
3045
3046 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3047 {
3048         struct scrub_ctx *sctx = sparity->sctx;
3049         struct btrfs_fs_info *fs_info = sctx->fs_info;
3050         struct bio *bio;
3051         struct btrfs_raid_bio *rbio;
3052         struct btrfs_bio *bbio = NULL;
3053         u64 length;
3054         int ret;
3055
3056         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
3057                            sparity->nsectors))
3058                 goto out;
3059
3060         length = sparity->logic_end - sparity->logic_start;
3061
3062         btrfs_bio_counter_inc_blocked(fs_info);
3063         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
3064                                &length, &bbio);
3065         if (ret || !bbio || !bbio->raid_map)
3066                 goto bbio_out;
3067
3068         bio = btrfs_io_bio_alloc(0);
3069         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3070         bio->bi_private = sparity;
3071         bio->bi_end_io = scrub_parity_bio_endio;
3072
3073         rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
3074                                               length, sparity->scrub_dev,
3075                                               sparity->dbitmap,
3076                                               sparity->nsectors);
3077         if (!rbio)
3078                 goto rbio_out;
3079
3080         scrub_pending_bio_inc(sctx);
3081         raid56_parity_submit_scrub_rbio(rbio);
3082         return;
3083
3084 rbio_out:
3085         bio_put(bio);
3086 bbio_out:
3087         btrfs_bio_counter_dec(fs_info);
3088         btrfs_put_bbio(bbio);
3089         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3090                   sparity->nsectors);
3091         spin_lock(&sctx->stat_lock);
3092         sctx->stat.malloc_errors++;
3093         spin_unlock(&sctx->stat_lock);
3094 out:
3095         scrub_free_parity(sparity);
3096 }
3097
3098 static inline int scrub_calc_parity_bitmap_len(int nsectors)
3099 {
3100         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
3101 }
3102
3103 static void scrub_parity_get(struct scrub_parity *sparity)
3104 {
3105         refcount_inc(&sparity->refs);
3106 }
3107
3108 static void scrub_parity_put(struct scrub_parity *sparity)
3109 {
3110         if (!refcount_dec_and_test(&sparity->refs))
3111                 return;
3112
3113         scrub_parity_check_and_repair(sparity);
3114 }
3115
/*
 * Scrub the data covered by one RAID5/6 parity stripe.
 *
 * Allocates a scrub_parity tracker with a data bitmap and an error bitmap,
 * then walks the extent tree for every extent overlapping
 * [logic_start, logic_end). Each extent is cut into pieces that fit one
 * stripe_len window; every piece is marked in the data bitmap, mapped to a
 * device, has its checksums looked up and is handed to
 * scrub_extent_for_parity().
 *
 * @sctx:        scrub context; queued read and write bios are submitted
 *               before returning
 * @map:         chunk mapping; only stripe_len is read here
 * @sdev:        stored as the parity tracker's scrub_dev
 * @path:        path used for the extent tree searches; released on return
 * @logic_start: first logical address to cover
 * @logic_end:   logical address at which to stop (exclusive)
 *
 * Returns 0 on success or a negative errno. On a negative error the range
 * from the current position up to @logic_end is marked in the error bitmap.
 */
3116 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3117                                                   struct map_lookup *map,
3118                                                   struct btrfs_device *sdev,
3119                                                   struct btrfs_path *path,
3120                                                   u64 logic_start,
3121                                                   u64 logic_end)
3122 {
3123         struct btrfs_fs_info *fs_info = sctx->fs_info;
3124         struct btrfs_root *root = fs_info->extent_root;
3125         struct btrfs_root *csum_root = fs_info->csum_root;
3126         struct btrfs_extent_item *extent;
3127         struct btrfs_bio *bbio = NULL;
3128         u64 flags;
3129         int ret;
3130         int slot;
3131         struct extent_buffer *l;
3132         struct btrfs_key key;
3133         u64 generation;
3134         u64 extent_logical;
3135         u64 extent_physical;
3136         u64 extent_len;
3137         u64 mapped_length;
3138         struct btrfs_device *extent_dev;
3139         struct scrub_parity *sparity;
3140         int nsectors;
3141         int bitmap_len;
3142         int extent_mirror_num;
3143         int stop_loop = 0;
3144
             /* one bit per sector of a stripe; room for two bitmaps is allocated */
3145         nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
3146         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3147         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3148                           GFP_NOFS);
3149         if (!sparity) {
3150                 spin_lock(&sctx->stat_lock);
3151                 sctx->stat.malloc_errors++;
3152                 spin_unlock(&sctx->stat_lock);
3153                 return -ENOMEM;
3154         }
3155
3156         sparity->stripe_len = map->stripe_len;
3157         sparity->nsectors = nsectors;
3158         sparity->sctx = sctx;
3159         sparity->scrub_dev = sdev;
3160         sparity->logic_start = logic_start;
3161         sparity->logic_end = logic_end;
             /* initial reference, dropped by scrub_parity_put() at out: */
3162         refcount_set(&sparity->refs, 1);
3163         INIT_LIST_HEAD(&sparity->spages);
             /* ebitmap starts bitmap_len bytes after dbitmap in the same allocation */
3164         sparity->dbitmap = sparity->bitmap;
3165         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3166
3167         ret = 0;
3168         while (logic_start < logic_end) {
3169                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3170                         key.type = BTRFS_METADATA_ITEM_KEY;
3171                 else
3172                         key.type = BTRFS_EXTENT_ITEM_KEY;
3173                 key.objectid = logic_start;
3174                 key.offset = (u64)-1;
3175
3176                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3177                 if (ret < 0)
3178                         goto out;
3179
3180                 if (ret > 0) {
                             /*
                              * no exact match: try to step back to the previous
                              * extent item; if there is none, search again and
                              * continue from the slot that search returns
                              */
3181                         ret = btrfs_previous_extent_item(root, path, 0);
3182                         if (ret < 0)
3183                                 goto out;
3184                         if (ret > 0) {
3185                                 btrfs_release_path(path);
3186                                 ret = btrfs_search_slot(NULL, root, &key,
3187                                                         path, 0, 0);
3188                                 if (ret < 0)
3189                                         goto out;
3190                         }
3191                 }
3192
3193                 stop_loop = 0;
3194                 while (1) {
3195                         u64 bytes;
3196
3197                         l = path->nodes[0];
3198                         slot = path->slots[0];
3199                         if (slot >= btrfs_header_nritems(l)) {
3200                                 ret = btrfs_next_leaf(root, path);
3201                                 if (ret == 0)
3202                                         continue;
3203                                 if (ret < 0)
3204                                         goto out;
3205
                                     /* no further leaf: the walk is finished */
3206                                 stop_loop = 1;
3207                                 break;
3208                         }
3209                         btrfs_item_key_to_cpu(l, &key, slot);
3210
3211                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3212                             key.type != BTRFS_METADATA_ITEM_KEY)
3213                                 goto next;
3214
                             /*
                              * metadata items are taken to span nodesize bytes,
                              * extent items carry their length in key.offset
                              */
3215                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3216                                 bytes = fs_info->nodesize;
3217                         else
3218                                 bytes = key.offset;
3219
                             /* extent ends at or before the current position */
3220                         if (key.objectid + bytes <= logic_start)
3221                                 goto next;
3222
                             /* extent starts at or after the end of the range */
3223                         if (key.objectid >= logic_end) {
3224                                 stop_loop = 1;
3225                                 break;
3226                         }
3227
                             /* advance to the stripe_len window holding the extent start */
3228                         while (key.objectid >= logic_start + map->stripe_len)
3229                                 logic_start += map->stripe_len;
3230
3231                         extent = btrfs_item_ptr(l, slot,
3232                                                 struct btrfs_extent_item);
3233                         flags = btrfs_extent_flags(l, extent);
3234                         generation = btrfs_extent_generation(l, extent);
3235
                             /*
                              * a tree block that is not fully inside the current
                              * window is counted as uncorrectable and skipped
                              */
3236                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3237                             (key.objectid < logic_start ||
3238                              key.objectid + bytes >
3239                              logic_start + map->stripe_len)) {
3240                                 btrfs_err(fs_info,
3241                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3242                                           key.objectid, logic_start);
3243                                 spin_lock(&sctx->stat_lock);
3244                                 sctx->stat.uncorrectable_errors++;
3245                                 spin_unlock(&sctx->stat_lock);
3246                                 goto next;
3247                         }
3248 again:
                             /* trim the extent to the current stripe_len window */
3249                         extent_logical = key.objectid;
3250                         extent_len = bytes;
3251
3252                         if (extent_logical < logic_start) {
3253                                 extent_len -= logic_start - extent_logical;
3254                                 extent_logical = logic_start;
3255                         }
3256
3257                         if (extent_logical + extent_len >
3258                             logic_start + map->stripe_len)
3259                                 extent_len = logic_start + map->stripe_len -
3260                                              extent_logical;
3261
3262                         scrub_parity_mark_sectors_data(sparity, extent_logical,
3263                                                        extent_len);
3264
3265                         mapped_length = extent_len;
3266                         bbio = NULL;
3267                         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3268                                         extent_logical, &mapped_length, &bbio,
3269                                         0);
                             /* a missing or short mapping is treated as an I/O error */
3270                         if (!ret) {
3271                                 if (!bbio || mapped_length < extent_len)
3272                                         ret = -EIO;
3273                         }
3274                         if (ret) {
3275                                 btrfs_put_bbio(bbio);
3276                                 goto out;
3277                         }
3278                         extent_physical = bbio->stripes[0].physical;
3279                         extent_mirror_num = bbio->mirror_num;
3280                         extent_dev = bbio->stripes[0].dev;
3281                         btrfs_put_bbio(bbio);
3282
3283                         ret = btrfs_lookup_csums_range(csum_root,
3284                                                 extent_logical,
3285                                                 extent_logical + extent_len - 1,
3286                                                 &sctx->csum_list, 1);
3287                         if (ret)
3288                                 goto out;
3289
3290                         ret = scrub_extent_for_parity(sparity, extent_logical,
3291                                                       extent_len,
3292                                                       extent_physical,
3293                                                       extent_dev, flags,
3294                                                       generation,
3295                                                       extent_mirror_num);
3296
3297                         scrub_free_csums(sctx);
3298
3299                         if (ret)
3300                                 goto out;
3301
                             /*
                              * the extent continues past this window: move to the
                              * next window and handle the remainder
                              */
3302                         if (extent_logical + extent_len <
3303                             key.objectid + bytes) {
3304                                 logic_start += map->stripe_len;
3305
3306                                 if (logic_start >= logic_end) {
3307                                         stop_loop = 1;
3308                                         break;
3309                                 }
3310
3311                                 if (logic_start < key.objectid + bytes) {
3312                                         cond_resched();
3313                                         goto again;
3314                                 }
3315                         }
3316 next:
3317                         path->slots[0]++;
3318                 }
3319
3320                 btrfs_release_path(path);
3321
3322                 if (stop_loop)
3323                         break;
3324
3325                 logic_start += map->stripe_len;
3326         }
3327 out:
             /* on error, flag everything from the current position onwards */
3328         if (ret < 0)
3329                 scrub_parity_mark_sectors_error(sparity, logic_start,
3330                                                 logic_end - logic_start);
             /* if this was the last reference, check and repair starts here */
3331         scrub_parity_put(sparity);
3332         scrub_submit(sctx);
3333         mutex_lock(&sctx->wr_lock);
3334         scrub_wr_submit(sctx);
3335         mutex_unlock(&sctx->wr_lock);
3336
3337         btrfs_release_path(path);
3338         return ret < 0 ? ret : 0;
3339 }
3340
/*
 * Scrub the portion of a chunk that lives on one stripe of one device.
 *
 * Walks the device range in stripe_len steps. For each step the matching
 * logical range is looked up in the extent tree; every extent overlapping
 * it is trimmed to the step, its checksums are collected and it is passed
 * to scrub_extent(). For RAID5/6 chunks, steps that
 * get_raid56_logic_offset() reports as parity are handed to
 * scrub_raid56_parity() instead.
 *
 * @sctx:           scrub context
 * @map:            chunk mapping (profile type, stripe_len, stripes[])
 * @scrub_dev:      device being scrubbed
 * @num:            index of the stripe in @map->stripes to scrub
 * @base:           logical address added to the per-stripe offsets to form
 *                  logical addresses
 * @length:         number of bytes to scrub, starting at
 *                  @map->stripes[@num].physical
 * @is_dev_replace: when non-zero, each extent is passed through
 *                  scrub_remap_extent() before being scrubbed
 *
 * Returns 0 on success, -ENOMEM if a path cannot be allocated, -ECANCELED
 * if a cancel was requested, or another negative errno from the helpers.
 */
3341 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3342                                            struct map_lookup *map,
3343                                            struct btrfs_device *scrub_dev,
3344                                            int num, u64 base, u64 length,
3345                                            int is_dev_replace)
3346 {
3347         struct btrfs_path *path, *ppath;
3348         struct btrfs_fs_info *fs_info = sctx->fs_info;
3349         struct btrfs_root *root = fs_info->extent_root;
3350         struct btrfs_root *csum_root = fs_info->csum_root;
3351         struct btrfs_extent_item *extent;
3352         struct blk_plug plug;
3353         u64 flags;
3354         int ret;
3355         int slot;
3356         u64 nstripes;
3357         struct extent_buffer *l;
3358         u64 physical;
3359         u64 logical;
3360         u64 logic_end;
3361         u64 physical_end;
3362         u64 generation;
3363         int mirror_num;
3364         struct reada_control *reada1;
3365         struct reada_control *reada2;
3366         struct btrfs_key key;
3367         struct btrfs_key key_end;
3368         u64 increment = map->stripe_len;
3369         u64 offset;
3370         u64 extent_logical;
3371         u64 extent_physical;
3372         u64 extent_len;
3373         u64 stripe_logical;
3374         u64 stripe_end;
3375         struct btrfs_device *extent_dev;
3376         int extent_mirror_num;
3377         int stop_loop = 0;
3378
             /*
              * Per-profile layout: offset is added to base to get the first
              * logical address on this stripe, increment is the logical
              * distance between two consecutive stripe_len steps on this
              * device.
              */
3379         physical = map->stripes[num].physical;
3380         offset = 0;
3381         nstripes = div64_u64(length, map->stripe_len);
3382         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3383                 offset = map->stripe_len * num;
3384                 increment = map->stripe_len * map->num_stripes;
3385                 mirror_num = 1;
3386         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3387                 int factor = map->num_stripes / map->sub_stripes;
3388                 offset = map->stripe_len * (num / map->sub_stripes);
3389                 increment = map->stripe_len * factor;
3390                 mirror_num = num % map->sub_stripes + 1;
3391         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3392                 increment = map->stripe_len;
3393                 mirror_num = num % map->num_stripes + 1;
3394         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3395                 increment = map->stripe_len;
3396                 mirror_num = num % map->num_stripes + 1;
3397         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3398                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3399                 increment = map->stripe_len * nr_data_stripes(map);
3400                 mirror_num = 1;
3401         } else {
3402                 increment = map->stripe_len;
3403                 mirror_num = 1;
3404         }
3405
3406         path = btrfs_alloc_path();
3407         if (!path)
3408                 return -ENOMEM;
3409
             /* second path, handed to scrub_raid56_parity() for its own searches */
3410         ppath = btrfs_alloc_path();
3411         if (!ppath) {
3412                 btrfs_free_path(path);
3413                 return -ENOMEM;
3414         }
3415
3416         /*
3417          * work on commit root. The related disk blocks are static as
3418          * long as COW is applied. This means, it is safe to rewrite
3419          * them to repair disk errors without any race conditions
3420          */
3421         path->search_commit_root = 1;
3422         path->skip_locking = 1;
3423
3424         ppath->search_commit_root = 1;
3425         ppath->skip_locking = 1;
3426         /*
3427          * trigger the readahead for extent tree csum tree and wait for
3428          * completion. During readahead, the scrub is officially paused
3429          * to not hold off transaction commits
3430          */
3431         logical = base + offset;
3432         physical_end = physical + nstripes * map->stripe_len;
3433         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3434                 get_raid56_logic_offset(physical_end, num,
3435                                         map, &logic_end, NULL);
3436                 logic_end += base;
3437         } else {
3438                 logic_end = logical + increment * nstripes;
3439         }
             /* wait until no bios of this context are in flight */
3440         wait_event(sctx->list_wait,
3441                    atomic_read(&sctx->bios_in_flight) == 0);
3442         scrub_blocked_if_needed(fs_info);
3443
3444         /* FIXME it might be better to start readahead at commit root */
3445         key.objectid = logical;
3446         key.type = BTRFS_EXTENT_ITEM_KEY;
3447         key.offset = (u64)0;
3448         key_end.objectid = logic_end;
3449         key_end.type = BTRFS_METADATA_ITEM_KEY;
3450         key_end.offset = (u64)-1;
3451         reada1 = btrfs_reada_add(root, &key, &key_end);
3452
3453         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3454         key.type = BTRFS_EXTENT_CSUM_KEY;
3455         key.offset = logical;
3456         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3457         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3458         key_end.offset = logic_end;
3459         reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3460
             /* a failed readahead setup is not fatal; it is simply not waited for */
3461         if (!IS_ERR(reada1))
3462                 btrfs_reada_wait(reada1);
3463         if (!IS_ERR(reada2))
3464                 btrfs_reada_wait(reada2);
3465
3466
3467         /*
3468          * collect all data csums for the stripe to avoid seeking during
3469          * the scrub. This might currently (crc32) end up to be about 1MB
3470          */
3471         blk_start_plug(&plug);
3472
3473         /*
3474          * now find all extents for each stripe and scrub them
3475          */
3476         ret = 0;
3477         while (physical < physical_end) {
3478                 /*
3479                  * canceled?
3480                  */
3481                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3482                     atomic_read(&sctx->cancel_req)) {
3483                         ret = -ECANCELED;
3484                         goto out;
3485                 }
3486                 /*
3487                  * check to see if we have to pause
3488                  */
3489                 if (atomic_read(&fs_info->scrub_pause_req)) {
3490                         /* push queued extents */
3491                         sctx->flush_all_writes = true;
3492                         scrub_submit(sctx);
3493                         mutex_lock(&sctx->wr_lock);
3494                         scrub_wr_submit(sctx);
3495                         mutex_unlock(&sctx->wr_lock);
3496                         wait_event(sctx->list_wait,
3497                                    atomic_read(&sctx->bios_in_flight) == 0);
3498                         sctx->flush_all_writes = false;
3499                         scrub_blocked_if_needed(fs_info);
3500                 }
3501
3502                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3503                         ret = get_raid56_logic_offset(physical, num, map,
3504                                                       &logical,
3505                                                       &stripe_logical);
3506                         logical += base;
3507                         if (ret) {
3508                                 /* it is a parity stripe */
3509                                 stripe_logical += base;
3510                                 stripe_end = stripe_logical + increment;
3511                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3512                                                           ppath, stripe_logical,
3513                                                           stripe_end);
3514                                 if (ret)
3515                                         goto out;
3516                                 goto skip;
3517                         }
3518                 }
3519
3520                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3521                         key.type = BTRFS_METADATA_ITEM_KEY;
3522                 else
3523                         key.type = BTRFS_EXTENT_ITEM_KEY;
3524                 key.objectid = logical;
3525                 key.offset = (u64)-1;
3526
3527                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3528                 if (ret < 0)
3529                         goto out;
3530
3531                 if (ret > 0) {
3532                         ret = btrfs_previous_extent_item(root, path, 0);
3533                         if (ret < 0)
3534                                 goto out;
3535                         if (ret > 0) {
3536                                 /* there's no smaller item, so stick with the
3537                                  * larger one */
3538                                 btrfs_release_path(path);
3539                                 ret = btrfs_search_slot(NULL, root, &key,
3540                                                         path, 0, 0);
3541                                 if (ret < 0)
3542                                         goto out;
3543                         }
3544                 }
3545
3546                 stop_loop = 0;
3547                 while (1) {
3548                         u64 bytes;
3549
3550                         l = path->nodes[0];
3551                         slot = path->slots[0];
3552                         if (slot >= btrfs_header_nritems(l)) {
3553                                 ret = btrfs_next_leaf(root, path);
3554                                 if (ret == 0)
3555                                         continue;
3556                                 if (ret < 0)
3557                                         goto out;
3558
                                     /* no further leaf: nothing left to scrub */
3559                                 stop_loop = 1;
3560                                 break;
3561                         }
3562                         btrfs_item_key_to_cpu(l, &key, slot);
3563
3564                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3565                             key.type != BTRFS_METADATA_ITEM_KEY)
3566                                 goto next;
3567
                             /*
                              * metadata items are taken to span nodesize bytes,
                              * extent items carry their length in key.offset
                              */
3568                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3569                                 bytes = fs_info->nodesize;
3570                         else
3571                                 bytes = key.offset;
3572
                             /* extent ends at or before the current step */
3573                         if (key.objectid + bytes <= logical)
3574                                 goto next;
3575
3576                         if (key.objectid >= logical + map->stripe_len) {
3577                                 /* out of this device extent */
3578                                 if (key.objectid >= logic_end)
3579                                         stop_loop = 1;
3580                                 break;
3581                         }
3582
3583                         extent = btrfs_item_ptr(l, slot,
3584                                                 struct btrfs_extent_item);
3585                         flags = btrfs_extent_flags(l, extent);
3586                         generation = btrfs_extent_generation(l, extent);
3587
                             /*
                              * a tree block that is not fully inside the current
                              * step is counted as uncorrectable and skipped
                              */
3588                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3589                             (key.objectid < logical ||
3590                              key.objectid + bytes >
3591                              logical + map->stripe_len)) {
3592                                 btrfs_err(fs_info,
3593                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3594                                        key.objectid, logical);
3595                                 spin_lock(&sctx->stat_lock);
3596                                 sctx->stat.uncorrectable_errors++;
3597                                 spin_unlock(&sctx->stat_lock);
3598                                 goto next;
3599                         }
3600
3601 again:
3602                         extent_logical = key.objectid;
3603                         extent_len = bytes;
3604
3605                         /*
3606                          * trim extent to this stripe
3607                          */
3608                         if (extent_logical < logical) {
3609                                 extent_len -= logical - extent_logical;
3610                                 extent_logical = logical;
3611                         }
3612                         if (extent_logical + extent_len >
3613                             logical + map->stripe_len) {
3614                                 extent_len = logical + map->stripe_len -
3615                                              extent_logical;
3616                         }
3617
                             /*
                              * default target is the scrubbed device itself; for
                              * dev-replace scrub_remap_extent() may change it
                              */
3618                         extent_physical = extent_logical - logical + physical;
3619                         extent_dev = scrub_dev;
3620                         extent_mirror_num = mirror_num;
3621                         if (is_dev_replace)
3622                                 scrub_remap_extent(fs_info, extent_logical,
3623                                                    extent_len, &extent_physical,
3624                                                    &extent_dev,
3625                                                    &extent_mirror_num);
3626
3627                         ret = btrfs_lookup_csums_range(csum_root,
3628                                                        extent_logical,
3629                                                        extent_logical +
3630                                                        extent_len - 1,
3631                                                        &sctx->csum_list, 1);
3632                         if (ret)
3633                                 goto out;
3634
3635                         ret = scrub_extent(sctx, map, extent_logical, extent_len,
3636                                            extent_physical, extent_dev, flags,
3637                                            generation, extent_mirror_num,
3638                                            extent_logical - logical + physical);
3639
3640                         scrub_free_csums(sctx);
3641
3642                         if (ret)
3643                                 goto out;
3644
                             /*
                              * the extent continues past this step: advance to the
                              * next step on this device and handle the remainder
                              */
3645                         if (extent_logical + extent_len <
3646                             key.objectid + bytes) {
3647                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3648                                         /*
3649                                          * loop until we find next data stripe
3650                                          * or we have finished all stripes.
3651                                          */
3652 loop:
3653                                         physical += map->stripe_len;
3654                                         ret = get_raid56_logic_offset(physical,
3655                                                         num, map, &logical,
3656                                                         &stripe_logical);
3657                                         logical += base;
3658
3659                                         if (ret && physical < physical_end) {
3660                                                 stripe_logical += base;
3661                                                 stripe_end = stripe_logical +
3662                                                                 increment;
3663                                                 ret = scrub_raid56_parity(sctx,
3664                                                         map, scrub_dev, ppath,
3665                                                         stripe_logical,
3666                                                         stripe_end);
3667                                                 if (ret)
3668                                                         goto out;
3669                                                 goto loop;
3670                                         }
3671                                 } else {
3672                                         physical += map->stripe_len;
3673                                         logical += increment;
3674                                 }
3675                                 if (logical < key.objectid + bytes) {
3676                                         cond_resched();
3677                                         goto again;
3678                                 }
3679
3680                                 if (physical >= physical_end) {
3681                                         stop_loop = 1;
3682                                         break;
3683                                 }
3684                         }
3685 next:
3686                         path->slots[0]++;
3687                 }
3688                 btrfs_release_path(path);
3689 skip:
                     /* advance to the next step and record the progress made */
3690                 logical += increment;
3691                 physical += map->stripe_len;
3692                 spin_lock(&sctx->stat_lock);
3693                 if (stop_loop)
3694                         sctx->stat.last_physical = map->stripes[num].physical +
3695                                                    length;
3696                 else
3697                         sctx->stat.last_physical = physical;
3698                 spin_unlock(&sctx->stat_lock);
3699                 if (stop_loop)
3700                         break;
3701         }
3702 out:
3703         /* push queued extents */
3704         scrub_submit(sctx);
3705         mutex_lock(&sctx->wr_lock);
3706         scrub_wr_submit(sctx);
3707         mutex_unlock(&sctx->wr_lock);
3708
3709         blk_finish_plug(&plug);
3710         btrfs_free_path(path);
3711         btrfs_free_path(ppath);
             /* positive values left in ret by the helpers are not errors */
3712         return ret < 0 ? ret : 0;
3713 }
3714
3715 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3716                                           struct btrfs_device *scrub_dev,
3717                                           u64 chunk_offset, u64 length,
3718                                           u64 dev_offset,
3719                                           struct btrfs_block_group_cache *cache,
3720                                           int is_dev_replace)
3721 {
3722         struct btrfs_fs_info *fs_info = sctx->fs_info;
3723         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3724         struct map_lookup *map;
3725         struct extent_map *em;
3726         int i;
3727         int ret = 0;
3728
3729         read_lock(&map_tree->map_tree.lock);
3730         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3731         read_unlock(&map_tree->map_tree.lock);
3732
3733         if (!em) {
3734                 /*
3735                  * Might have been an unused block group deleted by the cleaner
3736                  * kthread or relocation.
3737                  */
3738                 spin_lock(&cache->lock);
3739                 if (!cache->removed)
3740                         ret = -EINVAL;
3741                 spin_unlock(&cache->lock);
3742
3743                 return ret;
3744         }
3745
3746         map = em->map_lookup;
3747         if (em->start != chunk_offset)
3748                 goto out;
3749
3750         if (em->len < length)
3751                 goto out;
3752
3753         for (i = 0; i < map->num_stripes; ++i) {
3754                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3755                     map->stripes[i].physical == dev_offset) {
3756                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3757                                            chunk_offset, length,
3758                                            is_dev_replace);
3759                         if (ret)
3760                                 goto out;
3761                 }
3762         }
3763 out:
3764         free_extent_map(em);
3765
3766         return ret;
3767 }
3768
3769 static noinline_for_stack
3770 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3771                            struct btrfs_device *scrub_dev, u64 start, u64 end,
3772                            int is_dev_replace)
3773 {
3774         struct btrfs_dev_extent *dev_extent = NULL;
3775         struct btrfs_path *path;
3776         struct btrfs_fs_info *fs_info = sctx->fs_info;
3777         struct btrfs_root *root = fs_info->dev_root;
3778         u64 length;
3779         u64 chunk_offset;
3780         int ret = 0;
3781         int ro_set;
3782         int slot;
3783         struct extent_buffer *l;
3784         struct btrfs_key key;
3785         struct btrfs_key found_key;
3786         struct btrfs_block_group_cache *cache;
3787         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3788
3789         path = btrfs_alloc_path();
3790         if (!path)
3791                 return -ENOMEM;
3792
3793         path->reada = READA_FORWARD;
3794         path->search_commit_root = 1;
3795         path->skip_locking = 1;
3796
3797         key.objectid = scrub_dev->devid;
3798         key.offset = 0ull;
3799         key.type = BTRFS_DEV_EXTENT_KEY;
3800
3801         while (1) {
3802                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3803                 if (ret < 0)
3804                         break;
3805                 if (ret > 0) {
3806                         if (path->slots[0] >=
3807                             btrfs_header_nritems(path->nodes[0])) {
3808                                 ret = btrfs_next_leaf(root, path);
3809                                 if (ret < 0)
3810                                         break;
3811                                 if (ret > 0) {
3812                                         ret = 0;
3813                                         break;
3814                                 }
3815                         } else {
3816                                 ret = 0;
3817                         }
3818                 }
3819
3820                 l = path->nodes[0];
3821                 slot = path->slots[0];
3822
3823                 btrfs_item_key_to_cpu(l, &found_key, slot);
3824
3825                 if (found_key.objectid != scrub_dev->devid)
3826                         break;
3827
3828                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3829                         break;
3830
3831                 if (found_key.offset >= end)
3832                         break;
3833
3834                 if (found_key.offset < key.offset)
3835                         break;
3836
3837                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3838                 length = btrfs_dev_extent_length(l, dev_extent);
3839
3840                 if (found_key.offset + length <= start)
3841                         goto skip;
3842
3843                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3844
3845                 /*
3846                  * get a reference on the corresponding block group to prevent
3847                  * the chunk from going away while we scrub it
3848                  */
3849                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3850
3851                 /* some chunks are removed but not committed to disk yet,
3852                  * continue scrubbing */
3853                 if (!cache)
3854                         goto skip;
3855
3856                 /*
3857                  * we need call btrfs_inc_block_group_ro() with scrubs_paused,
3858                  * to avoid deadlock caused by:
3859                  * btrfs_inc_block_group_ro()
3860                  * -> btrfs_wait_for_commit()
3861                  * -> btrfs_commit_transaction()
3862                  * -> btrfs_scrub_pause()
3863                  */
3864                 scrub_pause_on(fs_info);
3865                 ret = btrfs_inc_block_group_ro(fs_info, cache);
3866                 if (!ret && is_dev_replace) {
3867                         /*
3868                          * If we are doing a device replace wait for any tasks
3869                          * that started dellaloc right before we set the block
3870                          * group to RO mode, as they might have just allocated
3871                          * an extent from it or decided they could do a nocow
3872                          * write. And if any such tasks did that, wait for their
3873                          * ordered extents to complete and then commit the
3874                          * current transaction, so that we can later see the new
3875                          * extent items in the extent tree - the ordered extents
3876                          * create delayed data references (for cow writes) when
3877                          * they complete, which will be run and insert the
3878                          * corresponding extent items into the extent tree when
3879                          * we commit the transaction they used when running
3880                          * inode.c:btrfs_finish_ordered_io(). We later use
3881                          * the commit root of the extent tree to find extents
3882                          * to copy from the srcdev into the tgtdev, and we don't
3883                          * want to miss any new extents.
3884                          */
3885                         btrfs_wait_block_group_reservations(cache);
3886                         btrfs_wait_nocow_writers(cache);
3887                         ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
3888                                                        cache->key.objectid,
3889                                                        cache->key.offset);
3890                         if (ret > 0) {
3891                                 struct btrfs_trans_handle *trans;
3892
3893                                 trans = btrfs_join_transaction(root);
3894                                 if (IS_ERR(trans))
3895                                         ret = PTR_ERR(trans);
3896                                 else
3897                                         ret = btrfs_commit_transaction(trans);
3898                                 if (ret) {
3899                                         scrub_pause_off(fs_info);
3900                                         btrfs_put_block_group(cache);
3901                                         break;
3902                                 }
3903                         }
3904                 }
3905                 scrub_pause_off(fs_info);
3906
3907                 if (ret == 0) {
3908                         ro_set = 1;
3909                 } else if (ret == -ENOSPC) {
3910                         /*
3911                          * btrfs_inc_block_group_ro return -ENOSPC when it
3912                          * failed in creating new chunk for metadata.
3913                          * It is not a problem for scrub/replace, because
3914                          * metadata are always cowed, and our scrub paused
3915                          * commit_transactions.
3916                          */
3917                         ro_set = 0;
3918                 } else {
3919                         btrfs_warn(fs_info,
3920                                    "failed setting block group ro: %d", ret);
3921                         btrfs_put_block_group(cache);
3922                         break;
3923                 }
3924
3925                 btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3926                 dev_replace->cursor_right = found_key.offset + length;
3927                 dev_replace->cursor_left = found_key.offset;
3928                 dev_replace->item_needs_writeback = 1;
3929                 btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3930                 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3931                                   found_key.offset, cache, is_dev_replace);
3932
3933                 /*
3934                  * flush, submit all pending read and write bios, afterwards
3935                  * wait for them.
3936                  * Note that in the dev replace case, a read request causes
3937                  * write requests that are submitted in the read completion
3938                  * worker. Therefore in the current situation, it is required
3939                  * that all write requests are flushed, so that all read and
3940                  * write requests are really completed when bios_in_flight
3941                  * changes to 0.
3942                  */
3943                 sctx->flush_all_writes = true;
3944                 scrub_submit(sctx);
3945                 mutex_lock(&sctx->wr_lock);
3946                 scrub_wr_submit(sctx);
3947                 mutex_unlock(&sctx->wr_lock);
3948
3949                 wait_event(sctx->list_wait,
3950                            atomic_read(&sctx->bios_in_flight) == 0);
3951
3952                 scrub_pause_on(fs_info);
3953
3954                 /*
3955                  * must be called before we decrease @scrub_paused.
3956                  * make sure we don't block transaction commit while
3957                  * we are waiting pending workers finished.
3958                  */
3959                 wait_event(sctx->list_wait,
3960                            atomic_read(&sctx->workers_pending) == 0);
3961                 sctx->flush_all_writes = false;
3962
3963                 scrub_pause_off(fs_info);
3964
3965                 btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3966                 dev_replace->cursor_left = dev_replace->cursor_right;
3967                 dev_replace->item_needs_writeback = 1;
3968                 btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3969
3970                 if (ro_set)
3971                         btrfs_dec_block_group_ro(cache);
3972
3973                 /*
3974                  * We might have prevented the cleaner kthread from deleting
3975                  * this block group if it was already unused because we raced
3976                  * and set it to RO mode first. So add it back to the unused
3977                  * list, otherwise it might not ever be deleted unless a manual
3978                  * balance is triggered or it becomes used and unused again.
3979                  */
3980                 spin_lock(&cache->lock);
3981                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3982                     btrfs_block_group_used(&cache->item) == 0) {
3983                         spin_unlock(&cache->lock);
3984                         spin_lock(&fs_info->unused_bgs_lock);
3985                         if (list_empty(&cache->bg_list)) {
3986                                 btrfs_get_block_group(cache);
3987                                 trace_btrfs_add_unused_block_group(cache);
3988                                 list_add_tail(&cache->bg_list,
3989                                               &fs_info->unused_bgs);
3990                         }
3991                         spin_unlock(&fs_info->unused_bgs_lock);
3992                 } else {
3993                         spin_unlock(&cache->lock);
3994                 }
3995
3996                 btrfs_put_block_group(cache);
3997                 if (ret)
3998                         break;
3999                 if (is_dev_replace &&
4000                     atomic64_read(&dev_replace->num_write_errors) > 0) {
4001                         ret = -EIO;
4002                         break;
4003                 }
4004                 if (sctx->stat.malloc_errors > 0) {
4005                         ret = -ENOMEM;
4006                         break;
4007                 }
4008 skip:
4009                 key.offset = found_key.offset + length;
4010                 btrfs_release_path(path);
4011         }
4012
4013         btrfs_free_path(path);
4014
4015         return ret;
4016 }
4017
4018 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
4019                                            struct btrfs_device *scrub_dev)
4020 {
4021         int     i;
4022         u64     bytenr;
4023         u64     gen;
4024         int     ret;
4025         struct btrfs_fs_info *fs_info = sctx->fs_info;
4026
4027         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
4028                 return -EIO;
4029
4030         /* Seed devices of a new filesystem has their own generation. */
4031         if (scrub_dev->fs_devices != fs_info->fs_devices)
4032                 gen = scrub_dev->generation;
4033         else
4034                 gen = fs_info->last_trans_committed;
4035
4036         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4037                 bytenr = btrfs_sb_offset(i);
4038                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4039                     scrub_dev->commit_total_bytes)
4040                         break;
4041
4042                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4043                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4044                                   NULL, 1, bytenr);
4045                 if (ret)
4046                         return ret;
4047         }
4048         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4049
4050         return 0;
4051 }
4052
4053 /*
4054  * get a reference count on fs_info->scrub_workers. start worker if necessary
4055  */
4056 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4057                                                 int is_dev_replace)
4058 {
4059         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4060         int max_active = fs_info->thread_pool_size;
4061
4062         if (fs_info->scrub_workers_refcnt == 0) {
4063                 fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
4064                                 flags, is_dev_replace ? 1 : max_active, 4);
4065                 if (!fs_info->scrub_workers)
4066                         goto fail_scrub_workers;
4067
4068                 fs_info->scrub_wr_completion_workers =
4069                         btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
4070                                               max_active, 2);
4071                 if (!fs_info->scrub_wr_completion_workers)
4072                         goto fail_scrub_wr_completion_workers;
4073
4074                 fs_info->scrub_nocow_workers =
4075                         btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
4076                 if (!fs_info->scrub_nocow_workers)
4077                         goto fail_scrub_nocow_workers;
4078                 fs_info->scrub_parity_workers =
4079                         btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
4080                                               max_active, 2);
4081                 if (!fs_info->scrub_parity_workers)
4082                         goto fail_scrub_parity_workers;
4083         }
4084         ++fs_info->scrub_workers_refcnt;
4085         return 0;
4086
4087 fail_scrub_parity_workers:
4088         btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4089 fail_scrub_nocow_workers:
4090         btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4091 fail_scrub_wr_completion_workers:
4092         btrfs_destroy_workqueue(fs_info->scrub_workers);
4093 fail_scrub_workers:
4094         return -ENOMEM;
4095 }
4096
4097 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
4098 {
4099         if (--fs_info->scrub_workers_refcnt == 0) {
4100                 btrfs_destroy_workqueue(fs_info->scrub_workers);
4101                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4102                 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4103                 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
4104         }
4105         WARN_ON(fs_info->scrub_workers_refcnt < 0);
4106 }
4107
4108 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4109                     u64 end, struct btrfs_scrub_progress *progress,
4110                     int readonly, int is_dev_replace)
4111 {
4112         struct scrub_ctx *sctx;
4113         int ret;
4114         struct btrfs_device *dev;
4115         struct rcu_string *name;
4116
4117         if (btrfs_fs_closing(fs_info))
4118                 return -EINVAL;
4119
4120         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4121                 /*
4122                  * in this case scrub is unable to calculate the checksum
4123                  * the way scrub is implemented. Do not handle this
4124                  * situation at all because it won't ever happen.
4125                  */
4126                 btrfs_err(fs_info,
4127                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4128                        fs_info->nodesize,
4129                        BTRFS_STRIPE_LEN);
4130                 return -EINVAL;
4131         }
4132
4133         if (fs_info->sectorsize != PAGE_SIZE) {
4134                 /* not supported for data w/o checksums */
4135                 btrfs_err_rl(fs_info,
4136                            "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
4137                        fs_info->sectorsize, PAGE_SIZE);
4138                 return -EINVAL;
4139         }
4140
4141         if (fs_info->nodesize >
4142             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
4143             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
4144                 /*
4145                  * would exhaust the array bounds of pagev member in
4146                  * struct scrub_block
4147                  */
4148                 btrfs_err(fs_info,
4149                           "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
4150                        fs_info->nodesize,
4151                        SCRUB_MAX_PAGES_PER_BLOCK,
4152                        fs_info->sectorsize,
4153                        SCRUB_MAX_PAGES_PER_BLOCK);
4154                 return -EINVAL;
4155         }
4156
4157
4158         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4159         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4160         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4161                      !is_dev_replace)) {
4162                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4163                 return -ENODEV;
4164         }
4165
4166         if (!is_dev_replace && !readonly &&
4167             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4168                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169                 rcu_read_lock();
4170                 name = rcu_dereference(dev->name);
4171                 btrfs_err(fs_info, "scrub: device %s is not writable",
4172                           name->str);
4173                 rcu_read_unlock();
4174                 return -EROFS;
4175         }
4176
4177         mutex_lock(&fs_info->scrub_lock);
4178         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4179             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4180                 mutex_unlock(&fs_info->scrub_lock);
4181                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4182                 return -EIO;
4183         }
4184
4185         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
4186         if (dev->scrub_ctx ||
4187             (!is_dev_replace &&
4188              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4189                 btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
4190                 mutex_unlock(&fs_info->scrub_lock);
4191                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4192                 return -EINPROGRESS;
4193         }
4194         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
4195
4196         ret = scrub_workers_get(fs_info, is_dev_replace);
4197         if (ret) {
4198                 mutex_unlock(&fs_info->scrub_lock);
4199                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4200                 return ret;
4201         }
4202
4203         sctx = scrub_setup_ctx(dev, is_dev_replace);
4204         if (IS_ERR(sctx)) {
4205                 mutex_unlock(&fs_info->scrub_lock);
4206                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4207                 scrub_workers_put(fs_info);
4208                 return PTR_ERR(sctx);
4209         }
4210         sctx->readonly = readonly;
4211         dev->scrub_ctx = sctx;
4212         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4213
4214         /*
4215          * checking @scrub_pause_req here, we can avoid
4216          * race between committing transaction and scrubbing.
4217          */
4218         __scrub_blocked_if_needed(fs_info);
4219         atomic_inc(&fs_info->scrubs_running);
4220         mutex_unlock(&fs_info->scrub_lock);
4221
4222         if (!is_dev_replace) {
4223                 /*
4224                  * by holding device list mutex, we can
4225                  * kick off writing super in log tree sync.
4226                  */
4227                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4228                 ret = scrub_supers(sctx, dev);
4229                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4230         }
4231
4232         if (!ret)
4233                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
4234                                              is_dev_replace);
4235
4236         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4237         atomic_dec(&fs_info->scrubs_running);
4238         wake_up(&fs_info->scrub_pause_wait);
4239
4240         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4241
4242         if (progress)
4243                 memcpy(progress, &sctx->stat, sizeof(*progress));
4244
4245         mutex_lock(&fs_info->scrub_lock);
4246         dev->scrub_ctx = NULL;
4247         scrub_workers_put(fs_info);
4248         mutex_unlock(&fs_info->scrub_lock);
4249
4250         scrub_put_ctx(sctx);
4251
4252         return ret;
4253 }
4254
4255 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4256 {
4257         mutex_lock(&fs_info->scrub_lock);
4258         atomic_inc(&fs_info->scrub_pause_req);
4259         while (atomic_read(&fs_info->scrubs_paused) !=
4260                atomic_read(&fs_info->scrubs_running)) {
4261                 mutex_unlock(&fs_info->scrub_lock);
4262                 wait_event(fs_info->scrub_pause_wait,
4263                            atomic_read(&fs_info->scrubs_paused) ==
4264                            atomic_read(&fs_info->scrubs_running));
4265                 mutex_lock(&fs_info->scrub_lock);
4266         }
4267         mutex_unlock(&fs_info->scrub_lock);
4268 }
4269
4270 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4271 {
4272         atomic_dec(&fs_info->scrub_pause_req);
4273         wake_up(&fs_info->scrub_pause_wait);
4274 }
4275
4276 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4277 {
4278         mutex_lock(&fs_info->scrub_lock);
4279         if (!atomic_read(&fs_info->scrubs_running)) {
4280                 mutex_unlock(&fs_info->scrub_lock);
4281                 return -ENOTCONN;
4282         }
4283
4284         atomic_inc(&fs_info->scrub_cancel_req);
4285         while (atomic_read(&fs_info->scrubs_running)) {
4286                 mutex_unlock(&fs_info->scrub_lock);
4287                 wait_event(fs_info->scrub_pause_wait,
4288                            atomic_read(&fs_info->scrubs_running) == 0);
4289                 mutex_lock(&fs_info->scrub_lock);
4290         }
4291         atomic_dec(&fs_info->scrub_cancel_req);
4292         mutex_unlock(&fs_info->scrub_lock);
4293
4294         return 0;
4295 }
4296
4297 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
4298                            struct btrfs_device *dev)
4299 {
4300         struct scrub_ctx *sctx;
4301
4302         mutex_lock(&fs_info->scrub_lock);
4303         sctx = dev->scrub_ctx;
4304         if (!sctx) {
4305                 mutex_unlock(&fs_info->scrub_lock);
4306                 return -ENOTCONN;
4307         }
4308         atomic_inc(&sctx->cancel_req);
4309         while (dev->scrub_ctx) {
4310                 mutex_unlock(&fs_info->scrub_lock);
4311                 wait_event(fs_info->scrub_pause_wait,
4312                            dev->scrub_ctx == NULL);
4313                 mutex_lock(&fs_info->scrub_lock);
4314         }
4315         mutex_unlock(&fs_info->scrub_lock);
4316
4317         return 0;
4318 }
4319
4320 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4321                          struct btrfs_scrub_progress *progress)
4322 {
4323         struct btrfs_device *dev;
4324         struct scrub_ctx *sctx = NULL;
4325
4326         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4327         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4328         if (dev)
4329                 sctx = dev->scrub_ctx;
4330         if (sctx)
4331                 memcpy(progress, &sctx->stat, sizeof(*progress));
4332         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4333
4334         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4335 }
4336
4337 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4338                                u64 extent_logical, u64 extent_len,
4339                                u64 *extent_physical,
4340                                struct btrfs_device **extent_dev,
4341                                int *extent_mirror_num)
4342 {
4343         u64 mapped_length;
4344         struct btrfs_bio *bbio = NULL;
4345         int ret;
4346
4347         mapped_length = extent_len;
4348         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4349                               &mapped_length, &bbio, 0);
4350         if (ret || !bbio || mapped_length < extent_len ||
4351             !bbio->stripes[0].dev->bdev) {
4352                 btrfs_put_bbio(bbio);
4353                 return;
4354         }
4355
4356         *extent_physical = bbio->stripes[0].physical;
4357         *extent_mirror_num = bbio->mirror_num;
4358         *extent_dev = bbio->stripes[0].dev;
4359         btrfs_put_bbio(bbio);
4360 }
4361
4362 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4363                             int mirror_num, u64 physical_for_dev_replace)
4364 {
4365         struct scrub_copy_nocow_ctx *nocow_ctx;
4366         struct btrfs_fs_info *fs_info = sctx->fs_info;
4367
4368         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
4369         if (!nocow_ctx) {
4370                 spin_lock(&sctx->stat_lock);
4371                 sctx->stat.malloc_errors++;
4372                 spin_unlock(&sctx->stat_lock);
4373                 return -ENOMEM;
4374         }
4375
4376         scrub_pending_trans_workers_inc(sctx);
4377
4378         nocow_ctx->sctx = sctx;
4379         nocow_ctx->logical = logical;
4380         nocow_ctx->len = len;
4381         nocow_ctx->mirror_num = mirror_num;
4382         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
4383         btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
4384                         copy_nocow_pages_worker, NULL, NULL);
4385         INIT_LIST_HEAD(&nocow_ctx->inodes);
4386         btrfs_queue_work(fs_info->scrub_nocow_workers,
4387                          &nocow_ctx->work);
4388
4389         return 0;
4390 }
4391
4392 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
4393 {
4394         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
4395         struct scrub_nocow_inode *nocow_inode;
4396
4397         nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
4398         if (!nocow_inode)
4399                 return -ENOMEM;
4400         nocow_inode->inum = inum;
4401         nocow_inode->offset = offset;
4402         nocow_inode->root = root;
4403         list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
4404         return 0;
4405 }
4406
4407 #define COPY_COMPLETE 1
4408
4409 static void copy_nocow_pages_worker(struct btrfs_work *work)
4410 {
4411         struct scrub_copy_nocow_ctx *nocow_ctx =
4412                 container_of(work, struct scrub_copy_nocow_ctx, work);
4413         struct scrub_ctx *sctx = nocow_ctx->sctx;
4414         struct btrfs_fs_info *fs_info = sctx->fs_info;
4415         struct btrfs_root *root = fs_info->extent_root;
4416         u64 logical = nocow_ctx->logical;
4417         u64 len = nocow_ctx->len;
4418         int mirror_num = nocow_ctx->mirror_num;
4419         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4420         int ret;
4421         struct btrfs_trans_handle *trans = NULL;
4422         struct btrfs_path *path;
4423         int not_written = 0;
4424
4425         path = btrfs_alloc_path();
4426         if (!path) {
4427                 spin_lock(&sctx->stat_lock);
4428                 sctx->stat.malloc_errors++;
4429                 spin_unlock(&sctx->stat_lock);
4430                 not_written = 1;
4431                 goto out;
4432         }
4433
4434         trans = btrfs_join_transaction(root);
4435         if (IS_ERR(trans)) {
4436                 not_written = 1;
4437                 goto out;
4438         }
4439
4440         ret = iterate_inodes_from_logical(logical, fs_info, path,
4441                         record_inode_for_nocow, nocow_ctx, false);
4442         if (ret != 0 && ret != -ENOENT) {
4443                 btrfs_warn(fs_info,
4444                            "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
4445                            logical, physical_for_dev_replace, len, mirror_num,
4446                            ret);
4447                 not_written = 1;
4448                 goto out;
4449         }
4450
4451         btrfs_end_transaction(trans);
4452         trans = NULL;
4453         while (!list_empty(&nocow_ctx->inodes)) {
4454                 struct scrub_nocow_inode *entry;
4455                 entry = list_first_entry(&nocow_ctx->inodes,
4456                                          struct scrub_nocow_inode,
4457                                          list);
4458                 list_del_init(&entry->list);
4459                 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4460                                                  entry->root, nocow_ctx);
4461                 kfree(entry);
4462                 if (ret == COPY_COMPLETE) {
4463                         ret = 0;
4464                         break;
4465                 } else if (ret) {
4466                         break;
4467                 }
4468         }
4469 out:
4470         while (!list_empty(&nocow_ctx->inodes)) {
4471                 struct scrub_nocow_inode *entry;
4472                 entry = list_first_entry(&nocow_ctx->inodes,
4473                                          struct scrub_nocow_inode,
4474                                          list);
4475                 list_del_init(&entry->list);
4476                 kfree(entry);
4477         }
4478         if (trans && !IS_ERR(trans))
4479                 btrfs_end_transaction(trans);
4480         if (not_written)
4481                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4482                                             num_uncorrectable_read_errors);
4483
4484         btrfs_free_path(path);
4485         kfree(nocow_ctx);
4486
4487         scrub_pending_trans_workers_dec(sctx);
4488 }
4489
4490 static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
4491                                  u64 logical)
4492 {
4493         struct extent_state *cached_state = NULL;
4494         struct btrfs_ordered_extent *ordered;
4495         struct extent_io_tree *io_tree;
4496         struct extent_map *em;
4497         u64 lockstart = start, lockend = start + len - 1;
4498         int ret = 0;
4499
4500         io_tree = &inode->io_tree;
4501
4502         lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
4503         ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4504         if (ordered) {
4505                 btrfs_put_ordered_extent(ordered);
4506                 ret = 1;
4507                 goto out_unlock;
4508         }
4509
4510         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4511         if (IS_ERR(em)) {
4512                 ret = PTR_ERR(em);
4513                 goto out_unlock;
4514         }
4515
4516         /*
4517          * This extent does not actually cover the logical extent anymore,
4518          * move on to the next inode.
4519          */
4520         if (em->block_start > logical ||
4521             em->block_start + em->block_len < logical + len ||
4522             test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4523                 free_extent_map(em);
4524                 ret = 1;
4525                 goto out_unlock;
4526         }
4527         free_extent_map(em);
4528
4529 out_unlock:
4530         unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
4531         return ret;
4532 }
4533
4534 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4535                                       struct scrub_copy_nocow_ctx *nocow_ctx)
4536 {
4537         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
4538         struct btrfs_key key;
4539         struct inode *inode;
4540         struct page *page;
4541         struct btrfs_root *local_root;
4542         struct extent_io_tree *io_tree;
4543         u64 physical_for_dev_replace;
4544         u64 nocow_ctx_logical;
4545         u64 len = nocow_ctx->len;
4546         unsigned long index;
4547         int srcu_index;
4548         int ret = 0;
4549         int err = 0;
4550
4551         key.objectid = root;
4552         key.type = BTRFS_ROOT_ITEM_KEY;
4553         key.offset = (u64)-1;
4554
4555         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4556
4557         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4558         if (IS_ERR(local_root)) {
4559                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4560                 return PTR_ERR(local_root);
4561         }
4562
4563         key.type = BTRFS_INODE_ITEM_KEY;
4564         key.objectid = inum;
4565         key.offset = 0;
4566         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4567         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4568         if (IS_ERR(inode))
4569                 return PTR_ERR(inode);
4570
4571         /* Avoid truncate/dio/punch hole.. */
4572         inode_lock(inode);
4573         inode_dio_wait(inode);
4574
4575         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4576         io_tree = &BTRFS_I(inode)->io_tree;
4577         nocow_ctx_logical = nocow_ctx->logical;
4578
4579         ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4580                         nocow_ctx_logical);
4581         if (ret) {
4582                 ret = ret > 0 ? 0 : ret;
4583                 goto out;
4584         }
4585
4586         while (len >= PAGE_SIZE) {
4587                 index = offset >> PAGE_SHIFT;
4588 again:
4589                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4590                 if (!page) {
4591                         btrfs_err(fs_info, "find_or_create_page() failed");
4592                         ret = -ENOMEM;
4593                         goto out;
4594                 }
4595
4596                 if (PageUptodate(page)) {
4597                         if (PageDirty(page))
4598                                 goto next_page;
4599                 } else {
4600                         ClearPageError(page);
4601                         err = extent_read_full_page(io_tree, page,
4602                                                            btrfs_get_extent,
4603                                                            nocow_ctx->mirror_num);
4604                         if (err) {
4605                                 ret = err;
4606                                 goto next_page;
4607                         }
4608
4609                         lock_page(page);
4610                         /*
4611                          * If the page has been remove from the page cache,
4612                          * the data on it is meaningless, because it may be
4613                          * old one, the new data may be written into the new
4614                          * page in the page cache.
4615                          */
4616                         if (page->mapping != inode->i_mapping) {
4617                                 unlock_page(page);
4618                                 put_page(page);
4619                                 goto again;
4620                         }
4621                         if (!PageUptodate(page)) {
4622                                 ret = -EIO;
4623                                 goto next_page;
4624                         }
4625                 }
4626
4627                 ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4628                                             nocow_ctx_logical);
4629                 if (ret) {
4630                         ret = ret > 0 ? 0 : ret;
4631                         goto next_page;
4632                 }
4633
4634                 err = write_page_nocow(nocow_ctx->sctx,
4635                                        physical_for_dev_replace, page);
4636                 if (err)
4637                         ret = err;
4638 next_page:
4639                 unlock_page(page);
4640                 put_page(page);
4641
4642                 if (ret)
4643                         break;
4644
4645                 offset += PAGE_SIZE;
4646                 physical_for_dev_replace += PAGE_SIZE;
4647                 nocow_ctx_logical += PAGE_SIZE;
4648                 len -= PAGE_SIZE;
4649         }
4650         ret = COPY_COMPLETE;
4651 out:
4652         inode_unlock(inode);
4653         iput(inode);
4654         return ret;
4655 }
4656
4657 static int write_page_nocow(struct scrub_ctx *sctx,
4658                             u64 physical_for_dev_replace, struct page *page)
4659 {
4660         struct bio *bio;
4661         struct btrfs_device *dev;
4662
4663         dev = sctx->wr_tgtdev;
4664         if (!dev)
4665                 return -EIO;
4666         if (!dev->bdev) {
4667                 btrfs_warn_rl(dev->fs_info,
4668                         "scrub write_page_nocow(bdev == NULL) is unexpected");
4669                 return -EIO;
4670         }
4671         bio = btrfs_io_bio_alloc(1);
4672         bio->bi_iter.bi_size = 0;
4673         bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4674         bio_set_dev(bio, dev->bdev);
4675         bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
4676         /* bio_add_page won't fail on a freshly allocated bio */
4677         bio_add_page(bio, page, PAGE_SIZE, 0);
4678
4679         if (btrfsic_submit_bio_wait(bio)) {
4680                 bio_put(bio);
4681                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4682                 return -EIO;
4683         }
4684
4685         bio_put(bio);
4686         return 0;
4687 }