GNU Linux-libre 4.19.264-gnu1
fs/btrfs/scrub.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include "ctree.h"
10 #include "volumes.h"
11 #include "disk-io.h"
12 #include "ordered-data.h"
13 #include "transaction.h"
14 #include "backref.h"
15 #include "extent_io.h"
16 #include "dev-replace.h"
17 #include "check-integrity.h"
18 #include "rcu-string.h"
19 #include "raid56.h"
20
21 /*
22  * This is only the first step towards a full-featured scrub. It reads all
23  * extents and super blocks and verifies the checksums. In case a bad checksum
24  * is found or the extent cannot be read, good data will be written back if
25  * any can be found.
26  *
27  * Future enhancements:
28  *  - In case an unrepairable extent is encountered, track which files are
29  *    affected and report them
30  *  - track and record media errors, throw out bad devices
31  *  - add a mode to also read unallocated space
32  */
33
34 struct scrub_block;
35 struct scrub_ctx;
36
37 /*
38  * the following three values only influence performance.
39  * The last one configures the number of parallel and outstanding I/O
40  * operations. The first two values configure an upper limit for the number
41  * of (dynamically allocated) pages that are added to a bio.
42  */
43 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
44 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
45 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
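/*
 * For illustration only, assuming 4KiB pages (the sizes quoted in the
 * comments above hold for PAGE_SIZE == 4KiB):
 *
 *   32 pages/bio  * 4KiB/page   = 128KiB per read or write bio
 *   64 bios/sctx  * 128KiB/bio  = 8MiB in flight per scrubbed device
 *
 * On architectures with larger pages these limits scale up accordingly.
 */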
46
47 /*
48  * the following value times PAGE_SIZE needs to be large enough to match the
49  * largest node/leaf/sector size that shall be supported.
50  * Values larger than BTRFS_STRIPE_LEN are not supported.
51  */
52 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
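/*
 * For illustration, again assuming 4KiB pages: 16 * 4KiB = 64KiB, which
 * covers the largest supported nodesize (64KiB) and equals BTRFS_STRIPE_LEN,
 * the upper bound mentioned above.
 */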
53
54 struct scrub_recover {
55         refcount_t              refs;
56         struct btrfs_bio        *bbio;
57         u64                     map_length;
58 };
59
60 struct scrub_page {
61         struct scrub_block      *sblock;
62         struct page             *page;
63         struct btrfs_device     *dev;
64         struct list_head        list;
65         u64                     flags;  /* extent flags */
66         u64                     generation;
67         u64                     logical;
68         u64                     physical;
69         u64                     physical_for_dev_replace;
70         atomic_t                refs;
71         struct {
72                 unsigned int    mirror_num:8;
73                 unsigned int    have_csum:1;
74                 unsigned int    io_error:1;
75         };
76         u8                      csum[BTRFS_CSUM_SIZE];
77
78         struct scrub_recover    *recover;
79 };
80
81 struct scrub_bio {
82         int                     index;
83         struct scrub_ctx        *sctx;
84         struct btrfs_device     *dev;
85         struct bio              *bio;
86         blk_status_t            status;
87         u64                     logical;
88         u64                     physical;
89 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
90         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
91 #else
92         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
93 #endif
94         int                     page_count;
95         int                     next_free;
96         struct btrfs_work       work;
97 };
98
99 struct scrub_block {
100         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
101         int                     page_count;
102         atomic_t                outstanding_pages;
103         refcount_t              refs; /* free mem on transition to zero */
104         struct scrub_ctx        *sctx;
105         struct scrub_parity     *sparity;
106         struct {
107                 unsigned int    header_error:1;
108                 unsigned int    checksum_error:1;
109                 unsigned int    no_io_error_seen:1;
110                 unsigned int    generation_error:1; /* also sets header_error */
111
112                 /* The following is for the data used to check parity */
113                 /* It only applies to data that has a checksum */
114                 unsigned int    data_corrected:1;
115         };
116         struct btrfs_work       work;
117 };
118
119 /* Used for chunks with a parity stripe, such as RAID5/6 */
120 struct scrub_parity {
121         struct scrub_ctx        *sctx;
122
123         struct btrfs_device     *scrub_dev;
124
125         u64                     logic_start;
126
127         u64                     logic_end;
128
129         int                     nsectors;
130
131         u64                     stripe_len;
132
133         refcount_t              refs;
134
135         struct list_head        spages;
136
137         /* Work of parity check and repair */
138         struct btrfs_work       work;
139
140         /* Mark the parity blocks which have data */
141         unsigned long           *dbitmap;
142
143         /*
144          * Mark the parity blocks which have data, but where errors happened
145          * when reading or checking that data
146          */
147         unsigned long           *ebitmap;
148
149         unsigned long           bitmap[0];
150 };
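/*
 * A minimal sketch of how the trailing bitmap[] area is expected to be
 * carved up; the actual allocation happens in the parity scrub setup code
 * further down in this file, and "bitmap_len" here is just an illustrative
 * name for the per-bitmap size in bytes:
 *
 *   sparity = kzalloc(sizeof(*sparity) + 2 * bitmap_len, GFP_NOFS);
 *   sparity->dbitmap = sparity->bitmap;
 *   sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
 *
 * so both per-sector bitmaps share one allocation with the structure.
 */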
151
152 struct scrub_ctx {
153         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
154         struct btrfs_fs_info    *fs_info;
155         int                     first_free;
156         int                     curr;
157         atomic_t                bios_in_flight;
158         atomic_t                workers_pending;
159         spinlock_t              list_lock;
160         wait_queue_head_t       list_wait;
161         u16                     csum_size;
162         struct list_head        csum_list;
163         atomic_t                cancel_req;
164         int                     readonly;
165         int                     pages_per_rd_bio;
166
167         int                     is_dev_replace;
168
169         struct scrub_bio        *wr_curr_bio;
170         struct mutex            wr_lock;
171         int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
172         struct btrfs_device     *wr_tgtdev;
173         bool                    flush_all_writes;
174
175         /*
176          * statistics
177          */
178         struct btrfs_scrub_progress stat;
179         spinlock_t              stat_lock;
180
181         /*
182          * Use a ref counter to avoid use-after-free issues. Scrub workers
183          * decrement bios_in_flight and workers_pending and then do a wakeup
184          * on the list_wait wait queue. We must ensure the main scrub task
185          * doesn't free the scrub context before or while the workers are
186          * doing the wakeup() call.
187          */
188         refcount_t              refs;
189 };
190
191 struct scrub_warning {
192         struct btrfs_path       *path;
193         u64                     extent_item_size;
194         const char              *errstr;
195         u64                     physical;
196         u64                     logical;
197         struct btrfs_device     *dev;
198 };
199
200 struct full_stripe_lock {
201         struct rb_node node;
202         u64 logical;
203         u64 refs;
204         struct mutex mutex;
205 };
206
207 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
208 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
209 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
210 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
211                                      struct scrub_block *sblocks_for_recheck);
212 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
213                                 struct scrub_block *sblock,
214                                 int retry_failed_mirror);
215 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
216 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
217                                              struct scrub_block *sblock_good);
218 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
219                                             struct scrub_block *sblock_good,
220                                             int page_num, int force_write);
221 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
222 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
223                                            int page_num);
224 static int scrub_checksum_data(struct scrub_block *sblock);
225 static int scrub_checksum_tree_block(struct scrub_block *sblock);
226 static int scrub_checksum_super(struct scrub_block *sblock);
227 static void scrub_block_get(struct scrub_block *sblock);
228 static void scrub_block_put(struct scrub_block *sblock);
229 static void scrub_page_get(struct scrub_page *spage);
230 static void scrub_page_put(struct scrub_page *spage);
231 static void scrub_parity_get(struct scrub_parity *sparity);
232 static void scrub_parity_put(struct scrub_parity *sparity);
233 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
234                                     struct scrub_page *spage);
235 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
236                        u64 physical, struct btrfs_device *dev, u64 flags,
237                        u64 gen, int mirror_num, u8 *csum, int force,
238                        u64 physical_for_dev_replace);
239 static void scrub_bio_end_io(struct bio *bio);
240 static void scrub_bio_end_io_worker(struct btrfs_work *work);
241 static void scrub_block_complete(struct scrub_block *sblock);
242 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
243                                u64 extent_logical, u64 extent_len,
244                                u64 *extent_physical,
245                                struct btrfs_device **extent_dev,
246                                int *extent_mirror_num);
247 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
248                                     struct scrub_page *spage);
249 static void scrub_wr_submit(struct scrub_ctx *sctx);
250 static void scrub_wr_bio_end_io(struct bio *bio);
251 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
252 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
253 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
254 static void scrub_put_ctx(struct scrub_ctx *sctx);
255
256 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
257 {
258         return page->recover &&
259                (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
260 }
261
262 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
263 {
264         refcount_inc(&sctx->refs);
265         atomic_inc(&sctx->bios_in_flight);
266 }
267
268 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
269 {
270         atomic_dec(&sctx->bios_in_flight);
271         wake_up(&sctx->list_wait);
272         scrub_put_ctx(sctx);
273 }
274
275 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
276 {
277         while (atomic_read(&fs_info->scrub_pause_req)) {
278                 mutex_unlock(&fs_info->scrub_lock);
279                 wait_event(fs_info->scrub_pause_wait,
280                    atomic_read(&fs_info->scrub_pause_req) == 0);
281                 mutex_lock(&fs_info->scrub_lock);
282         }
283 }
284
285 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
286 {
287         atomic_inc(&fs_info->scrubs_paused);
288         wake_up(&fs_info->scrub_pause_wait);
289 }
290
291 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
292 {
293         mutex_lock(&fs_info->scrub_lock);
294         __scrub_blocked_if_needed(fs_info);
295         atomic_dec(&fs_info->scrubs_paused);
296         mutex_unlock(&fs_info->scrub_lock);
297
298         wake_up(&fs_info->scrub_pause_wait);
299 }
300
301 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
302 {
303         scrub_pause_on(fs_info);
304         scrub_pause_off(fs_info);
305 }
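/*
 * A minimal usage sketch (hypothetical caller): a scrub thread marks itself
 * paused around work during which a pause request, e.g. from a transaction
 * commit, must be allowed to win:
 *
 *   scrub_pause_on(fs_info);
 *   ... work that is safe while this scrub is counted as paused ...
 *   scrub_pause_off(fs_info);    blocks here until scrub_pause_req == 0
 *
 * scrub_blocked_if_needed() is the degenerate case with nothing between the
 * two calls: it simply lets a pending pause request complete before the
 * scrub thread continues.
 */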
306
307 /*
308  * Insert new full stripe lock into full stripe locks tree
309  *
310  * Return pointer to existing or newly inserted full_stripe_lock structure if
311  * everything works well.
312  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
313  *
314  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
315  * function
316  */
317 static struct full_stripe_lock *insert_full_stripe_lock(
318                 struct btrfs_full_stripe_locks_tree *locks_root,
319                 u64 fstripe_logical)
320 {
321         struct rb_node **p;
322         struct rb_node *parent = NULL;
323         struct full_stripe_lock *entry;
324         struct full_stripe_lock *ret;
325         unsigned int nofs_flag;
326
327         lockdep_assert_held(&locks_root->lock);
328
329         p = &locks_root->root.rb_node;
330         while (*p) {
331                 parent = *p;
332                 entry = rb_entry(parent, struct full_stripe_lock, node);
333                 if (fstripe_logical < entry->logical) {
334                         p = &(*p)->rb_left;
335                 } else if (fstripe_logical > entry->logical) {
336                         p = &(*p)->rb_right;
337                 } else {
338                         entry->refs++;
339                         return entry;
340                 }
341         }
342
343         /*
344          * Insert new lock.
345          *
346          * We must use GFP_NOFS because the scrub task might be waiting for a
347          * worker task executing this function and in turn a transaction commit
348          * might be waiting for the scrub task to pause (which needs to wait for all
349          * the worker tasks to complete before pausing).
350          */
351         nofs_flag = memalloc_nofs_save();
352         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
353         memalloc_nofs_restore(nofs_flag);
354         if (!ret)
355                 return ERR_PTR(-ENOMEM);
356         ret->logical = fstripe_logical;
357         ret->refs = 1;
358         mutex_init(&ret->mutex);
359
360         rb_link_node(&ret->node, parent, p);
361         rb_insert_color(&ret->node, &locks_root->root);
362         return ret;
363 }
364
365 /*
366  * Search for a full stripe lock of a block group
367  *
368  * Return pointer to existing full stripe lock if found
369  * Return NULL if not found
370  */
371 static struct full_stripe_lock *search_full_stripe_lock(
372                 struct btrfs_full_stripe_locks_tree *locks_root,
373                 u64 fstripe_logical)
374 {
375         struct rb_node *node;
376         struct full_stripe_lock *entry;
377
378         lockdep_assert_held(&locks_root->lock);
379
380         node = locks_root->root.rb_node;
381         while (node) {
382                 entry = rb_entry(node, struct full_stripe_lock, node);
383                 if (fstripe_logical < entry->logical)
384                         node = node->rb_left;
385                 else if (fstripe_logical > entry->logical)
386                         node = node->rb_right;
387                 else
388                         return entry;
389         }
390         return NULL;
391 }
392
393 /*
394  * Helper to get full stripe logical from a normal bytenr.
395  *
396  * Caller must ensure @cache is a RAID56 block group.
397  */
398 static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
399                                    u64 bytenr)
400 {
401         u64 ret;
402
403         /*
404          * Due to the chunk item size limit, the full stripe length should not
405          * be larger than U32_MAX. Just a sanity check here.
406          */
407         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
408
409         /*
410          * round_down() can only handle powers of 2, while a RAID56 full
411          * stripe length can be 64KiB * n, so we need to round down manually.
412          */
413         ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
414                 cache->full_stripe_len + cache->key.objectid;
415         return ret;
416 }
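/*
 * Worked example with illustrative numbers: a RAID5 chunk with three data
 * stripes of 64KiB has full_stripe_len = 192KiB, which is not a power of
 * two, hence the manual rounding above. With the block group starting at
 * cache->key.objectid = 1GiB and bytenr = 1GiB + 500KiB:
 *
 *   (500KiB / 192KiB)     = 2   (integer division)
 *   2 * 192KiB + 1GiB     = 1GiB + 384KiB
 *
 * so the full stripe containing bytenr starts at 1GiB + 384KiB.
 */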
417
418 /*
419  * Lock a full stripe to avoid concurrent recovery and read
420  *
421  * It's only used for parity-based profiles (RAID5/6); for other profiles it
422  * does nothing.
423  *
424  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
425  * The caller must then call unlock_full_stripe() from the same context.
426  *
427  * Return <0 if an error is encountered.
428  */
429 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
430                             bool *locked_ret)
431 {
432         struct btrfs_block_group_cache *bg_cache;
433         struct btrfs_full_stripe_locks_tree *locks_root;
434         struct full_stripe_lock *existing;
435         u64 fstripe_start;
436         int ret = 0;
437
438         *locked_ret = false;
439         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
440         if (!bg_cache) {
441                 ASSERT(0);
442                 return -ENOENT;
443         }
444
445         /* Profiles not based on parity don't need full stripe lock */
446         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
447                 goto out;
448         locks_root = &bg_cache->full_stripe_locks_root;
449
450         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
451
452         /* Now insert the full stripe lock */
453         mutex_lock(&locks_root->lock);
454         existing = insert_full_stripe_lock(locks_root, fstripe_start);
455         mutex_unlock(&locks_root->lock);
456         if (IS_ERR(existing)) {
457                 ret = PTR_ERR(existing);
458                 goto out;
459         }
460         mutex_lock(&existing->mutex);
461         *locked_ret = true;
462 out:
463         btrfs_put_block_group(bg_cache);
464         return ret;
465 }
466
467 /*
468  * Unlock a full stripe.
469  *
470  * NOTE: The caller must ensure this is called from the same context as the
471  * corresponding lock_full_stripe().
472  *
473  * Return 0 if the full stripe was unlocked without problems.
474  * Return <0 on error
475  */
476 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
477                               bool locked)
478 {
479         struct btrfs_block_group_cache *bg_cache;
480         struct btrfs_full_stripe_locks_tree *locks_root;
481         struct full_stripe_lock *fstripe_lock;
482         u64 fstripe_start;
483         bool freeit = false;
484         int ret = 0;
485
486         /* If we didn't acquire full stripe lock, no need to continue */
487         if (!locked)
488                 return 0;
489
490         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
491         if (!bg_cache) {
492                 ASSERT(0);
493                 return -ENOENT;
494         }
495         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
496                 goto out;
497
498         locks_root = &bg_cache->full_stripe_locks_root;
499         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
500
501         mutex_lock(&locks_root->lock);
502         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
503         /* Unpaired unlock_full_stripe() detected */
504         if (!fstripe_lock) {
505                 WARN_ON(1);
506                 ret = -ENOENT;
507                 mutex_unlock(&locks_root->lock);
508                 goto out;
509         }
510
511         if (fstripe_lock->refs == 0) {
512                 WARN_ON(1);
513                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
514                         fstripe_lock->logical);
515         } else {
516                 fstripe_lock->refs--;
517         }
518
519         if (fstripe_lock->refs == 0) {
520                 rb_erase(&fstripe_lock->node, &locks_root->root);
521                 freeit = true;
522         }
523         mutex_unlock(&locks_root->lock);
524
525         mutex_unlock(&fstripe_lock->mutex);
526         if (freeit)
527                 kfree(fstripe_lock);
528 out:
529         btrfs_put_block_group(bg_cache);
530         return ret;
531 }
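/*
 * A minimal usage sketch of the pair above (hypothetical caller): the
 * "locked" flag filled in by lock_full_stripe() must be passed back to
 * unlock_full_stripe(), so that non-RAID56 block groups, where no lock is
 * taken, are handled transparently:
 *
 *   bool locked;
 *   int ret;
 *
 *   ret = lock_full_stripe(fs_info, logical, &locked);
 *   if (ret < 0)
 *           return ret;
 *   ... recheck/repair the block at "logical" ...
 *   ret = unlock_full_stripe(fs_info, logical, locked);
 *
 * scrub_handle_errored_block() below follows exactly this pattern.
 */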
532
533 static void scrub_free_csums(struct scrub_ctx *sctx)
534 {
535         while (!list_empty(&sctx->csum_list)) {
536                 struct btrfs_ordered_sum *sum;
537                 sum = list_first_entry(&sctx->csum_list,
538                                        struct btrfs_ordered_sum, list);
539                 list_del(&sum->list);
540                 kfree(sum);
541         }
542 }
543
544 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
545 {
546         int i;
547
548         if (!sctx)
549                 return;
550
551         /* this can happen when scrub is cancelled */
552         if (sctx->curr != -1) {
553                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
554
555                 for (i = 0; i < sbio->page_count; i++) {
556                         WARN_ON(!sbio->pagev[i]->page);
557                         scrub_block_put(sbio->pagev[i]->sblock);
558                 }
559                 bio_put(sbio->bio);
560         }
561
562         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
563                 struct scrub_bio *sbio = sctx->bios[i];
564
565                 if (!sbio)
566                         break;
567                 kfree(sbio);
568         }
569
570         kfree(sctx->wr_curr_bio);
571         scrub_free_csums(sctx);
572         kfree(sctx);
573 }
574
575 static void scrub_put_ctx(struct scrub_ctx *sctx)
576 {
577         if (refcount_dec_and_test(&sctx->refs))
578                 scrub_free_ctx(sctx);
579 }
580
581 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
582                 struct btrfs_fs_info *fs_info, int is_dev_replace)
583 {
584         struct scrub_ctx *sctx;
585         int             i;
586
587         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
588         if (!sctx)
589                 goto nomem;
590         refcount_set(&sctx->refs, 1);
591         sctx->is_dev_replace = is_dev_replace;
592         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
593         sctx->curr = -1;
594         sctx->fs_info = fs_info;
595         INIT_LIST_HEAD(&sctx->csum_list);
596         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
597                 struct scrub_bio *sbio;
598
599                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
600                 if (!sbio)
601                         goto nomem;
602                 sctx->bios[i] = sbio;
603
604                 sbio->index = i;
605                 sbio->sctx = sctx;
606                 sbio->page_count = 0;
607                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
608                                 scrub_bio_end_io_worker, NULL, NULL);
609
610                 if (i != SCRUB_BIOS_PER_SCTX - 1)
611                         sctx->bios[i]->next_free = i + 1;
612                 else
613                         sctx->bios[i]->next_free = -1;
614         }
615         sctx->first_free = 0;
616         atomic_set(&sctx->bios_in_flight, 0);
617         atomic_set(&sctx->workers_pending, 0);
618         atomic_set(&sctx->cancel_req, 0);
619         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
620
621         spin_lock_init(&sctx->list_lock);
622         spin_lock_init(&sctx->stat_lock);
623         init_waitqueue_head(&sctx->list_wait);
624
625         WARN_ON(sctx->wr_curr_bio != NULL);
626         mutex_init(&sctx->wr_lock);
627         sctx->wr_curr_bio = NULL;
628         if (is_dev_replace) {
629                 WARN_ON(!fs_info->dev_replace.tgtdev);
630                 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
631                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
632                 sctx->flush_all_writes = false;
633         }
634
635         return sctx;
636
637 nomem:
638         scrub_free_ctx(sctx);
639         return ERR_PTR(-ENOMEM);
640 }
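/*
 * The bios[] array initialized above acts as a singly linked free list
 * threaded through next_free, with first_free as its head. A sketch of how
 * a free scrub_bio is popped under list_lock (the real pop lives in the bio
 * submission paths further down in this file; "idx" is an illustrative
 * local variable):
 *
 *   spin_lock(&sctx->list_lock);
 *   idx = sctx->first_free;
 *   if (idx != -1)
 *           sctx->first_free = sctx->bios[idx]->next_free;
 *   spin_unlock(&sctx->list_lock);
 *
 * A finished bio is pushed back by setting its next_free to the old
 * first_free and storing its own index in first_free.
 */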
641
642 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
643                                      void *warn_ctx)
644 {
645         u64 isize;
646         u32 nlink;
647         int ret;
648         int i;
649         unsigned nofs_flag;
650         struct extent_buffer *eb;
651         struct btrfs_inode_item *inode_item;
652         struct scrub_warning *swarn = warn_ctx;
653         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
654         struct inode_fs_paths *ipath = NULL;
655         struct btrfs_root *local_root;
656         struct btrfs_key root_key;
657         struct btrfs_key key;
658
659         root_key.objectid = root;
660         root_key.type = BTRFS_ROOT_ITEM_KEY;
661         root_key.offset = (u64)-1;
662         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
663         if (IS_ERR(local_root)) {
664                 ret = PTR_ERR(local_root);
665                 goto err;
666         }
667
668         /*
669          * this makes the path point to (inum INODE_ITEM ioff)
670          */
671         key.objectid = inum;
672         key.type = BTRFS_INODE_ITEM_KEY;
673         key.offset = 0;
674
675         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
676         if (ret) {
677                 btrfs_release_path(swarn->path);
678                 goto err;
679         }
680
681         eb = swarn->path->nodes[0];
682         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
683                                         struct btrfs_inode_item);
684         isize = btrfs_inode_size(eb, inode_item);
685         nlink = btrfs_inode_nlink(eb, inode_item);
686         btrfs_release_path(swarn->path);
687
688         /*
689          * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
690          * uses GFP_NOFS in this context, so we keep it consistent but it does
691          * not seem to be strictly necessary.
692          */
693         nofs_flag = memalloc_nofs_save();
694         ipath = init_ipath(4096, local_root, swarn->path);
695         memalloc_nofs_restore(nofs_flag);
696         if (IS_ERR(ipath)) {
697                 ret = PTR_ERR(ipath);
698                 ipath = NULL;
699                 goto err;
700         }
701         ret = paths_from_inode(inum, ipath);
702
703         if (ret < 0)
704                 goto err;
705
706         /*
707          * we deliberately ignore the fact that ipath might have been too small
708          * to hold all of the paths here
709          */
710         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
711                 btrfs_warn_in_rcu(fs_info,
712 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
713                                   swarn->errstr, swarn->logical,
714                                   rcu_str_deref(swarn->dev->name),
715                                   swarn->physical,
716                                   root, inum, offset,
717                                   min(isize - offset, (u64)PAGE_SIZE), nlink,
718                                   (char *)(unsigned long)ipath->fspath->val[i]);
719
720         free_ipath(ipath);
721         return 0;
722
723 err:
724         btrfs_warn_in_rcu(fs_info,
725                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
726                           swarn->errstr, swarn->logical,
727                           rcu_str_deref(swarn->dev->name),
728                           swarn->physical,
729                           root, inum, offset, ret);
730
731         free_ipath(ipath);
732         return 0;
733 }
734
735 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
736 {
737         struct btrfs_device *dev;
738         struct btrfs_fs_info *fs_info;
739         struct btrfs_path *path;
740         struct btrfs_key found_key;
741         struct extent_buffer *eb;
742         struct btrfs_extent_item *ei;
743         struct scrub_warning swarn;
744         unsigned long ptr = 0;
745         u64 extent_item_pos;
746         u64 flags = 0;
747         u64 ref_root;
748         u32 item_size;
749         u8 ref_level = 0;
750         int ret;
751
752         WARN_ON(sblock->page_count < 1);
753         dev = sblock->pagev[0]->dev;
754         fs_info = sblock->sctx->fs_info;
755
756         path = btrfs_alloc_path();
757         if (!path)
758                 return;
759
760         swarn.physical = sblock->pagev[0]->physical;
761         swarn.logical = sblock->pagev[0]->logical;
762         swarn.errstr = errstr;
763         swarn.dev = NULL;
764
765         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
766                                   &flags);
767         if (ret < 0)
768                 goto out;
769
770         extent_item_pos = swarn.logical - found_key.objectid;
771         swarn.extent_item_size = found_key.offset;
772
773         eb = path->nodes[0];
774         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
775         item_size = btrfs_item_size_nr(eb, path->slots[0]);
776
777         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
778                 do {
779                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
780                                                       item_size, &ref_root,
781                                                       &ref_level);
782                         btrfs_warn_in_rcu(fs_info,
783 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
784                                 errstr, swarn.logical,
785                                 rcu_str_deref(dev->name),
786                                 swarn.physical,
787                                 ref_level ? "node" : "leaf",
788                                 ret < 0 ? -1 : ref_level,
789                                 ret < 0 ? -1 : ref_root);
790                 } while (ret != 1);
791                 btrfs_release_path(path);
792         } else {
793                 btrfs_release_path(path);
794                 swarn.path = path;
795                 swarn.dev = dev;
796                 iterate_extent_inodes(fs_info, found_key.objectid,
797                                         extent_item_pos, 1,
798                                         scrub_print_warning_inode, &swarn, false);
799         }
800
801 out:
802         btrfs_free_path(path);
803 }
804
805 static inline void scrub_get_recover(struct scrub_recover *recover)
806 {
807         refcount_inc(&recover->refs);
808 }
809
810 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
811                                      struct scrub_recover *recover)
812 {
813         if (refcount_dec_and_test(&recover->refs)) {
814                 btrfs_bio_counter_dec(fs_info);
815                 btrfs_put_bbio(recover->bbio);
816                 kfree(recover);
817         }
818 }
819
820 /*
821  * scrub_handle_errored_block gets called when either verification of the
822  * pages failed or the bio failed to read, e.g. with EIO. In the latter
823  * case, this function handles all pages in the bio, even though only one
824  * may be bad.
825  * The goal of this function is to repair the errored block by using the
826  * contents of one of the mirrors.
827  */
828 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
829 {
830         struct scrub_ctx *sctx = sblock_to_check->sctx;
831         struct btrfs_device *dev;
832         struct btrfs_fs_info *fs_info;
833         u64 logical;
834         unsigned int failed_mirror_index;
835         unsigned int is_metadata;
836         unsigned int have_csum;
837         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
838         struct scrub_block *sblock_bad;
839         int ret;
840         int mirror_index;
841         int page_num;
842         int success;
843         bool full_stripe_locked;
844         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
845                                       DEFAULT_RATELIMIT_BURST);
846
847         BUG_ON(sblock_to_check->page_count < 1);
848         fs_info = sctx->fs_info;
849         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
850                 /*
851                  * if we find an error in a super block, we just report it.
852                  * Super blocks get rewritten with the next transaction commit
853                  * anyway
854                  */
855                 spin_lock(&sctx->stat_lock);
856                 ++sctx->stat.super_errors;
857                 spin_unlock(&sctx->stat_lock);
858                 return 0;
859         }
860         logical = sblock_to_check->pagev[0]->logical;
861         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
862         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
863         is_metadata = !(sblock_to_check->pagev[0]->flags &
864                         BTRFS_EXTENT_FLAG_DATA);
865         have_csum = sblock_to_check->pagev[0]->have_csum;
866         dev = sblock_to_check->pagev[0]->dev;
867
868         /*
869          * For RAID5/6, a race can happen between scrub threads of different
870          * devices. For data corruption, the parity and data scrub threads
871          * will both try to recover the data.
872          * The race can lead to a doubly counted csum error, or even an
873          * unrecoverable error.
874          */
875         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
876         if (ret < 0) {
877                 spin_lock(&sctx->stat_lock);
878                 if (ret == -ENOMEM)
879                         sctx->stat.malloc_errors++;
880                 sctx->stat.read_errors++;
881                 sctx->stat.uncorrectable_errors++;
882                 spin_unlock(&sctx->stat_lock);
883                 return ret;
884         }
885
886          * read all mirrors one after the other. This includes re-reading
887          * the extent or metadata block that failed (the reason this fixup
888          * code is called in the first place) another time,
889          * the cause that this fixup code is called) another time,
890          * page by page this time in order to know which pages
891          * caused I/O errors and which ones are good (for all mirrors).
892          * The goal is to handle the situation where more than one
893          * mirror contains I/O errors, but the errors do not
894          * overlap, i.e. the data can be repaired by selecting the
895          * pages from those mirrors without I/O error on the
896          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
897          * would be that mirror #1 has an I/O error on the first page,
898          * the second page is good, and mirror #2 has an I/O error on
899          * the second page, but the first page is good.
900          * Then the first page of the first mirror can be repaired by
901          * taking the first page of the second mirror, and the
902          * second page of the second mirror can be repaired by
903          * copying the contents of the 2nd page of the 1st mirror.
904          * One more note: if the pages of one mirror contain I/O
905          * errors, the checksum cannot be verified. In order to get
906          * the best data for repairing, the first attempt is to find
907          * a mirror without I/O errors and with a validated checksum.
908          * Only if this is not possible, the pages are picked from
909          * mirrors with I/O errors without considering the checksum.
910          * If the latter is the case, at the end, the checksum of the
911          * repaired area is verified in order to correctly maintain
912          * the statistics.
913          */
914
915         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
916                                       sizeof(*sblocks_for_recheck), GFP_NOFS);
917         if (!sblocks_for_recheck) {
918                 spin_lock(&sctx->stat_lock);
919                 sctx->stat.malloc_errors++;
920                 sctx->stat.read_errors++;
921                 sctx->stat.uncorrectable_errors++;
922                 spin_unlock(&sctx->stat_lock);
923                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
924                 goto out;
925         }
926
927         /* setup the context, map the logical blocks and alloc the pages */
928         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
929         if (ret) {
930                 spin_lock(&sctx->stat_lock);
931                 sctx->stat.read_errors++;
932                 sctx->stat.uncorrectable_errors++;
933                 spin_unlock(&sctx->stat_lock);
934                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
935                 goto out;
936         }
937         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
938         sblock_bad = sblocks_for_recheck + failed_mirror_index;
939
940         /* build and submit the bios for the failed mirror, check checksums */
941         scrub_recheck_block(fs_info, sblock_bad, 1);
942
943         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
944             sblock_bad->no_io_error_seen) {
945                 /*
946                  * the error disappeared after reading page by page, or
947                  * the area was part of a huge bio and other parts of the
948                  * bio caused I/O errors, or the block layer merged several
949                  * read requests into one and the error was caused by a
950                  * different bio (usually one of the latter two cases is
951                  * the cause)
952                  */
953                 spin_lock(&sctx->stat_lock);
954                 sctx->stat.unverified_errors++;
955                 sblock_to_check->data_corrected = 1;
956                 spin_unlock(&sctx->stat_lock);
957
958                 if (sctx->is_dev_replace)
959                         scrub_write_block_to_dev_replace(sblock_bad);
960                 goto out;
961         }
962
963         if (!sblock_bad->no_io_error_seen) {
964                 spin_lock(&sctx->stat_lock);
965                 sctx->stat.read_errors++;
966                 spin_unlock(&sctx->stat_lock);
967                 if (__ratelimit(&_rs))
968                         scrub_print_warning("i/o error", sblock_to_check);
969                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
970         } else if (sblock_bad->checksum_error) {
971                 spin_lock(&sctx->stat_lock);
972                 sctx->stat.csum_errors++;
973                 spin_unlock(&sctx->stat_lock);
974                 if (__ratelimit(&_rs))
975                         scrub_print_warning("checksum error", sblock_to_check);
976                 btrfs_dev_stat_inc_and_print(dev,
977                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
978         } else if (sblock_bad->header_error) {
979                 spin_lock(&sctx->stat_lock);
980                 sctx->stat.verify_errors++;
981                 spin_unlock(&sctx->stat_lock);
982                 if (__ratelimit(&_rs))
983                         scrub_print_warning("checksum/header error",
984                                             sblock_to_check);
985                 if (sblock_bad->generation_error)
986                         btrfs_dev_stat_inc_and_print(dev,
987                                 BTRFS_DEV_STAT_GENERATION_ERRS);
988                 else
989                         btrfs_dev_stat_inc_and_print(dev,
990                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
991         }
992
993         if (sctx->readonly) {
994                 ASSERT(!sctx->is_dev_replace);
995                 goto out;
996         }
997
998         /*
999          * now build and submit the bios for the other mirrors, check
1000          * checksums.
1001          * First try to pick the mirror which is completely without I/O
1002          * errors and also does not have a checksum error.
1003          * If one is found, and if a checksum is present, the full block
1004          * that is known to contain an error is rewritten. Afterwards
1005          * the block is known to be corrected.
1006          * If a mirror is found which is completely correct, and no
1007          * checksum is present, only those pages are rewritten that had
1008          * an I/O error in the block to be repaired, since it cannot be
1009          * determined which copy of the other pages is better (and it
1010          * could happen otherwise that a correct page would be
1011          * overwritten by a bad one).
1012          */
1013         for (mirror_index = 0; ;mirror_index++) {
1014                 struct scrub_block *sblock_other;
1015
1016                 if (mirror_index == failed_mirror_index)
1017                         continue;
1018
1019                 /* raid56's mirror count can be more than BTRFS_MAX_MIRRORS */
1020                 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1021                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1022                                 break;
1023                         if (!sblocks_for_recheck[mirror_index].page_count)
1024                                 break;
1025
1026                         sblock_other = sblocks_for_recheck + mirror_index;
1027                 } else {
1028                         struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1029                         int max_allowed = r->bbio->num_stripes -
1030                                                 r->bbio->num_tgtdevs;
1031
1032                         if (mirror_index >= max_allowed)
1033                                 break;
1034                         if (!sblocks_for_recheck[1].page_count)
1035                                 break;
1036
1037                         ASSERT(failed_mirror_index == 0);
1038                         sblock_other = sblocks_for_recheck + 1;
1039                         sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1040                 }
1041
1042                 /* build and submit the bios, check checksums */
1043                 scrub_recheck_block(fs_info, sblock_other, 0);
1044
1045                 if (!sblock_other->header_error &&
1046                     !sblock_other->checksum_error &&
1047                     sblock_other->no_io_error_seen) {
1048                         if (sctx->is_dev_replace) {
1049                                 scrub_write_block_to_dev_replace(sblock_other);
1050                                 goto corrected_error;
1051                         } else {
1052                                 ret = scrub_repair_block_from_good_copy(
1053                                                 sblock_bad, sblock_other);
1054                                 if (!ret)
1055                                         goto corrected_error;
1056                         }
1057                 }
1058         }
1059
1060         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1061                 goto did_not_correct_error;
1062
1063         /*
1064          * In case of I/O errors in the area that is supposed to be
1065          * repaired, continue by picking good copies of those pages.
1066          * Select the good pages from mirrors to rewrite bad pages from
1067          * the area to fix. Afterwards verify the checksum of the block
1068          * that is supposed to be repaired. This verification step is
1069          * only done for the purpose of statistics counting and for the
1070          * final scrub report on whether errors remain.
1071          * A perfect algorithm could make use of the checksum and try
1072          * all possible combinations of pages from the different mirrors
1073          * until the checksum verification succeeds. For example, when
1074          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1075          * of mirror #2 is readable but the final checksum test fails,
1076          * then the 2nd page of mirror #3 could be tried to see whether
1077          * the final checksum then succeeds. But this would be a rare
1078          * exception and is therefore not implemented. At least the good
1079          * copy is never overwritten.
1080          * A more useful improvement would be to pick the sectors
1081          * without I/O error based on sector sizes (512 bytes on legacy
1082          * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1083          * mirror could be repaired by taking 512 bytes of a different
1084          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1085          * area are unreadable.
1086          */
1087         success = 1;
1088         for (page_num = 0; page_num < sblock_bad->page_count;
1089              page_num++) {
1090                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1091                 struct scrub_block *sblock_other = NULL;
1092
1093                 /* skip pages without I/O error, unless doing dev replace */
1094                 if (!page_bad->io_error && !sctx->is_dev_replace)
1095                         continue;
1096
1097                 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1098                         /*
1099                          * In case of dev replace, if the raid56 rebuild process
1100                          * did not produce correct data, copy the content of
1101                          * sblock_bad to make sure the target device is identical
1102                          * to the source device, instead of writing the garbage
1103                          * data in the sblock_for_recheck array to the target device.
1104                          */
1105                         sblock_other = NULL;
1106                 } else if (page_bad->io_error) {
1107                         /* try to find no-io-error page in mirrors */
1108                         for (mirror_index = 0;
1109                              mirror_index < BTRFS_MAX_MIRRORS &&
1110                              sblocks_for_recheck[mirror_index].page_count > 0;
1111                              mirror_index++) {
1112                                 if (!sblocks_for_recheck[mirror_index].
1113                                     pagev[page_num]->io_error) {
1114                                         sblock_other = sblocks_for_recheck +
1115                                                        mirror_index;
1116                                         break;
1117                                 }
1118                         }
1119                         if (!sblock_other)
1120                                 success = 0;
1121                 }
1122
1123                 if (sctx->is_dev_replace) {
1124                         /*
1125                          * did not find a mirror to fetch the page
1126                          * from. scrub_write_page_to_dev_replace()
1127                          * handles this case (page->io_error) by
1128                          * filling the block with zeros before
1129                          * submitting the write request
1130                          */
1131                         if (!sblock_other)
1132                                 sblock_other = sblock_bad;
1133
1134                         if (scrub_write_page_to_dev_replace(sblock_other,
1135                                                             page_num) != 0) {
1136                                 btrfs_dev_replace_stats_inc(
1137                                         &fs_info->dev_replace.num_write_errors);
1138                                 success = 0;
1139                         }
1140                 } else if (sblock_other) {
1141                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1142                                                                sblock_other,
1143                                                                page_num, 0);
1144                         if (ret == 0)
1145                                 page_bad->io_error = 0;
1146                         else
1147                                 success = 0;
1148                 }
1149         }
1150
1151         if (success && !sctx->is_dev_replace) {
1152                 if (is_metadata || have_csum) {
1153                         /*
1154                          * need to verify the checksum now that all
1155                          * sectors on disk are repaired (the write
1156                          * request for data to be repaired is on its way).
1157                          * Just be lazy and use scrub_recheck_block()
1158                          * which re-reads the data before the checksum
1159                          * is verified, but most likely the data comes out
1160                          * of the page cache.
1161                          */
1162                         scrub_recheck_block(fs_info, sblock_bad, 1);
1163                         if (!sblock_bad->header_error &&
1164                             !sblock_bad->checksum_error &&
1165                             sblock_bad->no_io_error_seen)
1166                                 goto corrected_error;
1167                         else
1168                                 goto did_not_correct_error;
1169                 } else {
1170 corrected_error:
1171                         spin_lock(&sctx->stat_lock);
1172                         sctx->stat.corrected_errors++;
1173                         sblock_to_check->data_corrected = 1;
1174                         spin_unlock(&sctx->stat_lock);
1175                         btrfs_err_rl_in_rcu(fs_info,
1176                                 "fixed up error at logical %llu on dev %s",
1177                                 logical, rcu_str_deref(dev->name));
1178                 }
1179         } else {
1180 did_not_correct_error:
1181                 spin_lock(&sctx->stat_lock);
1182                 sctx->stat.uncorrectable_errors++;
1183                 spin_unlock(&sctx->stat_lock);
1184                 btrfs_err_rl_in_rcu(fs_info,
1185                         "unable to fixup (regular) error at logical %llu on dev %s",
1186                         logical, rcu_str_deref(dev->name));
1187         }
1188
1189 out:
1190         if (sblocks_for_recheck) {
1191                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1192                      mirror_index++) {
1193                         struct scrub_block *sblock = sblocks_for_recheck +
1194                                                      mirror_index;
1195                         struct scrub_recover *recover;
1196                         int page_index;
1197
1198                         for (page_index = 0; page_index < sblock->page_count;
1199                              page_index++) {
1200                                 sblock->pagev[page_index]->sblock = NULL;
1201                                 recover = sblock->pagev[page_index]->recover;
1202                                 if (recover) {
1203                                         scrub_put_recover(fs_info, recover);
1204                                         sblock->pagev[page_index]->recover =
1205                                                                         NULL;
1206                                 }
1207                                 scrub_page_put(sblock->pagev[page_index]);
1208                         }
1209                 }
1210                 kfree(sblocks_for_recheck);
1211         }
1212
1213         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1214         if (ret < 0)
1215                 return ret;
1216         return 0;
1217 }
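/*
 * Summarizing the repair strategy implemented above: recheck the failed
 * mirror page by page; if it now passes, count an unverified error and
 * stop. Otherwise look for another mirror that is completely clean and
 * rewrite the whole bad block from it. Failing that, repair page by page
 * from whichever mirror read that particular page without error (or, for
 * dev replace, fall back to the bad copy and zero filling). Finally recheck
 * the repaired block to decide between the corrected_errors and
 * uncorrectable_errors counters.
 */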
1218
1219 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1220 {
1221         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1222                 return 2;
1223         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1224                 return 3;
1225         else
1226                 return (int)bbio->num_stripes;
1227 }
1228
1229 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1230                                                  u64 *raid_map,
1231                                                  u64 mapped_length,
1232                                                  int nstripes, int mirror,
1233                                                  int *stripe_index,
1234                                                  u64 *stripe_offset)
1235 {
1236         int i;
1237
1238         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1239                 /* RAID5/6 */
1240                 for (i = 0; i < nstripes; i++) {
1241                         if (raid_map[i] == RAID6_Q_STRIPE ||
1242                             raid_map[i] == RAID5_P_STRIPE)
1243                                 continue;
1244
1245                         if (logical >= raid_map[i] &&
1246                             logical < raid_map[i] + mapped_length)
1247                                 break;
1248                 }
1249
1250                 *stripe_index = i;
1251                 *stripe_offset = logical - raid_map[i];
1252         } else {
1253                 /* The other RAID type */
1254                 *stripe_index = mirror;
1255                 *stripe_offset = 0;
1256         }
1257 }
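/*
 * Worked example with illustrative addresses: for a RAID5 full stripe whose
 * btrfs_bio describes the stripes as
 *
 *   raid_map[0] = 1GiB              (data)
 *   raid_map[1] = 1GiB + 64KiB      (data)
 *   raid_map[2] = RAID5_P_STRIPE    (parity, skipped)
 *
 * and with mapped_length = 64KiB, a logical address of 1GiB + 80KiB falls
 * into stripe 1, so *stripe_index = 1 and *stripe_offset = 16KiB. For
 * non-RAID56 profiles the mirror number is used as the stripe index
 * directly.
 */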
1258
1259 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1260                                      struct scrub_block *sblocks_for_recheck)
1261 {
1262         struct scrub_ctx *sctx = original_sblock->sctx;
1263         struct btrfs_fs_info *fs_info = sctx->fs_info;
1264         u64 length = original_sblock->page_count * PAGE_SIZE;
1265         u64 logical = original_sblock->pagev[0]->logical;
1266         u64 generation = original_sblock->pagev[0]->generation;
1267         u64 flags = original_sblock->pagev[0]->flags;
1268         u64 have_csum = original_sblock->pagev[0]->have_csum;
1269         struct scrub_recover *recover;
1270         struct btrfs_bio *bbio;
1271         u64 sublen;
1272         u64 mapped_length;
1273         u64 stripe_offset;
1274         int stripe_index;
1275         int page_index = 0;
1276         int mirror_index;
1277         int nmirrors;
1278         int ret;
1279
1280         /*
1281          * note: the two members refs and outstanding_pages
1282          * are not used (and not set) in the blocks that are used for
1283          * the recheck procedure
1284          */
1285
1286         while (length > 0) {
1287                 sublen = min_t(u64, length, PAGE_SIZE);
1288                 mapped_length = sublen;
1289                 bbio = NULL;
1290
1291                 /*
1292                  * with a length of PAGE_SIZE, each returned stripe
1293                  * represents one mirror
1294                  */
1295                 btrfs_bio_counter_inc_blocked(fs_info);
1296                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1297                                 logical, &mapped_length, &bbio);
1298                 if (ret || !bbio || mapped_length < sublen) {
1299                         btrfs_put_bbio(bbio);
1300                         btrfs_bio_counter_dec(fs_info);
1301                         return -EIO;
1302                 }
1303
1304                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1305                 if (!recover) {
1306                         btrfs_put_bbio(bbio);
1307                         btrfs_bio_counter_dec(fs_info);
1308                         return -ENOMEM;
1309                 }
1310
1311                 refcount_set(&recover->refs, 1);
1312                 recover->bbio = bbio;
1313                 recover->map_length = mapped_length;
1314
1315                 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1316
1317                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1318
1319                 for (mirror_index = 0; mirror_index < nmirrors;
1320                      mirror_index++) {
1321                         struct scrub_block *sblock;
1322                         struct scrub_page *page;
1323
1324                         sblock = sblocks_for_recheck + mirror_index;
1325                         sblock->sctx = sctx;
1326
1327                         page = kzalloc(sizeof(*page), GFP_NOFS);
1328                         if (!page) {
1329 leave_nomem:
1330                                 spin_lock(&sctx->stat_lock);
1331                                 sctx->stat.malloc_errors++;
1332                                 spin_unlock(&sctx->stat_lock);
1333                                 scrub_put_recover(fs_info, recover);
1334                                 return -ENOMEM;
1335                         }
1336                         scrub_page_get(page);
1337                         sblock->pagev[page_index] = page;
1338                         page->sblock = sblock;
1339                         page->flags = flags;
1340                         page->generation = generation;
1341                         page->logical = logical;
1342                         page->have_csum = have_csum;
1343                         if (have_csum)
1344                                 memcpy(page->csum,
1345                                        original_sblock->pagev[0]->csum,
1346                                        sctx->csum_size);
1347
1348                         scrub_stripe_index_and_offset(logical,
1349                                                       bbio->map_type,
1350                                                       bbio->raid_map,
1351                                                       mapped_length,
1352                                                       bbio->num_stripes -
1353                                                       bbio->num_tgtdevs,
1354                                                       mirror_index,
1355                                                       &stripe_index,
1356                                                       &stripe_offset);
1357                         page->physical = bbio->stripes[stripe_index].physical +
1358                                          stripe_offset;
1359                         page->dev = bbio->stripes[stripe_index].dev;
1360
1361                         BUG_ON(page_index >= original_sblock->page_count);
1362                         page->physical_for_dev_replace =
1363                                 original_sblock->pagev[page_index]->
1364                                 physical_for_dev_replace;
1365                         /* for missing devices, dev->bdev is NULL */
1366                         page->mirror_num = mirror_index + 1;
1367                         sblock->page_count++;
1368                         page->page = alloc_page(GFP_NOFS);
1369                         if (!page->page)
1370                                 goto leave_nomem;
1371
1372                         scrub_get_recover(recover);
1373                         page->recover = recover;
1374                 }
1375                 scrub_put_recover(fs_info, recover);
1376                 length -= sublen;
1377                 logical += sublen;
1378                 page_index++;
1379         }
1380
1381         return 0;
1382 }
1383
1384 static void scrub_bio_wait_endio(struct bio *bio)
1385 {
1386         complete(bio->bi_private);
1387 }
1388
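/*
 * Submit a bio through the RAID56 recovery code and wait synchronously
 * for it to complete, using the mirror number of the block's first
 * page. Returns 0 on success or a negative errno.
 */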
1389 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1390                                         struct bio *bio,
1391                                         struct scrub_page *page)
1392 {
1393         DECLARE_COMPLETION_ONSTACK(done);
1394         int ret;
1395         int mirror_num;
1396
1397         bio->bi_iter.bi_sector = page->logical >> 9;
1398         bio->bi_private = &done;
1399         bio->bi_end_io = scrub_bio_wait_endio;
1400
1401         mirror_num = page->sblock->pagev[0]->mirror_num;
1402         ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1403                                     page->recover->map_length,
1404                                     mirror_num, 0);
1405         if (ret)
1406                 return ret;
1407
1408         wait_for_completion_io(&done);
1409         return blk_status_to_errno(bio->bi_status);
1410 }
1411
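/*
 * Recheck a block that lives on a RAID5/6 stripe: all pages are read
 * back through the parity recovery path in a single bio and, if that
 * succeeds, the checksum is verified. On failure every page of the
 * block is marked with an I/O error.
 */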
1412 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1413                                           struct scrub_block *sblock)
1414 {
1415         struct scrub_page *first_page = sblock->pagev[0];
1416         struct bio *bio;
1417         int page_num;
1418
1419         /* All pages in sblock belong to the same stripe on the same device. */
1420         ASSERT(first_page->dev);
1421         if (!first_page->dev->bdev)
1422                 goto out;
1423
1424         bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1425         bio_set_dev(bio, first_page->dev->bdev);
1426
1427         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1428                 struct scrub_page *page = sblock->pagev[page_num];
1429
1430                 WARN_ON(!page->page);
1431                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1432         }
1433
1434         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1435                 bio_put(bio);
1436                 goto out;
1437         }
1438
1439         bio_put(bio);
1440
1441         scrub_recheck_block_checksum(sblock);
1442
1443         return;
1444 out:
1445         for (page_num = 0; page_num < sblock->page_count; page_num++)
1446                 sblock->pagev[page_num]->io_error = 1;
1447
1448         sblock->no_io_error_seen = 0;
1449 }
1450
1451 /*
1452  * This function checks the on-disk data for checksum errors, header
1453  * errors and read I/O errors. If any I/O error happens, exactly the
1454  * pages that are affected are marked as bad. The goal is to enable
1455  * scrub to take the pages that are not in error from all the mirrors,
1456  * so that the erroring pages of the just handled mirror can be repaired.
1457  */
1458 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1459                                 struct scrub_block *sblock,
1460                                 int retry_failed_mirror)
1461 {
1462         int page_num;
1463
1464         sblock->no_io_error_seen = 1;
1465
1466         /* Shortcut for raid56 */
1467         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1468                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1469
1470         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1471                 struct bio *bio;
1472                 struct scrub_page *page = sblock->pagev[page_num];
1473
1474                 if (page->dev->bdev == NULL) {
1475                         page->io_error = 1;
1476                         sblock->no_io_error_seen = 0;
1477                         continue;
1478                 }
1479
1480                 WARN_ON(!page->page);
1481                 bio = btrfs_io_bio_alloc(1);
1482                 bio_set_dev(bio, page->dev->bdev);
1483
1484                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1485                 bio->bi_iter.bi_sector = page->physical >> 9;
1486                 bio->bi_opf = REQ_OP_READ;
1487
1488                 if (btrfsic_submit_bio_wait(bio)) {
1489                         page->io_error = 1;
1490                         sblock->no_io_error_seen = 0;
1491                 }
1492
1493                 bio_put(bio);
1494         }
1495
1496         if (sblock->no_io_error_seen)
1497                 scrub_recheck_block_checksum(sblock);
1498 }
1499
1500 static inline int scrub_check_fsid(u8 fsid[],
1501                                    struct scrub_page *spage)
1502 {
1503         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1504         int ret;
1505
1506         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1507         return !ret;
1508 }
1509
1510 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1511 {
1512         sblock->header_error = 0;
1513         sblock->checksum_error = 0;
1514         sblock->generation_error = 0;
1515
1516         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1517                 scrub_checksum_data(sblock);
1518         else
1519                 scrub_checksum_tree_block(sblock);
1520 }
1521
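/*
 * Overwrite every page of the bad mirror with the corresponding page
 * of the good mirror. Returns the error of the last page that failed
 * to be written, 0 if all pages were repaired.
 */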
1522 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1523                                              struct scrub_block *sblock_good)
1524 {
1525         int page_num;
1526         int ret = 0;
1527
1528         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1529                 int ret_sub;
1530
1531                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1532                                                            sblock_good,
1533                                                            page_num, 1);
1534                 if (ret_sub)
1535                         ret = ret_sub;
1536         }
1537
1538         return ret;
1539 }
1540
1541 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1542                                             struct scrub_block *sblock_good,
1543                                             int page_num, int force_write)
1544 {
1545         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1546         struct scrub_page *page_good = sblock_good->pagev[page_num];
1547         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1548
1549         BUG_ON(page_bad->page == NULL);
1550         BUG_ON(page_good->page == NULL);
1551         if (force_write || sblock_bad->header_error ||
1552             sblock_bad->checksum_error || page_bad->io_error) {
1553                 struct bio *bio;
1554                 int ret;
1555
1556                 if (!page_bad->dev->bdev) {
1557                         btrfs_warn_rl(fs_info,
1558                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1559                         return -EIO;
1560                 }
1561
1562                 bio = btrfs_io_bio_alloc(1);
1563                 bio_set_dev(bio, page_bad->dev->bdev);
1564                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1565                 bio->bi_opf = REQ_OP_WRITE;
1566
1567                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1568                 if (PAGE_SIZE != ret) {
1569                         bio_put(bio);
1570                         return -EIO;
1571                 }
1572
1573                 if (btrfsic_submit_bio_wait(bio)) {
1574                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1575                                 BTRFS_DEV_STAT_WRITE_ERRS);
1576                         btrfs_dev_replace_stats_inc(
1577                                 &fs_info->dev_replace.num_write_errors);
1578                         bio_put(bio);
1579                         return -EIO;
1580                 }
1581                 bio_put(bio);
1582         }
1583
1584         return 0;
1585 }
1586
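/*
 * Queue all pages of a block for writing to the dev-replace target
 * device. Blocks that only exist for the parity check of a RAID5/6
 * stripe are skipped.
 */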
1587 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1588 {
1589         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1590         int page_num;
1591
1592         /*
1593          * This block is used to check the parity on the source device,
1594          * so the data need not be written to the destination device.
1595          */
1596         if (sblock->sparity)
1597                 return;
1598
1599         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1600                 int ret;
1601
1602                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1603                 if (ret)
1604                         btrfs_dev_replace_stats_inc(
1605                                 &fs_info->dev_replace.num_write_errors);
1606         }
1607 }
1608
1609 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1610                                            int page_num)
1611 {
1612         struct scrub_page *spage = sblock->pagev[page_num];
1613
1614         BUG_ON(spage->page == NULL);
1615         if (spage->io_error) {
1616                 void *mapped_buffer = kmap_atomic(spage->page);
1617
1618                 clear_page(mapped_buffer);
1619                 flush_dcache_page(spage->page);
1620                 kunmap_atomic(mapped_buffer);
1621         }
1622         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1623 }
1624
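/*
 * Add a page to the write bio that is being built for the dev-replace
 * target. The bio is submitted when it is full or when the next page
 * is not physically/logically contiguous with the pages collected so
 * far.
 */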
1625 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1626                                     struct scrub_page *spage)
1627 {
1628         struct scrub_bio *sbio;
1629         int ret;
1630
1631         mutex_lock(&sctx->wr_lock);
1632 again:
1633         if (!sctx->wr_curr_bio) {
1634                 unsigned int nofs_flag;
1635
1636                 /*
1637                  * We must use GFP_NOFS because the scrub task might be waiting
1638                  * for a worker task executing this function and in turn a
1639                  * transaction commit might be waiting for the scrub task to pause
1640                  * (which needs to wait for all the worker tasks to complete
1641                  * before pausing).
1642                  */
1643                 nofs_flag = memalloc_nofs_save();
1644                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1645                                               GFP_KERNEL);
1646                 memalloc_nofs_restore(nofs_flag);
1647                 if (!sctx->wr_curr_bio) {
1648                         mutex_unlock(&sctx->wr_lock);
1649                         return -ENOMEM;
1650                 }
1651                 sctx->wr_curr_bio->sctx = sctx;
1652                 sctx->wr_curr_bio->page_count = 0;
1653         }
1654         sbio = sctx->wr_curr_bio;
1655         if (sbio->page_count == 0) {
1656                 struct bio *bio;
1657
1658                 sbio->physical = spage->physical_for_dev_replace;
1659                 sbio->logical = spage->logical;
1660                 sbio->dev = sctx->wr_tgtdev;
1661                 bio = sbio->bio;
1662                 if (!bio) {
1663                         bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1664                         sbio->bio = bio;
1665                 }
1666
1667                 bio->bi_private = sbio;
1668                 bio->bi_end_io = scrub_wr_bio_end_io;
1669                 bio_set_dev(bio, sbio->dev->bdev);
1670                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1671                 bio->bi_opf = REQ_OP_WRITE;
1672                 sbio->status = 0;
1673         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1674                    spage->physical_for_dev_replace ||
1675                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1676                    spage->logical) {
1677                 scrub_wr_submit(sctx);
1678                 goto again;
1679         }
1680
1681         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1682         if (ret != PAGE_SIZE) {
1683                 if (sbio->page_count < 1) {
1684                         bio_put(sbio->bio);
1685                         sbio->bio = NULL;
1686                         mutex_unlock(&sctx->wr_lock);
1687                         return -EIO;
1688                 }
1689                 scrub_wr_submit(sctx);
1690                 goto again;
1691         }
1692
1693         sbio->pagev[sbio->page_count] = spage;
1694         scrub_page_get(spage);
1695         sbio->page_count++;
1696         if (sbio->page_count == sctx->pages_per_wr_bio)
1697                 scrub_wr_submit(sctx);
1698         mutex_unlock(&sctx->wr_lock);
1699
1700         return 0;
1701 }
1702
1703 static void scrub_wr_submit(struct scrub_ctx *sctx)
1704 {
1705         struct scrub_bio *sbio;
1706
1707         if (!sctx->wr_curr_bio)
1708                 return;
1709
1710         sbio = sctx->wr_curr_bio;
1711         sctx->wr_curr_bio = NULL;
1712         WARN_ON(!sbio->bio->bi_disk);
1713         scrub_pending_bio_inc(sctx);
1714         /* Process all writes in a single worker thread so that the block
1715          * layer can order the requests before sending them to the driver;
1716          * this doubled the write performance on spinning disks when
1717          * measured with Linux 3.5. */
1718         btrfsic_submit_bio(sbio->bio);
1719 }
1720
1721 static void scrub_wr_bio_end_io(struct bio *bio)
1722 {
1723         struct scrub_bio *sbio = bio->bi_private;
1724         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1725
1726         sbio->status = bio->bi_status;
1727         sbio->bio = bio;
1728
1729         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1730                          scrub_wr_bio_end_io_worker, NULL, NULL);
1731         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1732 }
1733
1734 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1735 {
1736         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1737         struct scrub_ctx *sctx = sbio->sctx;
1738         int i;
1739
1740         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1741         if (sbio->status) {
1742                 struct btrfs_dev_replace *dev_replace =
1743                         &sbio->sctx->fs_info->dev_replace;
1744
1745                 for (i = 0; i < sbio->page_count; i++) {
1746                         struct scrub_page *spage = sbio->pagev[i];
1747
1748                         spage->io_error = 1;
1749                         btrfs_dev_replace_stats_inc(&dev_replace->
1750                                                     num_write_errors);
1751                 }
1752         }
1753
1754         for (i = 0; i < sbio->page_count; i++)
1755                 scrub_page_put(sbio->pagev[i]);
1756
1757         bio_put(sbio->bio);
1758         kfree(sbio);
1759         scrub_pending_bio_dec(sctx);
1760 }
1761
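/*
 * Verify the checksum of a block according to its extent flags (data,
 * tree block or super block) and start the error handling if the
 * block turned out to be corrupted.
 */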
1762 static int scrub_checksum(struct scrub_block *sblock)
1763 {
1764         u64 flags;
1765         int ret;
1766
1767         /*
1768          * No need to initialize these stats currently,
1769          * because this function only uses the return value
1770          * instead of these stats values.
1771          *
1772          * Todo:
1773          * always use the stats
1774          */
1775         sblock->header_error = 0;
1776         sblock->generation_error = 0;
1777         sblock->checksum_error = 0;
1778
1779         WARN_ON(sblock->page_count < 1);
1780         flags = sblock->pagev[0]->flags;
1781         ret = 0;
1782         if (flags & BTRFS_EXTENT_FLAG_DATA)
1783                 ret = scrub_checksum_data(sblock);
1784         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1785                 ret = scrub_checksum_tree_block(sblock);
1786         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1787                 (void)scrub_checksum_super(sblock);
1788         else
1789                 WARN_ON(1);
1790         if (ret)
1791                 scrub_handle_errored_block(sblock);
1792
1793         return ret;
1794 }
1795
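/*
 * Check a data block against the checksum that was recorded when the
 * block was queued (pagev[0]->csum). Sets and returns
 * sblock->checksum_error on a mismatch, 0 otherwise.
 */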
1796 static int scrub_checksum_data(struct scrub_block *sblock)
1797 {
1798         struct scrub_ctx *sctx = sblock->sctx;
1799         u8 csum[BTRFS_CSUM_SIZE];
1800         u8 *on_disk_csum;
1801         struct page *page;
1802         void *buffer;
1803         u32 crc = ~(u32)0;
1804         u64 len;
1805         int index;
1806
1807         BUG_ON(sblock->page_count < 1);
1808         if (!sblock->pagev[0]->have_csum)
1809                 return 0;
1810
1811         on_disk_csum = sblock->pagev[0]->csum;
1812         page = sblock->pagev[0]->page;
1813         buffer = kmap_atomic(page);
1814
1815         len = sctx->fs_info->sectorsize;
1816         index = 0;
1817         for (;;) {
1818                 u64 l = min_t(u64, len, PAGE_SIZE);
1819
1820                 crc = btrfs_csum_data(buffer, crc, l);
1821                 kunmap_atomic(buffer);
1822                 len -= l;
1823                 if (len == 0)
1824                         break;
1825                 index++;
1826                 BUG_ON(index >= sblock->page_count);
1827                 BUG_ON(!sblock->pagev[index]->page);
1828                 page = sblock->pagev[index]->page;
1829                 buffer = kmap_atomic(page);
1830         }
1831
1832         btrfs_csum_final(crc, csum);
1833         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1834                 sblock->checksum_error = 1;
1835
1836         return sblock->checksum_error;
1837 }
1838
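/*
 * Check a metadata block: the bytenr, generation, fsid and chunk tree
 * uuid in the header are compared against the expected values and the
 * checksum over the node (excluding the stored csum) is recomputed.
 * Returns non-zero if a header or checksum error was detected.
 */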
1839 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1840 {
1841         struct scrub_ctx *sctx = sblock->sctx;
1842         struct btrfs_header *h;
1843         struct btrfs_fs_info *fs_info = sctx->fs_info;
1844         u8 calculated_csum[BTRFS_CSUM_SIZE];
1845         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1846         struct page *page;
1847         void *mapped_buffer;
1848         u64 mapped_size;
1849         void *p;
1850         u32 crc = ~(u32)0;
1851         u64 len;
1852         int index;
1853
1854         BUG_ON(sblock->page_count < 1);
1855         page = sblock->pagev[0]->page;
1856         mapped_buffer = kmap_atomic(page);
1857         h = (struct btrfs_header *)mapped_buffer;
1858         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1859
1860         /*
1861          * we don't use the getter functions here, as we
1862          * a) don't have an extent buffer and
1863          * b) the page is already kmapped
1864          */
1865         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1866                 sblock->header_error = 1;
1867
1868         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
1869                 sblock->header_error = 1;
1870                 sblock->generation_error = 1;
1871         }
1872
1873         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1874                 sblock->header_error = 1;
1875
1876         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1877                    BTRFS_UUID_SIZE))
1878                 sblock->header_error = 1;
1879
1880         len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
1881         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1882         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1883         index = 0;
1884         for (;;) {
1885                 u64 l = min_t(u64, len, mapped_size);
1886
1887                 crc = btrfs_csum_data(p, crc, l);
1888                 kunmap_atomic(mapped_buffer);
1889                 len -= l;
1890                 if (len == 0)
1891                         break;
1892                 index++;
1893                 BUG_ON(index >= sblock->page_count);
1894                 BUG_ON(!sblock->pagev[index]->page);
1895                 page = sblock->pagev[index]->page;
1896                 mapped_buffer = kmap_atomic(page);
1897                 mapped_size = PAGE_SIZE;
1898                 p = mapped_buffer;
1899         }
1900
1901         btrfs_csum_final(crc, calculated_csum);
1902         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1903                 sblock->checksum_error = 1;
1904
1905         return sblock->header_error || sblock->checksum_error;
1906 }
1907
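/*
 * Check a super block copy. Detected corruption or generation errors
 * are only reflected in the statistics and device error counters;
 * nothing is repaired here, since super blocks are rewritten by the
 * next transaction commit anyway.
 */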
1908 static int scrub_checksum_super(struct scrub_block *sblock)
1909 {
1910         struct btrfs_super_block *s;
1911         struct scrub_ctx *sctx = sblock->sctx;
1912         u8 calculated_csum[BTRFS_CSUM_SIZE];
1913         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1914         struct page *page;
1915         void *mapped_buffer;
1916         u64 mapped_size;
1917         void *p;
1918         u32 crc = ~(u32)0;
1919         int fail_gen = 0;
1920         int fail_cor = 0;
1921         u64 len;
1922         int index;
1923
1924         BUG_ON(sblock->page_count < 1);
1925         page = sblock->pagev[0]->page;
1926         mapped_buffer = kmap_atomic(page);
1927         s = (struct btrfs_super_block *)mapped_buffer;
1928         memcpy(on_disk_csum, s->csum, sctx->csum_size);
1929
1930         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1931                 ++fail_cor;
1932
1933         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1934                 ++fail_gen;
1935
1936         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1937                 ++fail_cor;
1938
1939         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1940         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1941         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1942         index = 0;
1943         for (;;) {
1944                 u64 l = min_t(u64, len, mapped_size);
1945
1946                 crc = btrfs_csum_data(p, crc, l);
1947                 kunmap_atomic(mapped_buffer);
1948                 len -= l;
1949                 if (len == 0)
1950                         break;
1951                 index++;
1952                 BUG_ON(index >= sblock->page_count);
1953                 BUG_ON(!sblock->pagev[index]->page);
1954                 page = sblock->pagev[index]->page;
1955                 mapped_buffer = kmap_atomic(page);
1956                 mapped_size = PAGE_SIZE;
1957                 p = mapped_buffer;
1958         }
1959
1960         btrfs_csum_final(crc, calculated_csum);
1961         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1962                 ++fail_cor;
1963
1964         if (fail_cor + fail_gen) {
1965                 /*
1966                  * If we find an error in a super block, we just report it.
1967                  * Super blocks get rewritten with the next transaction
1968                  * commit anyway.
1969                  */
1970                 spin_lock(&sctx->stat_lock);
1971                 ++sctx->stat.super_errors;
1972                 spin_unlock(&sctx->stat_lock);
1973                 if (fail_cor)
1974                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1975                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1976                 else
1977                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1978                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1979         }
1980
1981         return fail_cor + fail_gen;
1982 }
1983
1984 static void scrub_block_get(struct scrub_block *sblock)
1985 {
1986         refcount_inc(&sblock->refs);
1987 }
1988
1989 static void scrub_block_put(struct scrub_block *sblock)
1990 {
1991         if (refcount_dec_and_test(&sblock->refs)) {
1992                 int i;
1993
1994                 if (sblock->sparity)
1995                         scrub_parity_put(sblock->sparity);
1996
1997                 for (i = 0; i < sblock->page_count; i++)
1998                         scrub_page_put(sblock->pagev[i]);
1999                 kfree(sblock);
2000         }
2001 }
2002
2003 static void scrub_page_get(struct scrub_page *spage)
2004 {
2005         atomic_inc(&spage->refs);
2006 }
2007
2008 static void scrub_page_put(struct scrub_page *spage)
2009 {
2010         if (atomic_dec_and_test(&spage->refs)) {
2011                 if (spage->page)
2012                         __free_page(spage->page);
2013                 kfree(spage);
2014         }
2015 }
2016
2017 static void scrub_submit(struct scrub_ctx *sctx)
2018 {
2019         struct scrub_bio *sbio;
2020
2021         if (sctx->curr == -1)
2022                 return;
2023
2024         sbio = sctx->bios[sctx->curr];
2025         sctx->curr = -1;
2026         scrub_pending_bio_inc(sctx);
2027         btrfsic_submit_bio(sbio->bio);
2028 }
2029
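/*
 * Add a page to the current read bio, waiting for a free scrub_bio if
 * all of them are in flight. The bio is submitted once it is full or
 * when the next page is not contiguous (physically, logically or by
 * device) with the pages already collected.
 */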
2030 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2031                                     struct scrub_page *spage)
2032 {
2033         struct scrub_block *sblock = spage->sblock;
2034         struct scrub_bio *sbio;
2035         int ret;
2036
2037 again:
2038         /*
2039          * grab a fresh bio or wait for one to become available
2040          */
2041         while (sctx->curr == -1) {
2042                 spin_lock(&sctx->list_lock);
2043                 sctx->curr = sctx->first_free;
2044                 if (sctx->curr != -1) {
2045                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2046                         sctx->bios[sctx->curr]->next_free = -1;
2047                         sctx->bios[sctx->curr]->page_count = 0;
2048                         spin_unlock(&sctx->list_lock);
2049                 } else {
2050                         spin_unlock(&sctx->list_lock);
2051                         wait_event(sctx->list_wait, sctx->first_free != -1);
2052                 }
2053         }
2054         sbio = sctx->bios[sctx->curr];
2055         if (sbio->page_count == 0) {
2056                 struct bio *bio;
2057
2058                 sbio->physical = spage->physical;
2059                 sbio->logical = spage->logical;
2060                 sbio->dev = spage->dev;
2061                 bio = sbio->bio;
2062                 if (!bio) {
2063                         bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2064                         sbio->bio = bio;
2065                 }
2066
2067                 bio->bi_private = sbio;
2068                 bio->bi_end_io = scrub_bio_end_io;
2069                 bio_set_dev(bio, sbio->dev->bdev);
2070                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2071                 bio->bi_opf = REQ_OP_READ;
2072                 sbio->status = 0;
2073         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2074                    spage->physical ||
2075                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2076                    spage->logical ||
2077                    sbio->dev != spage->dev) {
2078                 scrub_submit(sctx);
2079                 goto again;
2080         }
2081
2082         sbio->pagev[sbio->page_count] = spage;
2083         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2084         if (ret != PAGE_SIZE) {
2085                 if (sbio->page_count < 1) {
2086                         bio_put(sbio->bio);
2087                         sbio->bio = NULL;
2088                         return -EIO;
2089                 }
2090                 scrub_submit(sctx);
2091                 goto again;
2092         }
2093
2094         scrub_block_get(sblock); /* one for the page added to the bio */
2095         atomic_inc(&sblock->outstanding_pages);
2096         sbio->page_count++;
2097         if (sbio->page_count == sctx->pages_per_rd_bio)
2098                 scrub_submit(sctx);
2099
2100         return 0;
2101 }
2102
2103 static void scrub_missing_raid56_end_io(struct bio *bio)
2104 {
2105         struct scrub_block *sblock = bio->bi_private;
2106         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2107
2108         if (bio->bi_status)
2109                 sblock->no_io_error_seen = 0;
2110
2111         bio_put(bio);
2112
2113         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2114 }
2115
2116 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2117 {
2118         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2119         struct scrub_ctx *sctx = sblock->sctx;
2120         struct btrfs_fs_info *fs_info = sctx->fs_info;
2121         u64 logical;
2122         struct btrfs_device *dev;
2123
2124         logical = sblock->pagev[0]->logical;
2125         dev = sblock->pagev[0]->dev;
2126
2127         if (sblock->no_io_error_seen)
2128                 scrub_recheck_block_checksum(sblock);
2129
2130         if (!sblock->no_io_error_seen) {
2131                 spin_lock(&sctx->stat_lock);
2132                 sctx->stat.read_errors++;
2133                 spin_unlock(&sctx->stat_lock);
2134                 btrfs_err_rl_in_rcu(fs_info,
2135                         "IO error rebuilding logical %llu for dev %s",
2136                         logical, rcu_str_deref(dev->name));
2137         } else if (sblock->header_error || sblock->checksum_error) {
2138                 spin_lock(&sctx->stat_lock);
2139                 sctx->stat.uncorrectable_errors++;
2140                 spin_unlock(&sctx->stat_lock);
2141                 btrfs_err_rl_in_rcu(fs_info,
2142                         "failed to rebuild valid logical %llu for dev %s",
2143                         logical, rcu_str_deref(dev->name));
2144         } else {
2145                 scrub_write_block_to_dev_replace(sblock);
2146         }
2147
2148         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2149                 mutex_lock(&sctx->wr_lock);
2150                 scrub_wr_submit(sctx);
2151                 mutex_unlock(&sctx->wr_lock);
2152         }
2153
2154         scrub_block_put(sblock);
2155         scrub_pending_bio_dec(sctx);
2156 }
2157
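/*
 * Rebuild the pages of a block that sits on a missing device via the
 * RAID5/6 rebuild code. Checksums are verified and the dev-replace
 * write is done later in scrub_missing_raid56_worker().
 */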
2158 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2159 {
2160         struct scrub_ctx *sctx = sblock->sctx;
2161         struct btrfs_fs_info *fs_info = sctx->fs_info;
2162         u64 length = sblock->page_count * PAGE_SIZE;
2163         u64 logical = sblock->pagev[0]->logical;
2164         struct btrfs_bio *bbio = NULL;
2165         struct bio *bio;
2166         struct btrfs_raid_bio *rbio;
2167         int ret;
2168         int i;
2169
2170         btrfs_bio_counter_inc_blocked(fs_info);
2171         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2172                         &length, &bbio);
2173         if (ret || !bbio || !bbio->raid_map)
2174                 goto bbio_out;
2175
2176         if (WARN_ON(!sctx->is_dev_replace ||
2177                     !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2178                 /*
2179                  * We shouldn't be scrubbing a missing device. Even for dev
2180                  * replace, we should only get here for RAID 5/6. We either
2181                  * managed to mount something with no mirrors remaining or
2182                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2183                  */
2184                 goto bbio_out;
2185         }
2186
2187         bio = btrfs_io_bio_alloc(0);
2188         bio->bi_iter.bi_sector = logical >> 9;
2189         bio->bi_private = sblock;
2190         bio->bi_end_io = scrub_missing_raid56_end_io;
2191
2192         rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2193         if (!rbio)
2194                 goto rbio_out;
2195
2196         for (i = 0; i < sblock->page_count; i++) {
2197                 struct scrub_page *spage = sblock->pagev[i];
2198
2199                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2200         }
2201
2202         btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2203                         scrub_missing_raid56_worker, NULL, NULL);
2204         scrub_block_get(sblock);
2205         scrub_pending_bio_inc(sctx);
2206         raid56_submit_missing_rbio(rbio);
2207         return;
2208
2209 rbio_out:
2210         bio_put(bio);
2211 bbio_out:
2212         btrfs_bio_counter_dec(fs_info);
2213         btrfs_put_bbio(bbio);
2214         spin_lock(&sctx->stat_lock);
2215         sctx->stat.malloc_errors++;
2216         spin_unlock(&sctx->stat_lock);
2217 }
2218
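/*
 * Split the range [logical, logical + len) into pages, collect them in
 * a new scrub_block and queue each page for reading, or hand the whole
 * block to the RAID5/6 rebuild path if the device is missing.
 */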
2219 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2220                        u64 physical, struct btrfs_device *dev, u64 flags,
2221                        u64 gen, int mirror_num, u8 *csum, int force,
2222                        u64 physical_for_dev_replace)
2223 {
2224         struct scrub_block *sblock;
2225         int index;
2226
2227         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2228         if (!sblock) {
2229                 spin_lock(&sctx->stat_lock);
2230                 sctx->stat.malloc_errors++;
2231                 spin_unlock(&sctx->stat_lock);
2232                 return -ENOMEM;
2233         }
2234
2235         /* one ref inside this function, plus one for each page added to
2236          * a bio later on */
2237         refcount_set(&sblock->refs, 1);
2238         sblock->sctx = sctx;
2239         sblock->no_io_error_seen = 1;
2240
2241         for (index = 0; len > 0; index++) {
2242                 struct scrub_page *spage;
2243                 u64 l = min_t(u64, len, PAGE_SIZE);
2244
2245                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2246                 if (!spage) {
2247 leave_nomem:
2248                         spin_lock(&sctx->stat_lock);
2249                         sctx->stat.malloc_errors++;
2250                         spin_unlock(&sctx->stat_lock);
2251                         scrub_block_put(sblock);
2252                         return -ENOMEM;
2253                 }
2254                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2255                 scrub_page_get(spage);
2256                 sblock->pagev[index] = spage;
2257                 spage->sblock = sblock;
2258                 spage->dev = dev;
2259                 spage->flags = flags;
2260                 spage->generation = gen;
2261                 spage->logical = logical;
2262                 spage->physical = physical;
2263                 spage->physical_for_dev_replace = physical_for_dev_replace;
2264                 spage->mirror_num = mirror_num;
2265                 if (csum) {
2266                         spage->have_csum = 1;
2267                         memcpy(spage->csum, csum, sctx->csum_size);
2268                 } else {
2269                         spage->have_csum = 0;
2270                 }
2271                 sblock->page_count++;
2272                 spage->page = alloc_page(GFP_KERNEL);
2273                 if (!spage->page)
2274                         goto leave_nomem;
2275                 len -= l;
2276                 logical += l;
2277                 physical += l;
2278                 physical_for_dev_replace += l;
2279         }
2280
2281         WARN_ON(sblock->page_count == 0);
2282         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2283                 /*
2284                  * This case should only be hit for RAID 5/6 device replace. See
2285                  * the comment in scrub_missing_raid56_pages() for details.
2286                  */
2287                 scrub_missing_raid56_pages(sblock);
2288         } else {
2289                 for (index = 0; index < sblock->page_count; index++) {
2290                         struct scrub_page *spage = sblock->pagev[index];
2291                         int ret;
2292
2293                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2294                         if (ret) {
2295                                 scrub_block_put(sblock);
2296                                 return ret;
2297                         }
2298                 }
2299
2300                 if (force)
2301                         scrub_submit(sctx);
2302         }
2303
2304         /* last one frees, either here or in bio completion for last page */
2305         scrub_block_put(sblock);
2306         return 0;
2307 }
2308
2309 static void scrub_bio_end_io(struct bio *bio)
2310 {
2311         struct scrub_bio *sbio = bio->bi_private;
2312         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2313
2314         sbio->status = bio->bi_status;
2315         sbio->bio = bio;
2316
2317         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2318 }
2319
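/*
 * Worker that runs after a read bio has completed: propagate I/O
 * errors to the affected pages, complete the scrub_blocks whose last
 * outstanding page just finished and put the scrub_bio back on the
 * free list.
 */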
2320 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2321 {
2322         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2323         struct scrub_ctx *sctx = sbio->sctx;
2324         int i;
2325
2326         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2327         if (sbio->status) {
2328                 for (i = 0; i < sbio->page_count; i++) {
2329                         struct scrub_page *spage = sbio->pagev[i];
2330
2331                         spage->io_error = 1;
2332                         spage->sblock->no_io_error_seen = 0;
2333                 }
2334         }
2335
2336         /* now complete the scrub_block items that have all pages completed */
2337         for (i = 0; i < sbio->page_count; i++) {
2338                 struct scrub_page *spage = sbio->pagev[i];
2339                 struct scrub_block *sblock = spage->sblock;
2340
2341                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2342                         scrub_block_complete(sblock);
2343                 scrub_block_put(sblock);
2344         }
2345
2346         bio_put(sbio->bio);
2347         sbio->bio = NULL;
2348         spin_lock(&sctx->list_lock);
2349         sbio->next_free = sctx->first_free;
2350         sctx->first_free = sbio->index;
2351         spin_unlock(&sctx->list_lock);
2352
2353         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2354                 mutex_lock(&sctx->wr_lock);
2355                 scrub_wr_submit(sctx);
2356                 mutex_unlock(&sctx->wr_lock);
2357         }
2358
2359         scrub_pending_bio_dec(sctx);
2360 }
2361
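/*
 * Mark the sectors covered by [start, start + len) in the given
 * per-stripe bitmap, wrapping around at the end of the stripe if
 * necessary.
 */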
2362 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2363                                        unsigned long *bitmap,
2364                                        u64 start, u64 len)
2365 {
2366         u64 offset;
2367         u64 nsectors64;
2368         u32 nsectors;
2369         int sectorsize = sparity->sctx->fs_info->sectorsize;
2370
2371         if (len >= sparity->stripe_len) {
2372                 bitmap_set(bitmap, 0, sparity->nsectors);
2373                 return;
2374         }
2375
2376         start -= sparity->logic_start;
2377         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2378         offset = div_u64(offset, sectorsize);
2379         nsectors64 = div_u64(len, sectorsize);
2380
2381         ASSERT(nsectors64 < UINT_MAX);
2382         nsectors = (u32)nsectors64;
2383
2384         if (offset + nsectors <= sparity->nsectors) {
2385                 bitmap_set(bitmap, offset, nsectors);
2386                 return;
2387         }
2388
2389         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2390         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2391 }
2392
2393 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2394                                                    u64 start, u64 len)
2395 {
2396         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2397 }
2398
2399 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2400                                                   u64 start, u64 len)
2401 {
2402         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2403 }
2404
2405 static void scrub_block_complete(struct scrub_block *sblock)
2406 {
2407         int corrupted = 0;
2408
2409         if (!sblock->no_io_error_seen) {
2410                 corrupted = 1;
2411                 scrub_handle_errored_block(sblock);
2412         } else {
2413                 /*
2414                  * If the block has a checksum error, it is written via the
2415                  * repair mechanism in the dev-replace case; otherwise it is
2416                  * written here in the dev-replace case.
2417                  */
2418                 corrupted = scrub_checksum(sblock);
2419                 if (!corrupted && sblock->sctx->is_dev_replace)
2420                         scrub_write_block_to_dev_replace(sblock);
2421         }
2422
2423         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2424                 u64 start = sblock->pagev[0]->logical;
2425                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2426                           PAGE_SIZE;
2427
2428                 scrub_parity_mark_sectors_error(sblock->sparity,
2429                                                 start, end - start);
2430         }
2431 }
2432
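/*
 * Look up the data checksum for @logical in the list of checksums that
 * was prefetched from the csum tree. Entries that lie entirely before
 * the current position are dropped. Returns 1 and copies the checksum
 * into @csum if one was found, 0 otherwise.
 */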
2433 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2434 {
2435         struct btrfs_ordered_sum *sum = NULL;
2436         unsigned long index;
2437         unsigned long num_sectors;
2438
2439         while (!list_empty(&sctx->csum_list)) {
2440                 sum = list_first_entry(&sctx->csum_list,
2441                                        struct btrfs_ordered_sum, list);
2442                 if (sum->bytenr > logical)
2443                         return 0;
2444                 if (sum->bytenr + sum->len > logical)
2445                         break;
2446
2447                 ++sctx->stat.csum_discards;
2448                 list_del(&sum->list);
2449                 kfree(sum);
2450                 sum = NULL;
2451         }
2452         if (!sum)
2453                 return 0;
2454
2455         index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2456         ASSERT(index < UINT_MAX);
2457
2458         num_sectors = sum->len / sctx->fs_info->sectorsize;
2459         memcpy(csum, sum->sums + index, sctx->csum_size);
2460         if (index == num_sectors - 1) {
2461                 list_del(&sum->list);
2462                 kfree(sum);
2463         }
2464         return 1;
2465 }
2466
2467 /* scrub extent tries to collect up to 64 kB for each bio */
2468 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2469                         u64 logical, u64 len,
2470                         u64 physical, struct btrfs_device *dev, u64 flags,
2471                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2472 {
2473         int ret;
2474         u8 csum[BTRFS_CSUM_SIZE];
2475         u32 blocksize;
2476
2477         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2478                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2479                         blocksize = map->stripe_len;
2480                 else
2481                         blocksize = sctx->fs_info->sectorsize;
2482                 spin_lock(&sctx->stat_lock);
2483                 sctx->stat.data_extents_scrubbed++;
2484                 sctx->stat.data_bytes_scrubbed += len;
2485                 spin_unlock(&sctx->stat_lock);
2486         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2487                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2488                         blocksize = map->stripe_len;
2489                 else
2490                         blocksize = sctx->fs_info->nodesize;
2491                 spin_lock(&sctx->stat_lock);
2492                 sctx->stat.tree_extents_scrubbed++;
2493                 sctx->stat.tree_bytes_scrubbed += len;
2494                 spin_unlock(&sctx->stat_lock);
2495         } else {
2496                 blocksize = sctx->fs_info->sectorsize;
2497                 WARN_ON(1);
2498         }
2499
2500         while (len) {
2501                 u64 l = min_t(u64, len, blocksize);
2502                 int have_csum = 0;
2503
2504                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2505                         /* push csums to sbio */
2506                         have_csum = scrub_find_csum(sctx, logical, csum);
2507                         if (have_csum == 0)
2508                                 ++sctx->stat.no_csum;
2509                 }
2510                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2511                                   mirror_num, have_csum ? csum : NULL, 0,
2512                                   physical_for_dev_replace);
2513                 if (ret)
2514                         return ret;
2515                 len -= l;
2516                 logical += l;
2517                 physical += l;
2518                 physical_for_dev_replace += l;
2519         }
2520         return 0;
2521 }
2522
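/*
 * Like scrub_pages(), but the pages are additionally linked into the
 * scrub_parity structure so that the parity of the stripe can be
 * checked once all of its data blocks have been verified.
 */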
2523 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2524                                   u64 logical, u64 len,
2525                                   u64 physical, struct btrfs_device *dev,
2526                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2527 {
2528         struct scrub_ctx *sctx = sparity->sctx;
2529         struct scrub_block *sblock;
2530         int index;
2531
2532         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2533         if (!sblock) {
2534                 spin_lock(&sctx->stat_lock);
2535                 sctx->stat.malloc_errors++;
2536                 spin_unlock(&sctx->stat_lock);
2537                 return -ENOMEM;
2538         }
2539
2540         /* one ref inside this function, plus one for each page added to
2541          * a bio later on */
2542         refcount_set(&sblock->refs, 1);
2543         sblock->sctx = sctx;
2544         sblock->no_io_error_seen = 1;
2545         sblock->sparity = sparity;
2546         scrub_parity_get(sparity);
2547
2548         for (index = 0; len > 0; index++) {
2549                 struct scrub_page *spage;
2550                 u64 l = min_t(u64, len, PAGE_SIZE);
2551
2552                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2553                 if (!spage) {
2554 leave_nomem:
2555                         spin_lock(&sctx->stat_lock);
2556                         sctx->stat.malloc_errors++;
2557                         spin_unlock(&sctx->stat_lock);
2558                         scrub_block_put(sblock);
2559                         return -ENOMEM;
2560                 }
2561                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2562                 /* For scrub block */
2563                 scrub_page_get(spage);
2564                 sblock->pagev[index] = spage;
2565                 /* For scrub parity */
2566                 scrub_page_get(spage);
2567                 list_add_tail(&spage->list, &sparity->spages);
2568                 spage->sblock = sblock;
2569                 spage->dev = dev;
2570                 spage->flags = flags;
2571                 spage->generation = gen;
2572                 spage->logical = logical;
2573                 spage->physical = physical;
2574                 spage->mirror_num = mirror_num;
2575                 if (csum) {
2576                         spage->have_csum = 1;
2577                         memcpy(spage->csum, csum, sctx->csum_size);
2578                 } else {
2579                         spage->have_csum = 0;
2580                 }
2581                 sblock->page_count++;
2582                 spage->page = alloc_page(GFP_KERNEL);
2583                 if (!spage->page)
2584                         goto leave_nomem;
2585                 len -= l;
2586                 logical += l;
2587                 physical += l;
2588         }
2589
2590         WARN_ON(sblock->page_count == 0);
2591         for (index = 0; index < sblock->page_count; index++) {
2592                 struct scrub_page *spage = sblock->pagev[index];
2593                 int ret;
2594
2595                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2596                 if (ret) {
2597                         scrub_block_put(sblock);
2598                         return ret;
2599                 }
2600         }
2601
2602         /* last one frees, either here or in bio completion for last page */
2603         scrub_block_put(sblock);
2604         return 0;
2605 }
2606
2607 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2608                                    u64 logical, u64 len,
2609                                    u64 physical, struct btrfs_device *dev,
2610                                    u64 flags, u64 gen, int mirror_num)
2611 {
2612         struct scrub_ctx *sctx = sparity->sctx;
2613         int ret;
2614         u8 csum[BTRFS_CSUM_SIZE];
2615         u32 blocksize;
2616
2617         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2618                 scrub_parity_mark_sectors_error(sparity, logical, len);
2619                 return 0;
2620         }
2621
2622         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2623                 blocksize = sparity->stripe_len;
2624         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2625                 blocksize = sparity->stripe_len;
2626         } else {
2627                 blocksize = sctx->fs_info->sectorsize;
2628                 WARN_ON(1);
2629         }
2630
2631         while (len) {
2632                 u64 l = min_t(u64, len, blocksize);
2633                 int have_csum = 0;
2634
2635                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2636                         /* push csums to sbio */
2637                         have_csum = scrub_find_csum(sctx, logical, csum);
2638                         if (have_csum == 0)
2639                                 goto skip;
2640                 }
2641                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2642                                              flags, gen, mirror_num,
2643                                              have_csum ? csum : NULL);
2644                 if (ret)
2645                         return ret;
2646 skip:
2647                 len -= l;
2648                 logical += l;
2649                 physical += l;
2650         }
2651         return 0;
2652 }
2653
2654 /*
2655  * Given a physical address, this calculates its
2656  * logical offset. If this is a parity stripe, it returns
2657  * the leftmost data stripe's logical offset.
2658  *
2659  * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2660  */
2661 static int get_raid56_logic_offset(u64 physical, int num,
2662                                    struct map_lookup *map, u64 *offset,
2663                                    u64 *stripe_start)
2664 {
2665         int i;
2666         int j = 0;
2667         u64 stripe_nr;
2668         u64 last_offset;
2669         u32 stripe_index;
2670         u32 rot;
2671
2672         last_offset = (physical - map->stripes[num].physical) *
2673                       nr_data_stripes(map);
2674         if (stripe_start)
2675                 *stripe_start = last_offset;
2676
2677         *offset = last_offset;
2678         for (i = 0; i < nr_data_stripes(map); i++) {
2679                 *offset = last_offset + i * map->stripe_len;
2680
2681                 stripe_nr = div64_u64(*offset, map->stripe_len);
2682                 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2683
2684                 /* Work out the disk rotation on this stripe-set */
2685                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2686                 /* calculate which stripe this data is located on */
2687                 rot += i;
2688                 stripe_index = rot % map->num_stripes;
2689                 if (stripe_index == num)
2690                         return 0;
2691                 if (stripe_index < num)
2692                         j++;
2693         }
2694         *offset = last_offset + j * map->stripe_len;
2695         return 1;
2696 }
2697
2698 static void scrub_free_parity(struct scrub_parity *sparity)
2699 {
2700         struct scrub_ctx *sctx = sparity->sctx;
2701         struct scrub_page *curr, *next;
2702         int nbits;
2703
2704         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2705         if (nbits) {
2706                 spin_lock(&sctx->stat_lock);
2707                 sctx->stat.read_errors += nbits;
2708                 sctx->stat.uncorrectable_errors += nbits;
2709                 spin_unlock(&sctx->stat_lock);
2710         }
2711
2712         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2713                 list_del_init(&curr->list);
2714                 scrub_page_put(curr);
2715         }
2716
2717         kfree(sparity);
2718 }
2719
2720 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2721 {
2722         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2723                                                     work);
2724         struct scrub_ctx *sctx = sparity->sctx;
2725
2726         scrub_free_parity(sparity);
2727         scrub_pending_bio_dec(sctx);
2728 }
2729
2730 static void scrub_parity_bio_endio(struct bio *bio)
2731 {
2732         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2733         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2734
2735         if (bio->bi_status)
2736                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2737                           sparity->nsectors);
2738
2739         bio_put(bio);
2740
2741         btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
2742                         scrub_parity_bio_endio_worker, NULL, NULL);
2743         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2744 }
2745
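/*
 * Called when the last reference to a scrub_parity is dropped: rebuild
 * and check the parity for all stripe sectors whose data was scrubbed
 * without errors, using a scrub rbio from the RAID56 code.
 */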
2746 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2747 {
2748         struct scrub_ctx *sctx = sparity->sctx;
2749         struct btrfs_fs_info *fs_info = sctx->fs_info;
2750         struct bio *bio;
2751         struct btrfs_raid_bio *rbio;
2752         struct btrfs_bio *bbio = NULL;
2753         u64 length;
2754         int ret;
2755
2756         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2757                            sparity->nsectors))
2758                 goto out;
2759
2760         length = sparity->logic_end - sparity->logic_start;
2761
2762         btrfs_bio_counter_inc_blocked(fs_info);
2763         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2764                                &length, &bbio);
2765         if (ret || !bbio || !bbio->raid_map)
2766                 goto bbio_out;
2767
2768         bio = btrfs_io_bio_alloc(0);
2769         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2770         bio->bi_private = sparity;
2771         bio->bi_end_io = scrub_parity_bio_endio;
2772
2773         rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
2774                                               length, sparity->scrub_dev,
2775                                               sparity->dbitmap,
2776                                               sparity->nsectors);
2777         if (!rbio)
2778                 goto rbio_out;
2779
2780         scrub_pending_bio_inc(sctx);
2781         raid56_parity_submit_scrub_rbio(rbio);
2782         return;
2783
2784 rbio_out:
2785         bio_put(bio);
2786 bbio_out:
2787         btrfs_bio_counter_dec(fs_info);
2788         btrfs_put_bbio(bbio);
2789         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2790                   sparity->nsectors);
2791         spin_lock(&sctx->stat_lock);
2792         sctx->stat.malloc_errors++;
2793         spin_unlock(&sctx->stat_lock);
2794 out:
2795         scrub_free_parity(sparity);
2796 }
2797
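/*
 * Bytes needed for one sector bitmap covering @nsectors sectors, rounded up
 * to whole longs. As a rough example (assuming 64 KiB stripes, 4 KiB sectors
 * and 64-bit longs), nsectors is 16, so one 8-byte long per bitmap suffices.
 */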
2798 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2799 {
2800         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2801 }
2802
2803 static void scrub_parity_get(struct scrub_parity *sparity)
2804 {
2805         refcount_inc(&sparity->refs);
2806 }
2807
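/* Dropping the last reference kicks off the actual parity check and repair. */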
2808 static void scrub_parity_put(struct scrub_parity *sparity)
2809 {
2810         if (!refcount_dec_and_test(&sparity->refs))
2811                 return;
2812
2813         scrub_parity_check_and_repair(sparity);
2814 }
2815
2816 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2817                                                   struct map_lookup *map,
2818                                                   struct btrfs_device *sdev,
2819                                                   struct btrfs_path *path,
2820                                                   u64 logic_start,
2821                                                   u64 logic_end)
2822 {
2823         struct btrfs_fs_info *fs_info = sctx->fs_info;
2824         struct btrfs_root *root = fs_info->extent_root;
2825         struct btrfs_root *csum_root = fs_info->csum_root;
2826         struct btrfs_extent_item *extent;
2827         struct btrfs_bio *bbio = NULL;
2828         u64 flags;
2829         int ret;
2830         int slot;
2831         struct extent_buffer *l;
2832         struct btrfs_key key;
2833         u64 generation;
2834         u64 extent_logical;
2835         u64 extent_physical;
2836         u64 extent_len;
2837         u64 mapped_length;
2838         struct btrfs_device *extent_dev;
2839         struct scrub_parity *sparity;
2840         int nsectors;
2841         int bitmap_len;
2842         int extent_mirror_num;
2843         int stop_loop = 0;
2844
2845         nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
2846         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2847         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2848                           GFP_NOFS);
2849         if (!sparity) {
2850                 spin_lock(&sctx->stat_lock);
2851                 sctx->stat.malloc_errors++;
2852                 spin_unlock(&sctx->stat_lock);
2853                 return -ENOMEM;
2854         }
2855
2856         sparity->stripe_len = map->stripe_len;
2857         sparity->nsectors = nsectors;
2858         sparity->sctx = sctx;
2859         sparity->scrub_dev = sdev;
2860         sparity->logic_start = logic_start;
2861         sparity->logic_end = logic_end;
2862         refcount_set(&sparity->refs, 1);
2863         INIT_LIST_HEAD(&sparity->spages);
2864         sparity->dbitmap = sparity->bitmap;
2865         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2866
2867         ret = 0;
2868         while (logic_start < logic_end) {
2869                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2870                         key.type = BTRFS_METADATA_ITEM_KEY;
2871                 else
2872                         key.type = BTRFS_EXTENT_ITEM_KEY;
2873                 key.objectid = logic_start;
2874                 key.offset = (u64)-1;
2875
2876                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2877                 if (ret < 0)
2878                         goto out;
2879
2880                 if (ret > 0) {
2881                         ret = btrfs_previous_extent_item(root, path, 0);
2882                         if (ret < 0)
2883                                 goto out;
2884                         if (ret > 0) {
2885                                 btrfs_release_path(path);
2886                                 ret = btrfs_search_slot(NULL, root, &key,
2887                                                         path, 0, 0);
2888                                 if (ret < 0)
2889                                         goto out;
2890                         }
2891                 }
2892
2893                 stop_loop = 0;
2894                 while (1) {
2895                         u64 bytes;
2896
2897                         l = path->nodes[0];
2898                         slot = path->slots[0];
2899                         if (slot >= btrfs_header_nritems(l)) {
2900                                 ret = btrfs_next_leaf(root, path);
2901                                 if (ret == 0)
2902                                         continue;
2903                                 if (ret < 0)
2904                                         goto out;
2905
2906                                 stop_loop = 1;
2907                                 break;
2908                         }
2909                         btrfs_item_key_to_cpu(l, &key, slot);
2910
2911                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2912                             key.type != BTRFS_METADATA_ITEM_KEY)
2913                                 goto next;
2914
2915                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2916                                 bytes = fs_info->nodesize;
2917                         else
2918                                 bytes = key.offset;
2919
2920                         if (key.objectid + bytes <= logic_start)
2921                                 goto next;
2922
2923                         if (key.objectid >= logic_end) {
2924                                 stop_loop = 1;
2925                                 break;
2926                         }
2927
2928                         while (key.objectid >= logic_start + map->stripe_len)
2929                                 logic_start += map->stripe_len;
2930
2931                         extent = btrfs_item_ptr(l, slot,
2932                                                 struct btrfs_extent_item);
2933                         flags = btrfs_extent_flags(l, extent);
2934                         generation = btrfs_extent_generation(l, extent);
2935
2936                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
2937                             (key.objectid < logic_start ||
2938                              key.objectid + bytes >
2939                              logic_start + map->stripe_len)) {
2940                                 btrfs_err(fs_info,
2941                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2942                                           key.objectid, logic_start);
2943                                 spin_lock(&sctx->stat_lock);
2944                                 sctx->stat.uncorrectable_errors++;
2945                                 spin_unlock(&sctx->stat_lock);
2946                                 goto next;
2947                         }
2948 again:
2949                         extent_logical = key.objectid;
2950                         extent_len = bytes;
2951
2952                         if (extent_logical < logic_start) {
2953                                 extent_len -= logic_start - extent_logical;
2954                                 extent_logical = logic_start;
2955                         }
2956
2957                         if (extent_logical + extent_len >
2958                             logic_start + map->stripe_len)
2959                                 extent_len = logic_start + map->stripe_len -
2960                                              extent_logical;
2961
2962                         scrub_parity_mark_sectors_data(sparity, extent_logical,
2963                                                        extent_len);
2964
2965                         mapped_length = extent_len;
2966                         bbio = NULL;
2967                         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
2968                                         extent_logical, &mapped_length, &bbio,
2969                                         0);
2970                         if (!ret) {
2971                                 if (!bbio || mapped_length < extent_len)
2972                                         ret = -EIO;
2973                         }
2974                         if (ret) {
2975                                 btrfs_put_bbio(bbio);
2976                                 goto out;
2977                         }
2978                         extent_physical = bbio->stripes[0].physical;
2979                         extent_mirror_num = bbio->mirror_num;
2980                         extent_dev = bbio->stripes[0].dev;
2981                         btrfs_put_bbio(bbio);
2982
2983                         ret = btrfs_lookup_csums_range(csum_root,
2984                                                 extent_logical,
2985                                                 extent_logical + extent_len - 1,
2986                                                 &sctx->csum_list, 1);
2987                         if (ret)
2988                                 goto out;
2989
2990                         ret = scrub_extent_for_parity(sparity, extent_logical,
2991                                                       extent_len,
2992                                                       extent_physical,
2993                                                       extent_dev, flags,
2994                                                       generation,
2995                                                       extent_mirror_num);
2996
2997                         scrub_free_csums(sctx);
2998
2999                         if (ret)
3000                                 goto out;
3001
3002                         if (extent_logical + extent_len <
3003                             key.objectid + bytes) {
3004                                 logic_start += map->stripe_len;
3005
3006                                 if (logic_start >= logic_end) {
3007                                         stop_loop = 1;
3008                                         break;
3009                                 }
3010
3011                                 if (logic_start < key.objectid + bytes) {
3012                                         cond_resched();
3013                                         goto again;
3014                                 }
3015                         }
3016 next:
3017                         path->slots[0]++;
3018                 }
3019
3020                 btrfs_release_path(path);
3021
3022                 if (stop_loop)
3023                         break;
3024
3025                 logic_start += map->stripe_len;
3026         }
3027 out:
3028         if (ret < 0)
3029                 scrub_parity_mark_sectors_error(sparity, logic_start,
3030                                                 logic_end - logic_start);
3031         scrub_parity_put(sparity);
3032         scrub_submit(sctx);
3033         mutex_lock(&sctx->wr_lock);
3034         scrub_wr_submit(sctx);
3035         mutex_unlock(&sctx->wr_lock);
3036
3037         btrfs_release_path(path);
3038         return ret < 0 ? ret : 0;
3039 }
3040
3041 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3042                                            struct map_lookup *map,
3043                                            struct btrfs_device *scrub_dev,
3044                                            int num, u64 base, u64 length)
3045 {
3046         struct btrfs_path *path, *ppath;
3047         struct btrfs_fs_info *fs_info = sctx->fs_info;
3048         struct btrfs_root *root = fs_info->extent_root;
3049         struct btrfs_root *csum_root = fs_info->csum_root;
3050         struct btrfs_extent_item *extent;
3051         struct blk_plug plug;
3052         u64 flags;
3053         int ret;
3054         int slot;
3055         u64 nstripes;
3056         struct extent_buffer *l;
3057         u64 physical;
3058         u64 logical;
3059         u64 logic_end;
3060         u64 physical_end;
3061         u64 generation;
3062         int mirror_num;
3063         struct reada_control *reada1;
3064         struct reada_control *reada2;
3065         struct btrfs_key key;
3066         struct btrfs_key key_end;
3067         u64 increment = map->stripe_len;
3068         u64 offset;
3069         u64 extent_logical;
3070         u64 extent_physical;
3071         u64 extent_len;
3072         u64 stripe_logical;
3073         u64 stripe_end;
3074         struct btrfs_device *extent_dev;
3075         int extent_mirror_num;
3076         int stop_loop = 0;
3077
3078         physical = map->stripes[num].physical;
3079         offset = 0;
3080         nstripes = div64_u64(length, map->stripe_len);
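        /*
         * Work out, for stripe @num of this chunk, where scrubbing starts
         * relative to the chunk (@offset), the logical distance between
         * consecutive stripes owned by this device (@increment) and which
         * mirror copy it represents (@mirror_num), depending on the RAID
         * profile.
         */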
3081         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3082                 offset = map->stripe_len * num;
3083                 increment = map->stripe_len * map->num_stripes;
3084                 mirror_num = 1;
3085         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3086                 int factor = map->num_stripes / map->sub_stripes;
3087                 offset = map->stripe_len * (num / map->sub_stripes);
3088                 increment = map->stripe_len * factor;
3089                 mirror_num = num % map->sub_stripes + 1;
3090         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3091                 increment = map->stripe_len;
3092                 mirror_num = num % map->num_stripes + 1;
3093         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3094                 increment = map->stripe_len;
3095                 mirror_num = num % map->num_stripes + 1;
3096         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3097                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3098                 increment = map->stripe_len * nr_data_stripes(map);
3099                 mirror_num = 1;
3100         } else {
3101                 increment = map->stripe_len;
3102                 mirror_num = 1;
3103         }
3104
3105         path = btrfs_alloc_path();
3106         if (!path)
3107                 return -ENOMEM;
3108
3109         ppath = btrfs_alloc_path();
3110         if (!ppath) {
3111                 btrfs_free_path(path);
3112                 return -ENOMEM;
3113         }
3114
3115         /*
3116          * Work on the commit root. The related disk blocks are static as
3117          * long as COW is applied. This means it is safe to rewrite
3118          * them to repair disk errors without any race conditions.
3119          */
3120         path->search_commit_root = 1;
3121         path->skip_locking = 1;
3122
3123         ppath->search_commit_root = 1;
3124         ppath->skip_locking = 1;
3125         /*
3126          * trigger the readahead for the extent tree and csum tree and wait
3127          * for completion. During readahead, the scrub is officially paused
3128          * so that it does not hold off transaction commits
3129          */
3130         logical = base + offset;
3131         physical_end = physical + nstripes * map->stripe_len;
3132         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3133                 get_raid56_logic_offset(physical_end, num,
3134                                         map, &logic_end, NULL);
3135                 logic_end += base;
3136         } else {
3137                 logic_end = logical + increment * nstripes;
3138         }
3139         wait_event(sctx->list_wait,
3140                    atomic_read(&sctx->bios_in_flight) == 0);
3141         scrub_blocked_if_needed(fs_info);
3142
3143         /* FIXME it might be better to start readahead at commit root */
3144         key.objectid = logical;
3145         key.type = BTRFS_EXTENT_ITEM_KEY;
3146         key.offset = (u64)0;
3147         key_end.objectid = logic_end;
3148         key_end.type = BTRFS_METADATA_ITEM_KEY;
3149         key_end.offset = (u64)-1;
3150         reada1 = btrfs_reada_add(root, &key, &key_end);
3151
3152         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3153         key.type = BTRFS_EXTENT_CSUM_KEY;
3154         key.offset = logical;
3155         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3156         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3157         key_end.offset = logic_end;
3158         reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3159
3160         if (!IS_ERR(reada1))
3161                 btrfs_reada_wait(reada1);
3162         if (!IS_ERR(reada2))
3163                 btrfs_reada_wait(reada2);
3164
3165
3166         /*
3167          * collect all data csums for the stripe to avoid seeking during
3168          * the scrub. This might currently (crc32) end up being about 1MB
3169          */
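        /*
         * Rough arithmetic (assuming 4-byte crc32 checksums and 4 KiB
         * sectors): a 1 GiB data chunk has 262144 sectors, which is about
         * 1 MiB of checksums.
         */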
3170         blk_start_plug(&plug);
3171
3172         /*
3173          * now find all extents for each stripe and scrub them
3174          */
3175         ret = 0;
3176         while (physical < physical_end) {
3177                 /*
3178                  * canceled?
3179                  */
3180                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3181                     atomic_read(&sctx->cancel_req)) {
3182                         ret = -ECANCELED;
3183                         goto out;
3184                 }
3185                 /*
3186                  * check to see if we have to pause
3187                  */
3188                 if (atomic_read(&fs_info->scrub_pause_req)) {
3189                         /* push queued extents */
3190                         sctx->flush_all_writes = true;
3191                         scrub_submit(sctx);
3192                         mutex_lock(&sctx->wr_lock);
3193                         scrub_wr_submit(sctx);
3194                         mutex_unlock(&sctx->wr_lock);
3195                         wait_event(sctx->list_wait,
3196                                    atomic_read(&sctx->bios_in_flight) == 0);
3197                         sctx->flush_all_writes = false;
3198                         scrub_blocked_if_needed(fs_info);
3199                 }
3200
3201                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3202                         ret = get_raid56_logic_offset(physical, num, map,
3203                                                       &logical,
3204                                                       &stripe_logical);
3205                         logical += base;
3206                         if (ret) {
3207                                 /* it is a parity stripe */
3208                                 stripe_logical += base;
3209                                 stripe_end = stripe_logical + increment;
3210                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3211                                                           ppath, stripe_logical,
3212                                                           stripe_end);
3213                                 if (ret)
3214                                         goto out;
3215                                 goto skip;
3216                         }
3217                 }
3218
3219                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3220                         key.type = BTRFS_METADATA_ITEM_KEY;
3221                 else
3222                         key.type = BTRFS_EXTENT_ITEM_KEY;
3223                 key.objectid = logical;
3224                 key.offset = (u64)-1;
3225
3226                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3227                 if (ret < 0)
3228                         goto out;
3229
3230                 if (ret > 0) {
3231                         ret = btrfs_previous_extent_item(root, path, 0);
3232                         if (ret < 0)
3233                                 goto out;
3234                         if (ret > 0) {
3235                                 /* there's no smaller item, so stick with the
3236                                  * larger one */
3237                                 btrfs_release_path(path);
3238                                 ret = btrfs_search_slot(NULL, root, &key,
3239                                                         path, 0, 0);
3240                                 if (ret < 0)
3241                                         goto out;
3242                         }
3243                 }
3244
3245                 stop_loop = 0;
3246                 while (1) {
3247                         u64 bytes;
3248
3249                         l = path->nodes[0];
3250                         slot = path->slots[0];
3251                         if (slot >= btrfs_header_nritems(l)) {
3252                                 ret = btrfs_next_leaf(root, path);
3253                                 if (ret == 0)
3254                                         continue;
3255                                 if (ret < 0)
3256                                         goto out;
3257
3258                                 stop_loop = 1;
3259                                 break;
3260                         }
3261                         btrfs_item_key_to_cpu(l, &key, slot);
3262
3263                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3264                             key.type != BTRFS_METADATA_ITEM_KEY)
3265                                 goto next;
3266
3267                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3268                                 bytes = fs_info->nodesize;
3269                         else
3270                                 bytes = key.offset;
3271
3272                         if (key.objectid + bytes <= logical)
3273                                 goto next;
3274
3275                         if (key.objectid >= logical + map->stripe_len) {
3276                                 /* out of this device extent */
3277                                 if (key.objectid >= logic_end)
3278                                         stop_loop = 1;
3279                                 break;
3280                         }
3281
3282                         extent = btrfs_item_ptr(l, slot,
3283                                                 struct btrfs_extent_item);
3284                         flags = btrfs_extent_flags(l, extent);
3285                         generation = btrfs_extent_generation(l, extent);
3286
3287                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3288                             (key.objectid < logical ||
3289                              key.objectid + bytes >
3290                              logical + map->stripe_len)) {
3291                                 btrfs_err(fs_info,
3292                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3293                                        key.objectid, logical);
3294                                 spin_lock(&sctx->stat_lock);
3295                                 sctx->stat.uncorrectable_errors++;
3296                                 spin_unlock(&sctx->stat_lock);
3297                                 goto next;
3298                         }
3299
3300 again:
3301                         extent_logical = key.objectid;
3302                         extent_len = bytes;
3303
3304                         /*
3305                          * trim extent to this stripe
3306                          */
3307                         if (extent_logical < logical) {
3308                                 extent_len -= logical - extent_logical;
3309                                 extent_logical = logical;
3310                         }
3311                         if (extent_logical + extent_len >
3312                             logical + map->stripe_len) {
3313                                 extent_len = logical + map->stripe_len -
3314                                              extent_logical;
3315                         }
3316
3317                         extent_physical = extent_logical - logical + physical;
3318                         extent_dev = scrub_dev;
3319                         extent_mirror_num = mirror_num;
3320                         if (sctx->is_dev_replace)
3321                                 scrub_remap_extent(fs_info, extent_logical,
3322                                                    extent_len, &extent_physical,
3323                                                    &extent_dev,
3324                                                    &extent_mirror_num);
3325
3326                         ret = btrfs_lookup_csums_range(csum_root,
3327                                                        extent_logical,
3328                                                        extent_logical +
3329                                                        extent_len - 1,
3330                                                        &sctx->csum_list, 1);
3331                         if (ret)
3332                                 goto out;
3333
3334                         ret = scrub_extent(sctx, map, extent_logical, extent_len,
3335                                            extent_physical, extent_dev, flags,
3336                                            generation, extent_mirror_num,
3337                                            extent_logical - logical + physical);
3338
3339                         scrub_free_csums(sctx);
3340
3341                         if (ret)
3342                                 goto out;
3343
3344                         if (extent_logical + extent_len <
3345                             key.objectid + bytes) {
3346                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3347                                         /*
3348                                          * loop until we find the next data stripe
3349                                          * or we have finished all stripes.
3350                                          */
3351 loop:
3352                                         physical += map->stripe_len;
3353                                         ret = get_raid56_logic_offset(physical,
3354                                                         num, map, &logical,
3355                                                         &stripe_logical);
3356                                         logical += base;
3357
3358                                         if (ret && physical < physical_end) {
3359                                                 stripe_logical += base;
3360                                                 stripe_end = stripe_logical +
3361                                                                 increment;
3362                                                 ret = scrub_raid56_parity(sctx,
3363                                                         map, scrub_dev, ppath,
3364                                                         stripe_logical,
3365                                                         stripe_end);
3366                                                 if (ret)
3367                                                         goto out;
3368                                                 goto loop;
3369                                         }
3370                                 } else {
3371                                         physical += map->stripe_len;
3372                                         logical += increment;
3373                                 }
3374                                 if (logical < key.objectid + bytes) {
3375                                         cond_resched();
3376                                         goto again;
3377                                 }
3378
3379                                 if (physical >= physical_end) {
3380                                         stop_loop = 1;
3381                                         break;
3382                                 }
3383                         }
3384 next:
3385                         path->slots[0]++;
3386                 }
3387                 btrfs_release_path(path);
3388 skip:
3389                 logical += increment;
3390                 physical += map->stripe_len;
3391                 spin_lock(&sctx->stat_lock);
3392                 if (stop_loop)
3393                         sctx->stat.last_physical = map->stripes[num].physical +
3394                                                    length;
3395                 else
3396                         sctx->stat.last_physical = physical;
3397                 spin_unlock(&sctx->stat_lock);
3398                 if (stop_loop)
3399                         break;
3400         }
3401 out:
3402         /* push queued extents */
3403         scrub_submit(sctx);
3404         mutex_lock(&sctx->wr_lock);
3405         scrub_wr_submit(sctx);
3406         mutex_unlock(&sctx->wr_lock);
3407
3408         blk_finish_plug(&plug);
3409         btrfs_free_path(path);
3410         btrfs_free_path(ppath);
3411         return ret < 0 ? ret : 0;
3412 }
3413
3414 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3415                                           struct btrfs_device *scrub_dev,
3416                                           u64 chunk_offset, u64 length,
3417                                           u64 dev_offset,
3418                                           struct btrfs_block_group_cache *cache)
3419 {
3420         struct btrfs_fs_info *fs_info = sctx->fs_info;
3421         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3422         struct map_lookup *map;
3423         struct extent_map *em;
3424         int i;
3425         int ret = 0;
3426
3427         read_lock(&map_tree->map_tree.lock);
3428         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3429         read_unlock(&map_tree->map_tree.lock);
3430
3431         if (!em) {
3432                 /*
3433                  * Might have been an unused block group deleted by the cleaner
3434                  * kthread or relocation.
3435                  */
3436                 spin_lock(&cache->lock);
3437                 if (!cache->removed)
3438                         ret = -EINVAL;
3439                 spin_unlock(&cache->lock);
3440
3441                 return ret;
3442         }
3443
3444         map = em->map_lookup;
3445         if (em->start != chunk_offset)
3446                 goto out;
3447
3448         if (em->len < length)
3449                 goto out;
3450
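        /*
         * Find the stripe(s) of this chunk that live on @scrub_dev at
         * @dev_offset and scrub each of them.
         */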
3451         for (i = 0; i < map->num_stripes; ++i) {
3452                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3453                     map->stripes[i].physical == dev_offset) {
3454                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3455                                            chunk_offset, length);
3456                         if (ret)
3457                                 goto out;
3458                 }
3459         }
3460 out:
3461         free_extent_map(em);
3462
3463         return ret;
3464 }
3465
3466 static noinline_for_stack
3467 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3468                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3469 {
3470         struct btrfs_dev_extent *dev_extent = NULL;
3471         struct btrfs_path *path;
3472         struct btrfs_fs_info *fs_info = sctx->fs_info;
3473         struct btrfs_root *root = fs_info->dev_root;
3474         u64 length;
3475         u64 chunk_offset;
3476         int ret = 0;
3477         int ro_set;
3478         int slot;
3479         struct extent_buffer *l;
3480         struct btrfs_key key;
3481         struct btrfs_key found_key;
3482         struct btrfs_block_group_cache *cache;
3483         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3484
3485         path = btrfs_alloc_path();
3486         if (!path)
3487                 return -ENOMEM;
3488
3489         path->reada = READA_FORWARD;
3490         path->search_commit_root = 1;
3491         path->skip_locking = 1;
3492
3493         key.objectid = scrub_dev->devid;
3494         key.offset = 0ull;
3495         key.type = BTRFS_DEV_EXTENT_KEY;
3496
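        /*
         * Walk all DEV_EXTENT items of @scrub_dev in the device tree and
         * scrub the chunk backing each extent that overlaps [start, end).
         */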
3497         while (1) {
3498                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3499                 if (ret < 0)
3500                         break;
3501                 if (ret > 0) {
3502                         if (path->slots[0] >=
3503                             btrfs_header_nritems(path->nodes[0])) {
3504                                 ret = btrfs_next_leaf(root, path);
3505                                 if (ret < 0)
3506                                         break;
3507                                 if (ret > 0) {
3508                                         ret = 0;
3509                                         break;
3510                                 }
3511                         } else {
3512                                 ret = 0;
3513                         }
3514                 }
3515
3516                 l = path->nodes[0];
3517                 slot = path->slots[0];
3518
3519                 btrfs_item_key_to_cpu(l, &found_key, slot);
3520
3521                 if (found_key.objectid != scrub_dev->devid)
3522                         break;
3523
3524                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3525                         break;
3526
3527                 if (found_key.offset >= end)
3528                         break;
3529
3530                 if (found_key.offset < key.offset)
3531                         break;
3532
3533                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3534                 length = btrfs_dev_extent_length(l, dev_extent);
3535
3536                 if (found_key.offset + length <= start)
3537                         goto skip;
3538
3539                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3540
3541                 /*
3542                  * get a reference on the corresponding block group to prevent
3543                  * the chunk from going away while we scrub it
3544                  */
3545                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3546
3547                 /* some chunks have been removed but not yet committed to
3548                  * disk, continue scrubbing */
3549                 if (!cache)
3550                         goto skip;
3551
3552                 /*
3553                  * we need to call btrfs_inc_block_group_ro() with scrubs_paused
3554                  * to avoid a deadlock caused by:
3555                  * btrfs_inc_block_group_ro()
3556                  * -> btrfs_wait_for_commit()
3557                  * -> btrfs_commit_transaction()
3558                  * -> btrfs_scrub_pause()
3559                  */
3560                 scrub_pause_on(fs_info);
3561                 ret = btrfs_inc_block_group_ro(cache);
3562                 if (!ret && sctx->is_dev_replace) {
3563                         /*
3564                          * If we are doing a device replace, wait for any tasks
3565                          * that started delalloc right before we set the block
3566                          * group to RO mode, as they might have just allocated
3567                          * an extent from it or decided they could do a nocow
3568                          * write. And if any such tasks did that, wait for their
3569                          * ordered extents to complete and then commit the
3570                          * current transaction, so that we can later see the new
3571                          * extent items in the extent tree - the ordered extents
3572                          * create delayed data references (for cow writes) when
3573                          * they complete, which will be run and insert the
3574                          * corresponding extent items into the extent tree when
3575                          * we commit the transaction they used when running
3576                          * inode.c:btrfs_finish_ordered_io(). We later use
3577                          * the commit root of the extent tree to find extents
3578                          * to copy from the srcdev into the tgtdev, and we don't
3579                          * want to miss any new extents.
3580                          */
3581                         btrfs_wait_block_group_reservations(cache);
3582                         btrfs_wait_nocow_writers(cache);
3583                         ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
3584                                                        cache->key.objectid,
3585                                                        cache->key.offset);
3586                         if (ret > 0) {
3587                                 struct btrfs_trans_handle *trans;
3588
3589                                 trans = btrfs_join_transaction(root);
3590                                 if (IS_ERR(trans))
3591                                         ret = PTR_ERR(trans);
3592                                 else
3593                                         ret = btrfs_commit_transaction(trans);
3594                                 if (ret) {
3595                                         scrub_pause_off(fs_info);
3596                                         btrfs_put_block_group(cache);
3597                                         break;
3598                                 }
3599                         }
3600                 }
3601                 scrub_pause_off(fs_info);
3602
3603                 if (ret == 0) {
3604                         ro_set = 1;
3605                 } else if (ret == -ENOSPC) {
3606                         /*
3607                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3608                          * fails to create a new chunk for metadata.
3609                          * This is not a problem for scrub/replace, because
3610                          * metadata is always COWed, and our scrub is paused
3611                          * during transaction commits.
3612                          */
3613                         ro_set = 0;
3614                 } else {
3615                         btrfs_warn(fs_info,
3616                                    "failed setting block group ro: %d", ret);
3617                         btrfs_put_block_group(cache);
3618                         break;
3619                 }
3620
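                /*
                 * Record the device extent about to be scrubbed as the
                 * current dev-replace cursor window, so that progress can be
                 * written back to the dev-replace item.
                 */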
3621                 btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3622                 dev_replace->cursor_right = found_key.offset + length;
3623                 dev_replace->cursor_left = found_key.offset;
3624                 dev_replace->item_needs_writeback = 1;
3625                 btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3626                 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3627                                   found_key.offset, cache);
3628
3629                 /*
3630                  * Flush and submit all pending read and write bios, and
3631                  * afterwards wait for them.
3632                  * Note that in the dev replace case, a read request causes
3633                  * write requests that are submitted in the read completion
3634                  * worker. Therefore in the current situation, it is required
3635                  * that all write requests are flushed, so that all read and
3636                  * write requests are really completed when bios_in_flight
3637                  * changes to 0.
3638                  */
3639                 sctx->flush_all_writes = true;
3640                 scrub_submit(sctx);
3641                 mutex_lock(&sctx->wr_lock);
3642                 scrub_wr_submit(sctx);
3643                 mutex_unlock(&sctx->wr_lock);
3644
3645                 wait_event(sctx->list_wait,
3646                            atomic_read(&sctx->bios_in_flight) == 0);
3647
3648                 scrub_pause_on(fs_info);
3649
3650                 /*
3651                  * This must be called before we decrease @scrub_paused.
3652                  * Make sure we don't block transaction commits while
3653                  * we are waiting for pending workers to finish.
3654                  */
3655                 wait_event(sctx->list_wait,
3656                            atomic_read(&sctx->workers_pending) == 0);
3657                 sctx->flush_all_writes = false;
3658
3659                 scrub_pause_off(fs_info);
3660
3661                 btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3662                 dev_replace->cursor_left = dev_replace->cursor_right;
3663                 dev_replace->item_needs_writeback = 1;
3664                 btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3665
3666                 if (ro_set)
3667                         btrfs_dec_block_group_ro(cache);
3668
3669                 /*
3670                  * We might have prevented the cleaner kthread from deleting
3671                  * this block group if it was already unused because we raced
3672                  * and set it to RO mode first. So add it back to the unused
3673                  * list, otherwise it might not ever be deleted unless a manual
3674                  * balance is triggered or it becomes used and unused again.
3675                  */
3676                 spin_lock(&cache->lock);
3677                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3678                     btrfs_block_group_used(&cache->item) == 0) {
3679                         spin_unlock(&cache->lock);
3680                         btrfs_mark_bg_unused(cache);
3681                 } else {
3682                         spin_unlock(&cache->lock);
3683                 }
3684
3685                 btrfs_put_block_group(cache);
3686                 if (ret)
3687                         break;
3688                 if (sctx->is_dev_replace &&
3689                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3690                         ret = -EIO;
3691                         break;
3692                 }
3693                 if (sctx->stat.malloc_errors > 0) {
3694                         ret = -ENOMEM;
3695                         break;
3696                 }
3697 skip:
3698                 key.offset = found_key.offset + length;
3699                 btrfs_release_path(path);
3700         }
3701
3702         btrfs_free_path(path);
3703
3704         return ret;
3705 }
3706
3707 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3708                                            struct btrfs_device *scrub_dev)
3709 {
3710         int     i;
3711         u64     bytenr;
3712         u64     gen;
3713         int     ret;
3714         struct btrfs_fs_info *fs_info = sctx->fs_info;
3715
3716         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3717                 return -EIO;
3718
3719         /* Seed devices of a new filesystem have their own generation. */
3720         if (scrub_dev->fs_devices != fs_info->fs_devices)
3721                 gen = scrub_dev->generation;
3722         else
3723                 gen = fs_info->last_trans_committed;
3724
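        /* Scrub each superblock copy that fits within the device's committed size. */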
3725         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3726                 bytenr = btrfs_sb_offset(i);
3727                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3728                     scrub_dev->commit_total_bytes)
3729                         break;
3730
3731                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3732                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3733                                   NULL, 1, bytenr);
3734                 if (ret)
3735                         return ret;
3736         }
3737         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3738
3739         return 0;
3740 }
3741
3742 /*
3743  * Get a reference on fs_info->scrub_workers; start the workers if necessary.
3744  */
3745 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3746                                                 int is_dev_replace)
3747 {
3748         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3749         int max_active = fs_info->thread_pool_size;
3750
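        /*
         * A dev-replace limits the scrub workqueue to a single active
         * worker; a regular scrub may use up to thread_pool_size workers.
         */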
3751         if (fs_info->scrub_workers_refcnt == 0) {
3752                 fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
3753                                 flags, is_dev_replace ? 1 : max_active, 4);
3754                 if (!fs_info->scrub_workers)
3755                         goto fail_scrub_workers;
3756
3757                 fs_info->scrub_wr_completion_workers =
3758                         btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3759                                               max_active, 2);
3760                 if (!fs_info->scrub_wr_completion_workers)
3761                         goto fail_scrub_wr_completion_workers;
3762
3763                 fs_info->scrub_parity_workers =
3764                         btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3765                                               max_active, 2);
3766                 if (!fs_info->scrub_parity_workers)
3767                         goto fail_scrub_parity_workers;
3768         }
3769         ++fs_info->scrub_workers_refcnt;
3770         return 0;
3771
3772 fail_scrub_parity_workers:
3773         btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3774 fail_scrub_wr_completion_workers:
3775         btrfs_destroy_workqueue(fs_info->scrub_workers);
3776 fail_scrub_workers:
3777         return -ENOMEM;
3778 }
3779
3780 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3781                     u64 end, struct btrfs_scrub_progress *progress,
3782                     int readonly, int is_dev_replace)
3783 {
3784         struct scrub_ctx *sctx;
3785         int ret;
3786         struct btrfs_device *dev;
3787         unsigned int nofs_flag;
3788         struct btrfs_workqueue *scrub_workers = NULL;
3789         struct btrfs_workqueue *scrub_wr_comp = NULL;
3790         struct btrfs_workqueue *scrub_parity = NULL;
3791
3792         if (btrfs_fs_closing(fs_info))
3793                 return -EINVAL;
3794
3795         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
3796                 /*
3797                  * In this case scrub is unable to calculate the checksum
3798                  * due to the way scrub is implemented. Do not handle this
3799                  * situation at all because it won't ever happen.
3800                  */
3801                 btrfs_err(fs_info,
3802                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3803                        fs_info->nodesize,
3804                        BTRFS_STRIPE_LEN);
3805                 return -EINVAL;
3806         }
3807
3808         if (fs_info->sectorsize != PAGE_SIZE) {
3809                 /* not supported for data w/o checksums */
3810                 btrfs_err_rl(fs_info,
3811                            "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
3812                        fs_info->sectorsize, PAGE_SIZE);
3813                 return -EINVAL;
3814         }
3815
3816         if (fs_info->nodesize >
3817             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3818             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3819                 /*
3820                  * would exhaust the array bounds of pagev member in
3821                  * struct scrub_block
3822                  */
3823                 btrfs_err(fs_info,
3824                           "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3825                        fs_info->nodesize,
3826                        SCRUB_MAX_PAGES_PER_BLOCK,
3827                        fs_info->sectorsize,
3828                        SCRUB_MAX_PAGES_PER_BLOCK);
3829                 return -EINVAL;
3830         }
3831
3832         /* Allocate outside of device_list_mutex */
3833         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
3834         if (IS_ERR(sctx))
3835                 return PTR_ERR(sctx);
3836
3837         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3838         dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
3839         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
3840                      !is_dev_replace)) {
3841                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3842                 ret = -ENODEV;
3843                 goto out_free_ctx;
3844         }
3845
3846         if (!is_dev_replace && !readonly &&
3847             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
3848                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3849                 btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
3850                                 rcu_str_deref(dev->name));
3851                 ret = -EROFS;
3852                 goto out_free_ctx;
3853         }
3854
3855         mutex_lock(&fs_info->scrub_lock);
3856         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3857             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
3858                 mutex_unlock(&fs_info->scrub_lock);
3859                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3860                 ret = -EIO;
3861                 goto out_free_ctx;
3862         }
3863
3864         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
3865         if (dev->scrub_ctx ||
3866             (!is_dev_replace &&
3867              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3868                 btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
3869                 mutex_unlock(&fs_info->scrub_lock);
3870                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3871                 ret = -EINPROGRESS;
3872                 goto out_free_ctx;
3873         }
3874         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
3875
3876         ret = scrub_workers_get(fs_info, is_dev_replace);
3877         if (ret) {
3878                 mutex_unlock(&fs_info->scrub_lock);
3879                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3880                 goto out_free_ctx;
3881         }
3882
3883         sctx->readonly = readonly;
3884         dev->scrub_ctx = sctx;
3885         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3886
3887         /*
3888          * By checking @scrub_pause_req here, we can avoid a
3889          * race between committing a transaction and scrubbing.
3890          */
3891         __scrub_blocked_if_needed(fs_info);
3892         atomic_inc(&fs_info->scrubs_running);
3893         mutex_unlock(&fs_info->scrub_lock);
3894
3895         /*
3896          * In order to avoid deadlock with reclaim when there is a transaction
3897          * trying to pause scrub, make sure we use GFP_NOFS for all the
3898          * allocations done at scrub_pages() and scrub_pages_for_parity()
3899          * invoked by our callees. The pausing request is done when the
3900          * transaction commit starts, and it blocks the transaction until scrub
3901          * is paused (done at specific points in scrub_stripe() or right
3902          * above, before incrementing fs_info->scrubs_running).
3903          */
3904         nofs_flag = memalloc_nofs_save();
3905         if (!is_dev_replace) {
3906                 /*
3907                  * By holding the device list mutex, we avoid racing with
3908                  * superblock writes kicked off by a log tree sync.
3909                  */
3910                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3911                 ret = scrub_supers(sctx, dev);
3912                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3913         }
3914
3915         if (!ret)
3916                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
3917         memalloc_nofs_restore(nofs_flag);
3918
3919         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3920         atomic_dec(&fs_info->scrubs_running);
3921         wake_up(&fs_info->scrub_pause_wait);
3922
3923         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3924
3925         if (progress)
3926                 memcpy(progress, &sctx->stat, sizeof(*progress));
3927
3928         mutex_lock(&fs_info->scrub_lock);
3929         dev->scrub_ctx = NULL;
3930         if (--fs_info->scrub_workers_refcnt == 0) {
3931                 scrub_workers = fs_info->scrub_workers;
3932                 scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3933                 scrub_parity = fs_info->scrub_parity_workers;
3934         }
3935         mutex_unlock(&fs_info->scrub_lock);
3936
3937         btrfs_destroy_workqueue(scrub_workers);
3938         btrfs_destroy_workqueue(scrub_wr_comp);
3939         btrfs_destroy_workqueue(scrub_parity);
3940         scrub_put_ctx(sctx);
3941
3942         return ret;
3943
3944 out_free_ctx:
3945         scrub_free_ctx(sctx);
3946
3947         return ret;
3948 }
3949
3950 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
3951 {
3952         mutex_lock(&fs_info->scrub_lock);
3953         atomic_inc(&fs_info->scrub_pause_req);
3954         while (atomic_read(&fs_info->scrubs_paused) !=
3955                atomic_read(&fs_info->scrubs_running)) {
3956                 mutex_unlock(&fs_info->scrub_lock);
3957                 wait_event(fs_info->scrub_pause_wait,
3958                            atomic_read(&fs_info->scrubs_paused) ==
3959                            atomic_read(&fs_info->scrubs_running));
3960                 mutex_lock(&fs_info->scrub_lock);
3961         }
3962         mutex_unlock(&fs_info->scrub_lock);
3963 }
3964
3965 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
3966 {
3967         atomic_dec(&fs_info->scrub_pause_req);
3968         wake_up(&fs_info->scrub_pause_wait);
3969 }
3970
3971 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3972 {
3973         mutex_lock(&fs_info->scrub_lock);
3974         if (!atomic_read(&fs_info->scrubs_running)) {
3975                 mutex_unlock(&fs_info->scrub_lock);
3976                 return -ENOTCONN;
3977         }
3978
3979         atomic_inc(&fs_info->scrub_cancel_req);
3980         while (atomic_read(&fs_info->scrubs_running)) {
3981                 mutex_unlock(&fs_info->scrub_lock);
3982                 wait_event(fs_info->scrub_pause_wait,
3983                            atomic_read(&fs_info->scrubs_running) == 0);
3984                 mutex_lock(&fs_info->scrub_lock);
3985         }
3986         atomic_dec(&fs_info->scrub_cancel_req);
3987         mutex_unlock(&fs_info->scrub_lock);
3988
3989         return 0;
3990 }
3991
3992 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3993                            struct btrfs_device *dev)
3994 {
3995         struct scrub_ctx *sctx;
3996
3997         mutex_lock(&fs_info->scrub_lock);
3998         sctx = dev->scrub_ctx;
3999         if (!sctx) {
4000                 mutex_unlock(&fs_info->scrub_lock);
4001                 return -ENOTCONN;
4002         }
4003         atomic_inc(&sctx->cancel_req);
4004         while (dev->scrub_ctx) {
4005                 mutex_unlock(&fs_info->scrub_lock);
4006                 wait_event(fs_info->scrub_pause_wait,
4007                            dev->scrub_ctx == NULL);
4008                 mutex_lock(&fs_info->scrub_lock);
4009         }
4010         mutex_unlock(&fs_info->scrub_lock);
4011
4012         return 0;
4013 }
4014
4015 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4016                          struct btrfs_scrub_progress *progress)
4017 {
4018         struct btrfs_device *dev;
4019         struct scrub_ctx *sctx = NULL;
4020
4021         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4022         dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
4023         if (dev)
4024                 sctx = dev->scrub_ctx;
4025         if (sctx)
4026                 memcpy(progress, &sctx->stat, sizeof(*progress));
4027         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4028
4029         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4030 }
4031
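/*
 * Used in dev-replace mode: map @extent_logical again and return the
 * physical location, device and mirror number of the first stripe of the
 * mapping. If the mapping fails or is shorter than the extent, the outputs
 * are left untouched.
 */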
4032 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4033                                u64 extent_logical, u64 extent_len,
4034                                u64 *extent_physical,
4035                                struct btrfs_device **extent_dev,
4036                                int *extent_mirror_num)
4037 {
4038         u64 mapped_length;
4039         struct btrfs_bio *bbio = NULL;
4040         int ret;
4041
4042         mapped_length = extent_len;
4043         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4044                               &mapped_length, &bbio, 0);
4045         if (ret || !bbio || mapped_length < extent_len ||
4046             !bbio->stripes[0].dev->bdev) {
4047                 btrfs_put_bbio(bbio);
4048                 return;
4049         }
4050
4051         *extent_physical = bbio->stripes[0].physical;
4052         *extent_mirror_num = bbio->mirror_num;
4053         *extent_dev = bbio->stripes[0].dev;
4054         btrfs_put_bbio(bbio);
4055 }